# -*- coding: utf-8 -*-
# Multi-class classification example code



import sys

# pandas provides the DataFrame used to load and inspect the data set.
import pandas as pd

# TF-IDF text vectorizer (not used by the active code in this section).
from sklearn.feature_extraction.text import TfidfVectorizer

# Import from the public package path: the private module
# ``sklearn.linear_model.logistic`` was deprecated and later removed,
# while ``sklearn.linear_model`` exposes the same class in every release.
from sklearn.linear_model import LogisticRegression


# Load the tab-separated training data into a DataFrame.
df = pd.read_csv('train.tsv/train.tsv', delimiter='\t')

# Optional diagnostics, kept disabled:
# # Print the first 10 rows of the Phrase column
# print(df.Phrase.head(10))
# # Print the number of non-null values in each column
# print(df.count())

# Print summary statistics of the Sentiment column.
# Single-argument print(...) behaves the same under Python 2 and Python 3,
# whereas the bare ``print x`` statement is a SyntaxError on Python 3.
print(df.Sentiment.describe())
# Print how many rows carry each Sentiment value.
print(df.Sentiment.value_counts())

# Show the built-in documentation for LogisticRegression.
help(LogisticRegression)






# #载入pandas模块
# import pandas as pd

# #载入 numpy

# import numpy

# #加载 sklearn.feature_extraction.text.TfidfVectorizer 
# from sklearn.feature_extraction.text import TfidfVectorizer


# #加载 sklearn.linear_model.logistic.LogisticRegression
# from sklearn.linear_model.logistic import LogisticRegression


# #加载 sklearn.cross_validation.train_test_split
# from sklearn.cross_validation import train_test_split


# #加载 sklearn.cross_validation.cross_val_score
# from sklearn.cross_validation import cross_val_score

# #加载sklearn.metrics 相关函数   
# from sklearn.metrics import precision_score, recall_score, f1_score, make_scorer, roc_curve, auc

# #载入matplotlib.pyplot
# import matplotlib.pyplot as plt

# #Load the data: the file has no header row and is tab-delimited, so the column names are supplied explicitly via names=
# df=pd.read_csv('SMSSpamCollection',delimiter='\t',header=None,names=['label','message'])






# #分割测试集  训练集
# x_train,x_test,y_train,y_test=train_test_split(df['message'],df['label'],random_state=1);


# #建立TfidfVectorizer实例 计算TF-IDF权重
# vectorizer=TfidfVectorizer()
# x_train=vectorizer.fit_transform(x_train)
# x_test=vectorizer.transform(x_test)



# #建立LogisticRegression实例训练模型
# classifier=LogisticRegression()
# #训练
# classifier.fit(x_train, y_train)
# #预测
# predictions=classifier.predict(x_test)

# #x_test预测输出概率
# predictions2=classifier.predict_proba(x_test)

# print y_test
# #Compute the recall and the false positive rate (fall-out) from the true test labels and the predicted probabilities
# false_positive_rate, recall, thresholds = roc_curve(y_test, predictions2[:, 1],pos_label='spam')


# #计算auc值
# roc_auc = auc(false_positive_rate, recall)

# print false_positive_rate
# print recall

# #配置标题
# plt.title('Receiver Operating Characteristic')
# #绘图
# plt.plot(false_positive_rate, recall, 'b', label=roc_auc)
# #说明的位置 如果不设置 label不会生效
# plt.legend(loc=4)
# #设置x y限制
# plt.xlim([0.0, 1.0])
# plt.ylim([0.0, 1.0])
# #指定x y 标签
# plt.ylabel('Recall')
# plt.xlabel('Fall-out')

# plt.show()



# #循环预测值   打印预测值  和测试值
# #n=0
# #k=0
# # for x,y in enumerate(predictions):
# #     print  y,y_test.iloc[2]
# #     k+=1
# #     if y == y_test.iloc[2]:
# #         pass
# #     else:
# #         n+=1


# #输出测试结果
# print classifier.score(x_test,y_test)


# #使用交叉检测
# scores = cross_val_score(classifier,x_test,y_test,cv=5)


# #输出交叉测试结果平均值 和 单次的值
# print numpy.mean(scores),scores


# #定义精确率的 pos_label 为ham
# precision_scoring=make_scorer(precision_score,pos_label='ham')


# #使用交叉测试计算精确率
# precisions = cross_val_score(classifier, x_train, y_train, cv=5, scoring=precision_scoring)


# #输出交叉测试结果平均值 和 单次的值
# print numpy.mean(precisions),precisions


# #定义召回率的 pos_label 为ham
# recall_scoring=make_scorer(recall_score,pos_label='ham')


# #使用交叉测试计算召回率
# recalls = cross_val_score(classifier,x_train,y_train,cv=5,scoring=recall_scoring)


# #输出交叉测试结果平均值 和 单次的值
# print numpy.mean(recalls),recalls


# #定义综合评价指标的 pos_label 为ham
# f1_scoring=make_scorer(f1_score,pos_label='ham')


# #使用交叉测试计算综合评价指标
# f1 = cross_val_score(classifier,x_train,y_train,cv=5,scoring=f1_scoring)


# #输出交叉测试结果平均值 和 单次的值
# print numpy.mean(f1),f1