# Multi-class classification example code
# -*- coding: utf-8 -*-
import sys

# pandas is used to load and inspect the tab-separated training data.
import pandas as pd
# TF-IDF vectorizer (not used by the statements below).
from sklearn.feature_extraction.text import TfidfVectorizer
# Import from the public package path: the private module
# ``sklearn.linear_model.logistic`` was deprecated in scikit-learn 0.22
# and removed in later releases, while this path works on all versions.
from sklearn.linear_model import LogisticRegression

# Load the tab-separated training data into a DataFrame.
df = pd.read_csv('train.tsv/train.tsv', delimiter='\t')

# Optional inspection helpers (left disabled, as in the original):
# print(df.Phrase.head(10))  # first 10 rows of the Phrase column
# print(df.count())          # number of non-NA values in each column

# Print summary statistics for the Sentiment column.
# print(...) with a single argument behaves the same on Python 2 and 3.
print(df.Sentiment.describe())
# Print how many rows carry each distinct Sentiment value.
print(df.Sentiment.value_counts())
# Display the LogisticRegression API documentation.
help(LogisticRegression)
# #载入pandas模块
# import pandas as pd
# #载入 numpy
# import numpy
# #加载 sklearn.feature_extraction.text.TfidfVectorizer
# from sklearn.feature_extraction.text import TfidfVectorizer
# #加载 sklearn.linear_model.logistic.LogisticRegression
# from sklearn.linear_model.logistic import LogisticRegression
# #加载 sklearn.cross_validation.train_test_split
# from sklearn.cross_validation import train_test_split
# #加载 sklearn.cross_validation.cross_val_score
# from sklearn.cross_validation import cross_val_score
# #加载sklearn.metrics 相关函数
# from sklearn.metrics import precision_score, recall_score, f1_score, make_scorer, roc_curve, auc
# #载入matplotlib.pyplot
# import matplotlib.pyplot as plt
# # Load the data: the file has no header row and is tab-delimited, so column names are supplied explicitly via names=
# df=pd.read_csv('SMSSpamCollection',delimiter='\t',header=None,names=['label','message'])
# #分割测试集 训练集
# x_train,x_test,y_train,y_test=train_test_split(df['message'],df['label'],random_state=1);
# #建立TfidfVectorizer实例 计算TF-IDF权重
# vectorizer=TfidfVectorizer()
# x_train=vectorizer.fit_transform(x_train)
# x_test=vectorizer.transform(x_test)
# #建立LogisticRegression实例训练模型
# classifier=LogisticRegression()
# #训练
# classifier.fit(x_train, y_train)
# #预测
# predictions=classifier.predict(x_test)
# #x_test预测输出概率
# predictions2=classifier.predict_proba(x_test)
# print y_test
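# # Compute recall (true positive rate) and fall-out (false positive rate) from the true test labels and the predicted probabilities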
# false_positive_rate, recall, thresholds = roc_curve(y_test, predictions2[:, 1],pos_label='spam')
# #计算auc值
# roc_auc = auc(false_positive_rate, recall)
# print false_positive_rate
# print recall
# #配置标题
# plt.title('Receiver Operating Characteristic')
# #绘图
# plt.plot(false_positive_rate, recall, 'b', label=roc_auc)
# #说明的位置 如果不设置 label不会生效
# plt.legend(loc=4)
# #设置x y限制
# plt.xlim([0.0, 1.0])
# plt.ylim([0.0, 1.0])
# #指定x y 标签
# plt.ylabel('Recall')
# plt.xlabel('Fall-out')
# plt.show()
# #循环预测值 打印预测值 和测试值
# #n=0
# #k=0
# # for x,y in enumerate(predictions):
# # print y,y_test.iloc[2]
# # k+=1
# # if y == y_test.iloc[2]:
# # pass
# # else:
# # n+=1
# #输出测试结果
# print classifier.score(x_test,y_test)
# #使用交叉检测
# scores = cross_val_score(classifier,x_test,y_test,cv=5)
# #输出交叉测试结果平均值 和 单次的值
# print numpy.mean(scores),scores
# #定义精确率的 pos_label 为ham
# precision_scoring=make_scorer(precision_score,pos_label='ham')
# #使用交叉测试计算精确率
# precisions = cross_val_score(classifier, x_train, y_train, cv=5, scoring=precision_scoring)
# #输出交叉测试结果平均值 和 单次的值
# print numpy.mean(precisions),precisions
# #定义召回率的 pos_label 为ham
# recall_scoring=make_scorer(recall_score,pos_label='ham')
# #使用交叉测试计算召回率
# recalls = cross_val_score(classifier,x_train,y_train,cv=5,scoring=recall_scoring)
# #输出交叉测试结果平均值 和 单次的值
# print numpy.mean(recalls),recalls
# #定义综合评价指标的 pos_label 为ham
# f1_scoring=make_scorer(f1_score,pos_label='ham')
# #使用交叉测试计算综合评价指标
# f1 = cross_val_score(classifier,x_train,y_train,cv=5,scoring=f1_scoring)
# #输出交叉测试结果平均值 和 单次的值
# print numpy.mean(f1),f1