基于机器学习的文本情感极性分析
相关文档
代码示例(不含 PCA 曲线、KS 值以及 MLP 的相关代码)
# -*- coding: utf-8 -*-
import numpy as np
import sys
import re
import codecs
import os
import jieba
import gensim, logging
from gensim.models import word2vec
from sklearn.cross_validation import train_test_split
from sklearn.externals import joblib
from sklearn.preprocessing import scale
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from scipy import stats
from sklearn.cross_validation import train_test_split
#from keras.models import Sequential
# from keras.layers import Dense, Dropout, Activation
# from keras.optimizers import SGD
from sklearn.metrics import f1_score
#from bayes_opt import BayesianOptimization as BO
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
import sklearn
from sklearn.metrics import roc_auc_score as auc
from sklearn.metrics import accuracy_score as acc
def parseSent(sentence):
    """Segment ``sentence`` with jieba and return the tokens as one string.

    The tokens are separated by single spaces, so the result can be fed to
    tools that expect whitespace-delimited words.
    """
    # Joining with '' would just rebuild the unsegmented input, which
    # defeats the purpose of cutting the sentence first.
    return ' '.join(jieba.cut(sentence))
def sent2word(sentence):
    """Segment a sentence into words and drop stopwords.

    The stopword list is loaded through ``readLines`` from ``stop_words.txt``
    (resolved against the current working directory) on every call.

    Returns:
        The list of remaining words, in their original order.
    """
    # A set gives constant-time membership tests; the word list can be long.
    stopwords = set(readLines('stop_words.txt'))
    # NOTE(review): readLines returns the lines exactly as read (no decoding).
    # On Python 2 jieba presumably yields unicode tokens, in which case
    # non-ASCII stopwords would never match -- confirm and decode the
    # stopword file if that is the case.
    return [word for word in jieba.cut(sentence) if word not in stopwords]
def eachFile(filepath):
    """Return the paths of all entries directly inside directory ``filepath``.

    Each returned path is ``filepath`` joined with the entry name. Entries
    come back in whatever order ``os.listdir`` reports them.
    """
    # Let os.path.join pick the separator instead of hard-coding '/'; this
    # also avoids a doubled separator when filepath already ends with one.
    return [os.path.join(filepath, name) for name in os.listdir(filepath)]
def readLines(filename):
    """Return the non-blank lines of ``filename``, stripped of whitespace.

    Leading and trailing whitespace is removed from every line; lines that
    are empty after stripping are skipped.
    """
    # The with-block closes the file even if reading raises.
    with open(filename, 'r') as handle:
        stripped_lines = (line.strip() for line in handle)
        return [line for line in stripped_lines if line != '']
def readFile(filename):
    """Read several GBK-encoded text files and return their non-blank lines.

    Args:
        filename: despite the singular name, an iterable of file paths.

    Returns:
        A flat list of decoded (unicode) lines from all files, in order,
        each stripped of surrounding whitespace; blank lines are skipped.
    """
    data = []
    for path in filename:
        # Read bytes and decode explicitly: this matches the original
        # ``unicode(line, "GBK")`` on Python 2 and also runs on Python 3.
        # The with-block closes every file, even when decoding fails.
        with open(path, 'rb') as handle:
            for raw_line in handle:
                stripped = raw_line.strip()
                if stripped:
                    data.append(stripped.decode("GBK"))
    return data
def getWordVecs(wordList):
    """Look up the vector of every word in ``wordList`` in the global ``model``.

    Newline characters are removed from each word before the lookup. Words
    for which the lookup raises ``KeyError`` are silently skipped.

    Returns:
        A float numpy array built from the vectors that were found
        (empty when none of the words is known).
    """
    found = []
    for token in wordList:
        key = token.replace('\n', '')
        try:
            vector = model[key]
        except KeyError:
            # Unknown word: it contributes nothing to the result.
            continue
        found.append(vector)
    return np.array(found, dtype='float')
def buildVecs(filename):
    """Turn every non-empty line of ``filename`` into one sentence vector.

    Each line is segmented with jieba, the words are looked up through
    ``getWordVecs``, and the mean of the vectors found represents the line.
    Lines for which no word has a vector are skipped.

    Returns:
        A list of 1-D numpy arrays, one per line that produced at least one
        word vector.
    """
    sentence_vecs = []
    with open(filename, "rb") as txtfile:
        for raw_line in txtfile:
            # Iterating the file already yields one line at a time, so only
            # the line terminator has to be removed.
            text = raw_line.rstrip(b"\r\n")
            if not text:
                continue
            # NOTE(review): the file is read as bytes; jieba is presumably
            # able to decode them itself, as the original code relied on.
            word_vecs = getWordVecs(list(jieba.cut(text)))
            if len(word_vecs) != 0:
                # Mean over the word axis gives one vector per sentence.
                sentence_vecs.append(word_vecs.mean(axis=0))
    return sentence_vecs
# Train the word2vec model.
# Every file under test/new contributes its first non-blank line as one
# training sentence (segmented, with stopwords removed).
filepwd = eachFile("test/new")
sentences = []
for x in filepwd:
    data = readLines(x)
    if not data:
        # File without any non-blank line: data[0] would raise IndexError.
        continue
    # Word2Vec expects an iterable of token lists. append() keeps each
    # sentence as one list; extend() would flatten the tokens so that every
    # word is treated as a "sentence" and iterated character by character.
    sentences.append(sent2word(data[0]))
model = gensim.models.Word2Vec(sentences, min_count=1)
# To persist the trained model:
# outp1 = 'corpus.model.bin'
# model.save(outp1)
# Build sentence vectors from at most 100 files of each class.
filepwd_pos = eachFile("test/pos")
# Forward slashes are accepted on Windows as well; the former "test\\neg"
# only resolved to the intended directory on Windows.
filepwd_neg = eachFile("test/neg")
pos_number = 0
neg_number = 0
posInput = []
negInput = []
for pos in filepwd_pos[:100]:
    posInput.extend(buildVecs(pos))
    pos_number += 1
for neg in filepwd_neg[:100]:
    negInput.extend(buildVecs(neg))
    neg_number += 1
# Labels: 1 for every positive sentence vector, 0 for every negative one,
# in the same order as the rows of X.
y = np.concatenate((np.ones(len(posInput)), np.zeros(len(negInput))))
X = np.array(posInput + negInput)
X = scale(X)
# PCA cannot produce more components than min(n_samples, n_features);
# clamp the target of 100 so small corpora do not fail here.
n_components = min(100, X.shape[0], X.shape[1])
X_reduced = PCA(n_components=n_components).fit_transform(X)
# NOTE(review): scaling and PCA are fitted on all samples before the split,
# so the held-out rows influence the preprocessing -- confirm this is
# acceptable for the reported scores.
X_reduced_train, X_reduced_test, y_reduced_train, y_reduced_test = train_test_split(
    X_reduced, y, test_size=0.4, random_state=1)
# Next step: SVM (RBF) trained on the reduced data (up to 100 dimensions).
# Fit the classifier; probability=True is required for predict_proba below.
clf = SVC(C=2, probability=True)
clf.fit(X_reduced_train, y_reduced_train)
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print('Test Accuracy: %.2f' % clf.score(X_reduced_test, y_reduced_test))
# Column 1 of predict_proba, kept for the ROC curve.
pred_probas = clf.predict_proba(X_reduced_test)[:, 1]
# KS value left disabled: KSmetric is not defined in the visible code.
# print("KS value: %f" % KSmetric(y_reduced_test, pred_probas)[0])
# Original note: plot ROC curve, AUC = 0.92, KS = 0.7
# Print the predictions next to the true labels.
print("test:")
print(clf.predict(X_reduced_test))
print("value:")
print(y_reduced_test)
# Predicted labels for the held-out samples.
test_value = clf.predict(X_reduced_test)
# 1-based sample positions used as x coordinates.
index = list(range(1, len(test_value) + 1))

# Tally predictions: a label equal to 1 is positive, anything else negative.
test_value_1 = sum(1 for predicted in test_value if predicted == 1)
test_value_0 = len(test_value) - test_value_1

# Same tally for the true labels.
y_reduced_test_1 = sum(1 for actual in y_reduced_test if actual == 1)
y_reduced_test_0 = len(y_reduced_test) - y_reduced_test_1

test_value_label = ''.join(
    ['test pos: ', str(test_value_1), ' neg: ', str(test_value_0)])
y_reduced_test_label = ''.join(
    ['value pos: ', str(y_reduced_test_1), ' neg: ', str(y_reduced_test_0)])

# Red circles: predictions; blue dots: true labels.
plt.plot(index, test_value, 'ro', label=test_value_label)
plt.plot(index, y_reduced_test, 'b.', label=y_reduced_test_label)
plt.xlim([0, len(test_value)])
plt.ylim([-2, 2])
plt.legend(loc='lower right')
plt.show()
# ROC curve computed from pred_probas; the thresholds are not used.
fpr, tpr, _ = roc_curve(y_reduced_test, pred_probas)
roc_auc = sklearn.metrics.auc(fpr, tpr)
roc_label = 'roc_auc = %.2f' % roc_auc
plt.plot(fpr, tpr, label=roc_label)
# Dashed black diagonal from (0, 0) to (1, 1) as a reference line.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.legend(loc='lower right')
plt.show()
不含 KS 值的原因:
无法确定原文作者调用的 KSmetric 是哪个函数,或者是哪个内置函数的简写。
不含 MLP 的原因:
笔者水平有限,尚未掌握人工神经网络的相关技术点。