python中scikit-learn機器代碼實例
更新時間:2018年08月05日 11:25:21 作者:yan456jie
這篇文章給大家分享了關于python中scikit-learn機器的代碼實例內容,有興趣的朋友跟著小編測試下。
我們給大家帶來了關于學習python中scikit-learn機器代碼的相關具體實例,以下就是全部代碼內容:
# -*- coding: utf-8 -*-
# ---------------------------------------------------------------------------
# Example 1: train and evaluate a classifier on the built-in iris dataset.
# The two examples below are independent scripts; save each one to its own
# file. The code runs unchanged on Python 2 and Python 3 (every print call
# takes a single argument).
# ---------------------------------------------------------------------------
import numpy
from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn import linear_model
from sklearn.datasets import load_iris
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn import preprocessing

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18 only ships the old module
    from sklearn.cross_validation import train_test_split


def load_data():
    """Load the iris dataset and split it into train and test parts.

    Returns:
        A tuple ``(x_train, y_train, x_test, y_test)``; 20% of the samples
        go to the test part and the split is seeded with ``random_state=42``.
    """
    iris = load_iris()
    x, y = iris.data, iris.target
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.20, random_state=42)
    return x_train, y_train, x_test, y_test


def train_clf3(train_data, train_tags):
    """Fit and return a ``LinearSVC`` (linear kernel) with ``C=1100.0``."""
    clf = LinearSVC(C=1100.0)
    clf.fit(train_data, train_tags)
    return clf


def train_clf(train_data, train_tags):
    """Fit and return a ``MultinomialNB`` with ``alpha=0.01``.

    The training tags are printed before fitting.
    """
    clf = MultinomialNB(alpha=0.01)
    tags = numpy.asarray(train_tags)
    print(tags)
    clf.fit(train_data, tags)
    return clf


def evaluate(actual, pred):
    """Print precision, recall and F1 of ``pred`` against ``actual``.

    The scores are weighted averages over all classes: the default
    ``average='binary'`` raises ``ValueError`` for the three-class iris
    targets.
    """
    m_precision = metrics.precision_score(actual, pred, average='weighted')
    m_recall = metrics.recall_score(actual, pred, average='weighted')
    m_f1 = metrics.f1_score(actual, pred, average='weighted')
    print('precision:{0:.3f}'.format(m_precision))
    print('recall:{0:0.3f}'.format(m_recall))
    print('f1-score:{0:.8f}'.format(m_f1))


x_train, y_train, x_test, y_test = load_data()

clf = train_clf(x_train, y_train)

pred = clf.predict(x_test)
evaluate(numpy.asarray(y_test), pred)
print(metrics.classification_report(y_test, pred))


# ---------------------------------------------------------------------------
# Example 2: using custom data (separate script).
# ---------------------------------------------------------------------------
# coding: utf-8
import numpy
from sklearn import metrics
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
import codecs
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model

try:
    from sklearn import model_selection
except ImportError:  # scikit-learn < 0.18 only ships the old module
    from sklearn import cross_validation as model_selection

# Small in-memory sample corpora (whitespace-separated tokens). They are not
# referenced by main(), which reads its data from files instead.
train_corpus = [
    '我們 我們 好孩子 認(rèn)證 。 就是',
    '我們 好孩子 認(rèn)證 。 中國',
    '我們 好孩子 認(rèn)證 。 孤獨(dú)',
    '我們 好孩子 認(rèn)證 。',
]

test_corpus = [
    '我 菲律賓 韓國',
    '我們 好孩子 認(rèn)證 。 中國',
]


def _read_tagged_lines(path):
    """Parse one ``tag:words`` file into ``(documents, tags)`` lists."""
    documents = []
    tags = []
    with codecs.open(path, 'r', 'utf-8', 'ignore') as handle:
        for line in handle:
            tks = line.split(':', 1)
            word_list = tks[1]
            # Drops the first character and the last three of the word
            # part -- presumably a leading '[' and a trailing ']' plus the
            # line ending; verify against the real data files.
            word_array = word_list[1:(len(word_list) - 3)].split(", ")
            documents.append(" ".join(word_array))
            tags.append(tks[0])
    return documents, tags


def input_data(train_file, test_file):
    """Read the training and test files.

    Each line is split at the first ':' into a tag and a ", "-separated
    word list; the words of a line are re-joined with single spaces.

    Args:
        train_file: Path of the UTF-8 training file.
        test_file: Path of the UTF-8 test file.

    Returns:
        A tuple ``(train_words, train_tags, test_words, test_tags)``.

    Raises:
        IndexError: If a line contains no ':' separator.
    """
    train_words, train_tags = _read_tagged_lines(train_file)
    test_words, test_tags = _read_tagged_lines(test_file)
    return train_words, train_tags, test_words, test_tags


def vectorize(train_words, test_words):
    """Vectorize both document lists with one ``HashingVectorizer``.

    ``alternate_sign=False`` keeps every feature non-negative (needed by
    ``MultinomialNB``); it replaces the ``non_negative=True`` option that
    newer scikit-learn releases removed. Requires scikit-learn >= 0.19.
    """
    v = HashingVectorizer(alternate_sign=False)
    train_data = v.fit_transform(train_words)
    test_data = v.transform(test_words)
    return train_data, test_data


def vectorize1(train_words, test_words):
    """Return TF-IDF matrices for the training and test documents.

    Vocabulary and IDF weights are learned from the training documents
    only and then applied to the test documents, so both matrices share
    the same feature space and weighting.
    """
    tv = TfidfVectorizer(sublinear_tf=False, use_idf=True)
    train_data = tv.fit_transform(train_words)
    test_data = tv.transform(test_words)
    return train_data, test_data


def vectorize2(train_words, test_words):
    """Return TF-IDF matrices built from ``CountVectorizer`` counts.

    Counting uses English stop words and ``max_df=0.5``. Both the
    vocabulary and the IDF weights are fitted on the training documents
    only and reused for the test documents.
    """
    count_v1 = CountVectorizer(stop_words='english', max_df=0.5)
    counts_train = count_v1.fit_transform(train_words)
    counts_test = count_v1.transform(test_words)

    tfidftransformer = TfidfTransformer()
    train_data = tfidftransformer.fit_transform(counts_train)
    test_data = tfidftransformer.transform(counts_test)
    return train_data, test_data


def evaluate(actual, pred):
    """Print precision, recall and F1 of ``pred`` against ``actual``.

    The scores are weighted averages over all classes, which also works
    for string tags and for more than two classes.
    """
    m_precision = metrics.precision_score(actual, pred, average='weighted')
    m_recall = metrics.recall_score(actual, pred, average='weighted')
    m_f1 = metrics.f1_score(actual, pred, average='weighted')
    print('precision:{0:.3f}'.format(m_precision))
    print('recall:{0:0.3f}'.format(m_recall))
    print('f1-score:{0:.8f}'.format(m_f1))


def train_clf(train_data, train_tags):
    """Fit and return a ``MultinomialNB`` with ``alpha=0.01``."""
    clf = MultinomialNB(alpha=0.01)
    clf.fit(train_data, numpy.asarray(train_tags))
    return clf


def train_clf1(train_data, train_tags):
    """Fit and return a ``KNeighborsClassifier`` with default settings."""
    clf = KNeighborsClassifier()
    clf.fit(train_data, numpy.asarray(train_tags))
    return clf


def train_clf2(train_data, train_tags):
    """Fit and return a ``LogisticRegression`` with ``C=1e5``."""
    clf = linear_model.LogisticRegression(C=1e5)
    clf.fit(train_data, train_tags)
    return clf


def train_clf3(train_data, train_tags):
    """Fit and return a ``LinearSVC`` (linear kernel) with ``C=1100.0``."""
    clf = LinearSVC(C=1100.0)
    clf.fit(train_data, train_tags)
    return clf


def train_clf4(train_data, train_tags):
    """Fit and return a 10-tree ``RandomForestClassifier``.

    ``train_data`` must be a sparse matrix: it is converted to a dense
    ``ndarray`` before fitting. ``toarray()`` is used instead of
    ``todense()`` because the latter returns ``numpy.matrix``, which
    current scikit-learn estimators reject.
    """
    clf = RandomForestClassifier(n_estimators=10)
    clf.fit(train_data.toarray(), train_tags)
    return clf


def codecs_read_label_line(filename):
    """Return the lines of a UTF-8 file without their line endings.

    Only trailing '\\r' / '\\n' characters are removed, so a last line that
    lacks a newline keeps its final character and CRLF files leave no
    stray '\\r'.
    """
    with codecs.open(filename, 'r', 'utf-8', 'ignore') as f:
        return [line.rstrip('\r\n') for line in f]


def save_test_features(test_url, test_label):
    """Write ``feature<TAB>label`` lines to ``test_labeded.dat``.

    The features are read from ``test.dat`` and paired with ``test_label``
    in order; pairing stops at the shorter of the two sequences. The
    output is written as UTF-8.

    NOTE(review): ``test_url`` is accepted but never used -- the features
    always come from ``test.dat``. Confirm whether callers expect the
    argument to be honoured.
    """
    test_feature_list = codecs_read_label_line('test.dat')
    with codecs.open('test_labeded.dat', 'w', 'utf-8') as fw:
        for url, label in zip(test_feature_list, test_label):
            fw.write(url + '\t' + label)
            fw.write('\n')


def main():
    """Train a LinearSVC on the text files, cross-validate and evaluate it."""
    train_file = u'..\\file\\py_train.txt'
    test_file = u'..\\file\\py_test.txt'
    train_words, train_tags, test_words, test_tags = input_data(
        train_file, test_file)

    train_data, test_data = vectorize1(train_words, test_words)
    print(type(train_data))
    print(train_data.shape)
    print(test_data.shape)
    print(test_data[0].shape)
    print(numpy.asarray(test_data[0]))

    clf = train_clf3(train_data, train_tags)

    scores = model_selection.cross_val_score(
        clf, train_data, train_tags, cv=5, scoring="f1_weighted")
    print(scores)

    pred = clf.predict(test_data)

    # Report every misclassified sample as "<true tag> <predicted tag>".
    error_list = []
    for true_tag, predict_tag in zip(test_tags, pred):
        if true_tag != predict_tag:
            mismatch = true_tag + ' ' + predict_tag
            print(mismatch)
            error_list.append(mismatch)
    print(len(error_list))
    evaluate(numpy.asarray(test_tags), pred)

    # Optional, disabled in the original script: write the predicted labels
    # to disk with
    #     test_feature_list = codecs_read_label_line('test.dat')
    #     save_test_features(test_feature_list, pred)


if __name__ == '__main__':
    main()
您可能感興趣的文章:
- 使用Python和scikit-learn創建混淆矩陣的示例詳解
- 分享15 個python中的 Scikit-Learn 技能
- Python機器學習工具scikit-learn的使用筆記
- Python機器學習算法庫scikit-learn學習之決策樹實現方法詳解
- Python機器學習之scikit-learn庫中KNN算法的封裝與使用方法
- python的scikit-learn將特征轉成one-hot特征的方法
- Python機器學習庫scikit-learn安裝與基本使用教程
- Python scikit-learn 做線性回歸的示例代碼
- Python機器學習庫scikit-learn入門開發示例
相關文章
Python 中 key 參數的含義及用法小結
我們在使用 sorted() 或 map() 函數的時候,都會看到里面有一個 key 參數,其實這個 key 參數也存在于其他內置函數中(例如 min()、max() 等),那么我們今天就來了解一下 key 參數的含義以及用途吧,需要的朋友可以參考下
2023-12-12
Python使用matplotlib實現交換式圖形顯示功能示例
這篇文章主要介紹了Python使用matplotlib實現交換式圖形顯示功能,結合實例形式詳細分析了Python基于matplotlib模塊的數值運算與圖形繪制相關操作技巧,需要的朋友可以參考下
2019-09-09
Python爬蟲中Selenium實現文件上傳
這篇文章主要介紹了Python爬蟲中Selenium實現文件上傳,文中通過示例代碼介紹的非常詳細,對大家的學習或者工作具有一定的參考學習價值,需要的朋友們下面隨著小編來一起學習學習吧
2020-12-12
解決python3在anaconda下安裝caffe失敗的問題
下面小編就為大家帶來一篇解決python3在anaconda下安裝caffe失敗的問題。小編覺得挺不錯的,現在就分享給大家,也給大家做個參考。一起跟隨小編過來看看吧
2017-06-06