python中scikit-learn機器代碼實例
更新時間:2018年08月05日 11:25:21 作者:yan456jie
這篇文章給大家分享了關于python中scikit-learn機器的代碼實例內容,有興趣的朋友跟著小編測試下。
我們給大家帶來了關于學習python中scikit-learn機器代碼的相關具體實例,以下就是全部代碼內容:
# -*- coding: utf-8 -*-
import numpy
from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn import linear_model
from sklearn.datasets import load_iris
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn import cross_validation
from sklearn import preprocessing
#import iris_data
def load_data():
    """Load the iris dataset and split it into train and test partitions.

    20% of the samples are held out for testing and ``random_state=42`` is
    passed to ``train_test_split`` so the split is repeatable.

    Returns:
        A 4-tuple ``(x_train, y_train, x_test, y_test)``. Note that this order
        differs from the order in which ``train_test_split`` returns its
        pieces.
    """
    dataset = load_iris()
    features = dataset.data
    labels = dataset.target
    train_x, test_x, train_y, test_y = train_test_split(
        features, labels, test_size=0.20, random_state=42)
    return train_x, train_y, test_x, test_y
def train_clf3(train_data, train_tags):
    """Fit a ``LinearSVC`` classifier with ``C=1100.0``.

    Args:
        train_data: Feature matrix handed to ``fit``.
        train_tags: Target labels, one per row of ``train_data``.

    Returns:
        The fitted ``LinearSVC`` instance.
    """
    # LinearSVC is a linear model; there is no kernel choice to configure.
    return LinearSVC(C=1100.0).fit(train_data, train_tags)
def train_clf(train_data, train_tags):
    """Fit a multinomial naive Bayes classifier with ``alpha=0.01``.

    The label array is printed before fitting (debug output).

    Args:
        train_data: Feature matrix handed to ``fit``.
        train_tags: Target labels; converted with ``numpy.asarray``.

    Returns:
        The fitted ``MultinomialNB`` instance.
    """
    model = MultinomialNB(alpha=0.01)
    tag_array = numpy.asarray(train_tags)
    print(tag_array)
    model.fit(train_data, tag_array)
    return model
def evaluate(actual, pred, average='weighted'):
    """Print precision, recall and F1 score for a set of predictions.

    The scikit-learn default averaging mode (``'binary'``) only applies to
    two-class targets, while this script scores the three-class iris data, so
    an explicit multiclass-capable averaging mode is passed to every metric.

    Args:
        actual: Ground-truth labels.
        pred: Predicted labels, aligned with ``actual``.
        average: Averaging mode forwarded to the three metric functions.
            It must produce a scalar score (e.g. ``'weighted'``, ``'macro'``,
            ``'micro'``) because the results are formatted as floats.
    """
    m_precision = metrics.precision_score(actual, pred, average=average)
    m_recall = metrics.recall_score(actual, pred, average=average)
    m_f1 = metrics.f1_score(actual, pred, average=average)
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print('precision:{0:.3f}'.format(m_precision))
    print('recall:{0:0.3f}'.format(m_recall))
    print('f1-score:{0:.8f}'.format(m_f1))
# Demo driver: train the naive Bayes model on the iris split, predict the
# held-out samples, then print the summary metrics and the per-class report.
x_train, y_train, x_test, y_test = load_data()
clf = train_clf(x_train, y_train)
pred = clf.predict(x_test)
evaluate(numpy.asarray(y_test), pred)
print(metrics.classification_report(y_test, pred))
使用自定義數據
# coding: utf-8
import numpy
from sklearn import metrics
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
import codecs
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation
from sklearn import linear_model
# Sample corpora: each entry is one sentence whose tokens are already
# separated by single spaces.
# NOTE(review): neither list is referenced by any function visible in this
# file -- presumably kept as example input for the vectorize* helpers; confirm.
train_corpus = [
'我們 我們 好孩子 認證 。 就是',
'我們 好孩子 認證 。 中國',
'我們 好孩子 認證 。 孤獨',
'我們 好孩子 認證 。',
]
# The second test sentence is identical to the second training sentence.
test_corpus = [
'我 菲律賓 韓國',
'我們 好孩子 認證 。 中國',
]
def _read_tagged_words(path):
    """Parse one tagged-words file into parallel ``(words, tags)`` lists."""
    words = []
    tags = []
    # codecs.open is kept deliberately: it never translates line endings, and
    # the slice below depends on the exact number of trailing characters.
    with codecs.open(path, 'r', 'utf-8', 'ignore') as handle:
        for line in handle:
            if not line.strip():
                # Blank lines carry no record; skip them instead of failing.
                continue
            tks = line.split(':', 1)
            word_list = tks[1]
            # Drop the first character and the last three characters
            # (presumably '[' in front and ']' + CRLF at the end -- confirm
            # against the real data files), then split the ", " separated
            # tokens.
            word_array = word_list[1:(len(word_list) - 3)].split(", ")
            words.append(" ".join(word_array))
            tags.append(tks[0])
    return words, tags


def input_data(train_file, test_file):
    """Read the training and test files into word strings and tags.

    Every non-blank line is split once on ``':'``; the part before the colon
    is the tag and the part after it is a ", "-separated token list that is
    re-joined with single spaces. Both files are decoded as UTF-8 with
    undecodable bytes ignored, and are closed even if parsing fails.

    Args:
        train_file: Path of the training file.
        test_file: Path of the test file.

    Returns:
        ``(train_words, train_tags, test_words, test_tags)`` -- four lists.

    Raises:
        IndexError: If a non-blank line contains no ``':'``.
    """
    train_words, train_tags = _read_tagged_words(train_file)
    test_words, test_tags = _read_tagged_words(test_file)
    return train_words, train_tags, test_words, test_tags
def vectorize(train_words, test_words):
    """Turn both corpora into hashed feature matrices.

    One ``HashingVectorizer(non_negative=True)`` instance is applied to the
    training corpus and then to the test corpus via ``fit_transform``.

    Args:
        train_words: Iterable of training documents (strings).
        test_words: Iterable of test documents (strings).

    Returns:
        ``(train_data, test_data)`` feature matrices.
    """
    hasher = HashingVectorizer(non_negative=True)
    hashed_train = hasher.fit_transform(train_words)
    hashed_test = hasher.fit_transform(test_words)
    return hashed_train, hashed_test
def vectorize1(train_words, test_words):
    """Build TF-IDF feature matrices for the training and test corpora.

    The vectorizer is fitted on the training corpus only. The test corpus is
    merely transformed, so it is projected onto the training vocabulary and
    weighted with the IDF statistics learned from the training data; fitting
    on the test corpus would give the two matrices incompatible weights.

    Args:
        train_words: Iterable of training documents (strings).
        test_words: Iterable of test documents (strings).

    Returns:
        ``(train_data, test_data)`` TF-IDF matrices sharing one column layout.
    """
    tv = TfidfVectorizer(sublinear_tf=False, use_idf=True)
    train_data = tv.fit_transform(train_words)
    test_data = tv.transform(test_words)
    return train_data, test_data
def vectorize2(train_words, test_words):
    """Build TF-IDF matrices via explicit counting plus a TF-IDF transform.

    Both the ``CountVectorizer`` (English stop words removed, terms occurring
    in more than half of the training documents dropped) and the
    ``TfidfTransformer`` are fitted on the training corpus only. The test
    corpus is only transformed, so its features use the training vocabulary
    and the training IDF weights.

    Args:
        train_words: Iterable of training documents (strings).
        test_words: Iterable of test documents (strings).

    Returns:
        ``(train_data, test_data)`` TF-IDF matrices sharing one column layout.
    """
    count_v1 = CountVectorizer(stop_words='english', max_df=0.5)
    counts_train = count_v1.fit_transform(train_words)
    counts_test = count_v1.transform(test_words)
    tfidftransformer = TfidfTransformer()
    train_data = tfidftransformer.fit_transform(counts_train)
    test_data = tfidftransformer.transform(counts_test)
    return train_data, test_data
def evaluate(actual, pred, average='weighted'):
    """Print precision, recall and F1 score for a set of predictions.

    The scikit-learn default averaging mode (``'binary'``) only applies to
    two-class targets with a known positive label, whereas the tags scored
    here are arbitrary strings read from a file. An explicit averaging mode
    is therefore passed to every metric; ``'weighted'`` matches the
    ``f1_weighted`` scoring used for cross-validation in ``main``.

    Args:
        actual: Ground-truth labels.
        pred: Predicted labels, aligned with ``actual``.
        average: Averaging mode forwarded to the three metric functions.
            It must produce a scalar score (e.g. ``'weighted'``, ``'macro'``,
            ``'micro'``) because the results are formatted as floats.
    """
    m_precision = metrics.precision_score(actual, pred, average=average)
    m_recall = metrics.recall_score(actual, pred, average=average)
    m_f1 = metrics.f1_score(actual, pred, average=average)
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print('precision:{0:.3f}'.format(m_precision))
    print('recall:{0:0.3f}'.format(m_recall))
    print('f1-score:{0:.8f}'.format(m_f1))
def train_clf(train_data, train_tags):
    """Fit a multinomial naive Bayes classifier with ``alpha=0.01``.

    Args:
        train_data: Feature matrix handed to ``fit``.
        train_tags: Target labels; converted with ``numpy.asarray``.

    Returns:
        The fitted ``MultinomialNB`` instance.
    """
    return MultinomialNB(alpha=0.01).fit(train_data, numpy.asarray(train_tags))
def train_clf1(train_data, train_tags):
    """Fit a k-nearest-neighbours classifier with its default settings.

    Args:
        train_data: Feature matrix handed to ``fit``.
        train_tags: Target labels; converted with ``numpy.asarray``.

    Returns:
        The fitted ``KNeighborsClassifier`` instance.
    """
    # No arguments are passed, so the library default neighbour count applies.
    knn = KNeighborsClassifier()
    knn.fit(train_data, numpy.asarray(train_tags))
    return knn
def train_clf2(train_data, train_tags):
    """Fit a logistic-regression classifier with ``C=1e5``.

    Args:
        train_data: Feature matrix handed to ``fit``.
        train_tags: Target labels, one per row of ``train_data``.

    Returns:
        The fitted ``LogisticRegression`` instance.
    """
    return linear_model.LogisticRegression(C=1e5).fit(train_data, train_tags)
def train_clf3(train_data, train_tags):
    """Fit a ``LinearSVC`` classifier with ``C=1100.0``.

    Args:
        train_data: Feature matrix handed to ``fit``.
        train_tags: Target labels, one per row of ``train_data``.

    Returns:
        The fitted ``LinearSVC`` instance.
    """
    # LinearSVC is a linear model; there is no kernel choice to configure.
    svm = LinearSVC(C=1100.0)
    svm.fit(train_data, train_tags)
    return svm
def train_clf4(train_data, train_tags):
    """Fit a 10-tree random forest on a densified feature matrix.

    Sparse input is converted with ``toarray()``, which yields a plain
    ``numpy.ndarray``; ``todense()`` would produce a ``numpy.matrix``, which
    newer scikit-learn releases refuse as estimator input. Input that has no
    ``toarray`` method is assumed to be dense already and is passed through
    unchanged.

    Args:
        train_data: Feature matrix, sparse or dense.
        train_tags: Target labels, one per row of ``train_data``.

    Returns:
        The fitted ``RandomForestClassifier`` instance.
    """
    if hasattr(train_data, 'toarray'):
        dense_data = train_data.toarray()
    else:
        dense_data = train_data
    clf = RandomForestClassifier(n_estimators=10)
    clf.fit(dense_data, train_tags)
    return clf
# Read a file line by line using codecs.
def codecs_read_label_line(filename):
    """Return the lines of a UTF-8 text file without their line terminators.

    Only trailing ``\\r`` / ``\\n`` characters are removed, so a final line
    that lacks a newline keeps all of its characters and CRLF files do not
    leave a stray ``\\r`` behind. Undecodable bytes are ignored, and the file
    is closed even if reading fails.

    Args:
        filename: Path of the file to read.

    Returns:
        A list with one string per line; empty for an empty file.
    """
    with codecs.open(filename, 'r', 'utf-8', 'ignore') as handle:
        return [line.rstrip('\r\n') for line in handle]
def save_test_features(test_url, test_label):
    """Write ``<feature line>\\t<label>`` rows to ``test_labeded.dat``.

    The feature lines are read from ``test.dat`` in the working directory and
    paired with ``test_label`` by position; ``zip`` stops at the shorter of
    the two. The output file is closed even if a write fails.

    Args:
        test_url: Not used -- the features are always re-read from
            ``test.dat``. Kept so existing callers keep working.
        test_label: Iterable of label strings, one per feature line.
    """
    test_feature_list = codecs_read_label_line('test.dat')
    with open('test_labeded.dat', "w+") as fw:
        for url, label in zip(test_feature_list, test_label):
            fw.write(url + '\t' + label + '\n')
def main():
    """Train a LinearSVC on the tagged text files and report its quality.

    Steps: read the train/test files, build TF-IDF features with
    ``vectorize1``, print some shape diagnostics, fit ``train_clf3``, print
    5-fold ``f1_weighted`` cross-validation scores on the training data, then
    predict the test set, print every misclassified ``true predicted`` pair
    with the error count, and finally print the summary metrics.

    All output goes through single-argument ``print(...)`` calls, which behave
    the same on Python 2 and Python 3.
    """
    # Windows-style relative paths, as in the original script.
    train_file = u'..\\file\\py_train.txt'
    test_file = u'..\\file\\py_test.txt'
    train_words, train_tags, test_words, test_tags = input_data(train_file, test_file)
    train_data, test_data = vectorize1(train_words, test_words)

    # Diagnostics about the feature matrices.
    print(type(train_data))
    print(train_data.shape)
    print(test_data.shape)
    print(test_data[0].shape)
    print(numpy.asarray(test_data[0]))

    clf = train_clf3(train_data, train_tags)
    scores = cross_validation.cross_val_score(
        clf, train_data, train_tags, cv=5, scoring="f1_weighted")
    print(scores)

    pred = clf.predict(test_data)
    error_list = []
    for true_tag, predict_tag in zip(test_tags, pred):
        if true_tag != predict_tag:
            # Build the "true predicted" text once; printing the joined string
            # gives the same line as a two-operand Python 2 print statement.
            mismatch = true_tag + ' ' + predict_tag
            print(mismatch)
            error_list.append(mismatch)
    print(len(error_list))
    evaluate(numpy.asarray(test_tags), pred)

    # Optional, disabled step: write the predicted labels next to the
    # feature lines of 'test.dat':
    #     test_feature_list = codecs_read_label_line('test.dat')
    #     save_test_features(test_feature_list, pred)


if __name__ == '__main__':
    main()
您可能感興趣的文章:
- 使用Python和scikit-learn創建混淆矩陣的示例詳解
- 分享15 個python中的 Scikit-Learn 技能
- Python機器學習工具scikit-learn的使用筆記
- Python機器學習算法庫scikit-learn學習之決策樹實現方法詳解
- Python機器學習之scikit-learn庫中KNN算法的封裝與使用方法
- python的scikit-learn將特征轉成one-hot特征的方法
- Python機器學習庫scikit-learn安裝與基本使用教程
- Python scikit-learn 做線性回歸的示例代碼
- Python機器學習庫scikit-learn入門開發示例
相關文章
Python使用matplotlib實現交換式圖形顯示功能示例
這篇文章主要介紹了Python使用matplotlib實現交換式圖形顯示功能,結合實例形式詳細分析了Python基于matplotlib模塊的數值運算與圖形繪制相關操作技巧,需要的朋友可以參考下2019-09-09
解決python3在anaconda下安裝caffe失敗的問題
下面小編就為大家帶來一篇解決python3在anaconda下安裝caffe失敗的問題。小編覺得挺不錯的,現在就分享給大家,也給大家做個參考。一起跟隨小編過來看看吧2017-06-06

