Python機器學習應用之工業蒸汽數據分析篇詳解
一、數據集
二、數據分析
1 數據導入
#%%導入基礎包
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings("ignore")
#%% Load the data
# Raw strings keep every Windows backslash literal: no invalid-escape warnings
# and no stray "\/" in the test-set path.
train_data_file = r"D:\Python\ML\data\zhengqi_train.txt"
test_data_file = r"D:\Python\ML\data\zhengqi_test.txt"
# Both files are read as tab-separated UTF-8 text.
train_data = pd.read_csv(train_data_file, sep='\t', encoding='utf-8')
test_data = pd.read_csv(test_data_file, sep='\t', encoding='utf-8')
#%% Summary statistics of the training-set and test-set columns
train_infor = train_data.describe()
test_infor = test_data.describe()


2 數據特征探索(數據可視化)
#%% Visual exploration of the data
# Box plot of feature V0 on a 4 x 6 inch (width x height) figure.
fig = plt.figure(figsize=(4, 6))
sns.boxplot(
    y=train_data['V0'],
    orient="v",
    width=0.5,
)
#%% Optional: draw a box plot for every feature
# The snippet below is disabled: it is wrapped in a triple-quoted string, so it
# is evaluated as a bare string expression and never executed.
# NOTE(review): the loop body inside the string has no indentation; it must be
# re-indented before the snippet can be enabled.
'''
column = train_data.columns.tolist()[:39] # 列表頭
fig = plt.figure(figsize=(20, 40)) # 指定繪圖對象寬度和高度
for i in range(38):
plt.subplot(13, 3, i + 1) # 13行3列子圖
sns.boxplot(train_data[column[i]], orient="v", width=0.5) # 箱式圖
plt.ylabel(column[i], fontsize=8)
plt.show()
'''
#%% Histogram of V0 with a fitted normal curve, plus a Q-Q plot to check
#   whether the data are approximately normally distributed
plt.figure(figsize=(10, 5))
ax = plt.subplot(1, 2, 1)  # left panel: histogram with normal fit
sns.distplot(train_data['V0'], fit=stats.norm)
ax = plt.subplot(1, 2, 2)  # right panel: normal probability (Q-Q) plot
res = stats.probplot(train_data['V0'], plot=plt)
#%% Optional: distribution and Q-Q plot for every column
# Disabled snippet: wrapped in a triple-quoted string, so it is never executed.
# It lays out two panels per column (histogram with normal fit, then Q-Q plot).
# NOTE(review): the loop body inside the string has no indentation; it must be
# re-indented before the snippet can be enabled.
'''
train_cols = 6
train_rows = len(train_data.columns)
plt.figure(figsize=(4*train_cols,4*train_rows))
i=0
for col in train_data.columns:
i+=1
ax=plt.subplot(train_rows,train_cols,i)
sns.distplot(train_data[col],fit=stats.norm)
i+=1
ax=plt.subplot(train_rows,train_cols,i)
res = stats.probplot(train_data[col], plot=plt)
plt.show()
'''


#%% Compare the train and test distributions of the same feature (V0) to see
#   whether the two data sets are distributed alike
ax = sns.kdeplot(train_data['V0'], color="Red", shade=True)
ax = sns.kdeplot(test_data['V0'], color="Blue", shade=True)
ax.set(xlabel='V0', ylabel="Frequency")
# Legend entries follow the drawing order: train first, then test.
ax = ax.legend(["train", "test"])
#%% Optional: train-vs-test KDE comparison for every test-set column
# Disabled snippet: wrapped in a triple-quoted string, so it is never executed.
# NOTE(review): the loop body inside the string has no indentation; it must be
# re-indented before the snippet can be enabled.
'''
dist_cols = 6
dist_rows = len(test_data.columns)
plt.figure(figsize=(4*dist_cols,4*dist_rows))
i=1
for col in test_data.columns:
ax=plt.subplot(dist_rows,dist_cols,i)
ax = sns.kdeplot(train_data[col], color="Red", shade=True)
ax = sns.kdeplot(test_data[col], color="Blue", shade=True)
ax.set_xlabel(col)
ax.set_ylabel("Frequency")
ax = ax.legend(["train","test"])
i+=1
plt.show()
'''

#%% Compare train/test distributions of V5, V9, V11, V17, V22 and V28
# The same six features are plotted here and dropped below, so the list is
# defined once and shared by both steps.
drop_columns = ["V5", "V9", "V11", "V17", "V22", "V28"]
drop_col = len(drop_columns)  # one panel per feature
drop_row = 1
plt.figure(figsize=(5*drop_col, 5*drop_row))
for i, col in enumerate(drop_columns, start=1):
    ax = plt.subplot(drop_row, drop_col, i)
    ax = sns.kdeplot(train_data[col], color="Red", shade=True)
    ax = sns.kdeplot(test_data[col], color="Blue", shade=True)
    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")
    ax = ax.legend(["train", "test"])
plt.show()
#%% Drop these features from both data sets
train_data = train_data.drop(columns=drop_columns)
test_data = test_data.drop(columns=drop_columns)

當訓練數據和測試數據的分布不一致時,會導致模型的泛化能力變差,因此這裡采用刪除此類特征的方法。

#%% Visualise the linear relationship between V0 and target
fcols = 2
frows = 1
plt.figure(figsize=(8, 4))
# Left panel: scatter plot with a fitted regression line.
ax = plt.subplot(frows, fcols, 1)
sns.regplot(
    x='V0',
    y='target',
    data=train_data,
    ax=ax,
    scatter_kws={'marker': '.', 's': 3, 'alpha': 0.3},
    line_kws={'color': 'k'},
)
plt.xlabel('V0')
plt.ylabel('target')
# Right panel: distribution of V0 with missing values dropped.
ax = plt.subplot(frows, fcols, 2)
sns.distplot(train_data['V0'].dropna())
plt.xlabel('V0')
plt.show()
#%% Optional: regression plot against target and distribution for every
#   test-set column
# Disabled snippet: wrapped in a triple-quoted string, so it is never executed.
# NOTE(review): the loop body inside the string has no indentation; it must be
# re-indented before the snippet can be enabled.
'''
fcols = 6
frows = len(test_data.columns)
plt.figure(figsize=(5*fcols,4*frows))
i=0
for col in test_data.columns:
i+=1
ax=plt.subplot(frows,fcols,i)
sns.regplot(x=col, y='target', data=train_data, ax=ax,
scatter_kws={'marker':'.','s':3,'alpha':0.3},
line_kws={'color':'k'});
plt.xlabel(col)
plt.ylabel('target')
i+=1
ax=plt.subplot(frows,fcols,i)
sns.distplot(train_data[col].dropna())
plt.xlabel(col)
'''

#%% Correlation between the feature variables
train_corr = train_data.corr()
# Heat map of the correlation matrix on an enlarged canvas;
# annot=True prints each coefficient inside its cell.
fig, ax = plt.subplots(figsize=(20, 16))
ax = sns.heatmap(train_corr, vmax=.8, square=True, annot=True, ax=ax)

#%% Spearman rank correlation, lower triangle only
plt.figure(figsize=(20, 16))  # figure width and height
colnm = train_data.columns.tolist()  # column headers
mcorr = train_data[colnm].corr(method="spearman")  # pairwise correlation matrix
# Boolean mask with the same shape as mcorr. The builtin bool is used because
# the np.bool alias was removed from NumPy.
mask = np.zeros_like(mcorr, dtype=bool)
mask[np.triu_indices_from(mask)] = True  # hide the diagonal and everything above it
cmap = sns.diverging_palette(220, 10, as_cmap=True)  # matplotlib colormap object
g = sns.heatmap(mcorr, mask=mask, cmap=cmap, square=True, annot=True, fmt='0.2f')
plt.show()

#%% The k columns with the largest correlation with target
k = 10  # number of variables for heatmap
cols = train_corr.nlargest(k, 'target')['target'].index
# cm is kept from the original; the heat map below uses the pandas correlation.
cm = np.corrcoef(train_data[cols].values.T)
fig, hm = plt.subplots(figsize=(10, 10))
hm = sns.heatmap(train_data[cols].corr(), annot=True, square=True, ax=hm)
plt.show()

#%% Columns whose absolute correlation with target exceeds 0.5
threshold = 0.5
corrmat = train_data.corr()
top_corr_features = corrmat.index[abs(corrmat["target"]) > threshold]
plt.figure(figsize=(10, 10))
g = sns.heatmap(train_data[top_corr_features].corr(), annot=True, cmap="RdYlGn")

#%% Threshold for removing features that barely correlate with target
threshold = 0.05
# Matrix of absolute pairwise correlations
corr_matrix = train_data.corr().abs()
# Labels of the rows whose |correlation with target| is below the threshold
drop_col = corr_matrix.index[corr_matrix["target"] < threshold]
#%% Drop the columns whose correlation is below 0.05 from both data sets
train_data = train_data.drop(drop_col, axis=1)
test_data = test_data.drop(drop_col, axis=1)
#%% Stack the training features (target removed) on top of the test set
train_x = train_data.drop(columns=['target'])
data_all = pd.concat([train_x, test_data], axis=0)
#%% Normalisation: every column of the merged frame is scaled
cols_numeric = data_all.columns.tolist()
def scale_minmax(col):
    """Rescale *col* linearly so its minimum maps to 0 and its maximum to 1.

    Parameters:
        col: an object exposing ``min()``/``max()`` and element-wise
            arithmetic; in this script a pandas Series or a NumPy array.

    Returns:
        ``(col - min) / (max - min)`` with the same container type as *col*.
        A constant input (zero range) returns all zeros instead of the NaNs
        produced by dividing 0 by 0.
    """
    lo = col.min()
    span = col.max() - lo
    # Only a scalar range can be tested for zero; a per-column range (e.g. from
    # a DataFrame) falls through to the plain division.
    if np.ndim(span) == 0 and span == 0:
        return (col - lo).astype(float)
    return (col - lo) / span
# Scale every column of the merged frame and print its summary statistics.
data_all[cols_numeric] = data_all[cols_numeric].apply(scale_minmax, axis=0)
print(data_all[cols_numeric].describe())
# NOTE(review): train and test are each rescaled below with their own min/max,
# independently of data_all - confirm that this is intended.
train_data_process = train_data[cols_numeric].apply(scale_minmax, axis=0)
test_data_process = test_data[cols_numeric].apply(scale_minmax, axis=0)

#%% For the first four scaled features: distribution, Q-Q plot and relation to
#   target, before and after a Box-Cox transform
cols_numeric_0to4 = cols_numeric[0:4]
## Check effect of Box-Cox transforms on distributions of continuous variables
train_data_process = pd.concat([train_data_process, train_data['target']], axis=1)
fcols = 6  # six panels per feature: three before and three after the transform
frows = len(cols_numeric_0to4)
plt.figure(figsize=(4*fcols, 4*frows))
i = 0
for var in cols_numeric_0to4:
    dat = train_data_process[[var, 'target']].dropna()

    # Panel 1: histogram of the scaled feature with a fitted normal curve.
    i += 1
    plt.subplot(frows, fcols, i)
    sns.distplot(dat[var], fit=stats.norm)
    plt.title(var + ' Original')
    plt.xlabel('')

    # Panel 2: Q-Q plot, titled with the skewness.
    i += 1
    plt.subplot(frows, fcols, i)
    _ = stats.probplot(dat[var], plot=plt)
    plt.title('skew=' + '{:.4f}'.format(stats.skew(dat[var])))
    plt.xlabel('')
    plt.ylabel('')

    # Panel 3: feature against target, titled with their correlation.
    i += 1
    plt.subplot(frows, fcols, i)
    plt.plot(dat[var], dat['target'], '.', alpha=0.5)
    plt.title('corr=' + '{:.2f}'.format(np.corrcoef(dat[var], dat['target'])[0][1]))

    # Panel 4: Box-Cox-transformed feature, rescaled to [0, 1].
    # The +1 shift keeps the input strictly positive, as Box-Cox requires.
    i += 1
    plt.subplot(frows, fcols, i)
    trans_var, lambda_var = stats.boxcox(dat[var].dropna() + 1)
    trans_var = scale_minmax(trans_var)
    sns.distplot(trans_var, fit=stats.norm)
    plt.title(var + ' Transformed')
    plt.xlabel('')

    # Panel 5: Q-Q plot of the transformed feature.
    i += 1
    plt.subplot(frows, fcols, i)
    _ = stats.probplot(trans_var, plot=plt)
    plt.title('skew=' + '{:.4f}'.format(stats.skew(trans_var)))
    plt.xlabel('')
    plt.ylabel('')

    # Panel 6: transformed feature against target.
    i += 1
    plt.subplot(frows, fcols, i)
    plt.plot(trans_var, dat['target'], '.', alpha=0.5)
    plt.title('corr=' + '{:.2f}'.format(np.corrcoef(trans_var, dat['target'])[0][1]))

三、特征優化
import pandas as pd
# Reload the untouched data sets for feature construction.
# Raw strings keep the Windows backslashes literal (no invalid-escape warnings).
train_data_file = r"D:\Python\ML\data\zhengqi_train.txt"
test_data_file = r"D:\Python\ML\data\zhengqi_test.txt"
train_data = pd.read_csv(train_data_file, sep='\t', encoding='utf-8')
test_data = pd.read_csv(test_data_file, sep='\t', encoding='utf-8')
#%% Operators used to construct new features
epsilon = 1e-5  # added to the divisor in 'div'
# Pairwise cross-feature operators. More can be added freely, e.g. x*x/y or
# log(x)/y; lambdas keep each definition to one line.
func_dict = {
    'add':   lambda x, y: x + y,
    'mins':  lambda x, y: x - y,
    'div':   lambda x, y: x / (y + epsilon),
    'multi': lambda x, y: x * y,
}
#%% Feature-construction function
def auto_features_make(train_data, test_data, func_dict, col_list):
    """Append pairwise cross features to copies of the train and test frames.

    For every ordered pair ``(col_i, col_j)`` from *col_list* and every
    ``(name, func)`` in *func_dict*, a column named ``"col_i-name-col_j"``
    holding ``func(data[col_i], data[col_j])`` is added to each frame.

    Parameters:
        train_data: training DataFrame; not modified.
        test_data: test DataFrame; not modified.
        func_dict: mapping of operator name to a two-argument callable.
        col_list: iterable of column names present in both frames.

    Returns:
        ``(train_data, test_data)`` as new DataFrames: the original columns
        followed by the generated ones, in generation order.
    """
    col_list = list(col_list)
    augmented = []
    for data in (train_data, test_data):
        # Collect all new columns first and attach them with one concat;
        # inserting thousands of columns one by one fragments the frame.
        new_features = {}
        for col_i in col_list:
            for col_j in col_list:
                for func_name, func in func_dict.items():
                    col_func_features = '-'.join([col_i, func_name, col_j])
                    new_features[col_func_features] = func(data[col_i], data[col_j])
        generated = pd.DataFrame(new_features, index=data.index)
        augmented.append(pd.concat([data, generated], axis=1))
    return augmented[0], augmented[1]
#%% Construct cross features for the training and test sets, pairing every
#   column of the test set with every other
train_data2, test_data2 = auto_features_make(
    train_data,
    test_data,
    func_dict,
    col_list=test_data.columns,
)
四、對特征構造后的訓練集和測試集進行主成分分析
#%% PCA
from sklearn.decomposition import PCA  # principal component analysis

# Reduce dimensionality with PCA.
# The features are selected by the test set's column names rather than by
# position: after feature construction 'target' is no longer the last column
# of train_data2, so iloc[:, 0:-1] would keep the target and drop a feature.
pca = PCA(n_components=500)
train_data2_pca = pca.fit_transform(train_data2[test_data2.columns])
test_data2_pca = pca.transform(test_data2)
train_data2_pca = pd.DataFrame(train_data2_pca)
test_data2_pca = pd.DataFrame(test_data2_pca)
train_data2_pca['target'] = train_data2['target']
# The model below is trained on the constructed features, not on the PCA output.
X_train2 = train_data2[test_data2.columns].values
y_train = train_data2['target']
五、使用LightGBM模型進行訓練和預測
#%%使用lightgbm模型對新構造的特征進行模型訓練和評估
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import numpy as np
# 5-fold cross-validation.
# The first positional argument of KFold is n_splits; passing len(X_train2)
# made this leave-one-out (one model per sample) instead of five folds.
kf = KFold(n_splits=5, shuffle=True, random_state=2019)
#%%
# Per-fold training and validation MSE
MSE_DICT = {
    'train_mse': [],
    'test_mse': [],
}
# Offline training and validation, one model per fold
for i, (train_index, test_index) in enumerate(kf.split(X_train2)):
    # LightGBM tree model
    lgb_reg = lgb.LGBMRegressor(
        learning_rate=0.01,
        max_depth=-1,
        n_estimators=5000,
        boosting_type='gbdt',
        random_state=2019,
        objective='regression',
    )
    # Split into this fold's training part and held-out part.
    X_train_KFold, X_test_KFold = X_train2[train_index], X_train2[test_index]
    # kf.split yields row positions, so the target Series is indexed by position.
    y_train_KFold, y_test_KFold = y_train.iloc[train_index], y_train.iloc[test_index]
    # Train the model. Early stopping and periodic logging are passed as
    # callbacks; newer LightGBM releases no longer accept the
    # early_stopping_rounds / verbose arguments of fit().
    lgb_reg.fit(
        X=X_train_KFold, y=y_train_KFold,
        eval_set=[(X_train_KFold, y_train_KFold), (X_test_KFold, y_test_KFold)],
        eval_names=['Train', 'Test'],
        eval_metric='MSE',
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=50),
        ],
    )
    # Predict on the training part and on the held-out part at the best iteration.
    y_train_KFold_predict = lgb_reg.predict(X_train_KFold, num_iteration=lgb_reg.best_iteration_)
    y_test_KFold_predict = lgb_reg.predict(X_test_KFold, num_iteration=lgb_reg.best_iteration_)
    print('第{}折 訓練和預測 訓練MSE 預測MSE'.format(i))
    train_mse = mean_squared_error(y_train_KFold, y_train_KFold_predict)
    print('------\n', '訓練MSE\n', train_mse, '\n------')
    test_mse = mean_squared_error(y_test_KFold, y_test_KFold_predict)
    print('------\n', '預測MSE\n', test_mse, '\n------\n')
    MSE_DICT['train_mse'].append(train_mse)
    MSE_DICT['test_mse'].append(test_mse)
# Per-fold values followed by their mean
print('------\n', '訓練MSE\n', MSE_DICT['train_mse'], '\n', np.mean(MSE_DICT['train_mse']), '\n------')
print('------\n', '預測MSE\n', MSE_DICT['test_mse'], '\n', np.mean(MSE_DICT['test_mse']), '\n------')

..... 不想等它跑完了,會一直跑到score不再變化或者round=100的時候為止~
到此這篇關于Python機器學習應用之工業蒸汽數據分析篇詳解的文章就介紹到這了,更多相關Python 工業蒸汽數據分析內容請搜索腳本之家以前的文章或繼續瀏覽下面的相關文章,希望大家以后多多支持腳本之家!
相關文章
Python使用Selenium爬取淘寶異步加載的數據方法
今天小編就為大家分享一篇Python使用Selenium爬取淘寶異步加載的數據方法,具有很好的參考價值,希望對大家有所幫助。一起跟隨小編過來看看吧 2018-12-12
python使用rabbitmq實現網絡爬蟲示例
這篇文章主要介紹了python使用RabbitMQ實現網絡爬蟲的示例,需要的朋友可以參考下 2014-02-02
Django中URLconf和include()的協同工作方法
這篇文章主要介紹了Django中URLconf和include()的協同工作方法,Django是Python眾人氣框架中最著名的一個,需要的朋友可以參考下 2015-07-07

