# 2. Handling missing values
# When analyzing the data, check whether it contains missing values. Some machine learning algorithms can handle missing values, e.g. neural networks, while others cannot.
# Common ways to deal with missing values:
# (1) If the dataset is large and only a few values are missing, drop the rows containing them (a minimal sketch follows below);
# (2) If the attribute is not very important for learning, fill the missing values with the mean or the mode;
# (3) For nominal attributes, assign a dedicated "missing" value such as 'U0', since missingness itself may carry information. For example, a missing Cabin value may mean the passenger had no cabin.
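As a minimal sketch of option (1) (illustrative only; it is not applied below, since we keep all 891 rows):

# option (1), for illustration: drop the rows whose Age is missing
train_data_complete = train_data.dropna(subset=['Age'])  # hypothetical; not used later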
# fill the missing Embarked values with the mode
train_data['Embarked'] = train_data['Embarked'].fillna(train_data['Embarked'].mode()[0])
# replace missing Cabin values with 'U0'
train_data['Cabin'] = train_data['Cabin'].fillna('U0')  # equivalent: train_data.loc[train_data.Cabin.isnull(), 'Cabin'] = 'U0'
# (4) Use a model such as regression or a random forest to predict the missing values. Age is a fairly important feature in this dataset (a preliminary analysis of Age shows this), so filling its missing values accurately matters and has a noticeable effect on the final result. The usual approach is to train the model on the complete records and use it to predict the missing values. For this dataset, either a random forest or linear regression would work; here we use a random forest, taking only the numeric attributes as features (sklearn models accept only numeric inputs, so we restrict ourselves to numeric features here; in a real application, non-numeric features would first be converted to numeric ones).
from sklearn.ensemble import RandomForestRegressor

# choose training data to predict Age
age_df = train_data[['Age','Survived','Fare','Parch','SibSp','Pclass']]
age_df_notnull = age_df.loc[(train_data['Age'].notnull())]
age_df_isnull = age_df.loc[(train_data['Age'].isnull())]
X = age_df_notnull.values[:,1:]
Y = age_df_notnull.values[:, 0]

# use RandomForestRegressor to fit the training data
RFR = RandomForestRegressor(n_estimators=1000, n_jobs=-1)
RFR.fit(X,Y)
predictAges = RFR.predict(age_df_isnull.values[:,1:])
train_data.loc[train_data['Age'].isnull(), 'Age'] = predictAges
train_data.info()
# 3.3 Age and survival (Age)
# Analyze the age distribution and its relationship with survival for each passenger class and each sex:
fig, ax = plt.subplots(1, 2, figsize=(18, 5))
ax[0].set_yticks(range(0,110,10))
sns.violinplot("Pclass","Age",hue="Survived",data=train_data,split=True,ax=ax[0])
ax[0].set_title('Pclass and Age vs Survived')
ax[1].set_yticks(range(0, 110, 10))
sns.violinplot("Sex","Age",hue="Survived",data=train_data,split=True,ax=ax[1])
ax[1].set_title('Sex and Age vs Survived')
plt.show()
# 3.9 Cabin type and survival (Cabin)
# Cabin has far too many missing values, with only 204 valid entries, so it is hard to relate the individual cabins to survival; during feature engineering this feature can simply be dropped. Still, we can take a quick look by grouping all missing values into one class: treat "has a Cabin record or not" as a feature and analyze it against survival:
# Replace missing values with "U0"
train_data.loc[train_data.Cabin.isnull(),'Cabin']='U0'
train_data['Has_Cabin'] = train_data['Cabin'].apply(lambda x: 0 if x == 'U0' else 1)
train_data[['Has_Cabin','Survived']].groupby(['Has_Cabin']).mean().plot.bar()
# Analyze the different cabin types:
# create feature for the alphabetical part of the cabin number
import re

train_data['CabinLetter'] = train_data['Cabin'].map(lambda x: re.compile("([a-zA-Z]+)").search(x).group())

# convert the distinct cabin letters into incremental integer values
train_data['CabinLetter']= pd.factorize(train_data['CabinLetter'])[0]
train_data[['CabinLetter','Survived']].groupby(['CabinLetter']).mean().plot.bar()
# The survival rate does vary across cabin letters, but not by much, so this feature can simply be dropped during preprocessing.
# 3.10 Port of embarkation and survival (Embarked)
# The Titanic departed from Southampton, England, calling at Cherbourg, France and Queenstown, Ireland. Passengers who boarded before Queenstown may have disembarked at Cherbourg or Queenstown, and those passengers would not have been caught in the disaster.
sns.countplot('Embarked', hue='Survived', data=train_data)
plt.title('Embarked and Survived')
sns.factorplot('Embarked','Survived',data = train_data, size=3, aspect=2)
plt.title('Embarked and Survived rate')
plt.show()
# 4.2 Factorizing
# Dummy encoding does not handle a nominal attribute like Cabin well, because it takes many distinct values. Pandas therefore provides factorize(), which maps each category to an integer ID; unlike dummy encoding, this mapping produces a single feature instead of one feature per category.
# Replace missing values with "U0"
train_data.loc[train_data.Cabin.isnull(), 'Cabin'] = 'U0'

# create feature for the alphabetical part of the cabin number
train_data['CabinLetter'] = train_data['Cabin'].map(lambda x: re.compile("([a-zA-Z]+)").search(x).group())

# convert the distinct cabin letters into incremental integer values
train_data['CabinLetter'] = pd.factorize(train_data['CabinLetter'])[0]
train_data[['Cabin', 'CabinLetter']].head()
  Cabin  CabinLetter
0    U0            0
1   C85            1
2    U0            0
3  C123            1
4    U0            0
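Note that pd.factorize returns both the integer codes and the array of unique labels, so the encoding can be inverted if needed. A small sketch (the variable names here are illustrative):

# codes holds the integer IDs, uniques the distinct cabin letters;
# uniques[codes] reconstructs the original letter for every row
codes, uniques = pd.factorize(train_data['Cabin'].map(lambda x: re.compile("([a-zA-Z]+)").search(x).group()))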
# 4.3 Scaling
# Scaling maps values from a large range into a small one (typically -1 to 1, or 0 to 1). In many cases features must be scaled to comparable ranges, otherwise features with large ranges receive disproportionately high weight. For example, Age may only range over 0-100 while income may range over 0-10,000,000, which distorts the results of models that are sensitive to feature magnitudes.
# Scale Age as follows:
from sklearn import preprocessing

assert np.size(train_data['Age']) == 891

# StandardScaler subtracts the mean from each value, then scales to unit variance
scaler = preprocessing.StandardScaler()
train_data['Age_scaled'] = scaler.fit_transform(train_data['Age'].values.reshape(-1, 1))
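As a quick sanity check (not part of the original output), the standardized column should have mean ≈ 0 and standard deviation ≈ 1:

# verify the standardization: mean should be ~0 and std ~1
print(train_data['Age_scaled'].mean(), train_data['Age_scaled'].std())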
# After binning, the data should be either factorized or converted to dummies.
# qcut() creates a new variable that identifies the quartile range, but we can't use the string
# so either factorize or create dummies from the result
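Fare_bin itself is assumed to have been created in the earlier binning step of the post; a minimal sketch of that step (five quantile bins is an assumption, the original bin count is not shown in this section):

# assumed earlier binning step (sketch): quantile-based Fare bins
train_data['Fare_bin'] = pd.qcut(train_data['Fare'], 5)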
# factorize
train_data['Fare_bin_id'] = pd.factorize(train_data['Fare_bin'])[0]

# dummies
fare_bin_dummies_df = pd.get_dummies(train_data['Fare_bin']).rename(columns=lambda x: 'Fare_' + str(x))
train_data = pd.concat([train_data, fare_bin_dummies_df], axis=1)
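To confirm the result, a quick inspection sketch (the exact dummy column names depend on the qcut intervals):

# the new columns include Fare_bin_id plus one Fare_* dummy per bin
print(train_data.filter(regex='^Fare_').head())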
# 6. Model ensembling and testing
# 6.1 Use several different models to rank the features and select the most important ones:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import model_selection

def get_top_n_features(titanic_train_data_X, titanic_train_data_Y, top_n_features):
    # random forest
    rf_est = RandomForestClassifier(random_state=0)
    rf_param_grid = {'n_estimators': [500], 'min_samples_split': [2, 3], 'max_depth': [20]}
    rf_grid = model_selection.GridSearchCV(rf_est, rf_param_grid, n_jobs=25, cv=10, verbose=1)
    rf_grid.fit(titanic_train_data_X, titanic_train_data_Y)
    print('Top N Features Best RF Params:' + str(rf_grid.best_params_))
    print('Top N Features Best RF Score:' + str(rf_grid.best_score_))
    print('Top N Features RF Train Score:' + str(rf_grid.score(titanic_train_data_X, titanic_train_data_Y)))
    feature_imp_sorted_rf = pd.DataFrame({'feature': list(titanic_train_data_X),
                                          'importance': rf_grid.best_estimator_.feature_importances_}).sort_values('importance', ascending=False)
    features_top_n_rf = feature_imp_sorted_rf.head(top_n_features)['feature']
    print('Sample 10 Features from RF Classifier:')
    print(str(features_top_n_rf[:10]))

    # AdaBoost
    ada_est = AdaBoostClassifier(random_state=0)
    ada_param_grid = {'n_estimators': [500], 'learning_rate': [0.01, 0.1]}
    ada_grid = model_selection.GridSearchCV(ada_est, ada_param_grid, n_jobs=25, cv=10, verbose=1)
    ada_grid.fit(titanic_train_data_X, titanic_train_data_Y)
    print('Top N Features Best Ada Params:' + str(ada_grid.best_params_))
    print('Top N Features Best Ada Score:' + str(ada_grid.best_score_))
    print('Top N Features Ada Train Score:' + str(ada_grid.score(titanic_train_data_X, titanic_train_data_Y)))
    feature_imp_sorted_ada = pd.DataFrame({'feature': list(titanic_train_data_X),
                                           'importance': ada_grid.best_estimator_.feature_importances_}).sort_values('importance', ascending=False)
    features_top_n_ada = feature_imp_sorted_ada.head(top_n_features)['feature']
    print('Sample 10 Features from Ada Classifier:')
    print(str(features_top_n_ada[:10]))

    # ExtraTrees
    et_est = ExtraTreesClassifier(random_state=0)
    et_param_grid = {'n_estimators': [500], 'min_samples_split': [3, 4], 'max_depth': [20]}
    et_grid = model_selection.GridSearchCV(et_est, et_param_grid, n_jobs=25, cv=10, verbose=1)
    et_grid.fit(titanic_train_data_X, titanic_train_data_Y)
    print('Top N Features Best ET Params:' + str(et_grid.best_params_))
    print('Top N Features Best ET Score:' + str(et_grid.best_score_))
    print('Top N Features ET Train Score:' + str(et_grid.score(titanic_train_data_X, titanic_train_data_Y)))
    feature_imp_sorted_et = pd.DataFrame({'feature': list(titanic_train_data_X),
                                          'importance': et_grid.best_estimator_.feature_importances_}).sort_values('importance', ascending=False)
    features_top_n_et = feature_imp_sorted_et.head(top_n_features)['feature']
    print('Sample 10 Features from ET Classifier:')
    print(str(features_top_n_et[:10]))

    # GradientBoosting
    gb_est = GradientBoostingClassifier(random_state=0)
    gb_param_grid = {'n_estimators': [500], 'learning_rate': [0.01, 0.1], 'max_depth': [20]}
    gb_grid = model_selection.GridSearchCV(gb_est, gb_param_grid, n_jobs=25, cv=10, verbose=1)
    gb_grid.fit(titanic_train_data_X, titanic_train_data_Y)
    print('Top N Features Best GB Params:' + str(gb_grid.best_params_))
    print('Top N Features Best GB Score:' + str(gb_grid.best_score_))
    print('Top N Features GB Train Score:' + str(gb_grid.score(titanic_train_data_X, titanic_train_data_Y)))
    feature_imp_sorted_gb = pd.DataFrame({'feature': list(titanic_train_data_X),
                                          'importance': gb_grid.best_estimator_.feature_importances_}).sort_values('importance', ascending=False)
    features_top_n_gb = feature_imp_sorted_gb.head(top_n_features)['feature']
    print('Sample 10 Features from GB Classifier:')
    print(str(features_top_n_gb[:10]))

    # DecisionTree
    dt_est = DecisionTreeClassifier(random_state=0)
    dt_param_grid = {'min_samples_split': [2, 4], 'max_depth': [20]}
    dt_grid = model_selection.GridSearchCV(dt_est, dt_param_grid, n_jobs=25, cv=10, verbose=1)
    dt_grid.fit(titanic_train_data_X, titanic_train_data_Y)
    print('Top N Features Best DT Params:' + str(dt_grid.best_params_))
    print('Top N Features Best DT Score:' + str(dt_grid.best_score_))
    print('Top N Features DT Train Score:' + str(dt_grid.score(titanic_train_data_X, titanic_train_data_Y)))
    feature_imp_sorted_dt = pd.DataFrame({'feature': list(titanic_train_data_X),
                                          'importance': dt_grid.best_estimator_.feature_importances_}).sort_values('importance', ascending=False)
    features_top_n_dt = feature_imp_sorted_dt.head(top_n_features)['feature']
    print('Sample 10 Features from DT Classifier:')
    print(str(features_top_n_dt[:10]))

    # merge the five models
    features_top_n = pd.concat([features_top_n_rf, features_top_n_ada, features_top_n_et,
                                features_top_n_gb, features_top_n_dt],
                               ignore_index=True).drop_duplicates()
    features_importance = pd.concat([feature_imp_sorted_rf, feature_imp_sorted_ada, feature_imp_sorted_et,
                                     feature_imp_sorted_gb, feature_imp_sorted_dt],
                                    ignore_index=True)
    return features_top_n, features_importance
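The log below comes from a call along these lines (picking the top 30 features is an assumption; the exact count used is not shown in this section):

feature_to_pick = 30
feature_top_n, feature_importance = get_top_n_features(titanic_train_data_X, titanic_train_data_Y, feature_to_pick)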
Fitting 10 folds for each of 2 candidates, totalling 20 fits
[Parallel(n_jobs=25)]: Using backend LokyBackend with 25 concurrent workers.
[Parallel(n_jobs=25)]: Done 13 out of 20 | elapsed: 6.4s remaining: 3.4s
[Parallel(n_jobs=25)]: Done 20 out of 20 | elapsed: 7.7s finished
Top N Features Best RF Params:{'max_depth': 20, 'min_samples_split': 3, 'n_estimators': 500}
Top N Features Best RF Score:0.8271785268414481
Top N Features RF Train Score:0.9764309764309764
Sample 10 Features from RF Classifier:
1 Age
17 Name_length
2 Fare
8 Sex_1
9 Title
11 Title_0
7 Sex_0
29 Family_Size
0 Pclass
33 Ticket_Letter
Name: feature, dtype: object
Fitting 10 folds for each of 2 candidates, totalling 20 fits
[Parallel(n_jobs=25)]: Using backend LokyBackend with 25 concurrent workers.
[Parallel(n_jobs=25)]: Done 13 out of 20 | elapsed: 3.7s remaining: 2.0s
[Parallel(n_jobs=25)]: Done 20 out of 20 | elapsed: 5.0s finished
Top N Features Best Ada Params:{'learning_rate': 0.01, 'n_estimators': 500}
Top N Features Best Ada Score:0.8181897627965042
Top N Features Ada Train Score:0.8204264870931538
Sample 10 Features from Ada Classifier:
11 Title_0
2 Fare
30 Family_Size_Category_0
29 Family_Size
7 Sex_0
0 Pclass
3 Cabin
8 Sex_1
17 Name_length
1 Age
Name: feature, dtype: object
Fitting 10 folds for each of 2 candidates, totalling 20 fits
[Parallel(n_jobs=25)]: Using backend LokyBackend with 25 concurrent workers.
[Parallel(n_jobs=25)]: Done 13 out of 20 | elapsed: 3.5s remaining: 1.9s
[Parallel(n_jobs=25)]: Done 20 out of 20 | elapsed: 4.0s finished
Top N Features Best ET Params:{'max_depth': 20, 'min_samples_split': 4, 'n_estimators': 500}
Top N Features Best ET Score:0.8237952559300874
Top N Features ET Train Score:0.9708193041526375
Sample 10 Features from ET Classifier:
11 Title_0
7 Sex_0
8 Sex_1
17 Name_length
1 Age
2 Fare
3 Cabin
9 Title
33 Ticket_Letter
13 Title_2
Name: feature, dtype: object
Fitting 10 folds for each of 2 candidates, totalling 20 fits
[Parallel(n_jobs=25)]: Using backend LokyBackend with 25 concurrent workers.
[Parallel(n_jobs=25)]: Done 13 out of 20 | elapsed: 12.3s remaining: 6.6s
[Parallel(n_jobs=25)]: Done 20 out of 20 | elapsed: 12.7s finished
Top N Features Best GB Params:{'learning_rate': 0.1, 'max_depth': 20, 'n_estimators': 500}
Top N Features Best GB Score:0.7835081148564295
Top N Features GB Train Score:0.9966329966329966
Sample 10 Features from GB Classifier:
11 Title_0
1 Age
2 Fare
17 Name_length
30 Family_Size_Category_0
29 Family_Size
0 Pclass
9 Title
28 Pclass_5
33 Ticket_Letter
Name: feature, dtype: object
Fitting 10 folds for each of 2 candidates, totalling 20 fits
Top N Features Best DT Params:{'max_depth': 20, 'min_samples_split': 4}
Top N Features Best DT Score:0.7823220973782771
Top N Features DT Train Score:0.9607182940516273
Sample 10 Features from DT Classifier:
11 Title_0
1 Age
2 Fare
17 Name_length
30 Family_Size_Category_0
16 Title_5
28 Pclass_5
0 Pclass
33 Ticket_Letter
29 Family_Size
Name: feature, dtype: object
[Parallel(n_jobs=25)]: Using backend LokyBackend with 25 concurrent workers.
[Parallel(n_jobs=25)]: Done 13 out of 20 | elapsed: 0.1s remaining: 0.0s
[Parallel(n_jobs=25)]: Done 20 out of 20 | elapsed: 0.1s finished
# Visualize the feature rankings produced by the different algorithms:
rf_feature_imp = feature_importance[:10]
Ada_feature_imp = feature_importance[32:32+10].reset_index(drop=True)

# make importances relative to the max importance
rf_feature_importance = 100.0 * (rf_feature_imp['importance'] / rf_feature_imp['importance'].max())
Ada_feature_importance = 100.0 * (Ada_feature_imp['importance'] / Ada_feature_imp['importance'].max())

# Get the indexes of all features over the importance threshold
rf_important_idx = np.where(rf_feature_importance)[0]
Ada_important_idx = np.where(Ada_feature_importance)[0]

# Adapted from http://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_regression.html
pos = np.arange(rf_important_idx.shape[0]) + .5

plt.figure(1, figsize=(18, 8))
plt.subplot(121)
plt.barh(pos, rf_feature_importance[rf_important_idx][::-1])
plt.yticks(pos, rf_feature_imp['feature'][::-1])
plt.xlabel('Relative Importance')
plt.title('RandomForest Feature Importance')

plt.subplot(122)
plt.barh(pos, Ada_feature_importance[Ada_important_idx][::-1])
plt.yticks(pos, Ada_feature_imp['feature'][::-1])
plt.xlabel('Relative Importance')
plt.title('AdaBoost Feature Importance')
plt.show()
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

rf = RandomForestClassifier(n_estimators=500, warm_start=True, max_features='sqrt', max_depth=6,
                            min_samples_split=3, min_samples_leaf=2, n_jobs=-1, verbose=0)
ada = AdaBoostClassifier(n_estimators=500, learning_rate=0.1)
et = ExtraTreesClassifier(n_estimators=500, n_jobs=-1, max_depth=8, min_samples_leaf=2, verbose=0)
gb = GradientBoostingClassifier(n_estimators=500, learning_rate=0.008, min_samples_split=3,
                                min_samples_leaf=2, max_depth=5, verbose=0)
dt = DecisionTreeClassifier(max_depth=8)
knn = KNeighborsClassifier(n_neighbors=2)
svm = SVC(kernel='linear', C=0.025)
# Convert the pandas DataFrames to numpy arrays:
# Create numpy arrays of the train, test and target (Survived) dataframes to feed into our models
x_train = titanic_train_data_X.values #Creates an array of the train data
x_test = titanic_test_data_X.values #Creates an array of the test data
y_train = titanic_train_data_Y.values
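The helper get_out_fold used below is defined earlier in the full post; for reference, here is a minimal sketch of a standard K-fold out-of-fold helper (the fold count and seed are assumptions, not necessarily the author's exact values):

from sklearn.model_selection import KFold

# assumptions for this sketch: 7 folds and seed 0
ntrain = x_train.shape[0]
ntest = x_test.shape[0]
SEED = 0
NFOLDS = 7
kf = KFold(n_splits=NFOLDS, random_state=SEED, shuffle=True)

def get_out_fold(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))
    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        x_tr, y_tr = x_train[train_index], y_train[train_index]
        x_te = x_train[test_index]
        clf.fit(x_tr, y_tr)
        # predictions on the held-out fold become the train-set meta-feature
        oof_train[test_index] = clf.predict(x_te)
        # predictions on the test set are averaged over all folds
        oof_test_skf[i, :] = clf.predict(x_test)
    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)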
# Create our OOF train and test predictions. These base results will be used as new features
rf_oof_train,rf_oof_test = get_out_fold(rf,x_train,y_train,x_test)# Random Forest
ada_oof_train,ada_oof_test = get_out_fold(ada,x_train,y_train,x_test)# AdaBoost
et_oof_train,et_oof_test = get_out_fold(et,x_train,y_train,x_test)# Extra Trees
gb_oof_train,gb_oof_test = get_out_fold(gb,x_train,y_train,x_test)# Gradient Boost
dt_oof_train,dt_oof_test = get_out_fold(dt,x_train,y_train,x_test)#Decision Tree
knn_oof_train,knn_oof_test = get_out_fold(knn,x_train,y_train,x_test)# KNeighbors
svm_oof_train, svm_oof_test = get_out_fold(svm, x_train, y_train, x_test)  # Support Vector

print("Training is complete")
from sklearn.model_selection import learning_curve
# from sklearn.learning_curve import learning_curve

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1,
                        train_sizes=np.linspace(.1, 1.0, 5), verbose=0):
    """Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.
    title : string
        Title for the chart.
    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.
    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum y-values plotted.
    cv : integer, cross-validation generator, optional
        If an integer is passed, it is the number of folds.
        Specific cross-validation objects can be passed; see the
        sklearn.model_selection module for the list of possible objects.
    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color='r')
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color='g')
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
    plt.legend(loc="best")
    return plt
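A sketch of how it can be called (the choice of model and of 10 CV folds here are assumptions):

# example: learning curve for the RandomForest model defined above
plot_learning_curve(rf, 'RandomForest Learning Curve', x_train, y_train, cv=10, n_jobs=-1)
plt.show()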