This article walks through using a scikit-learn decision tree to predict the survivors of the Titanic; hopefully it serves as a useful reference for developers tackling similar problems.
Decision tree user guide:
http://sklearn.apachecn.org/cn/0.19.0/modules/tree.html
Decision tree predictions are easy to understand and interpret, and prediction is fast.
Entropy-based classifiers:
ID3, C4.5,
C5.0: more computationally efficient, uses less memory, builds smaller trees with high accuracy, and is well suited to large datasets;
Gini-impurity-based:
CART (Classification and Regression Trees).
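The difference between the two impurity measures is easy to see numerically. Below is a minimal sketch (not from the original article; the helper names are illustrative) that computes entropy and Gini impurity for a binary class distribution:

import numpy as np

def entropy(p):
    # H = -sum(p_i * log2(p_i)) over the class probabilities
    probs = np.array([p, 1 - p])
    probs = probs[probs > 0]  # treat 0 * log(0) as 0
    return -np.sum(probs * np.log2(probs))

def gini(p):
    # G = 1 - sum(p_i^2) over the class probabilities
    return 1 - (p ** 2 + (1 - p) ** 2)

for p in (0.5, 0.7, 0.9, 0.99):
    print('p={0}: entropy={1:.3f}, gini={2:.3f}'.format(p, entropy(p), gini(p)))
# Both measures peak at p=0.5 (maximum impurity) and fall toward 0 as the node becomes pure.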
sklearn.tree.DecisionTreeClassifier
Key parameters:
criterion: the impurity measure used for feature selection, entropy or gini. The two make little difference to model accuracy, but entropy is slower to compute because of the log operation (see the timing sketch after this list);
max_depth: the maximum depth of the tree; tuning this parameter helps control overfitting.
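Whether entropy really fits more slowly is easy to check. Here is a minimal sketch (the synthetic X_demo/y_demo are stand-ins, not the Titanic data; exact timings vary by machine):

from time import time

import numpy as np
from sklearn.tree import DecisionTreeClassifier

rng = np.random.RandomState(42)
X_demo = rng.rand(5000, 7)        # synthetic features, shaped like the Titanic data
y_demo = rng.randint(0, 2, 5000)  # random labels force a deep tree, so fit time is measurable

for criterion in ('gini', 'entropy'):
    clf = DecisionTreeClassifier(criterion=criterion)
    start = time()
    clf.fit(X_demo, y_demo)
    print('{0}: fit took {1:.4f}s'.format(criterion, time() - start))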
- Predicting Titanic survivors with a decision tree
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Data cleaning
def read_dataset(fname):
    # Use the first column as the row index
    data = pd.read_csv(fname, index_col=0)
    # Drop columns that are not useful as features
    data.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
    # Encode sex as an integer (male=1, female=0)
    data['Sex'] = (data['Sex'] == 'male').astype('int')
    # Encode the port of embarkation as an integer label
    labels = data['Embarked'].unique().tolist()
    data['Embarked'] = data['Embarked'].apply(lambda n: labels.index(n))
    # Fill missing values with 0
    data = data.fillna(0)
    return data

train = read_dataset('datasets/titanic/train.csv')

from sklearn.model_selection import train_test_split

y = train['Survived'].values
X = train.drop(['Survived'], axis=1).values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print('train dataset: {0}; test dataset: {1}'.format(X_train.shape, X_test.shape))
# train dataset: (712, 7); test dataset: (179, 7)

from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)
print('train score: {0}; test score: {1}'.format(train_score, test_score))
# train score: 0.9831460674157303; test score: 0.8044692737430168
- Parameter selection
# Parameter tuning: max_depth
def cv_score(d):
    clf = DecisionTreeClassifier(max_depth=d)
    clf.fit(X_train, y_train)
    tr_score = clf.score(X_train, y_train)
    cv_score = clf.score(X_test, y_test)
    return (tr_score, cv_score)

depths = range(2, 15)
scores = [cv_score(d) for d in depths]
tr_scores = [s[0] for s in scores]
cv_scores = [s[1] for s in scores]

# Find the depth with the best cross-validation score
best_score_index = np.argmax(cv_scores)
best_score = cv_scores[best_score_index]
best_param = depths[best_score_index]
print('best param: {0}; best score: {1}'.format(best_param, best_score))

plt.figure(figsize=(10, 6), dpi=144)
plt.grid()
plt.xlabel('max depth of decision tree')
plt.ylabel('score')
plt.plot(depths, cv_scores, '.g-', label='cross-validation score')
plt.plot(depths, tr_scores, '.r--', label='training score')
plt.legend()
# best param: 11; best score: 0.8435754189944135

# Parameter tuning: min_impurity_split. Train the model and compute its scores.
def cv_score(val):
    clf = DecisionTreeClassifier(criterion='gini', min_impurity_split=val)
    clf.fit(X_train, y_train)
    tr_score = clf.score(X_train, y_train)
    cv_score = clf.score(X_test, y_test)
    return (tr_score, cv_score)

# Specify the parameter range, train a model for each value, and collect scores
values = np.linspace(0, 0.5, 50)
scores = [cv_score(v) for v in values]
tr_scores = [s[0] for s in scores]
cv_scores = [s[1] for s in scores]

# Find the parameter with the best cross-validation score
best_score_index = np.argmax(cv_scores)
best_score = cv_scores[best_score_index]
best_param = values[best_score_index]
print('best param: {0}; best score: {1}'.format(best_param, best_score))

# Plot model score against the parameter value
plt.figure(figsize=(10, 6), dpi=144)
plt.grid()
plt.xlabel('threshold of entropy')
plt.ylabel('score')
plt.plot(values, cv_scores, '.g-', label='cross-validation score')
plt.plot(values, tr_scores, '.r--', label='training score')
plt.legend()
# best param: 0.21428571428571427; best score: 0.8715083798882681

def plot_curve(train_sizes, cv_results, xlabel):
    train_scores_mean = cv_results['mean_train_score']
    train_scores_std = cv_results['std_train_score']
    test_scores_mean = cv_results['mean_test_score']
    test_scores_std = cv_results['std_test_score']
    plt.figure(figsize=(10, 6), dpi=144)
    plt.title('parameters tuning')
    plt.grid()
    plt.xlabel(xlabel)
    plt.ylabel('score')
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, '.--', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, '.-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")

from sklearn.model_selection import GridSearchCV

thresholds = np.linspace(0, 0.5, 50)
# Set the parameters by cross-validation
param_grid = {'min_impurity_split': thresholds}

clf = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
clf.fit(X, y)
print("best param: {0}\nbest score: {1}".format(clf.best_params_, clf.best_score_))plot_curve(thresholds, clf.cv_results_, xlabel='gini thresholds')#best param: {'min_impurity_split': 0.22448979591836732}
#best score: 0.820426487093#多参数选择
from sklearn.model_selection import GridSearchCV

entropy_thresholds = np.linspace(0, 1, 50)
gini_thresholds = np.linspace(0, 0.5, 50)

# Set the parameters by cross-validation
param_grid = [
    {'criterion': ['entropy'], 'min_impurity_split': entropy_thresholds},
    {'criterion': ['gini'], 'min_impurity_split': gini_thresholds},
    {'max_depth': range(2, 10)},
    {'min_samples_split': range(2, 30, 2)},
]

clf = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
clf.fit(X, y)
print("best param: {0}\nbest score: {1}".format(clf.best_params_, clf.best_score_))#生成决策树图形
from sklearn.tree import export_graphviz

clf = DecisionTreeClassifier(criterion='entropy', min_impurity_split=0.53061224489795911)
clf.fit(X_train, y_train)
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)
print('train score: {0}; test score: {1}'.format(train_score, test_score))
# train score: 0.933988764045; test score: 0.798882681564

# Export the titanic.dot file
with open("titanic.dot", 'w') as f:
    f = export_graphviz(clf, out_file=f)

# 1. Install Graphviz on your machine
# 2. Run `dot -Tpng titanic.dot -o titanic.png`
# 3. Open the generated decision-tree image titanic.png in the current directory
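If installing Graphviz is inconvenient, newer scikit-learn releases (0.21 and later; this article targets the 0.19 API, so treat the availability as an assumption about your environment) provide sklearn.tree.plot_tree, which draws the same tree with matplotlib alone:

import matplotlib.pyplot as plt
from sklearn.tree import plot_tree  # available in scikit-learn >= 0.21

plt.figure(figsize=(20, 10), dpi=144)
plot_tree(clf, filled=True)  # clf is the fitted classifier from above
plt.savefig('titanic_tree.png')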
Further reading:
Ensemble methods can effectively address overfitting; see sklearn.ensemble (a short random-forest sketch follows this list):
Bagging: BaggingClassifier / BaggingRegressor
Boosting: AdaBoostClassifier / AdaBoostRegressor
Random forests: RandomForestClassifier / RandomForestRegressor
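As a quick taste of the ensemble API, here is a minimal sketch (an illustration, not part of the original article) that swaps the single tree for a random forest on the same X_train/y_train split from above:

from sklearn.ensemble import RandomForestClassifier

# Averaging many randomized trees usually generalizes better than one deep tree
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)
print('train score: {0}; test score: {1}'.format(
    rf.score(X_train, y_train), rf.score(X_test, y_test)))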
That concludes this introduction to predicting Titanic survivors with scikit-learn decision trees; I hope it proves helpful.