This article, part of the [Beginner Series], shows how to visualize the decision tree of a Titanic survival prediction model using pydot + GraphViz. We hope it offers a useful reference for solving this kind of programming problem; interested developers, follow along!
For detailed GraphViz installation instructions, please follow the link.
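Before running the code below, it is worth confirming that the Graphviz binaries and the pydotplus package can find each other. The following is a minimal sketch, not part of the original script, assuming pydotplus is installed and the Graphviz bin directory is on your PATH; it simply renders a trivial dot graph:

import pydotplus  # quick toolchain check only
graph = pydotplus.graph_from_dot_data('digraph G { a -> b; }')
graph.write_png('graphviz_check.png')  # fails if the Graphviz executables are not on PATH

If graphviz_check.png appears, the toolchain is ready and the visualization at the end of this post should work.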
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier  # decision tree classifier from sklearn
from sklearn.feature_extraction import DictVectorizer  # feature extraction: converts a list of feature-value mapping dicts into vectors
from sklearn.model_selection import cross_val_score  # cross-validation scoring
from sklearn import metrics  # evaluation metrics provided by sklearn
import matplotlib.pyplot as plt
import seaborn as sns
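The DictVectorizer imported above is never called in the listing that follows (the one-hot encoding is done with pandas instead). As a small illustration of what it does, here is a hypothetical example with made-up rows, not Titanic data:

# Illustration only: DictVectorizer turns dicts into a numeric feature matrix
dv = DictVectorizer(sparse=False)
sample = [{'Sex': 'male', 'Age': 22.0}, {'Sex': 'female', 'Age': 38.0}]
print(dv.fit_transform(sample))  # one row per dict, one column per feature value
print(dv.feature_names_)         # e.g. ['Age', 'Sex=female', 'Sex=male']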
# Load the data
train_data = pd.read_csv('./train.csv')
test_data = pd.read_csv('./test.csv')

# Fill NaN values in Age with the mean age
train_data['Age'].fillna(train_data['Age'].mean(), inplace=True)
test_data['Age'].fillna(test_data['Age'].mean(),inplace=True)
# Fill NaN values in Fare with the mean fare
train_data['Fare'].fillna(train_data['Fare'].mean(), inplace=True)
test_data['Fare'].fillna(test_data['Fare'].mean(),inplace=True)
#print(train_data['Embarked'].value_counts())
# Fill NaN values in Embarked with the most common port of embarkation
train_data['Embarked'].fillna('S', inplace=True)
test_data['Embarked'].fillna('S',inplace=True)
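As a sanity check (not in the original post), you can verify that the filled columns no longer contain missing values before moving on:

# Optional check: count remaining NaN values per column
print(train_data[['Age', 'Fare', 'Embarked']].isnull().sum())
print(test_data[['Age', 'Fare', 'Embarked']].isnull().sum())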
# Feature selection
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
train_features = train_data[features]

# Show the correlation coefficients between features
plt.figure(figsize=(10, 10))
plt.title('Pearson Correlation between Features',y=1.05,size=15)
# One-hot encode the categorical columns so the correlation matrix can be computed
train_data_hot_encoded = train_features.drop('Embarked', axis=1).join(train_features.Embarked.str.get_dummies())
train_data_hot_encoded = train_data_hot_encoded.drop('Sex', axis=1).join(train_data_hot_encoded.Sex.str.get_dummies())
# Compute the Pearson correlation coefficients between features; for details see:
# https://blog.csdn.net/KaelCui/article/details/105235136
sns.heatmap(train_data_hot_encoded.corr(), linewidths=0.1, vmax=1.0, fmt='.2f', square=True, linecolor='white', annot=True)
plt.show()
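Each cell in the heatmap above is just the Pearson correlation between two of the encoded columns. As a small illustration (column names as produced by the encoding above), a single coefficient can be computed directly:

# Pearson correlation between Pclass and Fare, computed directly
print(train_data_hot_encoded['Pclass'].corr(train_data_hot_encoded['Fare']))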
# Visualize the distribution of Survived values with a pie chart
#print(type(train_data["Survived"].value_counts()))
train_data["Survived"].value_counts().plot(kind="pie", label='Survived')
plt.show()

# Survival rate by Pclass (bar chart)
sns.barplot(x='Pclass', y='Survived', data=train_data)
plt.show()

# Survival rate by Embarked (bar chart)
sns.barplot(x='Embarked', y='Survived', data=train_data)
plt.show()

# Train the model and show the importance of each feature
def train(train_features, train_labels):
    # Build a CART decision tree
    clf = DecisionTreeClassifier()
    # Train the decision tree
    clf.fit(train_features, train_labels)
    # Show the importance of each feature
    coeffs = clf.feature_importances_
    #print(coeffs)
    df_co = pd.DataFrame(coeffs, columns=["importance_"])
    # Use the feature names as the index
    df_co.index = train_features.columns
    #print(df_co.index)
    df_co.sort_values("importance_", ascending=True, inplace=True)
    df_co.importance_.plot(kind="barh")
    plt.title("Feature Importance")
    plt.show()
    return clf

clf = train(train_data_hot_encoded, train_data["Survived"])
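The cross_val_score imported at the top is never actually called in the original listing. Here is a short sketch of how it could be used to estimate the tree's accuracy, assuming the same train_data_hot_encoded feature matrix:

# Optional: 10-fold cross-validation accuracy for a decision tree on the same features
scores = cross_val_score(DecisionTreeClassifier(), train_data_hot_encoded, train_data["Survived"], cv=10)
print("mean CV accuracy: %.3f" % scores.mean())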
# Visualize the decision tree
import pydotplus  # turns the dot data generated by sklearn into a renderable graph
from sklearn.externals.six import StringIO  # on scikit-learn >= 0.23, use: from io import StringIO
from sklearn.tree import export_graphviz  # exports the tree in Graphviz dot format

def show_tree(clf):
    dot_data = StringIO()
    export_graphviz(clf, out_file=dot_data)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("titanic_tree.pdf")

show_tree(clf)
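The default export labels nodes as X[0], X[1], and so on. If you want readable node labels, export_graphviz also accepts feature and class names; a small variant of the function above (a sketch, reusing the same clf and column names) is:

# Variant with readable labels; writes a second, annotated PDF
dot_data = StringIO()
export_graphviz(clf, out_file=dot_data,
                feature_names=list(train_data_hot_encoded.columns),
                class_names=['died', 'survived'],
                filled=True, rounded=True)
pydotplus.graph_from_dot_data(dot_data.getvalue()).write_pdf("titanic_tree_labeled.pdf")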
This concludes the [Beginner Series] article on visualizing the decision tree of a Titanic survival prediction model with pydot + GraphViz. We hope the articles we recommend are helpful to fellow programmers!