This post walks through my solution to the Tianchi beginner competition "零基础入门数据挖掘 - 二手车交易价格预测" (Introduction to Data Mining: Used Car Transaction Price Prediction), which finished at rank 374. I hope it is a useful reference for anyone working on the same problem.
Competition overview:
The task is to predict the transaction price of used cars. The dataset becomes visible and downloadable after registration and comes from the used-car transaction records of a trading platform: more than 400,000 records in total with 31 columns of variables, 15 of which are anonymized. To keep the competition fair, 150,000 records are drawn as the training set, 50,000 as test set A and 50,000 as test set B, and fields such as name, model, brand and regionCode are desensitized.
Full problem statement: 二手车交易价格预测 (the competition page).
Approach:
Fill missing values with the median
Correct anomalous values
Normalize the features
Split the dataset into training and validation sets
Stack a neural network with an extremely randomized trees (ExtraTrees) regressor (a short sketch of the stacking structure follows this list)
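The core idea of the stacking step is that the predictions of the first-layer models become the input features of a second-layer model. Below is a minimal sketch of that structure using only scikit-learn; it is illustrative, the actual script further down uses a Keras network plus a grid-searched ExtraTreesRegressor as the first layer.

# Minimal illustration of two-layer stacking on a train/validation split (illustrative only).
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import Ridge

def stack_two_models(x_train, y_train, x_val):
    # First layer: two different regressors fitted on the raw features
    m1 = ExtraTreesRegressor(n_estimators=100).fit(x_train, y_train)
    m2 = Ridge().fit(x_train, y_train)
    # Their predictions become the features of the second layer
    layer1_train = np.column_stack([m1.predict(x_train), m2.predict(x_train)])
    layer1_val = np.column_stack([m1.predict(x_val), m2.predict(x_val)])
    # Second layer: a meta-model fitted on the first-layer predictions
    meta = ExtraTreesRegressor(n_estimators=100).fit(layer1_train, y_train)
    return meta.predict(layer1_val)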
Submission record:
The main training script is as follows:
import os

import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow import keras

from Aero_engine_life.data_model import build_model_etr

os.chdir(r'E:\项目文件\二手车交易价格\\')
data_train = pd.read_csv(r'used_car_train_20200313.csv', sep=' ')
data_test = pd.read_csv(r'used_car_testB_20200421.csv', sep=' ')
# '-' marks missing values in the raw data
data_train.replace(to_replace='-', value=np.nan, inplace=True)
data_test.replace(to_replace='-', value=np.nan, inplace=True)
# notRepairedDamage is parsed as strings because of the '-' placeholder; cast it to float
data_train['notRepairedDamage'] = data_train['notRepairedDamage'].astype(float)
data_test['notRepairedDamage'] = data_test['notRepairedDamage'].astype(float)
# Fill missing values with the training-set median
data_train.fillna(data_train.median(), inplace=True)
data_test.fillna(data_train.median(), inplace=True)
tags = ['model', 'brand', 'bodyType', 'fuelType', 'regionCode', 'regDate', 'creatDate', 'kilometer',
        'notRepairedDamage', 'power', 'v_0', 'v_1', 'v_2', 'v_3', 'v_4', 'v_5', 'v_6',
        'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13', 'v_14']
# Correct anomalous values: cap power at 600
data_train.loc[data_train['power'] > 600, 'power'] = 600
data_test.loc[data_test['power'] > 600, 'power'] = 600
# Min-max feature normalization, fitted on the training set only
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(data_train[tags].values)
x = min_max_scaler.transform(data_train[tags].values)
x_ = min_max_scaler.transform(data_test[tags].values)
# Target
y = data_train['price'].values
# Split into training and validation sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
model = keras.Sequential([
    keras.layers.Dense(250, activation='relu', input_shape=[len(tags)]),
    keras.layers.Dense(250, activation='relu'),
    keras.layers.Dense(250, activation='relu'),
    keras.layers.Dense(1)
])
model.compile(loss='mean_absolute_error', optimizer='adam')
model.fit(x_train, y_train, batch_size=2048, epochs=111)
# Compare performance on the training and validation sets
# (keras returns (n, 1) arrays; flatten them so they can be used as DataFrame columns)
x_predict = model.predict(x_train).ravel()
test_pred = model.predict(x_test).ravel()
# model_lgb = build_model_lgb(x_train, y_train)
# val_lgb = model_lgb.predict(x_test)
model_etr = build_model_etr(x_train, y_train)
val_etr = model_etr.predict(x_test)
# model_rf = build_model_rf(x_train, y_train)
# val_rf = model_rf.predict(x_test)
# Stacking: first layer
print(mean_absolute_error(y_train, x_predict))
print(mean_absolute_error(y_test, test_pred))
train_etr_pred = model_etr.predict(x_train)
print('etr training set, MAE:', mean_absolute_error(y_train, train_etr_pred))
# train_lgb_pred = model_lgb.predict(x_train)
# print('lgb training set, MAE:', mean_absolute_error(y_train, train_lgb_pred))
# write_mae('lgb', 'train', mean_absolute_error(y_train, train_lgb_pred))
# train_rf_pred = model_rf.predict(x_train)
# print('rf training set, MAE:', mean_absolute_error(y_train, train_rf_pred))
# write_mae('rf', 'train', mean_absolute_error(y_train, train_rf_pred))
Strak_X_train = pd.DataFrame()
# Strak_X_train['Method_1'] = train_rf_pred
# Strak_X_train['Method_2'] = train_lgb_pred
Strak_X_train['Method_3'] = train_etr_pred
Strak_X_train['Method_4'] = x_predict
Strak_X_val = pd.DataFrame()
# Strak_X_val['Method_1'] = val_rf
# Strak_X_val['Method_2'] = val_lgb
Strak_X_val['Method_3'] = val_etr
Strak_X_val['Method_4'] = test_pred
# Stacking: second layer
model_Stacking = build_model_etr(Strak_X_train, y_train)
val_pre_Stacking = model_Stacking.predict(Strak_X_val)
test_pred1 = model.predict(x_).ravel()
subA_etr = model_etr.predict(x_)
# subA_lgb = model_lgb.predict(x_)
# subA_rf = model_rf.predict(x_)
Strak_X_test = pd.DataFrame()
# Strak_X_test['Method_1'] = subA_rf
# Strak_X_test['Method_2'] = subA_lgb
Strak_X_test['Method_3'] = subA_etr
Strak_X_test['Method_4'] = test_pred1
pred = model_Stacking.predict(Strak_X_test)
print(test_pred1)
# Note: this writes the raw network predictions; pred holds the stacked result
np.savetxt('submit_s.csv', test_pred1)
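One note on the output: np.savetxt writes only the raw predicted values. If the leaderboard expects a two-column file with SaleID and price, as this competition's sample submission suggests, a small sketch like the following can build it; the stacked prediction pred could also be swapped in for test_pred1.

# A minimal sketch, assuming the required format is "SaleID,price" as in the
# competition's sample submission file; adjust to the official template if it differs.
submit = pd.DataFrame()
submit['SaleID'] = data_test['SaleID']   # IDs from the raw test file
submit['price'] = pred                   # second-layer (stacked) prediction
submit.to_csv('submit_stacking.csv', index=False)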
The model-building code is as follows:
import os

from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV

from utils.read_write import writeOneCsv, pdReadCsv

# NOTE: src (the directory where the tuning log '调参记录.csv' is written) is assumed to be
# defined at module level.


def get_train():
    file = 'train_label.csv'
    # file = 'download_label.csv'
    # file = 'test_label.csv'
    train = pdReadCsv(file, ',')
    return train.values[:, 3:-1], train.values[:, -1:].ravel()


def build_model_rf(x_train, y_train):
    # criterion='mse' is the old scikit-learn spelling; newer versions use 'squared_error'
    estimator = RandomForestRegressor(criterion='mse')
    # Each range below contains a single value, so the grid search is effectively fixed
    param_grid = {
        'max_depth': range(33, 35, 9),
        'n_estimators': range(73, 77, 9),
    }
    model = GridSearchCV(estimator, param_grid, cv=3)
    model.fit(x_train, y_train)
    print('rf')
    print(model.best_params_)
    writeParams('rf', model.best_params_)
    return model


def build_model_etr(x_train, y_train):
    # Extremely randomized trees regression; n_estimators is the number of trees in the ExtraTreesRegressor
    estimator = ExtraTreesRegressor(criterion='mse')
    param_grid = {
        'max_depth': range(33, 39, 9),
        'n_estimators': range(96, 99, 9),
    }
    model = GridSearchCV(estimator, param_grid)
    model.fit(x_train, y_train)
    print('etr')
    print(model.best_params_)
    writeParams('etr', model.best_params_)
    return model


def build_model_lgb(x_train, y_train):
    estimator = LGBMRegressor()
    param_grid = {
        'learning_rate': [0.1],
        'n_estimators': range(77, 78, 9),
        'num_leaves': range(59, 66, 9)
    }
    gbm = GridSearchCV(estimator, param_grid)
    gbm.fit(x_train, y_train.ravel())
    print('lgb')
    print(gbm.best_params_)
    writeParams('lgb', gbm.best_params_)
    return gbm


def scatter_line(y_val, y_pre):
    import matplotlib.pyplot as plt
    xx = range(0, len(y_val))
    plt.scatter(xx, y_val, color="red", label="Sample Point", linewidth=3)
    plt.plot(xx, y_pre, color="orange", label="Fitting Line", linewidth=2)
    plt.legend()
    plt.show()


def score_model(train, test, predict, model, data_type):
    score = model.score(train, test)
    print(data_type + ",R^2,", round(score, 6))
    writeOneCsv(['staking', data_type, 'R^2', round(score, 6)], src + '调参记录.csv')
    mae = mean_absolute_error(test, predict)
    print(data_type + ',MAE,', mae)
    writeOneCsv(['staking', data_type, 'MAE', mae], src + '调参记录.csv')
    mse = mean_squared_error(test, predict)
    print(data_type + ",MSE,", mse)
    writeOneCsv(['staking', data_type, 'MSE', mse], src + '调参记录.csv')


def writeParams(model, best):
    if model == 'lgb':
        writeOneCsv([model, best['num_leaves'], best['n_estimators'], best['learning_rate']], src + '调参记录.csv')
    else:
        writeOneCsv([model, best['max_depth'], best['n_estimators'], 0], src + '调参记录.csv')


def write_mse(model, data_type, mse):
    writeOneCsv([model, data_type, 'mse', mse], src + '调参记录.csv')
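The helpers score_model and scatter_line above are defined but never called in the main script. A sketch of how they could be wired in to inspect the second-layer model on the held-out split, reusing Strak_X_val, y_test and model_Stacking from the main script (note that score_model writes to src + '调参记录.csv', so src must point to an existing directory):

# Illustrative usage only, with variables taken from the main script.
val_pre_Stacking = model_Stacking.predict(Strak_X_val)
score_model(Strak_X_val, y_test, val_pre_Stacking, model_Stacking, 'val')
scatter_line(y_test, val_pre_Stacking)  # true prices as points, predictions as a line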
The file I/O helpers in read_write.py are as follows:
# -*- coding: utf-8 -*-
# Double-loop version; relatively slow
import csv
import json
import os
from urllib import request
import numpy as np
import pandas as pd
from tqdm import tqdm


# Write a list of rows to a CSV file, one row per line
def writeCsv(relate_record, src):
    with open(src, 'w', newline='\n') as csvFile:
        writer = csv.writer(csvFile)
        for row in relate_record:
            try:
                writer.writerow(row)
            except Exception as e:
                print(e)
                print(row)
                # writeCsvUTF8(relate_record, bus)


# def writeExcept(row, bus):
# with open(filePath, 'r', encoding='utf-8') as dic:
# ## dic.read()
# for item in dic:
# if item.encode('utf-8').decode('utf-8-sig').strip() == s:
# print('ok')
# print(item)
# print(s)


# Write one row to a CSV file, append mode, newline after each row
def writeOneCsv(relate_record, src):
    try:
        with open(src, 'a', newline='\n') as csvFile:
            writer = csv.writer(csvFile)
            writer.writerow(relate_record)
            # csvFile.close()
    except Exception as e:
        print(e)
        print(relate_record)
        # writeCsvGBK(relate_record, bus)


# Write one row to a UTF-8 encoded CSV file, append mode
def writeCsvUTF8(relate_record, src):
    try:
        with open(src, 'a', newline='\n', encoding='utf-8') as csvFile:
            writer = csv.writer(csvFile)
            writer.writerow(relate_record)
    except:
        print(relate_record)


# Write one row to a GBK encoded CSV file, append mode
def writeCsvGbk(relate_record, src):
    try:
        with open(src, 'a', newline='\n', encoding='gbk') as csvFile:
            writer = csv.writer(csvFile)
            writer.writerow(relate_record)
    except:
        print(relate_record)


# Write nested records to a text file, one comma-separated row per line
def writeTxt(relate_record, src):
    with open(src, 'w', newline='\n') as file:
        for i in relate_record:
            for cell in i:
                file.write(cell)
                file.write(',')
            file.write('\n')
    file.close()


# Append one line to a text file
def writeOneTxt(one_record, src):
    try:
        with open(src, 'a') as file:
            file.write(one_record)
            file.write('\n')
    except Exception as e:
        print(e)


# Append a JSON record to a file
def writeJson(relate_record, src):
    Json_str = json.dumps(relate_record, ensure_ascii=False)
    with open(src, 'a') as Json_file:
        Json_file.write(Json_str)
    Json_file.close()


# Write one record per JSON file
def writeOneJson(relate_record, src):
    Json_str = json.dumps(relate_record, ensure_ascii=False)
    with open(src, 'w', encoding='utf-8') as Json_file:
        Json_file.write(Json_str)
    Json_file.close()


def readJsonToCsv(dict, src):
    df = pd.DataFrame.from_dict(dict, orient='index')
    df.transpose()
    df.to_csv(src)


def savPng(url, filename):
    try:
        rsp = request.urlopen(url)
        img = rsp.read()
        with open(filename, 'wb') as f:
            f.write(img)
    except Exception as e:
        print(url)
        print(e)


# Read a JSON file, trying GBK, UTF-8 and unicode_escape encodings in turn
def readJson(filepath):
    try:
        with open(filepath, 'r', encoding='GBK') as file_open:
            data = json.load(file_open)
            file_open.close()
            return data
    except:
        try:
            with open(filepath, 'r', encoding='utf-8') as file_open:
                data = json.load(file_open)
                file_open.close()
                return data
        except:
            with open(filepath, 'r', encoding="unicode_escape") as file_open:
                data = json.load(file_open)
                file_open.close()
                return data


def readBigData(filePath, sep):
    data = pd.read_csv(filePath, sep=sep, engine='python', iterator=True)
    chunkSize = 100
    chunks = []
    chunk = data.get_chunk(chunkSize)
    chunks.append(chunk)
    print('start merging')
    data = pd.concat(chunks, ignore_index=True)
    return data


def readerPandas(file, sep, chunkSize=100000, patitions=10 ** 4):
    reader = pd.read_csv(file, iterator=True, sep=sep)
    chunks = []
    with tqdm(range(patitions), 'Reading ...') as t:
        for _ in t:
            try:
                chunk = reader.get_chunk(chunkSize)
                chunks.append(chunk)
            except StopIteration:
                break
    return pd.concat(chunks, ignore_index=True)


# Read a text file into a list of lines, trying GBK first and then UTF-8
def readTxt(filepath):
    try:
        with open(filepath, 'r', encoding='gbk') as f:
            lines = []
            for one in f:
                one = one.rstrip("\n\t")
                lines.append(one)
            f.close()
            return lines
    except:
        with open(filepath, 'r', encoding='utf-8') as f:
            lines = []
            for one in f:
                one = one.rstrip("\n\t")
                lines.append(one)
            f.close()
            return lines


def readTxtJson(filepath):
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            lines = []
            for line in f:
                line = line.rstrip("\n\t")
                line = line.rstrip(" ")
                lines.append(line)
            Json_data = "".join(lines)
            data = eval(Json_data)
            f.close()
            return data
    except:
        print(filepath)


def readToStr(filepath):
    try:
        with open(filepath, 'r', encoding='gbk') as f:
            data = f.read()
            f.close()
            return data
    except:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = f.read()
            f.close()
            return data


def readCsv(filepath):
    # encoding = 'utf-8'
    encoding = 'gbk'
    birth_data = []
    try:
        with open(filepath, 'r', encoding=encoding) as csvfile:
            csv_reader = csv.reader(csvfile)  # read the file with csv.reader
            for row in csv_reader:  # collect every row into birth_data
                birth_data.append(row)
            csvfile.close()
            return birth_data
    except:
        with open(filepath, 'r', encoding='utf-8') as csvfile:
            csv_reader = csv.reader(csvfile)  # read the file with csv.reader
            for row in csv_reader:  # collect every row into birth_data
                birth_data.append(row)
            csvfile.close()
            return birth_data


def pdReadCsv(file, sep):
    try:
        data = pd.read_csv(file, sep=sep, encoding='utf-8', error_bad_lines=False, engine='python')
        return data
    except:
        data = pd.read_csv(file, sep=sep, encoding='gbk', error_bad_lines=False, engine='python')
        return data


def pdToCsv(data, path):
    data.to_csv(path, index=True, header=False, mode='a', sep=',')


def readExcel(file):
    data = pd.read_excel(file)
    return data


# Return the element that occurs most often in a list
def max_list(gid_list):
    temp = 0
    max_rec = []
    for rec in gid_list:
        if gid_list.count(rec) > temp:
            max_rec = rec
            temp = gid_list.count(max_rec)
    return max_rec


# List all files in a directory
def eachFile(filepath):
    pathDir = os.listdir(filepath)  # file names under the path, returned as a list
    return pathDir


def find_dir_files(path):
    files_list = []
    for root, dirs, files in os.walk(path):  # os.walk yields (root, dirs, files) tuples
        # for dir in dirs:
        #     print(os.path.join(root, dir))
        for file in files:
            files_list.append(os.path.join(root, file))
    return files_list


def get_file_list(file_path):
    dir_list = os.listdir(file_path)
    # Sort the files by last-modified time in ascending order (lambda key)
    # os.path.getmtime() returns the last modification time
    # os.path.getctime() returns the creation time
    dir_list = sorted(dir_list, key=lambda x: os.path.getmtime(os.path.join(file_path, x)))
    return dir_list


# reduce_mem_usage downcasts column dtypes to shrink a DataFrame's memory footprint
def reduce_mem_usage(df):
    """Iterate through all the columns of a dataframe and modify the data type
    to reduce memory usage."""
    start_mem = df.memory_usage().sum()
    print('Memory usage of dataframe is {:.2f} kB'.format(start_mem))
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')
    end_mem = df.memory_usage().sum()
    print('Memory usage after optimization is: {:.2f} kB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df


def dataToDict(file):
    station = pd.read_table(file, sep='\t', usecols=[1, 2], encoding='gbk')
    stationID = station.values.tolist()
    dict = {}
    for stations in stationID:
        id = '"' + str(stations[0]) + '"'
        dict[id] = '"' + stations[1] + '"'
    print(dict)


if __name__ == '__main__':
    file = '.Txt'
    dataToDict(file)
    # change_list()
    # src = 'D:\data\jianguiyaun\\all_bus_line\\bianli2019\\bus_route_all\\bus_route_9000\\'
    # src_list = get_file_list(src)
    # full_path = os.path.join(src, src_list[0])
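For completeness, a small sketch of how two of these helpers could be combined to load the competition data; this is illustrative only, the main script above reads the files directly with pd.read_csv.

# Load the training file with pdReadCsv and downcast its dtypes with reduce_mem_usage.
train = pdReadCsv('used_car_train_20200313.csv', ' ')
train = reduce_mem_usage(train)
print(train.shape)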
Feel free to like, bookmark and share; I will keep posting more.