机器学习笔记——逻辑斯蒂回归

本文主要是介绍机器学习笔记——逻辑斯蒂回归，希望对大家解决编程问题提供一定的参考价值，需要的开发者们随着小编来一起学习吧！

参数化模型与非参数化

像前面的 KNN 这类模型，不需要对 f 的具体形式做出假设，学习得到的模型可以是任意形式，这类模型叫非参数化模型；
而预先假设 f 的形式、只需要对其中的参数进行学习的模型叫参数化模型。参数化限制了 f 可能取值的集合，因此学习难度相对较低。

逻辑斯蒂回归

逻辑斯蒂函数
$$\sigma(z)=\frac{1}{1+e^{-z}}$$
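似然函数
$$L(\theta)=\prod_{i=1}^{N}\sigma(\theta^{\top}x_i)^{y_i}\,\bigl(1-\sigma(\theta^{\top}x_i)\bigr)^{1-y_i}$$

对数似然函数
$$\ell(\theta)=\sum_{i=1}^{N}\Bigl[y_i\log\sigma(\theta^{\top}x_i)+(1-y_i)\log\bigl(1-\sigma(\theta^{\top}x_i)\bigr)\Bigr]$$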

在多分类问题中，使用softmax函数代替逻辑斯蒂函数

重点

ROC曲线

以假阳性率（FPR）为横轴、真阳性率（TPR）为纵轴，随分类阈值变化而画出的曲线就叫做ROC曲线
ROC曲线下方的面积就叫AUC

在这里插入图片描述

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
#%%
# Read the dataset from the CSV source file, then separate the two feature
# columns (columns 0 and 1) from the label column (column 2).
lines = np.loadtxt('./data/lr_dataset.csv', dtype=float, delimiter=',')
x_total, y_total = lines[:, :2], lines[:, 2]
print('数据集大小：', x_total.shape[0])
#%%
# Indices of the samples labelled 1 and of the samples labelled 0.
pos_index = np.where(y_total == 1)
neg_index = np.where(y_total == 0)

# Draw one scatter series per class: label 1 as coral circles, label 0 as
# blue crosses.
for class_index, class_marker, class_color in (
    (pos_index, 'o', 'coral'),
    (neg_index, 'x', 'blue'),
):
    plt.scatter(
        x_total[class_index, 0],
        x_total[class_index, 1],
        marker=class_marker,
        color=class_color,
        s=10,
    )

plt.xlabel('X1')
plt.ylabel('X2')
plt.show()
#%%
# Fix the seed so the shuffle below is reproducible.
np.random.seed(0)
ratio = 0.7  # fraction of the samples that goes into the training set

# Shuffle features and labels with the same permutation so rows stay aligned.
idx = np.random.permutation(len(x_total))
x_total, y_total = x_total[idx], y_total[idx]

# The first `split` shuffled samples form the training set, the rest the
# test set.
split = int(len(x_total) * ratio)
x_train, x_test = x_total[:split], x_total[split:]
y_train, y_test = y_total[:split], y_total[split:]
#%%
# Notebook inspection cell: a bare expression has no effect when this file is
# run as a plain script.
y_test
# Rebinds the module-level `idx` (previously the shuffle permutation) to the
# argsort of the *reversed* test labels.
# NOTE(review): no later visible code reads this `idx`; it looks like
# exploratory code for the descending sort used in `auc` — confirm before
# removing.
idx=np.argsort(y_test[::-1])#%%
# Second inspection of the (unchanged) test labels.
y_test
#%%
def acc(y_true, y_pred):
    """Return the fraction of positions where prediction and label agree.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, compared to ``y_true`` with ``==``.

    Returns:
        The mean of the comparison result, i.e. the classification accuracy
        when both arguments are NumPy arrays of matching shape.
    """
    hits = y_true == y_pred
    return np.mean(hits)
def auc(y_true, y_pred):
    """Compute the area under the ROC curve (AUC) for binary labels.

    Samples are ordered by descending predicted score. Walking down that
    ordering is equivalent to lowering the decision threshold one sample at a
    time, so the cumulative counts of positives and negatives give the
    true-positive and false-positive counts at every threshold. The area is
    then accumulated as a sum of rectangles of width ``delta FPR`` and height
    ``TPR``.

    Args:
        y_true: 1-D NumPy array of binary labels (1 = positive, 0 = negative).
        y_pred: 1-D NumPy array of predicted scores, same length as
            ``y_true``; higher means "more likely positive".

    Returns:
        The AUC. If ``y_true`` contains only one class, a rate is computed as
        0/0 and the result is ``nan``.
    """
    # Indices that sort the scores from highest to lowest.
    order = np.argsort(y_pred)[::-1]
    sorted_labels = y_true[order]

    # Cumulative true positives / false positives as the threshold is lowered.
    tp = np.cumsum(sorted_labels)
    fp = np.cumsum(1 - sorted_labels)

    # Normalise to rates and prepend the (0, 0) starting point of the curve.
    tpr = np.concatenate([[0], tp / tp[-1]])
    fpr = np.concatenate([[0], fp / fp[-1]])

    # Rectangle rule: each step contributes (FPR increase) * (current TPR).
    return np.sum(np.diff(fpr) * tpr[1:])
#%%
def logistic(z):
    """Logistic (sigmoid) function ``1 / (1 + exp(-z))``.

    Args:
        z: A scalar or NumPy array; the function is applied element-wise.

    Returns:
        Value(s) in the range 0 to 1, with the same shape as ``z``.
    """
    return 1 / (1 + np.exp(-z))
def GD(num_steps, learning_rate, l2_coef):
    """Train logistic regression with batch gradient descent.

    Reads the module-level arrays ``X``, ``y_train``, ``X_test`` and
    ``y_test`` and uses the sibling helpers ``logistic``, ``acc`` and ``auc``.

    Args:
        num_steps: Number of gradient-descent iterations.
        learning_rate: Step size applied to the gradient.
        l2_coef: Coefficient of the L2 regularisation term.

    Returns:
        A tuple ``(theta, train_losses, test_losses, train_acc, test_acc,
        train_auc, test_auc)``: the learned parameter vector followed by six
        lists holding one value per iteration.
    """
    # Random initial parameters, one per column of X.
    theta = np.random.normal(size=(X.shape[1],))
    train_losses = []
    test_losses = []
    train_acc = []
    test_acc = []
    train_auc = []
    test_auc = []

    for _ in range(num_steps):
        # Gradient of the L2-regularised negative log-likelihood.
        pred = logistic(X @ theta)
        grad = -X.T @ (y_train - pred) + l2_coef * theta
        theta -= learning_rate * grad

        # Training loss: cross-entropy plus the L2 penalty, averaged over the
        # training samples. `pred` was computed before the update above, while
        # the penalty uses the updated theta.
        train_loss = (
            - y_train.T @ np.log(pred)
            - (1 - y_train).T @ np.log(1 - pred)
            + l2_coef * np.linalg.norm(theta) ** 2 / 2
        )
        train_losses.append(train_loss / len(X))

        # Test loss: cross-entropy only (no regularisation term), averaged
        # over the test samples.
        test_pred = logistic(X_test @ theta)
        test_loss = (
            - y_test.T @ np.log(test_pred)
            - (1 - y_test).T @ np.log(1 - test_pred)
        )
        test_losses.append(test_loss / len(X_test))

        # Record the evaluation metrics; accuracy uses a 0.5 threshold.
        train_acc.append(acc(y_train, pred >= 0.5))
        test_acc.append(acc(y_test, test_pred >= 0.5))
        train_auc.append(auc(y_train, pred))
        test_auc.append(auc(y_test, test_pred))

    return (theta, train_losses, test_losses,
            train_acc, test_acc, train_auc, test_auc)
#%%
# Gradient-descent settings: number of iterations, learning rate and
# L2 regularisation coefficient.
num_steps = 250
learning_rate = 0.002
l2_coef = 1.0
np.random.seed(0)

# Append a column of ones to the feature matrices so that the last entry of
# theta acts as the bias term.
X = np.concatenate([x_train, np.ones((x_train.shape[0], 1))], axis=1)
X_test = np.concatenate([x_test, np.ones((x_test.shape[0], 1))], axis=1)

theta, train_losses, test_losses, train_acc, test_acc, \
    train_auc, test_auc = GD(num_steps, learning_rate, l2_coef)

# Compute the prediction accuracy on the test set (threshold 0.5).
y_pred = np.where(logistic(X_test @ theta) >= 0.5, 1, 0)
final_acc = acc(y_test, y_pred)
print('预测准确率：', final_acc)
print('回归系数：', theta)

plt.figure(figsize=(13, 9))
# 1-based iteration numbers used as the x-axis of the training curves.
xticks = np.arange(num_steps) + 1
#%%
# Plot the training curves: loss per iteration on the training set (solid
# blue) and on the test set (dashed red), in subplot position 221.
loss_ax = plt.subplot(221)
loss_ax.plot(xticks, train_losses, color='blue', label='train loss')
loss_ax.plot(xticks, test_losses, color='red', ls='--', label='test loss')
# Keep the x-axis ticks on integer values.
loss_ax.xaxis.set_major_locator(MaxNLocator(integer=True))
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Loss')
loss_ax.legend()
#%%
# Plot the accuracy per iteration on the training set (solid blue) and on
# the test set (dashed red), in subplot position 222.
acc_ax = plt.subplot(222)
acc_ax.plot(xticks, train_acc, color='blue', label='train accuracy')
acc_ax.plot(xticks, test_acc, color='red', ls='--', label='test accuracy')
# Keep the x-axis ticks on integer values.
acc_ax.xaxis.set_major_locator(MaxNLocator(integer=True))
acc_ax.set_xlabel('Epochs')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend()