This post covers [Machine Learning] (5.4) Clustering: density-based clustering (DBSCAN and MDCA), with reference implementations in Python. I hope it is a useful reference for anyone working on the same problems.
1. Density-Based Clustering Methods
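Density-based clustering treats a cluster as a region where samples are densely packed, separated from other clusters by regions of low density. Unlike centroid-based methods such as k-means, it makes no convexity assumption, so it can discover clusters of arbitrary shape and can flag isolated points as noise. DBSCAN and MDCA below are two representatives of this family.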
2. DBSCAN
2.1 Key Concepts in DBSCAN
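A few standard definitions the algorithm is built on (summarized here; eps and min_samples are the two parameters in the code below):

- ε-neighborhood: the set of points whose distance to a point p is less than ε (eps).
- Core point: a point whose ε-neighborhood contains at least min_samples points.
- Directly density-reachable: q is directly density-reachable from p if p is a core point and q lies in p's ε-neighborhood.
- Density-reachable / density-connected: the transitive extension of direct reachability; a cluster is a maximal set of density-connected points.
- Border point and noise: a non-core point that falls in some core point's ε-neighborhood is a border point; a point that is neither core nor border is noise.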
2.2 DBSCAN Algorithm Flow
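In outline:

1. Compute every point's ε-neighborhood and mark points with at least min_samples neighbors as core points.
2. Merge core points that lie in each other's neighborhoods into the same cluster; the implementation below does this merging with a union-find structure.
3. Attach each border point to the cluster of a core point in its neighborhood, and label everything that remains as noise (label 0 in the code).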
The code uses a union-find (disjoint-set) structure; for a detailed, illustrated introduction see reference [1]: 并查集详解 ——图文解说,简单易懂(转)_mjiansun的博客-CSDN博客
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors
import sklearn.datasets as ds
from sklearn.preprocessing import StandardScaler
import math


def expand(a, b):
    d = (b - a) * 0.1
    return a - d, b + d


def euler_distance(point1: np.ndarray, point2: list) -> float:
    """Euclidean distance between two points; supports any number of dimensions."""
    distance = 0.0
    for a, b in zip(point1, point2):
        distance += math.pow(a - b, 2)
    return math.sqrt(distance)


class UnionFindSet(object):
    def __init__(self, data_num=0):
        self.data_num = data_num
        self.prev = {}
        self.init_uf()

    def init_uf(self):
        # Every element starts as its own root.
        for i in range(self.data_num):
            self.prev[i] = i

    def union_data(self, x, y):
        x_root = self.find_data(x)
        y_root = self.find_data(y)
        if x_root != y_root:
            self.prev[y_root] = x_root

    def find_data(self, x):
        x_root = self.prev[x]
        while self.prev[x_root] != x_root:
            x_root = self.prev[x_root]
        # Path compression; optional, only used to speed up later lookups.
        x_copy = x
        while x_copy != x_root:
            temp = self.prev[x_copy]
            self.prev[x_copy] = x_root
            x_copy = temp
        return x_root

    def get_uf_set(self):
        return self.prev


class DBSCAN(object):
    def __init__(self, eps, min_samples):
        self.eps = eps
        self.min_samples = min_samples

    def get_cluster(self, data):
        eps = self.eps
        min_samples = self.min_samples
        data_num, feature_num = data.shape
        # sim[i] holds the indices of all points within eps of point i (itself included).
        sim = [[] for i in range(data_num)]
        for i in range(data_num):
            for j in range(data_num):
                d = euler_distance(data[i, :], data[j, :])
                if d < eps:
                    sim[i].append(j)
        # Union every core point with all points in its eps-neighborhood.
        uf = UnionFindSet(data_num)
        for i in range(data_num):
            single_data = sim[i]
            if len(single_data) <= min_samples:
                continue
            for j in single_data:
                uf.union_data(i, j)
        # Number the connected components that contain core points.
        cluster_cls = {}
        cls_num = 0
        for i in range(data_num):
            r = uf.find_data(i)
            if len(sim[i]) <= min_samples:
                continue
            if r not in cluster_cls:
                cls_num += 1
                cluster_cls[r] = cls_num
        # Label every point; points left at 0 are noise.
        data_cls = [0] * data_num
        for i in range(data_num):
            r = uf.find_data(i)
            if r != i or len(sim[i]) > min_samples:
                data_cls[i] = cluster_cls[r]
        return data_cls


if __name__ == "__main__":
    N = 1000
    centers = [[1, 2], [-1, -1], [1, -1], [-1, 1]]
    data, y = ds.make_blobs(N, n_features=2, centers=centers,
                            cluster_std=[0.5, 0.25, 0.7, 0.5], random_state=0)
    data = StandardScaler().fit_transform(data)
    params = ((0.19, 5), (0.2, 10), (0.2, 15), (0.3, 5), (0.3, 10), (0.3, 15))
    eps, min_samples = params[0]
    dbscan = DBSCAN(eps, min_samples)
    data_cls = dbscan.get_cluster(data)
    y_hat = np.array(data_cls)
    # Points assigned to a cluster (label >= 1); emphasized in the plot.
    core_indices = np.zeros_like(y_hat, dtype=bool)
    core_indices[np.where(y_hat >= 1)] = True
    y_unique = np.unique(y_hat)
    n_clusters = y_unique.size - (1 if 0 in y_hat else 0)
    print(y_unique, 'number of clusters:', n_clusters)
    clrs = plt.cm.Spectral(np.linspace(0, 0.8, y_unique.size))
    print(clrs)
    for k, clr in zip(y_unique, clrs):
        cur = (y_hat == k)
        if k == 0:
            # Noise points are drawn small and black.
            plt.scatter(data[cur, 0], data[cur, 1], s=10, c='k')
            continue
        plt.scatter(data[cur, 0], data[cur, 1], s=15, c=clr, edgecolors='k')
        plt.scatter(data[cur & core_indices][:, 0], data[cur & core_indices][:, 1],
                    s=30, c=clr, marker='o', edgecolors='k')
    x1_min, x2_min = np.min(data, axis=0)
    x1_max, x2_max = np.max(data, axis=0)
    x1_min, x1_max = expand(x1_min, x1_max)
    x2_min, x2_max = expand(x2_min, x2_max)
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.grid(True, ls=':', color='#606060')  # grid(b=True, ...) no longer works in recent matplotlib
    plt.title(r'$\epsilon$ = %.1f m = %d, clusters: %d' % (eps, min_samples, n_clusters), fontsize=12)
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    plt.show()
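As a quick cross-check (my own addition, not part of the original post), the same data can be handed to scikit-learn's built-in DBSCAN after running the script above. scikit-learn labels noise as -1 rather than 0, and its neighborhood convention (d <= eps, with the point itself counted toward min_samples) differs slightly from the strict d < eps above, so the counts may not match exactly:

from sklearn.cluster import DBSCAN as SkDBSCAN

# Same parameters as params[0] above; `data` is the standardized array from the script.
sk_labels = SkDBSCAN(eps=0.19, min_samples=5).fit_predict(data)
sk_clusters = len(set(sk_labels)) - (1 if -1 in sk_labels else 0)
print('sklearn DBSCAN cluster count:', sk_clusters)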
3. MDCA
MDCA (Maximum Density Clustering Application) brings the density idea into partitional clustering: cluster membership is decided by density rather than by initial centroids, so the method can determine the number of clusters automatically and can find clusters of arbitrary shape. MDCA generally keeps no separate noise set, which also avoids discarding large numbers of objects when the threshold is chosen badly.
My implementation here reproduces reference [2]. I am not sure it is correct, since the clustering results are not very satisfying and differ from what I expected.
3.1 Core Idea
1. Maximum-density point: the density of a point can be measured as the reciprocal of the sum of its K-nearest-neighbor distances (a small sketch follows this list).
2. Density curve: re-sort the remaining data by the Euclidean distance of every object to x, the current maximum-density point, and read off their densities in that order.
3. Assign all points before the first valley of the density curve to one cluster, then remove them from the dataset.
4. Repeat steps 1-3 until the density of every remaining point falls below the threshold ρ0.
5. For two clusters Ci and Cj, take the distance between their closest samples as the inter-cluster distance.
6. Merge two clusters when their inter-cluster distance is below the threshold d0.
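As a minimal sketch of the density in step 1 (my own vectorized reading; the full script in 3.2 computes the same quantity point by point, using squared Euclidean distances):

import numpy as np

def knn_density(data, k=3):
    # Pairwise squared Euclidean distances, shape (n, n).
    diff = data[:, None, :] - data[None, :, :]
    dist = np.sum(diff ** 2, axis=-1)
    # After sorting each row, column 0 is the point's distance to itself (0.0),
    # so columns 1..k are the k nearest neighbours.
    dist_sorted = np.sort(dist, axis=1)
    return k / np.sum(dist_sorted[:, 1:k + 1], axis=1)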
3.2 Hands-On Code
import os
import numpy as np
import sklearn.datasets as ds
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler


def expand(a, b):
    d = (b - a) * 0.1
    return a - d, b + d


def find_cls(p, y_pred, distance, density_min, cls_num):
    # Work only on the points that have not been assigned to a cluster yet.
    candidate_indices = np.where(y_pred == 0)[0]
    cur_p = p[candidate_indices]
    # Distances from the current maximum-density point to all candidates.
    cur_distance = distance[np.argmax(cur_p), candidate_indices]
    # Sort candidates by their distance to the maximum-density point, smallest first.
    cur_distance_sort = np.argsort(cur_distance)
    cur_curve = cur_p[cur_distance_sort]  # density curve, ordered by distance to the density peak
    ori_indices = candidate_indices[cur_distance_sort]  # map back to the original indices
    # plt.plot(cur_distance[cur_distance_sort], cur_curve)
    # plt.show()
    if np.max(cur_curve) > density_min:
        loc = np.where(cur_curve <= density_min)[0]
        loc_length = len(loc)
        if loc_length >= 2:
            # Look for the first "valley": two low-density positions close together.
            for i in range(loc_length - 1):
                if loc[i + 1] - loc[i] <= 2:
                    break
            # Everything before the valley becomes one cluster; recurse on the rest.
            y_pred[ori_indices[:loc[i + 1]]] = cls_num
            y_pred, cls_num = find_cls(p, y_pred, distance, density_min, cls_num + 1)
    return y_pred, cls_num


if __name__ == "__main__":
    N = 1000
    centers = [[1, 2], [-1, -1], [1, -1], [-1, 1]]
    data, y = ds.make_blobs(N, n_features=2, centers=centers,
                            cluster_std=[0.25, 0.25, 0.25, 0.25], random_state=0)
    # data = StandardScaler().fit_transform(data)
    max_data = np.max(data)
    min_data = np.min(data)
    data = (data - min_data) / (max_data - min_data)
    # Number of neighbours checked when estimating density.
    k = 3
    # Minimum density threshold rho_0. Density is the reciprocal of the summed
    # k-NN distances; I do not yet know a principled way to choose this value.
    density_min = 10000
    # Minimum inter-cluster distance threshold d_0 (in squared-distance units).
    distance_min = 0.008
    # Number of samples and number of features.
    sample_num, feature_num = data.shape
    # Predicted labels; 0 means "not assigned yet".
    y_pred = np.zeros(sample_num)
    # p[i] is the density of sample i.
    p = np.zeros(sample_num)
    # distance[i, j] is the squared Euclidean distance from sample i to sample j.
    distance = np.zeros((sample_num, sample_num))
    # The k-NN-sum definition of density works better here: it rarely yields many
    # points with identical density, unlike counting neighbours inside a fixed
    # radius, which can split one region into many classes.
    for i in range(sample_num):
        distance[i, :] = np.sum((data[i, :] - data) ** 2, axis=1)
        # temp_sort = np.sort(distance[i, :])
        # p[i] = k / np.sum(distance[i, distance[i, :] <= temp_sort[k]])
        temp_sort = np.argsort(distance[i, :])
        p[i] = k / np.sum(distance[i, temp_sort[:k + 1]])  # self-distance is 0, so this sums the k nearest neighbours
    cls_num = 1
    # Determine the initial clusters.
    y_pred, cls_num = find_cls(p, y_pred, distance, density_min, cls_num)
    # Merge clusters whose nearest samples are close enough; repeat until a
    # full pass makes no merge.
    while True:
        merged = False
        for i in range(1, cls_num):
            for j in range(i + 1, cls_num):
                a = np.where(y_pred == i)[0]
                b = np.where(y_pred == j)[0]
                temp = distance[a, :][:, b]
                if np.min(temp) <= distance_min:
                    y_pred[y_pred == j] = i
                    y_pred[y_pred > j] = y_pred[y_pred > j] - 1
                    cls_num = cls_num - 1
                    merged = True
                    break  # leave the inner for loop
            if merged:  # leave the outer for loop and rescan from scratch
                break
        if not merged:  # a full pass without merges: done
            break
    # Attach leftover scattered points to the nearest cluster (or keep them at 0
    # if even the nearest cluster is farther away than distance_min).
    scatter_indices = np.where(y_pred == 0)[0]
    loc_indices = np.where(y_pred > 0)[0]
    scatter_cls_distance = distance[scatter_indices, :][:, loc_indices]
    scatter_cls_min_distance = np.min(scatter_cls_distance, axis=1)
    scatter_cls_min_indices = np.argmin(scatter_cls_distance, axis=1)
    y_pred[scatter_indices] = y_pred[loc_indices[scatter_cls_min_indices]] * (scatter_cls_min_distance <= distance_min)
    # Plot the result.
    y_hat = np.array(y_pred)
    core_indices = np.zeros_like(y_hat, dtype=bool)
    core_indices[np.where(y_hat >= 1)] = True
    y_unique = np.unique(y_hat)
    n_clusters = y_unique.size - (1 if 0 in y_hat else 0)
    print(y_unique, 'number of clusters:', n_clusters)
    clrs = plt.cm.Spectral(np.linspace(0, 0.8, y_unique.size))
    print(clrs)
    for label, clr in zip(y_unique, clrs):
        cur = (y_hat == label)
        if label == 0:
            plt.scatter(data[cur, 0], data[cur, 1], s=10, c='k')
            continue
        plt.scatter(data[cur, 0], data[cur, 1], s=15, c=clr, edgecolors='k')
        plt.scatter(data[cur & core_indices][:, 0], data[cur & core_indices][:, 1],
                    s=30, c=clr, marker='o', edgecolors='k')
    x1_min, x2_min = np.min(data, axis=0)
    x1_max, x2_max = np.max(data, axis=0)
    x1_min, x1_max = expand(x1_min, x1_max)
    x2_min, x2_max = expand(x2_min, x2_max)
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.grid(True, ls=':', color='#606060')
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    plt.show()
    print("end")
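Since I am unsure the implementation is correct, one quick sanity check (my own addition; it assumes the script above has just run, so y and y_pred are in scope) is to compare the result with the labels that generated the blobs:

from sklearn.metrics import adjusted_rand_score

# 1.0 means the clustering matches the generating labels up to a relabeling;
# values near 0 mean it does no better than chance.
print('ARI vs. ground truth:', adjusted_rand_score(y, y_pred))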
3.3 Experimental Results
3.4 Performance Comparison (personally I find it hard to use; the parameters are very difficult to tune)
- Strengths:
  - Insensitive to noise in the data
  - No dependence on the choice of initial points
  - Can recover clusters of arbitrary shape
- Weaknesses:
  - The algorithm is complex and clustering is relatively slow
  - The density threshold must be fixed before running
  - Distance metrics degrade on high-dimensional data
  - Poorly suited to datasets whose density varies widely between regions, or whose density is nearly uniform throughout
References
1. 并查集详解 ——图文解说,简单易懂(转)_mjiansun的博客-CSDN博客
2. 密度最大值聚类(MDCA) | GitHub