forked from intohole/moodstyle
-
Notifications
You must be signed in to change notification settings - Fork 0
/
HCluster.py
110 lines (83 loc) · 3.31 KB
/
HCluster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# coding=utf-8
import math
from BaseStrut import WeightArray
class HierarchicalClustering(object):
    '''
    Base class for agglomerative (bottom-up) hierarchical clustering.

    The merge loop lives in ``cluster``; subclasses supply the linkage
    criterion by overriding ``get_cluster_distance``.
    '''

    def __init__(self):
        pass

    def cluster(self, datas, cluster_num, threshold=0.03):
        '''
        function:
            Start with one cluster per record and repeatedly merge the
            closest pair of clusters.
        params:
            datas        sequence of records to cluster
            cluster_num  merging stops once no more than this many
                         clusters remain
            threshold    only pairs whose cluster distance is strictly
                         below this value may be merged
        return
            list of clusters, each cluster being a list of records; it can
            hold more than cluster_num clusters when no remaining pair is
            closer than threshold
        '''
        # Distance lookup handed to get_cluster_distance.
        # NOTE(review): presumably WeightArray precomputes pairwise distances
        # with self.distance -- confirm against BaseStrut.
        distance_map = WeightArray(datas, self.distance)
        # Every record starts out as its own single-element cluster.
        clusters = [[data] for data in datas]
        while len(clusters) > cluster_num:
            min_distance = None      # smallest qualifying distance this pass
            min_cluster_pair = None  # indexes (i, j), i < j, of that pair
            for i in range(len(clusters)):
                for j in range(i + 1, len(clusters)):
                    d = self.get_cluster_distance(
                        clusters[i], clusters[j], distance_map)
                    if d < threshold and (
                            min_distance is None or d < min_distance):
                        min_distance = d
                        min_cluster_pair = (i, j)
            if min_cluster_pair is None:
                # No pair is closer than the threshold: nothing left to merge.
                break
            keep, absorb = min_cluster_pair
            clusters[keep].extend(clusters[absorb])
            # absorb > keep, so deleting it does not shift the kept cluster.
            del clusters[absorb]
        return clusters

    def distance(self, data1, data2):
        '''
        function:
            Euclidean distance between two records.
        params:
            data1  first record
            data2  second record
        return
            distance between the two records

        Element 0 of each record is left out of the computation; the
        linkage subclasses in this file use it as the lookup key into
        distance_map.
        '''
        squared_diffs = (
            (data1[i] - data2[i]) ** 2 for i in range(1, len(data1)))
        return math.sqrt(sum(squared_diffs))

    def get_cluster_distance(self, cluster1, cluster2, distance_map):
        '''
        function
            Linkage hook: distance between two clusters. The base class
            provides no implementation; subclasses must override it.
        params:
            cluster1      first cluster (list of records)
            cluster2      second cluster (list of records)
            distance_map  distance lookup built in cluster()
        return
            distance between the two clusters
        raises
            NotImplementedError  always, in this base class
        '''
        raise NotImplementedError
class ALHierarchicalClustering(HierarchicalClustering):
    '''
    Average-linkage clustering: the distance between two clusters is the
    mean of the distances over every cross-cluster pair of records.
    '''

    def get_cluster_distance(self, cluster1, cluster2, distance_map):
        '''
        Return the mean pairwise distance between cluster1 and cluster2.

        distance_map is indexed with the pair of element-0 values of the
        two records being compared.
        '''
        row_totals = []
        for data1 in cluster1:
            # Sum of the distances from this record to every record of cluster2.
            row_totals.append(
                sum(distance_map[(data1[0], data2[0])] for data2 in cluster2))
        grand_total = sum(row_totals)
        pair_count = len(cluster1) * len(cluster2)
        return grand_total / float(pair_count)
class SLHierarchicalClustering(HierarchicalClustering):
    '''
    Single-linkage clustering: the distance between two clusters is the
    distance between their two closest records.
    '''

    def get_cluster_distance(self, cluster1, cluster2, distance_map):
        '''
        Return the smallest distance over every cross-cluster pair.

        distance_map is indexed with the pair of element-0 values of the
        two records being compared.
        '''
        # The minimum is returned as-is. Dividing it by the number of pairs
        # (as the average-linkage formula does) would shrink the distance of
        # large clusters and make them merge ahead of closer small ones.
        return min(
            distance_map[(data1[0], data2[0])]
            for data1 in cluster1
            for data2 in cluster2
        )
class CLHierarchicalClustering(HierarchicalClustering):
    '''
    Complete-linkage clustering: the distance between two clusters is the
    distance between their two farthest-apart records.
    '''

    def get_cluster_distance(self, cluster1, cluster2, distance_map):
        '''
        Return the largest distance over every cross-cluster pair.

        distance_map is indexed with the pair of element-0 values of the
        two records being compared.
        '''
        # The maximum is returned as-is. Dividing it by the number of pairs
        # (as the average-linkage formula does) would shrink the distance of
        # large clusters and distort which pair is merged next.
        return max(
            distance_map[(data1[0], data2[0])]
            for data1 in cluster1
            for data2 in cluster2
        )
if __name__ == '__main__':
    # Small demo: cluster ten random records with average linkage.
    from random import randint

    hc = ALHierarchicalClustering()
    # Record layout: [index, random value, random value].
    datas = [[i, randint(1, 20), randint(1, 20)] for i in range(10)]
    # Ask for 4 clusters with a merge threshold of 100.
    clusters = hc.cluster(datas, 4, 100)
    for group in clusters:
        # Single-argument call form prints the same on Python 2 and Python 3.
        print(group)