先给出完整代码,再分别说明
# -*- coding: utf-8 -*-
import traceback
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt
def draw_original(X, Y):
    """Scatter-plot the samples coloured by their ground-truth labels.

    Args:
        X: Sample matrix; column 0 is plotted on the x axis and column 1 on
            the y axis.
        Y: Ground-truth labels, handed to matplotlib as the point colours.

    Returns:
        None. Any exception is reported as a traceback on stderr and
        swallowed.
    """
    try:
        plt.subplot(1, 1, 1)
        plt.scatter(X[:, 0], X[:, 1], c=Y)
        plt.title("original clusters")
        plt.xlabel("Feature1")
        plt.ylabel("Feature2")
        plt.show()
    except Exception:
        # print_exc() writes the traceback itself and returns None, so it
        # must not be wrapped in a print (that would emit a stray "None").
        traceback.print_exc()
def drawing_n_clusters(cluster_list, centroids):
    """Plot every cluster in its own colour together with the cluster centres.

    Args:
        cluster_list: List of clusters. Each element is one cluster, stored
            as a list (not a matrix) of samples.
        centroids: Cluster centres stored as a matrix; column 0 / column 1
            are used as the x / y coordinates.

    Returns:
        None. Any exception is reported as a traceback on stderr and
        swallowed.
    """
    try:
        n_clusters = len(cluster_list)
        color_list = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w']
        plt.subplot(1, 1, 1)
        for idx, samples in enumerate(cluster_list):
            cluster = np.array(samples)  # one cluster as a matrix
            if cluster.size == 0:
                # An empty cluster becomes a 1-D array that cannot be
                # column-sliced; there is nothing to draw for it anyway.
                continue
            # Cycle through the palette so more clusters than colours does
            # not raise IndexError.
            color = color_list[idx % len(color_list)]
            plt.scatter(cluster[:, 0], cluster[:, 1], c=color,
                        label="cluster" + str(idx))
        plt.scatter(centroids[:, 0], centroids[:, 1],
                    marker='+', color='y', s=200)
        plt.title(str(n_clusters) + " clusters")
        plt.xlabel("Feature1")
        plt.ylabel("Feature2")
        plt.legend(loc=2)
        plt.show()
    except Exception:
        # print_exc() already writes the traceback and returns None.
        traceback.print_exc()
def draw_error(k_list, error_list):
    """Plot the total loss obtained for each candidate number of clusters.

    Args:
        k_list: The K values that were tried; also used as the x-axis ticks.
        error_list: The total loss for each K, in the same order as k_list.

    Returns:
        None. Any exception is reported as a traceback on stderr and
        swallowed.
    """
    try:
        # Figure setup.
        plt.figure(figsize=(8, 5), dpi=80)
        plt.subplot(1, 1, 1)
        # Loss curve, one marker per K.
        plt.plot(k_list, error_list, marker='o', c='blue')
        plt.title("Total Error vs. # of Clusters")
        # Axes.
        plt.xticks(k_list)
        plt.xlabel("k")
        plt.ylabel("total squared error")
        plt.show()
    except Exception:
        # print_exc() already writes the traceback and returns None.
        traceback.print_exc()
'''
-------------------------------------------------------------------------------
'''
def load_dataset(input_path):
    """Load a tab-separated file of labelled two-feature samples.

    Every non-blank line is expected to look like
    ``label<TAB>feature1<TAB>feature2``.

    Args:
        input_path: Path of the text file to read.

    Returns:
        A tuple ``(X, Y)`` where ``X`` is a list of ``[feature1, feature2]``
        float lists and ``Y`` is a list of one-element ``[label]`` int lists,
        in file order. Returns None if the file cannot be read or parsed;
        the traceback is written to stderr in that case.
    """
    try:
        X = []
        Y = []
        # The with-block closes the file even when a line fails to parse.
        with open(input_path, "r") as infile:
            for line in infile:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                data = line.rstrip('\r\n').split('\t')
                Y.append([int(data[0])])
                X.append([float(data[1]), float(data[2])])
        # Report completion before returning so the message is reachable.
        print("[INFO]: load_dataset is finished!")
        return X, Y
    except Exception:
        # print_exc() already writes the traceback and returns None.
        traceback.print_exc()
        return None
def training(X, K):
    """Fit a k-means model with K clusters on the samples.

    Initial centres are chosen with k-means++ and the fit is restarted
    several times, keeping the best run. Both settings are passed explicitly
    so the behaviour does not depend on the library's version-specific
    defaults.

    Args:
        X: Sample matrix, one sample per row.
        K: Number of clusters.

    Returns:
        A tuple ``(label, loss, centroids)`` taken from the fitted model's
        ``labels_``, ``inertia_`` and ``cluster_centers_`` attributes
        (``inertia_`` is the summed squared distance of the samples to their
        nearest centre). Returns None if fitting fails; the traceback is
        written to stderr in that case.
    """
    try:
        kmeans = KMeans(n_clusters=K, init="k-means++", n_init=10).fit(X)
        return kmeans.labels_, kmeans.inertia_, kmeans.cluster_centers_
    except Exception:
        # print_exc() already writes the traceback and returns None.
        traceback.print_exc()
        return None
def get_clusters(X, label, K):
    """Group the samples by their assigned cluster label.

    Args:
        X: Samples, indexable by position (matrix or list of samples).
        label: Cluster index of every sample, in the same order as X.
        K: Number of clusters.

    Returns:
        A list of K lists; the i-th list holds the samples whose label is i,
        in their original order. Returns None if grouping fails (for example
        a label outside the range of K); the traceback is written to stderr
        in that case.
    """
    try:
        # One bucket per cluster; each bucket collects that cluster's samples.
        cluster_list = [[] for _ in range(K)]
        for idx, cluster_id in enumerate(label):
            cluster_list[cluster_id].append(X[idx])
        return cluster_list
    except Exception:
        # print_exc() already writes the traceback and returns None.
        traceback.print_exc()
        return None
'''
-------------------------------------------------------------------------------
'''
def find_K(input_path="../data/4k2_far.txt",
           output_path="../output/test_for_4k2/inertia.txt",
           max_k=20):
    """Train k-means for K = 1..max_k, log each loss and plot loss vs. K.

    Args:
        input_path: Dataset file passed to load_dataset.
        output_path: Text file that receives one ``K=<k>,<loss>`` line per K.
        max_k: Largest number of clusters to try (inclusive).

    Returns:
        None. Any exception is reported as a traceback on stderr and
        swallowed.
    """
    try:
        samples, _ = load_dataset(input_path)
        X = np.array(samples)
        k_list = list(range(1, max_k + 1))
        error_list = []
        # The with-block closes the log file even if a training run fails.
        with open(output_path, "w") as outfile:
            for K in k_list:
                _, loss, _ = training(X, K)
                error_list.append(loss)
                outfile.write("K=" + str(K) + "," + str(loss) + '\n')
        draw_error(k_list, error_list)
        print("[INFO]: find_K is finished!")
    except Exception:
        # print_exc() already writes the traceback and returns None.
        traceback.print_exc()
def main(input_path="../data/4k2_far.txt", K=2):
    """Cluster the dataset into K groups and plot the resulting clusters.

    Args:
        input_path: Dataset file passed to load_dataset.
        K: Number of clusters to fit.

    Returns:
        None. Any exception is reported as a traceback on stderr and
        swallowed.
    """
    try:
        # The ground-truth labels from the file are not needed here.
        samples, _ = load_dataset(input_path)
        X = np.array(samples)
        label, loss, centroids = training(X, K)
        cluster_list = get_clusters(X, label, K)
        drawing_n_clusters(cluster_list, centroids)
    except Exception:
        # print_exc() already writes the traceback and returns None.
        traceback.print_exc()
if __name__ == "__main__":
    # Script entry point: sweep K and plot the loss curve. Swap the call
    # below for main() to cluster with a single K and plot the clusters.
    # main()
    find_K()
本站转载的文章为个人学习借鉴使用,本站对版权不负任何法律责任。如果侵犯了您的隐私权益,请联系我们删除。