import numpy as np
import os
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
import pandas as pd
import random


class KMeans:
    """The k-means algorithm (Lloyd's iteration): alternate between assigning
    each point to its nearest centroid and recomputing each centroid as the
    mean of its assigned points, until the centroids stop changing."""

    def __init__(self, n_clusters):
        self.n_clusters = n_clusters

    def close_centroid(self, point, centroids):
        """
        Get the index of the centroid closest to the given point.
        :param point: data point in shape (num_features,)
        :param centroids: centroids in shape (k, num_features)
        :return: index of the closest centroid
        """
        # 4.1.1: compute the Euclidean distance from the point to every
        # centroid; the assignment is the index of the smallest distance.
        distances = np.linalg.norm(centroids.values - point.values, axis=1)
        return int(np.argmin(distances))
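    # A one-call alternative, as a sketch, using scipy's cdist (imported
    # above): np.argmin(cdist(point.values[None, :], centroids.values))
    # returns the same index; the explicit norm is kept for readability.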

    def fit(self, data):
        """
        :param data: training data in shape (num_data, num_features)
        :return:
            centroids: cluster centroids in shape (k, num_features)
            clusters: label assignment for each data point in shape (num_data,)
        """
        data = data.copy(deep=True)
        clusters = np.zeros(len(data))
        # Drop non-feature columns if present ('cluster' may have been added
        # to the frame by a previous run).
        for non_feature in ('species', 'cluster'):
            if non_feature in data.columns:
                data.drop(non_feature, axis=1, inplace=True)
        # Randomly initialize centroids by sampling k distinct data points
        unique_rows = data.drop_duplicates()
        unique_rows.reset_index(drop=True, inplace=True)
        centroids = unique_rows.sample(n=self.n_clusters)
        centroids.reset_index(drop=True, inplace=True)
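        # (Sampling from the de-duplicated rows guards against two initial
        # centroids coinciding, which would immediately leave a cluster empty.)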
        # Initialize old centroids as a matrix of all zeros
        old_centroids = pd.DataFrame(np.zeros(shape=(self.n_clusters, data.shape[1])),
                                     columns=data.columns)
        # Repeat until convergence: compare every data point to each of the k
        # centroids and assign the point to the closest cluster
        while not old_centroids.equals(centroids):
            # Stash the old centroids so convergence can be tested
            old_centroids = centroids.copy(deep=True)
            # Compute the cluster assignment (label) for each data point:
            for row_i in range(len(data)):
                point = data.iloc[row_i]
                clusters[row_i] = self.close_centroid(point, centroids)
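            # (This loop terminates: each pass can only lower the total
            # within-cluster distance, and there are finitely many possible
            # assignments, so the centroids eventually stop changing.)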
            # 4.1.2: compute the new centroid (mean) for each cluster from the
            # values of the points now assigned to it.
            for cluster_i in range(self.n_clusters):
                members = data[clusters == cluster_i]
                if len(members) > 0:  # leave a centroid unchanged if its cluster emptied
                    centroids.iloc[cluster_i] = members.mean()
        return centroids, clusters
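
# A quick sanity check on hypothetical toy data, sketched as a comment:
#   toy = pd.DataFrame({'x': [0.0, 0.1, 10.0, 10.1]})
#   centroids, clusters = KMeans(n_clusters=2).fit(toy)
#   # expected: 0.0 and 0.1 share one label, 10.0 and 10.1 the other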

if __name__ == '__main__':
    random.seed(10)
    np.random.seed(10)  # pandas' .sample draws from numpy's global RNG, so seed it as well
    warnings.filterwarnings("ignore")
    sns.set(style="white", color_codes=True)
    # Pre-process the iris dataset.
    i_names = ['s_len', 's_wid', 'p_len', 'p_wid', 'species']
    data = pd.read_table(os.path.join(os.getcwd(), 'data/iris.data'), header=None, sep=',', names=i_names)
    # Print some information about the iris dataset.
    print("Some information about the dataset")
    print("Samples:")
    print(data.head())
    print()
    print("The dataset contains {} records and {} columns.".format(data.shape[0], data.shape[1]))
    print()
    print("The dataset contains the following 3 species:")
    print(data['species'].value_counts())
    print()
    # Run through a simple application
    number_of_clusters = 3
    kmeans = KMeans(n_clusters=number_of_clusters)
    centroids, clusters = kmeans.fit(data=data)
    # Extract the results
    data['cluster'] = clusters
    centroids['cluster'] = 'centroid'
    all_data = pd.concat([data, centroids])
    # Plot the cluster assignments and centroids ('x' marks a centroid)
    sns.FacetGrid(all_data, hue="cluster", height=5,
                  hue_kws={"marker": ["o", "o", "o", "x"]}).map(plt.scatter, "s_len", "s_wid").add_legend()
    plt.savefig("output/clustering_1.pdf")
    plt.clf()
    # Set up parameters
    losses = list()
    K = [1, 2, 3, 4, 5]
    # Test out multiple values of k
    for k in K:
        kmeans = KMeans(n_clusters=k)
        centroids, clusters = kmeans.fit(data=data)
        # Extract the results
        data['cluster'] = clusters
        # Calculate the distortion against the feature columns only
        d = data[data.columns[:-2]]  # drop the trailing 'species' and 'cluster' columns
        loss = sum(np.min(cdist(d, centroids, 'euclidean'), axis=1)) / d.shape[0]
        print("For k={}\tLoss: {}".format(k, loss))
        # Keep track of the loss for each k
        losses.append(loss)
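    # (The loss is the mean distance from each point to its nearest centroid:
    # loss = (1/N) * sum_i min_k ||x_i - mu_k||_2. With more centroids each
    # point can generally only get closer to its nearest one, so the "elbow"
    # of the curve below, rather than its minimum, is what suggests a good k.)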
    # Elbow plot
    plt.plot(K, losses, 's-', markersize=8, color='cadetblue', mec='gray')
    plt.xlabel('k')
    plt.xticks(K)
    plt.ylabel('Distortion')
    plt.title('Losses under different k')
    plt.savefig("output/clustering_2.pdf")