
import numpy as np
import os
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
import pandas as pd

class KMeans:
    """The k-means algorithm."""

    def __init__(self, n_clusters):
        self.n_clusters = n_clusters
    def close_centroid(self, point, centroids):
        """
        Find the centroid closest to the given point.
        :param point: data point in shape (num_features,)
        :param centroids: centroids in shape (k, num_features)
        :return: the index of the closest centroid
        """
        # 4.1.1
        # Euclidean distance from the point to every centroid; the point's
        # cluster is the index of the nearest centroid.
        distances = np.linalg.norm(centroids - point, axis=1)
        return np.argmin(distances)

    def fit(self, data):
        """
        Run k-means on the given data until the centroids stop moving.
        :param data: training data in shape (num_data, num_features)
        :return:
            centroids: cluster centroids in shape (k, num_features)
            clusters: label assignment for each data point in shape (num_data,)
        """
        data = data.copy(deep=True)
        clusters = np.zeros(len(data))

        # Drop non-feature columns that may be present on the input frame
        # (the driver below adds a 'cluster' column after the first run).
        for col in ('species', 'cluster'):
            if col in data.columns:
                data.drop(col, axis=1, inplace=True)

        # Randomly initialize centroids by sampling k distinct data points
        unique_rows = data.drop_duplicates()
        unique_rows.reset_index(drop=True, inplace=True)
        centroids = unique_rows.sample(n=self.n_clusters)
        centroids.reset_index(drop=True, inplace=True)

        # Initialize old centroids as a matrix of all 0's
        old_centroids = pd.DataFrame(np.zeros(shape=(self.n_clusters, data.shape[1])),
                                     columns=data.columns)

        # Compare every data point in our dataset to each of the k means
        # and assign each point to the closest cluster
        while not old_centroids.equals(centroids):  # Repeat until convergence

            # Stash old centroids
            old_centroids = centroids.copy(deep=True)

            # Compute cluster assignment (labels) for each data point in the matrix:
            for row_i in range(len(data)):
                point = data.iloc[row_i]
                clusters[row_i] = self.close_centroid(point, centroids)

            # 4.1.2
            # Compute new centroids: the mean of the points assigned to each
            # cluster. An empty cluster keeps its previous centroid.
            for cluster_i in range(self.n_clusters):
                members = data[clusters == cluster_i]
                if len(members) > 0:
                    centroids.loc[cluster_i] = members.mean()

        return centroids, clusters
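
# The assignment step in fit() loops over rows in Python. As an aside, the same
# step can be vectorized (a sketch; `assign_clusters_vectorized` is a
# hypothetical helper, not part of the assignment skeleton): scipy's cdist
# computes all point-to-centroid distances at once, and argmin along axis 1
# picks the nearest centroid for every point.
def assign_clusters_vectorized(data, centroids):
    """Return the index of the nearest centroid for every row of `data`."""
    return np.argmin(cdist(data, centroids, 'euclidean'), axis=1)
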

if __name__ == '__main__':
    np.random.seed(10)  # pandas' .sample draws from numpy's global RNG, so seed numpy

    warnings.filterwarnings("ignore")
    sns.set(style="white", color_codes=True)

    # Pre-process the iris dataset.
    i_names = ['s_len', 's_wid', 'p_len', 'p_wid', 'species']
    data = pd.read_table(os.path.join(os.getcwd(), 'data/iris.data'), header=None, sep=',', names=i_names)

    # Print information about the iris dataset.
    print("Some information about the dataset")
    print("Samples:")
    print(data.head())
    print()
    print("The dataset contains {} records and {} columns.".format(data.shape[0], data.shape[1]))
    print()
    print("The dataset contains the following 3 species:")
    print(data['species'].value_counts())
    print()

    # Run through a simple application
    number_of_clusters = 3
    kmeans = KMeans(n_clusters=number_of_clusters)
    centroids, clusters = kmeans.fit(data=data)

    # Extract the results
    data['cluster'] = clusters
    centroids['cluster'] = 'centroid'
    all_data = pd.concat([data, centroids])

    # Plot the cluster centroids ('height' replaces the deprecated 'size'
    # FacetGrid argument in recent seaborn releases)
    os.makedirs('output', exist_ok=True)  # make sure the output directory exists
    sns.FacetGrid(all_data, hue="cluster", height=5,
                  hue_kws={"marker": ["o", "o", "o", "x"]}).map(plt.scatter, "s_len", "s_wid").add_legend()
    plt.savefig("output/clustering_1.pdf")
    plt.clf()
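
    # Optional sanity check (a sketch, not part of the original script):
    # cross-tabulate the cluster assignment against the known species labels.
    # A good clustering concentrates each species in a single cluster row.
    print(pd.crosstab(data['cluster'], data['species']))
    print()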

    # Set up parameters
    losses = list()
    K = [1, 2, 3, 4, 5]

    # Test out multiple values for k
    for k in K:
        kmeans = KMeans(n_clusters=k)
        centroids, clusters = kmeans.fit(data=data)

        # Extract the results
        data['cluster'] = clusters

        # Calculate distortion: the average distance from each point to its
        # nearest centroid. `centroids` holds feature columns only (fit drops
        # the 'species' and 'cluster' columns), so no slicing is needed.
        d = data[data.columns[:-2]]
        loss = sum(np.min(cdist(d, centroids, 'euclidean'), axis=1)) / d.shape[0]

        print("For k={}\tLoss: {}".format(k, loss))

        # Keep track of cluster size metrics
        losses.append(loss)

    # Elbow plot
    plt.plot(K, losses, 's-', markersize=8, color='cadetblue', mec='gray')
    plt.xlabel('k')
    plt.xticks(K)
    plt.ylabel('Distortion')
    plt.title('Losses under different k')
    plt.savefig("output/clustering_2.pdf")
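
    # k-means converges to a local optimum that depends on the random
    # initialization. A minimal sketch of the usual remedy (restarts; the
    # variable names below are illustrative, not part of the assignment):
    # refit several times and keep the run with the lowest distortion.
    best_loss, best_centroids, best_clusters = np.inf, None, None
    feats = data[i_names[:-1]]
    for _ in range(5):
        c, cl = KMeans(n_clusters=3).fit(data=data)
        run_loss = sum(np.min(cdist(feats, c, 'euclidean'), axis=1)) / len(feats)
        if run_loss < best_loss:
            best_loss, best_centroids, best_clusters = run_loss, c, cl
    print("Best of 5 restarts: loss = {}".format(best_loss))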