Unsupervised learning
In unsupervised learning our goal is to find structure in unlabelled data. In machine
learning terms, that means we only have the samples $\vec{x}_i$ without the target values $y_i$. For
example, could you say anything about the CIFAR-10 images without the class labels?
Example 5.1 Floo Powder is used by wizards (see the Harry Potter saga), and one Floo
Powder company wants to find out why certain customers cancel their monthly
Floo Powder delivery contract. Provide them with a reply based on your analysis.
→xi yi
lec06_clustering http://localhost:8888/nbconvert/html/lec06_clustering….
1 of 12 9/15/21, 16:17
In [27]:
import time
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.metrics.pairwise import pairwise_distances_argmin
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs
# Generate sample data: three 2-D Gaussian groups of customers.
# np.random.seed(13)  # alternative seed
np.random.seed(42)  # fixed seed so the generated samples are reproducible

# Cluster means as [age, annual income].
mu1 = [15, 1000]
mu2 = [42, 40000]
mu3 = [90, 26000]

# Number of samples drawn from each cluster.
N1 = 300
N2 = 150
N3 = 290

# Diagonal covariance matrices (variances written as std**2).
cov1 = [[2**2, 0], [0, 250**2]]
cov2 = [[13**2, 0], [0, 4000**2]]
cov3 = [[8**2, 0], [0, 3000**2]]

# multivariate_normal returns (N, 2); transpose to (2, N) so that
# row 0 holds the ages and row 1 the incomes.
X1 = np.random.multivariate_normal(mu1, cov1, N1).T
X2 = np.random.multivariate_normal(mu2, cov2, N2).T
X3 = np.random.multivariate_normal(mu3, cov3, N3).T
X = np.concatenate((X1, X2, X3), axis=1)

# Scatter plot of all samples, without any cluster information.
plt.figure(figsize=(10, 4))
plt.plot(X[0, :], X[1, :], 'w', markerfacecolor='black', marker='.')
plt.xlabel('age [y]')
plt.ylabel('annual income [Sickles]')
plt.show()
lec06_clustering http://localhost:8888/nbconvert/html/lec06_clustering….
2 of 12 9/15/21, 16:17
Clustering
The idea in clustering is to find those “clusters” in data automatically.
K-means clustering
In [28]:
# Plot the samples again and mark the three true cluster means on top.
plt.figure(figsize=(10, 4))
plt.plot(X[0, :], X[1, :], 'w', markerfacecolor='black', marker='.')
plt.xlabel('age [y]')
plt.ylabel('annual income [Sickles]')

# NOTE(review): the three marker calls and the three text calls below were
# cut off at the right margin in the source. Any keyword arguments that
# followed markeredgecolor / color are unknown and are omitted here - confirm
# against the original notebook.
plt.plot(mu1[0], mu1[1], 'o', markerfacecolor='black', markeredgecolor='k')
plt.plot(mu2[0], mu2[1], 'o', markerfacecolor='black', markeredgecolor='k')
plt.plot(mu3[0], mu3[1], 'o', markerfacecolor='black', markeredgecolor='k')
plt.axis()

# Annotate each mean with its coordinates, offset slightly from the marker.
plt.text(mu1[0] + 5, mu1[1] + 1500, '(%.1f, %.1f)' % (mu1[0], mu1[1]), color='red')
plt.text(mu2[0] + 5, mu2[1] + 1500, '(%.1f, %.1f)' % (mu2[0], mu2[1]), color='red')
plt.text(mu3[0] + 5, mu3[1] + 1500, '(%.1f, %.1f)' % (mu3[0], mu3[1]), color='red')
plt.show()
lec06_clustering http://localhost:8888/nbconvert/html/lec06_clustering….
3 of 12 9/15/21, 16:17
In [34]:
# LET'S TEST K-MEANS - TEST n_iters = 1, 2, ...
n_iters = 1000
from sklearn.datasets import make_blobs
from sklearn.datasets import make_moons
# A suitable seed for nice pictures
#np.random.seed(666) # very bad
#np.random.seed(3) # very good
#np.random.seed(4) # super good

# Generate sample data: easy
centers = [[0, 0], [1, 0], [1, 1]]
n_clusters = len(centers)
# NOTE(review): this call was cut off after "cluster_std=[0.1" in the source.
# A standard deviation of 0.1 is assumed for all three clusters, and any
# further arguments are unknown - confirm against the original notebook.
X, labels_true = make_blobs(n_samples=3000, centers=centers,
                            cluster_std=[0.1, 0.1, 0.1])

# Compute clustering with K-Means (random initialisation, a single run)
k_means = KMeans(init='random', n_clusters=3, max_iter=n_iters, n_init=1)
k_means.fit(X)

# Plot result
fig = plt.figure(figsize=(4, 3))
#fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)
colors = ['#4EACC5', '#FF9C34', '#4E9A06']
#colors = ['cyan', 'magenta', 'yellow']
k_means_cluster_centers = k_means.cluster_centers_
# Assign every sample to its nearest estimated centre.
k_means_labels = pairwise_distances_argmin(X, k_means_cluster_centers)
for k, col in zip(range(n_clusters), colors):
    my_members = k_means_labels == k
    cluster_center = k_means_cluster_centers[k]
    # Cluster members as small dots, the cluster centre as a larger circle.
    plt.plot(X[my_members, 0], X[my_members, 1], 'w',
             markerfacecolor=col, marker='.')
    plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=6)
plt.title('K-means %2d iteraation jälkeen' % n_iters)
plt.axis('equal')
plt.show()
lec06_clustering http://localhost:8888/nbconvert/html/lec06_clustering….
4 of 12 9/15/21, 16:17
In [36]:
from sklearn.datasets import make_blobs
from sklearn.datasets import make_moons
# A suitable seed for nice pictures
#np.random.seed(666) # very bad
#np.random.seed(3) # very good
#np.random.seed(4) # super good

# Generate sample data: easy
centers = [[0, 0], [1, 0], [1, 1]]
n_clusters = len(centers)
# NOTE(review): this call was cut off after "cluster_std=[0.1" in the source.
# A standard deviation of 0.1 is assumed for all three clusters, and any
# further arguments are unknown - confirm against the original notebook.
X, labels_true = make_blobs(n_samples=3000, centers=centers,
                            cluster_std=[0.1, 0.1, 0.1])

# Compute clustering with K-Means (random initialisation, a single run)
n_iters = 1000
k_means = KMeans(init='random', n_clusters=3, max_iter=n_iters, n_init=1)
k_means.fit(X)

# Plot result
fig = plt.figure(figsize=(4, 3))
#fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)
colors = ['#4EACC5', '#FF9C34', '#4E9A06']
#colors = ['cyan', 'magenta', 'yellow']
k_means_cluster_centers = k_means.cluster_centers_
# Assign every sample to its nearest estimated centre.
k_means_labels = pairwise_distances_argmin(X, k_means_cluster_centers)
for k, col in zip(range(n_clusters), colors):
    my_members = k_means_labels == k
    cluster_center = k_means_cluster_centers[k]
    # Cluster members as small dots, the cluster centre as a larger circle.
    plt.plot(X[my_members, 0], X[my_members, 1], 'w',
             markerfacecolor=col, marker='.')
    plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=6)
plt.title('K-means %2d iteraation jälkeen' % n_iters)
plt.axis('equal')
plt.show()

# Generate sample data: difficult
# The blobs are passed through a linear transformation, which stretches
# them into elongated (anisotropic) clusters.
X, labels_true = make_blobs(n_samples=500, random_state=170)
transformation = [[0.6, -0.6], [-0.4, 0.8]]
X_aniso = np.dot(X, transformation)
X = X_aniso

# Compute clustering with K-Means
n_iters = 1000
k_means = KMeans(init='random', n_clusters=3, max_iter=n_iters, n_init=1)
k_means.fit(X)
lec06_clustering http://localhost:8888/nbconvert/html/lec06_clustering….
5 of 12 9/15/21, 16:17
# Plot the K-Means result for the current data set.
# NOTE(review): the lines that precede this point in the original cell
# (possibly a plt.figure call) seem to have been lost at the page break.
k_means_cluster_centers = k_means.cluster_centers_
k_means_labels = pairwise_distances_argmin(X, k_means_cluster_centers)
for k, col in zip(range(n_clusters), colors):
    my_members = k_means_labels == k
    cluster_center = k_means_cluster_centers[k]
    # Cluster members as small dots, the cluster centre as a larger circle.
    plt.plot(X[my_members, 0], X[my_members, 1], 'w',
             markerfacecolor=col, marker='.')
    plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=6)
plt.title('K-means %2d iteraation jälkeen' % n_iters)
plt.axis('equal')
plt.show()

# Generate sample data: difficult 2
# NOTE(review): this call was cut off after "cluster_std=[" in the source.
# The values below are taken from the unequal-variance case of the
# scikit-learn "k-means assumptions" example, which uses the same n_samples
# and random_state - confirm against the original notebook.
X, labels_true = make_blobs(n_samples=1500, random_state=170,
                            cluster_std=[1.0, 2.5, 0.5])

# Compute clustering with K-Means
n_iters = 1000
k_means = KMeans(init='random', n_clusters=3, max_iter=n_iters, n_init=1)
k_means.fit(X)

# Plot result
fig = plt.figure(figsize=(4, 3))
#fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)
colors = ['#4EACC5', '#FF9C34', '#4E9A06']
#colors = ['cyan', 'magenta', 'yellow']
k_means_cluster_centers = k_means.cluster_centers_
k_means_labels = pairwise_distances_argmin(X, k_means_cluster_centers)
for k, col in zip(range(n_clusters), colors):
    my_members = k_means_labels == k
    cluster_center = k_means_cluster_centers[k]
    plt.plot(X[my_members, 0], X[my_members, 1], 'w',
             markerfacecolor=col, marker='.')
    plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=6)
plt.title('K-means %2d iteraation jälkeen' % n_iters)
plt.axis('equal')
plt.show()

# Generate sample data: difficult 3
X, labels_true = make_moons(n_samples=1500, noise=0.05)
lec06_clustering http://localhost:8888/nbconvert/html/lec06_clustering….
6 of 12 9/15/21, 16:17
# Plot the K-Means result for the two-moons data (two clusters).
# NOTE(review): the statements that fit a K-Means model to the moons data
# (and create the figure) seem to have been lost at the page break. As
# written, k_means still refers to whatever model was fitted last - restore
# the fit before running this part.
k_means_cluster_centers = k_means.cluster_centers_
k_means_labels = pairwise_distances_argmin(X, k_means_cluster_centers)
for k, col in zip(range(2), colors):
    my_members = k_means_labels == k
    cluster_center = k_means_cluster_centers[k]
    # Cluster members as small dots, the cluster centre as a larger circle.
    plt.plot(X[my_members, 0], X[my_members, 1], 'w',
             markerfacecolor=col, marker='.')
    plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=6)
plt.title('K-means %2d iteraation jälkeen' % n_iters)
plt.axis('equal')
plt.show()
lec06_clustering http://localhost:8888/nbconvert/html/lec06_clustering….
7 of 12 9/15/21, 16:17
Hierarchical clustering
lec06_clustering http://localhost:8888/nbconvert/html/lec06_clustering….
8 of 12 9/15/21, 16:17
In [46]:
# LET'S PLAY WITH AGGLOMERATIVE CLUSTERING AND ITS LINKAGE VALUES (TRY THEM ALL)
agglo_linkage = 'ward'  # single / ward / complete (max) / average
from sklearn.cluster import AgglomerativeClustering
# A suitable seed for nice pictures
#np.random.seed(666) # very bad
#np.random.seed(3) # very good
#np.random.seed(4) # super good

# Generate sample data: easy
centers = [[0, 0], [1, 0], [1, 1]]
n_clusters = len(centers)
# NOTE(review): this call was cut off after "cluster_std=[0.1" in the source.
# A standard deviation of 0.1 is assumed for all three clusters, and any
# further arguments are unknown - confirm against the original notebook.
X, labels_true = make_blobs(n_samples=3000, centers=centers,
                            cluster_std=[0.1, 0.1, 0.1])

# Compute clustering with hierarchical agglomerative clustering
# NOTE(review): the value passed to "connectivity" was cut off in the source.
# None (the scikit-learn default) is assumed here and in the call further
# down - confirm against the original notebook.
agglo = AgglomerativeClustering(n_clusters=3, linkage=agglo_linkage,
                                connectivity=None)
agglo.fit(X)
y_pred = agglo.labels_.astype(int)

# Plot result
fig = plt.figure(figsize=(4, 3))
#fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)
colors = ['#4EACC5', '#FF9C34', '#4E9A06']
#colors = ['cyan', 'magenta', 'yellow']
for k, col in zip(range(n_clusters), colors):
    my_members = y_pred == k
    plt.plot(X[my_members, 0], X[my_members, 1], 'w',
             markerfacecolor=col, marker='.')
    #plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
    #         markeredgecolor='k', markersize=6)
plt.title(f'Agglomerative hierarchical clustering (linkage={agglo_linkage})')
plt.axis('equal')
plt.show()

# Generate sample data: difficult
# The blobs are passed through a linear transformation, which stretches
# them into elongated (anisotropic) clusters.
X, labels_true = make_blobs(n_samples=500, random_state=170)
transformation = [[0.6, -0.6], [-0.4, 0.8]]
X_aniso = np.dot(X, transformation)
X = X_aniso

# Compute clustering with hierarchical agglomerative clustering
agglo = AgglomerativeClustering(n_clusters=3, linkage=agglo_linkage,
                                connectivity=None)
agglo.fit(X)
y_pred = agglo.labels_.astype(int)
#
lec06_clustering http://localhost:8888/nbconvert/html/lec06_clustering….
9 of 12 9/15/21, 16:17
# Plot the agglomerative clustering result for the current data set.
# NOTE(review): the lines that precede this loop in the original cell
# (possibly a plt.figure call) seem to have been lost at the page break.
for k, col in zip(range(n_clusters), colors):
    my_members = y_pred == k
    plt.plot(X[my_members, 0], X[my_members, 1], 'w',
             markerfacecolor=col, marker='.')
    #plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
    #         markeredgecolor='k', markersize=6)
plt.title(f'Agglomerative hierarchical clustering (linkage={agglo_linkage})')
plt.axis('equal')
plt.show()

# Generate sample data: difficult 2
# NOTE(review): this call was cut off after "cluster_std=[" in the source.
# The values below are taken from the unequal-variance case of the
# scikit-learn "k-means assumptions" example, which uses the same n_samples
# and random_state - confirm against the original notebook.
X, labels_true = make_blobs(n_samples=1500, random_state=170,
                            cluster_std=[1.0, 2.5, 0.5])

# Compute clustering with hierarchical agglomerative clustering
# NOTE(review): the value passed to "connectivity" was cut off in the source.
# None (the scikit-learn default) is assumed here and in the call further
# down - confirm against the original notebook.
agglo = AgglomerativeClustering(n_clusters=3, linkage=agglo_linkage,
                                connectivity=None)
agglo.fit(X)
y_pred = agglo.labels_.astype(int)

# Plot result
fig = plt.figure(figsize=(4, 3))
#fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)
colors = ['#4EACC5', '#FF9C34', '#4E9A06']
#colors = ['cyan', 'magenta', 'yellow']
for k, col in zip(range(n_clusters), colors):
    my_members = y_pred == k
    plt.plot(X[my_members, 0], X[my_members, 1], 'w',
             markerfacecolor=col, marker='.')
    #plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
    #         markeredgecolor='k', markersize=6)
plt.title(f'Agglomerative hierarchical clustering (linkage={agglo_linkage})')
plt.axis('equal')
plt.show()

# Generate sample data: difficult 3
X, labels_true = make_moons(n_samples=1500, noise=0.05)

# Compute clustering with hierarchical agglomerative clustering
# (two clusters for the two moons)
agglo = AgglomerativeClustering(n_clusters=2, linkage=agglo_linkage,
                                connectivity=None)
agglo.fit(X)
y_pred = agglo.labels_.astype(int)
lec06_clustering http://localhost:8888/nbconvert/html/lec06_clustering….
10 of 12 9/15/21, 16:17
# markeredgecolor='k', markersize=6)
# NOTE(review): the line above is the tail of a commented-out plt.plot call.
# The figure setup and the plotting loop for the two-moons result seem to
# have been lost at the page break - restore them before running this part.
plt.title(f'Agglomerative hierarchical clustering (linkage={agglo_linkage})')
plt.axis('equal')
plt.show()
lec06_clustering http://localhost:8888/nbconvert/html/lec06_clustering….
11 of 12 9/15/21, 16:17
References
• A.R. Webb and K.D. Copsey: Statistical Pattern Recognition, 3rd ed., 2011. Chapters
10-11
• T. Hastie and R. Tibshirani and J. Friedman: The Elements of Statistical Learning, 2009,
Springer. Chapter 15
lec06_clustering http://localhost:8888/nbconvert/html/lec06_clustering….
12 of 12 9/15/21, 16:17