Unsupervised learning 2 – Manifold learning
In manifold learning the goal is to find a lower-dimensional ($\mathbb{R}^{\hat{D}}$) manifold that represents the
data in a high-dimensional ($\mathbb{R}^{D}$) space so that $\hat{D} \ll D$. Since we often want to visualize the
data somehow, the dimension $\hat{D}$ is often 1, 2 or 3.
Many of these methods can be found at https://scikit-learn.org/stable/modules/manifold.html
Multidimensional Scaling (MDS)
Highly popular in many fields.
$\mathbb{R}^{\hat{D}}$
$\mathbb{R}^{D}$, $\hat{D} \ll D$
$\hat{D}$
lec07_manifold_learning http://localhost:8888/nbconvert/html/lec07_manifold_l...
1 of 10 9/20/21, 15:47
https://scikit-learn.org/stable/modules/manifold.html
https://scikit-learn.org/stable/modules/manifold.html
In [19]:
from collections import OrderedDict
from functools import partial
from time import time
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.ticker import NullFormatter
from sklearn import manifold, datasets, random_projection
# --- MDS on the S-curve -------------------------------------------------
# Sample a noisy S-curve; `color` is the per-point value used to colour
# every scatter plot below.
n_points = 500
X, color = datasets.make_s_curve(n_points, random_state=0, noise=0.1)
n_neighbors = 10
n_components = 2
# Create figure
fig = plt.figure() #fig = plt.figure(figsize=(15, 8))
#fig.suptitle("Manifold Learning with %i points, %i neighbors"
# % (1000, n_neighbors), fontsize=14)
# Add 3d scatter plot
#ax = fig.add_subplot(projection='3d')
#ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=color, cmap=plt.cm.Spectral)
#ax.view_init(4, -72)
# Show the curve using only columns 0 and 2 of X.
plt.scatter(X[:, 0], X[:, 2], c=color, cmap=plt.cm.Spectral)
plt.show()

# 2-D data set built from columns 0 and 2 (one row per sample).
X2d = X[:, [0, 2]]
print(X2d.shape)

# Embed the 2-D curve into one dimension; random_state is fixed so the
# result is repeatable.
mds = manifold.MDS(n_components=1, max_iter=1000, n_init=10, random_state=666)
# Plot results
t0 = time()
Y = mds.fit_transform(X2d)
t1 = time()
print("%s: %.2g sec" % ("MDS", t1 - t0))
#ax = fig.add_subplot(2, 5, 2 + i + (i > 3))
# The 1-D embedding is drawn along the x axis with all y values at zero.
plt.scatter(Y, np.zeros(Y.size), c=color, cmap=plt.cm.Spectral)
#ax.set_title("%s (%.2g sec)" % (label, t1 - t0))
#ax.xaxis.set_major_formatter(NullFormatter())
#ax.yaxis.set_major_formatter(NullFormatter())
plt.axis('tight')
plt.show()

## Next line to silence pyflakes. This import is needed.
Axes3D
# Full 3-D view of the same data.
fig = plt.figure() #fig = plt.figure(figsize=(15, 8))
ax = fig.add_subplot(projection='3d')
ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=color, cmap=plt.cm.Spectral)
ax.view_init(4, -72)
plt.show()

# Embed the 3-D data into two dimensions (no random_state here, so runs
# may differ).
mds = manifold.MDS(n_components=2, max_iter=1000, n_init=10)
# Plot results
t0 = time()
Y = mds.fit_transform(X)
lec07_manifold_learning http://localhost:8888/nbconvert/html/lec07_manifold_l…
2 of 10 9/20/21, 15:47
(500, 2)
MDS: 1.2 sec
MDS: 2.5 sec
# Fit the axis limits tightly around the plotted data, then render.
plt.axis('tight')
plt.show()
lec07_manifold_learning http://localhost:8888/nbconvert/html/lec07_manifold_l…
3 of 10 9/20/21, 15:47
Random Projections
Often surprisingly powerful, but the result depends on the random seed.
Random: 0.0019 sec
Self-organizing Map (SOM)
The main usage is data visualization, but this method is surprisingly powerful and
deserves to be implemented in the scikit-learn package.
Super implementation (Matlab) http://www.cis.hut.fi/somtoolbox/
Python implementation https://pypi.org/project/sklearn-som/
In [23]:
# Re-run multiple times: no random_state is passed, so each run draws a
# new random projection.
t0 = time()
rp = random_projection.GaussianRandomProjection(n_components=2)
Y = rp.fit_transform(X)
t1 = time()
print("%s: %.2g sec" % ("Random", t1 - t0))
plt.scatter(Y[:, 0], Y[:, 1], c=color, cmap=plt.cm.Spectral)
#ax.set_title("%s (%.2g sec)" % (label, t1 - t0))
# Hide the tick labels on the axes that plt.scatter just drew on. The
# module-level `ax` still points at the earlier 3-D axes, so the current
# axes are fetched explicitly instead of reusing it.
current_ax = plt.gca()
current_ax.xaxis.set_major_formatter(NullFormatter())
current_ax.yaxis.set_major_formatter(NullFormatter())
plt.axis('tight')
plt.show()
lec07_manifold_learning http://localhost:8888/nbconvert/html/lec07_manifold_l…
4 of 10 9/20/21, 15:47
http://www.cis.hut.fi/somtoolbox/
http://www.cis.hut.fi/somtoolbox/
https://pypi.org/project/sklearn-som/
https://pypi.org/project/sklearn-som/
/project/sklearn-som/
In [24]:
# Re-run multiple times !!
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.ticker import NullFormatter
from sklearn import datasets
# Import custom SOM implementation
import importlib.util
spec = importlib.util.spec_from_file_location(“sklearn_som.som”, “/home/kamarain/Work/ext/s
sklearn_som = importlib.util.module_from_spec(spec)
spec.loader.exec_module(sklearn_som)
# --- SOM on the 2-D S-curve ---------------------------------------------
n_points = 1000
X, color = datasets.make_s_curve(n_points, random_state=0, noise=0.1)
n_neighbors = 10
n_components = 2
# Keep only columns 0 and 2 so the data is two-dimensional.
X2d = X[:, [0, 2]]
# Create figure
fig = plt.figure()
plt.scatter(X2d[:, 0], X2d[:, 1], c=color, cmap=plt.cm.Spectral)
plt.show()

# A 1 x 13 map over 2-D inputs; samples are presented in their stored
# order (shuffle=False).
som = sklearn_som.SOM(m=1, n=13, dim=2)
som.fit(X2d, shuffle=False)
#bmus = som.predict(X2d)
#X_som = som._locations[bmus,:]
X_w = som.weights

# Overlay the map weights on the data: a black line joining the rows of
# X_w in order, plus a black dot at each row.
plt.scatter(X2d[:, 0], X2d[:, 1], c=color, cmap=plt.cm.Spectral)
plt.plot(X_w[:, 0], X_w[:, 1], 'k-')
plt.plot(X_w[:, 0], X_w[:, 1], 'ko')
plt.show()
lec07_manifold_learning http://localhost:8888/nbconvert/html/lec07_manifold_l…
5 of 10 9/20/21, 15:47
https://pypi.org/project/sklearn-som/
https://pypi.org/project/sklearn-som/
lec07_manifold_learning http://localhost:8888/nbconvert/html/lec07_manifold_l…
6 of 10 9/20/21, 15:47
In [25]:
# THIS CODE IS MODIFIED VERSION OF SCIKIT-LEARN CODE
# Original Authors: Fabian Pedregosa
# Olivier Grisel
# Gael Varoquaux
# License: BSD 3 clause (C) INRIA 2011
#from time import time
#import numpy as np
#import matplotlib.pyplot as plt
from matplotlib import offsetbox
#from sklearn import (manifold, datasets, decomposition, ensemble,
# discriminant_analysis, random_projection, neighbors)
#import importlib.util
#spec = importlib.util.spec_from_file_location(“sklearn_som.som”, “/home/kamarain/Work/ext/
#sklearn_som = importlib.util.module_from_spec(spec)
#spec.loader.exec_module(sklearn_som)
# Load the digits data restricted to six classes; X holds the feature
# rows and y the matching targets.
n_neighbors = 30
digits = datasets.load_digits(n_class=6)
X, y = digits.data, digits.target
n_samples, n_features = X.shape
print(X.shape)
# ----------------------------------------------------------------------
# Scale and visualize the embedding vectors
def plot_embedding(X, title=None, min_dist_sq=4e-3):
    """Plot a 2-D embedding as coloured digit labels with image thumbnails.

    Each row of ``X`` is rescaled per column to the unit interval and drawn
    as the text of its label. Thumbnails of the corresponding digit images
    are added for points that are not too close to an already shown one.

    The function reads two module-level names: ``y`` (label per row, used
    for the text and its colour) and ``digits`` (its ``images[i]`` is used
    as the thumbnail for row ``i``).

    Parameters
    ----------
    X : array
        Embedding coordinates, one row per sample; columns 0 and 1 are
        plotted.
    title : str or None
        Figure title; no title is set when ``None``.
    min_dist_sq : float
        Minimum squared distance (in the rescaled coordinates) between two
        thumbnails. Change this to plot more or fewer digit examples.
    """
    x_min, x_max = np.min(X, 0), np.max(X, 0)
    span = x_max - x_min
    # A constant column would otherwise divide by zero and turn every
    # coordinate in that column into NaN.
    span = np.where(span == 0, 1, span)
    X = (X - x_min) / span

    plt.figure()
    ax = plt.subplot(111)
    for i, point in enumerate(X):
        plt.text(point[0], point[1], str(y[i]),
                 color=plt.cm.Set1(y[i] / 10.),
                 fontdict={'weight': 'bold', 'size': 9})

    if hasattr(offsetbox, 'AnnotationBbox'):
        # only print thumbnails with matplotlib > 1.0
        shown_images = np.array([[1., 1.]])  # just something big
        for i, point in enumerate(X):
            dist = np.sum((point - shown_images) ** 2, 1)
            if np.min(dist) < min_dist_sq:
                # don't show points that are too close
                continue
            shown_images = np.r_[shown_images, [point]]
            imagebox = offsetbox.AnnotationBbox(
                offsetbox.OffsetImage(digits.images[i], cmap=plt.cm.gray_r),
                point)
            ax.add_artist(imagebox)

    # Remove the axis ticks.
    plt.xticks([])
    plt.yticks([])
    if title is not None:
        plt.title(title)
# ----------------------------------------------------------------------
# Plot images of the digits
lec07_manifold_learning http://localhost:8888/nbconvert/html/lec07_manifold_l...
7 of 10 9/20/21, 15:47
plt.imshow(img, cmap=plt.cm.binary)
plt.xticks([])
plt.yticks([])
plt.title('Esimerkkejä 8x8 Digits-datajoukosta')
plt.show()
# ----------------------------------------------------------------------
# Random 2D projection using a random unitary matrix
print("Computing random projection")
#rp = random_projection.SparseRandomProjection(n_components=2, random_state=42)
# NOTE(review): this line was cut off right after "random_state" in the
# exported notebook. The value 42 is taken from the commented-out
# SparseRandomProjection alternative on the previous line - confirm
# against the original notebook.
rp = random_projection.GaussianRandomProjection(n_components=2, random_state=42)
X_projected = rp.fit_transform(X)
plot_embedding(X_projected, "Satunnaisprojektio")
print(X_projected)
#from pprint import pprint
#pprint(vars(X_projected))
plt.show()
# ----------------------------------------------------------------------
# MDS embedding of the digits dataset
print("Computing MDS embedding")
# Timestamp taken before fitting; it is not read again in this section.
t0 = time()
# Single initialisation and at most 100 iterations.
clf = manifold.MDS(n_components=2, n_init=1, max_iter=100)
X_mds = clf.fit_transform(X)
print("Done. Stress: %f" % clf.stress_)
plot_embedding(X_mds, "MDS-projektio")
plt.show()
# ----------------------------------------------------------------------
# SOM embedding of the digits dataset
print("Computing SOM embedding")
print(X.size)
# 20 x 20 map. The input dimension is read from the data instead of being
# hard-coded, so the map always matches the number of columns in X.
som = sklearn_som.SOM(m=20, n=20, dim=X.shape[1])
t0 = time()
som.fit(X)
# predict() gives one index per sample; the indices select rows of the
# map's `_locations` table, which become the 2-D plotting coordinates.
# NOTE(review): `_locations` is a private attribute of the SOM class -
# confirm it is still available when upgrading the SOM implementation.
bmus = som.predict(X)
X_som = som._locations[bmus, :]
#from pprint import pprint
#pprint(vars(som))
#print(X_som)
print("Done.")
plot_embedding(X_som, "SOM-kartta")
plt.show()
# LATEEEEEEEER
## ----------------------------------------------------------------------
## t-SNE embedding of the digits dataset
#print("Computing t-SNE embedding")
#tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)
lec07_manifold_learning http://localhost:8888/nbconvert/html/lec07_manifold_l...
8 of 10 9/20/21, 15:47
(1083, 64)
Computing random projection
[[ -68.26635221 0.30075222]
[ -65.64810565 3.01885825]
[ -73.54228309 21.47372818]
...
[-106.99211851 33.18923775]
[-113.40774221 26.87927004]
[ -88.78495966 7.37088605]]
Computing MDS embedding
Done. Stress: 150988651.554803
lec07_manifold_learning http://localhost:8888/nbconvert/html/lec07_manifold_l...
9 of 10 9/20/21, 15:47
Computing SOM embedding
69312
Done.
References
• A.R. Webb and K.D. Copsey: Statistical Pattern Recognition, 3rd ed, 2011. Chapters
10-11
• T. Hastie and R. Tibshirani and J. Friedman: The Elements of Statistical Learning, 2009,
Springer. Chapter 15
• T. Kohonen (2014): MATLAB Implementations and Applications of the Self-Organizing
Map (PDF: http://docs.unigrafia.fi/publications/kohonen_teuvo/)
lec07_manifold_learning http://localhost:8888/nbconvert/html/lec07_manifold_l...
10 of 10 9/20/21, 15:47
http://docs.unigrafia.fi/publications/kohonen_teuvo/
http://docs.unigrafia.fi/publications/kohonen_teuvo/