import plotly
import numpy as np
import plotly.plotly as py
import plotly.graph_objs as go
import matplotlib.pyplot as plt
import scipy.io as sio
import scipy as scipy
%matplotlib inline
plotly.offline.init_notebook_mode(connected=True)
from IPython.core.display import display, HTML, Markdown
# The polling here is to ensure that plotly.js has already been loaded before
# setting display alignment in order to avoid a race condition.
display(HTML(
'<script>'
'var waitForPlotly = setInterval( function() {'
'if( typeof(window.Plotly) !== "undefined" ){'
'MathJax.Hub.Config({ SVG: { font: "STIX-Web" }, displayAlign: "center" });'
'MathJax.Hub.Queue(["setRenderer", MathJax.Hub, "SVG"]);'
'clearInterval(waitForPlotly);'
'}}, 250 );'
'</script>'
))
import urllib
# Interact widget
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import pandas as pd
import sklearn
from sklearn import cluster
faithful = pd.read_csv('faithful.csv')
faithful.columns = ['eruptions', 'waiting']
faith = np.array(faithful)
faithful.head()
trace_init = go.Scatter(
x = faithful.eruptions,
y = faithful.waiting,
mode = 'markers',
marker = dict(
size = 10,
line = dict(
width = 2,
color = 'rgb(0, 0, 0)'
)
),
showlegend = False,
)
layout_init = dict(
title = 'Old Faithful Data Scatterplot (not rescaled)',
xaxis = dict(
title = 'Duration of eruptions (mns)',
ticklen = 5,
zeroline = False,
gridwidth = 2,
),
yaxis = dict(
title = 'Time span between eruptions (mns)',
ticklen = 5,
gridwidth = 2,
),
legend = dict(
orientation = 'h',
y = -0.2
)
)
plotly.offline.iplot(dict(data=[trace_init], layout=layout_init))
faith_centered = faith - faith.mean(axis=0)
cov_matrix = faith_centered.T.dot(faith_centered)/(len(faith_centered)-1)
cov_matrix
np.cov(faith_centered.T)
faith_stand = np.linalg.inv(scipy.linalg.sqrtm(cov_matrix)).dot(faith_centered.T)
np.cov(faith_stand)
trace_init = go.Scatter(
x = faith_stand[0],
y = faith_stand[1],
mode = 'markers',
marker = dict(
size = 10,
line = dict(
width = 2,
color = 'rgb(0, 0, 0)'
)
),
showlegend = False,
)
layout_init = dict(
title = 'Old Faithful Data Scatterplot (rescaled)',
xaxis = dict(
title = 'Duration of eruptions (mns)',
ticklen = 5,
zeroline = False,
gridwidth = 2,
),
yaxis = dict(
title = 'Time span between eruptions (mns)',
ticklen = 5,
gridwidth = 2,
),
legend = dict(
orientation = 'h',
y = -0.2
)
)
_plotly.offline.iplot(dict(data=[trace_init], layout=layout_init))
$$J = \sum\limits_{ n=1 }^N \sum\limits_{ k=1 }^K r_{nk} \vert x_n - μ_k \vert^2$$
E-step:
assign each data point $x_n$ to the closest cluster
M-step:
$$\frac{∂J}{∂μ_k} = \frac{∂}{∂μ_n}\left( \sum\limits_{ n=1 }^N r_{nk} \vert x_n - μ_k \vert^2\right) = -2 \sum\limits_{ n=1 }^N r_{nk} (x_n - μ_k)$$
So:
$$ \begin{align*}
& \frac{∂J}{∂μ_k} = 0 \\
⟺ \quad & -2 \sum\limits{ n=1 }^N r{nk} (x_n - μ_k) = 0 \
⟺ \quad & μk = \frac{\sum\limits{ n=1 }^N r_{nk} xn}{\sum\limits{ n=1 }^N r_{nk}}\
\end{align*}
$$
def K_means(X, K, returnClusterIndices=False, mu=None):
if mu is None:
# Random initialization of mu
mu = np.random.uniform(np.min(X), np.max(X), (K, X.shape[1]))
# list storing the J values
J_values = []
while True:
# E-step: indices of clusters to which the x_i belong (the closest ones)
pairwise_distances = np.linalg.norm(X.reshape([-1, 1, X.shape[1]]) - mu, axis=2)**2
cluster_indices = np.argmin(pairwise_distances, axis=1)
# Compute the current J and store it
J_values.append(np.sum(np.min(pairwise_distances, axis=1)))
# M-step: compute the new mu
mu = np.array([np.mean(X[cluster_indices == k], axis=0) for k in range(K)])
# Halting condition: if J doesn't change anymore
if len(J_values)>1 and J_values[-1] == J_values[-2]:
break
if returnClusterIndices:
return mu, J_values, cluster_indices
else:
return mu, J_values
K_values = [2, 2]
mus_values = [np.array([[-2, -2], [2, 2]]), np.array([[1, 1.5], [2, 2]])]
ranges = {}
data = {}
trace = {}
mus = {}
layout = {}
for K, mu_init in zip(K_values, mus_values):
X_1 = faith_stand[0]
X_2 = faith_stand[1]
ranges_x = [int(np.min(X_1)-1), int(np.max(X_1)+1)]
ranges_y = [int(np.min(X_2)-2), int(np.max(X_2)+2)]
# K-Means
mu, J_values, cluster_indices = K_means(faith_stand.T, 2, mu=mu_init, returnClusterIndices=True)
trace = go.Scatter(
x = X_1,
y = X_2,
mode = 'markers',
marker = dict(
size = 10,
color = cluster_indices,
showscale = False,
line = dict(
width = 2,
color = 'rgb(0, 0, 0)'
)
),
showlegend = False
)
# Displaying mu
mus = go.Scatter(
x = mu[:, 0],
y = mu[:, 1],
mode = 'markers',
marker = dict(
symbol = "star-diamond",
size = 15,
color = 'yellow',
line = dict(
width = 2,
color = 'black'
)
),
name = 'Means',
showlegend= True
)
# Displaying the J-values
trace_J = go.Scatter(
x = list(range(1, len(J_values)+1)),
y = J_values,
mode = 'lines',
name = 'Distortion'
)
layout_J = go.Layout(
title= 'Evolution of J (logarithmic scale)',
hovermode= 'closest',
xaxis= dict(
title= 'Iteration',
ticklen= 5,
zeroline= False,
gridwidth= 2,
),
yaxis=dict(
title= 'J-value',
type='log',
ticklen= 5,
gridwidth= 2,
),
showlegend= True
)
plotly.offline.iplot(go.Figure(data=[trace_J], layout=layout_J))
plotly.offline.iplot(go.Figure(data=[trace, mus], layout=layout_init))