In [56]:
import plotly
import numpy as np
import plotly.plotly as py
import plotly.graph_objs as go

import matplotlib.pyplot as plt
import scipy.io as sio
import scipy as scipy

%matplotlib inline

plotly.offline.init_notebook_mode(connected=True)
from IPython.core.display import display, HTML, Markdown
# The polling here is to ensure that plotly.js has already been loaded before
# setting display alignment in order to avoid a race condition.
display(HTML(
    '<script>'
        'var waitForPlotly = setInterval( function() {'
            'if( typeof(window.Plotly) !== "undefined" ){'
                'MathJax.Hub.Config({ SVG: { font: "STIX-Web" }, displayAlign: "center" });'
                'MathJax.Hub.Queue(["setRenderer", MathJax.Hub, "SVG"]);'
                'clearInterval(waitForPlotly);'
            '}}, 250 );'
    '</script>'
))

import urllib

# Interact widget
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets


import pandas as pd
import sklearn
from sklearn import cluster

Problem 1. K-means clustering

a.

In [57]:
faithful = pd.read_csv('faithful.csv')
faithful.columns = ['eruptions', 'waiting']
faith = np.array(faithful)

faithful.head()
Out[57]:
eruptions waiting
0 3.600 79
1 1.800 54
2 3.333 74
3 2.283 62
4 4.533 85
In [58]:
trace_init = go.Scatter(
    x = faithful.eruptions,
    y = faithful.waiting,
    mode = 'markers',
    marker = dict(
        size = 10,
        line = dict(
            width = 2,
            color = 'rgb(0, 0, 0)'
        )
    ),
    showlegend = False,
)

layout_init = dict(
    title = 'Old Faithful Data Scatterplot (not rescaled)',
    xaxis = dict(
        title = 'Duration of eruptions (mns)',
        ticklen = 5,
        zeroline = False,
        gridwidth = 2,
        ),
    yaxis = dict(
        title = 'Time span between eruptions (mns)',
        ticklen = 5,
        gridwidth = 2,
        ),
    legend = dict(
        orientation = 'h',
        y = -0.2
    )
)


plotly.offline.iplot(dict(data=[trace_init], layout=layout_init))
In [59]:
faith_centered = faith - faith.mean(axis=0)
cov_matrix = faith_centered.T.dot(faith_centered)/(len(faith_centered)-1)
cov_matrix
Out[59]:
array([[   1.30272833,   13.97780785],
       [  13.97780785,  184.82331235]])
In [60]:
np.cov(faith_centered.T)
Out[60]:
array([[   1.30272833,   13.97780785],
       [  13.97780785,  184.82331235]])
In [63]:
faith_stand = np.linalg.inv(scipy.linalg.sqrtm(cov_matrix)).dot(faith_centered.T)
np.cov(faith_stand)
Out[63]:
array([[  1.00000000e+00,  -4.54904667e-15],
       [ -4.54904667e-15,   1.00000000e+00]])
In [64]:
trace_init = go.Scatter(
    x = faith_stand[0],
    y = faith_stand[1],
    mode = 'markers',
    marker = dict(
        size = 10,
        line = dict(
            width = 2,
            color = 'rgb(0, 0, 0)'
        )
    ),
    showlegend = False,
)

layout_init = dict(
    title = 'Old Faithful Data Scatterplot (rescaled)',
    xaxis = dict(
        title = 'Duration of eruptions (mns)',
        ticklen = 5,
        zeroline = False,
        gridwidth = 2,
        ),
    yaxis = dict(
        title = 'Time span between eruptions (mns)',
        ticklen = 5,
        gridwidth = 2,
        ),
    legend = dict(
        orientation = 'h',
        y = -0.2
    )
)


_plotly.offline.iplot(dict(data=[trace_init], layout=layout_init))

$$J = \sum\limits_{ n=1 }^N \sum\limits_{ k=1 }^K r_{nk} \vert x_n - μ_k \vert^2$$

  • E-step:

    assign each data point $x_n$ to the closest cluster

  • M-step:

    $$\frac{∂J}{∂μ_k} = \frac{∂}{∂μ_n}\left( \sum\limits_{ n=1 }^N r_{nk} \vert x_n - μ_k \vert^2\right) = -2 \sum\limits_{ n=1 }^N r_{nk} (x_n - μ_k)$$

    So:

    $$ \begin{align*}

      & \frac{∂J}{∂μ_k} = 0 \\
    

    ⟺ \quad & -2 \sum\limits{ n=1 }^N r{nk} (x_n - μ_k) = 0 \ ⟺ \quad & μk = \frac{\sum\limits{ n=1 }^N r_{nk} xn}{\sum\limits{ n=1 }^N r_{nk}}\
    \end{align*} $$

In [87]:
def K_means(X, K, returnClusterIndices=False, mu=None):
    if mu is None:
        # Random initialization of mu
        mu = np.random.uniform(np.min(X), np.max(X), (K, X.shape[1]))
        
    # list storing the J values
    J_values = []
    
    while True:                        
        
            # E-step: indices of clusters to which the x_i belong (the closest ones)
            pairwise_distances = np.linalg.norm(X.reshape([-1, 1, X.shape[1]]) - mu, axis=2)**2
            cluster_indices = np.argmin(pairwise_distances, axis=1)
            
            # Compute the current J and store it
            J_values.append(np.sum(np.min(pairwise_distances, axis=1)))
                        
            # M-step: compute the new mu               
            mu = np.array([np.mean(X[cluster_indices == k], axis=0) for k in range(K)])
            
            # Halting condition: if J doesn't change anymore
            if len(J_values)>1 and J_values[-1] == J_values[-2]:
                break
                
    if returnClusterIndices:
        return mu, J_values, cluster_indices
    else:
        return mu, J_values
In [98]:
K_values = [2, 2]
mus_values = [np.array([[-2, -2], [2, 2]]), np.array([[1, 1.5], [2, 2]])]

ranges = {}
data = {}
trace = {}
mus = {}
layout = {}

for K, mu_init in zip(K_values, mus_values):
    X_1 = faith_stand[0]
    X_2 = faith_stand[1]
    
    ranges_x = [int(np.min(X_1)-1), int(np.max(X_1)+1)]
    ranges_y = [int(np.min(X_2)-2), int(np.max(X_2)+2)]
    
    # K-Means
    mu, J_values, cluster_indices = K_means(faith_stand.T, 2, mu=mu_init, returnClusterIndices=True)
    
    trace = go.Scatter(
        x = X_1,
        y = X_2,
        mode = 'markers',
        marker = dict(
            size = 10,
            color = cluster_indices,
            showscale = False,
            line = dict(
                width = 2,
                color = 'rgb(0, 0, 0)'
            )
        ),
        showlegend = False
    )
    
    # Displaying mu
    mus = go.Scatter(
        x = mu[:, 0],
        y = mu[:, 1],
        mode = 'markers', 
        marker = dict(
            symbol = "star-diamond",
            size = 15,
            color = 'yellow',
            line = dict(
                width = 2,
                color = 'black'
            )
        ),
        name = 'Means',
        showlegend= True
    )
    
    # Displaying the J-values
    trace_J = go.Scatter(
        x = list(range(1, len(J_values)+1)),
        y = J_values,
        mode = 'lines',
        name = 'Distortion'
    )

    layout_J = go.Layout(
        title= 'Evolution of J (logarithmic scale)',
        hovermode= 'closest',
        xaxis= dict(
            title= 'Iteration',
            ticklen= 5,
            zeroline= False,
            gridwidth= 2,
        ),
        yaxis=dict(
            title= 'J-value',
            type='log',
            ticklen= 5,
            gridwidth= 2,
        ),
        showlegend= True
    )

    plotly.offline.iplot(go.Figure(data=[trace_J], layout=layout_J))

    plotly.offline.iplot(go.Figure(data=[trace, mus], layout=layout_init))

2. Expectation Maximization for Gaussian mixtures