Final presentation: Visualizing high-dimensional data and Meta-SNE

Younesse Kaddar

In [2]:
# /!\ I'm using Python 3 !

import numpy as np

import matplotlib.pyplot as plt 
%matplotlib inline 
# to display directly in the notebook

plt.rcParams['image.cmap'] = 'gray' 
# by default, the grayscale images are displayed with the jet colormap: use grayscale instead


import plotly.plotly as py
import plotly.graph_objs as go
from plotly import tools
import plotly.offline

plotly.offline.init_notebook_mode(connected=True)

from IPython.core.display import display, HTML, Markdown
# Poll every 250 ms until plotly.js is available on the page, then configure
# MathJax (SVG renderer, centred display) and stop polling. Waiting for Plotly
# first avoids a race between the two scripts.
_mathjax_setup_script = ''.join([
    '<script>',
    'var waitForPlotly = setInterval( function() {',
    'if( typeof(window.Plotly) !== "undefined" ){',
    'MathJax.Hub.Config({ SVG: { font: "STIX-Web" }, displayAlign: "center" });',
    'MathJax.Hub.Queue(["setRenderer", MathJax.Hub, "SVG"]);',
    'clearInterval(waitForPlotly);',
    '}}, 250 );',
    '</script>',
])
display(HTML(_mathjax_setup_script))

import sklearn
from sklearn import manifold, datasets, decomposition
from sklearn.utils import check_random_state

from scipy.spatial.distance import squareform, pdist


def from_matplotlib(colormap, nb_points):
    """Sample a matplotlib-style colormap into a Plotly colorscale.

    Parameters
    ----------
    colormap : callable
        Maps a float in [0, 1] to a sequence whose first three items are the
        red, green and blue components as floats in [0, 1]
        (e.g. ``plt.cm.jet``). Any further items (alpha) are ignored.
    nb_points : int
        Number of evenly spaced samples taken between 0 and 1 (inclusive).

    Returns
    -------
    list
        ``[[position, 'rgb(r, g, b)'], ...]`` with ``position`` in [0, 1] and
        ``r``, ``g``, ``b`` integers in 0..255. Empty when ``nb_points`` is 0.
    """
    colorscale = []

    # With a single point the only position is 0.0; guard the denominator so
    # that case does not divide by zero.
    last_index = max(nb_points - 1, 1)

    for k in range(nb_points):
        position = k / last_index
        # Scale to 0..255 *before* converting to int: converting first would
        # truncate every channel to 0 or 1.
        red, green, blue = (int(round(255 * float(c))) for c in colormap(position)[:3])
        colorscale.append([position, 'rgb({}, {}, {})'.format(red, green, blue)])

    return colorscale

# Default colorscale for the plots below: 4 stops sampled from matplotlib's jet colormap.
colormap = from_matplotlib(plt.cm.jet, 4)

Comparison on a well-known data set

In [3]:
# Number of data points sampled on the swiss roll. This is the single source of
# truth for the sample count (1500 is the value the generator was called with).
N = 1500

# Data points: X holds one 3-D point per row; `colors` is the value returned
# alongside the points by make_swiss_roll (per the scikit-learn docs, the
# position of each sample along the main dimension of the manifold).
# `datasets.make_swiss_roll` is the public import path; the
# `datasets.samples_generator` submodule is not available in recent releases.
X, colors = datasets.make_swiss_roll(n_samples=N)

# Individual coordinate columns, used by the 3-D scatter plot.
x, y, z = X[:, 0], X[:,1], X[:,2]
In [4]:
# 3-D scatter of the raw swiss roll. Points are coloured with `colors` -- the
# same per-point values the 2-D embedding plots use by default -- so a point
# keeps its colour from this plot to every embedding.
raw_data = go.Scatter3d(x = x,
                  y = y,
                  z = z,
                  mode = 'markers',
                  marker = dict(
                      color = colors,
                      colorscale = colormap,
                      line = dict(color='black', width=1),
                      showscale = False),
                 )

# Shared figure layout: tight margins only.
layout = dict(
    margin = dict(
        l=15,
        r=15,
        t=20,
        b=15)
    )

fig = go.Figure(data=[raw_data], layout=layout)
plotly.offline.iplot(fig)

# Accumulators filled by the embedding helpers defined below.
data_traces = []
titles = []

# (n_samples, 3) array of the swiss-roll points, default input of the helpers.
data = np.array([x, y, z]).T

MDS

In [5]:
def MDS(data=data, data_traces=data_traces, colors=colors, colormap=colormap, ticktext=[],
        layout=layout):
    """Embed ``data`` in 2-D with ``manifold.MDS`` and display a scatter plot.

    Parameters
    ----------
    data : array-like
        Samples passed to ``manifold.MDS(...).fit_transform``.
    data_traces : list or None
        When not None, the scatter trace is appended to this list.
    colors : array-like
        Per-point values mapped to marker colours through ``colormap``.
    colormap : list
        Plotly colorscale.
    ticktext : sequence of str
        Colorbar tick labels, placed at 0, 1, ..., len(ticktext) - 1. The
        colorbar is shown only when this is non-empty.
    layout : dict
        Plotly layout of the figure. Bound at definition time like the other
        defaults, so rebinding the notebook-level ``layout`` afterwards does
        not leak another figure's title/axes into this plot.
    """
    ticktext = np.asarray(ticktext)  # array form provides .size for the colorbar switch

    mds = manifold.MDS(n_components=2, max_iter=100, n_init=1)
    # Transposed so that row 0 holds the x coordinates and row 1 the y coordinates.
    data_MDS = mds.fit_transform(data).T

    trace = go.Scatter(
        x = data_MDS[0],
        y = data_MDS[1],
        mode='markers',
        marker=dict(color = colors,
                    colorscale = colormap,
                    size= 10,
                    showscale=bool(ticktext.size),
                    colorbar = dict(
                        tickmode = 'array',
                        tickvals = list(range(len(ticktext))),
                        ticktext = ticktext,
                        ticks = 'outside'
                    ),
                    line = dict(color='black', width=2)
                   ),
    )

    if data_traces is not None:
        data_traces.append(trace)

    plotly.offline.iplot(go.Figure(data=[trace], layout=layout))

MDS()

PCA

In [6]:
def PCA(data=data, data_traces=data_traces, colors=colors, colormap=colormap, ticktext=[],
        layout=layout):
    """Project ``data`` on its first two principal components and display a scatter plot.

    Parameters
    ----------
    data : array-like
        Samples passed to ``decomposition.PCA(...).fit_transform``.
    data_traces : list or None
        When not None, the scatter trace is appended to this list.
    colors : array-like
        Per-point values mapped to marker colours through ``colormap``.
    colormap : list
        Plotly colorscale.
    ticktext : sequence of str
        Colorbar tick labels, placed at 0, 1, ..., len(ticktext) - 1. The
        colorbar is shown only when this is non-empty.
    layout : dict
        Plotly layout of the figure. Bound at definition time like the other
        defaults, so rebinding the notebook-level ``layout`` afterwards does
        not leak another figure's title/axes into this plot.
    """
    ticktext = np.asarray(ticktext)  # array form provides .size for the colorbar switch

    # decomposition.PCA rather than TruncatedSVD: the latter skips mean-centering,
    # so on uncentered data its components are not the principal components.
    data_PCA = decomposition.PCA(n_components=2).fit_transform(data).T

    trace = go.Scatter(
        x = data_PCA[0],
        y = data_PCA[1],
        mode='markers',
        marker = dict(
            color = colors,
            colorscale = colormap,
            size= 10,
            showscale = bool(ticktext.size),
            colorbar = dict(
                tickmode = 'array',
                tickvals = list(range(len(ticktext))),
                ticktext = ticktext,
                ticks = 'outside'
            ),
            line=dict(color='black', width=2)
        )
    )

    if data_traces is not None:
        data_traces.append(trace)

    plotly.offline.iplot(go.Figure(data=[trace], layout=layout))

PCA()

Isomap

In [7]:
def Isomap(data=data, data_traces=data_traces, colors=colors, colormap=colormap, ticktext=[],
           layout=layout):
    """Embed ``data`` in 2-D with ``manifold.Isomap`` (10 neighbours) and display a scatter plot.

    Parameters
    ----------
    data : array-like
        Samples passed to ``manifold.Isomap(...).fit_transform``.
    data_traces : list or None
        When not None, the scatter trace is appended to this list.
    colors : array-like
        Per-point values mapped to marker colours through ``colormap``.
    colormap : list
        Plotly colorscale.
    ticktext : sequence of str
        Colorbar tick labels, placed at 0, 1, ..., len(ticktext) - 1. The
        colorbar is shown only when this is non-empty.
    layout : dict
        Plotly layout of the figure. Bound at definition time like the other
        defaults, so rebinding the notebook-level ``layout`` afterwards does
        not leak another figure's title/axes into this plot.
    """
    ticktext = np.asarray(ticktext)  # array form provides .size for the colorbar switch

    nb_neighbors = 10

    # Keyword arguments only: newer scikit-learn releases do not accept
    # n_neighbors positionally.
    isomap = manifold.Isomap(n_neighbors=nb_neighbors, n_components=2)
    data_Isomap = isomap.fit_transform(data).T

    trace = go.Scatter(
        x = data_Isomap[0],
        y = data_Isomap[1],
        mode='markers',
        marker = dict(
            color = colors,
            colorscale = colormap,
            size= 10,
            showscale = bool(ticktext.size),
            colorbar = dict(
                tickmode = 'array',
                tickvals = list(range(len(ticktext))),
                ticktext = ticktext,
                ticks = 'outside'
            ),
            line=dict(color='black', width=2)
        )
    )

    if data_traces is not None:
        data_traces.append(trace)

    plotly.offline.iplot(go.Figure(data=[trace], layout=layout))

Isomap()

LLE

In [8]:
def LLE(data=data, data_traces=data_traces, colors=colors, colormap=colormap, ticktext=[],
        layout=layout):
    """Embed ``data`` in 2-D with standard locally linear embedding (10 neighbours) and plot it.

    Parameters
    ----------
    data : array-like
        Samples passed to ``manifold.LocallyLinearEmbedding(...).fit_transform``.
    data_traces : list or None
        When not None, the scatter trace is appended to this list.
    colors : array-like
        Per-point values mapped to marker colours through ``colormap``.
    colormap : list
        Plotly colorscale.
    ticktext : sequence of str
        Colorbar tick labels, placed at 0, 1, ..., len(ticktext) - 1. The
        colorbar is shown only when this is non-empty.
    layout : dict
        Plotly layout of the figure. Bound at definition time like the other
        defaults, so rebinding the notebook-level ``layout`` afterwards does
        not leak another figure's title/axes into this plot.
    """
    ticktext = np.asarray(ticktext)  # array form provides .size for the colorbar switch

    nb_neighbors = 10

    # Keyword arguments only: newer scikit-learn releases do not accept these
    # constructor parameters positionally.
    lle = manifold.LocallyLinearEmbedding(n_neighbors=nb_neighbors,
                                          n_components=2,
                                          method='standard')
    data_LLE = lle.fit_transform(data).T

    trace = go.Scatter(
        x = data_LLE[0],
        y = data_LLE[1],
        mode='markers',
        marker = dict(
            color = colors,
            colorscale = colormap,
            size= 10,
            line=dict(color='black', width=2),
            showscale=bool(ticktext.size),
            colorbar = dict(
                tickmode = 'array',
                tickvals = list(range(len(ticktext))),
                ticktext = ticktext,
                ticks = 'outside'
            )
        )
    )

    if data_traces is not None:
        data_traces.append(trace)

    plotly.offline.iplot(go.Figure(data=[trace], layout=layout))

LLE()

t-SNE

In [9]:
def tSNE(data=data, data_traces=data_traces, colors=colors, colormap=colormap, ticktext=[],
         layout=layout):
    """Embed ``data`` in 2-D with ``manifold.TSNE`` and display a scatter plot.

    Parameters
    ----------
    data : array-like
        Samples passed to ``manifold.TSNE(...).fit_transform``.
    data_traces : list or None
        When not None, the scatter trace is appended to this list.
    colors : array-like
        Per-point values mapped to marker colours through ``colormap``.
    colormap : list
        Plotly colorscale.
    ticktext : sequence of str
        Colorbar tick labels, placed at 0, 1, ..., len(ticktext) - 1. The
        colorbar is shown only when this is non-empty.
    layout : dict
        Plotly layout of the figure. Bound at definition time like the other
        defaults, so rebinding the notebook-level ``layout`` afterwards does
        not leak another figure's title/axes into this plot.
    """
    ticktext = np.asarray(ticktext)  # array form provides .size for the colorbar switch

    # Named `model` so the local does not shadow this function's own name.
    model = manifold.TSNE(n_components=2)

    # Transposed so that row 0 holds the x coordinates and row 1 the y coordinates.
    data_tSNE = model.fit_transform(data).T

    trace = go.Scatter(
        x = data_tSNE[0],
        y = data_tSNE[1],
        mode='markers',
        marker=dict(
            color = colors,
            colorscale = colormap,
            size = 10,
            showscale = bool(ticktext.size),
            colorbar = dict(
                tickmode = 'array',
                tickvals = list(range(len(ticktext))),
                ticktext = ticktext,
                ticks = 'outside'
            ),
            line = dict(color='black', width=2)
        )
    )

    if data_traces is not None:
        data_traces.append(trace)

    plotly.offline.iplot(go.Figure(data=[trace], layout=layout))

tSNE()

In more detail

In [10]:
import random
import fashion_mnist.utils.mnist_reader as mnist_reader

# Class names; the position in the list is the integer label it describes.
labels = [
    'T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
    'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot',
]

# 10-stop colorscale sampled from matplotlib's Set3 colormap.
colormap2 = from_matplotlib(plt.cm.Set3, 10)

def convert_labels(lab, labels=labels):
    """Return the entry of ``labels`` stored at index ``lab``."""
    name = labels[lab]
    return name

# Training images and their labels, read from the local fashion_mnist data directory.
X_train, Y_train = mnist_reader.load_mnist('fashion_mnist/data/fashion', kind='train')
In [11]:
nbr_img = 1500 # number of images to randomly pick

# Indices of the picked images, drawn without replacement so that no image is
# selected twice.
ind_rand = np.random.choice(len(X_train), size=nbr_img, replace=False)

X, Y = X_train[ind_rand, :], Y_train[ind_rand] # images, labels

# Class name of every picked image.
Y_labels = np.array([convert_labels(lab) for lab in Y.tolist()])

# Histogram of labels: bins of width 1 starting at -0.5, i.e. centred on the
# integer label values.

trace = go.Histogram(
    x=Y.flatten(),
    xbins=dict(
        start=-0.5,
        end=9,
        size=1
    )
)

# Kept under its own name: the embedding helpers use the notebook-level
# `layout`, which must not pick up this histogram's title and axis labels.
layout_hist = go.Layout(
    title='Labels of picked images',
    xaxis=dict(
        title='Label'
    ),
    yaxis=dict(
        title='Number'
    )
)
fig = go.Figure(data=[trace], layout=layout_hist)


plotly.offline.iplot(fig)
In [12]:
display(Markdown("### Display a few of them:"))

# Grid of the first rows*cols picked images, each titled with its class name.
rows, cols = 2, 5

plt.figure(figsize=(15,6))

for i in range(rows * cols):
    ax = plt.subplot(rows, cols, i + 1)
    # Each image is stored as a flat vector; restore the 28x28 picture.
    picture = X[i].reshape((28, 28))
    ax.matshow(picture)
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_title(Y_labels[i])

plt.show()

Display a few of them:

PCA

In [13]:
# N samples, each a flat vector of d values.
N, d = X.shape

# Remove the per-feature mean so the matrix below is taken around the mean sample.
X_centered = X - X.mean(axis=0)

# d x d scatter matrix of the centered data.
C = X_centered.T @ X_centered

# The last two eigenpairs returned by eigh are kept as the leading ones
# (eigh returns eigenvalues in ascending order).
eig, v = np.linalg.eigh(C)

eigenvalues = (eig[-1], eig[-2])
vectors_PCA = (v[:, -1], v[:, -2])
In [14]:
# Show the two leading eigenvectors side by side as 28x28 grayscale images.
plt.figure(figsize=(13,13))

panel_titles = (
    'First eigenvector (associated to spectral radius)',
    'Second eigenvector',
)

for position, (vector, panel_title) in enumerate(zip(vectors_PCA, panel_titles), start=1):
    plt.subplot(1, 2, position)
    plt.imshow(vector.reshape(28, 28), cmap='gist_gray')
    plt.title(panel_title)
    plt.axis('off')

plt.show()
In [15]:
# Projection matrix: one column per selected eigenvector (d x 2).
W = np.vstack(vectors_PCA).T

# Project the *centered* data: the eigenvectors were computed from X_centered,
# so the coordinates are taken relative to the mean sample.
proj = X_centered.dot(W)

# Flattened labels, computed once instead of twice per class.
labels_flat = Y.flatten()

traces_PDA = []

# One scatter trace per class so each class gets its own legend entry.
for i, label_name in enumerate(labels):
    in_class = labels_flat == i
    traces_PDA.append(go.Scatter(
        x = proj[in_class, 0],
        y = proj[in_class, 1],
        mode = 'markers',
        marker = dict(
            size = 7,
            showscale = False,
            line = dict(
                width = 1,
                color = 'rgb(0, 0, 0)'
            )
        ),
        name = '{}'.format(label_name),
        showlegend= True
    ))

layout_PDA = dict(
    title = 'Projection of images on the space spanned by the first two eigenvectors',
    xaxis = dict(
        title = 'Premier eigenvector',
        ticklen = 5,
        zeroline = False,
        gridwidth = 2,
        ),
    yaxis = dict(
        title = 'Second eigenvector',
        ticklen = 5,
        gridwidth = 2,
        ),
    # Horizontal legend placed below the plot area.
    legend = dict(
        orientation = 'h',
        y = -0.2
    )
)


plotly.offline.iplot(go.Figure(data=traces_PDA, layout=layout_PDA))

t-SNE

In [17]:
# t-SNE embedding of the picked images, coloured by label with one colorbar tick per
# class name; data_traces=None keeps this trace out of the shared data_traces list.
tSNE(data=X, data_traces=None, colors=Y, colormap=colormap2, ticktext=labels)
In [18]:
# MDS embedding of the same picked images, with the same colouring and colorbar;
# data_traces=None keeps this trace out of the shared data_traces list.
MDS(data=X, data_traces=None, colors=Y, colormap=colormap2, ticktext=labels)
In [28]:
# Pairwise squared distances between all data points
# (`squared=True` is forwarded to the distance function).
Dist = sklearn.metrics.pairwise.pairwise_distances(X, squared=True)

# Every 10th row/column is shown to keep the image small.
plt.imshow(Dist[::10, ::10], interpolation='none')
plt.title("Pairwise distances matrix")
plt.show()

# Similarity: joint probabilities computed from the distances by scikit-learn's
# private t-SNE helper (arguments: distances, perplexity 30, verbose off).
# squareform expands its condensed result into a square matrix.
# NOTE(review): `_joint_probabilities` is a private API -- confirm it exists at
# this path in the installed scikit-learn version.
Sim = squareform(sklearn.manifold.t_sne._joint_probabilities(Dist, 30., False))

plt.imshow(Sim[::10, ::10], interpolation='none')
# The helper returns joint probabilities, so the plot is titled accordingly.
plt.title("Joint probabilities")
plt.show()