The aim of this exercise is to develop a human-feedback classifier that distinguishes positive feedback (approval) from negative feedback (prohibition). Such a classifier could be used to teach robots and/or to guide a robot's learning.
import urllib.request
import numpy as np
import pandas as pd
from google.colab import files as google_files
import itertools
import matplotlib.pyplot as plt
!pip install ggplot
import ggplot
def list_from_URL(file_URL, function_applied=None):
    """Download a text resource and return its lines as a list.

    Every line is decoded as UTF-8 and stripped of trailing whitespace
    (which includes the line terminator).

    Args:
        file_URL: URL handed to ``urllib.request.urlopen``.
        function_applied: Optional callable applied to each decoded line.
            When given, the returned list holds its results instead of the
            plain strings.

    Returns:
        A list with one entry per line of the resource, in file order.
    """
    # The context manager closes the response even if reading fails.
    with urllib.request.urlopen(file_URL) as response:
        raw_lines = response.readlines()

    lines = [raw.decode("utf-8").rstrip() for raw in raw_lines]
    if function_applied is None:
        return lines
    return [function_applied(line) for line in lines]
def plot_confusion_matrix(cm, classes,
                          normalize=True,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Print a confusion matrix and draw it on the current matplotlib figure.

    Adapted from:
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    Args:
        cm: 2-D NumPy array. Rows are plotted along the "True label" axis
            and columns along the "Predicted label" axis.
        classes: Tick labels, in the same order as the rows/columns of `cm`.
        normalize: When True (the default), each row is divided by its sum
            before printing and plotting. Rows whose sum is zero are left
            at 0 rather than becoming NaN.
        title: Title of the plot.
        cmap: Matplotlib colormap used for the cells.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A class without any true sample gives an all-zero row; a plain
        # division would fill it with NaN and make the colour threshold
        # below NaN as well.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout last so that it accounts for the axis labels too.
    plt.tight_layout()
# # /!\ NO NEED TO EXECUTE THIS CELL AGAIN !!!
#
#
# filenames = list_from_URL('https://raw.githubusercontent.com/youqad/Neurorobotics_Intent-Recognition/master/filenames.txt')
# filenames = list(set(filenames))
#
# files = []
# indices = []
#
# for file in filenames:
#
# URL_f0 = 'https://raw.githubusercontent.com/youqad/Neurorobotics_Intent-Recognition/master/data_files/{}.f0'.format(file)
# file_dicts = [{key:val for key, val in zip(['time', 'f0'], map(float, l.split()))} for l in list_from_URL(URL_f0)]
#
# URL_en = 'https://raw.githubusercontent.com/youqad/Neurorobotics_Intent-Recognition/master/data_files/{}.en'.format(file)
# for l, d in zip(list_from_URL(URL_en), file_dicts):
# d["file"] = file
# d["en"] = float(l.split()[1])
# d["label"] = file[-2:]
#
# files.extend(file_dicts)
#
# # What `files` looks like:
# # files = [
# # {"file": "cy0001at", "time": 0.02, "f0": 0., "en": 0.},
# # {"file": "cy0001at", "time": 1.28, "f0": 0., "en": 0.},
# # ...
# # {"file": "li1450at", "time": 0.02, "f0": 0., "en": 0.},
# # {"file": "li1450at", "time": 1.56, "f0": 404., "en": 65.}
# # ]
#
# pd.DataFrame(files).to_csv('data.csv', encoding='utf-8', index=False) # To reuse it next time
# google_files.download('data.csv')
# Load the training data: one row per frame, indexed by the name of the
# recording ('file') it belongs to.
df = pd.read_csv('https://raw.githubusercontent.com/youqad/Neurorobotics_Intent-Recognition/master/data.csv').set_index('file')

# Two-class subset: every frame whose label is not 'at'.
df1 = df.loc[df['label'] != 'at']
df1.head()
print(df1.columns.values)

#df.groupby('file').mean().head()
#df1.groupby('file').max().head()
#df1.groupby('file').var().head()
#df1.groupby('file').median().head()

# Quartiles per file. The string column 'label' is dropped first because
# quantiles are undefined for it and current pandas raises a TypeError
# instead of silently skipping the column.
df1.drop(columns='label').groupby('file').quantile([.25, .75]).head()
def _value_range(values):
    """Difference between the largest and the smallest value."""
    return max(values) - min(values)


def _lower_quartile(values):
    """25 % quantile of the values."""
    return values.quantile(.25)


def _upper_quartile(values):
    """75 % quantile of the values."""
    return values.quantile(.75)


def _mean_abs_step(values):
    """Mean absolute difference between consecutive values."""
    return abs(values.diff()).mean()


# Aggregations passed to ``groupby(...).agg``: plain strings name built-in
# pandas reductions, ``(name, function)`` pairs give custom reductions and
# the name under which they are reported.
list_features = [
    "mean",
    "max",
    ("range", _value_range),
    "var",
    "median",
    ("1st_quantile", _lower_quartile),
    ("3rd_quantile", _upper_quartile),
    ("mean_absolute_local_derivate", _mean_abs_step),
]
# Per-file statistics over every frame. Several columns are selected with a
# list: indexing a groupby with a bare tuple (``['f0', 'en']``) was
# deprecated in pandas 1.0 and is rejected by pandas 2.
df1.groupby('file')[['f0', 'en']].agg(list_features)

# "Voiced" rows are the ones with a non-zero f0.
voiced = df1.loc[df1['f0'] != 0].groupby('file')[['f0', 'en']].agg(list_features)
#voiced

# Same statistics pooled per label. Stored in its own variable: assigning
# to `voiced.all` would overwrite the DataFrame.all method on that object.
voiced_all = df1.loc[df1['f0'] != 0].groupby('label')[['f0', 'en']].agg(list_features)
voiced_all

# "Unvoiced" rows have f0 == 0, so only the energy column is summarised
# per file.
unvoiced = df1.loc[df1['f0'] == 0].groupby('file')['en'].agg(list_features)
#unvoiced

unvoiced_all = df1.loc[df1['f0'] == 0].groupby('label')[['f0', 'en']].agg(list_features)
unvoiced_all
def train_test(df=df1, train_percentage=.4, seed=1):
    """Build per-file feature matrices and split them into train/test sets.

    Rows with a non-zero `f0` form the "voiced" data set (statistics of both
    `f0` and `en`); rows with `f0 == 0` form the "unvoiced" data set
    (statistics of `en` only). The statistics are those of `list_features`,
    computed per file.

    Args:
        df: Frame-level DataFrame indexed by 'file' with columns 'f0', 'en'
            and 'label'. The default is bound to `df1` when the function is
            defined.
        train_percentage: Fraction of the files placed in the training set
            (rounded down to a whole number of files).
        seed: Seed given to ``np.random.seed`` so the split is reproducible.

    Returns:
        A pair ``(X, Y)`` of nested dicts indexed as
        ``X[kind][subset]`` / ``Y[kind][subset]`` with ``kind`` in
        {'voiced', 'unvoiced'} and ``subset`` in {'all', 'train', 'test'}.
        X holds feature arrays (one row per file), Y the matching labels.
    """
    voiced_frames = df.loc[df['f0'] != 0]
    unvoiced_frames = df.loc[df['f0'] == 0]

    # Features and labels are derived from the same filtered rows and the
    # same grouping key, so row i of X always matches entry i of Y.
    # Several columns are selected with a list; a bare tuple key is
    # rejected by current pandas.
    voiced = voiced_frames.groupby('file')[['f0', 'en']].agg(list_features)
    unvoiced = unvoiced_frames.groupby('file')['en'].agg(list_features)

    X = {'voiced': {'all': np.array(voiced)},
         'unvoiced': {'all': np.array(unvoiced)}}
    Y = {'voiced': {'all': np.array(voiced_frames.groupby('file')['label'].min())},
         'unvoiced': {'all': np.array(unvoiced_frames.groupby('file')['label'].min())}}

    np.random.seed(seed)

    for kind in ('voiced', 'unvoiced'):
        n = len(X[kind]['all'])
        n_train = int(train_percentage * n)

        # A permutation yields distinct indices; drawing with replacement
        # would repeat some of them and shrink the training set.
        train_indices = np.random.permutation(n)[:n_train]
        train_mask = np.zeros(n, dtype=bool)
        train_mask[train_indices] = True

        for data in (X, Y):
            data[kind]['train'] = data[kind]['all'][train_mask]
            data[kind]['test'] = data[kind]['all'][~train_mask]

    return X, Y
# Split with the default arguments of train_test.
X1, Y1 = train_test()

# Column names of the voiced feature matrix: the eight statistics for f0
# followed by the same eight for en.
col = ['mean', 'max', 'range', 'var', 'median',
       '1st_quantile', '3rd_quantile', 'mean_absolute_local_derivate']
col = [signal + '_' + stat for signal in ('f0', 'en') for stat in col]

voi = pd.DataFrame(X1['voiced']['all'], columns=col)
voi = voi.assign(label=Y1['voiced']['all'])

# Scatter plot of f0 variance against f0 mean, coloured by label.
(
    ggplot.ggplot(voi, ggplot.aes(x='f0_mean', y='f0_var', color='label'))
    + ggplot.geom_point()
    + ggplot.scale_color_brewer(type='qual', palette='Set1')
    + ggplot.xlab("Mean")
    + ggplot.ylab("Var")
    + ggplot.ggtitle("Voiced: $f_0$")
)
# Column names of the unvoiced feature matrix (statistics of en only).
col = ['mean', 'max', 'range', 'var', 'median',
       '1st_quantile', '3rd_quantile', 'mean_absolute_local_derivate']

unvoi = pd.DataFrame(X1['unvoiced']['all'], columns=col)
unvoi = unvoi.assign(label=Y1['unvoiced']['all'])

# Scatter plot of the mean absolute local derivative against the variance,
# coloured by label.
(
    ggplot.ggplot(unvoi, ggplot.aes(x='var', y='mean_absolute_local_derivate', color='label'))
    + ggplot.geom_point()
    + ggplot.scale_color_brewer(type='qual', palette='Set1')
    + ggplot.xlab("Variance")
    + ggplot.ylab("Mean absolute of local derivate")
    + ggplot.ggtitle("Unvoiced: $en$")
)
# Scikit Learn's kNN classifier:
# Just to test, but we will implement it ourselves of course!
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
def sklearn_knn(k, X, Y):
    """Print the test accuracy of scikit-learn's k-NN for each data set.

    For 'voiced' and then 'unvoiced', a fresh KNeighborsClassifier with
    `k` neighbours is fitted on the 'train' split and scored on the 'test'
    split.

    Args:
        k: Number of neighbours.
        X: Nested dict of features, indexed as X[kind]['train' | 'test'].
        Y: Nested dict of labels with the same layout as X.
    """
    for frame_kind in ('voiced', 'unvoiced'):
        features, targets = X[frame_kind], Y[frame_kind]

        classifier = KNeighborsClassifier(n_neighbors=k)
        classifier.fit(features['train'], targets['train'])

        predictions = classifier.predict(features['test'])
        score = accuracy_score(targets['test'], predictions)
        print("Accuracy score for {}: {:.2f}".format(frame_kind, score))


sklearn_knn(3, X1, Y1)
# Our own implementation!
from scipy.spatial.distance import cdist
from sklearn.metrics import confusion_matrix
from collections import Counter
def kNN(k, X, Y, labels=("pw", "ap")):
    """Classify test points by majority vote among their k nearest neighbours.

    Distances are Euclidean and are computed between every test point and
    every training point.

    Args:
        k: Number of neighbours, with ``1 <= k <= len(X['train'])``.
        X: Dict with 'train' and 'test' 2-D feature arrays.
        Y: Dict with 'train' and 'test' label arrays. ``Y['train']`` must
            be a NumPy array because it is indexed with an index array.
        labels: Class labels, fixing the row/column order of the result.

    Returns:
        The confusion matrix C, where C[i, j] is the number of observations
        known to be in group i but predicted to be in group j.

    Raises:
        ValueError: If `k` is smaller than 1 or larger than the number of
            training points.
    """
    n_train = len(X['train'])
    if not 1 <= k <= n_train:
        raise ValueError(
            "k must be between 1 and the number of training points "
            "({}), got {}".format(n_train, k))

    # auxiliary function: label prediction (by majority vote)
    # based on the nearest neighbors
    def predicted_label(ind_neighbors):
        label_neighbors = tuple(Y['train'][ind_neighbors])
        return Counter(label_neighbors).most_common(1)[0][0]

    # Pairwise distances between test and train data points
    dist_matrix = cdist(X['test'], X['train'], 'euclidean')

    # Partitioning around position k - 1 puts the k smallest distances of
    # each row in its first k slots and, unlike position k, stays in bounds
    # when k equals the number of training points.
    ind_k_smallest = np.argpartition(dist_matrix, k - 1, axis=1)[:, :k]
    y_predicted = [predicted_label(row) for row in ind_k_smallest]

    # Confusion matrix: C[i, j] is the number of observations
    # known to be in group i but predicted to be in group j
    return confusion_matrix(Y['test'], np.array(y_predicted), labels=list(labels))
# Two-class evaluation of our own kNN on the X1/Y1 split.
# Voiced features, k = 10.
plt.figure()
cm = kNN(10, X1['voiced'], Y1['voiced'])
plot_confusion_matrix(cm, classes=["pw", "ap"],
title='Confusion matrix, with normalization')
plt.show()
# Unvoiced features, k = 3. No plt.figure() call precedes this plot.
# NOTE(review): k differs between the two evaluations (10 vs 3) - confirm
# this is intentional.
cm2 = kNN(3, X1['unvoiced'], Y1['unvoiced'])
plot_confusion_matrix(cm2, classes=["pw", "ap"],
title='Confusion matrix, with normalization')
plt.show()
We consider the following intents: "Approval", "Prohibition" and "Attention".
# Easy-peasy! All the work has been done before: all we have to do now is to use
# the DataFrame `df` instead of `df1`

# Several columns are selected with a list; indexing a groupby with a bare
# tuple (``['f0', 'en']``) is rejected by current pandas.
df.groupby('file')[['f0', 'en']].agg(list_features).head()

X, Y = train_test(df=df)
sklearn_knn(3, X, Y)

# Voiced features, three classes.
plt.figure()
cm = kNN(3, X['voiced'], Y['voiced'], labels=["pw", "ap", "at"])
plot_confusion_matrix(cm, classes=["pw", "ap", "at"],
                      title='Confusion matrix, with normalization')
plt.show()

# Unvoiced features, three classes.
cm2 = kNN(3, X['unvoiced'], Y['unvoiced'], labels=["pw", "ap", "at"])
plot_confusion_matrix(cm2, classes=["pw", "ap", "at"],
                      title='Confusion matrix, with normalization')
plt.show()