import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.impute import SimpleImputer
from scipy.stats import gaussian_kde

# Use seaborn's "whitegrid" style for every plot drawn afterwards.
sns.set(style="whitegrid")
# Load the raw dataset; cells containing '?' are read as missing values (NaN).
df = pd.read_csv('speeddating.csv', na_values = '?')
# Bare expression: shows the frame when run as a notebook cell.
df


# (rows, columns) of the raw frame.
df.shape

(8378, 123)


# Number of missing values in each column of the raw frame.
df.isna().sum()

has_null                0
wave                    0
gender                  0
age                    95
age_o                 104
                     ... 
d_guess_prob_liked      0
met                   375
decision                0
decision_o              0
match                   0
Length: 123, dtype: int64


# Rows that have a missing value in at least one column, and how many
# such rows there are.
null_data = df.loc[df.isna().any(axis='columns')]
null_data.shape

(7330, 123)


# Discard the 'expected_num_interested_in_me' column.
df = df.drop(columns='expected_num_interested_in_me')


# Side-by-side look at a raw score and its 'd_'-prefixed counterpart; in the
# displayed rows the 'd_' column holds a bracketed range such as [6-8].
dem = df[['funny_o','d_funny_o']]
dem


# Remove the 'd_'-prefixed companion columns listed here ('d_age' is not in
# the list and is kept). Misspelt names are the dataset's own column labels.
df = df.drop(
    columns=[
        # age / race / religion
        'd_d_age',
        'd_importance_same_race',
        'd_importance_same_religion',
        # pref_o_*
        'd_pref_o_attractive',
        'd_pref_o_sincere',
        'd_pref_o_intelligence',
        'd_pref_o_funny',
        'd_pref_o_ambitious',
        'd_pref_o_shared_interests',
        # *_o
        'd_attractive_o',
        'd_sinsere_o',
        'd_intelligence_o',
        'd_funny_o',
        'd_ambitous_o',
        'd_shared_interests_o',
        # *_important
        'd_attractive_important',
        'd_sincere_important',
        'd_intellicence_important',
        'd_funny_important',
        'd_ambtition_important',
        'd_shared_interests_important',
        # self ratings
        'd_attractive',
        'd_sincere',
        'd_intelligence',
        'd_funny',
        'd_ambition',
        # *_partner
        'd_attractive_partner',
        'd_sincere_partner',
        'd_intelligence_partner',
        'd_funny_partner',
        'd_ambition_partner',
        'd_shared_interests_partner',
        # remaining columns
        'd_interests_correlate',
        'd_expected_happy_with_sd_people',
        'd_expected_num_interested_in_me',
        'd_expected_num_matches',
        'd_like',
        'd_guess_prob_liked',
    ]
)


# Remove the 17 hobby/interest columns together with their 'd_'-prefixed
# counterparts.
_interest_cols = [
    'sports', 'tvsports', 'exercise', 'dining', 'museums', 'art',
    'hiking', 'gaming', 'clubbing', 'reading', 'tv', 'theater',
    'movies', 'concerts', 'music', 'shopping', 'yoga',
]
df = df.drop(columns=_interest_cols + ['d_' + name for name in _interest_cols])


# Remove three more columns: 'field', 'has_null' and 'wave'.
df = df.drop(columns=['field', 'has_null', 'wave'])


# (rows, columns) after the column drops.
df.shape

(8378, 47)


# Labels of the columns still stored with the 'object' dtype.
cat = df.select_dtypes(include='object').columns
cat

Index(['gender', 'race', 'race_o'], dtype='object')


# Rows where 'race', 'race_o' or 'gender' is missing.
# The mask is reduced to one boolean per row with .any(axis=1): indexing the
# frame with the raw 2-D boolean array would select a row once per missing
# cell, so a row missing several of these fields would appear several times.
no_race = df[df[['race', 'race_o', 'gender']].isnull().any(axis=1)]
no_race.shape

(136, 47)


# Keep only the rows where both 'race' and 'race_o' are present, then count
# the missing values left in each column.
df_not_na = df[df['race'].notna() & df['race_o'].notna()]
df_not_na.isna().sum()

gender                              0
age                                31
age_o                              31
d_age                               0
race                                0
race_o                              0
samerace                            0
importance_same_race               16
importance_same_religion           16
pref_o_attractive                  16
pref_o_sincere                     16
pref_o_intelligence                16
pref_o_funny                       25
pref_o_ambitious                   34
pref_o_shared_interests            56
attractive_o                      188
sinsere_o                         260
intelligence_o                    280
funny_o                           334
ambitous_o                        693
shared_interests_o               1049
attractive_important               16
sincere_important                  16
intellicence_important             16
funny_important                    25
ambtition_important                34
shared_interests_important         56
attractive                         42
sincere                            42
intelligence                       42
funny                              42
ambition                           42
attractive_partner                188
sincere_partner                   260
intelligence_partner              280
funny_partner                     334
ambition_partner                  693
shared_interests_partner         1049
interests_correlate                32
expected_happy_with_sd_people      38
expected_num_matches             1159
like                              225
guess_prob_liked                  292
met                               357
decision                            0
decision_o                          0
match                               0
dtype: int64


# Print the distinct values of each categorical column.
for column in ("gender", "race", "race_o"):
    print(f"modalités de {column}=", df_not_na[column].unique())

modalités de gender= ['female' 'male']
modalités de race= ["'Asian/Pacific Islander/Asian-American'" 'European/Caucasian-American'
 'Other' "'Latino/Hispanic American'" "'Black/African American'"]
modalités de race_o= ['European/Caucasian-American' "'Asian/Pacific Islander/Asian-American'"
 "'Latino/Hispanic American'" 'Other' "'Black/African American'"]


from sklearn.preprocessing import LabelEncoder, OrdinalEncoder


features_cat = ['gender', 'race', 'race_o']

# df_not_na was produced by filtering df; take an explicit copy so the column
# assignments below write to an independent frame instead of triggering
# pandas' SettingWithCopyWarning.
df_not_na = df_not_na.copy()

# Replace each categorical column by integer codes. A separate encoder is
# fitted per column, so the codes of 'race' and 'race_o' only line up because
# both columns contain the same set of labels.
# NOTE(review): a single encoder shared by 'race' and 'race_o' would make that
# alignment explicit.
for i in features_cat:
    le = LabelEncoder()
    df_not_na[i] = le.fit_transform(df_not_na[i].astype(str))


# First rows of the frame after label encoding.
df_not_na.head()


from sklearn.model_selection import train_test_split

# Separate the target column 'match' from the predictors.
# NOTE(review): X still contains missing values here (imputation happens
# further down) and keeps 'decision' / 'decision_o'; in the displayed rows
# 'match' is 1 exactly when both are 1 -- confirm before modelling on X.
X = df_not_na.copy()
y = X.pop('match')

# Hold out one third of the rows for testing. random_state pins the shuffle
# so the same split is obtained on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1/3, random_state=42
)


from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import KNNImputer
from sklearn.model_selection import GridSearchCV

# Missing values per column just before imputation.
df_not_na.isna().sum()

gender                              0
age                                31
age_o                              31
d_age                               0
race                                0
race_o                              0
samerace                            0
importance_same_race               16
importance_same_religion           16
pref_o_attractive                  16
pref_o_sincere                     16
pref_o_intelligence                16
pref_o_funny                       25
pref_o_ambitious                   34
pref_o_shared_interests            56
attractive_o                      188
sinsere_o                         260
intelligence_o                    280
funny_o                           334
ambitous_o                        693
shared_interests_o               1049
attractive_important               16
sincere_important                  16
intellicence_important             16
funny_important                    25
ambtition_important                34
shared_interests_important         56
attractive                         42
sincere                            42
intelligence                       42
funny                              42
ambition                           42
attractive_partner                188
sincere_partner                   260
intelligence_partner              280
funny_partner                     334
ambition_partner                  693
shared_interests_partner         1049
interests_correlate                32
expected_happy_with_sd_people      38
expected_num_matches             1159
like                              225
guess_prob_liked                  292
met                               357
decision                            0
decision_o                          0
match                               0
dtype: int64


# Fill the remaining missing values with a k-nearest-neighbours imputer (k = 5).
knn = KNNImputer(n_neighbors = 5)

# The imputer is fitted on, and applied to, every column of df_not_na at once,
# so the label-encoded categoricals and the outcome columns ('decision',
# 'decision_o', 'match') are part of the data it sees.
# NOTE(review): confirm that letting the target take part in imputation is
# acceptable for the modelling done afterwards.
# From this line on, df_not_na is rebound to the imputer's output rather than
# the filtered frame.
df_not_na = knn.fit_transform(df_not_na)

# Re-wrap the imputed values with df's column labels. The original row labels
# are not carried over: the final displayed table is numbered 0..8241, and
# imputed cells can be non-integer (e.g. guess_prob_liked 5.2 in row 2).
df_final = pd.DataFrame(df_not_na, columns = df.columns)


# Check that no missing value is left after imputation.
df_final.isna().sum()

gender                           0
age                              0
age_o                            0
d_age                            0
race                             0
race_o                           0
samerace                         0
importance_same_race             0
importance_same_religion         0
pref_o_attractive                0
pref_o_sincere                   0
pref_o_intelligence              0
pref_o_funny                     0
pref_o_ambitious                 0
pref_o_shared_interests          0
attractive_o                     0
sinsere_o                        0
intelligence_o                   0
funny_o                          0
ambitous_o                       0
shared_interests_o               0
attractive_important             0
sincere_important                0
intellicence_important           0
funny_important                  0
ambtition_important              0
shared_interests_important       0
attractive                       0
sincere                          0
intelligence                     0
funny                            0
ambition                         0
attractive_partner               0
sincere_partner                  0
intelligence_partner             0
funny_partner                    0
ambition_partner                 0
shared_interests_partner         0
interests_correlate              0
expected_happy_with_sd_people    0
expected_num_matches             0
like                             0
guess_prob_liked                 0
met                              0
decision                         0
decision_o                       0
match                            0
dtype: int64


# One row of panels per variable: left = frame before imputation (df),
# right = imputed frame (df_final). All panels share the y axis.
fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(32, 16), sharey=True)

_panels = [
    ("shared_interests_o", "olive"),
    ("shared_interests_partner", "teal"),
    ("expected_num_matches", "blue"),
]
for _row, (_column, _colour) in enumerate(_panels):
    for _frame, _ax in zip((df, df_final), axs[_row]):
        sns.histplot(data=_frame, x=_column, kde=True, color=_colour, ax=_ax)

fig.tight_layout()


# Percentage histogram of 'match', drawn separately for met == 0 (left) and
# met == 1 (right). df is the frame before imputation at this point.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))

sns.histplot(data=df.loc[df['met'].eq(0)], x='match', stat='percent', ax=axs[0])
sns.histplot(data=df.loc[df['met'].eq(1)], x='match', stat='percent', ax=axs[1])

<AxesSubplot: xlabel='match', ylabel='Percent'>


# Continue with the imputed data; the copy keeps df_final untouched by the
# edits made to df afterwards.
df = df_final.copy()


# Give the misspelt / inconsistently named columns uniform names so that the
# '<trait>_o', 'pref_o_<trait>' and '<trait>_important' families line up.
df = df.rename(
    columns={
        'sinsere_o': 'sincere_o',
        'ambitous_o': 'ambition_o',
        'pref_o_ambitious': 'pref_o_ambition',
        'intellicence_important': 'intelligence_important',
        'ambtition_important': 'ambition_important',
    }
)


# Recompute 'd_age' as a signed age gap: age - age_o when gender == 1,
# age_o - age otherwise (gender 1 is 'male' in the displayed encoding).
# Done as one vectorised column assignment: writing through
# df.iloc[i]['d_age'] is chained assignment, which only reaches df when the
# row happens to be returned as a view, and is silently lost otherwise.
df['d_age'] = np.where(
    df['gender'] == 1,
    df['age'] - df['age_o'],
    df['age_o'] - df['age'],
)


# Two participants whose race code is 4 ('Other' in the displayed encoding)
# are not treated as sharing a race: force 'samerace' to 0 for those rows.
# A single .loc assignment is used because df.iloc[i]['samerace'] = 0 is
# chained assignment and may write to a temporary copy instead of df.
df.loc[(df['race'] == 4) & (df['race_o'] == 4), 'samerace'] = 0


# 'note_partner': importance-weighted mean of the five '<trait>_partner'
# ratings (attractive, sincere, intelligence, funny, ambition), each weighted
# by the matching '<trait>_important' column. The shared_interests pair is
# not part of the score.
# NOTE(review): a row whose five weights sum to 0 yields NaN/inf -- confirm
# that cannot occur.
_partner_weighted_sum = (
    df['attractive_partner'] * df['attractive_important']
    + df['sincere_partner'] * df['sincere_important']
    + df['intelligence_partner'] * df['intelligence_important']
    + df['funny_partner'] * df['funny_important']
    + df['ambition_partner'] * df['ambition_important']
)
_partner_weight_total = (
    df['attractive_important']
    + df['sincere_important']
    + df['intelligence_important']
    + df['funny_important']
    + df['ambition_important']
)
df['note_partner'] = _partner_weighted_sum / _partner_weight_total


# 'note_from_partner': the same weighted mean seen from the other side, i.e.
# the five '<trait>_o' ratings weighted by the matching 'pref_o_<trait>'
# column. The shared_interests pair is not part of the score.
# NOTE(review): a row whose five weights sum to 0 yields NaN/inf -- confirm
# that cannot occur.
_from_partner_weighted_sum = (
    df['attractive_o'] * df['pref_o_attractive']
    + df['sincere_o'] * df['pref_o_sincere']
    + df['intelligence_o'] * df['pref_o_intelligence']
    + df['funny_o'] * df['pref_o_funny']
    + df['ambition_o'] * df['pref_o_ambition']
)
_from_partner_weight_total = (
    df['pref_o_attractive']
    + df['pref_o_sincere']
    + df['pref_o_intelligence']
    + df['pref_o_funny']
    + df['pref_o_ambition']
)
df['note_from_partner'] = _from_partner_weighted_sum / _from_partner_weight_total


# Round both scores to one decimal place.
for _score in ('note_partner', 'note_from_partner'):
    df[_score] = df[_score].round(1)


# 'note_diff' = note_partner**2 + note_from_partner**2.
# NOTE(review): despite its name this is a sum of squares, not a difference:
# it grows when either score grows and does not measure how far apart the two
# scores are. Confirm this is the intended feature.
df['note_diff'] = df['note_partner']*df['note_partner']+ df['note_from_partner']*df['note_from_partner']
# Bare expression: shows the frame when run as a notebook cell.
df


# Min-max scale 'note_diff' so that its values span [0, 1].
_diff_min = df['note_diff'].min()
_diff_max = df['note_diff'].max()
df['note_diff'] = (df['note_diff'] - _diff_min) / (_diff_max - _diff_min)


# Histogram (with KDE) of 'note_from_partner', one set of side-by-side bars
# per value of 'match'. The title is set on the current pyplot axes, which
# histplot then draws into.
plt.figure(figsize = (16,8))
plt.title("Répartition des modalités de 'match' en fonction de 'note_from_partner'")
sns.histplot(data = df, x = 'note_from_partner', kde = True, hue = 'match',  multiple = 'dodge', shrink = .8)

<AxesSubplot: title={'center': "Répartition des modalités de 'match' en fonction de 'note_from_partner'"}, xlabel='note_from_partner', ylabel='Count'>


# Histogram (with KDE) of 'note_partner', one set of side-by-side bars per
# value of 'match'.
plt.figure(figsize = (16,8))
plt.title("Répartition des modalités de 'match' en fonction de 'note_partner'")
sns.histplot(data = df, x = 'note_partner', kde = True, hue = 'match',  multiple = 'dodge', shrink = .8)

<AxesSubplot: title={'center': "Répartition des modalités de 'match' en fonction de 'note_partner'"}, xlabel='note_partner', ylabel='Count'>


# Histogram (with KDE) of the scaled 'note_diff', one set of side-by-side bars
# per value of 'match'.
plt.figure(figsize = (16,8))
plt.title("Répartition des modalités de 'match' en fonction de 'note_diff'")
sns.histplot(data = df, x = 'note_diff', kde = True, hue='match',  multiple = 'dodge', shrink = .8)

<AxesSubplot: title={'center': "Répartition des modalités de 'match' en fonction de 'note_diff'"}, xlabel='note_diff', ylabel='Count'>


# Distribution of 'note_partner' restricted to rows with gender == 1
# ('male' in the displayed encoding).
# NOTE(review): no 'hue' is passed, so multiple='dodge' presumably has no
# effect here -- confirm.
plt.figure(figsize = (16,8))
plt.title("Historique de répartition des notes générales données par les hommes aux femmes")
sns.histplot(data = df[df['gender']==1], x = 'note_partner', kde = True, multiple = 'dodge', shrink = .8)

<AxesSubplot: title={'center': 'Historique de répartition des notes générales données par les hommes aux femmes'}, xlabel='note_partner', ylabel='Count'>


# Distribution of 'note_partner' restricted to rows with gender == 0
# ('female' in the displayed encoding).
# NOTE(review): no 'hue' is passed, so multiple='dodge' presumably has no
# effect here -- confirm.
plt.figure(figsize = (16,8))
plt.title("Historique de répartition des notes générales données par les femmes aux hommes")
sns.histplot(data = df[df['gender']==0], x = 'note_partner', kde = True, multiple = 'dodge', shrink = .8)

<AxesSubplot: title={'center': 'Historique de répartition des notes générales données par les femmes aux hommes'}, xlabel='note_partner', ylabel='Count'>


import matplotlib.pyplot as plt
# Pie chart of the share of rows with match == 1 versus match == 0.
plt.figure(figsize=(16, 8))
_n_match = int((df['match'] == 1).sum())
_n_no_match = int((df['match'] == 0).sum())
plt.pie([_n_match, _n_no_match], labels=["match", "pas de match"], normalize=True)
plt.legend()

<matplotlib.legend.Legend at 0x1af2161e500>


# Write the cleaned frame to 'data_clean.csv' using to_csv's default options.
# NOTE(review): when a path is given, to_csv is documented to return None, so
# data_to_csv presumably holds nothing useful. The defaults also write the row
# index as an extra unnamed first column -- confirm whether the follow-up
# notebook expects that column before changing it.
data_to_csv = df.to_csv('data_clean.csv')

	has_null	wave	gender	age	age_o	d_age	d_d_age	race	race_o	samerace	...	d_expected_num_interested_in_me	d_expected_num_matches	like	guess_prob_liked	d_like	d_guess_prob_liked	met	decision	decision_o	match
0	0	1	female	21.0	27.0	6	[4-6]	'Asian/Pacific Islander/Asian-American'	European/Caucasian-American	0	...	[0-3]	[3-5]	7.0	6.0	[6-8]	[5-6]	0.0	1	0	0
1	0	1	female	21.0	22.0	1	[0-1]	'Asian/Pacific Islander/Asian-American'	European/Caucasian-American	0	...	[0-3]	[3-5]	7.0	5.0	[6-8]	[5-6]	1.0	1	0	0
2	1	1	female	21.0	22.0	1	[0-1]	'Asian/Pacific Islander/Asian-American'	'Asian/Pacific Islander/Asian-American'	1	...	[0-3]	[3-5]	7.0	NaN	[6-8]	[0-4]	1.0	1	1	1
3	0	1	female	21.0	23.0	2	[2-3]	'Asian/Pacific Islander/Asian-American'	European/Caucasian-American	0	...	[0-3]	[3-5]	7.0	6.0	[6-8]	[5-6]	0.0	1	1	1
4	0	1	female	21.0	24.0	3	[2-3]	'Asian/Pacific Islander/Asian-American'	'Latino/Hispanic American'	0	...	[0-3]	[3-5]	6.0	6.0	[6-8]	[5-6]	0.0	1	1	1
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
8373	1	21	male	25.0	26.0	1	[0-1]	European/Caucasian-American	'Latino/Hispanic American'	0	...	[0-3]	[3-5]	2.0	5.0	[0-5]	[5-6]	0.0	0	1	0
8374	1	21	male	25.0	24.0	1	[0-1]	European/Caucasian-American	Other	0	...	[0-3]	[3-5]	4.0	4.0	[0-5]	[0-4]	0.0	0	0	0
8375	1	21	male	25.0	29.0	4	[4-6]	European/Caucasian-American	'Latino/Hispanic American'	0	...	[0-3]	[3-5]	6.0	5.0	[6-8]	[5-6]	0.0	0	0	0
8376	1	21	male	25.0	22.0	3	[2-3]	European/Caucasian-American	'Asian/Pacific Islander/Asian-American'	0	...	[0-3]	[3-5]	5.0	5.0	[0-5]	[5-6]	0.0	0	1	0
8377	1	21	male	25.0	22.0	3	[2-3]	European/Caucasian-American	'Asian/Pacific Islander/Asian-American'	0	...	[0-3]	[3-5]	4.0	5.0	[0-5]	[5-6]	0.0	0	1	0

	funny_o	d_funny_o
0	8.0	[6-8]
1	7.0	[6-8]
2	10.0	[9-10]
3	8.0	[6-8]
4	6.0	[6-8]
...	...	...
8373	2.0	[0-5]
8374	3.0	[0-5]
8375	2.0	[0-5]
8376	5.0	[0-5]
8377	7.0	[6-8]

	age	age_o	d_age	race_o	samerace	importance_same_race	importance_same_religion	pref_o_attractive	...	shared_interests_partner	interests_correlate	expected_happy_with_sd_people	expected_num_matches	like	guess_prob_liked	met	decision	decision_o	match
0	21.0	27.0	6	3	0	2.0	4.0	35.0	...	5.0	0.14	3.0	4.0	7.0	6.0	0.0	1	0	0
1	21.0	22.0	1	3	0	2.0	4.0	60.0	...	6.0	0.54	3.0	4.0	7.0	5.0	1.0	1	0	0
2	21.0	22.0	1	0	1	2.0	4.0	19.0	...	7.0	0.16	3.0	4.0	7.0	NaN	1.0	1	1	1
3	21.0	23.0	2	3	0	2.0	4.0	30.0	...	8.0	0.61	3.0	4.0	7.0	6.0	0.0	1	1	1
4	21.0	24.0	3	2	0	2.0	4.0	30.0	...	6.0	0.21	3.0	4.0	6.0	6.0	0.0	1	1	1

	gender	age	age_o	d_age	race	race_o	samerace	importance_same_race	importance_same_religion	pref_o_attractive	...	expected_num_matches	like	guess_prob_liked	met	decision	decision_o	match	note_partner	note_from_partner	note_diff
0	0.0	21.0	27.0	6.0	0.0	3.0	0.0	2.0	4.0	35.0	...	4.0	7.0	6.0	0.0	1.0	0.0	0.0	7.1	7.3	103.70
1	0.0	21.0	22.0	1.0	0.0	3.0	0.0	2.0	4.0	60.0	...	4.0	7.0	5.0	1.0	1.0	0.0	0.0	7.1	7.0	99.41
2	0.0	21.0	22.0	1.0	0.0	0.0	1.0	2.0	4.0	19.0	...	4.0	7.0	5.2	1.0	1.0	1.0	1.0	7.2	10.0	151.84
3	0.0	21.0	23.0	2.0	0.0	3.0	0.0	2.0	4.0	30.0	...	4.0	7.0	6.0	0.0	1.0	1.0	1.0	6.8	7.9	108.65
4	0.0	21.0	24.0	3.0	0.0	2.0	0.0	2.0	4.0	30.0	...	4.0	6.0	6.0	0.0	1.0	1.0	1.0	6.2	8.0	102.44
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
8237	1.0	25.0	26.0	-1.0	3.0	2.0	0.0	1.0	1.0	10.0	...	3.0	2.0	5.0	0.0	0.0	1.0	0.0	3.6	4.2	30.60
8238	1.0	25.0	24.0	1.0	3.0	4.0	0.0	1.0	1.0	50.0	...	3.0	4.0	4.0	0.0	0.0	0.0	0.0	4.6	5.4	50.32
8239	1.0	25.0	29.0	-4.0	3.0	2.0	0.0	1.0	1.0	40.0	...	3.0	6.0	5.0	0.0	0.0	0.0	0.0	5.2	1.9	30.65
8240	1.0	25.0	22.0	3.0	3.0	0.0	0.0	1.0	1.0	10.0	...	3.0	5.0	5.0	0.0	0.0	1.0	0.0	4.2	5.4	46.80
8241	1.0	25.0	22.0	3.0	3.0	0.0	0.0	1.0	1.0	20.0	...	3.0	4.0	5.0	0.0	0.0	1.0	0.0	3.6	7.6	70.72

Importation du dataset :¶

Suppression des variables inutiles :¶

Encodage des variables catégoriques en numériques :¶

Imputation des valeurs manquantes à l'aide d'un KNN Imputer :¶

Visualisation de la répartition des modalités de certaines variables :¶

Répartition des modalités pour les variables 'shared_interests_o', 'shared_interests_partner' et 'expected_num_matches' avant et après imputation des valeurs manquantes :¶

Corrélation entre 'met' et 'match' :¶

Modification du nom de certaines variables :¶

Modification de certaines variables :¶

Pour la variable 'd_age' :¶

Pour la variable 'samerace' :¶

Création de nouvelles variables :¶

Création de la variable 'note_partner' qui représente l'estime que 'self' a envers 'partner' :¶

Création de la variable 'note_from_partner' qui est l'estime que 'partner' a en 'self' :¶

On arrondit ces deux nouvelles variables au dixième près :¶

Création de la variable 'note_diff' qui est en quelque sorte l'écart entre l'estime qu'a self pour partner et l'estime qu'a partner pour self :¶

Un peu de visualisation :¶

Sur les variables 'note_partner' et 'note_from_partner' :¶

Sur la variable 'note_diff' :¶

Comparaison de la notation entre les hommes et les femmes :¶

Répartition des modalités de la variable 'match' :¶

On sauvegarde le dataset nettoyé sous format .csv ! On s'attaquera à la partie Machine Learning dans un second fichier.¶