import pandas as pd
import numpy as np
import seaborn as sns

# Load the cleaned dataset and preview its first rows.
df = pd.read_csv('data_clean.csv')
df.head()


# Silence warning messages, unless warning options were passed to the interpreter.
import sys
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")


# Remove the saved index column and the two per-person decision columns.
# NOTE(review): presumably the decisions are dropped because they determine
# `match` directly (target leakage) — confirm.
columns_to_remove = ['Unnamed: 0', 'decision', 'decision_o']
df = df.drop(columns=columns_to_remove)


# Features are every remaining column except the target `match`.
y = df['match']
X = df.drop(columns='match')


import sklearn
from sklearn.model_selection import train_test_split

# Stratified 80/20 split of the feature matrix and the target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=100, stratify=df['match']
)


# Same split parameters applied to the full frame (target column included),
# so that positive rows can be selected and duplicated below.
df_train, df_test = train_test_split(
    df, test_size=0.2, shuffle=True, random_state=100, stratify=df['match']
)


# Oversample the positive class: append a second copy of every match == 1
# training row to the training frame.
df_train_dp = df_train.loc[df_train['match'] == 1]
df_double_train = pd.concat([df_train, df_train_dp])


# Features / target of the oversampled training set.
X_double_train = df_double_train.drop(columns='match')
y_double_train = df_double_train['match'].copy()


# Test features / target rebuilt from the full-frame split.
X_test = df_test.drop(columns='match')
y_test = df_test['match'].copy()


# The "doubled" test set is an unmodified copy of the test set:
# only the training data is oversampled.
X_double_test, y_double_test = X_test.copy(), y_test.copy()


# Class distribution (in percent) of the oversampled training set.
df_double_train = df_double_train.reset_index(drop=True)
sns.histplot(data=df_double_train, x='match', hue='match', stat='percent')

<AxesSubplot: xlabel='match', ylabel='Percent'>


from sklearn.tree import DecisionTreeClassifier

# Tree trained on the original training split; report its test accuracy.
model_tree = DecisionTreeClassifier(random_state=100, max_depth=6)
model_tree.fit(X_train, y_train)
score_tree = model_tree.score(X_test, y_test)
print('Score sur le test-set :', score_tree)

print("=========================================================")

# Tree with the same hyper-parameters, trained on the oversampled training set.
model_double_tree = DecisionTreeClassifier(random_state=100, max_depth=6)
model_double_tree.fit(X_double_train, y_double_train)
score_double_tree = model_double_tree.score(X_double_test, y_double_test)
print('Score sur le test-set doublé :', score_double_tree)

Score sur le test-set : 0.8526379624014554
=========================================================
Score sur le test-set doublé : 0.8229229836264402


from sklearn.metrics import classification_report, confusion_matrix

# --- Tree trained on the original training split ---
y_pred_tree = model_tree.predict(X_test)

print("Matrice de confusion du modèle :\n",confusion_matrix(y_test,y_pred_tree))

# classification_report expects (y_true, y_pred). Passing them the other way
# round swaps precision with recall and reports the predicted class counts
# as "support" instead of the true ones.
print(classification_report(y_test, y_pred_tree, zero_division=0))

print('================Doublée=================')

# --- Tree trained on the oversampled training set ---
y_double_pred_tree = model_double_tree.predict(X_double_test)

print("Matrice de confusion du modèle :\n",confusion_matrix(y_double_test,y_double_pred_tree))

# Same argument order as above: ground truth first, predictions second.
print(classification_report(y_double_test, y_double_pred_tree, zero_division=0))

Matrice de confusion du modèle :
 [[1300   79]
 [ 164  106]]
              precision    recall  f1-score   support

         0.0       0.94      0.89      0.91      1464
         1.0       0.39      0.57      0.47       185

    accuracy                           0.85      1649
   macro avg       0.67      0.73      0.69      1649
weighted avg       0.88      0.85      0.86      1649

================Doublée=================
Matrice de confusion du modèle :
 [[1221  158]
 [ 134  136]]
              precision    recall  f1-score   support

         0.0       0.89      0.90      0.89      1355
         1.0       0.50      0.46      0.48       294

    accuracy                           0.82      1649
   macro avg       0.69      0.68      0.69      1649
weighted avg       0.82      0.82      0.82      1649


from sklearn.ensemble import RandomForestClassifier

# Forest trained on the original training split.
model_random_forest = RandomForestClassifier(
    n_estimators=100, max_depth=6, random_state=100
)
model_random_forest.fit(X_train, y_train)

# Forest with identical hyper-parameters, trained on the oversampled training set.
model_random_forest_double = RandomForestClassifier(
    n_estimators=100, max_depth=6, random_state=100
)
model_random_forest_double.fit(X_double_train, y_double_train)

RandomForestClassifier(max_depth=6, random_state=100)

RandomForestClassifier(max_depth=6, random_state=100)


from sklearn.metrics import classification_report, confusion_matrix

# --- Forest trained on the original training split ---
y_pred_random_forest = model_random_forest.predict(X_test)
cm_random_forest = confusion_matrix(y_test, y_pred_random_forest)
print("Matrice de confusion du modèle :\n", cm_random_forest)
print(classification_report(y_test, y_pred_random_forest, zero_division=0))


print('================Doublée=================')

# --- Forest trained on the oversampled training set ---
y_double_pred_random_forest = model_random_forest_double.predict(X_double_test)
cm_random_forest_double = confusion_matrix(y_double_test, y_double_pred_random_forest)
print("Matrice de confusion du modèle :\n", cm_random_forest_double)
print(classification_report(y_double_test, y_double_pred_random_forest, zero_division=0))

Matrice de confusion du modèle :
 [[1369   10]
 [ 216   54]]
              precision    recall  f1-score   support

         0.0       0.86      0.99      0.92      1379
         1.0       0.84      0.20      0.32       270

    accuracy                           0.86      1649
   macro avg       0.85      0.60      0.62      1649
weighted avg       0.86      0.86      0.83      1649

================Doublée=================
Matrice de confusion du modèle :
 [[1296   83]
 [ 149  121]]
              precision    recall  f1-score   support

         0.0       0.90      0.94      0.92      1379
         1.0       0.59      0.45      0.51       270

    accuracy                           0.86      1649
   macro avg       0.75      0.69      0.71      1649
weighted avg       0.85      0.86      0.85      1649


# Disabled exploratory check, kept as a bare string literal so it is never executed.
# The code inside it prints every pair of `df` columns whose absolute correlation
# exceeds `seuil`.
"""seuil = 0.5
for i in range(len(df.columns)) : 
    for j in range(i):
        if abs(df.corr()[df.columns[i]][j]) > seuil :
            print('attention', df.columns[i],'est trop correlée avec',df.corr()[df.columns[i]].index[j])"""

# Here is the result of that run:
"""attention intelligence_o est trop correlée avec sincere_o
attention funny_o est trop correlée avec attractive_o
attention ambition_o est trop correlée avec intelligence_o
attention shared_interests_o est trop correlée avec funny_o
attention intelligence_partner est trop correlée avec sincere_partner
attention funny_partner est trop correlée avec attractive_partner
attention ambition_partner est trop correlée avec intelligence_partner
attention shared_interests_partner est trop correlée avec funny_partner
attention like est trop correlée avec attractive_partner
attention like est trop correlée avec sincere_partner
attention like est trop correlée avec intelligence_partner
attention like est trop correlée avec funny_partner
attention like est trop correlée avec shared_interests_partner
attention note_partner est trop correlée avec attractive_partner
attention note_partner est trop correlée avec sincere_partner
attention note_partner est trop correlée avec intelligence_partner
attention note_partner est trop correlée avec funny_partner
attention note_partner est trop correlée avec ambition_partner
attention note_partner est trop correlée avec shared_interests_partner
attention note_partner est trop correlée avec like
attention note_from_partner est trop correlée avec attractive_o
attention note_from_partner est trop correlée avec sincere_o
attention note_from_partner est trop correlée avec intelligence_o
attention note_from_partner est trop correlée avec funny_o
attention note_from_partner est trop correlée avec ambition_o
attention note_from_partner est trop correlée avec shared_interests_o
attention note_diff est trop correlée avec attractive_o
attention note_diff est trop correlée avec sincere_o
attention note_diff est trop correlée avec intelligence_o
attention note_diff est trop correlée avec funny_o
attention note_diff est trop correlée avec attractive_partner
attention note_diff est trop correlée avec sincere_partner
attention note_diff est trop correlée avec intelligence_partner
attention note_diff est trop correlée avec funny_partner
attention note_diff est trop correlée avec like
attention note_diff est trop correlée avec note_partner
attention note_diff est trop correlée avec note_from_partner"""

'attention intelligence_o est trop correlée avec sincere_o\nattention funny_o est trop correlée avec attractive_o\nattention ambition_o est trop correlée avec intelligence_o\nattention shared_interests_o est trop correlée avec funny_o\nattention intelligence_partner est trop correlée avec sincere_partner\nattention funny_partner est trop correlée avec attractive_partner\nattention ambition_partner est trop correlée avec intelligence_partner\nattention shared_interests_partner est trop correlée avec funny_partner\nattention like est trop correlée avec attractive_partner\nattention like est trop correlée avec sincere_partner\nattention like est trop correlée avec intelligence_partner\nattention like est trop correlée avec funny_partner\nattention like est trop correlée avec shared_interests_partner\nattention note_partner est trop correlée avec attractive_partner\nattention note_partner est trop correlée avec sincere_partner\nattention note_partner est trop correlée avec intelligence_partner\nattention note_partner est trop correlée avec funny_partner\nattention note_partner est trop correlée avec ambition_partner\nattention note_partner est trop correlée avec shared_interests_partner\nattention note_partner est trop correlée avec like\nattention note_from_partner est trop correlée avec attractive_o\nattention note_from_partner est trop correlée avec sincere_o\nattention note_from_partner est trop correlée avec intelligence_o\nattention note_from_partner est trop correlée avec funny_o\nattention note_from_partner est trop correlée avec ambition_o\nattention note_from_partner est trop correlée avec shared_interests_o\nattention note_diff est trop correlée avec attractive_o\nattention note_diff est trop correlée avec sincere_o\nattention note_diff est trop correlée avec intelligence_o\nattention note_diff est trop correlée avec funny_o\nattention note_diff est trop correlée avec attractive_partner\nattention note_diff est trop correlée avec sincere_partner\nattention 
note_diff est trop correlée avec intelligence_partner\nattention note_diff est trop correlée avec funny_partner\nattention note_diff est trop correlée avec like\nattention note_diff est trop correlée avec note_partner\nattention note_diff est trop correlée avec note_from_partner'


# Columns removed before fitting the logistic regression.
# NOTE(review): presumably chosen from the correlation scan above so that no
# remaining pair exceeds the 0.5 threshold — confirm.
dropped_for_logistic = [
    'sincere_o', 'attractive_o', 'intelligence_o', 'ambition_o', 'funny_o',
    'sincere_partner', 'attractive_partner', 'intelligence_partner',
    'funny_partner', 'shared_interests_partner',
    'attractive', 'sincere', 'intelligence', 'funny', 'ambition',
    'note_diff', 'ambition_partner', 'shared_interests_o', 'like',
]
df_logic = df.drop(columns=dropped_for_logistic)


# Print every pair of `df_logic` columns whose absolute correlation exceeds `seuil`.
seuil = 0.5

# Compute the correlation matrix once: the original recomputed it twice for
# every column pair, which is quadratic in the number of columns.
corr_logic = df_logic.corr()

for i, column in enumerate(df_logic.columns):
    column_corr = corr_logic[column]
    # Only look at columns that come before `column`, so each pair is reported once.
    for j in range(i):
        # Explicit positional access: plain `series[j]` on a label-indexed Series
        # relies on a deprecated integer fallback.
        if abs(column_corr.iloc[j]) > seuil:
            print('attention', column, 'est trop correlée avec', column_corr.index[j])


# Separate the target from the reduced feature set.
X_logic = df_logic.drop(columns='match')
y_logic = df_logic['match'].copy()

# Stratified 80/20 split, same seed as the other models.
X_logic_train, X_logic_test, y_logic_train, y_logic_test = train_test_split(
    X_logic,
    y_logic,
    test_size=0.2,
    shuffle=True,
    random_state=100,
    stratify=df_logic['match'],
)


from sklearn.linear_model import LogisticRegression

# Logistic regression on all remaining features.
model_reg = LogisticRegression(random_state=100)
model_reg.fit(X_logic_train, y_logic_train)

LogisticRegression(random_state=100)

LogisticRegression(random_state=100)


from sklearn.metrics import classification_report, confusion_matrix

# Evaluate the logistic regression on the held-out split.
y_pred_reg = model_reg.predict(X_logic_test)

print("Matrice de confusion du modèle :\n",confusion_matrix(y_logic_test,y_pred_reg))

# classification_report expects (y_true, y_pred). With the arguments swapped,
# precision and recall are exchanged and "support" counts predictions
# instead of true labels.
print(classification_report(y_logic_test, y_pred_reg, zero_division=0))

Matrice de confusion du modèle :
 [[1340   39]
 [ 215   55]]
              precision    recall  f1-score   support

         0.0       0.97      0.86      0.91      1555
         1.0       0.20      0.59      0.30        94

    accuracy                           0.85      1649
   macro avg       0.59      0.72      0.61      1649
weighted avg       0.93      0.85      0.88      1649


from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression

# Forward sequential selection driven by a logistic regression,
# scored with 10-fold cross-validation on the training split.
forward_estimator = LogisticRegression()
sfs = SequentialFeatureSelector(forward_estimator, cv=10, direction='forward')
sfs.fit(X_logic_train, y_logic_train)

SequentialFeatureSelector(cv=10, estimator=LogisticRegression())

SequentialFeatureSelector(cv=10, estimator=LogisticRegression())

LogisticRegression()

LogisticRegression()


# Names of the features retained by the fitted selector `sfs`.
sfs.get_feature_names_out()
# The bare string below appears to be a pasted copy of that result from an
# earlier run; it is a plain expression and has no effect on the program.
"""array(['gender', 'age', 'd_age', 'race_o', 'pref_o_sincere',
       'pref_o_intelligence', 'pref_o_ambition', 'attractive_important',
       'funny_important', 'expected_num_matches', 'guess_prob_liked',
       'note_yourself', 'note_partner', 'note_from_partner'], dtype=object)"""

"array(['gender', 'age', 'd_age', 'race_o', 'pref_o_sincere',\n       'pref_o_intelligence', 'pref_o_ambition', 'attractive_important',\n       'funny_important', 'expected_num_matches', 'guess_prob_liked',\n       'note_yourself', 'note_partner', 'note_from_partner'], dtype=object)"


# Keep only the columns chosen by the fitted selector.
# The DataFrames are boolean-masked instead of going through sfs.transform():
# transform() returns bare NumPy arrays, which discards the column names and
# makes any selector fitted afterwards report placeholders ('x0', 'x3', ...)
# instead of real feature names. The selected values are the same either way.
selected_mask = sfs.get_support()
X_logic_train = X_logic_train.loc[:, selected_mask]
X_logic_test = X_logic_test.loc[:, selected_mask]


# Refit the logistic regression on the selected features only.
model_reg = LogisticRegression(random_state=100)
model_reg.fit(X_logic_train, y_logic_train)

LogisticRegression(random_state=100)

LogisticRegression(random_state=100)


from sklearn.metrics import classification_report, confusion_matrix

# Evaluate the logistic regression refitted on the selected features.
y_pred_reg = model_reg.predict(X_logic_test)

print("Matrice de confusion du modèle :\n",confusion_matrix(y_logic_test,y_pred_reg))

# Ground truth first, predictions second: the reverse order swaps precision
# with recall and reports predicted counts as "support".
print(classification_report(y_logic_test, y_pred_reg, zero_division=0))

Matrice de confusion du modèle :
 [[1348   31]
 [ 213   57]]
              precision    recall  f1-score   support

         0.0       0.98      0.86      0.92      1561
         1.0       0.21      0.65      0.32        88

    accuracy                           0.85      1649
   macro avg       0.59      0.76      0.62      1649
weighted avg       0.94      0.85      0.89      1649


# Backward sequential selection (10-fold CV) applied to the current,
# already reduced training features.
backward_estimator = LogisticRegression()
sfs = SequentialFeatureSelector(backward_estimator, cv=10, direction='backward')
sfs.fit(X_logic_train, y_logic_train)

SequentialFeatureSelector(cv=10, direction='backward',
                          estimator=LogisticRegression())

SequentialFeatureSelector(cv=10, direction='backward',
                          estimator=LogisticRegression())

LogisticRegression()

LogisticRegression()


# Names of the features retained by the most recently fitted selector `sfs`.
sfs.get_feature_names_out()

array(['x0', 'x3', 'x7', 'x9', 'x10', 'x12', 'x13'], dtype=object)


# Reduce both splits to the features kept by the current selector.
X_logic_train, X_logic_test = (
    sfs.transform(X_logic_train),
    sfs.transform(X_logic_test),
)


# Refit the logistic regression on the reduced feature set.
model_reg = LogisticRegression(random_state=100)
model_reg.fit(X_logic_train, y_logic_train)

LogisticRegression(random_state=100)

LogisticRegression(random_state=100)


from sklearn.metrics import classification_report, confusion_matrix

# Evaluate the logistic regression refitted after the second selection pass.
y_pred_reg = model_reg.predict(X_logic_test)

print("Matrice de confusion du modèle :\n",confusion_matrix(y_logic_test,y_pred_reg))

# Ground truth first, predictions second: the reverse order swaps precision
# with recall and reports predicted counts as "support".
print(classification_report(y_logic_test, y_pred_reg, zero_division=0))

Matrice de confusion du modèle :
 [[1344   35]
 [ 213   57]]
              precision    recall  f1-score   support

         0.0       0.97      0.86      0.92      1557
         1.0       0.21      0.62      0.31        92

    accuracy                           0.85      1649
   macro avg       0.59      0.74      0.62      1649
weighted avg       0.93      0.85      0.88      1649


import xgboost as xgb

# Train on the original (non-oversampled) training split.
X_train_xgb, y_train_xgb = X_train.copy(), y_train.copy()

# Halve the held-out data: one half stays the test set, the other half is
# the validation set monitored during training.
X_test_xgb, X_val_xgb, y_test_xgb, y_val_xgb = train_test_split(
    X_test, y_test, test_size=0.5, random_state=22
)


# Wrap each split in XGBoost's DMatrix container.
D_train = xgb.DMatrix(X_train_xgb, label=y_train_xgb)
D_val = xgb.DMatrix(X_val_xgb, label=y_val_xgb)
D_test = xgb.DMatrix(X_test_xgb, label=y_test_xgb)


# Positive-class weight = (# match == 0 rows) / (# match == 1 rows), counted on the whole `df`.
n_negative = len(df.loc[df['match'] == 0])
n_positive = len(df.loc[df['match'] == 1])
params = {
    'eta': 0.1,
    'gamma': 0.2,
    'max_depth': 10,
    'objective': 'binary:logistic',
    'scale_pos_weight': n_negative / n_positive,
}

# Maximum number of boosting rounds.
steps = 50


# Train while logging both sets; early stopping uses a patience of 10 rounds.
boost = xgb.train(
    params,
    D_train,
    num_boost_round=steps,
    evals=[(D_train, 'train'), (D_val, 'eval')],
    early_stopping_rounds=10,
)

[0]	train-logloss:0.63197	eval-logloss:0.64675
[1]	train-logloss:0.58022	eval-logloss:0.60731
[2]	train-logloss:0.53597	eval-logloss:0.57395
[3]	train-logloss:0.49754	eval-logloss:0.54719
[4]	train-logloss:0.46212	eval-logloss:0.52336
[5]	train-logloss:0.43140	eval-logloss:0.50333
[6]	train-logloss:0.40516	eval-logloss:0.48517
[7]	train-logloss:0.37958	eval-logloss:0.46677
[8]	train-logloss:0.35842	eval-logloss:0.45287
[9]	train-logloss:0.33724	eval-logloss:0.43906
[10]	train-logloss:0.31682	eval-logloss:0.42724
[11]	train-logloss:0.30126	eval-logloss:0.41584
[12]	train-logloss:0.28678	eval-logloss:0.40720
[13]	train-logloss:0.27382	eval-logloss:0.39929
[14]	train-logloss:0.25905	eval-logloss:0.39098
[15]	train-logloss:0.24505	eval-logloss:0.38441
[16]	train-logloss:0.23255	eval-logloss:0.37778
[17]	train-logloss:0.22224	eval-logloss:0.37311
[18]	train-logloss:0.21074	eval-logloss:0.36696
[19]	train-logloss:0.20091	eval-logloss:0.36282
[20]	train-logloss:0.19199	eval-logloss:0.35723
[21]	train-logloss:0.18486	eval-logloss:0.35300
[22]	train-logloss:0.17794	eval-logloss:0.34913
[23]	train-logloss:0.16980	eval-logloss:0.34452
[24]	train-logloss:0.16490	eval-logloss:0.34072
[25]	train-logloss:0.15802	eval-logloss:0.33637
[26]	train-logloss:0.15168	eval-logloss:0.33335
[27]	train-logloss:0.14430	eval-logloss:0.33100
[28]	train-logloss:0.13795	eval-logloss:0.32954
[29]	train-logloss:0.13205	eval-logloss:0.32717
[30]	train-logloss:0.12793	eval-logloss:0.32499
[31]	train-logloss:0.12353	eval-logloss:0.32334
[32]	train-logloss:0.11989	eval-logloss:0.32209
[33]	train-logloss:0.11563	eval-logloss:0.32062
[34]	train-logloss:0.11142	eval-logloss:0.31755
[35]	train-logloss:0.10822	eval-logloss:0.31576
[36]	train-logloss:0.10451	eval-logloss:0.31497
[37]	train-logloss:0.10102	eval-logloss:0.31380
[38]	train-logloss:0.09784	eval-logloss:0.31262
[39]	train-logloss:0.09480	eval-logloss:0.31242
[40]	train-logloss:0.09236	eval-logloss:0.31028
[41]	train-logloss:0.08967	eval-logloss:0.31004
[42]	train-logloss:0.08711	eval-logloss:0.30735
[43]	train-logloss:0.08412	eval-logloss:0.30681
[44]	train-logloss:0.08278	eval-logloss:0.30659
[45]	train-logloss:0.08119	eval-logloss:0.30615
[46]	train-logloss:0.07849	eval-logloss:0.30585
[47]	train-logloss:0.07597	eval-logloss:0.30523
[48]	train-logloss:0.07329	eval-logloss:0.30374
[49]	train-logloss:0.07202	eval-logloss:0.30367


# Turn the booster's scores into hard labels with a 0.5 cut-off: scores strictly
# above 0.5 become 1, everything else 0. The array keeps its original float dtype.
scores_xgb = boost.predict(D_test)
y_pred = (scores_xgb > 0.5).astype(scores_xgb.dtype)


from sklearn.metrics import classification_report
print(classification_report(y_test_xgb, y_pred, zero_division=0))

              precision    recall  f1-score   support

         0.0       0.90      0.91      0.91       689
         1.0       0.51      0.46      0.48       135

    accuracy                           0.84       824
   macro avg       0.70      0.69      0.69       824
weighted avg       0.83      0.84      0.84       824


# Same pipeline as before, this time trained on the oversampled training set.
X_train_xgb, y_train_xgb = X_double_train.copy(), y_double_train.copy()

# Halve the held-out data into a test half and a validation half.
X_test_xgb, X_val_xgb, y_test_xgb, y_val_xgb = train_test_split(
    X_double_test, y_double_test, test_size=0.5, random_state=22
)

D_train = xgb.DMatrix(X_train_xgb, label=y_train_xgb)
D_val = xgb.DMatrix(X_val_xgb, label=y_val_xgb)
D_test = xgb.DMatrix(X_test_xgb, label=y_test_xgb)

# Two-class softmax objective, monitored with the classification error rate.
# No scale_pos_weight here: the training data is already oversampled.
params = dict(
    eta=0.1,
    gamma=0.2,
    max_depth=10,
    objective='multi:softmax',
    num_class=2,
    eval_metric='merror',
)

# Maximum number of boosting rounds.
steps = 50

boost_double = xgb.train(
    params,
    D_train,
    num_boost_round=steps,
    evals=[(D_train, 'train'), (D_val, 'eval')],
    early_stopping_rounds=10,
)

[0]	train-merror:0.07857	eval-merror:0.19758
[1]	train-merror:0.06150	eval-merror:0.17091
[2]	train-merror:0.05472	eval-merror:0.16727
[3]	train-merror:0.05134	eval-merror:0.16242
[4]	train-merror:0.04326	eval-merror:0.15758
[5]	train-merror:0.04039	eval-merror:0.14424
[6]	train-merror:0.03792	eval-merror:0.14182
[7]	train-merror:0.03270	eval-merror:0.13576
[8]	train-merror:0.02893	eval-merror:0.13091
[9]	train-merror:0.02684	eval-merror:0.12970
[10]	train-merror:0.02150	eval-merror:0.13455
[11]	train-merror:0.01681	eval-merror:0.13576
[12]	train-merror:0.01459	eval-merror:0.13697
[13]	train-merror:0.01303	eval-merror:0.13455
[14]	train-merror:0.01068	eval-merror:0.13333
[15]	train-merror:0.00977	eval-merror:0.13212
[16]	train-merror:0.00847	eval-merror:0.13212
[17]	train-merror:0.00808	eval-merror:0.13091
[18]	train-merror:0.00651	eval-merror:0.13333
[19]	train-merror:0.00573	eval-merror:0.12970


# Predictions are passed to the report exactly as returned by the booster;
# no thresholding step is applied for this model.
y_pred = boost_double.predict(D_test)

report_double = classification_report(y_test_xgb, y_pred, zero_division=0)
print(report_double)

              precision    recall  f1-score   support

         0.0       0.89      0.93      0.91       689
         1.0       0.55      0.41      0.47       135

    accuracy                           0.85       824
   macro avg       0.72      0.67      0.69       824
weighted avg       0.83      0.85      0.84       824

	Unnamed: 0	age	age_o	d_age	race_o	samerace	importance_same_race	importance_same_religion	...	expected_num_matches	like	guess_prob_liked	met	decision	decision_o	match	note_partner	note_from_partner	note_diff
0	0	21.0	27.0	6.0	3.0	0.0	2.0	4.0	...	4.0	7.0	6.0	0.0	1.0	0.0	0.0	7.1	7.3	0.503366
1	1	21.0	22.0	1.0	3.0	0.0	2.0	4.0	...	4.0	7.0	5.0	1.0	1.0	0.0	0.0	7.1	7.0	0.482132
2	2	21.0	22.0	1.0	0.0	1.0	2.0	4.0	...	4.0	7.0	5.2	1.0	1.0	1.0	1.0	7.2	10.0	0.741635
3	3	21.0	23.0	2.0	3.0	0.0	2.0	4.0	...	4.0	7.0	6.0	0.0	1.0	1.0	1.0	6.8	7.9	0.527866
4	4	21.0	24.0	3.0	2.0	0.0	2.0	4.0	...	4.0	6.0	6.0	0.0	1.0	1.0	1.0	6.2	8.0	0.497129

On importe le dataset nettoyé¶

Suppression de variables :¶

Construction de X et y :¶

Utilisation de la fonction train_test_split :¶

Création de nouveaux ensembles d'apprentissage et de test, en doublant les match=1 sur le train_set :¶

On construit un arbre de décision :¶

On construit une forêt aléatoire :¶

On construit un modèle de régression logistique :¶

On applique l'algo de Forward Stepwise Selection :¶

On applique l'algo de Backward Stepwise Selection :¶

On construit un modèle XGBoost :¶