import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from plotly.subplots import make_subplots
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import plotly.express as px
import plotly.io as pio
import plotly.offline as pyo
from sklearn.metrics import roc_curve, auc
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode, iplot
from dash import Dash, dcc, html
from base64 import b64encode
import io
import threading


# Load the customer dataset from 'Projet.csv' (path relative to the working directory).
data= pd.read_csv('Projet.csv')
# Notebook display: first rows of the frame.
data.head()


# Print the column names, non-null counts and dtypes of the frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5630 entries, 0 to 5629
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   CustomerID                   5630 non-null   int64  
 1   Churn                        5630 non-null   int64  
 2   Tenure                       5366 non-null   float64
 3   PreferredLoginDevice         5630 non-null   object 
 4   CityTier                     5630 non-null   int64  
 5   WarehouseToHome              5379 non-null   float64
 6   PreferredPaymentMode         5630 non-null   object 
 7   Gender                       5630 non-null   object 
 8   HourSpendOnApp               5375 non-null   float64
 9   NumberOfDeviceRegistered     5630 non-null   int64  
 10  PreferedOrderCat             5630 non-null   object 
 11  SatisfactionScore            5630 non-null   int64  
 12  MaritalStatus                5630 non-null   object 
 13  NumberOfAddress              5630 non-null   int64  
 14  Complain                     5630 non-null   int64  
 15  OrderAmountHikeFromlastYear  5365 non-null   float64
 16  CouponUsed                   5374 non-null   float64
 17  OrderCount                   5372 non-null   float64
 18  DaySinceLastOrder            5323 non-null   float64
 19  CashbackAmount               5630 non-null   int64  
dtypes: float64(7), int64(8), object(5)
memory usage: 879.8+ KB


# Check for duplicated rows: `duplicated()` flags every row that repeats an
# earlier one, so the sum of the boolean mask is the number of duplicates.
duplicate_rows = data.duplicated()
print(f'Nombre de doublons : {duplicate_rows.sum()}')

Nombre de doublons : 0


# Percentage of missing values per column (mean of the null mask, times 100).
pourcentage_manquant = data.isnull().mean() * 100
# Names of the columns that have no missing value at all.
colonnes_sans_donnees_manquantes = pourcentage_manquant[pourcentage_manquant == 0].index.tolist()

# Print the list of variables with 0% missing data, one per line.
print("Variables avec 0% de données manquantes :")
for col in colonnes_sans_donnees_manquantes:
    print(col)

# Plot the missing-data percentage, restricted to the columns that have any.
colonnes_avec_donnees_manquantes = pourcentage_manquant[pourcentage_manquant > 0]
plt.figure(figsize=(10, 6))
# Horizontal bars: percentage on the x axis, column name on the y axis.
sns.barplot(x=colonnes_avec_donnees_manquantes, y=colonnes_avec_donnees_manquantes.index, palette='viridis')
plt.title('Pourcentage de valeurs manquantes par colonne')
plt.xlabel('Pourcentage')
plt.ylabel('Colonnes')
plt.show()

Variables avec 0% de données manquantes :
CustomerID
Churn
PreferredLoginDevice
CityTier
PreferredPaymentMode
Gender
NumberOfDeviceRegistered
PreferedOrderCat
SatisfactionScore
MaritalStatus
NumberOfAddress
Complain
CashbackAmount


# Histograms (with a KDE overlay) of three columns, one subplot per column,
# stacked vertically. Missing values are dropped before plotting.
fig, axes = plt.subplots(3, 1, figsize=(8, 12))

# One entry per subplot: (column, title, x-axis label, index in the viridis palette).
hist_specs = [
    ('Tenure', 'Distribution de la Tenure', 'Tenure', 1),
    ('HourSpendOnApp', 'Distribution des Heures Passées sur l\'App', 'Heures Passées sur l\'App', 4),
    ('DaySinceLastOrder', 'Distribution des Jours Depuis la Dernière Commande', 'Jours Depuis la Dernière Commande', 5),
]

for axis, (column, title, x_label, shade) in zip(axes, hist_specs):
    sns.histplot(
        data[column].dropna(),
        kde=True,
        ax=axis,
        color=sns.color_palette('viridis')[shade],
    )
    axis.set_title(title)
    axis.set_xlabel(x_label)
    axis.set_ylabel('Fréquence')

plt.tight_layout()
plt.show()


# Histograms (with a KDE overlay) of four more columns, one subplot per
# column, stacked vertically. Missing values are dropped before plotting.
fig, axes = plt.subplots(4, 1, figsize=(8, 16))

# One entry per subplot: (column, title, x-axis label, index in the viridis palette).
hist_specs = [
    ('WarehouseToHome', 'Distribution de WarehouseToHome', 'Distance entre entrepôt et domicile', 1),
    ('OrderAmountHikeFromlastYear', 'Distribution de OrderAmountHikeFromlastYear',
     'Pourcentage d\'augmentation du montant de la commande par rapport à l\'année dernière', 2),
    ('CouponUsed', 'Distribution de CouponUsed', 'Nombre de coupons utilisés', 3),
    ('OrderCount', 'Distribution de OrderCount', 'Nombre de commandes', 4),
]

for axis, (column, title, x_label, shade) in zip(axes, hist_specs):
    sns.histplot(
        data[column].dropna(),
        kde=True,
        ax=axis,
        color=sns.color_palette('viridis')[shade],
    )
    axis.set_title(title)
    axis.set_xlabel(x_label)
    axis.set_ylabel('Fréquence')

plt.tight_layout()
plt.show()


# Compute the fill values: the median for five columns, the mean for two.
median_tenure = data['Tenure'].median()
median_day_since_last_order = data['DaySinceLastOrder'].median()
median_warehouse_to_home = data['WarehouseToHome'].median()
median_coupon_used = data['CouponUsed'].median()
median_order_count = data['OrderCount'].median()

mean_hour_spend_on_app = data['HourSpendOnApp'].mean()
mean_order_amount_hike_from_last_year = data['OrderAmountHikeFromlastYear'].mean()

# Apply every imputation with a single DataFrame-level call (column -> fill value).
# Calling `data[col].fillna(value, inplace=True)` instead would modify an
# intermediate Series: pandas 2.x warns about that pattern and, with
# Copy-on-Write enabled, `data` would be left unchanged.
data.fillna(
    value={
        'Tenure': median_tenure,
        'DaySinceLastOrder': median_day_since_last_order,
        'WarehouseToHome': median_warehouse_to_home,
        'CouponUsed': median_coupon_used,
        'OrderCount': median_order_count,
        'HourSpendOnApp': mean_hour_spend_on_app,
        'OrderAmountHikeFromlastYear': mean_order_amount_hike_from_last_year,
    },
    inplace=True,
)

# Notebook display: number of remaining missing values per column.
data.isnull().sum()

CustomerID                     0
Churn                          0
Tenure                         0
PreferredLoginDevice           0
CityTier                       0
WarehouseToHome                0
PreferredPaymentMode           0
Gender                         0
HourSpendOnApp                 0
NumberOfDeviceRegistered       0
PreferedOrderCat               0
SatisfactionScore              0
MaritalStatus                  0
NumberOfAddress                0
Complain                       0
OrderAmountHikeFromlastYear    0
CouponUsed                     0
OrderCount                     0
DaySinceLastOrder              0
CashbackAmount                 0
dtype: int64


def addlabels(x, y):
    """Write each bar's share of the total, as a percentage, inside the bar.

    Labels are drawn with ``plt.text`` at x = the bar's position in ``y`` and
    y = half the bar height (floor division).

    Args:
        x: Unused; kept so existing two-argument calls keep working.
        y: Sequence of bar heights, in bar order.
    """
    # Hoisted out of the loop: the total does not change from bar to bar.
    total = sum(y)
    if not total:
        # No bars, or only empty bars: a percentage of zero is undefined.
        return
    for i, v in enumerate(y):
        plt.text(i, v // 2, f"{v / total * 100:.1f}%", ha='center', va='center', fontsize=12, color='white')

# Columns plotted as categories (count per distinct value).
categorical_cols = ['PreferredLoginDevice', 'CityTier', 'PreferredPaymentMode',
       'Gender', 'NumberOfDeviceRegistered', 'PreferedOrderCat',
       'SatisfactionScore', 'MaritalStatus', 'Complain','NumberOfAddress']

# Two plots per row; round up so an odd number of columns still fits.
num_rows = (len(categorical_cols) + 1) // 2

plt.figure(figsize=(15, 5 * num_rows))

for i, col in enumerate(categorical_cols):
    # plt.subplot positions are 1-based; it also makes `ax` the current axes,
    # which addlabels relies on because it draws through plt.text.
    ax = plt.subplot(num_rows, 2, i + 1)
    sns.countplot(data=data, x=col, palette='viridis', ax=ax)
    ax.set_title(f'Distribution de {col}')
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.tick_params(axis='x', rotation=45)

    # Annotate every bar with its percentage of the column total.
    addlabels(ax.patches, [p.get_height() for p in ax.patches])

    # Hide the y tick labels.
    ax.set_yticklabels([])

    # Raise the upper y limit by 10%.
    ax.set_ylim(0, ax.get_ylim()[1] * 1.1)

plt.tight_layout()
plt.show()


# Numeric columns whose distributions are compared between the two Churn groups.
numerical_cols = [
    'Tenure',
    'WarehouseToHome',
    'HourSpendOnApp',
    'OrderAmountHikeFromlastYear',
    'CouponUsed',
    'OrderCount',
    'DaySinceLastOrder',
    'CashbackAmount',
]

# Independent copies of the rows with Churn == 1 and with Churn == 0.
data_c = data[data['Churn'] == 1].copy()
data_nc = data[data['Churn'] == 0].copy()

# 2 x 4 grid, flattened so each column gets one panel.
fig, ax = plt.subplots(2, 4, figsize=(20, 15))
fig.suptitle('Densité des variables numériques en fonction de Churn', fontsize=20)
ax = ax.flatten()

# The same two palette entries are used on every panel.
viridis = sns.color_palette('viridis')
churn_colour, no_churn_colour = viridis[2], viridis[5]

# One panel per numeric column: filled KDE of each group, overlaid.
for panel, feature in zip(ax, numerical_cols):
    sns.kdeplot(data=data_c[feature], linewidth=3, label='Churn', ax=panel, color=churn_colour, fill=True)
    sns.kdeplot(data=data_nc[feature], linewidth=3, label='No Churn', ax=panel, color=no_churn_colour, fill=True)

    panel.legend(loc='upper right')
    panel.set_title(feature)
    panel.set_xlabel('')
    panel.set_ylabel('Density')

plt.tight_layout()
plt.show()


# Overlaid histograms of every categorical column for the Churn and No Churn
# subsets, two panels per row. The row count is derived from the number of
# columns (rounded up) so the grid keeps fitting if `categorical_cols` changes.
n_rows = (len(categorical_cols) + 1) // 2
fig, ax = plt.subplots(n_rows, 2, figsize=(20, 18))
fig.suptitle('Densité des variables catégorielles en fonction de Churn', fontsize=20)
ax = ax.flatten()

for idx, c in enumerate(categorical_cols):
    sns.histplot(data_c[c], linewidth=3,
                 label='Churn', ax=ax[idx], color=sns.color_palette('viridis')[2], fill=True)
    sns.histplot(data_nc[c], linewidth=3,
                 label='No Churn', ax=ax[idx], color=sns.color_palette('viridis')[5], fill=True)

    ax[idx].legend(loc='upper right')

plt.show()


# Pearson correlation matrix. `data` still contains text (object) columns at
# this point, so the computation is explicitly restricted to numeric columns:
# relying on the implicit default is deprecated and fails on newer pandas.
plt.figure(figsize=(12, 10))
corr_matrix = data.corr(numeric_only=True)
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='viridis')
plt.title('Matrice de Corrélation de Pearson')
plt.show()

# Correlation of each variable with the target 'Churn', strongest positive
# first. Taken from the matrix computed above rather than recomputing it.
pearson_corr = corr_matrix['Churn'].sort_values(ascending=False)
print(pearson_corr)

/var/folders/lk/l0x0m4v150vc1h4g637pjgyw0000gn/T/ipykernel_1198/335059431.py:3: FutureWarning:

The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.

Churn                          1.000000
Complain                       0.250188
NumberOfDeviceRegistered       0.107939
SatisfactionScore              0.105481
CityTier                       0.084703
WarehouseToHome                0.069544
NumberOfAddress                0.043931
HourSpendOnApp                 0.018126
CouponUsed                    -0.001430
OrderAmountHikeFromlastYear   -0.009949
CustomerID                    -0.019083
OrderCount                    -0.024038
CashbackAmount                -0.154167
DaySinceLastOrder             -0.155871
Tenure                        -0.337831
Name: Churn, dtype: float64

/var/folders/lk/l0x0m4v150vc1h4g637pjgyw0000gn/T/ipykernel_1198/335059431.py:9: FutureWarning:

The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.


# One-hot encode the text-valued categorical variables: each listed column is
# replaced by one indicator column per distinct value, named '<column>_<value>'.
categorical_columns = ['PreferredLoginDevice', 'PreferredPaymentMode', 'Gender', 'PreferedOrderCat', 'MaritalStatus']
data_encoded = pd.get_dummies(data, columns=categorical_columns)

# Notebook display: first 20 rows of the encoded frame.
data_encoded.head(20)


# Numeric columns to standardise.
numerical_columns = ['Tenure', 'WarehouseToHome', 'HourSpendOnApp', 'NumberOfDeviceRegistered', 'OrderAmountHikeFromlastYear', 'CouponUsed', 'OrderCount', 'DaySinceLastOrder', 'CashbackAmount']  # adjust this list to match your columns; add every numeric column

# Standardise the selected columns in place.
# NOTE(review): the scaler is fitted on the whole dataset, before the
# train/test split performed further down, so test rows influence the
# scaling statistics — confirm whether this is acceptable.
scaler = StandardScaler()
data_encoded[numerical_columns] = scaler.fit_transform(data_encoded[numerical_columns])


# Notebook display: first rows after scaling.
data_encoded.head()


# Correlation of every column of the encoded frame with the target 'Churn',
# sorted in ascending order.
churn_column_of_corr = data_encoded.corr()['Churn']
correlation_with_churn = churn_column_of_corr.sort_values()

# Horizontal bar chart of those coefficients, one bar per variable.
plt.figure(figsize=(8, 12))
corr_axes = sns.barplot(
    x=correlation_with_churn.values,
    y=correlation_with_churn.index,
    palette='viridis',
)
corr_axes.set_title('Corrélation avec Churn')
corr_axes.set_xlabel('Coefficient de Corrélation')
corr_axes.set_ylabel('Variables')
plt.show()


# Columns to remove because of their weak correlation with the target.
columns_to_drop = [
    'PreferredPaymentMode_COD',
    'CityTier',
    'PreferredPaymentMode_CC',
    'HourSpendOnApp',
]

# Remove them from the encoded frame in place; `columns=` already selects the
# column axis, so no separate axis argument is needed.
data_encoded.drop(columns=columns_to_drop, inplace=True)


# Feature matrix and target for the chi-squared screening.
# CustomerID is a row identifier rather than a customer attribute, so it is
# excluded from the features together with the target itself.
X = data_encoded.drop(columns=['Churn', 'CustomerID'])
y = data_encoded['Churn']


# Rescale every feature to [0, 1] for the chi-squared test, which needs
# non-negative inputs.
# NOTE: this rebinds `scaler`, which previously held the StandardScaler.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Compute the chi-squared scores; chi2 returns (statistics, p-values) and only
# the statistics (index 0) are kept, sorted from highest to lowest.
chi_scores = chi2(X_scaled, y)
chi2_importance = pd.Series(chi_scores[0], index=X.columns).sort_values(ascending=False)

# Show the scores as a horizontal bar chart, one bar per feature.
plt.figure(figsize=(8, 12))
sns.barplot(x=chi2_importance.values, y=chi2_importance.index, palette='viridis')
plt.title('Importance des variables (Test de Chi-carré)')
plt.xlabel('Score Chi-carré')
plt.ylabel('Variables')
plt.show()


# Variables identified as important by the analyses above.
# NOTE(review): this list is not referenced again in the visible part of the file.
important_columns = ['Complain', 'MaritalStatus_Single', 'PreferedOrderCat_Mobile Phone', 'Tenure', 'PreferedOrderCat_Laptop & Accessory']


# Variables to remove, based on the results of the analyses.
columns_to_drop = ['PreferredLoginDevice_Phone', 'PreferredPaymentMode_Cash on Delivery', 'CouponUsed', 
                    'PreferredPaymentMode_Credit Card', 'MaritalStatus_Divorced', 
                   'OrderCount', 'WarehouseToHome', 'PreferredPaymentMode_Debit Card', 'OrderAmountHikeFromlastYear', 
                   'PreferredPaymentMode_UPI', 'PreferredLoginDevice_Mobile Phone', 
                   'PreferredLoginDevice_Computer', 'Gender_Male', 'Gender_Female']

# Build a reduced copy of the encoded frame without those columns
# (`data_encoded` itself is left untouched; a missing name raises KeyError).
data_reduced = data_encoded.drop(columns=columns_to_drop)

# Notebook display: first rows of the reduced frame.
data_reduced.head()


# Model inputs and target.
# CustomerID is a row identifier, not a predictive attribute. It is also the
# only unscaled large-magnitude column left (values around 50000), so it is
# removed from the features together with the target.
X = data_reduced.drop(columns=['Churn', 'CustomerID'])
y = data_reduced['Churn']
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Fit a logistic-regression classifier with its default settings
# (`fit` returns the estimator itself, so it can be bound directly).
logreg = LogisticRegression().fit(X_train, y_train)

# Predict the held-out rows, then print the confusion matrix and the
# per-class precision / recall / F1 report.
y_pred_logreg = logreg.predict(X_test)
print("Régression Logistique:")
print(confusion_matrix(y_test, y_pred_logreg))
print(classification_report(y_test, y_pred_logreg))

Régression Logistique:
[[941   0]
 [185   0]]
              precision    recall  f1-score   support

           0       0.84      1.00      0.91       941
           1       0.00      0.00      0.00       185

    accuracy                           0.84      1126
   macro avg       0.42      0.50      0.46      1126
weighted avg       0.70      0.84      0.76      1126

/Users/khouloud/anaconda3/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning:

Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

/Users/khouloud/anaconda3/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning:

Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

/Users/khouloud/anaconda3/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning:

Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


# Fit a 100-tree random forest with a fixed seed
# (`fit` returns the estimator itself, so it can be bound directly).
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out rows, then print the confusion matrix and the
# per-class precision / recall / F1 report.
y_pred_rf = rf.predict(X_test)
print("Forêt Aléatoire:")
print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

Forêt Aléatoire:
[[922  19]
 [ 55 130]]
              precision    recall  f1-score   support

           0       0.94      0.98      0.96       941
           1       0.87      0.70      0.78       185

    accuracy                           0.93      1126
   macro avg       0.91      0.84      0.87      1126
weighted avg       0.93      0.93      0.93      1126


# Train the XGBoost classifier with log-loss as evaluation metric and a fixed
# seed. The `use_label_encoder` argument is no longer passed: XGBoost
# deprecated it in 1.7.0 and passing it only triggers a UserWarning.
xgb_model = xgb.XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)

# Predict the held-out rows, then print the confusion matrix and the
# per-class precision / recall / F1 report.
y_pred_xgb = xgb_model.predict(X_test)
print("XGBoost:")
print(confusion_matrix(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))

/Users/khouloud/anaconda3/lib/python3.10/site-packages/xgboost/sklearn.py:1395: UserWarning:

`use_label_encoder` is deprecated in 1.7.0.

XGBoost:
[[918  23]
 [ 52 133]]
              precision    recall  f1-score   support

           0       0.95      0.98      0.96       941
           1       0.85      0.72      0.78       185

    accuracy                           0.93      1126
   macro avg       0.90      0.85      0.87      1126
weighted avg       0.93      0.93      0.93      1126


# Fit a linear-kernel support vector classifier with a fixed seed
# (`fit` returns the estimator itself, so it can be bound directly).
svm_model = SVC(kernel='linear', random_state=42).fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

# Print the confusion matrix and the per-class report for the held-out rows.
print("SVM:")
print(confusion_matrix(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))

SVM:
[[941   0]
 [185   0]]
              precision    recall  f1-score   support

           0       0.84      1.00      0.91       941
           1       0.00      0.00      0.00       185

    accuracy                           0.84      1126
   macro avg       0.42      0.50      0.46      1126
weighted avg       0.70      0.84      0.76      1126

/Users/khouloud/anaconda3/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning:

Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

/Users/khouloud/anaconda3/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning:

Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

/Users/khouloud/anaconda3/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning:

Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


# Fitted models and their display names, in matching order.
# The collection is called `models` rather than `list`: binding the name
# `list` at module level hides the builtin for every later cell.
models = [logreg, svm_model, rf, xgb_model]
list_name = ['Logistic Regression', 'Support Vector Machine', 'Random Forest', 'XGBClassifier']


# Accuracy of each model on the training rows and on the held-out rows.
train_acc_list = [accuracy_score(y_train, mdl.predict(X_train)) for mdl in models]
test_acc_list = [accuracy_score(y_test, mdl.predict(X_test)) for mdl in models]


# Summary table: one row per model, indexed by display name.
all_models = pd.DataFrame({
    'Train_Accuracy': train_acc_list,
    'Test_Accuracy': test_acc_list
}, index=list_name)
# Notebook display of the table.
all_models


# Test-set scores for the ROC curves: column index 1 of each model's
# `predict_proba` output.
y_score_logreg, y_score_rf, y_score_xgb = (
    fitted.predict_proba(X_test)[:, 1] for fitted in (logreg, rf, xgb_model)
)


def _create_accuracy_figure(column, label):
    """Build a horizontal bar chart of one accuracy column of ``all_models``.

    Args:
        column: Name of the ``all_models`` column to plot.
        label: Human-readable name of the metric, used for the title, the
            x axis, the colour bar and the legend title.

    Returns:
        A ``go.Figure`` with one bar per model; bars are coloured and
        annotated with their accuracy value.
    """
    scores = all_models[column]
    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=scores,
        y=all_models.index,
        text=scores,
        textposition='auto',
        marker=dict(
            color=scores,
            colorscale='Viridis',
            colorbar=dict(title=label),
        ),
        orientation='h',
    ))
    fig.update_layout(
        title=f'Models vs {label}',
        xaxis_title=label,
        yaxis_title='Model Names',
        template='plotly',
        title_font_size=20,
        title_font_color='black',
        hoverlabel_font_size=15,
        legend_title=f'<b>{label}</b>',
        legend=dict(
            title_font_size=15,
            font=dict(size=12),
        ),
    )
    return fig


def create_train_accuracy_figure():
    """Return the bar chart of each model's accuracy on the training set."""
    return _create_accuracy_figure('Train_Accuracy', 'Train Accuracy')


def create_test_accuracy_figure():
    """Return the bar chart of each model's accuracy on the test set."""
    return _create_accuracy_figure('Test_Accuracy', 'Test Accuracy')

def create_roc_curve_figure():
    """Return a figure with one ROC curve per probabilistic model.

    Each curve is computed from ``y_test`` and the model's test-set scores,
    and its legend entry shows the area under the curve to three decimals.
    """
    # (legend label, test-set scores, line colour) for each model, in draw order.
    curves = (
        ('Logistic Regression', y_score_logreg, 'rgba(50, 171, 96, 1)'),
        ('Random Forest', y_score_rf, 'rgba(128, 0, 128, 1)'),
        ('XGBoost', y_score_xgb, 'rgba(255, 140, 0, 1)'),
    )

    fig = go.Figure()
    for label, scores, colour in curves:
        false_pos_rate, true_pos_rate, _ = roc_curve(y_test, scores)
        area = auc(false_pos_rate, true_pos_rate)
        fig.add_trace(go.Scatter(
            x=false_pos_rate,
            y=true_pos_rate,
            mode='lines',
            name=f'{label} (AUC = {area:.3f})',
            line=dict(color=colour, width=2)
        ))

    fig.update_layout(
        title='ROC Curves of Models',
        xaxis_title='False Positive Rate (FPR)',
        yaxis_title='True Positive Rate (TPR)',
        template='plotly',
        title_font_size=20,
        title_font_color='black',
        hoverlabel_font_size=15,
        legend=dict(
            title='Models',
            title_font_size=15,
            font=dict(size=12, color='black'),
        ),
    )
    return fig

def create_download_link(fig):
    """Return the HTML export of ``fig`` as base64 text.

    The figure writes its HTML into an in-memory text buffer; that text is
    encoded to bytes with the default codec and then base64-encoded.

    Args:
        fig: Object exposing ``write_html(file_like)``.

    Returns:
        The base64 encoding of the HTML document, as ``str``.
    """
    html_sink = io.StringIO()
    fig.write_html(html_sink)
    document = html_sink.getvalue()
    return b64encode(document.encode()).decode()

def create_dash_app(fig, encoded, filename, port):
    """Serve ``fig`` in its own Dash app on ``port``, from a background thread.

    The page shows the figure and a button that downloads the pre-encoded
    HTML export through a ``data:`` URI.

    Args:
        fig: Figure handed to ``dcc.Graph``.
        encoded: Base64 text of the HTML document offered for download.
        filename: File name suggested to the browser for the download.
        port: TCP port the server listens on.

    Returns:
        The started ``threading.Thread`` that runs the server.
    """
    app = Dash(__name__)
    app.layout = html.Div([
        # Dash style dictionaries use camelCased CSS property names.
        html.P("↓↓↓ try downloading the plot as HTML ↓↓↓", style={"textAlign": "right", "fontWeight": "bold"}),
        dcc.Graph(id="graph", figure=fig),
        html.A(
            html.Button("Download as HTML"),
            id="download",
            href="data:text/html;base64," + encoded,
            download=filename
        )
    ])
    # Debug mode normally starts the auto-reloader, which installs signal
    # handlers and therefore only works in the main thread. It is switched off
    # so the server can start from a worker thread; the rest of the debug
    # tooling stays enabled.
    server_thread = threading.Thread(
        target=app.run_server,
        kwargs={'debug': True, 'use_reloader': False, 'port': port},
    )
    server_thread.start()
    return server_thread

if __name__ == '__main__':
    # One Dash app per figure: (figure factory, download file name, port).
    dashboards = (
        (create_train_accuracy_figure, "train_plotly_graph.html", 8050),
        (create_test_accuracy_figure, "test_plotly_graph.html", 8051),
        (create_roc_curve_figure, "roc_plotly_graph.html", 8052),
    )

    # Build each figure, encode its HTML export, then launch its server.
    for build_figure, download_name, listen_port in dashboards:
        figure = build_figure()
        create_dash_app(figure, create_download_link(figure), download_name, port=listen_port)

	CustomerID	Churn	Tenure	CityTier	WarehouseToHome	HourSpendOnApp	NumberOfDeviceRegistered	SatisfactionScore	NumberOfAddress	Complain	...	Gender_Male	PreferedOrderCat_Fashion	PreferedOrderCat_Laptop & Accessory	PreferedOrderCat_Mobile	PreferedOrderCat_Mobile Phone	PreferedOrderCat_Others	MaritalStatus_Divorced	MaritalStatus_Married	MaritalStatus_Single
0	50001	1	4.0	3	6.0	3.000000	3	2	9	1	...	0	0	1	0	0	0	0	0	1
1	50002	1	9.0	1	8.0	3.000000	4	3	7	1	...	1	0	0	1	0	0	0	0	1
2	50003	1	9.0	1	30.0	2.000000	4	3	6	1	...	1	0	0	1	0	0	0	0	1
3	50004	1	0.0	3	15.0	2.000000	4	5	8	0	...	1	0	1	0	0	0	0	0	1
4	50005	1	0.0	1	12.0	2.931535	3	5	3	0	...	1	0	0	1	0	0	0	0	1
5	50006	1	0.0	1	22.0	3.000000	5	5	2	1	...	0	0	0	0	1	0	0	0	1
6	50007	1	9.0	3	11.0	2.000000	3	2	4	0	...	1	0	1	0	0	0	1	0	0
7	50008	1	9.0	1	6.0	3.000000	3	2	3	1	...	1	0	0	1	0	0	1	0	0
8	50009	1	13.0	3	9.0	2.931535	4	3	2	1	...	1	0	0	1	0	0	1	0	0
9	50010	1	9.0	1	31.0	2.000000	5	3	2	0	...	1	0	0	1	0	0	0	0	1
10	50011	1	4.0	1	18.0	2.000000	3	3	2	0	...	0	0	0	0	0	1	1	0	0
11	50012	1	11.0	1	6.0	3.000000	4	3	10	1	...	1	1	0	0	0	0	0	0	1
12	50013	1	0.0	1	11.0	2.000000	3	3	2	1	...	1	0	0	1	0	0	0	0	1
13	50014	1	0.0	1	15.0	3.000000	4	3	1	1	...	1	0	0	1	0	0	1	0	0
14	50015	1	9.0	3	15.0	3.000000	4	2	2	0	...	1	1	0	0	0	0	0	0	1
15	50016	1	9.0	2	12.0	3.000000	3	5	5	1	...	1	0	0	1	0	0	0	1	0
16	50017	1	0.0	1	12.0	2.931535	4	2	2	1	...	0	0	0	1	0	0	0	0	1
17	50018	1	0.0	3	11.0	2.000000	4	3	2	1	...	1	0	1	0	0	0	0	0	1
18	50019	1	0.0	1	13.0	3.000000	5	3	2	1	...	1	0	1	0	0	0	0	0	1
19	50020	1	19.0	1	20.0	3.000000	3	4	10	1	...	0	0	0	0	1	0	1	0	0

	CustomerID	Churn	Tenure	CityTier	WarehouseToHome	HourSpendOnApp	NumberOfDeviceRegistered	SatisfactionScore	NumberOfAddress	Complain	...	Gender_Male	PreferedOrderCat_Laptop & Accessory	PreferedOrderCat_Mobile	MaritalStatus_Single
0	50001	1	-0.733989	3	-1.146379	0.097069	-0.67290	2	9	1	...	0	1	0	1
1	50002	1	-0.135704	1	-0.906721	0.097069	0.30375	3	7	1	...	1	0	1	1
2	50003	1	-0.135704	1	1.729519	-1.320723	0.30375	3	6	1	...	1	0	1	1
3	50004	1	-1.212618	3	-0.067917	-1.320723	0.30375	5	8	0	...	1	1	0	1
4	50005	1	-1.212618	1	-0.427405	0.000000	-0.67290	5	3	0	...	1	0	1	1

	CustomerID	Churn	Tenure	NumberOfDeviceRegistered	SatisfactionScore	NumberOfAddress	Complain	DaySinceLastOrder	CashbackAmount	PreferedOrderCat_Laptop & Accessory	PreferedOrderCat_Mobile	MaritalStatus_Single
0	50001	1	-0.733989	-0.67290	2	9	1	0.151436	-0.350033	1	0	1
1	50002	1	-0.135704	0.30375	3	7	1	-1.249003	-1.142885	0	1	1
2	50003	1	-0.135704	0.30375	3	6	1	-0.408739	-1.163214	0	1	1
3	50004	1	-1.212618	0.30375	5	8	0	-0.408739	-0.878601	1	0	1
4	50005	1	-1.212618	-0.67290	5	3	0	-0.408739	-0.959919	0	1	1

	CustomerID	Churn	Tenure	PreferredLoginDevice	CityTier	WarehouseToHome	PreferredPaymentMode	Gender	HourSpendOnApp	NumberOfDeviceRegistered	PreferedOrderCat	SatisfactionScore	MaritalStatus	NumberOfAddress	Complain	OrderAmountHikeFromlastYear	CouponUsed	OrderCount	DaySinceLastOrder	CashbackAmount
0	50001	1	4.0	Mobile Phone	3	6.0	Debit Card	Female	3.0	3	Laptop & Accessory	2	Single	9	1	11.0	1.0	1.0	5.0	160
1	50002	1	NaN	Phone	1	8.0	UPI	Male	3.0	4	Mobile	3	Single	7	1	15.0	0.0	1.0	0.0	121
2	50003	1	NaN	Phone	1	30.0	Debit Card	Male	2.0	4	Mobile	3	Single	6	1	14.0	0.0	1.0	3.0	120
3	50004	1	0.0	Phone	3	15.0	Debit Card	Male	2.0	4	Laptop & Accessory	5	Single	8	0	23.0	0.0	1.0	3.0	134
4	50005	1	0.0	Phone	1	12.0	CC	Male	NaN	3	Mobile	5	Single	3	0	11.0	1.0	1.0	3.0	130

	Train_Accuracy	Test_Accuracy
Logistic Regression	0.830595	0.835702
Support Vector Machine	0.830595	0.835702
Random Forest	1.000000	0.934281
XGBClassifier	0.998224	0.933393

Big Data Project: E-commerce Customer Churn Prediction¶

1. Préparation de la data¶

a. Charger les modules¶

b. Charger la dataset¶

c. Chercher les doublons:¶

d. Analyse des valeurs manquantes¶

e. Distribution des variables avec des valeurs manquantes¶

Tenure :¶

HourSpendOnApp :¶

DaySinceLastOrder :¶

WarehouseToHome :¶

OrderAmountHikeFromlastYear :¶

CouponUsed & OrderCount :¶

2. Explorer la data¶

a. Les variables Catégorielles¶

PreferredLoginDevice :¶

PreferredPaymentMode :¶

Gender :¶

PreferedOrderCat :¶

MaritalStatus :¶

b. Les variables numériques et la variable Churn¶

c. Les variables catégorielles et la variable Churn¶

d. Matrice de corrélation¶

e. Conclusion des relations¶

3. Traitement de la data¶

a. Standardisation et normalisation des variables :¶

Explication du graphe de corrélation :¶

Proche de +1 :¶

Proche de -1 :¶

Proche de 0 :¶

b. Test Chi-squared¶

Interpretation du graphe de Chi-carré¶

Suppression des variables¶

4. Préparation des modèles¶

a. Application du Train Test Split¶

Le choix de ces modèles de ML est assez simple : ce sont les modèles les plus connus et les plus utilisés pour des tâches de classification binaire, comme dans notre cas, la prédiction qu'un client se désabonne ou non.¶

b. Application du modèle Logistic Regression¶

c. Application du modèle RandomForestClassifier¶

d. Application du modèle XGBoost¶

e. Application du modèle Support Vector Machine¶

f. Évaluer les modèles et ROC Curves¶