We have six geometric measurements for each banknote,
and one label telling whether the banknote is genuine.
The objective: build an algorithm that detects counterfeit banknotes while keeping false positives as low as possible.
#toolbox
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
#stats tools
from scipy.stats import spearmanr, mode, shapiro, probplot
#ML tools
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split,cross_validate, StratifiedKFold
from sklearn.metrics import (mean_squared_error, mean_absolute_percentage_error,
accuracy_score, confusion_matrix, precision_score,
recall_score, f1_score, roc_curve,auc)
from statsmodels.stats.diagnostic import het_breuschpagan
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor
#Suppress warning messages
import warnings
warnings.filterwarnings("ignore")
def show_shape(datasets):
"""
datasets = {str: array, ...}
Print the shape of each dataset along with its name.
"""
for name, data in datasets.items():
print("{} : {}".format(data.shape, name))
#Editor is Openclassrooms
def correlation_graph(pca, x_y, features):
"""Display the correlation circle graph.
Positional arguments:
-----------------------------------
pca: sklearn.decomposition.PCA: our fitted PCA object
x_y: list or tuple: the x, y pair of dimensions to display, e.g., [0, 1] for PC1, PC2
features: list or tuple: the list of features (i.e., dimensions) to represent
"""
# Extract x and y
x, y = x_y
# Image size (in inches)
fig, ax = plt.subplots(figsize=(8, 7))
# For each original feature, draw its loading vector on the selected components
for i in range(pca.components_.shape[1]):
# Arrows
ax.arrow(0, 0,
pca.components_[x, i],
pca.components_[y, i],
head_width=0.07,
head_length=0.07,
width=0.02)
# Labels
plt.text(pca.components_[x, i] + 0.05,
pca.components_[y, i] + 0.05,
features[i])
# Display horizontal and vertical lines
plt.plot([-1, 1], [0, 0], color='grey', ls='--')
plt.plot([0, 0], [-1, 1], color='grey', ls='--')
# Axes labels, with the percentage of explained variance
plt.xlabel('PC{} ({}%)'.format(x+1, round(100*pca.explained_variance_ratio_[x], 1)))
plt.ylabel('PC{} ({}%)'.format(y+1, round(100*pca.explained_variance_ratio_[y], 1)))
plt.title("Correlation Circle (PC{} and PC{})".format(x+1, y+1))
# The circle
an = np.linspace(0, 2 * np.pi, 100)
plt.plot(np.cos(an), np.sin(an)) # Add a unit circle for scale
# Axes and display
plt.axis('equal')
# plt.show(block=False)
def show_scores(y, y_pred):
"""
y = actual values
y_pred = predicted values
------
return: confusion matrix (variable); scores are printed
Show performance indicators:
- Accuracy: the proportion of correct predictions among all predictions. It is a good indicator when the classes are balanced
- Where TP is the number of true positives, TN the number of true negatives, FP the number of false positives, and FN the number of false negatives
- Precision: the proportion of positive predictions that are truly positive. It matters when the cost of false positives is high
- Recall (or sensitivity): the proportion of actual positives that are correctly predicted as such. It matters when the cost of false negatives is high
- F1-score: the harmonic mean of precision and recall. It provides a balance between these two measures
"""
confusion_mat = confusion_matrix(y, y_pred)
accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred)
recall = recall_score(y, y_pred)
f1 = f1_score(y, y_pred)
print('Confusion Matrix:\n{}'.format(confusion_mat))
#Get the values from the confusion matrix
TN, FP, FN, TP = confusion_mat.ravel()
#Compute the rates from the confusion matrix
TPR_test = round(TP / (TP + FN) * 100, 2) #True positive rate (recall)
FPR_test = round(FP / (FP + TN) * 100, 2) #False positive rate
TNR_test = round(TN / (TN + FP) * 100, 2) #True negative rate (specificity)
FNR_test = round(FN / (FN + TP) * 100, 2) #False negative rate
#Display the rates for the test set
print("\nTrue positive rate (recall) : {}%".format(TPR_test))
print("False positive rate : {}%".format(FPR_test))
print("True negative rate (specificity) : {}%".format(TNR_test))
print("False negative rate : {}%".format(FNR_test))
print("\n--------------\n")
print('Accuracy: {}'.format(accuracy))
print('Precision: {}'.format(precision))
print('Recall: {}'.format(recall))
print('F1 Score: {}'.format(f1))
return confusion_mat
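As a reminder of the formulas behind these scores, here is a minimal hand computation on made-up counts (illustrative only, not from our dataset):
#Illustrative only: recompute the scores by hand from a confusion matrix [[TN, FP], [FN, TP]]
TN, FP, FN, TP = 90, 10, 5, 95  #made-up counts
accuracy = (TP + TN) / (TP + TN + FP + FN)
precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1 = 2 * precision * recall / (precision + recall)
print(accuracy, round(precision, 3), recall, round(f1, 3))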
df_source = pd.read_csv("Ressources/billets.csv", delimiter=";")
df_source
| | is_genuine | diagonal | height_left | height_right | margin_low | margin_up | length |
|---|---|---|---|---|---|---|---|
0 | True | 171.81 | 104.86 | 104.95 | 4.52 | 2.89 | 112.83 |
1 | True | 171.46 | 103.36 | 103.66 | 3.77 | 2.99 | 113.09 |
2 | True | 172.69 | 104.48 | 103.50 | 4.40 | 2.94 | 113.16 |
3 | True | 171.36 | 103.91 | 103.94 | 3.62 | 3.01 | 113.51 |
4 | True | 171.73 | 104.28 | 103.46 | 4.04 | 3.48 | 112.54 |
... | ... | ... | ... | ... | ... | ... | ... |
1495 | False | 171.75 | 104.38 | 104.17 | 4.42 | 3.09 | 111.28 |
1496 | False | 172.19 | 104.63 | 104.44 | 5.27 | 3.37 | 110.97 |
1497 | False | 171.80 | 104.01 | 104.12 | 5.51 | 3.36 | 111.95 |
1498 | False | 172.06 | 104.28 | 104.06 | 5.17 | 3.46 | 112.25 |
1499 | False | 171.47 | 104.15 | 103.82 | 4.63 | 3.37 | 112.07 |
1500 rows × 7 columns
df_source.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   is_genuine    1500 non-null   bool
 1   diagonal      1500 non-null   float64
 2   height_left   1500 non-null   float64
 3   height_right  1500 non-null   float64
 4   margin_low    1463 non-null   float64
 5   margin_up     1500 non-null   float64
 6   length        1500 non-null   float64
dtypes: bool(1), float64(6)
memory usage: 71.9 KB
There are 37 missing values in the margin_low feature.
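A quick sanity check of the missing values per column:
#Count missing values in each column
print(df_source.isna().sum())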
df_source.duplicated().sum()
0
There are no duplicated banknotes.
df_source.describe().round(2)
| | diagonal | height_left | height_right | margin_low | margin_up | length |
|---|---|---|---|---|---|---|
count | 1500.00 | 1500.00 | 1500.00 | 1463.00 | 1500.00 | 1500.00 |
mean | 171.96 | 104.03 | 103.92 | 4.49 | 3.15 | 112.68 |
std | 0.31 | 0.30 | 0.33 | 0.66 | 0.23 | 0.87 |
min | 171.04 | 103.14 | 102.82 | 2.98 | 2.27 | 109.49 |
25% | 171.75 | 103.82 | 103.71 | 4.01 | 2.99 | 112.03 |
50% | 171.96 | 104.04 | 103.92 | 4.31 | 3.14 | 112.96 |
75% | 172.17 | 104.23 | 104.15 | 4.87 | 3.31 | 113.34 |
max | 173.01 | 104.88 | 104.95 | 6.90 | 3.91 | 114.44 |
for col in df_source.columns[1:]:
plt.figure(figsize=(10,10))
plt.suptitle("===| {} |===".format(col), color="Green")
plt.subplot(221)
plt.title("False & True")
sns.boxplot(data=df_source, x="is_genuine", y=col);
#------------
plt.subplot(222)
plt.title("False & True")
sns.histplot(data=df_source, x=col, kde=True);
mean = df_source[col].mean()
med = df_source[col].median()
mod = mode(df_source[col], axis=None, keepdims=False)[0] #scipystats object
plt.axvline(mean, color='r', linestyle='dashed', linewidth=2, label='Mean')
plt.axvline(med, color='g', linestyle='dashed', linewidth=2, label='Median')
plt.axvline(mod, color='b', linestyle='dashed', linewidth=2, label='Mode')
legend = ["Distribution",
"Mean ({})".format(str(mean.round(2))),
"Median ({})".format(str(med.round(2))),
"Mode ({})".format(str(mod.round(2)))]
plt.legend(legend)
plt.xticks(rotation=45)
#Compare normal distribution vs actual with Shapiro-Wilk test
statistic, p_value = shapiro(df_source[col])
#Coordinates for the annotation text
text_x = df_source[col].max() - (df_source[col].std()*4)
text_y = df_source[col].value_counts().max()
#Return the result
alpha = 0.05 #Alpha level
if p_value > alpha:
result = "Gaussian distribution : YES"
else:
result = "Gaussian distribution : NO"
plt.text(x=text_x, y=text_y, s="Test value : {}\nP-value : {}\n{}".format(round(statistic,2), round(p_value,2), result), bbox=dict(facecolor='white', edgecolor='white', boxstyle='round,pad=0.5'))
#------------
df_temp = df_source.loc[df_source["is_genuine"] == 0]
plt.subplot(223)
plt.title("False")
sns.histplot(data=df_temp, x=col, kde=True);
mean = df_temp[col].mean()
med = df_temp[col].median()
mod = mode(df_temp[col], axis=None, keepdims=False)[0] #scipystats object
plt.axvline(mean, color='r', linestyle='dashed', linewidth=2, label='Mean')
plt.axvline(med, color='g', linestyle='dashed', linewidth=2, label='Median')
plt.axvline(mod, color='b', linestyle='dashed', linewidth=2, label='Mode')
legend = ["Distribution",
"Mean ({})".format(str(mean.round(2))),
"Median ({})".format(str(med.round(2))),
"Mode ({})".format(str(mod.round(2)))]
plt.legend(legend)
plt.xticks(rotation=45)
#Compare normal distribution vs actual with Shapiro-Wilk test
statistic, p_value = shapiro(df_temp[col])
#Coordinates for the annotation text
text_x = df_temp[col].max() - (df_temp[col].std()*4)
text_y = df_temp[col].value_counts().max()
#Return the result
alpha = 0.05 #Alpha level
if p_value > alpha:
result = "Gaussian distribution : YES"
else:
result = "Gaussian distribution : NO"
plt.text(x=text_x, y=text_y, s="Test value : {}\nP-value : {}\n{}".format(round(statistic,2), round(p_value,2), result), bbox=dict(facecolor='white', edgecolor='white', boxstyle='round,pad=0.5'))
med_false = med
#------------
df_temp = df_source.loc[df_source["is_genuine"] == 1]
plt.subplot(224)
plt.title("True")
sns.histplot(data=df_temp, x=col, kde=True);
mean = df_temp[col].mean()
med = df_temp[col].median()
mod = mode(df_temp[col], axis=None, keepdims=False)[0] #scipystats object
plt.axvline(mean, color='r', linestyle='dashed', linewidth=2, label='Mean')
plt.axvline(med, color='g', linestyle='dashed', linewidth=2, label='Median')
plt.axvline(mod, color='b', linestyle='dashed', linewidth=2, label='Mode')
legend = ["Distribution",
"Mean ({})".format(str(mean.round(2))),
"Median ({})".format(str(med.round(2))),
"Mode ({})".format(str(mod.round(2)))]
plt.legend(legend)
plt.xticks(rotation=45)
#Compare normal distribution vs actual with Shapiro-Wilk test
statistic, p_value = shapiro(df_temp[col])
#Coordinates for the annotation text
text_x = df_temp[col].max() - (df_temp[col].std()*4)
text_y = df_temp[col].value_counts().max()
#Return the result
alpha = 0.05 #Alpha level
if p_value > alpha:
result = "Gaussian distribution : YES"
else:
result = "Gaussian distribution : NO"
plt.text(x=text_x, y=text_y, s="Test value : {}\nP-value : {}\n{}".format(round(statistic,2), round(p_value,2), result), bbox=dict(facecolor='white', edgecolor='white', boxstyle='round,pad=0.5'))
med_true = med
#------------
plt.tight_layout(w_pad=3)
med_diff = round(med_false - med_true, 2)
med_prop = round((med_false - med_true)/med_true, 2)
print("Column: {} | Median difference (False - True): {} (relative: {})".format(col, med_diff, med_prop))
plt.show()
Column: diagonal | Median difference (False - True): -0.08 (relative: -0.0)
Column: height_left | Median difference (False - True): 0.23 (relative: 0.0)
Column: height_right | Median difference (False - True): 0.35 (relative: 0.0)
Column: margin_low | Median difference (False - True): 1.08 (relative: 0.26)
Column: margin_up | Median difference (False - True): 0.3 (relative: 0.1)
Column: length | Median difference (False - True): -1.58 (relative: -0.01)
Now let's focus on the pairwise relationships between features:
sns.pairplot(data=df_source, hue="is_genuine", height=1.8, plot_kws={'alpha': 0.2}); #plot_kws to set alpha
plt.show()
The two classes (True / False) are very easy to separate on the length feature.
Let's check the correlated feature pairs:
#Some features do not follow a Gaussian distribution, so the Spearman method is used
sns.heatmap(df_source.corr(method="spearman"), annot=True)
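As a side note, a minimal sketch comparing the two methods; Pearson measures linear association while Spearman works on ranks (monotonic association), so it is more robust when distributions are not Gaussian:
#Sketch: compare Pearson (linear) vs Spearman (rank-based) correlations
pearson = df_source.corr(method="pearson")
spearman = df_source.corr(method="spearman")
print((pearson - spearman).abs().max().max().round(3)) #largest disagreement between the two matrices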
Picking interesting features to use in the linear regression:
features = ['length',"margin_up", "height_right"]
target = "margin_low"
#Check that the correlations are statistically significant
for feature in features:
statistic, p_value = spearmanr(df_source[target], df_source[feature], nan_policy="omit") #NaN values are ignored
alpha = 0.05 #alpha level
print("Correlation test between margin_low | {}".format(feature))
if p_value > alpha:
result = "The correlation could be due to chance (p_value : {} > {} (alpha))".format(round(p_value,2), alpha)
else:
result = "There is a significant correlation\n\np_value : {} < {} (alpha)\ncorrelation stat : {}\n-------------------".format(round(p_value,2), alpha, round(statistic,2))
print(result)
Correlation test between margin_low | length
There is a significant correlation

p_value : 0.0 < 0.05 (alpha)
correlation stat : -0.59
-------------------
Correlation test between margin_low | margin_up
There is a significant correlation

p_value : 0.0 < 0.05 (alpha)
correlation stat : 0.42
-------------------
Correlation test between margin_low | height_right
There is a significant correlation

p_value : 0.0 < 0.05 (alpha)
correlation stat : 0.4
-------------------
#Visualize the correlations
num_subplot = (len(features)*10) + 101
plt.figure(figsize=(12,5))
plt.suptitle("Correlation with - margin_low -")
for feature in features:
plt.subplot(num_subplot)
sns.regplot(data=df_source, x=feature, y=target, line_kws={"color": "red"})
sns.scatterplot(data=df_source, x=feature, y=target, hue="is_genuine")
num_subplot += 1
plt.tight_layout(w_pad=5)
plt.show()
#Split the rows with a missing margin_low from the complete rows
df_full_values = df_source.loc[df_source["margin_low"].notna()].copy() #.copy() avoids SettingWithCopyWarning when adding columns later
print(df_full_values.shape)
df_to_impute = df_source.loc[df_source["margin_low"].isna()]
print(df_to_impute.shape)
(1463, 7)
(37, 7)
df_full_values["margin_up_squared"] = df_full_values["margin_up"]**2
#Define different features to test with the model
models = {"margin_low ~ length":["length"],
"margin_low ~ length + margin_up":["length", "margin_up"],
"margin_low ~ length + margin_up_squared":["length", "margin_up_squared"],
"margin_low ~ length + margin_up + height_right":["length", "margin_up", "height_right"]}
#Compare the scores of the different feature sets
for name, features in models.items():
#Feedback for information
print("==================\n{}\n------------------".format(name))
#Define features (X) and target (y)
X = df_full_values[features]
y = df_full_values[target]
#margin_up_squared has larger values -> standardize all features
norm_model = StandardScaler()
X_scaled = norm_model.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled)
X_scaled.columns = features
#Split train & test dataset
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=18, stratify=df_full_values["is_genuine"])
datasets = {"X_train":X_train, "X_test":X_test, "y_train":y_train, "y_test":y_test}
for set in datasets:
print("{} : {}".format(datasets[set].shape, set))
#Model init
model = LinearRegression()
#Fitting model on data
model.fit(X_train, y_train);
#Get the predicted values
y_pred_test = model.predict(X_test)
#Check the differences between actual and predicted values
mse_score = round(mean_squared_error(y_test, y_pred_test),3) #mean_squared_error returns the MSE, not the RMSE
mape_score = round(mean_absolute_percentage_error(y_test, y_pred_test),3)
print("------------------\nMSE: {}".format(mse_score))
print("MAPE: {}".format(mape_score))
#R2 score (computed on the full dataset)
print("R2 = {}".format(round(model.score(X_scaled, y),2)))
#Display the coefficients
coefficients = []
for i, feature in enumerate(features):
#a*x text
coef_text = "{} * {}".format(str(model.coef_[i].round(2)), feature)
coefficients.append(coef_text)
#Linear regression formula: f(x) = a1*x1 + a2*x2 + ... + an*xn + b
a = " + ".join(coefficients)
b = model.intercept_.round(2)
print("f(x) = {} + {}\n==================\n-------------".format(a, b))
==================
margin_low ~ length
------------------
(1170, 1) : X_train
(293, 1) : X_test
(1170,) : y_train
(293,) : y_test
------------------
MSE: 0.24
MAPE: 0.088
R2 = 0.44
f(x) = -0.45 * length + 4.49
==================
-------------
==================
margin_low ~ length + margin_up
------------------
(1170, 2) : X_train
(293, 2) : X_test
(1170,) : y_train
(293,) : y_test
------------------
MSE: 0.241
MAPE: 0.087
R2 = 0.45
f(x) = -0.4 * length + 0.08 * margin_up + 4.49
==================
-------------
==================
margin_low ~ length + margin_up_squared
------------------
(1170, 2) : X_train
(293, 2) : X_test
(1170,) : y_train
(293,) : y_test
------------------
MSE: 0.241
MAPE: 0.087
R2 = 0.45
f(x) = -0.4 * length + 0.09 * margin_up_squared + 4.49
==================
-------------
==================
margin_low ~ length + margin_up + height_right
------------------
(1170, 3) : X_train
(293, 3) : X_test
(1170,) : y_train
(293,) : y_test
------------------
MSE: 0.228
MAPE: 0.084
R2 = 0.47
f(x) = -0.38 * length + 0.07 * margin_up + 0.08 * height_right + 4.49
==================
-------------
margin_low ~ length + margin_up + height_right seems to be the most effective:
#Features picked from the previous test
features = ["length", "margin_up", "height_right"]
#Feedback for information
print("Feature : {}".format(features))
print("Target feature : {}".format(target))
Features : ['length', 'margin_up', 'height_right']
Target feature : margin_low
#Define features (X) and target (y)
X = df_full_values[features]
y = df_full_values[target]
#Split train & test dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=18, stratify=df_full_values["is_genuine"])
datasets = {"X_train":X_train, "X_test":X_test, "y_train":y_train, "y_test":y_test}
show_shape(datasets)
(1170, 3) : X_train
(293, 3) : X_test
(1170,) : y_train
(293,) : y_test
#Model init
model = LinearRegression()
#Fitting model on data
model.fit(X_train, y_train);
#Get the predicted values
y_pred_test = model.predict(X_test)
#Check the differences between actual and predicted values
mse_score = round(mean_squared_error(y_test, y_pred_test),3) #mean_squared_error returns the MSE, not the RMSE
mape_score = round(mean_absolute_percentage_error(y_test, y_pred_test),3)
print("MSE: {}".format(mse_score))
print("MAPE: {}".format(mape_score))
MSE: 0.228
MAPE: 0.084
The MSE is mainly useful here to compare feature sets against each other.
The MAPE is close to 0, so the model performs well.
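To make the MAPE concrete, it can be recomputed by hand from the test predictions above (a sketch, reusing y_test and y_pred_test):
#Sketch: MAPE by hand = mean(|y - y_hat| / |y|)
mape_manual = np.mean(np.abs(y_test - y_pred_test) / np.abs(y_test))
print(round(mape_manual, 3)) #should match mean_absolute_percentage_error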
#R2 score
print("R2 = {}".format(round(model.score(X, y),2)))
#Display the coefficients
coefficients = []
for i, feature in enumerate(features):
#a*x text
coef_text = "{} * {}".format(str(model.coef_[i].round(2)), feature)
coefficients.append(coef_text)
#Linear regression formula: f(x) = a1*x1 + a2*x2 + ... + an*xn + b
a = " + ".join(coefficients)
b = model.intercept_.round(2)
print("f(x) = {} + {}".format(a, b))
R2 = 0.47
f(x) = -0.43 * length + 0.32 * margin_up + 0.24 * height_right + 27.59
R2 should ideally be higher than 0.5 (close to 1) for the model to be truly representative.
With these features, this is the best score we can reach.
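As a cross-check (a sketch, not part of the original pipeline), the same regression can be fitted with statsmodels, which also reports p-values for each coefficient:
#Sketch: same fit with statsmodels OLS, for R2 plus coefficient p-values
X_sm = sm.add_constant(df_full_values[features])
ols_model = sm.OLS(df_full_values[target], X_sm).fit()
print(ols_model.summary())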
#Compute the residuals
residuals = y_test - y_pred_test
#Shapiro test
statistic, p_value = shapiro(residuals)
alpha = 0.05 #alpha level
if p_value > alpha:
print("The residuals follow a Gaussian distribution\np-value : {}".format(round(p_value,2)))
else:
print("The residuals do NOT follow a Gaussian distribution\np-value : {}".format(round(p_value,2)))
The residuals do NOT follow a Gaussian distribution
p-value : 0.04
#Visual check
plt.figure(figsize=(10,5))
plt.suptitle("Residuals values distribution")
plt.subplot(121)
sns.histplot(residuals, kde=True);
plt.subplot(122)
probplot(residuals, plot=plt)
plt.tight_layout(w_pad=5)
plt.show()
The test rejects normality, but only marginally (p-value 0.04, close to 0.05), and the plots look close enough to Gaussian.
plt.title("Homoscedasticity")
sns.scatterplot(x=y_pred_test, y=residuals)
plt.ylabel("residuals")
plt.xlabel("y_pred_test")
No specific pattern (funnel, W, V, S shape, etc.) appears -> homoscedasticity looks acceptable.
#Plot the residuals along the sample index
plt.subplots(figsize=(10, 6))
plt.scatter(x=X_test.index, y=residuals, alpha=0.5)
plt.axhline(0, color='red') #reference line at zero
plt.title('Residuals of the multiple linear regression model')
plt.show()
#Add a constant (intercept) column to the matrix of explanatory features
X_constant = sm.add_constant(X_test)
#Breusch-Pagan test
_, pval, _, _ = het_breuschpagan(residuals, X_constant)
# Display the p-value
print("P-value : {}".format(pval))
P-value : 0.0012222296083232555
According to the test, heteroskedasticity is present.
However, this test is quite strict.
The tests do not fully confirm our hypotheses, but the diagnostic plots are good enough to use this model to fill the NaN values.
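For reference, a simpler fallback would have been per-class median imputation; a minimal sketch for comparison (the df_median name is hypothetical):
#Sketch: per-class median imputation as a baseline to compare against the regression
df_median = df_source.copy()
df_median["margin_low"] = df_median.groupby("is_genuine")["margin_low"].transform(lambda s: s.fillna(s.median()))
print(df_median["margin_low"].isna().sum()) #0 NaN remaining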
#Build the design matrix, with an intercept (1) column added
_, X_vif = dmatrices('margin_low~length+margin_up+height_right', data=df_source, return_type='dataframe')
#Calculate VIF
vif = pd.DataFrame()
vif['VIF'] = [variance_inflation_factor(X_vif.values, i) for i in range(X_vif.shape[1])]
vif['variable'] = X_vif.columns
vif[1:]
| | VIF | variable |
|---|---|---|
1 | 1.509435 | length |
2 | 1.394010 | margin_up |
3 | 1.213794 | height_right |
The VIF (Variance Inflation Factor) should stay below 5 to rule out multicollinearity -> OK here.
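To make the VIF concrete: VIF_i = 1 / (1 - R_i²), where R_i² comes from regressing feature i on the other explanatory features. A sketch for length (computed on all 1500 rows, so it should be close to, not identical to, the table above):
#Sketch: VIF of length by hand, via the R2 of length ~ margin_up + height_right
X_others = df_source[["margin_up", "height_right"]]
r2 = LinearRegression().fit(X_others, df_source["length"]).score(X_others, df_source["length"])
print(round(1 / (1 - r2), 3))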
Let's visualize the predicted values for the rows with a missing margin_low:
#Predict missing values to visualize
x_missing_values = df_to_impute[features].iloc[:,0]
y_missing_values_pred = model.predict(df_to_impute[features]).tolist()
plt.title("Predicted values on the Real values graph")
sns.regplot(x=X.iloc[:,0], y=y.tolist(), color="g", scatter_kws={"alpha": 0.2}, line_kws={"color": "red", "alpha":0.5})
sns.scatterplot(x=x_missing_values, y=y_missing_values_pred, hue=df_to_impute["is_genuine"])
plt.ylabel("margin_low")
plt.xlabel("length")
plt.legend(["Real values", "Estimate model", "Confidence intervale", "Pred values - Genuine", "Pred values - Not Genuine"])
plt.show()
df_refill = df_source.copy()
df_refill.loc[df_refill["margin_low"].isna(), ["margin_low"]] = model.predict(df_refill.loc[df_refill["margin_low"].isna(),features])
df_refill.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   is_genuine    1500 non-null   bool
 1   diagonal      1500 non-null   float64
 2   height_left   1500 non-null   float64
 3   height_right  1500 non-null   float64
 4   margin_low    1500 non-null   float64
 5   margin_up     1500 non-null   float64
 6   length        1500 non-null   float64
dtypes: bool(1), float64(6)
memory usage: 71.9 KB
#Define features and target
features = ['diagonal', 'height_left', 'height_right',
'margin_low', 'margin_up', 'length']
target = 'is_genuine'
#Get information in X and y
X = df_refill[features]
y = df_refill[target]
#Scaling the data is mandatory for this model
model_normalizer = StandardScaler()
#Fit the scaler on the data
model_normalizer.fit(X)
#Scaling
X_scaled = model_normalizer.transform(X)
datasets = {"X_scaled":X_scaled}
show_shape(datasets)
(1500, 6) : X_scaled
#Model init
model_kmeans = KMeans(n_clusters=2, n_init=5, random_state=18)
#Fit the model on the data
model_kmeans.fit(X_scaled)
#Get the predicted cluster for each banknote
y_pred_kmeans = model_kmeans.predict(X_scaled)
#Get centroids
centroids = model_kmeans.cluster_centers_
#Get clusters
clusters = model_kmeans.labels_
#Get the confusion matrix and scores
#NB: KMeans cluster ids are arbitrary; here cluster 1 happens to match the genuine class, so the labels can be compared directly
confusion_matrice = show_scores(y, y_pred_kmeans)
Confusion Matrix:
[[486  14]
 [ 10 990]]

True positive rate (recall) : 99.0%
False positive rate : 2.8%
True negative rate (specificity) : 97.2%
False negative rate : 1.0%

--------------

Accuracy: 0.984
Precision: 0.9860557768924303
Recall: 0.99
F1 Score: 0.9880239520958083
Other performance indicators:
plt.figure(figsize=(8, 6))
plt.title('Confusion matrix')
sns.heatmap(confusion_matrice, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.xlabel('Predicted')
plt.ylabel('Real')
plt.show()
#init model
model_PCA = PCA()
#Fit on the data
model_PCA.fit(X_scaled)
#Project the data and the centroids onto the principal components
X_pca = model_PCA.transform(X_scaled)
centroids_pca = model_PCA.transform(centroids)
model_PCA.explained_variance_ratio_.round(2).cumsum()
array([0.43, 0.6 , 0.73, 0.85, 0.95, 1. ])
60% of the variance is explained by the first two principal components.
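A scree plot makes this easier to read (a sketch, reusing model_PCA from above):
#Sketch: scree plot of explained variance per principal component
ratios = model_PCA.explained_variance_ratio_ * 100
plt.figure(figsize=(8, 4))
plt.bar(range(1, len(ratios) + 1), ratios, label="Per component")
plt.plot(range(1, len(ratios) + 1), np.cumsum(ratios), c="red", marker="o", label="Cumulative")
plt.xlabel("Principal component")
plt.ylabel("Explained variance (%)")
plt.legend()
plt.show()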
correlation_graph(model_PCA, [0,1], features)
plt.figure(figsize=(8,8))
plt.title('Clusters and Centroids on PCA')
plt.scatter(X_pca[:, 0], X_pca[:, 1],c=y_pred_kmeans, s=50, alpha=0.5)
plt.scatter(centroids_pca[:, 0], centroids_pca[:, 1], c='red', marker='+', s=500, linewidth=2)
plt.show()
To conclude:
KMeans is a simple and reasonably effective way to classify, but this model is not perfect.
Let's try logistic regression.
#Show the column names for easy copy/paste
df_refill.columns
Index(['is_genuine', 'diagonal', 'height_left', 'height_right', 'margin_low', 'margin_up', 'length'], dtype='object')
#Build the design matrix, with an intercept (1) column added
_, X_vif = dmatrices('margin_low~diagonal+height_left+height_right+margin_up+length', data=df_refill, return_type='dataframe')
#Calculate VIF
vif = pd.DataFrame()
vif['VIF'] = [variance_inflation_factor(X_vif.values, i) for i in range(X_vif.shape[1])]
vif['variable'] = X_vif.columns
vif[1:]
| | VIF | variable |
|---|---|---|
1 | 1.012790 | diagonal |
2 | 1.145295 | height_left |
3 | 1.229263 | height_right |
4 | 1.403517 | margin_up |
5 | 1.574765 | length |
The VIF should stay below 5 to rule out multicollinearity -> OK here.
#Define different features to test with the model
models = {"is_genuine ~ length":["length"],
"is_genuine ~ length + margin_low":["length", "margin_low"],
"is_genuine ~ length + margin_low + margin_up":["length", "margin_low", "margin_up"],
"is_genuine ~ length + margin_low + height_right":["length", "margin_low", "height_right"],
"is_genuine ~ length + margin_low + margin_up + height_right":["length", "margin_low", "margin_up", "height_right"],
"is_genuine ~ ALL (length + margin_low + margin_up + height_right + height_left + diagonal)":["length", "margin_low", "margin_up", "height_right", "height_left", "diagonal"]}
Test with k-fold cross-validation (stratified on the target feature):
for name, features in models.items():
#Define the target
target = 'is_genuine'
#Define the number of folds
k = 8
#Feedback for information
print("==================\n{}\n------------------\nMean on {} folds".format(name, k))
#Get information in X and y
X = df_refill[features]
y = df_refill[target]
#Normalize data
norm_model = StandardScaler()
X_scaled = norm_model.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled)
X_scaled.columns = X.columns
#Init model
model = LogisticRegression()
#Init the stratified k-fold with shuffling
stratified_kf = StratifiedKFold(n_splits=k, shuffle=True, random_state=18)
#Define scores
scoring = {'precision': 'precision',
'accuracy': 'accuracy',
'recall': 'recall',
'f1': 'f1'}
# Use cross_validate to run the cross-validation and collect the scores
cv_results = cross_validate(model, X_scaled, y, scoring=scoring, cv=stratified_kf)
# Display the results
for metric, values in cv_results.items():
print("{}: {}".format(metric, round(np.mean(values),3)))
print("")
==================
is_genuine ~ length
------------------
Mean on 8 folds
fit_time: 0.004
score_time: 0.008
test_precision: 0.959
test_accuracy: 0.957
test_recall: 0.978
test_f1: 0.968

==================
is_genuine ~ length + margin_low
------------------
Mean on 8 folds
fit_time: 0.005
score_time: 0.007
test_precision: 0.983
test_accuracy: 0.984
test_recall: 0.993
test_f1: 0.988

==================
is_genuine ~ length + margin_low + margin_up
------------------
Mean on 8 folds
fit_time: 0.008
score_time: 0.009
test_precision: 0.988
test_accuracy: 0.989
test_recall: 0.996
test_f1: 0.992

==================
is_genuine ~ length + margin_low + height_right
------------------
Mean on 8 folds
fit_time: 0.005
score_time: 0.007
test_precision: 0.984
test_accuracy: 0.985
test_recall: 0.993
test_f1: 0.989

==================
is_genuine ~ length + margin_low + margin_up + height_right
------------------
Mean on 8 folds
fit_time: 0.005
score_time: 0.007
test_precision: 0.989
test_accuracy: 0.991
test_recall: 0.997
test_f1: 0.993

==================
is_genuine ~ ALL (length + margin_low + margin_up + height_right + height_left + diagonal)
------------------
Mean on 8 folds
fit_time: 0.006
score_time: 0.007
test_precision: 0.989
test_accuracy: 0.99
test_recall: 0.996
test_f1: 0.993
Test with train_test_split (stratified on the target feature):
for name, features in models.items():
#Define the target
target = 'is_genuine'
#Feedback for information
print("==================\n{}\n------------------".format(name))
#Get information in X and y
X = df_refill[features]
y = df_refill[target]
#Normalize data
norm_model = StandardScaler()
X_scaled = norm_model.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled)
X_scaled.columns = X.columns
#Split into train and test sets (stratified on the target to preserve class proportions)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=18, stratify=y)
datasets = {"X_train":X_train, "X_test":X_test, "y_train":y_train, "y_test":y_test}
show_shape(datasets)
print("")
#Model init
model_log = LogisticRegression()
#Fit the model on the data
model_log.fit(X_train, y_train);
#Predict the binary class
y_pred = model_log.predict(X_test)
#Predicted probabilities; column 1 = probability of the positive class (genuine)
y_proba = model_log.predict_proba(X_test)[:,1]
confusion_matrice = show_scores(y_test, y_pred)
print("")
==================
is_genuine ~ length
------------------
(1200, 1) : X_train
(300, 1) : X_test
(1200,) : y_train
(300,) : y_test

Confusion Matrix:
[[ 93   7]
 [  6 194]]

True positive rate (recall) : 97.0%
False positive rate : 7.0%
True negative rate (specificity) : 93.0%
False negative rate : 3.0%

--------------

Accuracy: 0.9566666666666667
Precision: 0.9651741293532339
Recall: 0.97
F1 Score: 0.9675810473815462

==================
is_genuine ~ length + margin_low
------------------
(1200, 2) : X_train
(300, 2) : X_test
(1200,) : y_train
(300,) : y_test

Confusion Matrix:
[[ 93   7]
 [  2 198]]

True positive rate (recall) : 99.0%
False positive rate : 7.0%
True negative rate (specificity) : 93.0%
False negative rate : 1.0%

--------------

Accuracy: 0.97
Precision: 0.9658536585365853
Recall: 0.99
F1 Score: 0.9777777777777777

==================
is_genuine ~ length + margin_low + margin_up
------------------
(1200, 3) : X_train
(300, 3) : X_test
(1200,) : y_train
(300,) : y_test

Confusion Matrix:
[[ 98   2]
 [  1 199]]

True positive rate (recall) : 99.5%
False positive rate : 2.0%
True negative rate (specificity) : 98.0%
False negative rate : 0.5%

--------------

Accuracy: 0.99
Precision: 0.9900497512437811
Recall: 0.995
F1 Score: 0.9925187032418954

==================
is_genuine ~ length + margin_low + height_right
------------------
(1200, 3) : X_train
(300, 3) : X_test
(1200,) : y_train
(300,) : y_test

Confusion Matrix:
[[ 97   3]
 [  2 198]]

True positive rate (recall) : 99.0%
False positive rate : 3.0%
True negative rate (specificity) : 97.0%
False negative rate : 1.0%

--------------

Accuracy: 0.9833333333333333
Precision: 0.9850746268656716
Recall: 0.99
F1 Score: 0.9875311720698254

==================
is_genuine ~ length + margin_low + margin_up + height_right
------------------
(1200, 4) : X_train
(300, 4) : X_test
(1200,) : y_train
(300,) : y_test

Confusion Matrix:
[[ 98   2]
 [  1 199]]

True positive rate (recall) : 99.5%
False positive rate : 2.0%
True negative rate (specificity) : 98.0%
False negative rate : 0.5%

--------------

Accuracy: 0.99
Precision: 0.9900497512437811
Recall: 0.995
F1 Score: 0.9925187032418954

==================
is_genuine ~ ALL (length + margin_low + margin_up + height_right + height_left + diagonal)
------------------
(1200, 6) : X_train
(300, 6) : X_test
(1200,) : y_train
(300,) : y_test

Confusion Matrix:
[[ 98   2]
 [  1 199]]

True positive rate (recall) : 99.5%
False positive rate : 2.0%
True negative rate (specificity) : 98.0%
False negative rate : 0.5%

--------------

Accuracy: 0.99
Precision: 0.9900497512437811
Recall: 0.995
F1 Score: 0.9925187032418954
All features* are selected given the high scores.
*Except diagonal, whose coefficient is close to 0 (see the sketch below).
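One way to verify that claim (a sketch, refitting on all six features and printing the standardized coefficients):
#Sketch: inspect standardized coefficients to confirm that diagonal contributes almost nothing
all_features = ["length", "margin_low", "margin_up", "height_right", "height_left", "diagonal"]
X_all = StandardScaler().fit_transform(df_refill[all_features])
coef_check = LogisticRegression().fit(X_all, df_refill["is_genuine"]).coef_[0]
for feat, coef in zip(all_features, coef_check):
    print("{} : {}".format(feat, round(coef, 3)))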
df_refill.columns
Index(['is_genuine', 'diagonal', 'height_left', 'height_right', 'margin_low', 'margin_up', 'length'], dtype='object')
#Define features and target
features = ['height_left', 'height_right',
'margin_low', 'margin_up', 'length']
target = 'is_genuine'
#Get information in X and y
X = df_refill[features]
y = df_refill[target]
#Normalize data
norm_model = StandardScaler()
X_scaled = norm_model.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled)
X_scaled.columns = X.columns
#Split into train and test sets (stratified on the target to preserve class proportions)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=18, stratify=y)
datasets = {"X_train":X_train, "X_test":X_test, "y_train":y_train, "y_test":y_test}
show_shape(datasets)
(1200, 5) : X_train
(300, 5) : X_test
(1200,) : y_train
(300,) : y_test
#Model init
model_log = LogisticRegression()
#Fit the model on the data
model_log.fit(X_train, y_train);
#Predict the binary class
y_pred = model_log.predict(X_test)
#Predicted probabilities; column 1 = probability of the positive class (genuine)
y_proba = model_log.predict_proba(X_test)[:,1]
coefficients = model_log.coef_
coefficients
array([[-0.45381789, -0.6780743 , -2.44481524, -1.60481819, 3.8007262 ]])
confusion_matrice = show_scores(y_test, y_pred)
Confusion Matrix:
[[ 98   2]
 [  1 199]]

True positive rate (recall) : 99.5%
False positive rate : 2.0%
True negative rate (specificity) : 98.0%
False negative rate : 0.5%

--------------

Accuracy: 0.99
Precision: 0.9900497512437811
Recall: 0.995
F1 Score: 0.9925187032418954
plt.figure(figsize=(8, 6))
plt.title('Confusion matrix')
sns.heatmap(confusion_matrice, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.xlabel('Predicted')
plt.ylabel('Real')
plt.show()
The results are better with this model.
Performance indicators across all thresholds:
ROC Curve: The Receiver Operating Characteristic (ROC) curve is a graph that illustrates the performance of a classification model for all classification thresholds. It plots the true positive rate (sensitivity) against the false positive rate (1-specificity).
AUC: The Area Under the Curve (AUC) is the area beneath the ROC curve. It provides an aggregated measure of the classification model's performance across all possible classification thresholds. It can also be interpreted as the probability that a classifier will rank a randomly chosen positive example higher than a randomly chosen negative example. Values range from 0 to 1, where a value of 0.5 corresponds to a random model, and a value of 1 corresponds to a perfect model.
ROC-AUC score: The ROC-AUC score is simply the numerical value for the AUC. It is used as a measure of the overall performance of a classification model.
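To make the probabilistic interpretation concrete, a quick Monte Carlo sketch (reusing y_test and y_proba from above; the estimate should land close to the AUC computed in the next cell):
#Sketch: AUC ≈ P(score of a random genuine note > score of a random fake note)
rng = np.random.default_rng(18)
pos = y_proba[np.asarray(y_test, dtype=bool)]
neg = y_proba[~np.asarray(y_test, dtype=bool)]
i = rng.integers(0, len(pos), 100000)
j = rng.integers(0, len(neg), 100000)
print(round(np.mean(pos[i] > neg[j]), 3))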
#Compute the false positive and true positive rates for every threshold
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)
#Show ROC and AUC curve
plt.figure()
plt.title("ROC line")
plt.plot(fpr, tpr, label="ROC AUC = %0.2f" % roc_auc)
plt.plot([0, 1], [0, 1], "k--")
plt.xlabel("Rate false trues")
plt.ylabel("Rate true trues")
plt.legend(loc="lower right")
plt.show()
The very high ROC AUC score indicates very good overall performance of the classification model.
Model selection:
In our context of counterfeit banknote detection, where it is crucial to catch all fake banknotes, logistic regression seems to be the best choice:
#Looking for the best threshold
df_thresholds = pd.DataFrame((fpr,tpr, thresholds)).round(3).T
columns = ["fp rate", "tp rate", "thresholds"]
df_thresholds.columns = columns
df_thresholds
fp rate | tp rate | thresholds | |
---|---|---|---|
0 | 0.00 | 0.000 | 2.000 |
1 | 0.00 | 0.005 | 1.000 |
2 | 0.00 | 0.990 | 0.756 |
3 | 0.01 | 0.990 | 0.644 |
4 | 0.01 | 0.995 | 0.614 |
5 | 0.11 | 0.995 | 0.077 |
6 | 0.11 | 1.000 | 0.069 |
7 | 1.00 | 1.000 | 0.000 |
The best threshold is 0.756: it keeps the false positive rate at 0 while already reaching a true positive rate of 0.99.
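To apply this threshold in practice instead of the default 0.5, a minimal sketch reusing y_proba and show_scores from above:
#Sketch: classify with the chosen threshold instead of the default 0.5
y_pred_best = y_proba >= 0.756
confusion_matrice_best = show_scores(y_test, y_pred_best)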
def fakebanknote_detection(csv_file, delimiter, model, scaler):
"""
Takes a CSV file and delimiter as input,
and returns a DataFrame with authenticity predictions for each banknote in the CSV file.
Parameters:
csv_file (str): The path to the CSV file. The CSV file should contain an 'id' column
and columns for each explanatory variable.
delimiter (str): Depends on the CSV file (";" , "," , " " , ".")
model: A trained classification model
scaler: The StandardScaler fitted on the training data
Return:
df (DataFrame): Containing the original data from the CSV file, along with an
'authenticity' column with authenticity predictions for each banknote
('True' for authentic, 'False' for non-authentic), and a
'probability_authenticity' column with the associated probability for each prediction
"""
#Read the CSV file
X = pd.read_csv(csv_file, delimiter=delimiter)
#Set 'id' as the index
X.set_index('id', inplace=True)
#Set aside the diagonal column (not used by the model)
df_archive = X["diagonal"]
X.drop("diagonal", axis=1, inplace=True)
#Normalize the data with the scaler fitted on the training data (re-fitting on production data would distort the predictions)
X_scaled = scaler.transform(X)
#Calculate the probability of the genuine class for each note
probabilities = model.predict_proba(X_scaled)
probabilities_true_values = probabilities[:, 1] * 100
#Classify with the selected threshold (0.756) instead of the default 0.5
best_threshold = 0.756 * 100
y_pred = ['True' if prob >= best_threshold else 'False' for prob in probabilities_true_values]
#Add predictions to the DataFrame
X['authenticity'] = y_pred
#Add the probability (%) of the predicted class to the DataFrame
X['probability_authenticity (%)'] = [round(prob, 2) if pred == 'True' else round(100 - prob, 2) for prob, pred in zip(probabilities_true_values, y_pred)]
return X
fakebanknote_detection(r"Ressources\billets_production.csv", ",", model_log)
| id | height_left | height_right | margin_low | margin_up | length | authenticity | probability_authenticity (%) |
|---|---|---|---|---|---|---|---|
A_1 | 104.01 | 103.54 | 5.21 | 3.30 | 111.42 | False | 95.97 |
A_2 | 104.17 | 104.13 | 6.00 | 3.31 | 112.09 | False | 98.28 |
A_3 | 104.58 | 104.29 | 4.99 | 3.39 | 111.57 | False | 99.36 |
A_4 | 104.55 | 104.34 | 4.44 | 3.03 | 113.20 | True | 99.99 |
A_5 | 103.63 | 103.56 | 3.77 | 3.16 | 113.33 | True | 100.00 |