We have six geometric measurements for each banknote,
and one label telling whether the banknote is genuine.
The objective: build an algorithm that detects counterfeit banknotes while keeping false positives as low as possible.
#toolbox
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
#stats tools
from scipy.stats import spearmanr, mode, shapiro, probplot
#ML tools
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split,cross_validate, StratifiedKFold
from sklearn.metrics import (mean_squared_error, mean_absolute_percentage_error,
accuracy_score, confusion_matrix, precision_score,
recall_score, f1_score, roc_curve,auc)
from statsmodels.stats.diagnostic import het_breuschpagan
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor
#Suppress warning messages
import warnings
warnings.filterwarnings("ignore")
def show_shape(datasets):
"""
datasets = {str: array, ...}
Print the shape of each dataset along with its name.
"""
for name, data in datasets.items():
print("{} : {}".format(data.shape, name))
#Editor is Openclassrooms
def correlation_graph(pca, x_y, features):
"""Display the correlation circle graph.
Positional arguments:
-----------------------------------
pca: sklearn.decomposition.PCA: our fitted PCA object
x_y: list or tuple: the x, y pair of dimensions to display, e.g., [0, 1] for PC1, PC2
features: list or tuple: the list of features (i.e., dimensions) to represent
"""
# Extract x and y
x, y = x_y
# Image size (in inches)
fig, ax = plt.subplots(figsize=(8, 7))
# For each original feature, draw its loading vector on the selected components
for i in range(pca.components_.shape[1]):
# Arrows
ax.arrow(0, 0,
pca.components_[x, i],
pca.components_[y, i],
head_width=0.07,
head_length=0.07,
width=0.02)
# Labels
plt.text(pca.components_[x, i] + 0.05,
pca.components_[y, i] + 0.05,
features[i])
# Display horizontal and vertical lines
plt.plot([-1, 1], [0, 0], color='grey', ls='--')
plt.plot([0, 0], [-1, 1], color='grey', ls='--')
# Axes labels, with the percentage of explained variance
plt.xlabel('PC{} ({}%)'.format(x+1, round(100*pca.explained_variance_ratio_[x], 1)))
plt.ylabel('PC{} ({}%)'.format(y+1, round(100*pca.explained_variance_ratio_[y], 1)))
plt.title("Correlation Circle (PC{} and PC{})".format(x+1, y+1))
# The circle
an = np.linspace(0, 2 * np.pi, 100)
plt.plot(np.cos(an), np.sin(an)) # Add a unit circle for scale
# Axes and display
plt.axis('equal')
# plt.show(block=False)
def show_scores(y, y_pred):
"""
y = actual values
y_pred = predicted values
------
return: confusion matrix (variable); scores are printed
Show performance indicators:
- Accuracy: the proportion of correct predictions among all predictions. It is a good indicator when the classes are balanced
- Where TP is the number of true positives, TN the number of true negatives, FP the number of false positives, and FN the number of false negatives
- Precision: the proportion of positive predictions that are truly positive. It matters when the cost of false positives is high
- Recall (or sensitivity): the proportion of actual positives that are correctly predicted as such. It matters when the cost of false negatives is high
- F1-score: the harmonic mean of precision and recall. It provides a balance between these two measures
"""
confusion_mat = confusion_matrix(y, y_pred)
accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred)
recall = recall_score(y, y_pred)
f1 = f1_score(y, y_pred)
print('Confusion Matrix:\n{}'.format(confusion_mat))
#Get the values from the confusion matrix
TN, FP, FN, TP = confusion_mat.ravel()
#Compute the rates from the confusion matrix
TPR_test = round(TP / (TP + FN) * 100, 2) #True positive rate (recall)
FPR_test = round(FP / (FP + TN) * 100, 2) #False positive rate
TNR_test = round(TN / (TN + FP) * 100, 2) #True negative rate (specificity)
FNR_test = round(FN / (FN + TP) * 100, 2) #False negative rate
#Display the rates for the test set
print("\nTrue positive rate (recall) : {}%".format(TPR_test))
print("False positive rate : {}%".format(FPR_test))
print("True negative rate (specificity) : {}%".format(TNR_test))
print("False negative rate : {}%".format(FNR_test))
print("\n--------------\n")
print('Accuracy: {}'.format(accuracy))
print('Precision: {}'.format(precision))
print('Recall: {}'.format(recall))
print('F1 Score: {}'.format(f1))
return confusion_mat
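As a reminder of the formulas behind these scores, here is a minimal hand computation on made-up counts (illustrative only, not from our dataset):
#Illustrative only: recompute the scores by hand from a confusion matrix [[TN, FP], [FN, TP]]
TN, FP, FN, TP = 90, 10, 5, 95  #made-up counts
accuracy = (TP + TN) / (TP + TN + FP + FN)
precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1 = 2 * precision * recall / (precision + recall)
print(accuracy, round(precision, 3), recall, round(f1, 3))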
df_source = pd.read_csv("Ressources/billets.csv", delimiter=";")
df_source
| | is_genuine | diagonal | height_left | height_right | margin_low | margin_up | length |
|---|---|---|---|---|---|---|---|
0 | True | 171.81 | 104.86 | 104.95 | 4.52 | 2.89 | 112.83 |
1 | True | 171.46 | 103.36 | 103.66 | 3.77 | 2.99 | 113.09 |
2 | True | 172.69 | 104.48 | 103.50 | 4.40 | 2.94 | 113.16 |
3 | True | 171.36 | 103.91 | 103.94 | 3.62 | 3.01 | 113.51 |
4 | True | 171.73 | 104.28 | 103.46 | 4.04 | 3.48 | 112.54 |
... | ... | ... | ... | ... | ... | ... | ... |
1495 | False | 171.75 | 104.38 | 104.17 | 4.42 | 3.09 | 111.28 |
1496 | False | 172.19 | 104.63 | 104.44 | 5.27 | 3.37 | 110.97 |
1497 | False | 171.80 | 104.01 | 104.12 | 5.51 | 3.36 | 111.95 |
1498 | False | 172.06 | 104.28 | 104.06 | 5.17 | 3.46 | 112.25 |
1499 | False | 171.47 | 104.15 | 103.82 | 4.63 | 3.37 | 112.07 |
1500 rows × 7 columns
df_source.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   is_genuine    1500 non-null   bool
 1   diagonal      1500 non-null   float64
 2   height_left   1500 non-null   float64
 3   height_right  1500 non-null   float64
 4   margin_low    1463 non-null   float64
 5   margin_up     1500 non-null   float64
 6   length        1500 non-null   float64
dtypes: bool(1), float64(6)
memory usage: 71.9 KB
There are 37 missing values in the margin_low feature.
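A quick sanity check of the missing values per column:
#Count missing values in each column
print(df_source.isna().sum())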
df_source.duplicated().sum()
0
There are no duplicated banknotes.
df_source.describe().round(2)
| | diagonal | height_left | height_right | margin_low | margin_up | length |
|---|---|---|---|---|---|---|
count | 1500.00 | 1500.00 | 1500.00 | 1463.00 | 1500.00 | 1500.00 |
mean | 171.96 | 104.03 | 103.92 | 4.49 | 3.15 | 112.68 |
std | 0.31 | 0.30 | 0.33 | 0.66 | 0.23 | 0.87 |
min | 171.04 | 103.14 | 102.82 | 2.98 | 2.27 | 109.49 |
25% | 171.75 | 103.82 | 103.71 | 4.01 | 2.99 | 112.03 |
50% | 171.96 | 104.04 | 103.92 | 4.31 | 3.14 | 112.96 |
75% | 172.17 | 104.23 | 104.15 | 4.87 | 3.31 | 113.34 |
max | 173.01 | 104.88 | 104.95 | 6.90 | 3.91 | 114.44 |
for col in df_source.columns[1:]:
plt.figure(figsize=(10,10))
plt.suptitle("===| {} |===".format(col), color="Green")
plt.subplot(221)
plt.title("False & True")
sns.boxplot(data=df_source, x="is_genuine", y=col);
#------------
plt.subplot(222)
plt.title("False & True")
sns.histplot(data=df_source, x=col, kde=True);
mean = df_source[col].mean()
med = df_source[col].median()
mod = mode(df_source[col], axis=None, keepdims=False)[0] #scipystats object
plt.axvline(mean, color='r', linestyle='dashed', linewidth=2, label='Mean')
plt.axvline(med, color='g', linestyle='dashed', linewidth=2, label='Median')
plt.axvline(mod, color='b', linestyle='dashed', linewidth=2, label='Mode')
legend = ["Distribution",
"Mean ({})".format(str(mean.round(2))),
"Median ({})".format(str(med.round(2))),
"Mode ({})".format(str(mod.round(2)))]
plt.legend(legend)
plt.xticks(rotation=45)
#Compare normal distribution vs actual with Shapiro-Wilk test
statistic, p_value = shapiro(df_source[col])
#Coordinates for the annotation text
text_x = df_source[col].max() - (df_source[col].std()*4)
text_y = df_source[col].value_counts().max()
#Return the result
alpha = 0.05 #Alpha level
if p_value > alpha:
result = "Gaussian distribution : YES"
else:
result = "Gaussian distribution : NO"
plt.text(x=text_x, y=text_y, s="Test value : {}\nP-value : {}\n{}".format(round(statistic,2), round(p_value,2), result), bbox=dict(facecolor='white', edgecolor='white', boxstyle='round,pad=0.5'))
#------------
df_temp = df_source.loc[df_source["is_genuine"] == 0]
plt.subplot(223)
plt.title("False")
sns.histplot(data=df_temp, x=col, kde=True);
mean = df_temp[col].mean()
med = df_temp[col].median()
mod = mode(df_temp[col], axis=None, keepdims=False)[0] #scipystats object
plt.axvline(mean, color='r', linestyle='dashed', linewidth=2, label='Mean')
plt.axvline(med, color='g', linestyle='dashed', linewidth=2, label='Median')
plt.axvline(mod, color='b', linestyle='dashed', linewidth=2, label='Mode')
legend = ["Distribution",
"Mean ({})".format(str(mean.round(2))),
"Median ({})".format(str(med.round(2))),
"Mode ({})".format(str(mod.round(2)))]
plt.legend(legend)
plt.xticks(rotation=45)
#Compare normal distribution vs actual with Shapiro-Wilk test
statistic, p_value = shapiro(df_temp[col])
#Coordinates for the annotation text
text_x = df_temp[col].max() - (df_temp[col].std()*4)
text_y = df_temp[col].value_counts().max()
#Return the result
alpha = 0.05 #Alpha level
if p_value > alpha:
result = "Gaussian distribution : YES"
else:
result = "Gaussian distribution : NO"
plt.text(x=text_x, y=text_y, s="Test value : {}\nP-value : {}\n{}".format(round(statistic,2), round(p_value,2), result), bbox=dict(facecolor='white', edgecolor='white', boxstyle='round,pad=0.5'))
med_false = med
#------------
df_temp = df_source.loc[df_source["is_genuine"] == 1]
plt.subplot(224)
plt.title("True")
sns.histplot(data=df_temp, x=col, kde=True);
mean = df_temp[col].mean()
med = df_temp[col].median()
mod = mode(df_temp[col], axis=None, keepdims=False)[0] #scipystats object
plt.axvline(mean, color='r', linestyle='dashed', linewidth=2, label='Mean')
plt.axvline(med, color='g', linestyle='dashed', linewidth=2, label='Median')
plt.axvline(mod, color='b', linestyle='dashed', linewidth=2, label='Mode')
legend = ["Distribution",
"Mean ({})".format(str(mean.round(2))),
"Median ({})".format(str(med.round(2))),
"Mode ({})".format(str(mod.round(2)))]
plt.legend(legend)
plt.xticks(rotation=45)
#Compare normal distribution vs actual with Shapiro-Wilk test
statistic, p_value = shapiro(df_temp[col])
#Coordinates for the annotation text
text_x = df_temp[col].max() - (df_temp[col].std()*4)
text_y = df_temp[col].value_counts().max()
#Return the result
alpha = 0.05 #Alpha level
if p_value > alpha:
result = "Gaussian distribution : YES"
else:
result = "Gaussian distribution : NO"
plt.text(x=text_x, y=text_y, s="Test value : {}\nP-value : {}\n{}".format(round(statistic,2), round(p_value,2), result), bbox=dict(facecolor='white', edgecolor='white', boxstyle='round,pad=0.5'))
med_true = med
#------------
plt.tight_layout(w_pad=3)
med_diff = round(med_false - med_true, 2)
med_prop = round((med_false - med_true)/med_true, 2)
print("Column: {} | Median difference (False - True): {} (relative: {})".format(col, med_diff, med_prop))
plt.show()
Column: diagonal | Median difference (False - True): -0.08 (relative: -0.0)
Column: height_left | Median difference (False - True): 0.23 (relative: 0.0)
Column: height_right | Median difference (False - True): 0.35 (relative: 0.0)
Column: margin_low | Median difference (False - True): 1.08 (relative: 0.26)
Column: margin_up | Median difference (False - True): 0.3 (relative: 0.1)
Column: length | Median difference (False - True): -1.58 (relative: -0.01)
Now let's focus on the pairwise relationships between features:
sns.pairplot(data=df_source, hue="is_genuine", height=1.8, plot_kws={'alpha': 0.2}); #plot_kws to set alpha
plt.show()
The two classes (True / False) are very easy to separate on the length feature.
Let's check the correlated feature pairs:
#Some features do not follow a Gaussian distribution, so the Spearman method is used
sns.heatmap(df_source.corr(method="spearman"), annot=True)
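As a side note, a minimal sketch comparing the two methods; Pearson measures linear association while Spearman works on ranks (monotonic association), so it is more robust when distributions are not Gaussian:
#Sketch: compare Pearson (linear) vs Spearman (rank-based) correlations
pearson = df_source.corr(method="pearson")
spearman = df_source.corr(method="spearman")
print((pearson - spearman).abs().max().max().round(3)) #largest disagreement between the two matrices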
Picking interesting features to use in the linear regression:
features = ['length',"margin_up", "height_right"]
target = "margin_low"
#Check that the correlations are statistically significant
for feature in features:
statistic, p_value = spearmanr(df_source[target], df_source[feature], nan_policy="omit") #NaN values are ignored
alpha = 0.05 #alpha level
print("Correlation test between margin_low | {}".format(feature))
if p_value > alpha:
result = "The correlation could be due to chance (p_value : {} > {} (alpha))".format(round(p_value,2), alpha)
else:
result = "There is a significant correlation\n\np_value : {} < {} (alpha)\ncorrelation stat : {}\n-------------------".format(round(p_value,2), alpha, round(statistic,2))
print(result)
Correlation test between margin_low | length
There is a significant correlation

p_value : 0.0 < 0.05 (alpha)
correlation stat : -0.59
-------------------
Correlation test between margin_low | margin_up
There is a significant correlation

p_value : 0.0 < 0.05 (alpha)
correlation stat : 0.42
-------------------
Correlation test between margin_low | height_right
There is a significant correlation

p_value : 0.0 < 0.05 (alpha)
correlation stat : 0.4
-------------------
#Visualize the correlations
num_subplot = (len(features)*10) + 101
plt.figure(figsize=(12,5))
plt.suptitle("Correlation with - margin_low -")
for feature in features:
plt.subplot(num_subplot)
sns.regplot(data=df_source, x=feature, y=target, line_kws={"color": "red"})
sns.scatterplot(data=df_source, x=feature, y=target, hue="is_genuine")
num_subplot += 1
plt.tight_layout(w_pad=5)
plt.show()
#Split the rows with a missing margin_low from the complete rows
df_full_values = df_source.loc[df_source["margin_low"].notna()].copy() #.copy() avoids SettingWithCopyWarning when adding columns later
print(df_full_values.shape)
df_to_impute = df_source.loc[df_source["margin_low"].isna()]
print(df_to_impute.shape)
(1463, 7)
(37, 7)
df_full_values["margin_up_squared"] = df_full_values["margin_up"]**2
#Define different features to test with the model
models = {"margin_low ~ length":["length"],
"margin_low ~ length + margin_up":["length", "margin_up"],
"margin_low ~ length + margin_up_squared":["length", "margin_up_squared"],
"margin_low ~ length + margin_up + height_right":["length", "margin_up", "height_right"]}
#Compare the scores of the different feature sets
for name, features in models.items():
#Feedback for information
print("==================\n{}\n------------------".format(name))
#Define features (X) and target (y)
X = df_full_values[features]
y = df_full_values[target]
#margin_up_squared has larger values -> standardize all features
norm_model = StandardScaler()
X_scaled = norm_model.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled)
X_scaled.columns = features
#Split train & test dataset
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=18, stratify=df_full_values["is_genuine"])
datasets = {"X_train":X_train, "X_test":X_test, "y_train":y_train, "y_test":y_test}
for set in datasets:
print("{} : {}".format(datasets[set].shape, set))
#Model init
model = LinearRegression()
#Fitting model on data
model.fit(X_train, y_train);
#Get the predicted values
y_pred_test = model.predict(X_test)
#Check the differences between actual and predicted values
mse_score = round(mean_squared_error(y_test, y_pred_test),3) #mean_squared_error returns the MSE, not the RMSE
mape_score = round(mean_absolute_percentage_error(y_test, y_pred_test),3)
print("------------------\nMSE: {}".format(mse_score))
print("MAPE: {}".format(mape_score))
#R2 score (computed on the full dataset)
print("R2 = {}".format(round(model.score(X_scaled, y),2)))
#Display the coefficients
coefficients = []
for i, feature in enumerate(features):
#a*x text
coef_text = "{} * {}".format(str(model.coef_[i].round(2)), feature)
coefficients.append(coef_text)
#Linear regression formula: f(x) = a1*x1 + a2*x2 + ... + an*xn + b
a = " + ".join(coefficients)
b = model.intercept_.round(2)
print("f(x) = {} + {}\n==================\n-------------".format(a, b))
==================
margin_low ~ length
------------------
(1170, 1) : X_train
(293, 1) : X_test
(1170,) : y_train
(293,) : y_test
------------------
MSE: 0.24
MAPE: 0.088
R2 = 0.44
f(x) = -0.45 * length + 4.49
==================
-------------
==================
margin_low ~ length + margin_up
------------------
(1170, 2) : X_train
(293, 2) : X_test
(1170,) : y_train
(293,) : y_test
------------------
MSE: 0.241
MAPE: 0.087
R2 = 0.45
f(x) = -0.4 * length + 0.08 * margin_up + 4.49
==================
-------------
==================
margin_low ~ length + margin_up_squared
------------------
(1170, 2) : X_train
(293, 2) : X_test
(1170,) : y_train
(293,) : y_test
------------------
MSE: 0.241
MAPE: 0.087
R2 = 0.45
f(x) = -0.4 * length + 0.09 * margin_up_squared + 4.49
==================
-------------
==================
margin_low ~ length + margin_up + height_right
------------------
(1170, 3) : X_train
(293, 3) : X_test
(1170,) : y_train
(293,) : y_test
------------------
MSE: 0.228
MAPE: 0.084
R2 = 0.47
f(x) = -0.38 * length + 0.07 * margin_up + 0.08 * height_right + 4.49
==================
-------------
margin_low ~ length + margin_up + height_right seems to be the most effective:
#Features picked from the previous test
features = ["length", "margin_up", "height_right"]
#Feedback for information
print("Feature : {}".format(features))
print("Target feature : {}".format(target))
Features : ['length', 'margin_up', 'height_right']
Target feature : margin_low
#Define features (X) and target (y)
X = df_full_values[features]
y = df_full_values[target]
#Split train & test dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=18, stratify=df_full_values["is_genuine"])
datasets = {"X_train":X_train, "X_test":X_test, "y_train":y_train, "y_test":y_test}
show_shape(datasets)
(1170, 3) : X_train
(293, 3) : X_test
(1170,) : y_train
(293,) : y_test
#Model init
model = LinearRegression()
#Fitting model on data
model.fit(X_train, y_train);
#Get the predicted values
y_pred_test = model.predict(X_test)
#Check the differences between actual and predicted values
mse_score = round(mean_squared_error(y_test, y_pred_test),3) #mean_squared_error returns the MSE, not the RMSE
mape_score = round(mean_absolute_percentage_error(y_test, y_pred_test),3)
print("MSE: {}".format(mse_score))
print("MAPE: {}".format(mape_score))
MSE: 0.228
MAPE: 0.084
The MSE is mainly useful here to compare feature sets against each other.
The MAPE is close to 0, so the model performs well.
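To make the MAPE concrete, it can be recomputed by hand from the test predictions above (a sketch, reusing y_test and y_pred_test):
#Sketch: MAPE by hand = mean(|y - y_hat| / |y|)
mape_manual = np.mean(np.abs(y_test - y_pred_test) / np.abs(y_test))
print(round(mape_manual, 3)) #should match mean_absolute_percentage_error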
#R2 score
print("R2 = {}".format(round(model.score(X, y),2)))
#Display the coefficients
coefficients = []
for i, feature in enumerate(features):
#a*x text
coef_text = "{} * {}".format(str(model.coef_[i].round(2)), feature)
coefficients.append(coef_text)
#Linear regression formula: f(x) = a1*x1 + a2*x2 + ... + an*xn + b
a = " + ".join(coefficients)
b = model.intercept_.round(2)
print("f(x) = {} + {}".format(a, b))
R2 = 0.47
f(x) = -0.43 * length + 0.32 * margin_up + 0.24 * height_right + 27.59
R2 should ideally be higher than 0.5 (close to 1) for the model to be truly representative.
With these features, this is the best score we can reach.
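As a cross-check (a sketch, not part of the original pipeline), the same regression can be fitted with statsmodels, which also reports p-values for each coefficient:
#Sketch: same fit with statsmodels OLS, for R2 plus coefficient p-values
X_sm = sm.add_constant(df_full_values[features])
ols_model = sm.OLS(df_full_values[target], X_sm).fit()
print(ols_model.summary())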
#Compute the residuals
residuals = y_test - y_pred_test
#Shapiro test
statistic, p_value = shapiro(residuals)
alpha = 0.05 #alpha level
if p_value > alpha:
print("The residuals follow a Gaussian distribution\np-value : {}".format(round(p_value,2)))
else:
print("The residuals do NOT follow a Gaussian distribution\np-value : {}".format(round(p_value,2)))
The residuals do NOT follow a Gaussian distribution
p-value : 0.04
#Visual check
plt.figure(figsize=(10,5))
plt.suptitle("Residuals values distribution")
plt.subplot(121)
sns.histplot(residuals, kde=True);
plt.subplot(122)
probplot(residuals, plot=plt)
plt.tight_layout(w_pad=5)
plt.show()
The test rejects normality, but only marginally (p-value 0.04, close to 0.05), and the plots look close enough to Gaussian.
plt.title("Homoscedasticity")
sns.scatterplot(x=y_pred_test, y=residuals)
plt.ylabel("residuals")
plt.xlabel("y_pred_test")
No specific pattern (funnel, W, V, S shape, etc.) appears -> homoscedasticity looks acceptable.
#Plot the residuals along the sample index
plt.subplots(figsize=(10, 6))
plt.scatter(x=X_test.index, y=residuals, alpha=0.5)
plt.axhline(0, color='red') #reference line at zero
plt.title('Residuals of the multiple linear regression model')
plt.show()
#Add a constant (intercept) column to the matrix of explanatory features
X_constant = sm.add_constant(X_test)
#Breusch-Pagan test
_, pval, _, _ = het_breuschpagan(residuals, X_constant)
# Display the p-value
print("P-value : {}".format(pval))
P-value : 0.0012222296083232555
According to the test, heteroskedasticity is present.
However, this test is quite strict.
The tests do not fully confirm our hypotheses, but the diagnostic plots are good enough to use this model to fill the NaN values.
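For reference, a simpler fallback would have been per-class median imputation; a minimal sketch for comparison (the df_median name is hypothetical):
#Sketch: per-class median imputation as a baseline to compare against the regression
df_median = df_source.copy()
df_median["margin_low"] = df_median.groupby("is_genuine")["margin_low"].transform(lambda s: s.fillna(s.median()))
print(df_median["margin_low"].isna().sum()) #0 NaN remaining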
#Build the design matrix, with an intercept (1) column added
_, X_vif = dmatrices('margin_low~length+margin_up+height_right', data=df_source, return_type='dataframe')
#Calculate VIF
vif = pd.DataFrame()
vif['VIF'] = [variance_inflation_factor(X_vif.values, i) for i in range(X_vif.shape[1])]
vif['variable'] = X_vif.columns
vif[1:]
| | VIF | variable |
|---|---|---|
1 | 1.509435 | length |
2 | 1.394010 | margin_up |
3 | 1.213794 | height_right |
The VIF (Variance Inflation Factor) should stay below 5 to rule out multicollinearity -> OK here.
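To make the VIF concrete: VIF_i = 1 / (1 - R_i²), where R_i² comes from regressing feature i on the other explanatory features. A sketch for length (computed on all 1500 rows, so it should be close to, not identical to, the table above):
#Sketch: VIF of length by hand, via the R2 of length ~ margin_up + height_right
X_others = df_source[["margin_up", "height_right"]]
r2 = LinearRegression().fit(X_others, df_source["length"]).score(X_others, df_source["length"])
print(round(1 / (1 - r2), 3))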
Let's visualize the predicted values for the rows with a missing margin_low:
#Predict missing values to visualize
x_missing_values = df_to_impute[features].iloc[:,0]
y_missing_values_pred = model.predict(df_to_impute[features]).tolist()
plt.title("Predicted values on the Real values graph")
sns.regplot(x=X.iloc[:,0], y=y.tolist(), color="g", scatter_kws={"alpha": 0.2}, line_kws={"color": "red", "alpha":0.5})
sns.scatterplot(x=x_missing_values, y=y_missing_values_pred, hue=df_to_impute["is_genuine"])
plt.ylabel("margin_low")
plt.xlabel("length")
plt.legend(["Real values", "Estimate model", "Confidence intervale", "Pred values - Genuine", "Pred values - Not Genuine"])
plt.show()
df_refill = df_source.copy()
df_refill.loc[df_refill["margin_low"].isna(), ["margin_low"]] = model.predict(df_refill.loc[df_refill["margin_low"].isna(),features])
df_refill.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   is_genuine    1500 non-null   bool
 1   diagonal      1500 non-null   float64
 2   height_left   1500 non-null   float64
 3   height_right  1500 non-null   float64
 4   margin_low    1500 non-null   float64
 5   margin_up     1500 non-null   float64
 6   length        1500 non-null   float64
dtypes: bool(1), float64(6)
memory usage: 71.9 KB
#Define features and target
features = ['diagonal', 'height_left', 'height_right',
'margin_low', 'margin_up', 'length']
target = 'is_genuine'
#Get information in X and y
X = df_refill[features]
y = df_refill[target]
#Scaling the data is mandatory for this model
model_normalizer = StandardScaler()
#Fit the scaler on the data
model_normalizer.fit(X)
#Scaling
X_scaled = model_normalizer.transform(X)
datasets = {"X_scaled":X_scaled}
show_shape(datasets)
(1500, 6) : X_scaled
#Model init
model_kmeans = KMeans(n_clusters=2, n_init=5, random_state=18)
#Fit the model on the data
model_kmeans.fit(X_scaled)
#Get the predicted cluster for each banknote
y_pred_kmeans = model_kmeans.predict(X_scaled)
#Get centroids
centroids = model_kmeans.cluster_centers_
#Get clusters
clusters = model_kmeans.labels_
#Get the confusion matrix and scores
#NB: KMeans cluster ids are arbitrary; here cluster 1 happens to match the genuine class, so the labels can be compared directly
confusion_matrice = show_scores(y, y_pred_kmeans)
Confusion Matrix:
[[486  14]
 [ 10 990]]

True positive rate (recall) : 99.0%
False positive rate : 2.8%
True negative rate (specificity) : 97.2%
False negative rate : 1.0%

--------------

Accuracy: 0.984
Precision: 0.9860557768924303
Recall: 0.99
F1 Score: 0.9880239520958083
Other performance indicators:
plt.figure(figsize=(8, 6))
plt.title('Confusion matrix')
sns.heatmap(confusion_matrice, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.xlabel('Predicted')
plt.ylabel('Real')
plt.show()
#init model
model_PCA = PCA()
#Fit on the data
model_PCA.fit(X_scaled)
#Project the data and the centroids onto the principal components
X_pca = model_PCA.transform(X_scaled)
centroids_pca = model_PCA.transform(centroids)
model_PCA.explained_variance_ratio_.round(2).cumsum()
array([0.43, 0.6 , 0.73, 0.85, 0.95, 1. ])
60% of the variance is explained by the first two principal components.
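A scree plot makes this easier to read (a sketch, reusing model_PCA from above):
#Sketch: scree plot of explained variance per principal component
ratios = model_PCA.explained_variance_ratio_ * 100
plt.figure(figsize=(8, 4))
plt.bar(range(1, len(ratios) + 1), ratios, label="Per component")
plt.plot(range(1, len(ratios) + 1), np.cumsum(ratios), c="red", marker="o", label="Cumulative")
plt.xlabel("Principal component")
plt.ylabel("Explained variance (%)")
plt.legend()
plt.show()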
correlation_graph(model_PCA, [0,1], features)
plt.figure(figsize=(8,8))
plt.title('Clusters and Centroids on PCA')
plt.scatter(X_pca[:, 0], X_pca[:, 1],c=y_pred_kmeans, s=50, alpha=0.5)
plt.scatter(centroids_pca[:, 0], centroids_pca[:, 1], c='red', marker='+', s=500, linewidth=2)
plt.show()
To conclude:
KMeans is a simple and reasonably effective way to classify, but this model is not perfect.
Let's try logistic regression.
#Show the column names for easy copy/paste
df_refill.columns
Index(['is_genuine', 'diagonal', 'height_left', 'height_right', 'margin_low', 'margin_up', 'length'], dtype='object')
#Build the design matrix, with an intercept (1) column added
_, X_vif = dmatrices('margin_low~diagonal+height_left+height_right+margin_up+length', data=df_refill, return_type='dataframe')
#Calculate VIF
vif = pd.DataFrame()
vif['VIF'] = [variance_inflation_factor(X_vif.values, i) for i in range(X_vif.shape[1])]
vif['variable'] = X_vif.columns
vif[1:]
| | VIF | variable |
|---|---|---|
1 | 1.012790 | diagonal |
2 | 1.145295 | height_left |
3 | 1.229263 | height_right |
4 | 1.403517 | margin_up |
5 | 1.574765 | length |
The VIF should stay below 5 to rule out multicollinearity -> OK here.
#Define different features to test with the model
models = {"is_genuine ~ length":["length"],
"is_genuine ~ length + margin_low":["length", "margin_low"],
"is_genuine ~ length + margin_low + margin_up":["length", "margin_low", "margin_up"],
"is_genuine ~ length + margin_low + height_right":["length", "margin_low", "height_right"],
"is_genuine ~ length + margin_low + margin_up + height_right":["length", "margin_low", "margin_up", "height_right"],
"is_genuine ~ ALL (length + margin_low + margin_up + height_right + height_left + diagonal)":["length", "margin_low", "margin_up", "height_right", "height_left", "diagonal"]}
Test with k-fold cross-validation (stratified on the target feature):
for name, features in models.items():
#Define the target
target = 'is_genuine'
#Define the number of folds
k = 8
#Feedback for information
print("==================\n{}\n------------------\nMean on {} folds".format(name, k))
#Get information in X and y
X = df_refill[features]
y = df_refill[target]
#Normalize data
norm_model = StandardScaler()
X_scaled = norm_model.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled)
X_scaled.columns = X.columns
#Init model
model = LogisticRegression()
#Init the stratified k-fold with shuffling
stratified_kf = StratifiedKFold(n_splits=k, shuffle=True, random_state=18)
#Define scores
scoring = {'precision': 'precision',
'accuracy': 'accuracy',
'recall': 'recall',
'f1': 'f1'}
# Use cross_validate to run the cross-validation and collect the scores
cv_results = cross_validate(model, X_scaled, y, scoring=scoring, cv=stratified_kf)
# Display the results
for metric, values in cv_results.items():
print("{}: {}".format(metric, round(np.mean(values),3)))
print("")
==================
is_genuine ~ length
------------------
Mean on 8 folds
fit_time: 0.004
score_time: 0.008
test_precision: 0.959
test_accuracy: 0.957
test_recall: 0.978
test_f1: 0.968

==================
is_genuine ~ length + margin_low
------------------
Mean on 8 folds
fit_time: 0.005
score_time: 0.007
test_precision: 0.983
test_accuracy: 0.984
test_recall: 0.993
test_f1: 0.988

==================
is_genuine ~ length + margin_low + margin_up
------------------
Mean on 8 folds
fit_time: 0.008
score_time: 0.009
test_precision: 0.988
test_accuracy: 0.989
test_recall: 0.996
test_f1: 0.992

==================
is_genuine ~ length + margin_low + height_right
------------------
Mean on 8 folds
fit_time: 0.005
score_time: 0.007
test_precision: 0.984
test_accuracy: 0.985
test_recall: 0.993
test_f1: 0.989

==================
is_genuine ~ length + margin_low + margin_up + height_right
------------------
Mean on 8 folds
fit_time: 0.005
score_time: 0.007
test_precision: 0.989
test_accuracy: 0.991
test_recall: 0.997
test_f1: 0.993

==================
is_genuine ~ ALL (length + margin_low + margin_up + height_right + height_left + diagonal)
------------------
Mean on 8 folds
fit_time: 0.006
score_time: 0.007
test_precision: 0.989
test_accuracy: 0.99
test_recall: 0.996
test_f1: 0.993
Test with train_test_split (stratified on the target feature):
for name, features in models.items():
#Define the target
target = 'is_genuine'
#Feedback for information
print("==================\n{}\n------------------".format(name))
#Get information in X and y
X = df_refill[features]
y = df_refill[target]
#Normalize data
norm_model = StandardScaler()
X_scaled = norm_model.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled)
X_scaled.columns = X.columns
#Split into train and test sets (stratified on the target to preserve class proportions)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=18, stratify=y)
datasets = {"X_train":X_train, "X_test":X_test, "y_train":y_train, "y_test":y_test}
show_shape(datasets)
print("")
#Model init
model_log = LogisticRegression()
#Fit the model on the data
model_log.fit(X_train, y_train);
#Predict the binary class
y_pred = model_log.predict(X_test)
#Predicted probabilities; column 1 = probability of the positive class (genuine)
y_proba = model_log.predict_proba(X_test)[:,1]
confusion_matrice = show_scores(y_test, y_pred)
print("")
==================
is_genuine ~ length
------------------
(1200, 1) : X_train
(300, 1) : X_test
(1200,) : y_train
(300,) : y_test

Confusion Matrix:
[[ 93   7]
 [  6 194]]

True positive rate (recall) : 97.0%
False positive rate : 7.0%
True negative rate (specificity) : 93.0%
False negative rate : 3.0%

--------------

Accuracy: 0.9566666666666667
Precision: 0.9651741293532339
Recall: 0.97
F1 Score: 0.9675810473815462

==================
is_genuine ~ length + margin_low
------------------
(1200, 2) : X_train
(300, 2) : X_test
(1200,) : y_train
(300,) : y_test

Confusion Matrix:
[[ 93   7]
 [  2 198]]

True positive rate (recall) : 99.0%
False positive rate : 7.0%
True negative rate (specificity) : 93.0%
False negative rate : 1.0%

--------------

Accuracy: 0.97
Precision: 0.9658536585365853
Recall: 0.99
F1 Score: 0.9777777777777777

==================
is_genuine ~ length + margin_low + margin_up
------------------
(1200, 3) : X_train
(300, 3) : X_test
(1200,) : y_train
(300,) : y_test

Confusion Matrix:
[[ 98   2]
 [  1 199]]

True positive rate (recall) : 99.5%
False positive rate : 2.0%
True negative rate (specificity) : 98.0%
False negative rate : 0.5%

--------------

Accuracy: 0.99
Precision: 0.9900497512437811
Recall: 0.995
F1 Score: 0.9925187032418954

==================
is_genuine ~ length + margin_low + height_right
------------------
(1200, 3) : X_train
(300, 3) : X_test
(1200,) : y_train
(300,) : y_test

Confusion Matrix:
[[ 97   3]
 [  2 198]]

True positive rate (recall) : 99.0%
False positive rate : 3.0%
True negative rate (specificity) : 97.0%
False negative rate : 1.0%

--------------

Accuracy: 0.9833333333333333
Precision: 0.9850746268656716
Recall: 0.99
F1 Score: 0.9875311720698254

==================
is_genuine ~ length + margin_low + margin_up + height_right
------------------
(1200, 4) : X_train
(300, 4) : X_test
(1200,) : y_train
(300,) : y_test

Confusion Matrix:
[[ 98   2]
 [  1 199]]

True positive rate (recall) : 99.5%
False positive rate : 2.0%
True negative rate (specificity) : 98.0%
False negative rate : 0.5%

--------------

Accuracy: 0.99
Precision: 0.9900497512437811
Recall: 0.995
F1 Score: 0.9925187032418954

==================
is_genuine ~ ALL (length + margin_low + margin_up + height_right + height_left + diagonal)
------------------
(1200, 6) : X_train
(300, 6) : X_test
(1200,) : y_train
(300,) : y_test

Confusion Matrix:
[[ 98   2]
 [  1 199]]

True positive rate (recall) : 99.5%
False positive rate : 2.0%
True negative rate (specificity) : 98.0%
False negative rate : 0.5%

--------------

Accuracy: 0.99
Precision: 0.9900497512437811
Recall: 0.995
F1 Score: 0.9925187032418954
All features* are selected given the high scores.
*Except diagonal, whose coefficient is close to 0 (see the sketch below).
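One way to verify that claim (a sketch, refitting on all six features and printing the standardized coefficients):
#Sketch: inspect standardized coefficients to confirm that diagonal contributes almost nothing
all_features = ["length", "margin_low", "margin_up", "height_right", "height_left", "diagonal"]
X_all = StandardScaler().fit_transform(df_refill[all_features])
coef_check = LogisticRegression().fit(X_all, df_refill["is_genuine"]).coef_[0]
for feat, coef in zip(all_features, coef_check):
    print("{} : {}".format(feat, round(coef, 3)))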
df_refill.columns
Index(['is_genuine', 'diagonal', 'height_left', 'height_right', 'margin_low', 'margin_up', 'length'], dtype='object')
#Define features and target
features = ['height_left', 'height_right',
'margin_low', 'margin_up', 'length']
target = 'is_genuine'
#Get information in X and y
X = df_refill[features]
y = df_refill[target]
#Normalize data
norm_model = StandardScaler()
X_scaled = norm_model.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled)
X_scaled.columns = X.columns
#Split into train and test sets (stratified on the target to preserve class proportions)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=18, stratify=y)
datasets = {"X_train":X_train, "X_test":X_test, "y_train":y_train, "y_test":y_test}
show_shape(datasets)
(1200, 5) : X_train
(300, 5) : X_test
(1200,) : y_train
(300,) : y_test
#Model init
model_log = LogisticRegression()
#Fit the model on the data
model_log.fit(X_train, y_train);
#Predict the binary class
y_pred = model_log.predict(X_test)
#Predicted probabilities; column 1 = probability of the positive class (genuine)
y_proba = model_log.predict_proba(X_test)[:,1]
coefficients = model_log.coef_
coefficients
array([[-0.45381789, -0.6780743 , -2.44481524, -1.60481819, 3.8007262 ]])
confusion_matrice = show_scores(y_test, y_pred)
Confusion Matrix:
[[ 98   2]
 [  1 199]]

True positive rate (recall) : 99.5%
False positive rate : 2.0%
True negative rate (specificity) : 98.0%
False negative rate : 0.5%

--------------

Accuracy: 0.99
Precision: 0.9900497512437811
Recall: 0.995
F1 Score: 0.9925187032418954
plt.figure(figsize=(8, 6))
plt.title('Confusion matrix')
sns.heatmap(confusion_matrice, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.xlabel('Predicted')
plt.ylabel('Real')
plt.show()
The results are better with this model.
Performance indicators across all thresholds:
ROC Curve: The Receiver Operating Characteristic (ROC) curve is a graph that illustrates the performance of a classification model for all classification thresholds. It plots the true positive rate (sensitivity) against the false positive rate (1-specificity).
AUC: The Area Under the Curve (AUC) is the area beneath the ROC curve. It provides an aggregated measure of the classification model's performance across all possible classification thresholds. It can also be interpreted as the probability that a classifier will rank a randomly chosen positive example higher than a randomly chosen negative example. Values range from 0 to 1, where a value of 0.5 corresponds to a random model, and a value of 1 corresponds to a perfect model.
ROC-AUC score: The ROC-AUC score is simply the numerical value for the AUC. It is used as a measure of the overall performance of a classification model.
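To make the probabilistic interpretation concrete, a quick Monte Carlo sketch (reusing y_test and y_proba from above; the estimate should land close to the AUC computed in the next cell):
#Sketch: AUC ≈ P(score of a random genuine note > score of a random fake note)
rng = np.random.default_rng(18)
pos = y_proba[np.asarray(y_test, dtype=bool)]
neg = y_proba[~np.asarray(y_test, dtype=bool)]
i = rng.integers(0, len(pos), 100000)
j = rng.integers(0, len(neg), 100000)
print(round(np.mean(pos[i] > neg[j]), 3))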
#Compute the false positive and true positive rates for every threshold
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)
#Show ROC and AUC curve
plt.figure()
plt.title("ROC line")
plt.plot(fpr, tpr, label="ROC AUC = %0.2f" % roc_auc)
plt.plot([0, 1], [0, 1], "k--")
plt.xlabel("Rate false trues")
plt.ylabel("Rate true trues")
plt.legend(loc="lower right")
plt.show()
The very high ROC AUC score indicates very good overall performance of the classification model.
Model selection:
In our context of counterfeit banknote detection, where it is crucial to catch all fake banknotes, logistic regression seems to be the best choice:
#Looking for the best threshold
df_thresholds = pd.DataFrame((fpr,tpr, thresholds)).round(3).T
columns = ["fp rate", "tp rate", "thresholds"]
df_thresholds.columns = columns
df_thresholds
fp rate | tp rate | thresholds | |
---|---|---|---|
0 | 0.00 | 0.000 | 2.000 |
1 | 0.00 | 0.005 | 1.000 |
2 | 0.00 | 0.990 | 0.756 |
3 | 0.01 | 0.990 | 0.644 |
4 | 0.01 | 0.995 | 0.614 |
5 | 0.11 | 0.995 | 0.077 |
6 | 0.11 | 1.000 | 0.069 |
7 | 1.00 | 1.000 | 0.000 |
The best threshold is 0.756: it keeps the false positive rate at 0 while already reaching a true positive rate of 0.99.
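To apply this threshold in practice instead of the default 0.5, a minimal sketch reusing y_proba and show_scores from above:
#Sketch: classify with the chosen threshold instead of the default 0.5
y_pred_best = y_proba >= 0.756
confusion_matrice_best = show_scores(y_test, y_pred_best)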
def fakebanknote_detection(csv_file, delimiter, model, scaler):
"""
Takes a CSV file and delimiter as input,
and returns a DataFrame with authenticity predictions for each banknote in the CSV file.
Parameters:
csv_file (str): The path to the CSV file. The CSV file should contain an 'id' column
and columns for each explanatory variable.
delimiter (str): Depends on the CSV file (";" , "," , " " , ".")
model: A trained classification model
scaler: The StandardScaler fitted on the training data
Return:
df (DataFrame): Containing the original data from the CSV file, along with an
'authenticity' column with authenticity predictions for each banknote
('True' for authentic, 'False' for non-authentic), and a
'probability_authenticity' column with the associated probability for each prediction
"""
#Read the CSV file
X = pd.read_csv(csv_file, delimiter=delimiter)
#Set 'id' as the index
X.set_index('id', inplace=True)
#Set aside the diagonal column (not used by the model)
df_archive = X["diagonal"]
X.drop("diagonal", axis=1, inplace=True)
#Normalize the data with the scaler fitted on the training data (re-fitting on production data would distort the predictions)
X_scaled = scaler.transform(X)
#Calculate the probability of the genuine class for each note
probabilities = model.predict_proba(X_scaled)
probabilities_true_values = probabilities[:, 1] * 100
#Classify with the selected threshold (0.756) instead of the default 0.5
best_threshold = 0.756 * 100
y_pred = ['True' if prob >= best_threshold else 'False' for prob in probabilities_true_values]
#Add predictions to the DataFrame
X['authenticity'] = y_pred
#Add the probability (%) of the predicted class to the DataFrame
X['probability_authenticity (%)'] = [round(prob, 2) if pred == 'True' else round(100 - prob, 2) for prob, pred in zip(probabilities_true_values, y_pred)]
return X
fakebanknote_detection(r"Ressources\billets_production.csv", ",", model_log)
| id | height_left | height_right | margin_low | margin_up | length | authenticity | probability_authenticity (%) |
|---|---|---|---|---|---|---|---|
A_1 | 104.01 | 103.54 | 5.21 | 3.30 | 111.42 | False | 95.97 |
A_2 | 104.17 | 104.13 | 6.00 | 3.31 | 112.09 | False | 98.28 |
A_3 | 104.58 | 104.29 | 4.99 | 3.39 | 111.57 | False | 99.36 |
A_4 | 104.55 | 104.34 | 4.44 | 3.03 | 113.20 | True | 99.99 |
A_5 | 103.63 | 103.56 | 3.77 | 3.16 | 113.33 | True | 100.00 |