import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import warnings
warnings.simplefilter("ignore")
def drop_Tags(cell):
    """
    Remove HTML tags from a string.
    "<" and ">" act like an on/off switch: a "<" turns the counter on, the matching ">"
    turns it off, and only the characters outside the tags are kept.
    """
    if not cell.startswith("<"):  # No leading tag -> return the text unchanged
        return cell
    new_sentence = []
    i = 0  # "<" increments, ">" decrements; characters are kept only while i == 0
    for char in cell:
        if char == "<":
            i += 1
        elif i == 0:
            new_sentence.append(char)
        elif char == ">":
            i -= 1
    return "".join(new_sentence)
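A quick, hypothetical usage example of the function above (the sample string is made up for illustration only):
#Hypothetical example: the tags are removed, the text between them is kept
sample = "<span><strong>Un vin</strong> rouge et fruité</span>"
print(drop_Tags(sample))
#-> Un vin rouge et fruité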
#Data import from Fichier_erp
df_productsinfos = pd.read_excel("DATA_sources/Fichier_erp.xlsx")
#Data import from Fichier_liaison
df_connection = pd.read_excel("DATA_sources/Fichier_liaison.xlsx")
#Data import from Fichier_web
df_webinfos = pd.read_excel("DATA_sources/Fichier_web.xlsx")
#View of Dataframe size
df_productsinfos.shape
(825, 5)
#View of dataframe description
df_productsinfos.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 825 entries, 0 to 824
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   product_id      825 non-null    int64
 1   onsale_web      825 non-null    int64
 2   price           825 non-null    float64
 3   stock_quantity  825 non-null    int64
 4   stock_status    825 non-null    object
dtypes: float64(1), int64(3), object(1)
memory usage: 32.4+ KB
#View dataframe
df_productsinfos
| | product_id | onsale_web | price | stock_quantity | stock_status |
|---|---|---|---|---|---|
0 | 3847 | 1 | 24.2 | 0 | outofstock |
1 | 3849 | 1 | 34.3 | 0 | outofstock |
2 | 3850 | 1 | 20.8 | 0 | outofstock |
3 | 4032 | 1 | 14.1 | 0 | outofstock |
4 | 4039 | 1 | 46.0 | 0 | outofstock |
... | ... | ... | ... | ... | ... |
820 | 7203 | 0 | 45.0 | 30 | instock |
821 | 7204 | 0 | 45.0 | 9 | instock |
822 | 7247 | 1 | 54.8 | 23 | instock |
823 | 7329 | 0 | 26.5 | 14 | instock |
824 | 7338 | 1 | 16.3 | 45 | instock |
825 rows × 5 columns
#Check duplicates
df_productsinfos.duplicated().sum()
0
#Check unique key
df_productsinfos["product_id"].duplicated().sum()
0
#Check how many different stock statuses there are (there are no empty values)
df_productsinfos["stock_status"].unique()
array(['outofstock', 'instock'], dtype=object)
#Check min and max on numeric series
df_productsinfos.describe().round(2)
| | product_id | onsale_web | price | stock_quantity |
|---|---|---|---|---|
count | 825.00 | 825.00 | 825.00 | 825.00 |
mean | 5162.60 | 0.87 | 32.38 | 26.58 |
std | 902.64 | 0.34 | 26.83 | 45.88 |
min | 3847.00 | 0.00 | -8.00 | -1.00 |
25% | 4348.00 | 1.00 | 14.60 | 1.00 |
50% | 4907.00 | 1.00 | 24.40 | 11.00 |
75% | 5805.00 | 1.00 | 42.00 | 34.00 |
max | 7338.00 | 1.00 | 225.00 | 578.00 |
The data describe product information: product ID, whether the product is shown on the web site (0|1), unit price and stock quantity.
Pay attention to: the minimum price (-8.00) and the minimum stock_quantity (-1) are negative, which should not be possible for a price or a stock level.
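Those suspicious rows can be isolated quickly, for example (a minimal sketch using the columns above):
#Rows with a negative price or a negative stock quantity
df_productsinfos.loc[(df_productsinfos["price"] < 0) | (df_productsinfos["stock_quantity"] < 0)]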
#View of Dataframe size
df_connection.shape
(825, 2)
#View of dataframe description
df_connection.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 825 entries, 0 to 824
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   product_id  825 non-null    int64
 1   id_web      734 non-null    object
dtypes: int64(1), object(1)
memory usage: 13.0+ KB
Some id_web values are NaN, and the column has object dtype because some IDs carry a suffix (e.g. 14680-1)
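One way to list the IDs that are not purely numeric (a sketch, not part of the original cleaning; the mask variable is a new throwaway name, and Series.str.fullmatch needs a reasonably recent pandas):
#id_web values that contain something other than digits (e.g. "14680-1")
mask_not_numeric = ~df_connection["id_web"].astype(str).str.fullmatch(r"\d+")
df_connection.loc[mask_not_numeric, "id_web"].dropna().unique()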
#Drop the rows with a NaN id_web: they are useless for the join
df_connection.dropna(inplace=True)
#View dataframe
df_connection
| | product_id | id_web |
|---|---|---|
0 | 3847 | 15298 |
1 | 3849 | 15296 |
2 | 3850 | 15300 |
3 | 4032 | 19814 |
4 | 4039 | 19815 |
... | ... | ... |
792 | 7023 | 15891 |
793 | 7025 | 15887 |
822 | 7247 | 13127-1 |
823 | 7329 | 14680-1 |
824 | 7338 | 16230 |
734 rows × 2 columns
#Check duplicates
df_connection.duplicated().sum()
0
#Check unique key product_id
df_connection["product_id"].duplicated().sum()
0
#Check unique key id_web
df_connection["id_web"].duplicated().sum()
0
A min/max check is pointless here because both columns contain IDs
df_webinfos.shape
(1513, 28)
df_webinfos.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1513 entries, 0 to 1512
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   sku                    1428 non-null   object
 1   virtual                1513 non-null   int64
 2   downloadable           1513 non-null   int64
 3   rating_count           1513 non-null   int64
 4   average_rating         1430 non-null   float64
 5   total_sales            1430 non-null   float64
 6   tax_status             716 non-null    object
 7   tax_class              0 non-null      float64
 8   post_author            1430 non-null   float64
 9   post_date              1430 non-null   datetime64[ns]
 10  post_date_gmt          1430 non-null   datetime64[ns]
 11  post_content           0 non-null      float64
 12  post_title             1430 non-null   object
 13  post_excerpt           716 non-null    object
 14  post_status            1430 non-null   object
 15  comment_status         1430 non-null   object
 16  ping_status            1430 non-null   object
 17  post_password          0 non-null      float64
 18  post_name              1430 non-null   object
 19  post_modified          1430 non-null   datetime64[ns]
 20  post_modified_gmt      1430 non-null   datetime64[ns]
 21  post_content_filtered  0 non-null      float64
 22  post_parent            1430 non-null   float64
 23  guid                   1430 non-null   object
 24  menu_order             1430 non-null   float64
 25  post_type              1430 non-null   object
 26  post_mime_type         714 non-null    object
 27  comment_count          1430 non-null   float64
dtypes: datetime64[ns](4), float64(10), int64(3), object(11)
memory usage: 331.1+ KB
#Columns 7, 11, 17 and 21 contain no information -> drop them
df_webinfos.drop(columns={"tax_class", "post_content", "post_password", "post_content_filtered"}, inplace=True)
#sku is the table's own ID; rows without it are useless -> drop them
df_webinfos.dropna(subset="sku", inplace=True)
#Check duplicates
df_webinfos.duplicated().sum()
0
#Check duplicates on sku (ID)
df_webinfos["sku"].duplicated().sum()
714
Out of 1428 rows, 714 are duplicated skus, so each sku appears on two rows.
The two rows sharing a sku differ mainly on 3 columns: tax_status, post_excerpt and post_mime_type (one row of each pair has post_type "product", the other does not).
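One way to inspect such a pair side by side (a sketch; first_dup is a new throwaway name):
#Show the two rows of the first duplicated sku
first_dup = df_webinfos.loc[df_webinfos["sku"].duplicated(), "sku"].iloc[0]
df_webinfos.loc[df_webinfos["sku"] == first_dup]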
#Put the duplicated rows (those whose post_type is not "product") into a separate dataframe
df_missings_values = df_webinfos.loc[df_webinfos["post_type"] != "product"]
#With the copy of duplicated rows saved, we can drop them from the original dataframe
df_webinfos.drop(df_webinfos.loc[df_webinfos["post_type"] != "product"].index, inplace=True)
df_webinfos.info()
<class 'pandas.core.frame.DataFrame'>
Index: 714 entries, 2 to 1510
Data columns (total 24 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   sku                714 non-null    object
 1   virtual            714 non-null    int64
 2   downloadable       714 non-null    int64
 3   rating_count       714 non-null    int64
 4   average_rating     714 non-null    float64
 5   total_sales        714 non-null    float64
 6   tax_status         714 non-null    object
 7   post_author        714 non-null    float64
 8   post_date          714 non-null    datetime64[ns]
 9   post_date_gmt      714 non-null    datetime64[ns]
 10  post_title         714 non-null    object
 11  post_excerpt       714 non-null    object
 12  post_status        714 non-null    object
 13  comment_status     714 non-null    object
 14  ping_status        714 non-null    object
 15  post_name          714 non-null    object
 16  post_modified      714 non-null    datetime64[ns]
 17  post_modified_gmt  714 non-null    datetime64[ns]
 18  post_parent        714 non-null    float64
 19  guid               714 non-null    object
 20  menu_order         714 non-null    float64
 21  post_type          714 non-null    object
 22  post_mime_type     0 non-null      object
 23  comment_count      714 non-null    float64
dtypes: datetime64[ns](4), float64(6), int64(3), object(11)
memory usage: 139.5+ KB
#Recover the information missing from the first dataframe using the archived duplicates
#First attempt, kept disabled as a string below: it replaces NaN with "-" and then only rebinds local variables, so it never writes anything back into df_webinfos
"""
df_webinfos.tax_status.fillna("-", inplace=True)
df_webinfos.post_excerpt.fillna("-", inplace=True)
df_webinfos.post_mime_type.fillna("-", inplace=True)
for sku in df_webinfos["sku"]:
tax_status = df_webinfos.loc[df_webinfos["sku"] == sku, "tax_status"].item()
tax_status_noNan = df_missings_values.loc[df_missings_values["sku"] == sku, "tax_status"].item()
if tax_status == "-":
tax_status = tax_status_noNan
post_excerpt = df_webinfos.loc[df_webinfos["sku"] == sku, "post_excerpt"].item()
post_excerpt_noNan = df_missings_values.loc[df_missings_values["sku"] == sku, "post_excerpt"].item()
if post_excerpt == "-":
post_excerpt = post_excerpt_noNan
post_mime_type = df_webinfos.loc[df_webinfos["sku"] == sku, "post_mime_type"].item()
post_mime_type_noNan = df_missings_values.loc[df_missings_values["sku"] == sku, "post_mime_type"].item()
if post_mime_type == "-":
post_mime_type = post_mime_type_noNan"""
df_missings_values.info()
<class 'pandas.core.frame.DataFrame'>
Index: 714 entries, 0 to 1512
Data columns (total 24 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   sku                714 non-null    object
 1   virtual            714 non-null    int64
 2   downloadable       714 non-null    int64
 3   rating_count       714 non-null    int64
 4   average_rating     714 non-null    float64
 5   total_sales        714 non-null    float64
 6   tax_status         0 non-null      object
 7   post_author        714 non-null    float64
 8   post_date          714 non-null    datetime64[ns]
 9   post_date_gmt      714 non-null    datetime64[ns]
 10  post_title         714 non-null    object
 11  post_excerpt       0 non-null      object
 12  post_status        714 non-null    object
 13  comment_status     714 non-null    object
 14  ping_status        714 non-null    object
 15  post_name          714 non-null    object
 16  post_modified      714 non-null    datetime64[ns]
 17  post_modified_gmt  714 non-null    datetime64[ns]
 18  post_parent        714 non-null    float64
 19  guid               714 non-null    object
 20  menu_order         714 non-null    float64
 21  post_type          714 non-null    object
 22  post_mime_type     714 non-null    object
 23  comment_count      714 non-null    float64
dtypes: datetime64[ns](4), float64(6), int64(3), object(11)
memory usage: 139.5+ KB
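The .info() above shows that the archived rows hold exactly the columns that are empty in the product rows. A working version of the disabled back-fill could align both dataframes on sku and use fillna, for example (a minimal sketch, assuming each sku appears exactly once in each dataframe; df_webinfos_filled is a new name and this is not what was actually run):
#Fill the missing cells of the product rows with the values of the archived rows, matched on sku
df_webinfos_filled = (df_webinfos.set_index("sku")
                      .fillna(df_missings_values.set_index("sku"))
                      .reset_index())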
df_webinfos.describe()
| | virtual | downloadable | rating_count | average_rating | total_sales | post_author | post_date | post_date_gmt | post_modified | post_modified_gmt | post_parent | menu_order | comment_count |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 714.0 | 714.0 | 714.0 | 714.0 | 714.000000 | 714.000000 | 714 | 714 | 714 | 714 | 714.0 | 714.0 | 714.0 |
mean | 0.0 | 0.0 | 0.0 | 0.0 | 4.012605 | 1.998599 | 2018-08-22 03:57:52.950980352 | 2018-08-22 02:29:08.581232640 | 2020-06-20 21:45:25.694678016 | 2020-06-20 19:51:53.929971968 | 0.0 | 0.0 | 0.0 |
min | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 1.000000 | 2018-02-08 12:58:52 | 2018-02-08 11:58:52 | 2018-02-20 15:19:23 | 2018-02-20 14:19:23 | 0.0 | 0.0 | 0.0 |
25% | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 2.000000 | 2018-02-27 20:01:12.500000 | 2018-02-27 19:01:12.500000 | 2020-06-18 18:32:35 | 2020-06-18 16:32:35 | 0.0 | 0.0 | 0.0 |
50% | 0.0 | 0.0 | 0.0 | 0.0 | 1.000000 | 2.000000 | 2018-04-19 14:52:10 | 2018-04-19 12:52:10 | 2020-08-04 09:30:06.500000 | 2020-08-04 07:30:06.500000 | 0.0 | 0.0 | 0.0 |
75% | 0.0 | 0.0 | 0.0 | 0.0 | 4.000000 | 2.000000 | 2019-01-31 14:35:47 | 2019-01-31 13:35:47 | 2020-08-25 10:32:32 | 2020-08-25 08:32:32 | 0.0 | 0.0 | 0.0 |
max | 0.0 | 0.0 | 0.0 | 0.0 | 96.000000 | 2.000000 | 2020-07-20 11:00:00 | 2020-07-20 09:00:00 | 2020-08-27 18:55:03 | 2020-08-27 16:55:03 | 0.0 | 0.0 | 0.0 |
std | 0.0 | 0.0 | 0.0 | 0.0 | 8.518183 | 0.037424 | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 |
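Instead of reading them off describe(), the all-zero columns can also be found programmatically (a sketch; numeric is a new throwaway name):
#Numeric columns whose values are all equal to zero
numeric = df_webinfos.select_dtypes("number")
print(numeric.columns[(numeric == 0).all()].tolist())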
#Some columns contain only zeros -> drop them
df_webinfos.drop(columns={"virtual", "downloadable", "rating_count", "average_rating", "post_parent", "menu_order", "comment_count"}, inplace=True)
#View the possible values of every column whose name contains "status" or "type"
for column in df_webinfos.columns:
    if (column.find("status") != -1) or (column.find("type") != -1):
        print("\nStatus for", column, ":")
        for status in df_webinfos[column].unique():
            print(" -", status)
Status for tax_status :
 - taxable

Status for post_status :
 - publish

Status for comment_status :
 - closed

Status for ping_status :
 - closed

Status for post_type :
 - product

Status for post_mime_type :
 - nan
#These columns carry no relevant information -> keep them in an archive dataframe, then drop them
df_cols_archives = df_webinfos[["sku","post_status", "comment_status", "ping_status","tax_status", "post_type", "post_mime_type"]]
#Drop columns
df_webinfos.drop(columns={"post_status", "comment_status", "ping_status","tax_status", "post_type", "post_mime_type"}, inplace=True)
#Drop ALL tags from every cell of post_excerpt (e.g. <span> <div></div> </span>)
for row in df_webinfos.index:
    cell = str(df_webinfos.loc[row, "post_excerpt"])
    df_webinfos.loc[row, "post_excerpt"] = drop_Tags(cell)
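The same cleaning can also be written without an explicit loop, using apply (a sketch equivalent to the loop above):
#Vectorised alternative: apply drop_Tags to the whole column at once
df_webinfos["post_excerpt"] = df_webinfos["post_excerpt"].astype(str).apply(drop_Tags)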
#View of dataframe
df_webinfos
| | sku | total_sales | post_author | post_date | post_date_gmt | post_title | post_excerpt | post_name | post_modified | post_modified_gmt | guid |
|---|---|---|---|---|---|---|---|---|---|---|---|
2 | 15075 | 3.0 | 2.0 | 2018-02-14 15:39:43 | 2018-02-14 14:39:43 | Parés Baltà Penedès Indigena 2017 | Des couleurs et aromes intenses où le fruit et... | pares-balta-penedes-indigena-2017 | 2020-08-20 15:35:02 | 2020-08-20 13:35:02 | https://www.bottle-neck.fr/?post_type=product&... |
3 | 16209 | 6.0 | 2.0 | 2018-02-14 17:15:31 | 2018-02-14 16:15:31 | Maurel Cabardès Tradition 2017 | Un joli nez aux arômes de fruits rouges, de ca... | maurel-cabardes-tradition-2017 | 2020-08-05 18:05:03 | 2020-08-05 16:05:03 | https://www.bottle-neck.fr/?post_type=product&... |
5 | 13895 | 0.0 | 2.0 | 2019-03-19 10:41:50 | 2019-03-19 09:41:50 | Château Saransot-Dupré Bordeaux Blanc 2016 | Rareté en Médoc (mais d’A.O.C. Bordeaux) le do... | chateau-saransot-dupre-bordeaux-blanc-2016 | 2020-04-25 21:11:40 | 2020-04-25 19:11:40 | https://www.bottle-neck.fr/?post_type=product&... |
6 | 12857 | 0.0 | 2.0 | 2018-04-12 17:56:13 | 2018-04-12 15:56:13 | Château de Meursault Puligny-Montrachet 1er Cr... | Il présente une grande fraîcheur minérale au n... | chateau-de-puligny-montrachet-1cru-champ-canet... | 2020-02-06 16:35:02 | 2020-02-06 15:35:02 | https://www.bottle-neck.fr/?post_type=product&... |
9 | 14106 | 0.0 | 2.0 | 2019-06-08 09:54:21 | 2019-06-08 07:54:21 | Stéphane Tissot Château-Chalon 2011 | Ce vin peut-être dégusté sur sa jeunesse mais ... | stephane-tissot-chateau-chalon-2011 | 2020-05-29 17:35:03 | 2020-05-29 15:35:03 | https://www.bottle-neck.fr/?post_type=product&... |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1501 | 15941 | 4.0 | 2.0 | 2018-02-12 14:26:52 | 2018-02-12 13:26:52 | Domaine de la Jérôme Côtes du Rhône Village 201 | Un vin opulent, à la texture onctueuse, et tou... | jerome-cotes-du-rhone-2018 | 2020-06-25 14:00:03 | 2020-06-25 12:00:03 | https://www.bottle-neck.fr/?post_type=product&... |
1503 | 10459 | 0.0 | 2.0 | 2018-04-13 15:58:19 | 2018-04-13 13:58:19 | Alphonse Mellot Sancerre Rouge Génération XIX ... | Robe d'un superbe rubis, dense et profond - As... | alphonse-mellot-sancerre-rouge-generation-xix-... | 2020-08-24 14:05:02 | 2020-08-24 12:05:02 | https://www.bottle-neck.fr/?post_type=product&... |
1507 | 16320 | 3.0 | 2.0 | 2018-02-12 12:49:29 | 2018-02-12 11:49:29 | Mourgues du Grès IGP Pont du Gard Terre d'Arge... | Magnifique exemple d’assemblage réussi, ou cha... | mourgues-du-gres-costieres-de-nimes-terre-darg... | 2020-08-25 11:15:03 | 2020-08-25 09:15:03 | https://www.bottle-neck.fr/?post_type=product&... |
1509 | 15663 | 3.0 | 2.0 | 2018-02-27 10:27:01 | 2018-02-27 09:27:01 | Chermette Domaine du Vissoux Brouilly Pierreux... | Le Brouilly est le plus méridional des crus du... | chermette-domaine-du-vissoux-brouilly-pierreux... | 2020-08-01 09:35:02 | 2020-08-01 07:35:02 | https://www.bottle-neck.fr/?post_type=product&... |
1510 | 15910 | 0.0 | 2.0 | 2019-03-28 10:59:43 | 2019-03-28 09:59:43 | Thevenet Quintaine Viré-Clessé La Bongran 2015 | C’est dans cette cuvée la pleine expression du... | thevenet-quintaine-vire-clesse-la-bongran-2015 | 2020-08-14 10:45:02 | 2020-08-14 08:45:02 | https://www.bottle-neck.fr/?post_type=product&... |
714 rows × 11 columns
#Check that all the primary keys needed for the join are present
pk_products_infos = df_productsinfos["product_id"].tolist()
pk_connection_products = df_connection["product_id"].tolist()
pk_products_notin_connection = []
for key in pk_products_infos:
    if key not in pk_connection_products:
        pk_products_notin_connection.append(key)
print("Missing:", len(pk_products_notin_connection), "keys in the connection source")
Missing: 91 keys in the connection source
#Check that all the primary keys needed for the join are present
pk_web_infos = df_webinfos["sku"].tolist()
pk_connection_web = df_connection["id_web"].tolist()
pk_web_notin_connection = []
for key in pk_web_infos:
    if key not in pk_connection_web:
        pk_web_notin_connection.append(key)
print("Missing:", len(pk_web_notin_connection), "keys in the connection source")
Missing: 0 keys in the connection source
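Both membership checks can also be done with isin, which avoids the nested Python loops (a sketch; the variable names are new, for illustration only):
#Vectorised membership tests
missing_products = df_productsinfos.loc[~df_productsinfos["product_id"].isin(df_connection["product_id"])]
missing_skus = df_webinfos.loc[~df_webinfos["sku"].isin(df_connection["id_web"])]
print(len(missing_products), "product_id missing from the connection file")
print(len(missing_skus), "sku missing from the connection file")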
#Merge Connection ON Products infos
df_temp = pd.merge(df_productsinfos, df_connection, on="product_id" )
#Merge products info ON df_temp to create the full dataframe
df_complete = pd.merge(df_temp, df_webinfos, left_on="id_web", right_on="sku")
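pd.merge defaults to an inner join, so unmatched keys are dropped silently. The same joins can be written with the join type and key uniqueness made explicit (a sketch equivalent to the two merges above):
#Explicit inner joins with a 1-to-1 key check
df_temp = pd.merge(df_productsinfos, df_connection, on="product_id", how="inner", validate="one_to_one")
df_complete = pd.merge(df_temp, df_webinfos, left_on="id_web", right_on="sku", how="inner", validate="one_to_one")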
#Check if a key is missing
df_complete["product_id"].isna().sum()
0
#Check if a key is missing
df_complete["id_web"].isna().sum()
0
df_complete.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 714 entries, 0 to 713
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   product_id         714 non-null    int64
 1   onsale_web         714 non-null    int64
 2   price              714 non-null    float64
 3   stock_quantity     714 non-null    int64
 4   stock_status       714 non-null    object
 5   id_web             714 non-null    object
 6   sku                714 non-null    object
 7   total_sales        714 non-null    float64
 8   post_author        714 non-null    float64
 9   post_date          714 non-null    datetime64[ns]
 10  post_date_gmt      714 non-null    datetime64[ns]
 11  post_title         714 non-null    object
 12  post_excerpt       714 non-null    object
 13  post_name          714 non-null    object
 14  post_modified      714 non-null    datetime64[ns]
 15  post_modified_gmt  714 non-null    datetime64[ns]
 16  guid               714 non-null    object
dtypes: datetime64[ns](4), float64(3), int64(3), object(7)
memory usage: 95.0+ KB
#Create a column with the total sales value
df_complete["total_value_sales"] = df_complete["total_sales"] * df_complete["price"]
#Total sales value of the products sold online (onsale_web == 1)
df_temp = df_complete.loc[df_complete["onsale_web"] == 1]
Total_sales_online = df_temp["total_value_sales"].sum().astype("int")
print(f"The total online sales are {Total_sales_online}.")
The total online sales are 70568.
#Top 5 best sellers
df_temp.sort_values("total_value_sales", ascending=False).head(5)
| | product_id | onsale_web | price | stock_quantity | stock_status | id_web | sku | total_sales | post_author | post_date | post_date_gmt | post_title | post_excerpt | post_name | post_modified | post_modified_gmt | guid | total_value_sales |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
194 | 4334 | 1 | 49.0 | 0 | outofstock | 7818 | 7818 | 96.0 | 2.0 | 2018-03-01 14:02:44 | 2018-03-01 13:02:44 | Champagne Gosset Grand Blanc de Blancs | La bulle fine et presente se dirige vers la su... | champagne-gosset-grand-blanc-de-blanc | 2020-08-12 14:00:03 | 2020-08-12 12:00:03 | https://www.bottle-neck.fr/?post_type=product&... | 4704.0 |
71 | 4144 | 1 | 49.0 | 11 | instock | 1662 | 1662 | 87.0 | 2.0 | 2018-02-13 13:17:25 | 2018-02-13 12:17:25 | Champagne Gosset Grand Rosé | Le nez exprime sans détour la fraise, dans sa ... | champagne-gosset-grand-rose | 2020-08-14 16:45:03 | 2020-08-14 14:45:03 | https://www.bottle-neck.fr/?post_type=product&... | 4263.0 |
218 | 4402 | 1 | 176.0 | 8 | instock | 3510 | 3510 | 13.0 | 2.0 | 2018-03-22 11:21:05 | 2018-03-22 10:21:05 | Cognac Frapin VIP XO | La cuvée VIP XO à été enrichie d’eaux-de-vie t... | cognac-frapin-vip-xo | 2020-08-22 11:35:03 | 2020-08-22 09:35:03 | https://www.bottle-neck.fr/?post_type=product&... | 2288.0 |
70 | 4142 | 1 | 53.0 | 8 | instock | 11641 | 11641 | 30.0 | 2.0 | 2018-02-13 13:08:44 | 2018-02-13 12:08:44 | Champagne Gosset Grand Millésime 2006 | L'attaque est ample, gourmande avec une belle ... | champagne-gosset-grand-millesime-2006 | 2020-08-20 09:30:03 | 2020-08-20 07:30:03 | https://www.bottle-neck.fr/?post_type=product&... | 1590.0 |
69 | 4141 | 1 | 39.0 | 1 | instock | 304 | 304 | 40.0 | 2.0 | 2018-02-13 12:57:44 | 2018-02-13 11:57:44 | Champagne Gosset Grande Réserve | Le nez, ouvert et expressif, évoque les fruits... | gosset-champagne-grande-reserve | 2020-08-27 11:25:02 | 2020-08-27 09:25:02 | https://www.bottle-neck.fr/?post_type=product&... | 1560.0 |
df_complete["price"].describe()
count    714.000000
mean      32.493137
std       27.810525
min        5.200000
25%       14.100000
50%       23.550000
75%       42.175000
max      225.000000
Name: price, dtype: float64
mean = df_complete["price"].mean()
med = np.percentile(df_complete["price"], 50)
df_complete["price"].skew()
2.58090126300337
Skewness > 0 means the distribution is right-skewed (a long tail toward high prices)
ymax = 110
plt.title("Distribution de la variable PRICE")
sns.histplot(data=df_complete, x='price', kde=True, label="Quantité")
plt.vlines(x=mean,ymin=0, ymax = ymax , color="g", label="Moyenne", linestyles=":")
plt.vlines(x=med, ymin=0, ymax = ymax, color="r", label="Médiane", linestyles=":")
plt.legend()
[Figure: histogram of price with KDE; dotted green line = mean, dotted red line = median]
I will use the quartiles (IQR method) because of the asymmetrical distribution: the mean is pulled away from the center (mean != median != mode). If the curve were bell-shaped (a Gaussian distribution), I would use the z-score instead, since in a symmetrical distribution the mean sits exactly in the middle (mean == median == mode).
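A quick check of the three central-tendency measures backs this up (a minimal sketch):
#Mean, median and mode of the price
print("mean  :", round(df_complete["price"].mean(), 2))
print("median:", df_complete["price"].median())
print("mode  :", df_complete["price"].mode()[0])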
sns.boxplot(data=df_complete.price)
[Figure: boxplot of price]
#Look at the 5 largest prices (candidate outliers)
df_complete.sort_values("price", ascending=False).head(5)
| | product_id | onsale_web | price | stock_quantity | stock_status | id_web | sku | total_sales | post_author | post_date | post_date_gmt | post_title | post_excerpt | post_name | post_modified | post_modified_gmt | guid | total_value_sales |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
199 | 4352 | 1 | 225.0 | 0 | outofstock | 15940 | 15940 | 5.0 | 2.0 | 2018-03-02 10:30:04 | 2018-03-02 09:30:04 | Champagne Egly-Ouriet Grand Cru Millésimé 2008 | Issu d’un assemblage de 70% de Pinot Noir du g... | champagne-egly-ouriet-grand-cru-millesime-2008 | 2020-03-07 11:18:45 | 2020-03-07 10:18:45 | https://www.bottle-neck.fr/?post_type=product&... | 1125.0 |
426 | 5001 | 1 | 217.5 | 20 | instock | 14581 | 14581 | 0.0 | 2.0 | 2018-07-17 09:45:39 | 2018-07-17 07:45:39 | David Duband Charmes-Chambertin Grand Cru 2014 | Robe cerise chatoyante. Un nez de fruits rouge... | david-duband-charmes-chambertin-grand-cru-2014 | 2020-05-16 09:00:05 | 2020-05-16 07:00:05 | https://www.bottle-neck.fr/?post_type=product&... | 0.0 |
587 | 5892 | 1 | 191.3 | 10 | instock | 14983 | 14983 | 3.0 | 2.0 | 2019-03-28 10:21:36 | 2019-03-28 09:21:36 | Coteaux Champenois Egly-Ouriet Ambonnay Rouge ... | Cet Ambonnay évoque les grands Pinots Noirs de... | coteaux-champenois-egly-ouriet-ambonnay-rouge-... | 2020-04-01 09:30:09 | 2020-04-01 07:30:09 | https://www.bottle-neck.fr/?post_type=product&... | 573.9 |
218 | 4402 | 1 | 176.0 | 8 | instock | 3510 | 3510 | 13.0 | 2.0 | 2018-03-22 11:21:05 | 2018-03-22 10:21:05 | Cognac Frapin VIP XO | La cuvée VIP XO à été enrichie d’eaux-de-vie t... | cognac-frapin-vip-xo | 2020-08-22 11:35:03 | 2020-08-22 09:35:03 | https://www.bottle-neck.fr/?post_type=product&... | 2288.0 |
553 | 5767 | 1 | 175.0 | 12 | instock | 15185 | 15185 | 0.0 | 2.0 | 2019-03-13 14:43:22 | 2019-03-13 13:43:22 | Camille Giroud Clos de Vougeot 2016 | Ce vin provient de vignes âgées de 50 ans situ... | camille-giroud-clos-de-vougeot-2016 | 2020-06-11 15:25:04 | 2020-06-11 13:25:04 | https://www.bottle-neck.fr/?post_type=product&... | 0.0 |
# IQR Inter Quartile Range
Q1 = np.percentile(df_complete['price'], 25)
Q3 = np.percentile(df_complete['price'], 75)
IQR = Q3 - Q1
print("Old Shape: ", df_complete.shape)
#Higher than the upper whisker
higher = np.where(df_complete['price'] >= (Q3 + 1.5 * IQR))
#Lower than the lower whisker
lower = np.where(df_complete['price'] <= (Q1 - 1.5 * IQR))
#Archive the outliers
df_outliers = df_complete.loc[(df_complete.index.isin(higher[0])) | (df_complete.index.isin(lower[0]))]
#Drop the outliers
df_complete.drop(higher[0], inplace=True)
df_complete.drop(lower[0], inplace=True)
print("New Shape: ", df_complete.shape)
Old Shape:  (714, 18)
New Shape:  (682, 18)
outliers = df_outliers["sku"].count()
outliers
32
#Show outliers values
print("These are outliers: ", sorted(df_outliers["price"].tolist()))
These are outliers: [85.6, 86.8, 88.4, 92.0, 93.0, 99.0, 100.0, 102.3, 104.0, 105.0, 105.0, 105.6, 108.5, 109.6, 112.0, 114.0, 115.0, 115.0, 116.4, 121.0, 121.0, 122.0, 124.8, 126.5, 135.0, 137.0, 157.0, 175.0, 176.0, 191.3, 217.5, 225.0]
zvalue = np.abs(stats.zscore(df_complete['price']))
threshold = 2
#Positions of the outliers
outliers = np.where(zvalue > threshold)[0]
print(np.where(zvalue > threshold))
print(len(outliers))
(array([ 10, 18, 35, 66, 73, 125, 180, 194, 196, 197, 200, 213, 217, 237, 248, 347, 410, 411, 412, 414, 427, 430, 474, 488, 498, 499, 524, 586, 595, 596, 599, 614, 635, 640, 644, 679], dtype=int64),)
36