import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import warnings
warnings.simplefilter("ignore")
def drop_Tags(cell):
    """
    Remove HTML tags from a string.
    "<" and ">" act like an on/off switch: a "<" turns the counter on, the matching ">"
    turns it off, and only the characters outside the tags are kept.
    """
    if not cell.startswith("<"):  # No leading tag -> return the text unchanged
        return cell
    new_sentence = []
    i = 0  # "<" increments, ">" decrements; characters are kept only while i == 0
    for char in cell:
        if char == "<":
            i += 1
        elif i == 0:
            new_sentence.append(char)
        elif char == ">":
            i -= 1
    return "".join(new_sentence)
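A quick, hypothetical usage example of the function above (the sample string is made up for illustration only):
#Hypothetical example: the tags are removed, the text between them is kept
sample = "<span><strong>Un vin</strong> rouge et fruité</span>"
print(drop_Tags(sample))
#-> Un vin rouge et fruité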
#Data import from Fichier_erp
df_productsinfos = pd.read_excel("DATA_sources/Fichier_erp.xlsx")
#Data import from Fichier_liaison
df_connection = pd.read_excel("DATA_sources/Fichier_liaison.xlsx")
#Data import from Fichier_web
df_webinfos = pd.read_excel("DATA_sources/Fichier_web.xlsx")
#View of Dataframe size
df_productsinfos.shape
(825, 5)
#View of dataframe description
df_productsinfos.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 825 entries, 0 to 824
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   product_id      825 non-null    int64
 1   onsale_web      825 non-null    int64
 2   price           825 non-null    float64
 3   stock_quantity  825 non-null    int64
 4   stock_status    825 non-null    object
dtypes: float64(1), int64(3), object(1)
memory usage: 32.4+ KB
#View dataframe
df_productsinfos
| | product_id | onsale_web | price | stock_quantity | stock_status |
|---|---|---|---|---|---|
0 | 3847 | 1 | 24.2 | 0 | outofstock |
1 | 3849 | 1 | 34.3 | 0 | outofstock |
2 | 3850 | 1 | 20.8 | 0 | outofstock |
3 | 4032 | 1 | 14.1 | 0 | outofstock |
4 | 4039 | 1 | 46.0 | 0 | outofstock |
... | ... | ... | ... | ... | ... |
820 | 7203 | 0 | 45.0 | 30 | instock |
821 | 7204 | 0 | 45.0 | 9 | instock |
822 | 7247 | 1 | 54.8 | 23 | instock |
823 | 7329 | 0 | 26.5 | 14 | instock |
824 | 7338 | 1 | 16.3 | 45 | instock |
825 rows × 5 columns
#Check duplicates
df_productsinfos.duplicated().sum()
0
#Check unique key
df_productsinfos["product_id"].duplicated().sum()
0
#Check how many different stock statuses there are (there are no empty values)
df_productsinfos["stock_status"].unique()
array(['outofstock', 'instock'], dtype=object)
#Check min and max on numeric series
df_productsinfos.describe().round(2)
| | product_id | onsale_web | price | stock_quantity |
|---|---|---|---|---|
count | 825.00 | 825.00 | 825.00 | 825.00 |
mean | 5162.60 | 0.87 | 32.38 | 26.58 |
std | 902.64 | 0.34 | 26.83 | 45.88 |
min | 3847.00 | 0.00 | -8.00 | -1.00 |
25% | 4348.00 | 1.00 | 14.60 | 1.00 |
50% | 4907.00 | 1.00 | 24.40 | 11.00 |
75% | 5805.00 | 1.00 | 42.00 | 34.00 |
max | 7338.00 | 1.00 | 225.00 | 578.00 |
The data describe product information: product ID, whether the product is shown on the web site (0|1), unit price and stock quantity.
Pay attention to: the minimum price (-8.00) and the minimum stock_quantity (-1) are negative, which should not be possible for a price or a stock level.
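Those suspicious rows can be isolated quickly, for example (a minimal sketch using the columns above):
#Rows with a negative price or a negative stock quantity
df_productsinfos.loc[(df_productsinfos["price"] < 0) | (df_productsinfos["stock_quantity"] < 0)]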
#View of Dataframe size
df_connection.shape
(825, 2)
#View of dataframe description
df_connection.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 825 entries, 0 to 824
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   product_id  825 non-null    int64
 1   id_web      734 non-null    object
dtypes: int64(1), object(1)
memory usage: 13.0+ KB
Some id_web values are NaN, and the column has object dtype because some IDs carry a suffix (e.g. 14680-1)
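One way to list the IDs that are not purely numeric (a sketch, not part of the original cleaning; the mask variable is a new throwaway name, and Series.str.fullmatch needs a reasonably recent pandas):
#id_web values that contain something other than digits (e.g. "14680-1")
mask_not_numeric = ~df_connection["id_web"].astype(str).str.fullmatch(r"\d+")
df_connection.loc[mask_not_numeric, "id_web"].dropna().unique()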
#Drop the rows with a NaN id_web: they are useless for the join
df_connection.dropna(inplace=True)
#View dataframe
df_connection
| | product_id | id_web |
|---|---|---|
0 | 3847 | 15298 |
1 | 3849 | 15296 |
2 | 3850 | 15300 |
3 | 4032 | 19814 |
4 | 4039 | 19815 |
... | ... | ... |
792 | 7023 | 15891 |
793 | 7025 | 15887 |
822 | 7247 | 13127-1 |
823 | 7329 | 14680-1 |
824 | 7338 | 16230 |
734 rows × 2 columns
#Check duplicates
df_connection.duplicated().sum()
0
#Check unique key product_id
df_connection["product_id"].duplicated().sum()
0
#Check unique key id_web
df_connection["id_web"].duplicated().sum()
0
A min/max check is pointless here because both columns contain IDs
df_webinfos.shape
(1513, 28)
df_webinfos.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1513 entries, 0 to 1512
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   sku                    1428 non-null   object
 1   virtual                1513 non-null   int64
 2   downloadable           1513 non-null   int64
 3   rating_count           1513 non-null   int64
 4   average_rating         1430 non-null   float64
 5   total_sales            1430 non-null   float64
 6   tax_status             716 non-null    object
 7   tax_class              0 non-null      float64
 8   post_author            1430 non-null   float64
 9   post_date              1430 non-null   datetime64[ns]
 10  post_date_gmt          1430 non-null   datetime64[ns]
 11  post_content           0 non-null      float64
 12  post_title             1430 non-null   object
 13  post_excerpt           716 non-null    object
 14  post_status            1430 non-null   object
 15  comment_status         1430 non-null   object
 16  ping_status            1430 non-null   object
 17  post_password          0 non-null      float64
 18  post_name              1430 non-null   object
 19  post_modified          1430 non-null   datetime64[ns]
 20  post_modified_gmt      1430 non-null   datetime64[ns]
 21  post_content_filtered  0 non-null      float64
 22  post_parent            1430 non-null   float64
 23  guid                   1430 non-null   object
 24  menu_order             1430 non-null   float64
 25  post_type              1430 non-null   object
 26  post_mime_type         714 non-null    object
 27  comment_count          1430 non-null   float64
dtypes: datetime64[ns](4), float64(10), int64(3), object(11)
memory usage: 331.1+ KB
#Columns 7, 11, 17 and 21 contain no information -> drop them
df_webinfos.drop(columns={"tax_class", "post_content", "post_password", "post_content_filtered"}, inplace=True)
#sku is the table's own ID; rows without it are useless -> drop them
df_webinfos.dropna(subset="sku", inplace=True)
#Check duplicates
df_webinfos.duplicated().sum()
0
#Check duplicates on sku (ID)
df_webinfos["sku"].duplicated().sum()
714
Out of 1428 rows, 714 are duplicated skus, so each sku appears on two rows.
The two rows sharing a sku differ mainly on 3 columns: tax_status, post_excerpt and post_mime_type (one row of each pair has post_type "product", the other does not).
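One way to inspect such a pair side by side (a sketch; first_dup is a new throwaway name):
#Show the two rows of the first duplicated sku
first_dup = df_webinfos.loc[df_webinfos["sku"].duplicated(), "sku"].iloc[0]
df_webinfos.loc[df_webinfos["sku"] == first_dup]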
#Put the duplicated rows (those whose post_type is not "product") into a separate dataframe
df_missings_values = df_webinfos.loc[df_webinfos["post_type"] != "product"]
#With the copy of duplicated rows saved, we can drop them from the original dataframe
df_webinfos.drop(df_webinfos.loc[df_webinfos["post_type"] != "product"].index, inplace=True)
df_webinfos.info()
<class 'pandas.core.frame.DataFrame'>
Index: 714 entries, 2 to 1510
Data columns (total 24 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   sku                714 non-null    object
 1   virtual            714 non-null    int64
 2   downloadable       714 non-null    int64
 3   rating_count       714 non-null    int64
 4   average_rating     714 non-null    float64
 5   total_sales        714 non-null    float64
 6   tax_status         714 non-null    object
 7   post_author        714 non-null    float64
 8   post_date          714 non-null    datetime64[ns]
 9   post_date_gmt      714 non-null    datetime64[ns]
 10  post_title         714 non-null    object
 11  post_excerpt       714 non-null    object
 12  post_status        714 non-null    object
 13  comment_status     714 non-null    object
 14  ping_status        714 non-null    object
 15  post_name          714 non-null    object
 16  post_modified      714 non-null    datetime64[ns]
 17  post_modified_gmt  714 non-null    datetime64[ns]
 18  post_parent        714 non-null    float64
 19  guid               714 non-null    object
 20  menu_order         714 non-null    float64
 21  post_type          714 non-null    object
 22  post_mime_type     0 non-null      object
 23  comment_count      714 non-null    float64
dtypes: datetime64[ns](4), float64(6), int64(3), object(11)
memory usage: 139.5+ KB
#Recover the information missing from the first dataframe using the archived duplicates
#First attempt, kept disabled as a string below: it replaces NaN with "-" and then only rebinds local variables, so it never writes anything back into df_webinfos
"""
df_webinfos.tax_status.fillna("-", inplace=True)
df_webinfos.post_excerpt.fillna("-", inplace=True)
df_webinfos.post_mime_type.fillna("-", inplace=True)
for sku in df_webinfos["sku"]:
tax_status = df_webinfos.loc[df_webinfos["sku"] == sku, "tax_status"].item()
tax_status_noNan = df_missings_values.loc[df_missings_values["sku"] == sku, "tax_status"].item()
if tax_status == "-":
tax_status = tax_status_noNan
post_excerpt = df_webinfos.loc[df_webinfos["sku"] == sku, "post_excerpt"].item()
post_excerpt_noNan = df_missings_values.loc[df_missings_values["sku"] == sku, "post_excerpt"].item()
if post_excerpt == "-":
post_excerpt = post_excerpt_noNan
post_mime_type = df_webinfos.loc[df_webinfos["sku"] == sku, "post_mime_type"].item()
post_mime_type_noNan = df_missings_values.loc[df_missings_values["sku"] == sku, "post_mime_type"].item()
if post_mime_type == "-":
post_mime_type = post_mime_type_noNan"""
df_missings_values.info()
<class 'pandas.core.frame.DataFrame'>
Index: 714 entries, 0 to 1512
Data columns (total 24 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   sku                714 non-null    object
 1   virtual            714 non-null    int64
 2   downloadable       714 non-null    int64
 3   rating_count       714 non-null    int64
 4   average_rating     714 non-null    float64
 5   total_sales        714 non-null    float64
 6   tax_status         0 non-null      object
 7   post_author        714 non-null    float64
 8   post_date          714 non-null    datetime64[ns]
 9   post_date_gmt      714 non-null    datetime64[ns]
 10  post_title         714 non-null    object
 11  post_excerpt       0 non-null      object
 12  post_status        714 non-null    object
 13  comment_status     714 non-null    object
 14  ping_status        714 non-null    object
 15  post_name          714 non-null    object
 16  post_modified      714 non-null    datetime64[ns]
 17  post_modified_gmt  714 non-null    datetime64[ns]
 18  post_parent        714 non-null    float64
 19  guid               714 non-null    object
 20  menu_order         714 non-null    float64
 21  post_type          714 non-null    object
 22  post_mime_type     714 non-null    object
 23  comment_count      714 non-null    float64
dtypes: datetime64[ns](4), float64(6), int64(3), object(11)
memory usage: 139.5+ KB
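The .info() above shows that the archived rows hold exactly the columns that are empty in the product rows. A working version of the disabled back-fill could align both dataframes on sku and use fillna, for example (a minimal sketch, assuming each sku appears exactly once in each dataframe; df_webinfos_filled is a new name and this is not what was actually run):
#Fill the missing cells of the product rows with the values of the archived rows, matched on sku
df_webinfos_filled = (df_webinfos.set_index("sku")
                      .fillna(df_missings_values.set_index("sku"))
                      .reset_index())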
df_webinfos.describe()
| | virtual | downloadable | rating_count | average_rating | total_sales | post_author | post_date | post_date_gmt | post_modified | post_modified_gmt | post_parent | menu_order | comment_count |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 714.0 | 714.0 | 714.0 | 714.0 | 714.000000 | 714.000000 | 714 | 714 | 714 | 714 | 714.0 | 714.0 | 714.0 |
mean | 0.0 | 0.0 | 0.0 | 0.0 | 4.012605 | 1.998599 | 2018-08-22 03:57:52.950980352 | 2018-08-22 02:29:08.581232640 | 2020-06-20 21:45:25.694678016 | 2020-06-20 19:51:53.929971968 | 0.0 | 0.0 | 0.0 |
min | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 1.000000 | 2018-02-08 12:58:52 | 2018-02-08 11:58:52 | 2018-02-20 15:19:23 | 2018-02-20 14:19:23 | 0.0 | 0.0 | 0.0 |
25% | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 2.000000 | 2018-02-27 20:01:12.500000 | 2018-02-27 19:01:12.500000 | 2020-06-18 18:32:35 | 2020-06-18 16:32:35 | 0.0 | 0.0 | 0.0 |
50% | 0.0 | 0.0 | 0.0 | 0.0 | 1.000000 | 2.000000 | 2018-04-19 14:52:10 | 2018-04-19 12:52:10 | 2020-08-04 09:30:06.500000 | 2020-08-04 07:30:06.500000 | 0.0 | 0.0 | 0.0 |
75% | 0.0 | 0.0 | 0.0 | 0.0 | 4.000000 | 2.000000 | 2019-01-31 14:35:47 | 2019-01-31 13:35:47 | 2020-08-25 10:32:32 | 2020-08-25 08:32:32 | 0.0 | 0.0 | 0.0 |
max | 0.0 | 0.0 | 0.0 | 0.0 | 96.000000 | 2.000000 | 2020-07-20 11:00:00 | 2020-07-20 09:00:00 | 2020-08-27 18:55:03 | 2020-08-27 16:55:03 | 0.0 | 0.0 | 0.0 |
std | 0.0 | 0.0 | 0.0 | 0.0 | 8.518183 | 0.037424 | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 |
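Instead of reading them off describe(), the all-zero columns can also be found programmatically (a sketch; numeric is a new throwaway name):
#Numeric columns whose values are all equal to zero
numeric = df_webinfos.select_dtypes("number")
print(numeric.columns[(numeric == 0).all()].tolist())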
#Some columns contain only zeros -> drop them
df_webinfos.drop(columns={"virtual", "downloadable", "rating_count", "average_rating", "post_parent", "menu_order", "comment_count"}, inplace=True)
#View the possible values of every column whose name contains "status" or "type"
for column in df_webinfos.columns:
    if (column.find("status") != -1) or (column.find("type") != -1):
        print("\nStatus for", column, ":")
        for status in df_webinfos[column].unique():
            print(" -", status)
Status for tax_status :
 - taxable

Status for post_status :
 - publish

Status for comment_status :
 - closed

Status for ping_status :
 - closed

Status for post_type :
 - product

Status for post_mime_type :
 - nan
#These columns carry no relevant information -> keep them in an archive dataframe, then drop them
df_cols_archives = df_webinfos[["sku","post_status", "comment_status", "ping_status","tax_status", "post_type", "post_mime_type"]]
#Drop columns
df_webinfos.drop(columns={"post_status", "comment_status", "ping_status","tax_status", "post_type", "post_mime_type"}, inplace=True)
#Drop ALL tags from every cell of post_excerpt (e.g. <span> <div></div> </span>)
for row in df_webinfos.index:
    cell = str(df_webinfos.loc[row, "post_excerpt"])
    df_webinfos.loc[row, "post_excerpt"] = drop_Tags(cell)
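The same cleaning can also be written without an explicit loop, using apply (a sketch equivalent to the loop above):
#Vectorised alternative: apply drop_Tags to the whole column at once
df_webinfos["post_excerpt"] = df_webinfos["post_excerpt"].astype(str).apply(drop_Tags)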
#View of dataframe
df_webinfos
| | sku | total_sales | post_author | post_date | post_date_gmt | post_title | post_excerpt | post_name | post_modified | post_modified_gmt | guid |
|---|---|---|---|---|---|---|---|---|---|---|---|
2 | 15075 | 3.0 | 2.0 | 2018-02-14 15:39:43 | 2018-02-14 14:39:43 | Parés Baltà Penedès Indigena 2017 | Des couleurs et aromes intenses où le fruit et... | pares-balta-penedes-indigena-2017 | 2020-08-20 15:35:02 | 2020-08-20 13:35:02 | https://www.bottle-neck.fr/?post_type=product&... |
3 | 16209 | 6.0 | 2.0 | 2018-02-14 17:15:31 | 2018-02-14 16:15:31 | Maurel Cabardès Tradition 2017 | Un joli nez aux arômes de fruits rouges, de ca... | maurel-cabardes-tradition-2017 | 2020-08-05 18:05:03 | 2020-08-05 16:05:03 | https://www.bottle-neck.fr/?post_type=product&... |
5 | 13895 | 0.0 | 2.0 | 2019-03-19 10:41:50 | 2019-03-19 09:41:50 | Château Saransot-Dupré Bordeaux Blanc 2016 | Rareté en Médoc (mais d’A.O.C. Bordeaux) le do... | chateau-saransot-dupre-bordeaux-blanc-2016 | 2020-04-25 21:11:40 | 2020-04-25 19:11:40 | https://www.bottle-neck.fr/?post_type=product&... |
6 | 12857 | 0.0 | 2.0 | 2018-04-12 17:56:13 | 2018-04-12 15:56:13 | Château de Meursault Puligny-Montrachet 1er Cr... | Il présente une grande fraîcheur minérale au n... | chateau-de-puligny-montrachet-1cru-champ-canet... | 2020-02-06 16:35:02 | 2020-02-06 15:35:02 | https://www.bottle-neck.fr/?post_type=product&... |
9 | 14106 | 0.0 | 2.0 | 2019-06-08 09:54:21 | 2019-06-08 07:54:21 | Stéphane Tissot Château-Chalon 2011 | Ce vin peut-être dégusté sur sa jeunesse mais ... | stephane-tissot-chateau-chalon-2011 | 2020-05-29 17:35:03 | 2020-05-29 15:35:03 | https://www.bottle-neck.fr/?post_type=product&... |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1501 | 15941 | 4.0 | 2.0 | 2018-02-12 14:26:52 | 2018-02-12 13:26:52 | Domaine de la Jérôme Côtes du Rhône Village 201 | Un vin opulent, à la texture onctueuse, et tou... | jerome-cotes-du-rhone-2018 | 2020-06-25 14:00:03 | 2020-06-25 12:00:03 | https://www.bottle-neck.fr/?post_type=product&... |
1503 | 10459 | 0.0 | 2.0 | 2018-04-13 15:58:19 | 2018-04-13 13:58:19 | Alphonse Mellot Sancerre Rouge Génération XIX ... | Robe d'un superbe rubis, dense et profond - As... | alphonse-mellot-sancerre-rouge-generation-xix-... | 2020-08-24 14:05:02 | 2020-08-24 12:05:02 | https://www.bottle-neck.fr/?post_type=product&... |
1507 | 16320 | 3.0 | 2.0 | 2018-02-12 12:49:29 | 2018-02-12 11:49:29 | Mourgues du Grès IGP Pont du Gard Terre d'Arge... | Magnifique exemple d’assemblage réussi, ou cha... | mourgues-du-gres-costieres-de-nimes-terre-darg... | 2020-08-25 11:15:03 | 2020-08-25 09:15:03 | https://www.bottle-neck.fr/?post_type=product&... |
1509 | 15663 | 3.0 | 2.0 | 2018-02-27 10:27:01 | 2018-02-27 09:27:01 | Chermette Domaine du Vissoux Brouilly Pierreux... | Le Brouilly est le plus méridional des crus du... | chermette-domaine-du-vissoux-brouilly-pierreux... | 2020-08-01 09:35:02 | 2020-08-01 07:35:02 | https://www.bottle-neck.fr/?post_type=product&... |
1510 | 15910 | 0.0 | 2.0 | 2019-03-28 10:59:43 | 2019-03-28 09:59:43 | Thevenet Quintaine Viré-Clessé La Bongran 2015 | C’est dans cette cuvée la pleine expression du... | thevenet-quintaine-vire-clesse-la-bongran-2015 | 2020-08-14 10:45:02 | 2020-08-14 08:45:02 | https://www.bottle-neck.fr/?post_type=product&... |
714 rows × 11 columns
#Check that all the primary keys needed for the join are present
pk_products_infos = df_productsinfos["product_id"].tolist()
pk_connection_products = df_connection["product_id"].tolist()
pk_products_notin_connection = []
for key in pk_products_infos:
    if key not in pk_connection_products:
        pk_products_notin_connection.append(key)
print("Missing:", len(pk_products_notin_connection), "keys in the connection source")
Missing: 91 keys in the connection source
#Check that all the primary keys needed for the join are present
pk_web_infos = df_webinfos["sku"].tolist()
pk_connection_web = df_connection["id_web"].tolist()
pk_web_notin_connection = []
for key in pk_web_infos:
    if key not in pk_connection_web:
        pk_web_notin_connection.append(key)
print("Missing:", len(pk_web_notin_connection), "keys in the connection source")
Missing: 0 keys in the connection source
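Both membership checks can also be done with isin, which avoids the nested Python loops (a sketch; the variable names are new, for illustration only):
#Vectorised membership tests
missing_products = df_productsinfos.loc[~df_productsinfos["product_id"].isin(df_connection["product_id"])]
missing_skus = df_webinfos.loc[~df_webinfos["sku"].isin(df_connection["id_web"])]
print(len(missing_products), "product_id missing from the connection file")
print(len(missing_skus), "sku missing from the connection file")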
#Merge Connection ON Products infos
df_temp = pd.merge(df_productsinfos, df_connection, on="product_id" )
#Merge products info ON df_temp to create the full dataframe
df_complete = pd.merge(df_temp, df_webinfos, left_on="id_web", right_on="sku")
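pd.merge defaults to an inner join, so unmatched keys are dropped silently. The same joins can be written with the join type and key uniqueness made explicit (a sketch equivalent to the two merges above):
#Explicit inner joins with a 1-to-1 key check
df_temp = pd.merge(df_productsinfos, df_connection, on="product_id", how="inner", validate="one_to_one")
df_complete = pd.merge(df_temp, df_webinfos, left_on="id_web", right_on="sku", how="inner", validate="one_to_one")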
#Check if a key is missing
df_complete["product_id"].isna().sum()
0
#Check if a key is missing
df_complete["id_web"].isna().sum()
0
df_complete.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 714 entries, 0 to 713
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   product_id         714 non-null    int64
 1   onsale_web         714 non-null    int64
 2   price              714 non-null    float64
 3   stock_quantity     714 non-null    int64
 4   stock_status       714 non-null    object
 5   id_web             714 non-null    object
 6   sku                714 non-null    object
 7   total_sales        714 non-null    float64
 8   post_author        714 non-null    float64
 9   post_date          714 non-null    datetime64[ns]
 10  post_date_gmt      714 non-null    datetime64[ns]
 11  post_title         714 non-null    object
 12  post_excerpt       714 non-null    object
 13  post_name          714 non-null    object
 14  post_modified      714 non-null    datetime64[ns]
 15  post_modified_gmt  714 non-null    datetime64[ns]
 16  guid               714 non-null    object
dtypes: datetime64[ns](4), float64(3), int64(3), object(7)
memory usage: 95.0+ KB
#Create a column with the total sales value
df_complete["total_value_sales"] = df_complete["total_sales"] * df_complete["price"]
#Total sales value of the products sold online (onsale_web == 1)
df_temp = df_complete.loc[df_complete["onsale_web"] == 1]
Total_sales_online = df_temp["total_value_sales"].sum().astype("int")
print(f"The total online sales are {Total_sales_online}.")
The total online sales are 70568.
#Top 5 best sellers
df_temp.sort_values("total_value_sales", ascending=False).head(5)
| | product_id | onsale_web | price | stock_quantity | stock_status | id_web | sku | total_sales | post_author | post_date | post_date_gmt | post_title | post_excerpt | post_name | post_modified | post_modified_gmt | guid | total_value_sales |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
194 | 4334 | 1 | 49.0 | 0 | outofstock | 7818 | 7818 | 96.0 | 2.0 | 2018-03-01 14:02:44 | 2018-03-01 13:02:44 | Champagne Gosset Grand Blanc de Blancs | La bulle fine et presente se dirige vers la su... | champagne-gosset-grand-blanc-de-blanc | 2020-08-12 14:00:03 | 2020-08-12 12:00:03 | https://www.bottle-neck.fr/?post_type=product&... | 4704.0 |
71 | 4144 | 1 | 49.0 | 11 | instock | 1662 | 1662 | 87.0 | 2.0 | 2018-02-13 13:17:25 | 2018-02-13 12:17:25 | Champagne Gosset Grand Rosé | Le nez exprime sans détour la fraise, dans sa ... | champagne-gosset-grand-rose | 2020-08-14 16:45:03 | 2020-08-14 14:45:03 | https://www.bottle-neck.fr/?post_type=product&... | 4263.0 |
218 | 4402 | 1 | 176.0 | 8 | instock | 3510 | 3510 | 13.0 | 2.0 | 2018-03-22 11:21:05 | 2018-03-22 10:21:05 | Cognac Frapin VIP XO | La cuvée VIP XO à été enrichie d’eaux-de-vie t... | cognac-frapin-vip-xo | 2020-08-22 11:35:03 | 2020-08-22 09:35:03 | https://www.bottle-neck.fr/?post_type=product&... | 2288.0 |
70 | 4142 | 1 | 53.0 | 8 | instock | 11641 | 11641 | 30.0 | 2.0 | 2018-02-13 13:08:44 | 2018-02-13 12:08:44 | Champagne Gosset Grand Millésime 2006 | L'attaque est ample, gourmande avec une belle ... | champagne-gosset-grand-millesime-2006 | 2020-08-20 09:30:03 | 2020-08-20 07:30:03 | https://www.bottle-neck.fr/?post_type=product&... | 1590.0 |
69 | 4141 | 1 | 39.0 | 1 | instock | 304 | 304 | 40.0 | 2.0 | 2018-02-13 12:57:44 | 2018-02-13 11:57:44 | Champagne Gosset Grande Réserve | Le nez, ouvert et expressif, évoque les fruits... | gosset-champagne-grande-reserve | 2020-08-27 11:25:02 | 2020-08-27 09:25:02 | https://www.bottle-neck.fr/?post_type=product&... | 1560.0 |
df_complete["price"].describe()
count    714.000000
mean      32.493137
std       27.810525
min        5.200000
25%       14.100000
50%       23.550000
75%       42.175000
max      225.000000
Name: price, dtype: float64
mean = df_complete["price"].mean()
med = np.percentile(df_complete["price"], 50)
df_complete["price"].skew()
2.58090126300337
Skewness > 0 means the distribution is right-skewed (a long tail toward high prices)
ymax = 110
plt.title("Distribution de la variable PRICE")
sns.histplot(data=df_complete, x='price', kde=True, label="Quantité")
plt.vlines(x=mean,ymin=0, ymax = ymax , color="g", label="Moyenne", linestyles=":")
plt.vlines(x=med, ymin=0, ymax = ymax, color="r", label="Médiane", linestyles=":")
plt.legend()
[Figure: histogram of price with KDE; dotted green line = mean, dotted red line = median]
I will use the quartiles (IQR method) because of the asymmetrical distribution: the mean is pulled away from the center (mean != median != mode). If the curve were bell-shaped (a Gaussian distribution), I would use the z-score instead, since in a symmetrical distribution the mean sits exactly in the middle (mean == median == mode).
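A quick check of the three central-tendency measures backs this up (a minimal sketch):
#Mean, median and mode of the price
print("mean  :", round(df_complete["price"].mean(), 2))
print("median:", df_complete["price"].median())
print("mode  :", df_complete["price"].mode()[0])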
sns.boxplot(data=df_complete.price)
[Figure: boxplot of price]
#Look at the 5 largest prices (candidate outliers)
df_complete.sort_values("price", ascending=False).head(5)
| | product_id | onsale_web | price | stock_quantity | stock_status | id_web | sku | total_sales | post_author | post_date | post_date_gmt | post_title | post_excerpt | post_name | post_modified | post_modified_gmt | guid | total_value_sales |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
199 | 4352 | 1 | 225.0 | 0 | outofstock | 15940 | 15940 | 5.0 | 2.0 | 2018-03-02 10:30:04 | 2018-03-02 09:30:04 | Champagne Egly-Ouriet Grand Cru Millésimé 2008 | Issu d’un assemblage de 70% de Pinot Noir du g... | champagne-egly-ouriet-grand-cru-millesime-2008 | 2020-03-07 11:18:45 | 2020-03-07 10:18:45 | https://www.bottle-neck.fr/?post_type=product&... | 1125.0 |
426 | 5001 | 1 | 217.5 | 20 | instock | 14581 | 14581 | 0.0 | 2.0 | 2018-07-17 09:45:39 | 2018-07-17 07:45:39 | David Duband Charmes-Chambertin Grand Cru 2014 | Robe cerise chatoyante. Un nez de fruits rouge... | david-duband-charmes-chambertin-grand-cru-2014 | 2020-05-16 09:00:05 | 2020-05-16 07:00:05 | https://www.bottle-neck.fr/?post_type=product&... | 0.0 |
587 | 5892 | 1 | 191.3 | 10 | instock | 14983 | 14983 | 3.0 | 2.0 | 2019-03-28 10:21:36 | 2019-03-28 09:21:36 | Coteaux Champenois Egly-Ouriet Ambonnay Rouge ... | Cet Ambonnay évoque les grands Pinots Noirs de... | coteaux-champenois-egly-ouriet-ambonnay-rouge-... | 2020-04-01 09:30:09 | 2020-04-01 07:30:09 | https://www.bottle-neck.fr/?post_type=product&... | 573.9 |
218 | 4402 | 1 | 176.0 | 8 | instock | 3510 | 3510 | 13.0 | 2.0 | 2018-03-22 11:21:05 | 2018-03-22 10:21:05 | Cognac Frapin VIP XO | La cuvée VIP XO à été enrichie d’eaux-de-vie t... | cognac-frapin-vip-xo | 2020-08-22 11:35:03 | 2020-08-22 09:35:03 | https://www.bottle-neck.fr/?post_type=product&... | 2288.0 |
553 | 5767 | 1 | 175.0 | 12 | instock | 15185 | 15185 | 0.0 | 2.0 | 2019-03-13 14:43:22 | 2019-03-13 13:43:22 | Camille Giroud Clos de Vougeot 2016 | Ce vin provient de vignes âgées de 50 ans situ... | camille-giroud-clos-de-vougeot-2016 | 2020-06-11 15:25:04 | 2020-06-11 13:25:04 | https://www.bottle-neck.fr/?post_type=product&... | 0.0 |
# IQR Inter Quartile Range
Q1 = np.percentile(df_complete['price'], 25)
Q3 = np.percentile(df_complete['price'], 75)
IQR = Q3 - Q1
print("Old Shape: ", df_complete.shape)
#Higher than the upper whisker
higher = np.where(df_complete['price'] >= (Q3 + 1.5 * IQR))
#Lower than the lower whisker
lower = np.where(df_complete['price'] <= (Q1 - 1.5 * IQR))
#Archive the outliers
df_outliers = df_complete.loc[(df_complete.index.isin(higher[0])) | (df_complete.index.isin(lower[0]))]
#Drop the outliers
df_complete.drop(higher[0], inplace=True)
df_complete.drop(lower[0], inplace=True)
print("New Shape: ", df_complete.shape)
Old Shape:  (714, 18)
New Shape:  (682, 18)
outliers = df_outliers["sku"].count()
outliers
32
#Show outliers values
print("These are outliers: ", sorted(df_outliers["price"].tolist()))
These are outliers: [85.6, 86.8, 88.4, 92.0, 93.0, 99.0, 100.0, 102.3, 104.0, 105.0, 105.0, 105.6, 108.5, 109.6, 112.0, 114.0, 115.0, 115.0, 116.4, 121.0, 121.0, 122.0, 124.8, 126.5, 135.0, 137.0, 157.0, 175.0, 176.0, 191.3, 217.5, 225.0]
zvalue = np.abs(stats.zscore(df_complete['price']))
threshold = 2
#Positions of the outliers
outliers = np.where(zvalue > threshold)[0]
print(np.where(zvalue > threshold))
print(len(outliers))
(array([ 10, 18, 35, 66, 73, 125, 180, 194, 196, 197, 200, 213, 217, 237, 248, 347, 410, 411, 412, 414, 427, 430, 474, 488, 498, 499, 524, 586, 595, 596, 599, 614, 635, 640, 644, 679], dtype=int64),)
36