SuperKart Sales Prediction¶
Context:¶
A sales forecast is a prediction of future sales revenue based on historical data, industry trends, and the status of the current sales pipeline. Businesses use the sales forecast to estimate weekly, monthly, quarterly, and annual sales totals. It is extremely important for a company to make an accurate sales forecast as it adds value across an organization and helps the different verticals to chalk out their future course of action. Forecasting helps an organization plan its sales operations by region and provides valuable insights to the supply chain team regarding the procurement of goods and materials. An accurate sales forecast process has many benefits which include improved decision-making about the future and reduction of sales pipeline and forecast risks. Moreover, it helps to reduce the time spent in planning territory coverage and establish benchmarks that can be used to assess trends in the future.
Objective:¶
SuperKart is an organization that owns a chain of supermarkets and food marts providing a wide range of products. It wants to predict the future sales revenue of its different outlets so that it can strategize its sales operations across different tier cities and plan its inventory accordingly. To achieve this purpose, SuperKart has hired a data science firm, shared the sales records of its various outlets for the previous quarter, and asked the firm to come up with a suitable model to predict the total sales of the stores for the upcoming quarter.
Data Description:¶
The data contains the different attributes of the various products and stores. The detailed data dictionary is given below.
- Product_Id - unique identifier of each product; each identifier has two letters at the beginning followed by a number
- Product_Weight - the weight of each product
- Product_Sugar_Content - sugar content of each product, like low sugar, regular, and no sugar
- Product_Allocated_Area - the ratio of the allocated display area of each product to the total display area of all the products in a store
- Product_Type - broad category for each product, like meat, snack foods, hard drinks, dairy, canned, soft drinks, health and hygiene, baking goods, bread, breakfast, frozen foods, fruits and vegetables, household, seafood, starchy foods, others
- Product_MRP - maximum retail price of each product
- Store_Id - unique identifier of each store
- Store_Establishment_Year - the year in which the store was established
- Store_Size - the size of the store depending on sq. feet, like high, medium, and low
- Store_Location_City_Type - the type of city in which the store is located, like Tier 1, Tier 2, and Tier 3. Tier 1 consists of cities where the standard of living is comparatively higher than its Tier 2 and Tier 3 counterparts.
- Store_Type - the type of store depending on the products being sold there, like Departmental Store, Supermarket Type 1, Supermarket Type 2, and Food Mart
- Product_Store_Sales_Total - total revenue generated by the sale of that particular product in that particular store
# Libraries to help with reading and manipulating data
import numpy as np
import pandas as pd
# Library to split the data
from sklearn.model_selection import train_test_split
# Libraries to help with data visualization
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import seaborn as sns
# Removes the limit for the number of displayed columns
pd.set_option("display.max_columns", None)
# Sets the limit for the number of displayed rows
pd.set_option("display.max_rows", 100)
# Import libraries for building linear regression model
from statsmodels.formula.api import ols
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
# Import library for preparing data
from sklearn.model_selection import train_test_split
# Import library for checking multicollinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Import library for visualization of the regression
import statsmodels.stats.api as sms
# Import library to plot q-q plot of residuals
import scipy.stats as stats
import pylab
# Import library used for cross-validation
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings("ignore")
# Connect to Google Colab and mount the drive
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
kart = pd.read_csv('/content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week Four - Regression and Prediction/Guided Project/SuperKart.csv')
# Copying data to another variable to avoid any changes to original data
data = kart.copy()
data.head()
Product_Id | Product_Weight | Product_Sugar_Content | Product_Allocated_Area | Product_Type | Product_MRP | Store_Id | Store_Establishment_Year | Store_Size | Store_Location_City_Type | Store_Type | Product_Store_Sales_Total | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | FD6114 | 12.66 | Low Sugar | 0.027 | Frozen Foods | 117.08 | OUT004 | 2009 | Medium | Tier 2 | Supermarket Type2 | 2842.40 |
1 | FD7839 | 16.54 | Low Sugar | 0.144 | Dairy | 171.43 | OUT003 | 1999 | Medium | Tier 1 | Departmental Store | 4830.02 |
2 | FD5075 | 14.28 | Regular | 0.031 | Canned | 162.08 | OUT001 | 1987 | High | Tier 2 | Supermarket Type1 | 4130.16 |
3 | FD8233 | 12.10 | Low Sugar | 0.112 | Baking Goods | 186.31 | OUT001 | 1987 | High | Tier 2 | Supermarket Type1 | 4132.18 |
4 | NC1180 | 9.57 | No Sugar | 0.010 | Health and Hygiene | 123.67 | OUT002 | 1998 | Small | Tier 3 | Food Mart | 2279.36 |
Observations:¶
- Prediction of the total sales of the stores for the upcoming quarter means the variable Product_Store_Sales_Total is our target, all other variables will be considered to be independent.
data.tail()
Product_Id | Product_Weight | Product_Sugar_Content | Product_Allocated_Area | Product_Type | Product_MRP | Store_Id | Store_Establishment_Year | Store_Size | Store_Location_City_Type | Store_Type | Product_Store_Sales_Total | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
8758 | NC7546 | 14.80 | No Sugar | 0.016 | Health and Hygiene | 140.53 | OUT004 | 2009 | Medium | Tier 2 | Supermarket Type2 | 3806.53 |
8759 | NC584 | 14.06 | No Sugar | 0.142 | Household | 144.51 | OUT004 | 2009 | Medium | Tier 2 | Supermarket Type2 | 5020.74 |
8760 | NC2471 | 13.48 | No Sugar | 0.017 | Health and Hygiene | 88.58 | OUT001 | 1987 | High | Tier 2 | Supermarket Type1 | 2443.42 |
8761 | NC7187 | 13.89 | No Sugar | 0.193 | Household | 168.44 | OUT001 | 1987 | High | Tier 2 | Supermarket Type1 | 4171.82 |
8762 | FD306 | 14.73 | Low Sugar | 0.177 | Snack Foods | 224.93 | OUT002 | 1998 | Small | Tier 3 | Food Mart | 2186.08 |
Data Insights:¶
# Understand the shape of the data
shape = data.shape
print(f'There are {shape[0]} rows and {shape[1]} columns in the dataset.')
There are 8763 rows and 12 columns in the dataset.
# Check the data types
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8763 entries, 0 to 8762
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype
---  ------                     --------------  -----
 0   Product_Id                 8763 non-null   object
 1   Product_Weight             8763 non-null   float64
 2   Product_Sugar_Content      8763 non-null   object
 3   Product_Allocated_Area     8763 non-null   float64
 4   Product_Type               8763 non-null   object
 5   Product_MRP                8763 non-null   float64
 6   Store_Id                   8763 non-null   object
 7   Store_Establishment_Year   8763 non-null   int64
 8   Store_Size                 8763 non-null   object
 9   Store_Location_City_Type   8763 non-null   object
 10  Store_Type                 8763 non-null   object
 11  Product_Store_Sales_Total  8763 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 821.7+ KB
Observations:¶
- Product_Weight, Product_Allocated_Area, Product_MRP, Store_Establishment_Year, and Product_Store_Sales_Total are the numeric columns while the rest are objects.
- There are a total of 8763 non-null observations in each of the columns. This indicates that there are no missing values in the data.
# Checking for missing values in the data
data.isnull().sum()
0 | |
---|---|
Product_Id | 0 |
Product_Weight | 0 |
Product_Sugar_Content | 0 |
Product_Allocated_Area | 0 |
Product_Type | 0 |
Product_MRP | 0 |
Store_Id | 0 |
Store_Establishment_Year | 0 |
Store_Size | 0 |
Store_Location_City_Type | 0 |
Store_Type | 0 |
Product_Store_Sales_Total | 0 |
# Check for duplicated rows
data.duplicated().sum()
0
Observations:¶
- There are no missing values in the dataset
- There are no duplicated rows
# Let's see if any data cleanup is necessary - we will go column by column
# Product_Id: Categorical (unique identifier)
P_Id_Desc = data['Product_Id'].describe()
print(f"Product_Id: ")
print('\n')
print(P_Id_Desc)
print('\n')
# Product_Weight: This is a numeric column so we can look at the min, max, and mean
P_Wt_Desc = data['Product_Weight'].describe()
print(f"Product_Weight: ")
print('\n')
print(P_Wt_Desc)
print('\n')
# Product_Sugar_Content: This is a Categorical (object) so we can look at unique values
P_Tp_Desc = data['Product_Sugar_Content'].unique()
print('\n')
print(f"Product_Sugar_Content: ")
print(P_Tp_Desc)
print('\n')
# Product Allocation Area: Numeric
P_Al_Desc = data['Product_Allocated_Area'].describe()
print('\n')
print(f"Product_Allocated_Area: ")
print(P_Al_Desc)
print('\n')
# Product_Type: Categorical
print(data['Product_Type'].value_counts())
print('\n')
# Product_MRP: Numeric
P_MRP_Desc = data['Product_MRP'].describe()
print('\n')
print(f"Product_MRP: ")
print(P_MRP_Desc)
print('\n')
# Store_Id: Categorical
Store_Id_Desc = data['Store_Id'].describe()
print('\n')
print(f"Store_Id: ")
print(Store_Id_Desc)
print('\n')
print(data['Store_Id'].unique())
print('\n')
# Store_Establishment_Year: Numeric
Store_Est_Unique = data['Store_Establishment_Year'].nunique()
print('\n')
print(f"Store_Establishment_Year: ")
print(Store_Est_Unique)
print('\n')
print(data['Store_Establishment_Year'].unique())
print('\n')
# Store_Size: Categorical
Store_Size_Desc = data['Store_Size'].unique()
print('\n')
print(f"Store_Size: ")
print(Store_Size_Desc)
print('\n')
# Store_Location_City_Type: Categorical
Store_Loc_Desc = data['Store_Location_City_Type'].nunique()
print('\n')
print(f"Store_Location_City_Type: ")
print(Store_Loc_Desc)
print(data['Store_Location_City_Type'].unique())
print('\n')
# Store_Type: Categorical
Store_Type_Desc = data['Store_Type'].unique()
print('\n')
print(f"Store_Type: ")
print(Store_Type_Desc)
print('\n')
# Print the values for Store_Type along with a count of the number of stores in that type
# Create a DataFrame with unique Store IDs and their types
unique_stores = data[['Store_Id', 'Store_Type']].drop_duplicates()
# Count the occurrences of each store type
store_type_counts = unique_stores['Store_Type'].value_counts()
print(store_type_counts)
print('\n')
#Product_Store_Sales_Total: Numeric
P_St_Desc = data['Product_Store_Sales_Total'].describe()
print('\n')
print(f"Product_Store_Sales_Total: ")
print(P_St_Desc)
print('\n')
Product_Id:
count 8763   unique 8763   top FD6114   freq 1
Name: Product_Id, dtype: object

Product_Weight:
count 8763.000000   mean 12.653792   std 2.217320   min 4.000000   25% 11.150000   50% 12.660000   75% 14.180000   max 22.000000
Name: Product_Weight, dtype: float64

Product_Sugar_Content:
['Low Sugar' 'Regular' 'No Sugar' 'reg']

Product_Allocated_Area:
count 8763.000000   mean 0.068786   std 0.048204   min 0.004000   25% 0.031000   50% 0.056000   75% 0.096000   max 0.298000
Name: Product_Allocated_Area, dtype: float64

Product_Type
Fruits and Vegetables    1249
Snack Foods              1149
Frozen Foods              811
Dairy                     796
Household                 740
Baking Goods              716
Canned                    677
Health and Hygiene        628
Meat                      618
Soft Drinks               519
Breads                    200
Hard Drinks               186
Others                    151
Starchy Foods             141
Breakfast                 106
Seafood                    76
Name: count, dtype: int64

Product_MRP:
count 8763.000000   mean 147.032539   std 30.694110   min 31.000000   25% 126.160000   50% 146.740000   75% 167.585000   max 266.000000
Name: Product_MRP, dtype: float64

Store_Id:
count 8763   unique 4   top OUT004   freq 4676
Name: Store_Id, dtype: object
['OUT004' 'OUT003' 'OUT001' 'OUT002']

Store_Establishment_Year:
4
[2009 1999 1987 1998]

Store_Size:
['Medium' 'High' 'Small']

Store_Location_City_Type:
3
['Tier 2' 'Tier 1' 'Tier 3']

Store_Type:
['Supermarket Type2' 'Departmental Store' 'Supermarket Type1' 'Food Mart']

Store_Type
Supermarket Type2     1
Departmental Store    1
Supermarket Type1     1
Food Mart             1
Name: count, dtype: int64

Product_Store_Sales_Total:
count 8763.000000   mean 3464.003640   std 1065.630494   min 33.000000   25% 2761.715000   50% 3452.340000   75% 4145.165000   max 8000.000000
Name: Product_Store_Sales_Total, dtype: float64
# See all numeric variables info at the same time
#data.describe(include = np.number)
# See all variables info at the same time
data.describe(include='all')
Product_Id | Product_Weight | Product_Sugar_Content | Product_Allocated_Area | Product_Type | Product_MRP | Store_Id | Store_Establishment_Year | Store_Size | Store_Location_City_Type | Store_Type | Product_Store_Sales_Total | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 8763 | 8763.000000 | 8763 | 8763.000000 | 8763 | 8763.000000 | 8763 | 8763.000000 | 8763 | 8763 | 8763 | 8763.000000 |
unique | 8763 | NaN | 4 | NaN | 16 | NaN | 4 | NaN | 3 | 3 | 4 | NaN |
top | FD6114 | NaN | Low Sugar | NaN | Fruits and Vegetables | NaN | OUT004 | NaN | Medium | Tier 2 | Supermarket Type2 | NaN |
freq | 1 | NaN | 4885 | NaN | 1249 | NaN | 4676 | NaN | 6025 | 6262 | 4676 | NaN |
mean | NaN | 12.653792 | NaN | 0.068786 | NaN | 147.032539 | NaN | 2002.032751 | NaN | NaN | NaN | 3464.003640 |
std | NaN | 2.217320 | NaN | 0.048204 | NaN | 30.694110 | NaN | 8.388381 | NaN | NaN | NaN | 1065.630494 |
min | NaN | 4.000000 | NaN | 0.004000 | NaN | 31.000000 | NaN | 1987.000000 | NaN | NaN | NaN | 33.000000 |
25% | NaN | 11.150000 | NaN | 0.031000 | NaN | 126.160000 | NaN | 1998.000000 | NaN | NaN | NaN | 2761.715000 |
50% | NaN | 12.660000 | NaN | 0.056000 | NaN | 146.740000 | NaN | 2009.000000 | NaN | NaN | NaN | 3452.340000 |
75% | NaN | 14.180000 | NaN | 0.096000 | NaN | 167.585000 | NaN | 2009.000000 | NaN | NaN | NaN | 4145.165000 |
max | NaN | 22.000000 | NaN | 0.298000 | NaN | 266.000000 | NaN | 2009.000000 | NaN | NaN | NaN | 8000.000000 |
Observations:¶
- There are no duplicated product IDs.
- Product_Sugar_Content contains some data that will need to be cleaned; I am assuming 'Regular' and 'reg' are the same thing.
- We will also likely need to transform Store_Establishment_Year into an age.
- There are actually only four stores represented in the data.
- Each store has a unique Store_Establishment_Year.
- Store_Type is also unique per store; the data represents one store of each type.
- The minimum Product_Store_Sales_Total is an extremely small $33, while the maximum is $8,000.
- There are 16 different product types. Fruits and Vegetables is the best selling (1,249), followed by Snack Foods (1,149).
# Let's fix the Product_Sugar_Content data, place 'reg' into 'Regular'
data['Product_Sugar_Content'] = data['Product_Sugar_Content'].replace('reg', 'Regular')
print(data['Product_Sugar_Content'].value_counts())
Product_Sugar_Content Low Sugar 4885 Regular 2359 No Sugar 1519 Name: count, dtype: int64
# Since Product_Id's are all unique they could be dropped, but there is a non-numeric prefix that may give us some useful information, so let's drop the numeric portion and keep only the prefixes
data['Product_Id'] = data['Product_Id'].str[:2]
print(data['Product_Id'].value_counts())
Product_Id FD 6539 NC 1519 DR 705 Name: count, dtype: int64
Exploratory (Univariate) Analysis¶
Data Distribution for numeric features¶
# Function to plot a boxplot and a histogram along the same scale
def histogram_boxplot(data, feature, figsize = (12, 7), kde = False, bins = None):
"""
Boxplot and histogram combined
data: dataframe
feature: dataframe column
figsize: size of figure (default (12,7))
kde: whether to show density curve (default False)
bins: number of bins for histogram (default None)
"""
f2, (ax_box2, ax_hist2) = plt.subplots(
nrows = 2, # Number of rows of the subplot grid = 2
sharex = True, # x-axis will be shared among all subplots
gridspec_kw = {"height_ratios": (0.25, 0.75)},
figsize = figsize,
) # Creating the 2 subplots
sns.boxplot(
data = data, x = feature, ax = ax_box2, showmeans = True, color = "violet"
) # Boxplot will be created and a star will indicate the mean value of the column
sns.histplot(
data = data, x = feature, kde = kde, ax = ax_hist2, bins = bins, palette = "winter"
) if bins else sns.histplot(
data = data, x = feature, kde = kde, ax = ax_hist2
) # For histogram
ax_hist2.axvline(
data[feature].mean(), color = "green", linestyle = "--"
) # Add mean to the histogram
ax_hist2.axvline(
data[feature].median(), color = "black", linestyle = "-"
)
Product_Weight
histogram_boxplot(data, "Product_Weight")
Observations:
- Product_Weight is approximately normally distributed, with a median and mean of approximately 12.5
Product_Allocated_Area
histogram_boxplot(data, "Product_Allocated_Area")
Observations:
- Product_Allocated_Area is right-skewed, with a median of approximately 0.05
If we needed to perform a log transformation:
sns.histplot(np.log(data['Product_Allocated_Area'] + 0.02))
<Axes: xlabel='Product_Allocated_Area', ylabel='Count'>
0.02 is a small bias value added before taking the log; it can be tuned
Product_MRP
histogram_boxplot(data, "Product_MRP", kde = True)
Observations:
- Product_MRP is approximately normally distributed, with a mean and median of approximately 150
Product_Store_Sales_Total
histogram_boxplot(data, "Product_Store_Sales_Total")
Observations:
- Product_Store_Sales_Total is approximately normally distributed, with a median and mean of approximately 3500
Data Distribution for categorical features¶
# Function to create labeled barplots
def labeled_barplot(data, feature, perc = False, n = None):
"""
Barplot with percentage at the top
data: dataframe
feature: dataframe column
perc: whether to display percentages instead of count (default is False)
n: displays the top n category levels (default is None, i.e., display all levels)
"""
total = len(data[feature]) # Length of the column
count = data[feature].nunique()
if n is None:
plt.figure(figsize = (count + 1, 5))
else:
plt.figure(figsize = (n + 1, 5))
plt.xticks(rotation = 90, fontsize = 15)
ax = sns.countplot(
data = data,
x = feature,
palette = "Paired",
order = data[feature].value_counts().index[:n].sort_values(),
)
for p in ax.patches:
if perc == True:
label = "{:.1f}%".format(
100 * p.get_height() / total
) # Percentage of each class of the category
else:
label = p.get_height() # Count of each level of the category
x = p.get_x() + p.get_width() / 2 # x-coordinate of the bar's center
y = p.get_height() # Height of the bar
ax.annotate(
label,
(x, y),
ha = "center",
va = "center",
size = 12,
xytext = (0, 5),
textcoords = "offset points",
) # Annotate the percentage
plt.show() # Show the plot
Product_Id¶
labeled_barplot(data, "Product_Id", perc = True)
Observations
- Almost 75% of products sold fall under the FD category (assumed to mean food)
Product_Sugar_Content¶
labeled_barplot(data, "Product_Sugar_Content", perc = True)
Observations:
- Approximately 56% of items are Low Sugar, versus nearly 27% with regular sugar and a little more than 17% with no sugar at all.
Product_Type¶
labeled_barplot(data, "Product_Type", perc = True)
Observations:
- Fruits and Vegetables (14.3%) and Snack Foods (13.1%) are the largest categories, with Seafood (0.9%) being the smallest
Store_Id¶
labeled_barplot(data, "Store_Id", perc = True)
Observations
- OUT004 has the largest number of products at about 53%; it accounts for more than all other stores combined.
- OUT002 has the lowest number of goods sold, at just over 13%
Store_Size¶
labeled_barplot(data, "Store_Size", perc = True)
Observations:
- Medium stores account for almost 70% of all goods sold.
- High and Small stores account for roughly the same amount of sales.
Store_Location_City_Type¶
labeled_barplot(data, "Store_Location_City_Type", perc = True)
Observations:
- Nearly 72% of the products have been sold from stores which are located in Tier 2 cities.
- The numbers of products sold from stores located in Tier 1 and Tier 3 cities are almost the same.
Store_Type¶
labeled_barplot(data, "Store_Type", perc = True)
Observations:
- Supermarket Type2 stores account for over 53% of the products sold.
- Departmental Stores, Food Marts, and Supermarket Type1 stores all have similar numbers of products sold.
Bivariate Analysis¶
# Create a copy of the dataset data dropping all non-numeric values
# Create a copy of the dataset
data_numeric = data.copy()
# Drop non-numeric columns
data_numeric = data_numeric.drop(['Product_Id', 'Product_Sugar_Content', 'Product_Type', 'Store_Id', 'Store_Size', 'Store_Location_City_Type', 'Store_Type'], axis=1)
# Display the first few rows of the new DataFrame
data_numeric.head()
Product_Weight | Product_Allocated_Area | Product_MRP | Store_Establishment_Year | Product_Store_Sales_Total | |
---|---|---|---|---|---|
0 | 12.66 | 0.027 | 117.08 | 2009 | 2842.40 |
1 | 16.54 | 0.144 | 171.43 | 1999 | 4830.02 |
2 | 14.28 | 0.031 | 162.08 | 1987 | 4130.16 |
3 | 12.10 | 0.112 | 186.31 | 1987 | 4132.18 |
4 | 9.57 | 0.010 | 123.67 | 1998 | 2279.36 |
# Create the correlation heatmap
data_numeric.corr()
Product_Weight | Product_Allocated_Area | Product_MRP | Store_Establishment_Year | Product_Store_Sales_Total | |
---|---|---|---|---|---|
Product_Weight | 1.000000 | 0.014754 | 0.532716 | -0.161907 | 0.737955 |
Product_Allocated_Area | 0.014754 | 1.000000 | -0.009508 | 0.004467 | -0.000933 |
Product_MRP | 0.532716 | -0.009508 | 1.000000 | -0.189357 | 0.787989 |
Store_Establishment_Year | -0.161907 | 0.004467 | -0.189357 | 1.000000 | -0.185027 |
Product_Store_Sales_Total | 0.737955 | -0.000933 | 0.787989 | -0.185027 | 1.000000 |
Correlation for Numerical Variables¶
# Plot the heatmap on data_numeric
plt.figure(figsize=(10, 8))
sns.heatmap(data_numeric.corr(), annot=True, fmt = '.2f', cmap='coolwarm')
# fmt is the precison (number of decimals)
plt.show()
Observations:
- Our target variable Product_Store_Sales_Total is highly correlated with Product_Weight and Product_MRP.
- Product_Weight and Product_MRP are moderately correlated.
- Interestingly, Store_Establishment_Year has a negative correlation with most other features.
Correlation for Categorical Variables¶
#data_rev = data.groupby(['Product_Id'], as_index = False)['Product_Store_Sales_Total'].sum()
#sns.barplot(x = 'Product_Id', y = 'Product_Store_Sales_Total', data = data_rev)
# Alternately, we could do the following:
# Visualize the effect of Product_Id on Product_Store_Sales_Total
plt.figure(figsize=(10, 6))
sns.barplot(data = data, x = 'Product_Id', y = 'Product_Store_Sales_Total', estimator = np.sum, ci = 1, palette = 'icefire')
plt.show()
## Removed as it did not give any insights
# Visualize the effect of Product_Weight on Product_Store_Sales_Total
#plt.figure(figsize=(25, 8))
#sns.barplot(data = data, x = 'Product_Weight', y = 'Product_Store_Sales_Total', estimator = np.sum, ci = 1, palette = 'icefire')
#plt.xticks(rotation = 90)
#plt.show()
# Visualize the effect of Product_Sugar_Content on Product_Store_Sales_Total
plt.figure(figsize=(10, 6))
sns.barplot(data = data, x = 'Product_Sugar_Content', y = 'Product_Store_Sales_Total', estimator = np.sum, ci = 1, palette = 'icefire')
plt.xticks(rotation = 90)
plt.show()
# Visualize the effect of Product_Type on Product_Store_Sales_Total
plt.figure(figsize=(25, 8))
sns.barplot(data = data, x = 'Product_Type', y = 'Product_Store_Sales_Total', estimator = np.sum, ci = 1, palette = 'icefire')
plt.xticks(rotation = 90)
plt.show()
# Visualize the effect of Store_Id on Product_Store_Sales_Total
plt.figure(figsize=(10, 6))
sns.barplot(data = data, x = 'Store_Id', y = 'Product_Store_Sales_Total', estimator = np.sum, ci = 1, palette = 'icefire')
plt.xticks(rotation = 90)
plt.show()
# Visualize the effect of Store_Size on Product_Store_Sales_Total
plt.figure(figsize=(10, 6))
sns.barplot(data = data, x = 'Store_Size', y = 'Product_Store_Sales_Total', estimator = np.sum, ci = 1, palette = 'icefire')
plt.xticks(rotation = 90)
plt.show()
# Visualize the effect of Store_Location_City_Type on Product_Store_Sales_Total
plt.figure(figsize=(10, 6))
sns.barplot(data = data, x = 'Store_Location_City_Type', y = 'Product_Store_Sales_Total', estimator = np.sum, ci = 1, palette = 'icefire')
plt.xticks(rotation = 90)
plt.show()
# Visualize the effect of Store_Type on Product_Store_Sales_Total
plt.figure(figsize=(10, 6))
sns.barplot(data = data, x = 'Store_Type', y = 'Product_Store_Sales_Total', estimator = np.sum, ci = 1, palette = 'icefire')
plt.xticks(rotation = 90)
plt.show()
Target Variable Distribution with Respect to Numerical Features¶
# Visualize the relationship between Product_Weight and Product_Store_Sales_Total
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Product_Weight', y='Product_Store_Sales_Total', data=data)
plt.title('Product Weight vs. Product Store Sales Total')
plt.xlabel('Product Weight')
plt.ylabel('Product Store Sales Total')
plt.show()
Observations:
- Product_Weight and Product_Store_Sales_Total are approximately linearly correlated.
# Visualize the relationship between Product_Allocated_Area and Product_Store_Sales_Total
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Product_Allocated_Area', y='Product_Store_Sales_Total', data=data)
plt.title('Product Allocated Area vs. Product Store Sales Total')
plt.xlabel('Product Allocated Area')
plt.ylabel('Product Store Sales Total')
plt.show()
Observations:
- There does not appear to be a relationship between Product_Allocated_Area and Product_Store_Sales_Total.
# Visualize the relationship between Product_MRP and Product_Store_Sales_Total
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Product_MRP', y='Product_Store_Sales_Total', data=data)
plt.title('Product MRP vs. Product Store Sales Total')
plt.xlabel('Product MRP')
plt.ylabel('Product Store Sales Total')
plt.show()
Observations:
- Product_MRP and Product_Store_Sales_Total are approximately linearly correlated.
## Removed - not useful
# Visualize the relationship between Store_Establishment_Year and Product_Store_Sales_Total
#plt.figure(figsize=(10, 6))
#sns.scatterplot(x='Store_Establishment_Year', y='Product_Store_Sales_Total', data=data)
#plt.title('Store Establishment Year vs. Product Store Sales Total')
#plt.xlabel('Store Establishment Year')
#plt.ylabel('Product Store Sales Total')
#plt.show()
Observations:
Target Variable Distribution with Respect to Categorical Features¶
# Visualize the relationship between Product_Id and Product_Store_Sales_Total
plt.figure(figsize=(25, 8))
sns.boxplot(x='Product_Id', y='Product_Store_Sales_Total', data=data, palette = 'icefire')
plt.title('Product ID vs. Product Store Sales Total')
plt.xlabel('Product ID')
plt.ylabel('Product Store Sales Total')
plt.show()
Observations:
- The differences between the Product_Id types are negligible.
# Visualize the relationship between Product_Sugar_Content and Product_Store_Sales_Total
plt.figure(figsize=(10, 6))
sns.boxplot(x='Product_Sugar_Content', y='Product_Store_Sales_Total', data=data, palette = 'icefire')
plt.title('Product Sugar Content vs. Product Store Sales Total')
plt.xlabel('Product Sugar Content')
plt.ylabel('Product Store Sales Total')
plt.show()
Observations:
- The effect of differences in Product_Sugar_Content appears to be negligible.
# Visualize the relationship between Product_Type and Product_Store_Sales_Total
plt.figure(figsize=(25, 8))
sns.boxplot(x='Product_Type', y='Product_Store_Sales_Total', data=data, palette = 'icefire')
plt.title('Product Type vs. Product Store Sales Total')
plt.xlabel('Product Type')
plt.ylabel('Product Store Sales Total')
plt.show()
Observations:
- The effect of differences in Product_Type appears to be minimal.
# Visualize the relationship between Store_Id and Product_Store_Sales_Total
plt.figure(figsize=(10, 6))
sns.boxplot(x='Store_Id', y='Product_Store_Sales_Total', data=data, palette = 'icefire')
plt.title('Store ID vs. Product Store Sales Total')
plt.xlabel('Store ID')
plt.ylabel('Product Store Sales Total')
plt.show()
Observations:
- OUT003 has the highest median.
- OUT002 has the lowest median.
# Visualize the relationship between Store_Size and Product_Store_Sales_Total
plt.figure(figsize=(10, 6))
sns.boxplot(x='Store_Size', y='Product_Store_Sales_Total', data=data, palette = 'icefire')
plt.title('Store Size vs. Product Store Sales Total')
plt.xlabel('Store Size')
plt.ylabel('Product Store Sales Total')
plt.show()
Observations:
- High sized stores appear to have the highest median.
- Small sized stores have the lowest median.
# Visualize the relationship between Store_Location_City_Type and Product_Store_Sales_Total
plt.figure(figsize=(10, 6))
sns.boxplot(x='Store_Location_City_Type', y='Product_Store_Sales_Total', data=data, palette = 'icefire')
plt.title('Store Location City Type vs. Product Store Sales Total')
plt.xlabel('Store Location City Type')
plt.ylabel('Product Store Sales Total')
plt.show()
Observations:
- Stores in Tier 1 cities have the highest median.
- Stores in Tier 3 cities have the lowest median.
# Visualize the relationship between Store_Type and Product_Store_Sales_Total
plt.figure(figsize=(10, 6))
sns.boxplot(x='Store_Type', y='Product_Store_Sales_Total', data=data, palette = 'icefire')
plt.title('Store Type vs. Product Store Sales Total')
plt.xlabel('Store Type')
plt.ylabel('Product Store Sales Total')
plt.show()
Observations:
- Department Stores have the highest median.
- Food Marts have the lowest median.
Store Data Analysis - Deep Dive - Not complete yet, can be expanded¶
Let's look at the effect of various store characteristics on Product_MRP to get some insight into the types of stores we are dealing with
data.groupby(['Store_Id'])['Product_Store_Sales_Total'].sum()
Product_Store_Sales_Total | |
---|---|
Store_Id | |
OUT001 | 6223113.18 |
OUT002 | 2030909.72 |
OUT003 | 6673457.57 |
OUT004 | 15427583.43 |
# Function to format the y-axis labels as plain numbers
def format_yaxis(x, pos):
return f'{int(x):,}'
# Create a subplot grid with 1 row and 4 columns, sharing the y-axis
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(20, 6), sharey=True)
# Add a main title for the entire figure
fig.suptitle('Product_Id Vs Product_Store_Sales_Total', fontsize=16)
# Iterate through stores and create a plot for each
for i, store_id in enumerate(['OUT001', 'OUT002', 'OUT003', 'OUT004']):
# Plot the barplot on the corresponding axes
sns.barplot(data=data[data['Store_Id'] == store_id], x='Product_Id', y='Product_Store_Sales_Total',
estimator=np.sum, ci=1, palette='icefire', ax=axes[i])
# Set the title for each subplot
axes[i].tick_params(axis='x', rotation=90)
axes[i].set_title(f"{store_id}")
# Apply the y-axis formatting to avoid exponents
axes[i].yaxis.set_major_formatter(FuncFormatter(format_yaxis))
# Adjust the layout to prevent overlap
plt.tight_layout(rect=[0, 0.05, 1, 0.95]) # Adjust bottom and top margins to fit the suptitle and x-axis label
plt.show()
# Add some spacing between the two figures
print('\n')
# Function to format the y-axis labels as plain numbers
def format_yaxis(x, pos):
return f'{int(x):,}'
# Create a subplot grid with 1 row and 4 columns, sharing the y-axis
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(20, 6), sharey=True)
# Add a main title for the entire figure
fig.suptitle('Product_Id Vs Product_MRP', fontsize=16)
# Iterate through stores and create a plot for each
for i, store_id in enumerate(['OUT001', 'OUT002', 'OUT003', 'OUT004']):
# Plot the barplot on the corresponding axes
sns.barplot(data=data[data['Store_Id'] == store_id], x='Product_Id', y='Product_MRP',
estimator=np.sum, ci=1, palette='icefire', ax=axes[i])
# Set the title for each subplot
axes[i].tick_params(axis='x', rotation=90)
axes[i].set_title(f"{store_id}")
# Apply the y-axis formatting to avoid exponents
axes[i].yaxis.set_major_formatter(FuncFormatter(format_yaxis))
# Adjust the layout to prevent overlap
plt.tight_layout(rect=[0, 0.05, 1, 0.95]) # Adjust bottom and top margins to fit the suptitle and x-axis label
plt.show()
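Since this per-store plotting cell is repeated several times below with only the x and y columns changing, a small reusable helper could reduce the duplication. A minimal sketch (the helper name plot_by_store is hypothetical; it simply wraps the same seaborn calls used above):
# Reusable helper: one summed barplot per store for any x/y pair (sketch)
def plot_by_store(df, x, y, stores=('OUT001', 'OUT002', 'OUT003', 'OUT004')):
    fig, axes = plt.subplots(nrows=1, ncols=len(stores), figsize=(20, 6), sharey=True)
    fig.suptitle(f'{x} Vs {y}', fontsize=16)
    for ax, store_id in zip(axes, stores):
        sns.barplot(data=df[df['Store_Id'] == store_id], x=x, y=y,
                    estimator=np.sum, ci=1, palette='icefire', ax=ax)
        ax.tick_params(axis='x', rotation=90)
        ax.set_title(store_id)
        ax.yaxis.set_major_formatter(FuncFormatter(lambda v, pos: f'{int(v):,}'))
    plt.tight_layout(rect=[0, 0.05, 1, 0.95])
    plt.show()
# Example usage, equivalent to the first figure above:
# plot_by_store(data, 'Product_Id', 'Product_Store_Sales_Total')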
Observations:
# Function to format the y-axis labels as plain numbers
def format_yaxis(x, pos):
return f'{int(x):,}'
# Create a subplot grid with 1 row and 4 columns, sharing the y-axis
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(20, 6), sharey=True)
# Add a main title for the entire figure
fig.suptitle('Product_Sugar_Content Vs Product_Store_Sales_Total', fontsize=16)
# Iterate through stores and create a plot for each
for i, store_id in enumerate(['OUT001', 'OUT002', 'OUT003', 'OUT004']):
# Plot the barplot on the corresponding axes
sns.barplot(data=data[data['Store_Id'] == store_id], x='Product_Sugar_Content', y='Product_Store_Sales_Total',
estimator=np.sum, ci=1, palette='icefire', ax=axes[i])
# Set the title for each subplot
axes[i].tick_params(axis='x', rotation=90)
axes[i].set_title(f"{store_id}")
# Apply the y-axis formatting to avoid exponents
axes[i].yaxis.set_major_formatter(FuncFormatter(format_yaxis))
# Adjust the layout to prevent overlap
plt.tight_layout(rect=[0, 0.05, 1, 0.95]) # Adjust bottom and top margins to fit the suptitle and x-axis label
plt.show()
# Add some spacing between the two figures
print('\n')
# Function to format the y-axis labels as plain numbers
def format_yaxis(x, pos):
return f'{int(x):,}'
# Create a subplot grid with 1 row and 4 columns, sharing the y-axis
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(20, 6), sharey=True)
# Add a main title for the entire figure
fig.suptitle('Product_Sugar_Content Vs Product_MRP', fontsize=16)
# Iterate through stores and create a plot for each
for i, store_id in enumerate(['OUT001', 'OUT002', 'OUT003', 'OUT004']):
# Plot the barplot on the corresponding axes
sns.barplot(data=data[data['Store_Id'] == store_id], x='Product_Sugar_Content', y='Product_MRP',
estimator=np.sum, ci=1, palette='icefire', ax=axes[i])
# Set the title for each subplot
axes[i].tick_params(axis='x', rotation=90)
axes[i].set_title(f"{store_id}")
# Apply the y-axis formatting to avoid exponents
axes[i].yaxis.set_major_formatter(FuncFormatter(format_yaxis))
# Adjust the layout to prevent overlap
plt.tight_layout(rect=[0, 0.05, 1, 0.95]) # Adjust bottom and top margins to fit the suptitle and x-axis label
plt.show()
Observations:
# Function to format the y-axis labels as plain numbers
def format_yaxis(x, pos):
return f'{int(x):,}'
# Create a subplot grid with 1 row and 4 columns, sharing the y-axis
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(20, 6), sharey=True)
# Add a main title for the entire figure
fig.suptitle('Store_Location_City_Type Vs Product_Store_Sales_Total', fontsize=16)
# Iterate through stores and create a plot for each
for i, store_id in enumerate(['OUT001', 'OUT002', 'OUT003', 'OUT004']):
# Plot the barplot on the corresponding axes
sns.barplot(data=data[data['Store_Id'] == store_id], x='Store_Location_City_Type', y='Product_Store_Sales_Total',
estimator=np.sum, ci=1, palette='icefire', ax=axes[i])
# Set the title for each subplot
axes[i].tick_params(axis='x', rotation=90)
axes[i].set_title(f"Store: {store_id}")
# Apply the y-axis formatting to avoid exponents
axes[i].yaxis.set_major_formatter(FuncFormatter(format_yaxis))
# Add a single x-axis title
fig.text(0.5, 0.04, 'Store_Location_City_Type', ha='center', fontsize=14)
# Adjust the layout to prevent overlap
plt.tight_layout(rect=[0, 0.05, 1, 0.95]) # Adjust bottom and top margins to fit the suptitle and x-axis label
plt.show()
# Add some spacing between the two figures
print('\n')
# Function to format the y-axis labels as plain numbers
def format_yaxis(x, pos):
return f'{int(x):,}'
# Create a subplot grid with 1 row and 4 columns, sharing the y-axis
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(20, 6), sharey=True)
# Add a main title for the entire figure
fig.suptitle('Store_Location_City_Type Vs Product_MRP', fontsize=16)
# Iterate through stores and create a plot for each
for i, store_id in enumerate(['OUT001', 'OUT002', 'OUT003', 'OUT004']):
# Plot the barplot on the corresponding axes
sns.barplot(data=data[data['Store_Id'] == store_id], x='Store_Location_City_Type', y='Product_MRP',
estimator=np.sum, ci=1, palette='icefire', ax=axes[i])
# Set the title for each subplot
axes[i].tick_params(axis='x', rotation=90)
axes[i].set_title(f"{store_id}")
# Apply the y-axis formatting to avoid exponents
axes[i].yaxis.set_major_formatter(FuncFormatter(format_yaxis))
# Adjust the layout to prevent overlap
plt.tight_layout(rect=[0, 0.05, 1, 0.95]) # Adjust bottom and top margins to fit the suptitle and x-axis label
plt.show()
Observations:
# Function to format the y-axis labels as plain numbers
def format_yaxis(x, pos):
return f'{int(x):,}'
# Create a subplot grid with 1 row and 4 columns, sharing the y-axis
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(20, 6), sharey=True)
# Add a main title for the entire figure
fig.suptitle('Store_Type Vs Product_Store_Sales_Total', fontsize=16)
# Iterate through stores and create a plot for each
for i, store_id in enumerate(['OUT001', 'OUT002', 'OUT003', 'OUT004']):
# Plot the barplot on the corresponding axes
sns.barplot(data=data[data['Store_Id'] == store_id], x='Store_Type', y='Product_Store_Sales_Total',
estimator=np.sum, ci=1, palette='icefire', ax=axes[i])
# Set the title for each subplot
axes[i].tick_params(axis='x', rotation=90)
axes[i].set_title(f"Store: {store_id}")
# Apply the y-axis formatting to avoid exponents
axes[i].yaxis.set_major_formatter(FuncFormatter(format_yaxis))
# Add a single x-axis title
fig.text(0.5, 0.04, 'Store_Type', ha='center', fontsize=14)
# Adjust the layout to prevent overlap
plt.tight_layout(rect=[0, 0.05, 1, 0.95]) # Adjust bottom and top margins to fit the suptitle and x-axis label
plt.show()
# Add some spacing between the two figures
print('\n')
# Function to format the y-axis labels as plain numbers
def format_yaxis(x, pos):
return f'{int(x):,}'
# Create a subplot grid with 1 row and 4 columns, sharing the y-axis
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(20, 6), sharey=True)
# Add a main title for the entire figure
fig.suptitle('Store_Type Vs Product_MRP', fontsize=16)
# Iterate through stores and create a plot for each
for i, store_id in enumerate(['OUT001', 'OUT002', 'OUT003', 'OUT004']):
# Plot the barplot on the corresponding axes
sns.barplot(data=data[data['Store_Id'] == store_id], x='Store_Type', y='Product_MRP',
estimator=np.sum, ci=1, palette='icefire', ax=axes[i])
# Set the title for each subplot
axes[i].tick_params(axis='x', rotation=90)
axes[i].set_title(f"{store_id}")
# Apply the y-axis formatting to avoid exponents
axes[i].yaxis.set_major_formatter(FuncFormatter(format_yaxis))
# Adjust the layout to prevent overlap
plt.tight_layout(rect=[0, 0.05, 1, 0.95]) # Adjust bottom and top margins to fit the suptitle and x-axis label
plt.show()
Observations:
# Visualize the relationship between Product_Id and Product_MRP
plt.figure(figsize = [10, 8])
sns.boxplot(data = data, x = data.Product_Id, y = data.Product_MRP, palette = 'icefire')
plt.xticks(rotation = 90)
plt.title("Boxplot - Product_ID Vs Product_MRP")
plt.xlabel("Product_Id")
plt.ylabel("Product_MRP (of each product)")
plt.show()
Observations:
- The median of all MRP by Product_Id is roughly equal.
# Visualize the relationship between Product_Type and Product_MRP
plt.figure(figsize = [25, 8])
sns.boxplot(data = data, x = data.Product_Type, y = data.Product_MRP, palette = 'icefire')
plt.xticks(rotation = 90)
plt.title("Boxplot - Product_Type Vs Product_MRP")
plt.xlabel("Product_Type")
plt.ylabel("Product_MRP (of each product)")
plt.show()
Observations:
- The median of all MRP by Product_Type is roughly equal.
Product Analysis - Deep Dive - Not complete yet, can be expanded¶
# Investigate the product distribution between stores - does store OUT003 carry the same items (but perhaps at a different price) as store OUT004?
pd.crosstab(data.Store_Id, data.Product_Type)
Product_Type | Baking Goods | Breads | Breakfast | Canned | Dairy | Frozen Foods | Fruits and Vegetables | Hard Drinks | Health and Hygiene | Household | Meat | Others | Seafood | Snack Foods | Soft Drinks | Starchy Foods |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Store_Id | ||||||||||||||||
OUT001 | 136 | 30 | 10 | 119 | 150 | 142 | 199 | 38 | 114 | 134 | 130 | 31 | 13 | 202 | 106 | 32 |
OUT002 | 96 | 23 | 15 | 88 | 104 | 101 | 168 | 30 | 91 | 100 | 87 | 19 | 10 | 146 | 62 | 12 |
OUT003 | 99 | 34 | 19 | 90 | 145 | 122 | 182 | 23 | 89 | 107 | 106 | 32 | 13 | 186 | 74 | 28 |
OUT004 | 385 | 113 | 62 | 380 | 397 | 446 | 700 | 95 | 334 | 399 | 295 | 69 | 40 | 615 | 277 | 69 |
# Visualize this
plt.figure(figsize=(15, 6))
sns.heatmap(pd.crosstab(data.Store_Id, data.Product_Type),
annot = True,
cmap = 'icefire',
fmt = 'g')
plt.show()
plt.figure(figsize=(15, 6))
sns.heatmap(pd.crosstab(data.Product_Sugar_Content, data.Product_Type),
annot = True,
cmap = 'icefire',
fmt = 'g')
plt.show()
Data Preprocessing¶
Let's convert Store_Establishment_Year into something more meaningful and easier for the model to use, so let's transform it into a store age.
# Store Age
# Check if 'Store_Established_Year' exists in the DataFrame
if 'Store_Establishment_Year' in data.columns:
data['Store_Age'] = 2024 - data['Store_Establishment_Year']
else:
print("'Store_Establishment_Year' column not found in the DataFrame.")
# We no longer need the Store_Establishment_Year column, so let's drop it
data.drop('Store_Establishment_Year', axis=1, inplace=True)
There are currently 16 different product types; let's categorize those so we can reduce the number of dimensions. There are a number of ways we can do this; for this example let's simplify things into two categories, perishable and non-perishable.
# Find the unique product types that exist in the data frame
#print(data['Product_Type'].unique())
# Or we can do the following (which may be easier to read)
data['Product_Type'].value_counts().index.to_list()
['Fruits and Vegetables', 'Snack Foods', 'Frozen Foods', 'Dairy', 'Household', 'Baking Goods', 'Canned', 'Health and Hygiene', 'Meat', 'Soft Drinks', 'Breads', 'Hard Drinks', 'Others', 'Starchy Foods', 'Breakfast', 'Seafood']
# Create a list of perishable items
perishables = [
"Dairy",
"Meat",
"Fruits and Vegetables",
"Breakfast",
"Breads",
"Seafood"
]
# Define a function to substitute the new categories for the originals based on our list
def change(x):
if x in perishables:
return "Perishables"
else:
return "Non Perishables"
# Apply the function to preview the changes
data.Product_Type.apply(change)
Product_Type | |
---|---|
0 | Non Perishables |
1 | Perishables |
2 | Non Perishables |
3 | Non Perishables |
4 | Non Perishables |
... | ... |
8758 | Non Perishables |
8759 | Non Perishables |
8760 | Non Perishables |
8761 | Non Perishables |
8762 | Non Perishables |
8763 rows × 1 columns
# Perform the transformation by appending a new column to the end of the data frame
change1 = []
for i in range(0, len(data)):
if data.Product_Type[i] in perishables:
change1.append("Perishables")
else:
change1.append("Non Perishables")
data["Product_Type_Categories"] = change1
# We can now drop the Product_Type column
data.drop('Product_Type', axis=1, inplace=True)
# Check the transformation
data.head()
Product_Id | Product_Weight | Product_Sugar_Content | Product_Allocated_Area | Product_MRP | Store_Id | Store_Size | Store_Location_City_Type | Store_Type | Product_Store_Sales_Total | Store_Age | Product_Type_Categories | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | FD | 12.66 | Low Sugar | 0.027 | 117.08 | OUT004 | Medium | Tier 2 | Supermarket Type2 | 2842.40 | 15 | Non Perishables |
1 | FD | 16.54 | Low Sugar | 0.144 | 171.43 | OUT003 | Medium | Tier 1 | Departmental Store | 4830.02 | 25 | Perishables |
2 | FD | 14.28 | Regular | 0.031 | 162.08 | OUT001 | High | Tier 2 | Supermarket Type1 | 4130.16 | 37 | Non Perishables |
3 | FD | 12.10 | Low Sugar | 0.112 | 186.31 | OUT001 | High | Tier 2 | Supermarket Type1 | 4132.18 | 37 | Non Perishables |
4 | NC | 9.57 | No Sugar | 0.010 | 123.67 | OUT002 | Small | Tier 3 | Food Mart | 2279.36 | 26 | Non Perishables |
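As a design note, the same categorization can be done without the explicit loop used above. A minimal sketch using pandas' isin and numpy.where, shown on a fresh copy named alt (a hypothetical name) since Product_Type has already been dropped from data:
# Vectorized alternative to the loop above (sketch); uses the original CSV copy `kart`
alt = kart.copy()
alt['Product_Type_Categories'] = np.where(
    alt['Product_Type'].isin(perishables), 'Perishables', 'Non Perishables'
)
alt['Product_Type_Categories'].value_counts()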
Outlier Checks for Numerical Features¶
We need to check for, and take care of, any outliers that could throw off our predictions.
# Outlier detection using boxplot
#numeric_columns = data.select_dtypes(include=np.number).columns.tolist()
#numeric_columns.remove("Store_Establishment_Year")
#numeric_columns.remove("Store_Age_Years")
numeric_columns = data.select_dtypes(include=np.number).columns.tolist()
numeric_columns
['Product_Weight', 'Product_Allocated_Area', 'Product_MRP', 'Product_Store_Sales_Total', 'Store_Age']
plt.figure(figsize = (20, 15))
for i, variable in enumerate(numeric_columns):
plt.subplot(4, 5, i + 1)
plt.boxplot(data[variable], whis = 1.5)
plt.tight_layout()
plt.title(variable)
plt.show()
Observations:
- The data contains many outliers; however, all are explainable
- We will not treat them, as none are out of the ordinary.
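As a quick numeric check on the boxplots above, here is a small sketch (assuming the standard 1.5 × IQR whisker rule and the numeric_columns list defined earlier) that counts how many observations fall outside the whiskers for each numeric feature:
# Count points outside the 1.5 * IQR whiskers for each numeric column (sketch)
for col in numeric_columns:
    q1, q3 = data[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    n_out = ((data[col] < lower) | (data[col] > upper)).sum()
    print(f"{col}: {n_out} potential outliers ({100 * n_out / len(data):.2f}% of rows)")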
Data Preparation for Modeling¶
We need to encode categorical data using one-hot encoding
# Determine the types of features present in the data frame
data.dtypes.value_counts()
count | |
---|---|
object | 7 |
float64 | 4 |
int64 | 1 |
data.select_dtypes(include='object').columns.to_list()
['Product_Id', 'Product_Sugar_Content', 'Store_Id', 'Store_Size', 'Store_Location_City_Type', 'Store_Type', 'Product_Type_Categories']
# Perform OHE
data_OHE = pd.get_dummies(
data,
columns = data.select_dtypes(include='object').columns,
drop_first=True
)
data_OHE.head()
Product_Weight | Product_Allocated_Area | Product_MRP | Product_Store_Sales_Total | Store_Age | Product_Id_FD | Product_Id_NC | Product_Sugar_Content_No Sugar | Product_Sugar_Content_Regular | Store_Id_OUT002 | Store_Id_OUT003 | Store_Id_OUT004 | Store_Size_Medium | Store_Size_Small | Store_Location_City_Type_Tier 2 | Store_Location_City_Type_Tier 3 | Store_Type_Food Mart | Store_Type_Supermarket Type1 | Store_Type_Supermarket Type2 | Product_Type_Categories_Perishables | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 12.66 | 0.027 | 117.08 | 2842.40 | 15 | True | False | False | False | False | False | True | True | False | True | False | False | False | True | False |
1 | 16.54 | 0.144 | 171.43 | 4830.02 | 25 | True | False | False | False | False | True | False | True | False | False | False | False | False | False | True |
2 | 14.28 | 0.031 | 162.08 | 4130.16 | 37 | True | False | False | True | False | False | False | False | False | True | False | False | True | False | False |
3 | 12.10 | 0.112 | 186.31 | 4132.18 | 37 | True | False | False | False | False | False | False | False | False | True | False | False | True | False | False |
4 | 9.57 | 0.010 | 123.67 | 2279.36 | 26 | False | True | True | False | True | False | False | False | True | False | True | True | False | False | False |
x = data_OHE.drop('Product_Store_Sales_Total', axis = 1) # These are the features
y = data_OHE['Product_Store_Sales_Total'] # This is the target
# Let's make sure everything looks as it should
x.shape, y.shape
((8763, 19), (8763,))
# Let's use the statsmodels library (we could also use scikit-learn)
# Create a constant (intercept) column and prepend it onto the data frame
x = sm.add_constant(x)
x.head(5)
const | Product_Weight | Product_Allocated_Area | Product_MRP | Store_Age | Product_Id_FD | Product_Id_NC | Product_Sugar_Content_No Sugar | Product_Sugar_Content_Regular | Store_Id_OUT002 | Store_Id_OUT003 | Store_Id_OUT004 | Store_Size_Medium | Store_Size_Small | Store_Location_City_Type_Tier 2 | Store_Location_City_Type_Tier 3 | Store_Type_Food Mart | Store_Type_Supermarket Type1 | Store_Type_Supermarket Type2 | Product_Type_Categories_Perishables | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.0 | 12.66 | 0.027 | 117.08 | 15 | True | False | False | False | False | False | True | True | False | True | False | False | False | True | False |
1 | 1.0 | 16.54 | 0.144 | 171.43 | 25 | True | False | False | False | False | True | False | True | False | False | False | False | False | False | True |
2 | 1.0 | 14.28 | 0.031 | 162.08 | 37 | True | False | False | True | False | False | False | False | False | True | False | False | True | False | False |
3 | 1.0 | 12.10 | 0.112 | 186.31 | 37 | True | False | False | False | False | False | False | False | False | True | False | False | True | False | False |
4 | 1.0 | 9.57 | 0.010 | 123.67 | 26 | False | True | True | False | True | False | False | False | True | False | True | True | False | False | False |
Split Data¶
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 1)
x_train.shape, x_test.shape, y_train.shape, y_test.shape
((6134, 20), (2629, 20), (6134,), (2629,))
Model Training¶
Step One: Multicollinearity Analysis¶
We will use the Variance Inflation Factor (VIF) to check if there is multicollinearity in the data.
Features having a VIF score > 5 will be dropped/treated till all the features have a VIF score < 5.
Note: It is quite likely that the VIF scores for the categorical features we one-hot encoded (OHE) will be high; we can safely ignore that for those features.
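For feature j, VIF is 1 / (1 - Rj^2), where Rj^2 is the R-squared from regressing feature j on all the other features. The cells below inspect the VIFs manually; as a minimal sketch of the iterative drop-until-below-threshold procedure described above (the helper name drop_high_vif is hypothetical, and keeping the intercept column is an assumption):
from statsmodels.stats.outliers_influence import variance_inflation_factor

def drop_high_vif(X, thresh=5.0):
    """Iteratively drop the feature with the highest VIF until all remaining VIFs are <= thresh (sketch)."""
    X = X.astype(float).copy()
    while True:
        vifs = pd.Series(
            [variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
            index=X.columns,
        ).drop('const', errors='ignore')  # never drop the intercept column
        if vifs.empty or vifs.max() <= thresh:
            return X
        worst = vifs.idxmax()
        print(f"Dropping {worst} (VIF = {vifs[worst]:.2f})")
        X = X.drop(columns=worst)
Applied to the float-converted training features created below (x_train1), this would remove the worst offender one feature at a time; as noted above, one-hot dummies can legitimately show high VIFs, so the result should be reviewed rather than applied blindly.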
# We need to convert the data type of the T/F data to 1/0 for the actual regression
x_train1 = x_train.astype(float)
y_train1 = y_train.astype(float)
# We need to do the same thing for the test data
x_test1 = x_test.astype(float)
y_test1 = y_test.astype(float)
# To see how this works you can compute it in the following way, for a single column
variance_inflation_factor(x_train1.values, 0)
0.0
# Create an empty dataframe
VIF_df = pd.DataFrame()
VIF_df['Features'] = x_train1.columns
VIF_df['VIF'] = [variance_inflation_factor(x_train1.values, i) for i in range(x_train1.shape[1])]
VIF_df
Features | VIF | |
---|---|---|
0 | const | 0.000000 |
1 | Product_Weight | 1.752928 |
2 | Product_Allocated_Area | 1.001184 |
3 | Product_MRP | 1.885058 |
4 | Store_Age | inf |
5 | Product_Id_FD | 2.774371 |
6 | Product_Id_NC | inf |
7 | Product_Sugar_Content_No Sugar | inf |
8 | Product_Sugar_Content_Regular | 1.088506 |
9 | Store_Id_OUT002 | inf |
10 | Store_Id_OUT003 | inf |
11 | Store_Id_OUT004 | inf |
12 | Store_Size_Medium | inf |
13 | Store_Size_Small | inf |
14 | Store_Location_City_Type_Tier 2 | inf |
15 | Store_Location_City_Type_Tier 3 | inf |
16 | Store_Type_Food Mart | inf |
17 | Store_Type_Supermarket Type1 | inf |
18 | Store_Type_Supermarket Type2 | inf |
19 | Product_Type_Categories_Perishables | 1.217690 |
Note: the VIF of the calculated Store_Age column is inf, meaning that the information contained in the column can be exactly reconstructed from the other columns (each store has a single establishment year, so Store_Age is fully determined by the Store_Id dummies). We will need to drop that column.
x_train.drop('Store_Age', axis = 1, inplace = True)
# We need to remember to do the same thing in the testing data set as well
x_test.drop('Store_Age', axis = 1, inplace = True)
# Recompute the VIF
# Note: the VIF here is still computed on x_train1 (the float copy), which retains Store_Age;
# for the drop above to change the result, it would also need to be applied to x_train1 and x_test1
# Create an empty dataframe
VIF_df = pd.DataFrame()
VIF_df['Features'] = x_train1.columns
VIF_df['VIF'] = [variance_inflation_factor(x_train1.values, i) for i in range(x_train1.shape[1])]
VIF_df
Features | VIF | |
---|---|---|
0 | const | 0.000000 |
1 | Product_Weight | 1.752928 |
2 | Product_Allocated_Area | 1.001184 |
3 | Product_MRP | 1.885058 |
4 | Store_Age | inf |
5 | Product_Id_FD | 2.774371 |
6 | Product_Id_NC | inf |
7 | Product_Sugar_Content_No Sugar | inf |
8 | Product_Sugar_Content_Regular | 1.088506 |
9 | Store_Id_OUT002 | inf |
10 | Store_Id_OUT003 | inf |
11 | Store_Id_OUT004 | inf |
12 | Store_Size_Medium | inf |
13 | Store_Size_Small | inf |
14 | Store_Location_City_Type_Tier 2 | inf |
15 | Store_Location_City_Type_Tier 3 | inf |
16 | Store_Type_Food Mart | inf |
17 | Store_Type_Supermarket Type1 | inf |
18 | Store_Type_Supermarket Type2 | inf |
19 | Product_Type_Categories_Perishables | 1.217690 |
Building Models¶
Let's create a function to calculate the performance metrics for our regression model so that we don't need to use the same code repeatedly.
model = sm.OLS(y_train1, x_train1).fit()
model.summary()
Dep. Variable: | Product_Store_Sales_Total | R-squared: | 0.824 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.823 |
Method: | Least Squares | F-statistic: | 2860. |
Date: | Wed, 04 Sep 2024 | Prob (F-statistic): | 0.00 |
Time: | 21:04:47 | Log-Likelihood: | -46132. |
No. Observations: | 6134 | AIC: | 9.229e+04 |
Df Residuals: | 6123 | BIC: | 9.236e+04 |
Df Model: | 10 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | -58.4335 | 6.034 | -9.684 | 0.000 | -70.262 | -46.605 |
Product_Weight | 135.0943 | 3.436 | 39.318 | 0.000 | 128.359 | 141.830 |
Product_Allocated_Area | -139.5679 | 118.876 | -1.174 | 0.240 | -372.608 | 93.472 |
Product_MRP | 12.8696 | 0.254 | 50.702 | 0.000 | 12.372 | 13.367 |
Store_Age | 5.0845 | 1.693 | 3.003 | 0.003 | 1.766 | 8.403 |
Product_Id_FD | 0.3932 | 21.936 | 0.018 | 0.986 | -42.609 | 43.396 |
Product_Id_NC | 16.0039 | 12.324 | 1.299 | 0.194 | -8.155 | 40.163 |
Product_Sugar_Content_No Sugar | 16.0039 | 12.324 | 1.299 | 0.194 | -8.155 | 40.163 |
Product_Sugar_Content_Regular | 30.8608 | 13.284 | 2.323 | 0.020 | 4.819 | 56.902 |
Store_Id_OUT002 | -261.6689 | 4.242 | -61.689 | 0.000 | -269.984 | -253.354 |
Store_Id_OUT003 | 323.1208 | 9.566 | 33.779 | 0.000 | 304.368 | 341.873 |
Store_Id_OUT004 | -143.9189 | 6.803 | -21.155 | 0.000 | -157.255 | -130.583 |
Store_Size_Medium | 179.2019 | 14.093 | 12.716 | 0.000 | 151.575 | 206.828 |
Store_Size_Small | -261.6689 | 4.242 | -61.689 | 0.000 | -269.984 | -253.354 |
Store_Location_City_Type_Tier 2 | -119.8854 | 5.097 | -23.523 | 0.000 | -129.877 | -109.894 |
Store_Location_City_Type_Tier 3 | -261.6689 | 4.242 | -61.689 | 0.000 | -269.984 | -253.354 |
Store_Type_Food Mart | -261.6689 | 4.242 | -61.689 | 0.000 | -269.984 | -253.354 |
Store_Type_Supermarket Type1 | 24.0335 | 7.601 | 3.162 | 0.002 | 9.133 | 38.934 |
Store_Type_Supermarket Type2 | -143.9189 | 6.803 | -21.155 | 0.000 | -157.255 | -130.583 |
Product_Type_Categories_Perishables | 7.7111 | 13.233 | 0.583 | 0.560 | -18.231 | 33.653 |
Omnibus: | 1745.262 | Durbin-Watson: | 1.987 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 46497.082 |
Skew: | 0.775 | Prob(JB): | 0.00 |
Kurtosis: | 16.399 | Cond. No. | 6.36e+34 |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 3.51e-62. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
A concern: OUT004 sells more than any other store, in some cases outselling all other stores combined, yet its coefficient is negative. With only four stores, the store-level dummies (Store_Id, Store_Size, Store_Location_City_Type, Store_Type) are perfectly collinear with one another, as shown by the infinite VIFs and the condition-number warning above, so the individual store coefficients are not reliably interpretable.
Predictions¶
y_hat = model.predict(x_train1)
from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_absolute_error, mean_squared_error
# Let's examine the r2 score
r2_score(y_train1, y_hat)
0.8236738620512217
# mean_squared_error
mean_squared_error(y_train1, y_hat)
199511.99287139165
# Model Performance on test and train data
def model_evaluation(model, x, y): # Changed function parameters to accept x and y directly
y_hat = model.predict(x) # Use the correct x for prediction
rmse = np.sqrt(mean_squared_error(y, y_hat))
mae = mean_absolute_error(y, y_hat)
mape = mean_absolute_percentage_error(y, y_hat)
r2 = r2_score(y, y_hat)
return pd.DataFrame({'RMSE': [rmse], 'MAE': [mae], 'MAPE': [mape], 'R2': [r2]})
# Perform the regression on the training data
train = model_evaluation(model, x_train1, y_train1)
# Perform the regression on the test data
test = model_evaluation(model, x_test1, y_test1)
# Label rows
train.index = ['Train']
test.index = ['Test']
# Concatenate the result
pd.concat([train, test], axis = 0)
RMSE | MAE | MAPE | R2 | |
---|---|---|---|---|
Train | 446.667654 | 262.110666 | 0.098564 | 0.823674 |
Test | 446.685287 | 266.690587 | 0.104608 | 0.825641 |
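For reference, with $y_i$ the actual sales, $\hat{y}_i$ the predictions, $\bar{y}$ the mean of the actuals, and $n$ the number of observations, the metrics in the table above are defined as:
$$\mathrm{RMSE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2}, \qquad \mathrm{MAE} = \frac{1}{n}\sum_{i=1}^{n}\lvert y_i - \hat{y}_i\rvert$$
$$\mathrm{MAPE} = \frac{1}{n}\sum_{i=1}^{n}\left\lvert \frac{y_i - \hat{y}_i}{y_i} \right\rvert, \qquad R^2 = 1 - \frac{\sum_{i=1}^{n}(y_i - \hat{y}_i)^2}{\sum_{i=1}^{n}(y_i - \bar{y})^2}$$
(Note that mean_absolute_percentage_error returns a fraction, so a value of 0.0986 corresponds to roughly 9.9%.)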
Observations:¶
RMSE and MAE values are relatively low, indicating that the model's predictions are close to the actual values.
MAPE is under 10%, which suggests that the model's predictions are fairly accurate in percentage terms.
R2 is around 0.824, meaning the model explains about 82.4% of the variance in the data, which is generally considered a good fit.
These results suggest that the model is performing well.
This may be improved by dropping some of the insignificant (high p-value) columns, or by adding complexity back in, for example by restoring the full set of Product_Type categories.
Observations:¶
The model performs slightly worse on the test data, but not to a great extent.
We could take a closer look at any features with a high p-value (p >= 0.05), re-train the model, and re-run the regression.
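A minimal sketch of that idea (assuming the fitted model, x_train1, and y_train1 from above; the 0.05 cutoff, keeping the constant, and dropping all high p-value columns at once are simplifying assumptions, and with the collinear store dummies the p-values themselves are not fully reliable):
# Drop predictors that are not significant at the 5% level, then refit and compare (sketch)
high_p = model.pvalues[model.pvalues >= 0.05].index.drop('const', errors='ignore')
model_reduced = sm.OLS(y_train1, x_train1.drop(columns=high_p)).fit()
print('Dropped columns:', list(high_p))
print(f'Adjusted R-squared before: {model.rsquared_adj:.3f}, after: {model_reduced.rsquared_adj:.3f}')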
Check that the model's residuals have zero mean (one of the linear regression assumptions verified below)
# Residuals of the model
model.resid
# which is the same as this calculation:
y_train1 - model.predict(x_train1)
0 | |
---|---|
910 | -157.136399 |
7022 | -27.190868 |
8056 | 359.346837 |
2939 | 0.843308 |
68 | 126.494755 |
... | ... |
2895 | 165.265382 |
7813 | -240.653427 |
905 | 200.238129 |
5192 | 20.060083 |
235 | -106.886905 |
6134 rows × 1 columns
Checking the below linear regression assumptions¶
- Mean of residuals should be 0
- No Heteroscedasticity
- Linearity of variables
- Normality of error terms
1. Check for mean residuals¶
This should be as close to zero as possible
np.mean(model.resid)
-1.438585163756384e-11
2. Check for homoscedasticity¶
Homoscedasticity - If the residuals are symmetrically distributed across the regression line, then the data is said to be homoscedastic.
Heteroscedasticity- - If the residuals are not symmetrically distributed across the regression line, then the data is said to be heteroscedastic. In this case, the residuals can form a funnel shape or any other non-symmetrical shape.
We'll use the
Goldfeld-Quandt Test
to test the following hypotheses with alpha = 0.05:
- Null hypothesis: Residuals are homoscedastic
- Alternate hypothesis: Residuals are heteroscedastic
result = sms.het_goldfeldquandt(y_train1, x_train1)
result
# The second value is nothing but the p-value
(0.9838049659112555, 0.6741005762249113, 'increasing')
Observation:
- Since p-value > 0.05, we cannot reject the Null Hypothesis that the residuals are homoscedastic and the corresponding assumption is satisfied.
3. Linearity of variables¶
It states that the predictor variables must have a linear relation with the dependent variable.
To test the assumption, we'll plot residuals and the fitted values on a plot and ensure that residuals do not form a strong pattern. They should be randomly and uniformly scattered on the x-axis.
# Plot residual vs fitted values - we need to not see a pattern here
model.fittedvalues # the same as model.predict(x_train1)
# create a scatterplot
plt.scatter(model.fittedvalues, model.resid, alpha = 0.4)
plt.axhline(y = 0, color = 'r', linestyle = '--', alpha = 0.5)
plt.title('Residuals vs Fitted Values')
plt.xlabel('Fitted Values')
plt.ylabel('Residuals')
plt.show()
Observation:
- There is no pattern in the residual vs fitted values plot. Hence, the corresponding assumption is satisfied.
4. Normality of error terms¶
The residuals should be normally distributed.
# Plot histogram of residuals
sns.histplot(model.resid, kde = True)
<Axes: ylabel='Count'>
This looks like a normal distribution, but let's check one more thing to be sure.
stats.probplot(model.resid, dist = 'norm', plot = pylab)
plt.title('Normal Probability Plot of Residuals')
plt.show()
Observation:
- From the above plots, the residuals seem to follow a normal distribution. Hence, the corresponding assumption is satisfied. Now, we will check the model performance on the train and test datasets.
Apply cross validation to improve the model and evaluate it using different evaluation metrics¶
model2 = LinearRegression()
mae_ = cross_val_score(model2, x_train1, y_train1, cv = 10, scoring = 'neg_mean_absolute_percentage_error') # This gives the (negative) mean absolute percentage error for each fold, where cv is the number of folds
# Get the mean of mae_
np.mean(-mae_)
0.09884834456107033
# Std deviation
np.std(-mae_)
0.009025108770564186
# Assume a normal distribution
np.mean(-mae_) - 3 * np.std(-mae_), np.mean(-mae_) + 3 * np.std(-mae_) # the range within which the error stays about 99.7% of the time
(0.07177301824937778, 0.1259236708727629)
# Do the same on the test data set - even though this doesn't make a lot of sense given the model is now being trained on much smaller folds of data
np.mean(cross_val_score(model2, x_test1, y_test1, cv = 10, scoring = 'neg_mean_absolute_percentage_error'))
-0.09818012800305127
Actionable Insights and Business Recommendations¶
We can use this model to predict the total sales that SuperKart will make in the next quarter.
The model explains around 82% of the variation in the data (R-squared of about 0.82 on both the train and test sets).
OUT004 - OUT004, which is of Supermarket Type2, located in a Tier 2 city and having store size as medium, is performing well. SuperKart can look to increase the size of this store from medium to high. They can also look to set up stores in this type of city having comparable socio-economic conditions in order to expand their business and reach.
OUT002 - OUT002, being a food mart in a Tier 3 city and having small store size, is also performing well. SuperKart can look to upgrade its size or target similar cities for business expansion.
OUT001 - OUT001, which is a store of Supermarket Type 1, located in a Tier 2 city and having store size as high, is not performing up to the mark. SuperKart can look to build new marketing strategies (maybe attractive discounts and offers) for this store in order to attract more customers.
OUT003 - A similar approach can be taken to increase the business of OUT003, which is a Departmental Store in a Tier 1 city with a medium store size. It is the premium store of the company where most of the costly goods are sold, so the correct audience should be targeted.
Daily needs like fruits and vegetables and snack foods are the biggest contributors to the revenue across all the stores. So SuperKart should look to maintain the inventory of these products properly and ensure that these products never face a shortage.
Additional information that can be collected to gain better insights -
Customers' details like age and gender can be incorporated in this model so that the company gets to know their target audience well and can build their sales strategies according to that.
The company should also keep a watch for the number of festive occasions present in a quarter so that they can strategize their inventory accordingly.
# Convert notebook to html
!jupyter nbconvert --to html "/content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week Four - Regression and Prediction/Guided Project/SuperKart Sales Prediction.ipynb"