BigMart Sales Prediction¶
Context¶
Large shopping centers, such as big malls and marts, record data on the sales of their items and products as an important step toward forecasting sales and anticipating future demand, which helps with inventory management. Understanding what role certain properties of an item play and how they affect its sales is imperative to any retail business.
The Data Scientists at BigMart have collected 2013 sales data for 1559 products across 10 stores in different cities. Also, certain attributes of each product and store have been defined. Using this data, BigMart is trying to understand the properties of products and stores which play a key role in increasing sales.
Objective¶
To build a predictive model that can predict the sales of each product at a particular store, and then provide actionable recommendations to the BigMart sales team about the properties of products and stores that play a key role in increasing sales.
Dataset¶
Item_Identifier : Unique product ID
Item_Weight : Weight of the product
Item_Fat_Content : Whether the product is low fat or not
Item_Visibility : The % of the total display area of all products in a store allocated to the particular product
Item_Type : The category to which the product belongs
Item_MRP : Maximum Retail Price (list price) of the product
Outlet_Identifier : Unique store ID
Outlet_Establishment_Year : The year in which the store was established
Outlet_Size : The size of the store in terms of ground area covered
Outlet_Location_Type : The type of city in which the store is located
Outlet_Type : Whether the outlet is just a grocery store or some sort of supermarket
Item_Outlet_Sales : Sales of the product in the particular store. This is the outcome variable to be predicted.
We have two datasets - train (8,523 observations) and test (5,681 observations). The training dataset has both the input and output variable(s). We need to predict the sales for the test dataset.
Please note that the data may have missing values as some stores might not report all the data due to technical glitches. Hence, it will be required to treat them accordingly.
Importing the libraries and overview of the dataset¶
# Importing libraries for data manipulation
import numpy as np
import pandas as pd
# Importing libraries for data visualization
import seaborn as sns
import matplotlib.pyplot as plt
# Importing libraries for building linear regression model
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Importing libraries for scaling the data
from sklearn.preprocessing import MinMaxScaler
# To ignore warnings
import warnings
warnings.filterwarnings("ignore")
Note: The first section of the notebook covers material from the previous case studies. For this discussion, that part can be skipped, and we can refer directly to the summary of the data description and observations from the EDA.
Loading the datasets¶
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
# Loading both train and test datasets
train_df = pd.read_csv('filepath/Train.csv')
test_df = pd.read_csv('filepath/Test.csv')
# Checking the first 5 rows of the dataset
train_df.head()
Item_Identifier | Item_Weight | Item_Fat_Content | Item_Visibility | Item_Type | Item_MRP | Outlet_Identifier | Outlet_Establishment_Year | Outlet_Size | Outlet_Location_Type | Outlet_Type | Item_Outlet_Sales | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | FDA15 | 9.30 | Low Fat | 0.016047 | Dairy | 249.8092 | OUT049 | 1999 | Medium | Tier 1 | Supermarket Type1 | 3735.1380 |
1 | DRC01 | 5.92 | Regular | 0.019278 | Soft Drinks | 48.2692 | OUT018 | 2009 | Medium | Tier 3 | Supermarket Type2 | 443.4228 |
2 | FDN15 | 17.50 | Low Fat | 0.016760 | Meat | 141.6180 | OUT049 | 1999 | Medium | Tier 1 | Supermarket Type1 | 2097.2700 |
3 | FDX07 | 19.20 | Regular | 0.000000 | Fruits and Vegetables | 182.0950 | OUT010 | 1998 | NaN | Tier 3 | Grocery Store | 732.3800 |
4 | NCD19 | 8.93 | Low Fat | 0.000000 | Household | 53.8614 | OUT013 | 1987 | High | Tier 3 | Supermarket Type1 | 994.7052 |
Observations:
- As the variables Item_Identifier and Outlet_Identifier are only ID variables, we assume that they don't have any predictive power to predict the dependent variable - Item_Outlet_Sales.
- We will remove these two variables from both the datasets.
train_df = train_df.drop(['Item_Identifier', 'Outlet_Identifier'], axis = 1)
test_df = test_df.drop(['Item_Identifier', 'Outlet_Identifier'], axis = 1)
Now, let's find out some more information about the training dataset, i.e., the total number of observations in the dataset, columns and their data types, etc.
Checking the info of the training data¶
train_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype
---  ------                     --------------  -----
 0   Item_Weight                7060 non-null   float64
 1   Item_Fat_Content           8523 non-null   object
 2   Item_Visibility            8523 non-null   float64
 3   Item_Type                  8523 non-null   object
 4   Item_MRP                   8523 non-null   float64
 5   Outlet_Establishment_Year  8523 non-null   int64
 6   Outlet_Size                6113 non-null   object
 7   Outlet_Location_Type       8523 non-null   object
 8   Outlet_Type                8523 non-null   object
 9   Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(5)
memory usage: 666.0+ KB
Observations:
- The train dataset has 8523 observations and 10 columns.
- Two columns, Item_Weight and Outlet_Size, have missing values, as their number of non-null values is less than the total number of observations.
- We observe that the columns Item_Fat_Content, Item_Type, Outlet_Size, Outlet_Location_Type, and Outlet_Type have data type object, which means they are strings or categorical variables.
- The remaining variables are all numerical in nature.
As we have already seen, two columns have missing values in the dataset. Let's check the percentage of missing values using the below code.
(train_df.isnull().sum() / train_df.shape[0])*100
Item_Weight                  17.165317
Item_Fat_Content              0.000000
Item_Visibility               0.000000
Item_Type                     0.000000
Item_MRP                      0.000000
Outlet_Establishment_Year     0.000000
Outlet_Size                  28.276428
Outlet_Location_Type          0.000000
Outlet_Type                   0.000000
Item_Outlet_Sales             0.000000
dtype: float64
The percentage of missing values for the columns Item_Weight and Outlet_Size is ~17% and ~28% respectively. Now, let's see next how to treat these missing values.
EDA and Data Preprocessing¶
Now that we have an understanding of the business problem we want to solve, and we have loaded the datasets, the next step to follow is to have a better understanding of the dataset, i.e., what is the distribution of the variables, what are different relationships that exist between variables, etc. If there are any data anomalies like missing values or outliers, how do we treat them to prepare the dataset for building the predictive model?
Univariate Analysis¶
Let us now start exploring the dataset by performing univariate analysis, i.e., analyzing/visualizing one variable at a time. Data visualization is a very important skill, and we need to decide which charts best help us understand the data - for example, which charts make sense for categorical variables, which are suited for numerical variables, and which relationship between variables we want to show.
Let's start with analyzing the categorical variables present in the data. There are five categorical variables in this dataset and we will create univariate bar charts for each of them to check their distribution.
Below we are creating 4 subplots to plot four out of the five categorical variables in a single frame.
fig, axes = plt.subplots(2, 2, figsize = (18, 10))
fig.suptitle('Bar plot for all categorical variables in the dataset')
sns.countplot(ax = axes[0, 0], x = 'Item_Fat_Content', data = train_df, color = 'blue',
order = train_df['Item_Fat_Content'].value_counts().index);
sns.countplot(ax = axes[0, 1], x = 'Outlet_Size', data = train_df, color = 'blue',
order = train_df['Outlet_Size'].value_counts().index);
sns.countplot(ax = axes[1, 0], x = 'Outlet_Location_Type', data = train_df, color = 'blue',
order = train_df['Outlet_Location_Type'].value_counts().index);
sns.countplot(ax = axes[1, 1], x = 'Outlet_Type', data = train_df, color = 'blue',
order = train_df['Outlet_Type'].value_counts().index);
Observations:
- From the above univariate plot for the variable Item_Fat_Content, it seems like there are errors in the data. The category - Low Fat is also written as low fat and LF. Similarly, the category Regular is also written as reg sometimes. So, we need to fix this issue in the data.
- For the column Outlet_Size, we can see that the most common category is Medium, followed by Small and High.
- The most common category for the column Outlet_Location_Type is Tier 3, followed by Tier 2 and Tier 1. This is consistent with what we saw for Outlet_Size: we would expect High-size stores to be located mostly in Tier 1 cities, and since there are relatively few Tier 1 outlets in the data, the count of High-size outlets is also low. Likewise, Medium and Small outlets are more common because most outlets in the dataset are located in Tier 3 and Tier 2 cities.
- In the column Outlet_Type, the majority of the stores are Supermarket Type 1, while the other categories - Supermarket Type 2, Supermarket Type 3, and Grocery Store - have smaller and roughly equal counts.
Below we are analyzing the categorical variable Item_Type.
fig = plt.figure(figsize = (18, 6))
sns.countplot(x = 'Item_Type', data = train_df, color = 'blue', order = train_df['Item_Type'].value_counts().index);
plt.xticks(rotation = 45);
Observation:
- From the above plot, we observe that the majority of the items sold in these stores are Fruits and Vegetables, followed by Snack Foods and Household items.
Before we move ahead with the univariate analysis for the numerical variables, let's first fix the data issues that we have found for the column Item_Fat_Content.
In the code below, we replace the categories low fat and LF with Low Fat using a lambda function, and similarly replace the category reg with Regular.
train_df['Item_Fat_Content'] = train_df['Item_Fat_Content'].apply(lambda x: 'Low Fat' if x == 'low fat' or x == 'LF' else x)
train_df['Item_Fat_Content'] = train_df['Item_Fat_Content'].apply(lambda x: 'Regular' if x == 'reg' else x)
Any data preparation step applied to the training data must also be applied to the test data. So, below we perform the same transformations on the test dataset.
test_df['Item_Fat_Content'] = test_df['Item_Fat_Content'].apply(lambda x: 'Low Fat' if x == 'low fat' or x == 'LF' else x)
test_df['Item_Fat_Content'] = test_df['Item_Fat_Content'].apply(lambda x: 'Regular' if x == 'reg' else x)
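As a side note, the same cleanup can be done in a single step with a replacement mapping; a minimal equivalent sketch (using the same category labels as above) is shown below. Running it after the lambda-based cleanup is a no-op.
# Equivalent one-step cleanup using a replacement mapping (same labels as above)
fat_content_map = {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}
train_df['Item_Fat_Content'] = train_df['Item_Fat_Content'].replace(fat_content_map)
test_df['Item_Fat_Content'] = test_df['Item_Fat_Content'].replace(fat_content_map)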
Below we analyze all the numerical variables present in the data. Since we want to visualize one numerical variable at a time, a histogram is a good choice.
fig, axes = plt.subplots(1, 3, figsize = (20, 6))
fig.suptitle('Histogram for all numerical variables in the dataset')
sns.histplot(x = 'Item_Weight', data = train_df, kde = True, ax = axes[0]);
sns.histplot(x = 'Item_Visibility', data = train_df, kde = True, ax = axes[1]);
sns.histplot(x = 'Item_MRP', data = train_df, kde = True, ax = axes[2]);
Observations:¶
- The variable Item_Weight is approximately uniformly distributed. When we impute the missing values for this column, we need to make sure that the imputation does not change this distribution significantly.
- The variable Item_Visibility has a right-skewed distribution, which means that certain items are allocated a much higher percentage of the display area than the others.
- The variable Item_MRP appears to follow an approximately multi-modal normal distribution.
Bivariate Analysis¶
Now, let's move ahead with bivariate analysis to understand how variables are related to each other and if there is a strong relationship between dependent and independent variables present in the training dataset.
In the plot below, we analyze the variables Outlet_Establishment_Year and Item_Outlet_Sales. Since Outlet_Establishment_Year defines a time component, a line plot is a good choice for analyzing this relationship.
fig = plt.figure(figsize = (18, 6))
sns.lineplot(x = 'Outlet_Establishment_Year', y = 'Item_Outlet_Sales', data = train_df, ci = None, estimator = 'mean');
Observations:
- The average sales are almost constant across establishment years; we don't see any increasing or decreasing trend in sales with time. So, the establishment year might not be a good predictor of sales, which we can check later in the modeling phase.
- Also, for the year 1998, the average sales plummeted. This might be due to external factors that are not captured in the data.
Next, we look for linear correlations between the variables. This helps us see which numerical variables are correlated with the target variable, and also detect multicollinearity, i.e., which pairs of independent variables are correlated with each other.
fig = plt.figure(figsize = (18, 6))
sns.heatmap(train_df.corr(numeric_only = True), annot = True);
plt.xticks(rotation = 45);
Observations:
- From the above plot, it seems that only the independent variable Item_MRP has a moderate linear relationship with the dependent variable Item_Outlet_Sales.
- For the remaining, it does not seem like there is any strong positive/negative correlation between the variables.
Next, we are creating the bivariate scatter plots to check relationships between the pair of independent and dependent variables.
fig, axes = plt.subplots(1, 3, figsize = (20, 6))
fig.suptitle('Bi-variate scatterplot for all numerical variables with the dependent variable')
sns.scatterplot(x = 'Item_Weight', y = 'Item_Outlet_Sales', data = train_df, ax = axes[0]);
sns.scatterplot(x = 'Item_Visibility', y = 'Item_Outlet_Sales', data = train_df, ax = axes[1]);
sns.scatterplot(x = 'Item_MRP', y = 'Item_Outlet_Sales', data = train_df, ax = axes[2]);
Observations:
- The first scatter plot shows that the points are scattered randomly and there is no relationship between Item_Weight and Item_Outlet_Sales. This is also evident from the near-zero correlation value we obtained above.
- The second scatter plot, between Item_Visibility and Item_Outlet_Sales, shows that there is no strong relationship between them. However, there is a visible pattern: as Item_Visibility increases beyond roughly 0.19, sales decrease. A possible explanation is that management gives more display visibility to items that are not sold often, expecting the extra visibility to increase their sales. This observation could also be used to engineer a new feature, such as a categorical variable with high-visibility and low-visibility levels. We are not doing that here, but a rough sketch of the idea is shown right after this list.
- From the third scatter plot, between Item_MRP and Item_Outlet_Sales, it is clear that there is a positive correlation between them, and the variable Item_MRP should have good predictive power for predicting sales.
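As a rough illustration of the feature idea mentioned above (not used further in this case study), a binary visibility flag could be derived from Item_Visibility. The 0.19 cutoff below is only the value eyeballed from the scatter plot, not a tuned threshold.
# Hypothetical feature sketch (not used further in this notebook): flag items whose
# display share exceeds the ~0.19 cutoff eyeballed from the scatter plot above
high_visibility_flag = np.where(train_df['Item_Visibility'] > 0.19, 'High', 'Low')
print(pd.Series(high_visibility_flag).value_counts())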
Summary of EDA¶
Data Description:
- We have two datasets - train (8,523 observations) and test (5,681 observations). The training dataset has both the input and output variable(s). We need to predict the sales for the test dataset.
- The data may have missing values as some stores might not report all the data due to technical glitches. Hence, it will be required to treat them accordingly.
- As the variables Item_Identifier and Outlet_Identifier are ID variables, we assume that they don't have any predictive power to predict the dependent variable - Item_Outlet_Sales. We will remove these two variables from both datasets.
- The training dataset has 8523 observations and 10 columns.
- Two columns Item_Weight and Outlet_Size have missing values as the number of non-null values is less than the total number of observations for these two variables.
- We observe that the columns Item_Fat_Content, Item_Type, Outlet_Size, Outlet_Location_Type, and Outlet_Type have data type object, which means they are strings or categorical variables.
- The remaining variables are all numerical in nature.
Observations from EDA:
- From the Univariate plot for the variable Item_Fat_Content, it seems like there are errors in the data. The category - Low Fat is also written as low fat and LF. Similarly, the category Regular is also written as reg sometimes. So, we need to fix this issue in the data.
- For the column Outlet_Size, we can see that the most common category is Medium, followed by Small and High.
- The most common category for the column Outlet_Location_Type is Tier 3, followed by Tier 2 and Tier 1. This is consistent with what we saw for Outlet_Size: we would expect High-size stores to be located mostly in Tier 1 cities, and since there are relatively few Tier 1 outlets in the data, the count of High-size outlets is also low. Likewise, Medium and Small outlets are more common because most outlets in the dataset are located in Tier 3 and Tier 2 cities.
- In the column Outlet_Type, the majority of the stores are Supermarket Type 1, while the other categories - Supermarket Type 2, Supermarket Type 3, and Grocery Store - have smaller and roughly equal counts.
- We observed that the majority of the items sold in these stores are Fruits and Vegetables, followed by Snack Foods and Household items.
- The variable Item_Weight is approximately uniformly distributed. When we impute the missing values for this column, we need to make sure that the imputation does not change this distribution significantly.
- The variable Item_Visibility has a right-skewed distribution, which means that certain items are allocated a much higher percentage of the display area than the others.
- The variable Item_MRP appears to follow an approximately multi-modal normal distribution.
- The average sales are almost constant across establishment years; we don't see any increasing or decreasing trend in sales with time. So, the establishment year might not be a good predictor of sales, which we can check later in the modeling phase.
- Also, for the year 1998, the average sales plummeted. This might be due to external factors that are not captured in the data.
- It seems that only the independent variable Item_MRP has a moderate linear relationship with the dependent variable Item_Outlet_Sales.
- For the remaining, it does not seem like there is any strong positive/negative correlation between the variables.
- The scatter plot between Item_Weight and Item_Outlet_Sales shows that the points are scattered randomly and there is no relationship between the two variables. This is also evident from the near-zero correlation value we obtained above.
- The scatter plot between Item_Visibility and Item_Outlet_Sales shows that there is no strong relationship between them. However, there is a visible pattern: as Item_Visibility increases beyond roughly 0.19, sales decrease. A possible explanation is that management gives more display visibility to items that are not sold often, expecting the extra visibility to increase their sales. This observation could be used to engineer a new feature, such as a categorical variable with high-visibility and low-visibility levels, which we are not pursuing here.
- The scatter plot between Item_MRP and Item_Outlet_Sales shows that there is a positive correlation between them, and the variable Item_MRP should have good predictive power for predicting sales.
Missing Value Treatment¶
Here, we impute the missing values for the variable Item_Weight. There are many ways to impute missing values - for example, using the column's mean or median, or using more advanced imputation algorithms such as KNN. Instead, we first look for relationships between Item_Weight and the other variables in the dataset and use them to guide the imputation; an alternative group-based approach is sketched below for comparison.
Also, after imputing the missing values, the overall distribution of the variable should not change significantly.
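For comparison only, a common alternative is to impute Item_Weight with a group-wise statistic, such as the median weight within each Item_Type. The sketch below is not applied here (and grouping by Item_Type is just one of several reasonable choices); it computes the imputation into a separate variable so the data is left unchanged.
# Alternative sketch (not applied here): impute Item_Weight with the median weight of its Item_Type group
item_weight_group_median = train_df.groupby('Item_Type')['Item_Weight'].transform('median')
imputed_by_group = train_df['Item_Weight'].fillna(item_weight_group_median)
# Compare summary statistics of the group-median imputation with the original (partially missing) column
print(train_df['Item_Weight'].describe())
print(imputed_by_group.describe())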
fig = plt.figure(figsize = (18, 3))
sns.heatmap(train_df.pivot_table(index = 'Item_Fat_Content', columns = 'Item_Type', values = 'Item_Weight'), annot = True);
plt.xticks(rotation = 45);
Observation:
- In the above heatmap, we can see that, across the different combinations of Item_Type and Item_Fat_Content, the average Item_Weight lies roughly between 10 and 14.
fig = plt.figure(figsize = (18, 3))
sns.heatmap(train_df.pivot_table(index = 'Outlet_Type', columns = 'Outlet_Location_Type', values = 'Item_Weight'), annot = True);
plt.xticks(rotation = 45);
Observation:
- In the above heatmap, we can see that, across the different combinations of Outlet_Type and Outlet_Location_Type, the average Item_Weight is roughly constant at around 13.
fig = plt.figure(figsize = (18, 3))
sns.heatmap(train_df.pivot_table(index = 'Item_Fat_Content', columns = 'Outlet_Size', values = 'Item_Weight'), annot = True);
plt.xticks(rotation = 45);
Observation:
- In the above heatmap, we can see that, across the different combinations of Item_Fat_Content and Outlet_Size, the average Item_Weight is also roughly constant at around 13.
We will impute the missing values using a uniform distribution with parameters a = 10 and b = 14, as shown below:
item_weight_indices_to_be_updated = train_df[train_df['Item_Weight'].isnull()].index
train_df.loc[item_weight_indices_to_be_updated, 'Item_Weight'] = np.random.uniform(10, 14,
len(item_weight_indices_to_be_updated))
Performing the same transformation on the test dataset.
item_weight_indices_to_be_updated = test_df[test_df['Item_Weight'].isnull()].index
test_df.loc[item_weight_indices_to_be_updated, 'Item_Weight'] = np.random.uniform(10, 14,
len(item_weight_indices_to_be_updated))
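Two small caveats on the imputation above, checked in the hedged sketch below: the draws from np.random.uniform are not reproducible across runs unless a seed is set beforehand (e.g., np.random.seed(1) before the draws), and, as noted in the EDA, the imputation should not distort the roughly uniform distribution of Item_Weight.
# Sanity check (a sketch): Item_Weight should have no missing values left, and its
# distribution should still look roughly uniform after the imputation
print(train_df['Item_Weight'].isnull().sum())
print(train_df['Item_Weight'].describe())
fig = plt.figure(figsize = (18, 6))
sns.histplot(x = 'Item_Weight', data = train_df, kde = True);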
Next, we will impute missing values for the column Outlet_Size. Below, we create two subsets of the training data - one where Outlet_Size is non-null and another where all values of Outlet_Size are missing. We then check the distribution of the other variables in the subset where Outlet_Size is missing to see whether any pattern is present.
outlet_size_data = train_df[train_df['Outlet_Size'].notnull()]
outlet_size_missing_data = train_df[train_df['Outlet_Size'].isnull()]
fig, axes = plt.subplots(1, 3, figsize = (18, 6))
fig.suptitle('Bar plot for all categorical variables in the dataset where the variable Outlet_Size is missing')
sns.countplot(ax = axes[0], x = 'Outlet_Type', data = outlet_size_missing_data, color = 'blue',
order = outlet_size_missing_data['Outlet_Type'].value_counts().index);
sns.countplot(ax = axes[1], x = 'Outlet_Location_Type', data = outlet_size_missing_data, color = 'blue',
order = outlet_size_missing_data['Outlet_Location_Type'].value_counts().index);
sns.countplot(ax = axes[2], x = 'Item_Fat_Content', data = outlet_size_missing_data, color = 'blue',
order = outlet_size_missing_data['Item_Fat_Content'].value_counts().index);
Observation:
- We can see that the majority of the rows where Outlet_Size is missing have Outlet_Type as Supermarket Type 1, Outlet_Location_Type as Tier 2, and Item_Fat_Content as Low Fat.
Now, we are creating a cross-tab of all the above categorical variables against the column Outlet_Size, where we want to impute the missing values.
fig= plt.figure(figsize = (18, 3))
sns.heatmap(pd.crosstab(index = outlet_size_data['Outlet_Type'], columns = outlet_size_data['Outlet_Size']), annot = True, fmt = 'g')
plt.xticks(rotation = 45);
Observations:
- We observe from the above heatmap that all the grocery stores have Outlet_Size as Small.
- All the Supermarket Type 2 and Supermarket Type 3 have Outlet_Size as Medium.
fig = plt.figure(figsize = (18, 3))
sns.heatmap(pd.crosstab(index = outlet_size_data['Outlet_Location_Type'], columns = outlet_size_data['Outlet_Size']), annot = True, fmt = 'g')
plt.xticks(rotation = 45);
Observation:
- We observe from the above heatmap that all the Tier 2 stores have Outlet_Size as Small.
fig = plt.figure(figsize = (18, 3))
sns.heatmap(pd.crosstab(index = outlet_size_data['Item_Fat_Content'], columns = outlet_size_data['Outlet_Size']), annot = True, fmt = 'g')
plt.xticks(rotation = 45);
Observation:
- There does not seem to be any clear pattern between the variables Item_Fat_Content and Outlet_Size.
Now, we will use the patterns we have from the variables Outlet_Type and Outlet_Location_Type to impute the missing values for the column Outlet_Size.
Below, we identify the indices in the DataFrame where Outlet_Size is null/missing and Outlet_Type is Grocery Store, so that we can replace those missing values with the value Small, based on the pattern identified in the above visualizations. Similarly, we also identify the indices where Outlet_Size is null/missing and Outlet_Location_Type is Tier 2, to impute those missing values.
grocery_store_indices = train_df[train_df['Outlet_Size'].isnull()].query(" Outlet_Type == 'Grocery Store' ").index
tier_2_indices = train_df[train_df['Outlet_Size'].isnull()].query(" Outlet_Location_Type == 'Tier 2' ").index
Now, we are updating those indices for the column Outlet_Size with the value Small.
train_df.loc[grocery_store_indices, 'Outlet_Size'] = 'Small'
train_df.loc[tier_2_indices, 'Outlet_Size'] = 'Small'
Performing the same transformations on the test dataset.
grocery_store_indices = test_df[test_df['Outlet_Size'].isnull()].query(" Outlet_Type == 'Grocery Store' ").index
tier_2_indices = test_df[test_df['Outlet_Size'].isnull()].query(" Outlet_Location_Type == 'Tier 2' ").index
test_df.loc[grocery_store_indices, 'Outlet_Size'] = 'Small'
test_df.loc[tier_2_indices, 'Outlet_Size'] = 'Small'
After we have imputed the missing values, let's check again if we still have missing values in both train and test datasets.
train_df.isnull().sum()
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64
test_df.isnull().sum()
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
dtype: int64
There are no missing values in the datasets.
Feature Engineering¶
Now that we have completed the data understanding and data preparation steps, and before starting the modeling task, we can create features that are not present in the dataset from the existing columns; such features may have predictive power for sales. This step of creating new features from the existing ones is known as Feature Engineering. We start with a hypothesis: as a store gets older, its sales increase. How do we define old? We know the establishment year, and this data was collected in 2013, so the age of a store can be obtained by subtracting the establishment year from 2013. This is what the code below does.
We are creating a new feature Outlet_Age which indicates how old the outlet is.
train_df['Outlet_Age'] = 2013 - train_df['Outlet_Establishment_Year']
test_df['Outlet_Age'] = 2013 - test_df['Outlet_Establishment_Year']
fig = plt.figure(figsize = (18, 6))
sns.boxplot(x = 'Outlet_Age', y = 'Item_Outlet_Sales', data = train_df);
Observations:
- Based on the above plot, the hypothesis that sales increase as the store gets older does not seem to hold: the sales distributions are approximately similar across stores of different ages. Let's keep this variable for now; we will revisit it during model building and can remove it later based on its significance.
Modeling¶
Now that we have analyzed all the variables in the dataset, we are ready to start building the model. We have observed that not all the independent variables are important for predicting the outcome variable. To begin with, however, we will use all the variables, and then decide from the model summary which variables to remove. Model building is an iterative task.
# We are removing the outcome variable from the feature set
# Also removing the variable Outlet_Establishment_Year, as we have created a new variable Outlet_Age
train_features = train_df.drop(['Item_Outlet_Sales', 'Outlet_Establishment_Year'], axis = 1)
# And then we are extracting the outcome variable separately
train_target = train_df['Item_Outlet_Sales']
Whenever we have categorical variables as independent variables, we need to create one-hot encoded representations (also known as dummy variables) of those categorical variables. The code below creates the dummy variables and drops the first category of each variable; the dropped category becomes the reference category, against which the coefficients of the remaining categories are interpreted in the linear regression later on. A toy illustration of this is shown right after this paragraph.
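As a toy illustration (the four values below are made up and not taken from the BigMart data), here is what drop_first does to a single three-category column: the first category, High, is dropped and acts as the reference.
# Toy example with made-up values: 'High' is dropped, leaving Outlet_Size_Medium and Outlet_Size_Small
toy = pd.DataFrame({'Outlet_Size': ['High', 'Medium', 'Small', 'Medium']})
print(pd.get_dummies(toy, drop_first = True))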
# Creating dummy variables for the categorical variables
train_features = pd.get_dummies(train_features, drop_first = True)
train_features.head()
Item_Weight | Item_Visibility | Item_MRP | Outlet_Age | Item_Fat_Content_Regular | Item_Type_Breads | Item_Type_Breakfast | Item_Type_Canned | Item_Type_Dairy | Item_Type_Frozen Foods | ... | Item_Type_Snack Foods | Item_Type_Soft Drinks | Item_Type_Starchy Foods | Outlet_Size_Medium | Outlet_Size_Small | Outlet_Location_Type_Tier 2 | Outlet_Location_Type_Tier 3 | Outlet_Type_Supermarket Type1 | Outlet_Type_Supermarket Type2 | Outlet_Type_Supermarket Type3 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 9.30 | 0.016047 | 249.8092 | 14 | False | False | False | False | True | False | ... | False | False | False | True | False | False | False | True | False | False |
1 | 5.92 | 0.019278 | 48.2692 | 4 | True | False | False | False | False | False | ... | False | True | False | True | False | False | True | False | True | False |
2 | 17.50 | 0.016760 | 141.6180 | 14 | False | False | False | False | False | False | ... | False | False | False | True | False | False | False | True | False | False |
3 | 19.20 | 0.000000 | 182.0950 | 15 | True | False | False | False | False | False | ... | False | False | False | False | True | False | True | False | False | False |
4 | 8.93 | 0.000000 | 53.8614 | 26 | False | False | False | False | False | False | ... | False | False | False | False | False | False | True | True | False | False |
5 rows × 27 columns
Observations:
- Notice that the column names now include the category labels of the categorical variables, and the overall number of columns has increased after creating the dummy variables.
- For each categorical variable, the first category has been removed and acts as the reference category. For example, the category Low Fat of Item_Fat_Content has been removed, and we only have the category Regular as a new column Item_Fat_Content_Regular.
Below, we scale the numerical variables in the dataset so that they share the same range. If we don't do this, variables with a larger range can dominate the model, and the model may not learn as much from variables with a smaller range. There are many ways to scale data. Here, we use MinMaxScaler, as we have both categorical and numerical variables in the dataset and don't want to change the dummy encodings of the categorical variables that we have already created. For more information on different ways of scaling, refer to section 6.3.1 of the scikit-learn preprocessing documentation.
# Creating an instance of the MinMaxScaler
scaler = MinMaxScaler()
# Applying fit_transform on the training features data
train_features_scaled = scaler.fit_transform(train_features)
# The above scaler returns the data in array format, below we are converting it back to pandas DataFrame
train_features_scaled = pd.DataFrame(train_features_scaled, index = train_features.index, columns = train_features.columns)
train_features_scaled.head()
Item_Weight | Item_Visibility | Item_MRP | Outlet_Age | Item_Fat_Content_Regular | Item_Type_Breads | Item_Type_Breakfast | Item_Type_Canned | Item_Type_Dairy | Item_Type_Frozen Foods | ... | Item_Type_Snack Foods | Item_Type_Soft Drinks | Item_Type_Starchy Foods | Outlet_Size_Medium | Outlet_Size_Small | Outlet_Location_Type_Tier 2 | Outlet_Location_Type_Tier 3 | Outlet_Type_Supermarket Type1 | Outlet_Type_Supermarket Type2 | Outlet_Type_Supermarket Type3 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.282525 | 0.048866 | 0.927507 | 0.416667 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
1 | 0.081274 | 0.058705 | 0.072068 | 0.000000 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 |
2 | 0.770765 | 0.051037 | 0.468288 | 0.416667 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
3 | 0.871986 | 0.000000 | 0.640093 | 0.458333 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
4 | 0.260494 | 0.000000 | 0.095805 | 0.916667 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 |
5 rows × 27 columns
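MinMaxScaler transforms each column as (x - min) / (max - min), so every scaled feature should now lie between 0 and 1. A quick sanity check (a sketch, not part of the original flow):
# All scaled feature columns should have a minimum of 0 and a maximum of 1
print(train_features_scaled.min().min(), train_features_scaled.max().max())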
Now, as the dataset is ready, we are set to build the model using the statsmodels package.
# Adding the intercept term
train_features_scaled = sm.add_constant(train_features_scaled)
# Calling the OLS algorithm on the train features and the target variable
ols_model_0 = sm.OLS(train_target, train_features_scaled)
# Fitting the Model
ols_res_0 = ols_model_0.fit()
print(ols_res_0.summary())
OLS Regression Results ============================================================================== Dep. Variable: Item_Outlet_Sales R-squared: 0.563 Model: OLS Adj. R-squared: 0.562 Method: Least Squares F-statistic: 405.8 Date: Sat, 04 May 2024 Prob (F-statistic): 0.00 Time: 17:04:16 Log-Likelihood: -71993. No. Observations: 8523 AIC: 1.440e+05 Df Residuals: 8495 BIC: 1.442e+05 Df Model: 27 Covariance Type: nonrobust =================================================================================================== coef std err t P>|t| [0.025 0.975] --------------------------------------------------------------------------------------------------- const 191.1328 508.434 0.376 0.707 -805.521 1187.787 Item_Weight -16.5955 48.666 -0.341 0.733 -111.993 78.802 Item_Visibility -99.5252 81.716 -1.218 0.223 -259.709 60.658 Item_MRP 3668.8921 46.594 78.742 0.000 3577.556 3760.228 Outlet_Age -745.9224 250.311 -2.980 0.003 -1236.592 -255.253 Item_Fat_Content_Regular 40.2254 28.243 1.424 0.154 -15.138 95.589 Item_Type_Breads 5.2887 84.086 0.063 0.950 -159.541 170.118 Item_Type_Breakfast 7.4127 116.662 0.064 0.949 -221.274 236.100 Item_Type_Canned 26.9152 62.797 0.429 0.668 -96.182 150.013 Item_Type_Dairy -39.9972 62.259 -0.642 0.521 -162.039 82.045 Item_Type_Frozen Foods -27.3438 58.894 -0.464 0.642 -142.790 88.102 Item_Type_Fruits and Vegetables 29.9352 54.985 0.544 0.586 -77.849 137.720 Item_Type_Hard Drinks -2.1258 90.233 -0.024 0.981 -179.005 174.754 Item_Type_Health and Hygiene -11.3927 68.043 -0.167 0.867 -144.774 121.989 Item_Type_Household -38.2234 59.953 -0.638 0.524 -155.746 79.299 Item_Type_Meat 0.7845 70.684 0.011 0.991 -137.773 139.342 Item_Type_Others -22.1413 98.659 -0.224 0.822 -215.537 171.255 Item_Type_Seafood 186.2936 148.082 1.258 0.208 -103.983 476.570 Item_Type_Snack Foods -10.1074 55.276 -0.183 0.855 -118.461 98.247 Item_Type_Soft Drinks -27.6760 70.204 -0.394 0.693 -165.294 109.942 Item_Type_Starchy Foods 23.6990 103.085 0.230 0.818 -178.374 225.772 Outlet_Size_Medium -728.1034 274.677 -2.651 0.008 -1266.538 -189.669 Outlet_Size_Small -762.6419 254.942 -2.991 0.003 -1262.390 -262.894 Outlet_Location_Type_Tier 2 -168.8407 87.638 -1.927 0.054 -340.632 2.950 Outlet_Location_Type_Tier 3 -421.2016 152.010 -2.771 0.006 -719.178 -123.225 Outlet_Type_Supermarket Type1 1516.3136 140.024 10.829 0.000 1241.832 1790.795 Outlet_Type_Supermarket Type2 1252.1081 123.873 10.108 0.000 1009.287 1494.929 Outlet_Type_Supermarket Type3 3724.0421 176.038 21.155 0.000 3378.965 4069.120 ============================================================================== Omnibus: 961.324 Durbin-Watson: 2.004 Prob(Omnibus): 0.000 Jarque-Bera (JB): 2293.926 Skew: 0.667 Prob(JB): 0.00 Kurtosis: 5.163 Cond. No. 105. ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Observations:
- We can see that the R-squared for the model is 0.563.
- Not all the variables are statistically significant to predict the outcome variable. To check which variables are statistically significant or have predictive power to predict the target variable, we need to check the p-value against all the independent variables.
Interpreting the Regression Results:
Adj. R-squared: It reflects the fit of the model.
- Adjusted R-squared values range from 0 to 1, where a higher value generally indicates a better fit, assuming certain conditions are met.
- In our case, the value for Adjusted R-squared is 0.562.
coef: It represents the change in the output Y due to a one-unit change in the independent variable (everything else held constant). For example, in the model above, the coefficient of Item_MRP (~3669) means that, holding all other variables constant, moving Item_MRP from its minimum to its maximum scaled value (a one-unit change after Min-Max scaling) is associated with an increase of about 3669 in predicted Item_Outlet_Sales.
std err: It reflects the level of accuracy of the coefficients.
- The lower it is, the more accurate the coefficients are.
P>|t|: It is the p-value.
- For each independent feature, there is a null hypothesis and an alternative hypothesis:
Ho : The independent feature is not significant.
Ha : The independent feature is significant.
- A p-value of less than 0.05 is considered statistically significant at a confidence level of 95%.
Confidence Interval: It represents the range in which our coefficients are likely to fall (with a likelihood of 95%).
To understand in detail how p-values can help to identify statistically significant variables to predict the sales, we need to understand the hypothesis testing framework.
In the following example, we are showing the null hypothesis between the independent variable Outlet_Size and the dependent variable Item_Outlet_Sales to identify if there is any relationship between them or not.
- Null hypothesis: There is no relationship between the variables Outlet_Size and Item_Outlet_Sales, i.e., the coefficient of Outlet_Size is equal to zero (Ho: β_Outlet_Size = 0).
- Alternate hypothesis: There is a relationship between the variables Outlet_Size and Item_Outlet_Sales, i.e., the coefficient of Outlet_Size is not equal to zero (Ha: β_Outlet_Size ≠ 0).
From the above model summary, if the p-value is less than the significance level of 0.05, then we will reject the null hypothesis in favor of the alternate hypothesis. In other words, we have enough statistical evidence that there is some relationship between the variables Outlet_Size and Item_Outlet_Sales.
Based on the above analysis, if we look at the model summary, we can see that only some of the variables, or some categories of the categorical variables, have a p-value lower than 0.05. The p-values can also be extracted programmatically, as the sketch below shows.
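Instead of reading the summary table by eye, the p-values can be pulled out of the fitted statsmodels results object; a minimal sketch using the model fitted above:
# List the predictors whose p-values exceed the 0.05 significance level, least significant first
insignificant = ols_res_0.pvalues[ols_res_0.pvalues > 0.05].sort_values(ascending = False)
print(insignificant)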
Feature Selection¶
Removing Multicollinearity¶
Multicollinearity occurs when predictor variables in a regression model are correlated. This correlation is a problem because predictor variables should be independent. If the correlation between independent variables is high, it can cause problems when we fit the model and interpret the results. When we have multicollinearity in the linear model, the coefficients that the model suggests are unreliable.
There are different ways of detecting (or testing for) multicollinearity. One such measure is the Variance Inflation Factor (VIF).
Variance Inflation Factor: It measures the inflation in the variances of the regression parameter estimates due to collinearities that exist among the predictors. It is a measure of how much the variance of the estimated regression coefficient βk is "inflated" by the existence of correlation among the predictor variables in the model.
General rule of thumb: If VIF is 1, then there is no correlation between the kth predictor and the remaining predictor variables, and hence the variance of β̂k is not inflated at all. If VIF exceeds (or is close to) 5, there is moderate multicollinearity, and if it is 10 or more, it is a sign of high multicollinearity.
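To make this definition concrete, the VIF of a single predictor can be computed by regressing that predictor on all the remaining predictors and using VIF = 1 / (1 - R²). The sketch below does this for Item_MRP; the variance_inflation_factor call used in the next cell performs the same computation for every column.
# Manual VIF for one predictor: regress Item_MRP on all the other features (including the constant)
other_features = train_features_scaled.drop('Item_MRP', axis = 1)
aux_model = sm.OLS(train_features_scaled['Item_MRP'], other_features).fit()
print(1 / (1 - aux_model.rsquared))  # should match the Item_MRP entry in the VIF output below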
vif_series = pd.Series(
[variance_inflation_factor(train_features_scaled.values, i) for i in range(train_features_scaled.shape[1])],
index = train_features_scaled.columns,
dtype = float)
print("VIF Scores: \n\n{}\n".format(vif_series))
VIF Scores: const 1726.816373 Item_Weight 1.020611 Item_Visibility 1.101090 Item_MRP 1.013147 Outlet_Age 50.920867 Item_Fat_Content_Regular 1.216512 Item_Type_Breads 1.349976 Item_Type_Breakfast 1.158239 Item_Type_Canned 1.853153 Item_Type_Dairy 1.906107 Item_Type_Frozen Foods 2.093280 Item_Type_Fruits and Vegetables 2.497372 Item_Type_Hard Drinks 1.331340 Item_Type_Health and Hygiene 1.771824 Item_Type_Household 2.289880 Item_Type_Meat 1.581256 Item_Type_Others 1.263713 Item_Type_Seafood 1.091679 Item_Type_Snack Foods 2.469063 Item_Type_Soft Drinks 1.629239 Item_Type_Starchy Foods 1.211250 Outlet_Size_Medium 111.036026 Outlet_Size_Small 106.822133 Outlet_Location_Type_Tier 2 11.286518 Outlet_Location_Type_Tier 3 36.823422 Outlet_Type_Supermarket Type1 29.623285 Outlet_Type_Supermarket Type2 9.945390 Outlet_Type_Supermarket Type3 20.218306 dtype: float64
Outlet_Age has a high VIF score. Hence, we are dropping Outlet_Age and building the model.
train_features_scaled_new = train_features_scaled.drop("Outlet_Age", axis = 1)
vif_series = pd.Series(
[variance_inflation_factor(train_features_scaled_new.values, i) for i in range(train_features_scaled_new.shape[1])],
index = train_features_scaled_new.columns,
dtype = float)
print("VIF Scores: \n\n{}\n".format(vif_series))
VIF Scores: const 121.289011 Item_Weight 1.020538 Item_Visibility 1.101069 Item_MRP 1.013106 Item_Fat_Content_Regular 1.216468 Item_Type_Breads 1.349676 Item_Type_Breakfast 1.158230 Item_Type_Canned 1.853092 Item_Type_Dairy 1.906104 Item_Type_Frozen Foods 2.093034 Item_Type_Fruits and Vegetables 2.497200 Item_Type_Hard Drinks 1.331259 Item_Type_Health and Hygiene 1.771789 Item_Type_Household 2.289860 Item_Type_Meat 1.581242 Item_Type_Others 1.263659 Item_Type_Seafood 1.091517 Item_Type_Snack Foods 2.469012 Item_Type_Soft Drinks 1.629234 Item_Type_Starchy Foods 1.211247 Outlet_Size_Medium 10.995498 Outlet_Size_Small 12.276151 Outlet_Location_Type_Tier 2 2.691636 Outlet_Location_Type_Tier 3 7.540189 Outlet_Type_Supermarket Type1 5.955106 Outlet_Type_Supermarket Type2 4.230159 Outlet_Type_Supermarket Type3 4.263672 dtype: float64
Now, let's build the model and observe the p-values.
ols_model_2 = sm.OLS(train_target, train_features_scaled_new)
ols_res_2 = ols_model_2.fit()
print(ols_res_2.summary())
OLS Regression Results ============================================================================== Dep. Variable: Item_Outlet_Sales R-squared: 0.563 Model: OLS Adj. R-squared: 0.561 Method: Least Squares F-statistic: 420.6 Date: Sat, 04 May 2024 Prob (F-statistic): 0.00 Time: 17:04:26 Log-Likelihood: -71997. No. Observations: 8523 AIC: 1.440e+05 Df Residuals: 8496 BIC: 1.442e+05 Df Model: 26 Covariance Type: nonrobust =================================================================================================== coef std err t P>|t| [0.025 0.975] --------------------------------------------------------------------------------------------------- const -1269.8149 134.810 -9.419 0.000 -1534.076 -1005.554 Item_Weight -15.3686 48.687 -0.316 0.752 -110.807 80.070 Item_Visibility -98.4753 81.753 -1.205 0.228 -258.732 61.781 Item_MRP 3668.0064 46.615 78.688 0.000 3576.630 3759.383 Item_Fat_Content_Regular 40.7268 28.256 1.441 0.150 -14.662 96.115 Item_Type_Breads 1.5482 84.116 0.018 0.985 -163.339 166.436 Item_Type_Breakfast 8.4150 116.716 0.072 0.943 -220.377 237.207 Item_Type_Canned 25.8404 62.825 0.411 0.681 -97.312 148.993 Item_Type_Dairy -39.7802 62.287 -0.639 0.523 -161.879 82.318 Item_Type_Frozen Foods -25.4402 58.917 -0.432 0.666 -140.933 90.052 Item_Type_Fruits and Vegetables 28.5737 55.009 0.519 0.603 -79.257 136.405 Item_Type_Hard Drinks -4.2183 90.272 -0.047 0.963 -181.174 172.738 Item_Type_Health and Hygiene -10.4967 68.074 -0.154 0.877 -143.939 122.945 Item_Type_Household -38.7475 59.981 -0.646 0.518 -156.324 78.829 Item_Type_Meat 1.4009 70.717 0.020 0.984 -137.221 140.022 Item_Type_Others -24.0595 98.703 -0.244 0.807 -217.541 169.422 Item_Type_Seafood 180.9125 148.139 1.221 0.222 -109.477 471.302 Item_Type_Snack Foods -10.8581 55.301 -0.196 0.844 -119.261 97.545 Item_Type_Soft Drinks -27.3261 70.237 -0.389 0.697 -165.007 110.355 Item_Type_Starchy Foods 24.2112 103.133 0.235 0.814 -177.955 226.377 Outlet_Size_Medium 48.8474 86.477 0.565 0.572 -120.668 218.363 Outlet_Size_Small -47.9044 86.466 -0.554 0.580 -217.398 121.589 Outlet_Location_Type_Tier 2 59.0599 42.817 1.379 0.168 -24.873 142.992 Outlet_Location_Type_Tier 3 -17.2455 68.818 -0.251 0.802 -152.146 117.655 Outlet_Type_Supermarket Type1 1889.2923 62.811 30.079 0.000 1766.168 2012.416 Outlet_Type_Supermarket Type2 1531.9401 80.825 18.954 0.000 1373.503 1690.377 Outlet_Type_Supermarket Type3 3258.0353 80.877 40.284 0.000 3099.496 3416.575 ============================================================================== Omnibus: 962.961 Durbin-Watson: 2.004 Prob(Omnibus): 0.000 Jarque-Bera (JB): 2300.843 Skew: 0.668 Prob(JB): 0.00 Kurtosis: 5.167 Cond. No. 28.3 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
It is not a good practice to judge dummy variables by their VIF values, as the dummies of the same categorical variable are correlated with one another and therefore usually have high VIFs. So, we built the model and examined the p-values instead. We can see that all the categories of the Item_Type column show a p-value higher than 0.05, so we can drop the Item_Type dummy columns.
train_features_scaled_new2 = train_features_scaled_new.drop(['Item_Type_Breads',
'Item_Type_Breakfast',
'Item_Type_Canned',
'Item_Type_Dairy',
'Item_Type_Frozen Foods',
'Item_Type_Fruits and Vegetables',
'Item_Type_Hard Drinks',
'Item_Type_Health and Hygiene',
'Item_Type_Household',
'Item_Type_Meat',
'Item_Type_Others',
'Item_Type_Seafood',
'Item_Type_Snack Foods',
'Item_Type_Soft Drinks',
'Item_Type_Starchy Foods'], axis = 1)
vif_series = pd.Series(
[variance_inflation_factor(train_features_scaled_new2.values, i) for i in range(train_features_scaled_new2.shape[1])],
index = train_features_scaled_new2.columns,
dtype = float)
print("VIF Scores: \n\n{}\n".format(vif_series))
VIF Scores: const 109.333116 Item_Weight 1.007701 Item_Visibility 1.093167 Item_MRP 1.000740 Item_Fat_Content_Regular 1.003019 Outlet_Size_Medium 10.985763 Outlet_Size_Small 12.267807 Outlet_Location_Type_Tier 2 2.689563 Outlet_Location_Type_Tier 3 7.530628 Outlet_Type_Supermarket Type1 5.947375 Outlet_Type_Supermarket Type2 4.226072 Outlet_Type_Supermarket Type3 4.261104 dtype: float64
ols_model_3 = sm.OLS(train_target, train_features_scaled_new2)
ols_res_3 = ols_model_3.fit()
print(ols_res_3.summary())
OLS Regression Results ============================================================================== Dep. Variable: Item_Outlet_Sales R-squared: 0.563 Model: OLS Adj. R-squared: 0.562 Method: Least Squares F-statistic: 994.9 Date: Sat, 04 May 2024 Prob (F-statistic): 0.00 Time: 17:04:33 Log-Likelihood: -72000. No. Observations: 8523 AIC: 1.440e+05 Df Residuals: 8511 BIC: 1.441e+05 Df Model: 11 Covariance Type: nonrobust ================================================================================================= coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------------------------- const -1278.2339 127.920 -9.992 0.000 -1528.989 -1027.479 Item_Weight -16.9933 48.352 -0.351 0.725 -111.775 77.789 Item_Visibility -94.5856 81.413 -1.162 0.245 -254.174 65.003 Item_MRP 3666.7427 46.303 79.190 0.000 3575.978 3757.508 Item_Fat_Content_Regular 52.1464 25.643 2.034 0.042 1.880 102.412 Outlet_Size_Medium 48.6784 86.389 0.563 0.573 -120.665 218.022 Outlet_Size_Small -49.2563 86.387 -0.570 0.569 -218.595 120.082 Outlet_Location_Type_Tier 2 60.4077 42.776 1.412 0.158 -23.444 144.260 Outlet_Location_Type_Tier 3 -17.9079 68.735 -0.261 0.794 -152.645 116.829 Outlet_Type_Supermarket Type1 1888.6126 62.734 30.105 0.000 1765.639 2011.586 Outlet_Type_Supermarket Type2 1532.3723 80.740 18.979 0.000 1374.103 1690.642 Outlet_Type_Supermarket Type3 3258.6332 80.807 40.326 0.000 3100.232 3417.034 ============================================================================== Omnibus: 961.665 Durbin-Watson: 2.003 Prob(Omnibus): 0.000 Jarque-Bera (JB): 2292.659 Skew: 0.668 Prob(JB): 0.00 Kurtosis: 5.161 Cond. No. 25.6 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Now, from the p-values, we can see that we can remove the Item_Weight column, as it has the highest p-value, i.e., it is the least significant variable.
train_features_scaled_new3 = train_features_scaled_new2.drop("Item_Weight", axis = 1)
vif_series = pd.Series(
[variance_inflation_factor(train_features_scaled_new3.values, i) for i in range(train_features_scaled_new3.shape[1])],
index = train_features_scaled_new3.columns,
dtype = float)
print("VIF Scores: \n\n{}\n".format(vif_series))
VIF Scores: const 106.905754 Item_Visibility 1.093065 Item_MRP 1.000167 Item_Fat_Content_Regular 1.002641 Outlet_Size_Medium 10.977215 Outlet_Size_Small 12.259457 Outlet_Location_Type_Tier 2 2.689448 Outlet_Location_Type_Tier 3 7.519408 Outlet_Type_Supermarket Type1 5.938545 Outlet_Type_Supermarket Type2 4.225983 Outlet_Type_Supermarket Type3 4.255095 dtype: float64
Let's build the model again.
ols_model_4 = sm.OLS(train_target, train_features_scaled_new3)
ols_res_4 = ols_model_4.fit()
print(ols_res_4.summary())
OLS Regression Results ============================================================================== Dep. Variable: Item_Outlet_Sales R-squared: 0.563 Model: OLS Adj. R-squared: 0.562 Method: Least Squares F-statistic: 1095. Date: Sat, 04 May 2024 Prob (F-statistic): 0.00 Time: 17:04:37 Log-Likelihood: -72000. No. Observations: 8523 AIC: 1.440e+05 Df Residuals: 8512 BIC: 1.441e+05 Df Model: 10 Covariance Type: nonrobust ================================================================================================= coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------------------------- const -1284.9327 126.486 -10.159 0.000 -1532.875 -1036.990 Item_Visibility -94.3088 81.405 -1.159 0.247 -253.882 65.264 Item_MRP 3666.3534 46.287 79.209 0.000 3575.619 3757.088 Item_Fat_Content_Regular 52.3215 25.637 2.041 0.041 2.068 102.575 Outlet_Size_Medium 47.8315 86.351 0.554 0.580 -121.437 217.100 Outlet_Size_Small -50.0483 86.353 -0.580 0.562 -219.321 119.224 Outlet_Location_Type_Tier 2 60.5062 42.773 1.415 0.157 -23.340 144.352 Outlet_Location_Type_Tier 3 -18.8403 68.680 -0.274 0.784 -153.470 115.790 Outlet_Type_Supermarket Type1 1887.7630 62.684 30.116 0.000 1764.887 2010.639 Outlet_Type_Supermarket Type2 1532.5026 80.735 18.982 0.000 1374.243 1690.762 Outlet_Type_Supermarket Type3 3259.6997 80.746 40.370 0.000 3101.419 3417.981 ============================================================================== Omnibus: 961.830 Durbin-Watson: 2.003 Prob(Omnibus): 0.000 Jarque-Bera (JB): 2292.698 Skew: 0.668 Prob(JB): 0.00 Kurtosis: 5.161 Cond. No. 24.5 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
From the above p-values, both the categories of the column Outlet_Location_Type have a p-value higher than 0.05. So, we can remove the categories of Outlet_Location_Type.
train_features_scaled_new4 = train_features_scaled_new3.drop(["Outlet_Location_Type_Tier 2", "Outlet_Location_Type_Tier 3"], axis = 1)
vif_series = pd.Series(
[variance_inflation_factor(train_features_scaled_new4.values, i) for i in range(train_features_scaled_new4.shape[1])],
index = train_features_scaled_new4.columns,
dtype = float)
print("VIF Scores: \n\n{}\n".format(vif_series))
VIF Scores: const 27.169808 Item_Visibility 1.092348 Item_MRP 1.000143 Item_Fat_Content_Regular 1.002605 Outlet_Size_Medium 4.034011 Outlet_Size_Small 2.814532 Outlet_Type_Supermarket Type1 2.478493 Outlet_Type_Supermarket Type2 2.842880 Outlet_Type_Supermarket Type3 2.863427 dtype: float64
ols_model_5 = sm.OLS(train_target, train_features_scaled_new4)
ols_res_5 = ols_model_5.fit()
print(ols_res_5.summary())
OLS Regression Results ============================================================================== Dep. Variable: Item_Outlet_Sales R-squared: 0.562 Model: OLS Adj. R-squared: 0.562 Method: Least Squares F-statistic: 1368. Date: Sat, 04 May 2024 Prob (F-statistic): 0.00 Time: 17:04:39 Log-Likelihood: -72001. No. Observations: 8523 AIC: 1.440e+05 Df Residuals: 8514 BIC: 1.441e+05 Df Model: 8 Covariance Type: nonrobust ================================================================================================= coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------------------------- const -1358.8933 63.766 -21.311 0.000 -1483.889 -1233.897 Item_Visibility -93.3375 81.378 -1.147 0.251 -252.859 66.184 Item_MRP 3666.0517 46.287 79.203 0.000 3575.318 3756.785 Item_Fat_Content_Regular 52.1432 25.636 2.034 0.042 1.890 102.396 Outlet_Size_Medium 66.6696 52.347 1.274 0.203 -35.943 169.282 Outlet_Size_Small 14.1489 41.376 0.342 0.732 -66.958 95.255 Outlet_Type_Supermarket Type1 1942.9093 40.496 47.978 0.000 1863.527 2022.291 Outlet_Type_Supermarket Type2 1568.8091 66.218 23.692 0.000 1439.005 1698.613 Outlet_Type_Supermarket Type3 3296.0104 66.238 49.760 0.000 3166.167 3425.854 ============================================================================== Omnibus: 961.728 Durbin-Watson: 2.002 Prob(Omnibus): 0.000 Jarque-Bera (JB): 2296.342 Skew: 0.667 Prob(JB): 0.00 Kurtosis: 5.164 Cond. No. 13.3 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
From the above p-values, both the categories of the column Outlet_Size have a p-value higher than 0.05. So, we can remove the categories of Outlet_Size.
train_features_scaled_new5 = train_features_scaled_new4.drop(["Outlet_Size_Small", "Outlet_Size_Medium"], axis=1)
vif_series = pd.Series(
[variance_inflation_factor(train_features_scaled_new5.values, i) for i in range(train_features_scaled_new5.shape[1])],
index = train_features_scaled_new5.columns,
dtype = float)
print("VIF Scores: \n\n{}\n".format(vif_series))
VIF Scores: const 15.813401 Item_Visibility 1.092314 Item_MRP 1.000114 Item_Fat_Content_Regular 1.002581 Outlet_Type_Supermarket Type1 2.306663 Outlet_Type_Supermarket Type2 1.731428 Outlet_Type_Supermarket Type3 1.744734 dtype: float64
ols_model_6 = sm.OLS(train_target, train_features_scaled_new5)
ols_res_6 = ols_model_6.fit()
print(ols_res_6.summary())
OLS Regression Results ============================================================================== Dep. Variable: Item_Outlet_Sales R-squared: 0.562 Model: OLS Adj. R-squared: 0.562 Method: Least Squares F-statistic: 1824. Date: Sat, 04 May 2024 Prob (F-statistic): 0.00 Time: 17:04:41 Log-Likelihood: -72002. No. Observations: 8523 AIC: 1.440e+05 Df Residuals: 8516 BIC: 1.441e+05 Df Model: 6 Covariance Type: nonrobust ================================================================================================= coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------------------------- const -1344.7104 48.647 -27.642 0.000 -1440.070 -1249.351 Item_Visibility -93.1427 81.377 -1.145 0.252 -252.661 66.376 Item_MRP 3665.7108 46.286 79.197 0.000 3574.979 3756.443 Item_Fat_Content_Regular 52.3194 25.636 2.041 0.041 2.067 102.572 Outlet_Type_Supermarket Type1 1949.3298 39.067 49.897 0.000 1872.749 2025.911 Outlet_Type_Supermarket Type2 1621.3566 51.677 31.375 0.000 1520.057 1722.657 Outlet_Type_Supermarket Type3 3348.5571 51.705 64.763 0.000 3247.203 3449.911 ============================================================================== Omnibus: 960.314 Durbin-Watson: 2.002 Prob(Omnibus): 0.000 Jarque-Bera (JB): 2290.252 Skew: 0.667 Prob(JB): 0.00 Kurtosis: 5.161 Cond. No. 10.7 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Finally, let's drop Item_Visibility as well, since its p-value (0.252) is still above 0.05.
train_features_scaled_new6 = train_features_scaled_new5.drop("Item_Visibility", axis = 1)
vif_series = pd.Series(
[variance_inflation_factor(train_features_scaled_new6.values, i) for i in range(train_features_scaled_new6.shape[1])],
index = train_features_scaled_new6.columns,
dtype = float)
print("VIF Scores: \n\n{}\n".format(vif_series))
VIF Scores:

const                            11.452496
Item_MRP                          1.000114
Item_Fat_Content_Regular          1.000048
Outlet_Type_Supermarket Type1     2.125686
Outlet_Type_Supermarket Type2     1.654765
Outlet_Type_Supermarket Type3     1.658941
dtype: float64
ols_model_7 = sm.OLS(train_target, train_features_scaled_new6)
ols_res_7 = ols_model_7.fit()
print(ols_res_7.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:      Item_Outlet_Sales   R-squared:                       0.562
Model:                            OLS   Adj. R-squared:                  0.562
Method:                 Least Squares   F-statistic:                     2188.
Date:                Sat, 04 May 2024   Prob (F-statistic):               0.00
Time:                        17:04:46   Log-Likelihood:                -72003.
No. Observations:                8523   AIC:                         1.440e+05
Df Residuals:                    8517   BIC:                         1.441e+05
Df Model:                           5
Covariance Type:            nonrobust
=================================================================================================
                                   coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
const                        -1373.9504     41.400    -33.187      0.000   -1455.104   -1292.796
Item_MRP                      3665.7374     46.287     79.196      0.000    3575.004    3756.471
Item_Fat_Content_Regular        50.8447     25.604      1.986      0.047       0.655     101.035
Outlet_Type_Supermarket Type1 1961.8549     37.504     52.311      0.000    1888.338    2035.371
Outlet_Type_Supermarket Type2 1633.8029     50.521     32.339      0.000    1534.769    1732.837
Outlet_Type_Supermarket Type3 3361.6803     50.418     66.676      0.000    3262.848    3460.513
==============================================================================
Omnibus:                      962.316   Durbin-Watson:                   2.002
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             2299.500
Skew:                           0.668   Prob(JB):                         0.00
Kurtosis:                       5.166   Cond. No.                         8.43
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Observations:
- All the VIF scores (excluding the constant) are now below 5, indicating no multicollinearity among the predictors.
- All the p-values are now less than 0.05, implying that all the remaining variables are significant for the model.
- The R-squared value did not change by much; it is still ~0.56, which implies the dropped variables were not adding value to the model.
Let's check the assumptions of the linear regression model.
Checking for the assumptions and rebuilding the model¶
In this step, we will check whether the below assumptions hold true or not for the model. In case there is an issue, we will rebuild the model after fixing those issues.
- Mean of residuals should be 0
- Normality of error terms
- Linearity of variables
- No heteroscedasticity
Mean of residuals should be 0 and normality of error terms¶
# Residuals
residual = ols_res_7.resid
residual.mean()
8.910764469886965e-12
- The mean of residuals is very close to 0. Hence, the corresponding assumption is satisfied.
Tests for Normality¶
What is the test?
Error terms/Residuals should be normally distributed.
If the error terms are not normally distributed, confidence intervals for the coefficients may become too wide or too narrow, which makes inference based on the least-squares estimates unreliable.
What does non-normality indicate?
- It suggests that there are a few unusual data points that must be studied closely to make a better model.
How to check the normality?
We can plot the histogram of residuals and check the distribution visually.
It can also be checked via a Q-Q plot: residuals that follow a normal distribution will lie approximately along a straight line; otherwise they will deviate from it.
Another test to check for normality is the Shapiro-Wilk test.
What if the residuals are not-normal?
- We can apply transformations like log, exponential, arcsinh, etc. as per our data.
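The sketch below is not part of the original notebook; it shows one way the Q-Q plot and the Shapiro-Wilk test mentioned above could be applied to the residuals (assuming scipy is available alongside the libraries already imported).

# Supplementary sketch: Q-Q plot and Shapiro-Wilk test on the residuals
import scipy.stats as stats
sm.qqplot(residual, line = "45", fit = True)   # points close to the 45-degree line suggest normality
plt.title("Q-Q Plot of Residuals")
plt.show()
shapiro_stat, shapiro_p = stats.shapiro(residual)
print("Shapiro-Wilk statistic: {:.4f}, p-value: {:.4f}".format(shapiro_stat, shapiro_p))
# Note: with more than ~5000 observations, the Shapiro-Wilk p-value is only approximate
# and the test becomes very sensitive to small deviations from normality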
# Plot histogram of residuals
sns.histplot(residual, kde = True)
We can see that the error terms are approximately normally distributed, so the assumption of normality is reasonably satisfied.
Linearity of Variables¶
It states that the predictor variables must have a linear relation with the dependent variable.
To test this assumption, we'll plot the residuals against the fitted values and ensure that the residuals do not form a strong pattern; they should be randomly and uniformly scattered around the x-axis.
# Predicted values
fitted = ols_res_7.fittedvalues
sns.residplot(x = fitted, y = residual, color = "lightblue")
plt.xlabel("Fitted Values")
plt.ylabel("Residual")
plt.title("Residual PLOT")
plt.show()
Observations:
- We can see that there is some pattern in fitted values and residuals, i.e., the residuals are not randomly distributed.
- Let's try to fix this. We can apply the log transformation on the target variable and try to build a new model.
# Log transformation on the target variable
train_target_log = np.log(train_target)
# Fitting new model with the transformed target variable
ols_model_7 = sm.OLS(train_target_log, train_features_scaled_new6)
ols_res_7 = ols_model_7.fit()
# Predicted values
fitted = ols_res_7.fittedvalues
residual1 = ols_res_7.resid
sns.residplot(x = fitted, y = residual1, color = "lightblue")
plt.xlabel("Fitted Values")
plt.ylabel("Residual")
plt.title("Residual PLOT")
plt.show()
Observations:
- We can see that there is no pattern in the residuals vs fitted values scatter plot now, i.e., the linearity assumption is satisfied.
- Let's check the model summary of the latest model we have fit.
print(ols_res_7.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:      Item_Outlet_Sales   R-squared:                       0.720
Model:                            OLS   Adj. R-squared:                  0.720
Method:                 Least Squares   F-statistic:                     4375.
Date:                Sat, 04 May 2024   Prob (F-statistic):               0.00
Time:                        17:05:10   Log-Likelihood:                -6816.7
No. Observations:                8523   AIC:                         1.365e+04
Df Residuals:                    8517   BIC:                         1.369e+04
Df Model:                           5
Covariance Type:            nonrobust
=================================================================================================
                                   coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
const                            4.6356      0.020    234.794      0.000       4.597       4.674
Item_MRP                         1.9555      0.022     88.588      0.000       1.912       1.999
Item_Fat_Content_Regular         0.0158      0.012      1.293      0.196      -0.008       0.040
Outlet_Type_Supermarket Type1    1.9550      0.018    109.305      0.000       1.920       1.990
Outlet_Type_Supermarket Type2    1.7737      0.024     73.618      0.000       1.726       1.821
Outlet_Type_Supermarket Type3    2.4837      0.024    103.297      0.000       2.437       2.531
==============================================================================
Omnibus:                      829.137   Durbin-Watson:                   2.007
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             1164.564
Skew:                          -0.775   Prob(JB):                    1.31e-253
Kurtosis:                       3.937   Cond. No.                         8.43
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
- The model performance has improved significantly: the R-squared has increased from 0.562 to 0.720 (now measured on the log-transformed target).
Let's check the final assumption of homoscedasticity.
No Heteroscedasticity¶
Test for Homoscedasticity¶
Homoscedasticity - If the variance of the residuals is roughly constant across the range of fitted values, i.e., symmetrically spread around the regression line, the data is said to be homoscedastic.
Heteroscedasticity - If the variance of the residuals is unequal across the range of fitted values, the data is said to be heteroscedastic. In this case, the residuals typically form a funnel or some other non-symmetrical shape.
We will use the Goldfeld-Quandt test to check for homoscedasticity.
Null hypothesis: Residuals are homoscedastic
Alternate hypothesis: Residuals are heteroscedastic
# Importing libraries for the homoscedasticity test
import statsmodels.stats.api as sms
from statsmodels.compat import lzip
name = ["F statistic", "p-value"]
test = sms.het_goldfeldquandt(train_target_log, train_features_scaled_new6)
lzip(name, test)
[('F statistic', 0.9395156175145152), ('p-value', 0.9790604597916552)]
- Since the p-value of the above test is greater than 0.05, we fail to reject the null hypothesis, i.e., the residuals are homoscedastic.
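As an optional cross-check that is not part of the original analysis, a Breusch-Pagan test can be run as well (sketch below, reusing residual1 and train_features_scaled_new6 from above); a p-value above 0.05 would again point towards homoscedasticity.

# Optional cross-check (sketch): Breusch-Pagan test for heteroscedasticity
bp_names = ["LM statistic", "LM p-value", "F statistic", "F p-value"]
bp_test = sms.het_breuschpagan(residual1, train_features_scaled_new6)
print(lzip(bp_names, bp_test))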
We have verified all the assumptions of the linear regression model. The final equation of the model is as follows:
$\log(\text{Item\_Outlet\_Sales}) = 4.6356 + 1.9555 \times \text{Item\_MRP} + 0.0158 \times \text{Item\_Fat\_Content\_Regular} + 1.9550 \times \text{Outlet\_Type\_Supermarket Type1} + 1.7737 \times \text{Outlet\_Type\_Supermarket Type2} + 2.4837 \times \text{Outlet\_Type\_Supermarket Type3}$
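As a small illustrative sketch (assuming ols_res_7 above is the final log-scale model), the same equation can be reconstructed programmatically from the estimated parameters:

# Build the fitted equation string directly from the model parameters (sketch)
coef = ols_res_7.params
terms = ["{:.4f}".format(coef["const"])] + [
    "{:.4f} * {}".format(value, name) for name, value in coef.drop("const").items()
]
print("log(Item_Outlet_Sales) = " + " + ".join(terms))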
Now, let's make the final test predictions.
without_const = train_features_scaled.iloc[:, 1:]
test_features = pd.get_dummies(test_df, drop_first = True)
test_features = test_features[list(without_const)]
# Applying transform on the test data
test_features_scaled = scaler.transform(test_features)
test_features_scaled = pd.DataFrame(test_features_scaled, columns = without_const.columns)
test_features_scaled = sm.add_constant(test_features_scaled)
test_features_scaled = test_features_scaled.drop(["Item_Weight", "Item_Visibility", "Item_Type_Breads", "Item_Type_Breakfast", "Item_Type_Canned", "Item_Type_Dairy","Item_Type_Frozen Foods","Item_Type_Fruits and Vegetables", "Item_Type_Hard Drinks", "Item_Type_Health and Hygiene", "Item_Type_Household", "Item_Type_Meat", "Item_Type_Others", "Item_Type_Seafood", "Item_Type_Snack Foods", "Item_Type_Soft Drinks", "Item_Type_Starchy Foods", "Outlet_Size_Medium", "Outlet_Size_Small", "Outlet_Location_Type_Tier 2", "Outlet_Location_Type_Tier 3", 'Outlet_Age'], axis = 1)
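An equivalent and arguably less error-prone alternative (a sketch assuming train_features_scaled_new6 holds the final training feature set) is to select the model's columns directly instead of listing every column to drop:

# Alternative (sketch): keep only the columns the final model was trained on
test_features_scaled = test_features_scaled[train_features_scaled_new6.columns]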
test_features_scaled.head()
const | Item_MRP | Item_Fat_Content_Regular | Outlet_Type_Supermarket Type1 | Outlet_Type_Supermarket Type2 | Outlet_Type_Supermarket Type3 | |
---|---|---|---|---|---|---|
0 | 1.0 | 0.325012 | 0.0 | 1.0 | 0.0 | 0.0 |
1 | 1.0 | 0.237819 | 1.0 | 1.0 | 0.0 | 0.0 |
2 | 1.0 | 0.893316 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 1.0 | 0.525233 | 0.0 | 1.0 | 0.0 | 0.0 |
4 | 1.0 | 0.861381 | 1.0 | 0.0 | 0.0 | 1.0 |
Evaluation Metrics¶
R-Squared¶
The R-squared metric gives an indication of how much better our model is than a baseline model with no independent variables (one that always predicts the mean). Here, the model explains ~72% of the variance in the log-transformed target compared to that baseline.
print(ols_res_7.rsquared)
0.71975057509795
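As a quick sanity check (a sketch reusing residual1 and train_target_log from above), the same value can be computed from the definition of R-squared:

# R-squared computed manually: 1 - (residual sum of squares / total sum of squares)
ss_res = np.sum(residual1 ** 2)
ss_tot = np.sum((train_target_log - train_target_log.mean()) ** 2)
print(1 - ss_res / ss_tot)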
Mean Squared Error¶
This metric measures the average of the squared errors, i.e., the average squared difference between the estimated values and the actual values. Since the target here is log-transformed, this MSE is on the log scale.
print(ols_res_7.mse_resid)
0.29009080646817986
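Note that mse_resid divides the residual sum of squares by the residual degrees of freedom (number of observations minus number of estimated parameters), not by the number of observations; the sketch below (reusing residual1) reproduces the value above:

# mse_resid = residual sum of squares / residual degrees of freedom (sketch)
print(np.sum(residual1 ** 2) / ols_res_7.df_resid)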
Root Mean Squared Error¶
This metric is the square root of the MSE, which brings the error back into the same units as the target variable (here, the log of sales).
print(np.sqrt(ols_res_7.mse_resid))
0.5386007858035299
Below, we check the cross-validation scores to identify whether the model we have built is underfitted, overfitted, or a good fit.
# Fitting linear model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
linearregression = LinearRegression()
cv_Score11 = cross_val_score(linearregression, train_features_scaled_new6, train_target_log, cv = 10)
cv_Score12 = cross_val_score(linearregression, train_features_scaled_new6, train_target_log, cv = 10,
scoring = 'neg_mean_squared_error')
print("RSquared: %0.3f (+/- %0.3f)" % (cv_Score11.mean(), cv_Score11.std()*2))
print("Mean Squared Error: %0.3f (+/- %0.3f)" % (-1*cv_Score12.mean(), cv_Score12.std()*2))
RSquared: 0.718 (+/- 0.049)
Mean Squared Error: 0.290 (+/- 0.030)
Observations:
- The R-squared on cross-validation is 0.718, which is very close to the R-squared on the training dataset (0.720).
- The MSE on cross-validation is 0.290, which is very close to the MSE on the training dataset (0.290).
It seems that our model is a good fit: it gives a generalized performance.
Since the model we have developed is a linear model, it is not capable of capturing non-linear patterns in the data. We could build a more advanced regression model to capture such non-linearities and improve performance further, but that is out of the scope of this case study.
Predictions on the Test Dataset¶
Once our model is built and validated, we can now use this to predict the sales in our test data as shown below:
# These test predictions will be on a log scale
test_predictions = ols_res_7.predict(test_features_scaled)
# We are converting the log scale predictions to its original scale
test_predictions_inverse_transformed = np.exp(test_predictions)
test_predictions_inverse_transformed
0       1374.895044
1       1177.811420
2        591.395560
3       2033.800973
4       6765.005719
            ...
5676    1843.793296
5677    1937.766915
5678    1504.855760
5679    3388.112463
5680    1106.508932
Length: 5681, dtype: float64
Point to remember: The output of this model is on the log scale. So, after making predictions, we need to transform the values from the log scale back to the original scale by applying the inverse of the log transformation, i.e., exponentiation.
fig, ax = plt.subplots(1, 2, figsize = (24, 12))
sns.histplot(test_predictions, ax = ax[0]);
sns.histplot(test_predictions_inverse_transformed, ax = ax[1]);
Conclusions and Recommendations¶
- We performed EDA, univariate and bivariate analysis, on all the variables in the dataset.
- We then performed missing values treatment using the relationship between variables.
- We started the model building process with all the features.
- We removed multicollinearity from the data and analyzed the model summary report to drop insignificant features.
- We checked for different assumptions of linear regression and fixed the model iteratively if any assumptions did not hold true.
- Finally, we evaluated the model using different evaluation metrics.
Lastly, below is the final model equation:
$\log(\text{Item\_Outlet\_Sales}) = 4.6356 + 1.9555 \times \text{Item\_MRP} + 0.0158 \times \text{Item\_Fat\_Content\_Regular} + 1.9550 \times \text{Outlet\_Type\_Supermarket Type1} + 1.7737 \times \text{Outlet\_Type\_Supermarket Type2} + 2.4837 \times \text{Outlet\_Type\_Supermarket Type3}$
From the above equation, we can interpret that a one-unit increase in Item_MRP (which, after min-max scaling, corresponds to moving from the lowest to the highest MRP) increases the log of Item_Outlet_Sales by 1.9555 units, holding other factors constant. So, if we want to increase sales, we may want to place higher-MRP items in high-visibility areas.
On average, keeping other factors constant, the expected log sales of Supermarket Type 3 stores are higher than those of Type 2 stores by 0.71 (2.4837 − 1.7737) and higher than those of Type 1 stores by 0.53 (2.4837 − 1.9550); on the original scale, this corresponds to sales roughly 2.0 times and 1.7 times higher, respectively.
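Because the model is on the log scale, coefficients translate into multiplicative effects on sales via exponentiation; the small sketch below illustrates the numbers used above:

# Converting log-scale coefficients into multiplicative effects on sales (sketch)
print(np.exp(1.9555))           # ~7.1: moving scaled Item_MRP from its minimum to its maximum
print(np.exp(2.4837 - 1.7737))  # ~2.0: Supermarket Type 3 vs Type 2, other factors constant
print(np.exp(2.4837 - 1.9550))  # ~1.7: Supermarket Type 3 vs Type 1, other factors constant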
After interpreting the regression equation, it is clear that Supermarket Type 3 outlets have higher sales than the other outlet types. So, we want to maintain or improve sales in these stores, and for the remaining ones we may want to devise strategies to improve sales, for example, providing better customer service, better training for store staff, and more visibility for high-MRP items.
# Convert notebook to html
!jupyter nbconvert --to html "/content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week Four - Regression and Prediction/Mentor Learning Session/BigMart Sales Prediction/Case_Study_BigMart_Sales_Prediction.ipynb"