# Libraries to help with reading and manipulating data
import pandas as pd
import numpy as np

# Libaries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# To tune model, get different metric scores, and split data
from Scikit-learn import metrics
from Scikit-learn.metrics import r2_score, mean_squared_error, mean_absolute_error

from Scikit-learn.model_selection import train_test_split, StratifiedKFold, cross_val_score

# To be used for data scaling and one hot encoding
from Scikit-learn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

# To impute missing values
from Scikit-learn.impute import SimpleImputer

# To do hyperparameter tuning
from Scikit-learn.model_selection import RandomizedSearchCV

# To be used for creating pipelines and personalizing them
from Scikit-learn.pipeline import Pipeline
from Scikit-learn.compose import ColumnTransformer

# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)

# To suppress scientific notations for a dataframe
pd.set_option("display.float_format", lambda x: "%.3f" % x)

# To help with model building
from Scikit-learn.tree import DecisionTreeRegressor
from Scikit-learn.ensemble import (
    BaggingRegressor,
    RandomForestRegressor,
    GradientBoostingRegressor,
    AdaBoostRegressor,
    StackingRegressor,
)
from xgboost import XGBRegressor


# To suppress scientific notations
pd.set_option("display.float_format", lambda x: "%.3f" % x)

# To suppress warnings
import warnings

warnings.filterwarnings("ignore")

# This will help in making the Python code more structured automatically (good coding practice)
#%load_ext nb_black

# Connect to google
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive

data = pd.read_csv("/content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week_Five_-_Classification_and_Hypothesis_Testing/KC_Roasters_Coffee_Quality_Prediction/Roasters.csv")

# Checking the number of rows and columns in the training data
data.shape

(29131, 18)

# let's view the first 5 rows of the data
data.head()

# let's view the last 5 rows of the data
data.tail()

# let's check for duplicate values in the data
data.duplicated().sum()

0

# let's check for missing values in the data
round(data.isnull().sum() / data.isnull().count() * 100, 2)

# let's check the data types of the columns in the dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29131 entries, 0 to 29130
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   T_data_1_1  29131 non-null  int64  
 1   T_data_1_2  29131 non-null  int64  
 2   T_data_1_3  29131 non-null  int64  
 3   T_data_2_1  29131 non-null  int64  
 4   T_data_2_2  29131 non-null  int64  
 5   T_data_2_3  29131 non-null  int64  
 6   T_data_3_1  29131 non-null  int64  
 7   T_data_3_2  29131 non-null  int64  
 8   T_data_3_3  29131 non-null  int64  
 9   T_data_4_1  29131 non-null  int64  
 10  T_data_4_2  29131 non-null  int64  
 11  T_data_4_3  29131 non-null  int64  
 12  T_data_5_1  29131 non-null  int64  
 13  T_data_5_2  29131 non-null  int64  
 14  T_data_5_3  29131 non-null  int64  
 15  H_data      29087 non-null  float64
 16  AH_data     29100 non-null  float64
 17  quality     29131 non-null  int64  
dtypes: float64(2), int64(16)
memory usage: 4.0 MB

# let's view the statistical summary of the numerical columns in the data
data.describe().T

# Creating the copy of the dataframe
df = data.copy()

# Function to plot a boxplot and a histogram along the same scale.


def histogram_boxplot(data, feature, figsize=(12, 7), kde=False, bins=None):
    """
    Boxplot and histogram combined

    data: dataframe
    feature: dataframe column
    figsize: size of figure (default (12,7))
    kde: whether to the show density curve (default False)
    bins: number of bins for histogram (default None)
    """
    f2, (ax_box2, ax_hist2) = plt.subplots(
        nrows=2,  # Number of rows of the subplot grid = 2
        sharex=True,  # X-axis will be shared among all subplots
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )  # creating the 2 subplots
    sns.boxplot(
        data=data, x=feature, ax=ax_box2, showmeans=True, color="violet"
    )  # boxplot will be created and a triangle will indicate the mean value of the column
    sns.histplot(
        data=data, x=feature, kde=kde, ax=ax_hist2, bins=bins, palette="winter"
    ) if bins else sns.histplot(
        data=data, x=feature, kde=kde, ax=ax_hist2
    )  # For histogram
    ax_hist2.axvline(
        data[feature].mean(), color="green", linestyle="--"
    )  # Add mean to the histogram
    ax_hist2.axvline(
        data[feature].median(), color="black", linestyle="-"
    )  # Add median to the histogram

# Observations on T_data_1_1
histogram_boxplot(df, "T_data_1_1", figsize=(12, 7), kde=False, bins=None)

# Observations on T_data_1_2
histogram_boxplot(df, "T_data_1_2", figsize=(12, 7), kde=False, bins=None)

# Observations on T_data_1_3
histogram_boxplot(df, "T_data_1_3", figsize=(12, 7), kde=False, bins=None)

# Minimum value for 2nd sensor is 168 and 183 for 3rd sensor, so we will replace values less than 168 in first sensor with 168
df["T_data_1_1"].clip(lower=168, inplace=True)

# Observations on T_data_2_1
histogram_boxplot(df, "T_data_2_1", figsize=(12, 7), kde=False, bins=None)

# Observations on T_data_2_2
histogram_boxplot(df, "T_data_2_2", figsize=(12, 7), kde=False, bins=None)

# Observations on T_data_2_3
histogram_boxplot(df, "T_data_2_3", figsize=(12, 7), kde=False, bins=None)

# Minimum value for 2nd sensor is 113 and 107 for 3rd sensor, so we will replace values less than 168 in first sensor with 107
df["T_data_2_1"].clip(lower=107, inplace=True)

# Observations on T_data_3_1
histogram_boxplot(df, "T_data_3_1", figsize=(12, 7), kde=False, bins=None)

# Observations on T_data_3_2
histogram_boxplot(df, "T_data_3_2", figsize=(12, 7), kde=False, bins=None)

# Observations on T_data_3_3
histogram_boxplot(df, "T_data_3_3", figsize=(12, 7), kde=False, bins=None)

# Observations on T_data_4_1
histogram_boxplot(df, "T_data_4_1", figsize=(12, 7), kde=False, bins=None)

# Observations on T_data_4_2
histogram_boxplot(df, "T_data_4_2", figsize=(12, 7), kde=False, bins=None)

# Observations on T_data_4_3
histogram_boxplot(df, "T_data_4_3", figsize=(12, 7), kde=False, bins=None)

# Observations on T_data_5_1
histogram_boxplot(df, "T_data_5_1", figsize=(12, 7), kde=False, bins=None)

# Observations on T_data_5_2
histogram_boxplot(df, "T_data_5_2", figsize=(12, 7), kde=False, bins=None)

# Observations on T_data_5_3
histogram_boxplot(df, "T_data_5_3", figsize=(12, 7), kde=False, bins=None)

# Observations on H_data
histogram_boxplot(df, "H_data", figsize=(12, 7), kde=False, bins=None)

# Observations on AH_data
histogram_boxplot(df, "AH_data", figsize=(12, 7), kde=False, bins=None)

# Observations on quality
histogram_boxplot(df, "quality", figsize=(12, 7), kde=False, bins=None)

sns.set(rc={"figure.figsize": (8, 4)})

# Quality vs AH_data
sns.scatterplot(data=df, x="quality", y="AH_data")

<Axes: xlabel='quality', ylabel='AH_data'>

# Quality vs H_data
sns.scatterplot(data=df, x="quality", y="H_data")

<Axes: xlabel='quality', ylabel='H_data'>

# quality vs temp in 1st chamber

fig = plt.figure(figsize = (20,15))
fig.subplots_adjust(hspace=0.4, wspace=0.4)

ax = fig.add_subplot(2, 3, 1)
sns.scatterplot(data=df, x="quality", y="T_data_1_1")

ax = fig.add_subplot(2, 3, 2)
sns.scatterplot(data=df, x="quality", y="T_data_1_2")

ax = fig.add_subplot(2, 3, 3)
sns.scatterplot(data=df, x="quality", y="T_data_1_3")

<Axes: xlabel='quality', ylabel='T_data_1_3'>

# quality vs temp in 2nd chamber

fig = plt.figure(figsize = (20,15))
fig.subplots_adjust(hspace=0.4, wspace=0.4)

ax = fig.add_subplot(2, 3, 1)
sns.scatterplot(data=df, x="quality", y="T_data_2_1")

ax = fig.add_subplot(2, 3, 2)
sns.scatterplot(data=df, x="quality", y="T_data_2_2")

ax = fig.add_subplot(2, 3, 3)
sns.scatterplot(data=df, x="quality", y="T_data_2_3")

<Axes: xlabel='quality', ylabel='T_data_2_3'>

# quality vs temp in 3rd chamber

fig = plt.figure(figsize = (20,15))
fig.subplots_adjust(hspace=0.4, wspace=0.4)

ax = fig.add_subplot(2, 3, 1)
sns.scatterplot(data=df, x="quality", y="T_data_3_1")

ax = fig.add_subplot(2, 3, 2)
sns.scatterplot(data=df, x="quality", y="T_data_3_2")

ax = fig.add_subplot(2, 3, 3)
sns.scatterplot(data=df, x="quality", y="T_data_3_3")

<Axes: xlabel='quality', ylabel='T_data_3_3'>

# quality vs temp in 4th chamber

fig = plt.figure(figsize = (20,15))
fig.subplots_adjust(hspace=0.4, wspace=0.4)

ax = fig.add_subplot(2, 3, 1)
sns.scatterplot(data=df, x="quality", y="T_data_4_1")

ax = fig.add_subplot(2, 3, 2)
sns.scatterplot(data=df, x="quality", y="T_data_4_2")

ax = fig.add_subplot(2, 3, 3)
sns.scatterplot(data=df, x="quality", y="T_data_4_3")

<Axes: xlabel='quality', ylabel='T_data_4_3'>

# quality vs temp in 5th chamber

fig = plt.figure(figsize = (20,15))
fig.subplots_adjust(hspace=0.4, wspace=0.4)

ax = fig.add_subplot(2, 3, 1)
sns.scatterplot(data=df, x="quality", y="T_data_5_1")

ax = fig.add_subplot(2, 3, 2)
sns.scatterplot(data=df, x="quality", y="T_data_5_2")

ax = fig.add_subplot(2, 3, 3)
sns.scatterplot(data=df, x="quality", y="T_data_5_3")

<Axes: xlabel='quality', ylabel='T_data_5_3'>

# Correlation matrix

sns.set(rc={"figure.figsize": (16, 10)})
sns.heatmap(
    df.corr(), annot=True, linewidths=0.5, center=0, cbar=False, cmap="Spectral"
)
plt.show()

# Creating the copy of the dataframe
df1 = df.copy()

# Dividing train data into X and y

X = df1.drop(["quality"], axis=1)
y = df1["quality"]

# Splitting data into training and validation set:

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=1)

X_test, X_val, y_test, y_val = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=1
)

print(X_train.shape, X_val.shape, X_test.shape)

(17478, 17) (5827, 17) (5826, 17)

imputer = SimpleImputer(strategy="median")

# Fit and transform the train data
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)

# Transform the validation data
X_val = pd.DataFrame(imputer.transform(X_val), columns=X_train.columns)

# Transform the test data
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_train.columns)

# Checking that no column has missing values in train or test sets
print(X_train.isna().sum())
print("-" * 30)
print(X_val.isna().sum())
print("-" * 30)
print(X_test.isna().sum())

T_data_1_1    0
T_data_1_2    0
T_data_1_3    0
T_data_2_1    0
T_data_2_2    0
T_data_2_3    0
T_data_3_1    0
T_data_3_2    0
T_data_3_3    0
T_data_4_1    0
T_data_4_2    0
T_data_4_3    0
T_data_5_1    0
T_data_5_2    0
T_data_5_3    0
H_data        0
AH_data       0
dtype: int64
------------------------------
T_data_1_1    0
T_data_1_2    0
T_data_1_3    0
T_data_2_1    0
T_data_2_2    0
T_data_2_3    0
T_data_3_1    0
T_data_3_2    0
T_data_3_3    0
T_data_4_1    0
T_data_4_2    0
T_data_4_3    0
T_data_5_1    0
T_data_5_2    0
T_data_5_3    0
H_data        0
AH_data       0
dtype: int64
------------------------------
T_data_1_1    0
T_data_1_2    0
T_data_1_3    0
T_data_2_1    0
T_data_2_2    0
T_data_2_3    0
T_data_3_1    0
T_data_3_2    0
T_data_3_3    0
T_data_4_1    0
T_data_4_2    0
T_data_4_3    0
T_data_5_1    0
T_data_5_2    0
T_data_5_3    0
H_data        0
AH_data       0
dtype: int64

# Function to compute adjusted R-squared
def adj_r2_score(predictors, targets, predictions):
    r2 = r2_score(targets, predictions)
    n = predictors.shape[0]
    k = predictors.shape[1]
    return 1 - ((1 - r2) * (n - 1) / (n - k - 1))


# Function to compute different metrics to check performance of a regression model
def model_performance_regression(model, predictors, target):
    """
    Function to compute different metrics to check regression model performance

    model: regressor
    predictors: independent variables
    target: dependent variable
    """

    # predicting using the independent variables
    pred = model.predict(predictors)

    r2 = r2_score(target, pred)  # to compute R-squared
    adjr2 = adj_r2_score(predictors, target, pred)  # to compute adjusted R-squared
    rmse = np.sqrt(mean_squared_error(target, pred))  # to compute RMSE
    mae = mean_absolute_error(target, pred)  # to compute MAE

    # creating a dataframe of metrics
    df_perf = pd.DataFrame(
        {
            "RMSE": rmse,
            "MAE": mae,
            "R-squared": r2,
            "Adj. R-squared": adjr2,
        },
        index=[0],
    )

    return df_perf

dtree = DecisionTreeRegressor(random_state=1)
dtree.fit(X_train, y_train)

DecisionTreeRegressor(random_state=1)

DecisionTreeRegressor(random_state=1)

dtree_model_train_perf = model_performance_regression(dtree, X_train, y_train)
dtree_model_train_perf

dtree_model_val_perf = model_performance_regression(dtree, X_val, y_val)
dtree_model_val_perf

rf_estimator = RandomForestRegressor(random_state=1)
rf_estimator.fit(X_train, y_train)

RandomForestRegressor(random_state=1)

RandomForestRegressor(random_state=1)

rf_estimator_model_train_perf = model_performance_regression(
    rf_estimator, X_train, y_train
)
rf_estimator_model_train_perf

rf_estimator_model_val_perf = model_performance_regression(rf_estimator, X_val, y_val)
rf_estimator_model_val_perf

bag_estimator = BaggingRegressor(random_state=1)
bag_estimator.fit(X_train, y_train)

BaggingRegressor(random_state=1)

BaggingRegressor(random_state=1)

bag_estimator_model_train_perf = model_performance_regression(
    bag_estimator, X_train, y_train
)
bag_estimator_model_train_perf

bag_estimator_model_val_perf = model_performance_regression(bag_estimator, X_val, y_val)
bag_estimator_model_val_perf

ab_regressor = AdaBoostRegressor(random_state=1)
ab_regressor.fit(X_train, y_train)

AdaBoostRegressor(random_state=1)

AdaBoostRegressor(random_state=1)

ab_regressor_model_train_perf = model_performance_regression(
    ab_regressor, X_train, y_train
)
ab_regressor_model_train_perf

ab_regressor_model_val_perf = model_performance_regression(ab_regressor, X_val, y_val)
ab_regressor_model_val_perf

gb_estimator = GradientBoostingRegressor(random_state=1)
gb_estimator.fit(X_train, y_train)

GradientBoostingRegressor(random_state=1)

GradientBoostingRegressor(random_state=1)

gb_estimator_model_train_perf = model_performance_regression(
    gb_estimator, X_train, y_train
)
gb_estimator_model_train_perf

gb_estimator_model_val_perf = model_performance_regression(gb_estimator, X_val, y_val)
gb_estimator_model_val_perf

xgb_estimator = XGBRegressor(random_state=1)
xgb_estimator.fit(X_train, y_train)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=None,
             num_parallel_tree=None, random_state=1, ...)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=None,
             num_parallel_tree=None, random_state=1, ...)

xgb_estimator_model_train_perf = model_performance_regression(
    xgb_estimator, X_train, y_train
)
xgb_estimator_model_train_perf

xgb_estimator_model_val_perf = model_performance_regression(xgb_estimator, X_val, y_val)
xgb_estimator_model_val_perf

# Training performance comparison

models_train_comp_df = pd.concat(
    [
        dtree_model_train_perf.T,
        rf_estimator_model_train_perf.T,
        bag_estimator_model_train_perf.T,
        ab_regressor_model_train_perf.T,
        gb_estimator_model_train_perf.T,
        xgb_estimator_model_train_perf.T,
    ],
    axis=1,
)
models_train_comp_df.columns = [
    "Decision tree",
    "Random forest",
    "Bagging Regressor",
    "Adaboost",
    "Gradient Boosting",
    "Xgboost",
]
print("Training performance comparison:")
models_train_comp_df.T

Training performance comparison:

# Validation performance comparison

models_val_comp_df = pd.concat(
    [
        dtree_model_val_perf.T,
        rf_estimator_model_val_perf.T,
        bag_estimator_model_val_perf.T,
        ab_regressor_model_val_perf.T,
        gb_estimator_model_val_perf.T,
        xgb_estimator_model_val_perf.T,
    ],
    axis=1,
)
models_val_comp_df.columns = [
    "Decision tree",
    "Random forest",
    "Bagging Regressor",
    "Adaboost",
    "Gradient Boosting",
    "Xgboost",
]
print("Validation performance comparison:")
models_val_comp_df.T

Validation performance comparison:

%%time

rf_tuned = RandomForestRegressor(random_state=1)

# Grid of parameters to choose from
parameters = {
                'max_depth':[4, 6, 8, 10, None],
                'max_features': ['sqrt','log2',None],
                'n_estimators': [80, 90, 100, 110, 120]
}

# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.r2_score)

# Run the grid search
randomized_cv = RandomizedSearchCV(rf_tuned, parameters, scoring=scorer, n_iter=40, n_jobs = -1, cv=5, random_state=1)
randomized_cv = randomized_cv.fit(X_train, y_train)

# Set the clf to the best combination of parameters
rf_tuned = randomized_cv.best_estimator_

# Fit the best algorithm to the data
rf_tuned.fit(X_train, y_train)

print("Best parameters are {} with CV score={}:" .format(randomized_cv.best_params_,randomized_cv.best_score_))

Best parameters are {'n_estimators': 110, 'max_features': 'log2', 'max_depth': None} with CV score=0.8740151805774847:
CPU times: user 21.8 s, sys: 1.37 s, total: 23.2 s
Wall time: 14min 27s

# Creating new pipeline with best parameters
rf_tuned = RandomForestRegressor(
    random_state=1, max_depth=None, max_features="log2", n_estimators=110
)

rf_tuned.fit(X_train, y_train)

RandomForestRegressor(max_features='log2', n_estimators=110, random_state=1)

RandomForestRegressor(max_features='log2', n_estimators=110, random_state=1)

rf_tuned_train_perf = model_performance_regression(rf_tuned, X_train, y_train)
rf_tuned_train_perf

rf_tuned_val_perf = model_performance_regression(rf_tuned, X_val, y_val)
rf_tuned_val_perf

%%time

# Defining the model
Model = BaggingRegressor(random_state=1)

# Parameter grid to pass in RandomSearchCV
param_grid = {
              'max_samples': [0.7,0.8,0.9,1],
              'max_features': [0.7,0.8,0.9,1],
              'n_estimators' : [50, 100, 120, 150],
             }

# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.r2_score)

# Calling RandomizedSearchCV
randomized_cv = RandomizedSearchCV(estimator=Model, param_distributions=param_grid, n_iter=20, n_jobs = -1, scoring=scorer, cv=5, random_state=1)

# Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_train,y_train)

print("Best parameters are {} with CV score={}:" .format(randomized_cv.best_params_,randomized_cv.best_score_))

Best parameters are {'n_estimators': 120, 'max_samples': 0.9, 'max_features': 0.7} with CV score=0.8602137771073354:
CPU times: user 26.2 s, sys: 1.03 s, total: 27.2 s
Wall time: 15min 44s

# Creating new pipeline with best parameters
bag_tuned = BaggingRegressor(
    random_state=1, max_samples=0.7, max_features=0.9, n_estimators=120
)

bag_tuned.fit(X_train, y_train)

BaggingRegressor(max_features=0.9, max_samples=0.7, n_estimators=120,
                 random_state=1)

BaggingRegressor(max_features=0.9, max_samples=0.7, n_estimators=120,
                 random_state=1)

bag_tuned_train_perf = model_performance_regression(bag_tuned, X_train, y_train)
bag_tuned_train_perf

bag_tuned_val_perf = model_performance_regression(bag_tuned, X_val, y_val)
bag_tuned_val_perf

%%time

# Choose the type of classifier
dtree_tuned = DecisionTreeRegressor(random_state=1)

# Grid of parameters to choose from
parameters = {'max_depth': list(np.arange(15,20)) + [None],
              'min_samples_leaf': [1, 3] + [None],
              'max_leaf_nodes' : [5, 10, 15] + [None],
              'min_impurity_decrease': [0.001, 0.0]
             }

# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.r2_score)

# Run the grid search
randomized_cv = RandomizedSearchCV(dtree_tuned, parameters, scoring=scorer,cv=5, n_jobs = -1, verbose = 2, n_iter = 100)
randomized_cv = randomized_cv.fit(X_train, y_train)

print("Best parameters are {} with CV score={}:" .format(randomized_cv.best_params_,randomized_cv.best_score_))

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters are {'min_samples_leaf': 3, 'min_impurity_decrease': 0.0, 'max_leaf_nodes': None, 'max_depth': None} with CV score=0.698901500324649:
CPU times: user 906 ms, sys: 99.7 ms, total: 1.01 s
Wall time: 30.4 s

dtree_tuned = DecisionTreeRegressor(
    random_state=1,
    max_depth=None,
    min_samples_leaf=1,
    max_leaf_nodes=None,
    min_impurity_decrease=0.001,
)

dtree_tuned.fit(X_train, y_train)

DecisionTreeRegressor(min_impurity_decrease=0.001, random_state=1)

DecisionTreeRegressor(min_impurity_decrease=0.001, random_state=1)

dtree_tuned_train_perf = model_performance_regression(dtree_tuned, X_train, y_train)
dtree_tuned_train_perf

dtree_tuned_val_perf = model_performance_regression(dtree_tuned, X_val, y_val)
dtree_tuned_val_perf

# Training performance comparison

models_train_comp_df = pd.concat(
    [
        dtree_tuned_train_perf.T,
        bag_tuned_train_perf.T,
        rf_tuned_train_perf.T,
    ],
    axis=1,
)
models_train_comp_df.columns = [
    "Tuned Decision Tree",
    "Tuned Bagging regressor",
    "Tuned Random forest",
]
print("Training performance comparison:")
models_train_comp_df.T

Training performance comparison:

# Validation performance comparison

models_val_comp_df = pd.concat(
    [
        dtree_tuned_val_perf.T,
        bag_tuned_val_perf.T,
        rf_tuned_val_perf.T,
    ],
    axis=1,
)
models_val_comp_df.columns = [
    "Tuned Decision Tree",
    "Tuned Bagging regressor",
    "Tuned Random forest",
]
print("Validation performance comparison:")
models_val_comp_df.T

Validation performance comparison:

# Let's check the performance on test set
rf_test = model_performance_regression(rf_tuned, X_test, y_test)
rf_test

feature_names = X_train.columns
importances = rf_tuned.feature_importances_
indices = np.argsort(importances)

plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="violet", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()

Model = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        (
            "Random forest",
            RandomForestRegressor(
                random_state=1, max_depth=None, max_features="log2", n_estimators=110
            ),
        ),
    ]
)

# Separating the target variable and other variables
X = df.drop(columns="quality")
Y = df["quality"]

# Splitting data into training and test set:

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1)

print(X_train.shape, X_test.shape)

(20391, 17) (8740, 17)

Model.fit(X_train, y_train)

Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                ('Random forest',
                 RandomForestRegressor(max_features='log2', n_estimators=110,
                                       random_state=1))])

Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                ('Random forest',
                 RandomForestRegressor(max_features='log2', n_estimators=110,
                                       random_state=1))])

SimpleImputer(strategy='median')

RandomForestRegressor(max_features='log2', n_estimators=110, random_state=1)

# Let's check the performance on test set
Pipeline_model_test = model_performance_regression(Model, X_test, y_test)
Pipeline_model_test

# Convert notebook to html
!jupyter nbconvert --to html "/content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week_Five_-_Classification_and_Hypothesis_Testing/KC_Roasters_Coffee_Quality_Prediction/KC+Roasters+Case+Study.ipynb"

[NbConvertApp] Converting notebook /content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week_Five_-_Classification_and_Hypothesis_Testing/KC_Roasters_Coffee_Quality_Prediction/KC+Roasters+Case+Study.ipynb to html
[NbConvertApp] WARNING | Alternative text is missing on 27 image(s).
[NbConvertApp] Writing 7726845 bytes to /content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week_Five_-_Classification_and_Hypothesis_Testing/KC_Roasters_Coffee_Quality_Prediction/KC+Roasters+Case+Study.html

	T_data_1_1	T_data_1_2	T_data_1_3	T_data_2_1	T_data_2_2	T_data_2_3	T_data_3_1	T_data_3_2	T_data_3_3	T_data_4_1	T_data_4_2	T_data_4_3	T_data_5_1	T_data_5_2	T_data_5_3	H_data	AH_data	quality
0	212	210	211	347	353	347	474	473	481	346	348	355	241	241	243	167.850	9.220	61
1	212	211	211	346	352	346	475	473	481	349	348	355	241	241	243	162.510	9.220	57
2	212	211	211	345	352	346	476	473	481	352	349	355	242	241	242	164.990	9.220	61
3	213	211	211	344	351	346	477	473	481	355	349	355	242	241	242	167.340	9.220	63
4	213	211	211	343	350	346	478	473	482	358	349	355	243	241	242	163.040	9.220	63

	T_data_1_1	T_data_1_2	T_data_1_3	T_data_2_1	T_data_2_2	T_data_2_3	T_data_3_1	T_data_3_2	T_data_3_3	T_data_4_1	T_data_4_2	T_data_4_3	T_data_5_1	T_data_5_2	T_data_5_3	H_data	AH_data	quality
29126	275	276	266	180	376	356	495	506	486	341	337	308	234	220	223	154.120	6.190	77
29127	275	276	266	181	376	355	495	507	486	341	337	308	234	220	224	158.790	6.190	82
29128	275	276	266	183	376	354	495	507	486	341	337	307	234	219	224	154.830	6.190	82
29129	275	276	265	185	376	354	495	507	486	341	337	307	234	219	224	153.680	6.190	82
29130	274	276	265	187	376	353	496	508	486	341	337	307	234	219	224	155.640	6.190	82

	count	mean	std	min	25%	50%	75%	max
T_data_1_1	29131.000	253.552	32.488	13.000	232.000	252.000	280.000	343.000
T_data_1_2	29131.000	254.078	30.025	168.000	231.000	254.000	280.000	341.000
T_data_1_3	29131.000	254.058	28.977	183.000	232.000	252.000	279.000	339.000
T_data_2_1	29131.000	343.076	32.544	70.000	325.000	344.000	363.000	436.000
T_data_2_2	29131.000	344.524	33.719	113.000	326.000	344.000	364.000	598.000
T_data_2_3	29131.000	342.017	31.031	107.000	322.500	343.000	363.000	433.000
T_data_3_1	29131.000	494.513	50.315	311.000	461.000	497.000	524.000	1024.000
T_data_3_2	29131.000	495.851	43.756	401.000	460.000	498.000	527.000	756.000
T_data_3_3	29131.000	500.775	64.996	197.000	460.000	499.000	530.000	934.000
T_data_4_1	29131.000	345.081	43.405	53.000	324.000	343.000	362.000	912.000
T_data_4_2	29131.000	342.738	38.340	51.000	322.000	343.000	362.000	613.000
T_data_4_3	29131.000	345.234	35.052	52.000	326.000	344.000	361.000	695.000
T_data_5_1	29131.000	245.814	25.499	114.000	226.000	245.000	267.000	319.000
T_data_5_2	29131.000	246.304	26.520	145.000	226.000	245.000	266.000	361.000
T_data_5_3	29131.000	245.757	26.051	146.000	225.000	247.000	266.000	374.000
H_data	29087.000	171.127	14.079	144.410	157.350	170.470	182.135	203.680
AH_data	29100.000	7.551	1.123	4.380	6.810	7.520	8.320	10.740
quality	29131.000	64.322	16.397	0.000	54.000	66.000	77.000	100.000

	RMSE	MAE	R-squared	Adj. R-squared
Decision tree	0.000	0.000	1.000	1.000
Random forest	2.022	1.502	0.985	0.985
Bagging Regressor	2.632	1.815	0.975	0.975
Adaboost	16.514	13.754	-0.001	-0.002
Gradient Boosting	15.020	12.236	0.172	0.171
Xgboost	7.652	5.980	0.785	0.785

	RMSE	MAE	R-squared	Adj. R-squared
Decision tree	7.438	5.202	0.789	0.789
Random forest	5.307	4.004	0.893	0.893
Bagging Regressor	5.916	4.408	0.867	0.866
Adaboost	16.260	13.530	-0.006	-0.009
Gradient Boosting	14.873	12.134	0.158	0.156
Xgboost	9.028	7.091	0.690	0.689

Problem Statement¶

Business Context¶

Objective¶

Data Dictionary¶

Importing necessary libraries¶

Loading the dataset¶

Data Overview¶

Checking the shape of the dataset¶

Displaying the first few rows of the dataset¶

Checking for duplicate values¶

Checking for missing values¶

Checking the data types of the columns for the dataset¶

Statistical summary of the dataset¶

Observations:¶

Exploratory Data Analysis¶

Univariate analysis¶

Bivariate analysis¶

Data Pre-Processing¶

Data Preparation for Modeling¶

Missing value imputation¶

Model Building¶

Decision Tree¶

Random Forest¶

Bagging Regressor¶

Adaboost¶

Gradient Boosting¶

Xgboost¶

Model performance comparison¶

Hyperparameter Tuning¶

Tuning Random Forest Regressor model¶

Tuning Bagging Regressor model¶

Tuning Decision Tree Regressor model¶

Model performance comparison and choosing the final model¶

Let's use Pipelines to build the final model¶

Business Insights and Conclusions¶

	RMSE	MAE	R-squared	Adj. R-squared
Tuned Decision Tree	1.519	1.126	0.992	0.992
Tuned Bagging regressor	2.944	2.229	0.968	0.968
Tuned Random forest	1.915	1.421	0.987	0.987

	RMSE	MAE	R-squared	Adj. R-squared
Tuned Decision Tree	7.633	5.387	0.778	0.778
Tuned Bagging regressor	5.763	4.397	0.874	0.873
Tuned Random forest	4.992	3.767	0.905	0.905