Classification and Hypothesis Testing Practice Project: Travel Package Purchase Prediction¶
Context¶
You are a Data Scientist for a tourism company named "Visit with Us". The Policy Maker of the company wants to establish a viable business model to expand the customer base. A viable business model is a central concept that helps you understand the existing ways of doing business and how those ways can be changed for the benefit of the tourism sector.
One of the ways to expand the customer base is to introduce a new offering of packages. Currently, the company offers 5 types of packages - Basic, Standard, Deluxe, Super Deluxe, and King. Looking at last year's data, we observed that 18% of customers purchased a package. However, identifying potential customers was difficult because customers were contacted at random, without using the available information.
The company is now planning to launch a new product, the Wellness Tourism Package. Wellness Tourism is defined as travel that allows the traveler to maintain, enhance, or kick-start a healthy lifestyle, and support or increase one's sense of well-being. This time, the company wants to harness the available data on existing and potential customers to target the right customers.
As a Data Scientist at "Visit with Us", you have to analyze the customers' data and information to provide recommendations to the Policy Maker and build a model to predict which potential customers are going to purchase the newly introduced travel package. The model will be built to make predictions before a customer is contacted.
Objective¶
To build a model to predict which customer is potentially going to purchase the newly introduced travel package.
Data Description¶
- CustomerID: Unique customer ID
- ProdTaken: Whether the customer has purchased a package or not (0: No, 1: Yes)
- Age: Age of customer
- TypeofContact: How customer was contacted (Company Invited or Self Inquiry)
- CityTier: City tier depends on the development of a city, population, facilities, and living standards. The categories are ordered i.e. Tier 1 > Tier 2 > Tier 3. It's the city the customer lives in.
- DurationOfPitch: Duration of the pitch by a salesperson to the customer
- Occupation: Occupation of customer
- Gender: Gender of customer
- NumberOfPersonVisiting: Total number of persons planning to take the trip with the customer
- NumberOfFollowups: Total number of follow-ups done by the salesperson after the sales pitch
- ProductPitched: Product pitched by the salesperson
- PreferredPropertyStar: Preferred hotel property rating by customer
- MaritalStatus: Marital status of customer
- NumberOfTrips: Average number of trips in a year by customer
- Passport: The customer has a passport or not (0: No, 1: Yes)
- PitchSatisfactionScore: Sales pitch satisfaction score
- OwnCar: Whether the customers own a car or not (0: No, 1: Yes)
- NumberOfChildrenVisiting: Total number of children with age less than 5 planning to take the trip with the customer
- Designation: Designation of the customer in the current organization
- MonthlyIncome: Gross monthly income of the customer
Importing the libraries required¶
# Importing the basic libraries we will require for the project
# Libraries to help with reading and manipulating data
import pandas as pd
import numpy as np
# Libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
# Importing the Machine Learning models we require from scikit-learn
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
# Importing the other functions we may require from scikit-learn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
# To get different metric scores
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, precision_recall_curve, roc_curve, make_scorer
# Code to ignore warnings from function usage
import warnings
warnings.filterwarnings('ignore')
# Connect to Google
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
Loading the dataset¶
# Loading the dataset - the sheet_name parameter is used if there are multiple tabs in the Excel file.
data=pd.read_excel("/content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week_Five_-_Classification_and_Hypothesis_Testing/Practice_Project_-_Travel_Package_Purchase_Prediction/Tourism.xlsx",sheet_name='Tourism')
Overview of the dataset¶
View the first and last 5 rows of the dataset¶
Let's view the first few rows and last few rows of the dataset in order to understand its structure a little better.
We will use the head() and tail() methods from Pandas to do this.
data.head()
CustomerID | ProdTaken | Age | TypeofContact | CityTier | DurationOfPitch | Occupation | Gender | NumberOfPersonVisiting | NumberOfFollowups | ProductPitched | PreferredPropertyStar | MaritalStatus | NumberOfTrips | Passport | PitchSatisfactionScore | OwnCar | NumberOfChildrenVisiting | Designation | MonthlyIncome | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 200000 | 1 | 41.0 | Self Enquiry | 3 | 6.0 | Salaried | Female | 3 | 3.0 | Deluxe | 3.0 | Single | 1.0 | 1 | 2 | 1 | 0.0 | Manager | 20993.0 |
1 | 200001 | 0 | 49.0 | Company Invited | 1 | 14.0 | Salaried | Male | 3 | 4.0 | Deluxe | 4.0 | Divorced | 2.0 | 0 | 3 | 1 | 2.0 | Manager | 20130.0 |
2 | 200002 | 1 | 37.0 | Self Enquiry | 1 | 8.0 | Free Lancer | Male | 3 | 4.0 | Basic | 3.0 | Single | 7.0 | 1 | 3 | 0 | 0.0 | Executive | 17090.0 |
3 | 200003 | 0 | 33.0 | Company Invited | 1 | 9.0 | Salaried | Female | 2 | 3.0 | Basic | 3.0 | Divorced | 2.0 | 1 | 5 | 1 | 1.0 | Executive | 17909.0 |
4 | 200004 | 0 | NaN | Self Enquiry | 1 | 8.0 | Small Business | Male | 2 | 3.0 | Basic | 4.0 | Divorced | 1.0 | 0 | 5 | 1 | 0.0 | Executive | 18468.0 |
data.tail()
CustomerID | ProdTaken | Age | TypeofContact | CityTier | DurationOfPitch | Occupation | Gender | NumberOfPersonVisiting | NumberOfFollowups | ProductPitched | PreferredPropertyStar | MaritalStatus | NumberOfTrips | Passport | PitchSatisfactionScore | OwnCar | NumberOfChildrenVisiting | Designation | MonthlyIncome | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
4883 | 204883 | 1 | 49.0 | Self Enquiry | 3 | 9.0 | Small Business | Male | 3 | 5.0 | Deluxe | 4.0 | Unmarried | 2.0 | 1 | 1 | 1 | 1.0 | Manager | 26576.0 |
4884 | 204884 | 1 | 28.0 | Company Invited | 1 | 31.0 | Salaried | Male | 4 | 5.0 | Basic | 3.0 | Single | 3.0 | 1 | 3 | 1 | 2.0 | Executive | 21212.0 |
4885 | 204885 | 1 | 52.0 | Self Enquiry | 3 | 17.0 | Salaried | Female | 4 | 4.0 | Standard | 4.0 | Married | 7.0 | 0 | 1 | 1 | 3.0 | Senior Manager | 31820.0 |
4886 | 204886 | 1 | 19.0 | Self Enquiry | 3 | 16.0 | Small Business | Male | 3 | 4.0 | Basic | 3.0 | Single | 3.0 | 0 | 5 | 0 | 2.0 | Executive | 20289.0 |
4887 | 204887 | 1 | 36.0 | Self Enquiry | 1 | 14.0 | Salaried | Male | 4 | 4.0 | Basic | 4.0 | Unmarried | 3.0 | 1 | 3 | 1 | 2.0 | Executive | 24041.0 |
Understand the shape of the dataset¶
data.shape
(4888, 20)
- The dataset has 4888 rows and 20 columns.
Check the data types of the columns for the dataset¶
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4888 entries, 0 to 4887 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CustomerID 4888 non-null int64 1 ProdTaken 4888 non-null int64 2 Age 4662 non-null float64 3 TypeofContact 4863 non-null object 4 CityTier 4888 non-null int64 5 DurationOfPitch 4637 non-null float64 6 Occupation 4888 non-null object 7 Gender 4888 non-null object 8 NumberOfPersonVisiting 4888 non-null int64 9 NumberOfFollowups 4843 non-null float64 10 ProductPitched 4888 non-null object 11 PreferredPropertyStar 4862 non-null float64 12 MaritalStatus 4888 non-null object 13 NumberOfTrips 4748 non-null float64 14 Passport 4888 non-null int64 15 PitchSatisfactionScore 4888 non-null int64 16 OwnCar 4888 non-null int64 17 NumberOfChildrenVisiting 4822 non-null float64 18 Designation 4888 non-null object 19 MonthlyIncome 4655 non-null float64 dtypes: float64(7), int64(7), object(6) memory usage: 763.9+ KB
- We can see that 8 columns have fewer than 4,888 non-null values, i.e., these columns have missing values.
Check the percentage of missing values in each column¶
pd.DataFrame(data={'% of Missing Values':round(data.isna().sum()/data.isna().count()*100,2)}).sort_values(by='% of Missing Values',ascending=False)
% of Missing Values | |
---|---|
DurationOfPitch | 5.14 |
MonthlyIncome | 4.77 |
Age | 4.62 |
NumberOfTrips | 2.86 |
NumberOfChildrenVisiting | 1.35 |
NumberOfFollowups | 0.92 |
PreferredPropertyStar | 0.53 |
TypeofContact | 0.51 |
Designation | 0.00 |
OwnCar | 0.00 |
PitchSatisfactionScore | 0.00 |
Passport | 0.00 |
CustomerID | 0.00 |
MaritalStatus | 0.00 |
ProdTaken | 0.00 |
NumberOfPersonVisiting | 0.00 |
Gender | 0.00 |
Occupation | 0.00 |
CityTier | 0.00 |
ProductPitched | 0.00 |
- The DurationOfPitch column has 5.14% missing values out of the total observations.
- The MonthlyIncome column has 4.77% missing values out of the total observations.
- The Age column has 4.62% missing values out of the total observations.
- The NumberOfTrips column has 2.86% missing values out of the total observations.
- The NumberOfChildrenVisiting column has 1.35% missing values out of the total observations.
- The NumberOfFollowups column has 0.92% missing values out of the total observations.
- The PreferredPropertyStar column has 0.53% missing values out of the total observations.
- The TypeofContact column has 0.51% missing values out of the total observations.
- We will impute these values after we split the data into train and test sets.
Check the number of unique values in each column¶
data.nunique()
CustomerID 4888 ProdTaken 2 Age 44 TypeofContact 2 CityTier 3 DurationOfPitch 34 Occupation 4 Gender 3 NumberOfPersonVisiting 5 NumberOfFollowups 6 ProductPitched 5 PreferredPropertyStar 3 MaritalStatus 4 NumberOfTrips 12 Passport 2 PitchSatisfactionScore 5 OwnCar 2 NumberOfChildrenVisiting 4 Designation 5 MonthlyIncome 2475 dtype: int64
- We can drop the column - CustomerID as it is unique for each customer and will not add value to the model.
- Most of the variables are categorical, except Age, DurationOfPitch, MonthlyIncome, and NumberOfTrips.
Dropping the unique values column
# Dropping CustomerID column
data.drop(columns='CustomerID',inplace=True)
Question 1: Check the summary statistics of the dataset and write your observations (2 Marks)¶
Let's check the statistical summary of the data.
data.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
ProdTaken | 4888.0 | 0.188216 | 0.390925 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
Age | 4662.0 | 37.622265 | 9.316387 | 18.0 | 31.0 | 36.0 | 44.0 | 61.0 |
CityTier | 4888.0 | 1.654255 | 0.916583 | 1.0 | 1.0 | 1.0 | 3.0 | 3.0 |
DurationOfPitch | 4637.0 | 15.490835 | 8.519643 | 5.0 | 9.0 | 13.0 | 20.0 | 127.0 |
NumberOfPersonVisiting | 4888.0 | 2.905074 | 0.724891 | 1.0 | 2.0 | 3.0 | 3.0 | 5.0 |
NumberOfFollowups | 4843.0 | 3.708445 | 1.002509 | 1.0 | 3.0 | 4.0 | 4.0 | 6.0 |
PreferredPropertyStar | 4862.0 | 3.581037 | 0.798009 | 3.0 | 3.0 | 3.0 | 4.0 | 5.0 |
NumberOfTrips | 4748.0 | 3.236521 | 1.849019 | 1.0 | 2.0 | 3.0 | 4.0 | 22.0 |
Passport | 4888.0 | 0.290917 | 0.454232 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 |
PitchSatisfactionScore | 4888.0 | 3.078151 | 1.365792 | 1.0 | 2.0 | 3.0 | 4.0 | 5.0 |
OwnCar | 4888.0 | 0.620295 | 0.485363 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 |
NumberOfChildrenVisiting | 4822.0 | 1.187267 | 0.857861 | 0.0 | 1.0 | 1.0 | 2.0 | 3.0 |
MonthlyIncome | 4655.0 | 23619.853491 | 5380.698361 | 1000.0 | 20346.0 | 22347.0 | 25571.0 | 98678.0 |
Write your Answer here :
- Mean and median of age column are very close to each other i.e. approx 37 and 36 respectively.
- Duration of pitch has some outliers at the right end as the 75th percentile value is 20 and the max value is 127. We need to explore this further.
- It seems like monthly income has some outliers at both ends. We need to explore this further.
- The number of trips also has some outliers as the 75th percentile value is 4 and the max value is 22.
- We can see that the target variable - ProdTaken is imbalanced as most of the values are 0.
Check the count of each unique category in each of the categorical variables.¶
# Making a list of all categorical variables
cat_col=['TypeofContact', 'CityTier','Occupation', 'Gender', 'NumberOfPersonVisiting',
'NumberOfFollowups', 'ProductPitched', 'PreferredPropertyStar',
'MaritalStatus', 'Passport', 'PitchSatisfactionScore',
'OwnCar', 'NumberOfChildrenVisiting', 'Designation']
# Printing the count of each unique value in each column
for column in cat_col:
print(data[column].value_counts())
print('-'*50)
Self Enquiry 3444 Company Invited 1419 Name: TypeofContact, dtype: int64 -------------------------------------------------- 1 3190 3 1500 2 198 Name: CityTier, dtype: int64 -------------------------------------------------- Salaried 2368 Small Business 2084 Large Business 434 Free Lancer 2 Name: Occupation, dtype: int64 -------------------------------------------------- Male 2916 Female 1817 Fe Male 155 Name: Gender, dtype: int64 -------------------------------------------------- 3 2402 2 1418 4 1026 1 39 5 3 Name: NumberOfPersonVisiting, dtype: int64 -------------------------------------------------- 4.0 2068 3.0 1466 5.0 768 2.0 229 1.0 176 6.0 136 Name: NumberOfFollowups, dtype: int64 -------------------------------------------------- Basic 1842 Deluxe 1732 Standard 742 Super Deluxe 342 King 230 Name: ProductPitched, dtype: int64 -------------------------------------------------- 3.0 2993 5.0 956 4.0 913 Name: PreferredPropertyStar, dtype: int64 -------------------------------------------------- Married 2340 Divorced 950 Single 916 Unmarried 682 Name: MaritalStatus, dtype: int64 -------------------------------------------------- 0 3466 1 1422 Name: Passport, dtype: int64 -------------------------------------------------- 3 1478 5 970 1 942 4 912 2 586 Name: PitchSatisfactionScore, dtype: int64 -------------------------------------------------- 1 3032 0 1856 Name: OwnCar, dtype: int64 -------------------------------------------------- 1.0 2080 2.0 1335 0.0 1082 3.0 325 Name: NumberOfChildrenVisiting, dtype: int64 -------------------------------------------------- Executive 1842 Manager 1732 Senior Manager 742 AVP 342 VP 230 Name: Designation, dtype: int64 --------------------------------------------------
- The Free Lancer category in the Occupation column has just 2 entries out of 4,888 observations.
- We can see that Gender has 3 unique values, including 'Fe Male' and 'Female'. This must be a data entry error; we should replace 'Fe Male' with 'Female'.
- NumberOfPersonVisiting equal to 5 has a count of only 3.
- The majority of the customers are married.
- The majority of the customers own a car.
# Replacing 'Fe Male' with 'Female'
data.Gender=data.Gender.replace('Fe Male', 'Female')
# Converting the data type of each categorical variable to 'category'
for column in cat_col:
data[column]=data[column].astype('category')
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4888 entries, 0 to 4887 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ProdTaken 4888 non-null int64 1 Age 4662 non-null float64 2 TypeofContact 4863 non-null category 3 CityTier 4888 non-null category 4 DurationOfPitch 4637 non-null float64 5 Occupation 4888 non-null category 6 Gender 4888 non-null category 7 NumberOfPersonVisiting 4888 non-null category 8 NumberOfFollowups 4843 non-null category 9 ProductPitched 4888 non-null category 10 PreferredPropertyStar 4862 non-null category 11 MaritalStatus 4888 non-null category 12 NumberOfTrips 4748 non-null float64 13 Passport 4888 non-null category 14 PitchSatisfactionScore 4888 non-null category 15 OwnCar 4888 non-null category 16 NumberOfChildrenVisiting 4822 non-null category 17 Designation 4888 non-null category 18 MonthlyIncome 4655 non-null float64 dtypes: category(14), float64(4), int64(1) memory usage: 260.3 KB
df = data.copy()
Exploratory Data Analysis¶
Question 2: Univariate Analysis¶
Let's explore these variables in some more depth by observing their distributions.
We will first define a hist_box() function that provides both a boxplot and a histogram in the same visual, with which we can perform univariate analysis on the columns of this dataset.
# Defining the hist_box() function
def hist_box(data, col):
f, (ax_box, ax_hist) = plt.subplots(2, sharex=True, gridspec_kw={'height_ratios': (0.15, 0.85)}, figsize=(12, 6))
# Adding a graph in each part
sns.boxplot(data=data, x=col, ax=ax_box, showmeans=True)
sns.histplot(data=data, x=col, kde=True, ax=ax_hist)
plt.show()
Question 2.1: Plot the histogram and box plot for the variable Age using the hist_box function provided and write your insights. (1 Mark)¶
hist_box(df, "Age")
Write your Answer here :
- Age distribution looks approximately normally distributed.
- The boxplot for the age column confirms that there are no outliers for this variable.
- Age can be an important variable while targeting customers for the tourism package. We will further explore this in bivariate analysis.
Question 2.2: Plot the histogram and box plot for the variable Duration of Pitch using the hist_box function provided and write your insights. (1 Mark)¶
hist_box(df, 'DurationOfPitch')
Write your Answer here :
- The distribution for the duration of pitch is right-skewed.
- The duration of the pitch for most of the customers is less than 20 minutes.
- There are some observations that can be considered as outliers as they are very far from the upper whisker in the boxplot. Let's check how many such extreme values are there.
df[df['DurationOfPitch']>40]
ProdTaken | Age | TypeofContact | CityTier | DurationOfPitch | Occupation | Gender | NumberOfPersonVisiting | NumberOfFollowups | ProductPitched | PreferredPropertyStar | MaritalStatus | NumberOfTrips | Passport | PitchSatisfactionScore | OwnCar | NumberOfChildrenVisiting | Designation | MonthlyIncome | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1434 | 0 | NaN | Company Invited | 3 | 126.0 | Salaried | Male | 2 | 3.0 | Basic | 3.0 | Married | 3.0 | 0 | 1 | 1 | 1.0 | Executive | 18482.0 |
3878 | 0 | 53.0 | Company Invited | 3 | 127.0 | Salaried | Male | 3 | 4.0 | Basic | 3.0 | Married | 4.0 | 0 | 1 | 1 | 2.0 | Executive | 22160.0 |
- We can see that there are just two observations which can be considered as outliers.
Let's plot the histogram and box plot for the variable Monthly Income using the hist_box function.
hist_box(df, 'MonthlyIncome')
- The distribution for monthly income shows that most of the values lie between 20,000 and 40,000.
- Income is one of the important factors to consider while approaching a customer with a certain package. We can explore this further in bivariate analysis.
- There are some observations on the left and some observations on the right of the boxplot which can be considered as outliers. Let's check how many such extreme values are there.
df[(df.MonthlyIncome>40000) | (df.MonthlyIncome<12000)]
ProdTaken | Age | TypeofContact | CityTier | DurationOfPitch | Occupation | Gender | NumberOfPersonVisiting | NumberOfFollowups | ProductPitched | PreferredPropertyStar | MaritalStatus | NumberOfTrips | Passport | PitchSatisfactionScore | OwnCar | NumberOfChildrenVisiting | Designation | MonthlyIncome | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
38 | 0 | 36.0 | Self Enquiry | 1 | 11.0 | Salaried | Female | 2 | 4.0 | Basic | NaN | Divorced | 1.0 | 1 | 2 | 1 | 0.0 | Executive | 95000.0 |
142 | 0 | 38.0 | Self Enquiry | 1 | 9.0 | Large Business | Female | 2 | 3.0 | Deluxe | 3.0 | Single | 4.0 | 1 | 5 | 0 | 0.0 | Manager | 1000.0 |
2482 | 0 | 37.0 | Self Enquiry | 1 | 12.0 | Salaried | Female | 3 | 5.0 | Basic | 5.0 | Divorced | 2.0 | 1 | 2 | 1 | 1.0 | Executive | 98678.0 |
2586 | 0 | 39.0 | Self Enquiry | 1 | 10.0 | Large Business | Female | 3 | 4.0 | Deluxe | 3.0 | Single | 5.0 | 1 | 5 | 0 | 1.0 | Manager | 4678.0 |
- There are just four such observations which can be considered as outliers.
Let's plot the histogram and box plot for the variable Number of Trips using the hist_box function.
hist_box(df,'NumberOfTrips')
- The distribution for the number of trips is right-skewed.
- The boxplot shows that the number of trips has some outliers at the right end. Let's check how many such extreme values there are.
df.NumberOfTrips.value_counts(normalize=True)
2.0 0.308340 3.0 0.227254 1.0 0.130581 4.0 0.100674 5.0 0.096462 6.0 0.067818 7.0 0.045914 8.0 0.022115 19.0 0.000211 21.0 0.000211 20.0 0.000211 22.0 0.000211 Name: NumberOfTrips, dtype: float64
- We can see that most customers, i.e., more than half, have taken 2 or 3 trips.
- As expected, the percentage of customers decreases as the number of trips increases.
- The percentage of customers with 19 or more trips is very low; we can consider these values as outliers.
- There are just four observations with 19 or more trips.
Removing these outliers from duration of pitch, monthly income, and number of trips.
# Dropping observations with duration of pitch greater than 40 minutes. There are just 2 such observations
df.drop(index=df[df.DurationOfPitch>40].index,inplace=True)
# Dropping observations with monthly income less than 12000 or greater than 40000. There are just 4 such observations
df.drop(index=df[(df.MonthlyIncome>40000) | (df.MonthlyIncome<12000)].index,inplace=True)
# Dropping observations with number of trips greater than 10. There are just 4 such observations
df.drop(index=df[df.NumberOfTrips>10].index,inplace=True)
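The cutoffs above were chosen by inspecting the boxplots. A more systematic alternative, shown here only as a sketch (the `iqr_bounds` helper is our own illustration, not part of the original notebook), is the standard 1.5×IQR whisker rule:

```python
import pandas as pd

def iqr_bounds(s: pd.Series):
    # Standard boxplot whisker limits: Q1 - 1.5*IQR and Q3 + 1.5*IQR
    q1, q3 = s.quantile(0.25), s.quantile(0.75)
    iqr = q3 - q1
    return q1 - 1.5 * iqr, q3 + 1.5 * iqr

# Toy series with one clear outlier
s = pd.Series([5, 9, 13, 20, 127])
low, high = iqr_bounds(s)
outliers = s[(s < low) | (s > high)]
print(low, high, list(outliers))  # -7.5 36.5 [127]
```

Whether to drop, cap, or keep points outside these bounds is a modeling decision; this notebook drops only the handful of extreme values identified above.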
Let's understand the distribution of the categorical variables¶
Number of Person Visiting
sns.countplot(x = df['NumberOfPersonVisiting'])
plt.show()
df['NumberOfPersonVisiting'].value_counts(normalize=True)
3 0.491390 2 0.289873 4 0.210127 1 0.007995 5 0.000615 Name: NumberOfPersonVisiting, dtype: float64
- Most customers have 3 persons visiting with them. This may be because most people like to travel with family.
- As mentioned earlier, there are just 3 observations (0.1%) where 5 persons are visiting with the customer.
Occupation
sns.countplot(x = df['Occupation'])
plt.show()
df['Occupation'].value_counts(normalize=True)
Salaried 0.484215 Small Business 0.427224 Large Business 0.088151 Free Lancer 0.000410 Name: Occupation, dtype: float64
- The majority of customers, i.e., 91%, are either salaried or own a small business.
- As mentioned earlier, the Free Lancer category has only 2 observations.
City Tier
sns.countplot(x = df['CityTier'])
plt.show()
df['CityTier'].value_counts(normalize=True)
1 0.652317 3 0.307093 2 0.040590 Name: CityTier, dtype: float64
- Most of the customers i.e. approx 65% are from tier 1 cities. This can be because of better living standards and exposure as compared to tier 2 and tier 3 cities.
- Surprisingly, tier 3 cities have a much higher count than tier 2 cities. This can be because the company has less marketing in tier 2 cities.
Gender
sns.countplot(x = df['Gender'])
plt.show()
df['Gender'].value_counts(normalize=True)
Male 0.596556 Female 0.403444 Name: Gender, dtype: float64
- There are more male customers (approx 60%) than female customers (approx 40%).
- This might be because males do the booking/inquiry when traveling with females, which implies that males are the direct customers of the company.
Number of Follow ups
sns.countplot(x = df['NumberOfFollowups'])
plt.show()
df['NumberOfFollowups'].value_counts(normalize=True)
4.0 0.426857 3.0 0.302504 5.0 0.158701 2.0 0.047383 1.0 0.036416 6.0 0.028140 Name: NumberOfFollowups, dtype: float64
- We can see that the company usually follows up 3 or 4 times with its customers.
- We can explore this further and observe which number of follow-ups yields more customers who buy the product.
Product Pitched
sns.countplot(x = df['ProductPitched'])
plt.show()
df['ProductPitched'].value_counts(normalize=True)
Basic 0.376384 Deluxe 0.354244 Standard 0.152112 Super Deluxe 0.070111 King 0.047150 Name: ProductPitched, dtype: float64
- The company pitches Deluxe or Basic packages to their customers more than the other packages.
- This might be because the company makes more profit from the Deluxe or Basic packages, or because these packages are less expensive and hence preferred by the majority of customers.
Type of Contact
sns.countplot(x = df['TypeofContact'])
plt.show()
df['TypeofContact'].value_counts(normalize=True)
Self Enquiry 0.70884 Company Invited 0.29116 Name: TypeofContact, dtype: float64
- Approx 71% of customers reached out to the company first, i.e., self-inquiry.
- This shows the positive outreach of the company, as most inquiries are initiated from the customer's end.
Designation
sns.countplot(x = df['Designation'])
plt.show()
df['Designation'].value_counts(normalize=True)
Executive 0.376384 Manager 0.354244 Senior Manager 0.152112 AVP 0.070111 VP 0.047150 Name: Designation, dtype: float64
- Approx 73% of the customers are at the Executive or Manager level.
- We can see that the higher the position, the fewer the observations, which makes sense as executives/managers are more common than AVPs/VPs.
Product Taken
sns.countplot(x = df['ProdTaken'])
plt.show()
df['ProdTaken'].value_counts(normalize=True)
0 0.811808 1 0.188192 Name: ProdTaken, dtype: float64
- This plot shows that the distribution of classes in the target variable is imbalanced.
- We only have approx 19% of customers who have purchased the product.
Question 3: Bivariate Analysis¶
Question 3.1: Find and visualize the correlation matrix using a heatmap and write your observations from the plot. (2 Marks)¶
cols_list = df.select_dtypes(include=np.number).columns.tolist()
plt.figure(figsize=(12, 7))
sns.heatmap(df[cols_list].corr(), annot=True, vmin=-1, vmax=1, fmt=".2f", cmap="Spectral")
plt.show()
Write your Answer here :
- Number of trips and age have a weak positive correlation, which makes sense: as age increases, the number of trips is expected to increase slightly.
- Age and monthly income are positively correlated.
- ProdTaken has a weak negative correlation with age, i.e., as age increases, the probability of purchasing a package decreases.
- No other variables have a high correlation among them.
We will define a stacked_barplot() function to help analyze how the target variable varies across predictor categories.
# Defining the stacked_barplot() function
def stacked_barplot(data,predictor,target,figsize=(10,6)):
(pd.crosstab(data[predictor],data[target],normalize='index')*100).plot(kind='bar',figsize=figsize,stacked=True)
plt.legend(loc="lower right")
plt.ylabel(target)
Question 3.2: Plot the stacked barplot for the variable Marital Status against the target variable ProdTaken using the stacked_barplot function provided and write your insights. (1 Mark)¶
stacked_barplot(df, "MaritalStatus", "ProdTaken" )
Write your Answer here :
- Married people are the most common customers for the company, but this graph shows that the conversion rate is higher for single and unmarried customers than for married customers.
- The company can target single and unmarried customers more, and can tailor packages to these customers.
Question 3.3: Plot the stacked barplot for the variable ProductPitched against the target variable ProdTaken using the stacked_barplot function provided and write your insights. (1 Mark)¶
stacked_barplot(df, "ProductPitched", "ProdTaken" )
Write your Answer here :
- The conversion rate of customers is higher if the product pitched is Basic. This might be because the basic package is less expensive.
- We saw earlier that company pitches the deluxe package more than the standard package, but the standard package shows a higher conversion rate than the deluxe package. The company can pitch standard packages more often.
Let's plot the stacked barplot for the variable Passport against the target variable ProdTaken using the stacked_barplot function.
stacked_barplot(df, "Passport", "ProdTaken" )
- The conversion rate for customers with a passport is higher as compared to the customers without a passport.
- The company should customize more international packages to attract more such customers.
Let's plot the stacked barplot for the variable Designation against the target variable ProdTaken using the stacked_barplot function.
stacked_barplot(df, "Designation", "ProdTaken" )
- The conversion rate of executives is higher than other designations.
- Customers at VP and AVP positions have the least conversion rate.
Data Preparation for Modeling¶
Separating the independent variables (X) and the dependent variable (Y)
# Separating target variable and other variables
X=data.drop(columns='ProdTaken')
Y=data['ProdTaken']
As we aim to predict customers who are more likely to buy the product, we should drop the columns 'DurationOfPitch', 'NumberOfFollowups', 'ProductPitched', and 'PitchSatisfactionScore', as these columns would not be available at the time of prediction for new data.
# Dropping columns
X.drop(columns=['DurationOfPitch','NumberOfFollowups','ProductPitched','PitchSatisfactionScore'],inplace=True)
Splitting the data into a 70% train and 30% test set
Some classification problems can exhibit a large imbalance in the distribution of the target classes: for instance, there could be several times more negative samples than positive samples. In such cases it is recommended to use stratified sampling to ensure that relative class frequencies are approximately preserved in the train and test sets.
# Splitting the data into train and test sets
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.30,random_state=1,stratify=Y)
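To see what stratify does, here is a self-contained check on toy data (not the project dataset): the positive rate is preserved in both splits.

```python
import numpy as np
from sklearn.model_selection import train_test_split

# Toy imbalanced target: 80 negatives, 20 positives (20% positive rate)
y = np.array([0] * 80 + [1] * 20)
X = np.arange(100).reshape(-1, 1)

X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.30, random_state=1, stratify=y
)
# Stratification keeps the positive rate at 20% in both splits
print(y_tr.mean(), y_te.mean())  # 0.2 0.2
```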
As we saw earlier, our data has missing values. We will impute missing values using median for continuous variables and mode for categorical variables. We will use SimpleImputer
to do this.
The SimpleImputer
provides basic strategies for imputing missing values. Missing values can be imputed with a provided constant value, or using the statistics (mean, median, or most frequent) of each column in which the missing values are located.
si1=SimpleImputer(strategy='median')
median_imputed_col=['Age','MonthlyIncome','NumberOfTrips']
# Fit and transform the train data
X_train[median_imputed_col]=si1.fit_transform(X_train[median_imputed_col])
#Transform the test data i.e. replace missing values with the median calculated using training data
X_test[median_imputed_col]=si1.transform(X_test[median_imputed_col])
si2=SimpleImputer(strategy='most_frequent')
mode_imputed_col=['TypeofContact','PreferredPropertyStar','NumberOfChildrenVisiting']
# Fit and transform the train data
X_train[mode_imputed_col]=si2.fit_transform(X_train[mode_imputed_col])
# Transform the test data i.e. replace missing values with the mode calculated using training data
X_test[mode_imputed_col]=si2.transform(X_test[mode_imputed_col])
# Checking that no column has missing values in train or test sets
print(X_train.isna().sum())
print('-'*30)
print(X_test.isna().sum())
Age 0 TypeofContact 0 CityTier 0 Occupation 0 Gender 0 NumberOfPersonVisiting 0 PreferredPropertyStar 0 MaritalStatus 0 NumberOfTrips 0 Passport 0 OwnCar 0 NumberOfChildrenVisiting 0 Designation 0 MonthlyIncome 0 dtype: int64 ------------------------------ Age 0 TypeofContact 0 CityTier 0 Occupation 0 Gender 0 NumberOfPersonVisiting 0 PreferredPropertyStar 0 MaritalStatus 0 NumberOfTrips 0 Passport 0 OwnCar 0 NumberOfChildrenVisiting 0 Designation 0 MonthlyIncome 0 dtype: int64
Let's create dummy variables for string type variables and convert other column types back to float.
# Converting data types of columns to float
for column in ['NumberOfPersonVisiting', 'Passport', 'OwnCar']:
X_train[column]=X_train[column].astype('float')
X_test[column]=X_test[column].astype('float')
# List of columns for which to create dummy variables
col_dummy=['TypeofContact', 'Occupation', 'Gender', 'MaritalStatus', 'Designation', 'CityTier']
# Encoding categorical variables
X_train=pd.get_dummies(X_train, columns=col_dummy, drop_first=True)
X_test=pd.get_dummies(X_test, columns=col_dummy, drop_first=True)
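One caveat worth noting (our own addition, not part of the assignment): calling `pd.get_dummies` separately on train and test can produce different columns if a category level is missing from the test set. A minimal sketch of the usual fix, reindexing the test dummies to the training columns (the `Designation` levels here are illustrative):

```python
import pandas as pd

# Toy example: the 'Manager' level appears in train but not in test
train = pd.DataFrame({'Designation': ['AVP', 'Executive', 'Manager']})
test = pd.DataFrame({'Designation': ['AVP', 'Executive']})

train_d = pd.get_dummies(train, columns=['Designation'], drop_first=True)
test_d = pd.get_dummies(test, columns=['Designation'], drop_first=True)

# Align the test set to the training columns, filling absent dummies with 0
test_d = test_d.reindex(columns=train_d.columns, fill_value=0)
print(list(test_d.columns) == list(train_d.columns))  # True
```

In this project the split happens to produce identical columns, but aligning explicitly makes the pipeline robust to resampling.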
Model evaluation criterion:¶
The model can make wrong predictions in two ways:¶
- Predicting a customer will buy the product and the customer doesn't buy - Loss of resources
- Predicting a customer will not buy the product and the customer buys - Loss of opportunity
Which case is more important?¶
- Predicting that a customer will not buy the product when they actually would have bought it, i.e., losing a potential source of income because that customer will not be targeted by the marketing team when they should be.
How to reduce this loss, i.e., how to reduce False Negatives?¶
- The company wants Recall to be maximized: the greater the recall, the lower the chance of false negatives.
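As a quick refresher, recall is TP / (TP + FN): the fraction of actual buyers the model catches. A toy sketch (the labels here are made up for illustration):

```python
from sklearn.metrics import confusion_matrix, recall_score

# Four actual buyers (1) and four non-buyers (0); the model catches two buyers
y_true = [1, 1, 1, 1, 0, 0, 0, 0]
y_pred = [1, 1, 0, 0, 0, 0, 0, 1]

# ravel() unpacks the 2x2 confusion matrix as (tn, fp, fn, tp)
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
print(tp / (tp + fn))                # 0.5
print(recall_score(y_true, y_pred))  # 0.5
```

Here two of the four actual buyers are missed (false negatives), so recall is 2 / (2 + 2) = 0.5.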
Building the model¶
We will be building 4 different models:
- Logistic Regression
- Support Vector Machine(SVM)
- Decision Tree
- Random Forest
Also, let's create a function to calculate and print the classification report and confusion matrix so that we don't have to rewrite the same code repeatedly for each model.
# Creating metric function
def metrics_score(actual, predicted):
print(classification_report(actual, predicted))
cm = confusion_matrix(actual, predicted)
plt.figure(figsize=(8,5))
sns.heatmap(cm, annot=True, fmt='.2f')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()
Question 4: Logistic Regression (6 Marks)¶
Question 4.1: Build a Logistic Regression model (Use the Scikit-learn library) (1 Mark)¶
# Fitting logistic regression model
lg = LogisticRegression()
lg.fit(X_train,y_train)
LogisticRegression()
Question 4.2: Check the performance of the model on train and test data (2 Marks)¶
# Checking the performance on the training data
y_pred_train = lg.predict(X_train)
metrics_score(y_train, y_pred_train)
              precision    recall  f1-score   support

           0       0.85      0.98      0.91      2777
           1       0.73      0.25      0.37       644

    accuracy                           0.84      3421
   macro avg       0.79      0.61      0.64      3421
weighted avg       0.83      0.84      0.81      3421
Write your Answer here:
- We have built a predictive model that the tourism company can use to identify customers likely to accept the new package, but with a recall of only 25% on the training data.
Let's check the performance on the test set¶
# Checking the performance on the test dataset
y_pred_test = lg.predict(X_test)
metrics_score(y_test, y_pred_test)
              precision    recall  f1-score   support

           0       0.85      0.98      0.91      1191
           1       0.69      0.23      0.34       276

    accuracy                           0.84      1467
   macro avg       0.77      0.60      0.62      1467
weighted avg       0.82      0.84      0.80      1467
Write your Answer here:
- With the default threshold of 0.5, the model gives a low recall but a decent precision score.
- We can't have both precision and recall high: increasing one typically reduces the other. This is called the precision/recall trade-off.
- So let's find an optimal threshold where we can balance both metrics.
Question 4.3: Find the optimal threshold for the model using the Precision-Recall Curve. (1 Mark)¶
The Precision-Recall curve summarizes the trade-off between recall (the true positive rate) and precision (the positive predictive value) for a predictive model across different probability thresholds.
Let's use the Precision-Recall curve and see if we can find a better threshold.
# Predict_proba gives the probability of each observation belonging to each class
y_scores_lg=lg.predict_proba(X_train)
precisions_lg, recalls_lg, thresholds_lg = precision_recall_curve(y_train, y_scores_lg[:,1])
# Plot values of precisions, recalls, and thresholds
plt.figure(figsize=(10,7))
plt.plot(thresholds_lg, precisions_lg[:-1], 'b--', label='precision')
plt.plot(thresholds_lg, recalls_lg[:-1], 'g--', label = 'recall')
plt.xlabel('Threshold')
plt.legend(loc='upper left')
plt.ylim([0,1])
plt.show()
- We want to choose a threshold that has a high recall while also keeping precision reasonably high. High recall is necessary, but we also need to be careful not to lose too much precision. So a threshold value of 0.25 should be sufficient, because it gives good recall without a significant drop in precision.
Note: We are attempting to maximize recall because that is our metric of interest. If the F1 score were the metric of interest, we would instead find the threshold that balances precision and recall; in that case, the threshold value would be 0.30.
# Setting the optimal threshold
optimal_threshold = 0.25
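Rather than reading the threshold off the plot, it can also be chosen programmatically from `precision_recall_curve`. A sketch on synthetic scores, assuming an illustrative recall target of 70% (both the target and the score distributions are our assumptions, not part of the assignment):

```python
import numpy as np
from sklearn.metrics import precision_recall_curve

# Synthetic scores: positives tend to score higher than negatives
rng = np.random.default_rng(1)
y = np.concatenate([np.zeros(800, dtype=int), np.ones(200, dtype=int)])
scores = np.concatenate([rng.beta(2, 5, 800), rng.beta(5, 2, 200)])

precisions, recalls, thresholds = precision_recall_curve(y, scores)

# recalls[:-1] aligns with thresholds; pick the largest threshold
# that still achieves at least 70% recall (maximizing precision)
mask = recalls[:-1] >= 0.70
candidate = thresholds[mask].max()
print(round(candidate, 3))
```

Because recall only decreases as the threshold rises, the largest threshold meeting the recall target is the one with the best precision among the qualifying thresholds.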
Question 4.4: Check the performance of the model on train and test data using the optimal threshold. (2 Marks)¶
# Checking the performance on the training data using the optimal threshold
y_pred_train = lg.predict_proba(X_train)
metrics_score(y_train, y_pred_train[:,1]>optimal_threshold)
              precision    recall  f1-score   support

           0       0.90      0.81      0.85      2777
           1       0.42      0.60      0.49       644

    accuracy                           0.77      3421
   macro avg       0.66      0.70      0.67      3421
weighted avg       0.81      0.77      0.78      3421
Write your Answer here :
- The model performance has improved compared to our initial model: the recall on the training data has increased from 25% to 60%.
Let's check the performance on the test set¶
y_pred_test = lg.predict_proba(X_test)
metrics_score(y_test, y_pred_test[:,1]>optimal_threshold)
              precision    recall  f1-score   support

           0       0.91      0.81      0.86      1191
           1       0.45      0.65      0.53       276

    accuracy                           0.78      1467
   macro avg       0.68      0.73      0.70      1467
weighted avg       0.82      0.78      0.80      1467
Write your Answer here :
- Using a threshold of 0.25, the model achieves a recall of 65% on the test data, up from 23% with the default threshold.
- Precision has dropped compared to the initial model, but with the optimal threshold the model provides a more balanced performance.
However, the model performance is still not good, so let's try building another model.
Question 5: Support Vector Machines (11 Marks)¶
Support vector machines are sensitive to feature scales, and training is faster on scaled data, so let's scale the features first.
scaling = MinMaxScaler(feature_range=(-1,1)).fit(X_train)
X_train_scaled = scaling.transform(X_train)
X_test_scaled = scaling.transform(X_test)
Let's build the models using two of the widely used kernel functions:
- Linear Kernel
- RBF Kernel
Question 5.1: Build a Support Vector Machine model using a linear kernel (1 Mark)¶
svm = SVC(kernel='linear',probability=True) # Linear kernel, i.e., a linear decision boundary
model = svm.fit(X= X_train_scaled, y = y_train)
Question 5.2: Check the performance of the model on train and test data (2 Marks)¶
y_pred_train_svm = model.predict(X_train_scaled)
metrics_score(y_train, y_pred_train_svm)
              precision    recall  f1-score   support

           0       0.81      1.00      0.90      2777
           1       1.00      0.00      0.00       644

    accuracy                           0.81      3421
   macro avg       0.91      0.50      0.45      3421
weighted avg       0.85      0.81      0.73      3421
Write your Answer here :
- This model has completely failed to detect class 1: it predicted virtually every instance as class 0.
- The model has a class-1 recall score of 0.
Checking model performance on test set¶
print("Testing performance:")
y_pred_test_svm = model.predict(X_test_scaled)
metrics_score(y_test, y_pred_test_svm)
Testing performance:
              precision    recall  f1-score   support

           0       0.81      1.00      0.90      1191
           1       1.00      0.00      0.01       276

    accuracy                           0.81      1467
   macro avg       0.91      0.50      0.45      1467
weighted avg       0.85      0.81      0.73      1467
Write your Answer here:
- As the dataset has an imbalanced class distribution, the model almost always predicts 0.
- So for the linear kernel, the 0.5 threshold doesn't seem to work. Let's find the optimal threshold and check whether the model performs better.
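An alternative to moving the threshold, noted here as a side experiment rather than part of the assignment, is to reweight the classes with `class_weight='balanced'`, which penalizes mistakes on the minority class more heavily. A sketch on synthetic imbalanced data:

```python
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import recall_score

# Synthetic imbalanced data: ~20% positives, shifted away from the negatives
rng = np.random.default_rng(1)
X0 = rng.normal(0.0, 1.0, size=(400, 2))
X1 = rng.normal(1.5, 1.0, size=(100, 2))
X = np.vstack([X0, X1])
y = np.array([0] * 400 + [1] * 100)

plain = SVC(kernel='linear').fit(X, y)
weighted = SVC(kernel='linear', class_weight='balanced').fit(X, y)

# Reweighting typically trades some precision for higher minority-class recall
print(recall_score(y, plain.predict(X)), recall_score(y, weighted.predict(X)))
```

Reweighting shifts the decision boundary toward the majority class, which usually raises minority-class recall at the default 0.5 threshold.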
Question 5.3: Find the optimal threshold for the model using the Precision-Recall Curve. (1 Mark)¶
# Predict on train data
y_scores_svm=model.predict_proba(X_train_scaled)
precisions_svm, recalls_svm, thresholds_svm = precision_recall_curve(y_train, y_scores_svm[:,1])
# Plot values of precisions, recalls, and thresholds
plt.figure(figsize=(10,7))
plt.plot(thresholds_svm, precisions_svm[:-1], 'b--', label='precision')
plt.plot(thresholds_svm, recalls_svm[:-1], 'g--', label = 'recall')
plt.xlabel('Threshold')
plt.legend(loc='upper left')
plt.ylim([0,1])
plt.show()
- In this case the threshold value of 0.25 seems to be good as it has good recall and there isn't much drop in precision.
optimal_threshold_svm=0.25
Question 5.4: Check the performance of the model on train and test data using the optimal threshold. (2 Marks)¶
print("Training performance:")
y_pred_train_svm = model.predict_proba(X_train_scaled)
metrics_score(y_train, y_pred_train_svm[:,1]>optimal_threshold_svm)
Training performance:
              precision    recall  f1-score   support

           0       0.89      0.82      0.85      2777
           1       0.42      0.57      0.48       644

    accuracy                           0.77      3421
   macro avg       0.65      0.69      0.67      3421
weighted avg       0.80      0.77      0.78      3421
Write your Answer here :
- The model performance has improved by selecting the optimal threshold of 0.25.
- The recall has increased from 0% to 57%.
y_pred_test = model.predict_proba(X_test_scaled)
metrics_score(y_test, y_pred_test[:,1]>optimal_threshold_svm)
              precision    recall  f1-score   support

           0       0.90      0.82      0.86      1191
           1       0.44      0.62      0.51       276

    accuracy                           0.78      1467
   macro avg       0.67      0.72      0.69      1467
weighted avg       0.81      0.78      0.79      1467
Write your Answer here :
- The SVM model with a linear kernel is not overfitting, as the accuracy is around 78% for both the train and test datasets.
- The model has a test recall of 62%, the highest among the models so far.
- At the optimal threshold of 0.25, the model performance has improved considerably: the class-1 F1 score has gone from 0.01 to 0.51.
Let's try a non-linear kernel and check whether it improves performance.
Question 5.5: Build a Support Vector Machines model using an RBF kernel (1 Mark)¶
svm_rbf=SVC(kernel='rbf',probability=True)
# Fit the model
svm_rbf.fit(X_train_scaled,y_train)
SVC(probability=True)
Question 5.6: Check the performance of the model on train and test data (2 Marks)¶
y_pred_train_svm = svm_rbf.predict(X_train_scaled)
metrics_score(y_train, y_pred_train_svm)
              precision    recall  f1-score   support

           0       0.86      0.98      0.92      2777
           1       0.83      0.32      0.46       644

    accuracy                           0.86      3421
   macro avg       0.85      0.65      0.69      3421
weighted avg       0.86      0.86      0.83      3421
Write your Answer here :
- Compared to the baseline SVM model with a linear kernel, the RBF kernel slightly improves performance on the training data.
Checking model performance on test set¶
y_pred_test = svm_rbf.predict(X_test_scaled)
metrics_score(y_test, y_pred_test)
              precision    recall  f1-score   support

           0       0.85      0.98      0.91      1191
           1       0.79      0.26      0.39       276

    accuracy                           0.85      1467
   macro avg       0.82      0.62      0.65      1467
weighted avg       0.84      0.85      0.81      1467
Write your Answer here :
- Compared to the baseline SVM model with a linear kernel, the recall on the test data has increased from 0% to 26%.
# Predict on train data
y_scores_svm=svm_rbf.predict_proba(X_train_scaled)
precisions_svm, recalls_svm, thresholds_svm = precision_recall_curve(y_train, y_scores_svm[:,1])
# Plot values of precisions, recalls, and thresholds
plt.figure(figsize=(10,7))
plt.plot(thresholds_svm, precisions_svm[:-1], 'b--', label='precision')
plt.plot(thresholds_svm, recalls_svm[:-1], 'g--', label = 'recall')
plt.xlabel('Threshold')
plt.legend(loc='upper left')
plt.ylim([0,1])
plt.show()
optimal_threshold_svm=0.17
Question 5.7: Check the performance of the model on train and test data using the optimal threshold. (2 Marks)¶
Checking model performance on training set¶
y_pred_train_svm = svm_rbf.predict_proba(X_train_scaled)  # use the RBF model here, not the linear one
metrics_score(y_train, y_pred_train_svm[:,1]>optimal_threshold_svm)
              precision    recall  f1-score   support

           0       0.92      0.60      0.73      2777
           1       0.31      0.78      0.44       644

    accuracy                           0.63      3421
   macro avg       0.62      0.69      0.59      3421
weighted avg       0.81      0.63      0.67      3421
Write your Answer here :
- The SVM model with the RBF kernel is performing better than the linear kernel.
- The model has achieved a recall score of 0.78, at the cost of a drop in precision.
- With a threshold of 0.17, the model gives a much better recall score than the initial model.
Checking model performance on test set¶
y_pred_test = svm_rbf.predict_proba(X_test_scaled)
metrics_score(y_test, y_pred_test[:,1]>optimal_threshold_svm)
              precision    recall  f1-score   support

           0       0.92      0.88      0.90      1191
           1       0.56      0.68      0.62       276

    accuracy                           0.84      1467
   macro avg       0.74      0.78      0.76      1467
weighted avg       0.85      0.84      0.85      1467
Write your Answer here :
- The recall score on the test data is 68%.
- At the optimal threshold of 0.17, the test recall has improved from 0.26 to 0.68.
- This is the best-performing model so far compared with the linear-kernel SVM and Logistic Regression, because it provides good recall without a large drop in precision.
Let's build some tree-based models and see if they can outperform the models above.
Question 6: Decision Trees (7 Marks)¶
Question 6.1: Build a Decision Tree Model (1 Mark)¶
model_dt = DecisionTreeClassifier(random_state=1)
model_dt.fit(X_train, y_train)
DecisionTreeClassifier(random_state=1)
Question 6.2: Check the performance of the model on train and test data (2 Marks)¶
# Checking performance on the training dataset
pred_train_dt = model_dt.predict(X_train)
metrics_score(y_train, pred_train_dt)
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2777
           1       1.00      1.00      1.00       644

    accuracy                           1.00      3421
   macro avg       1.00      1.00      1.00      3421
weighted avg       1.00      1.00      1.00      3421
Write your Answer here :
- Zero errors on the training set: every sample has been classified correctly.
- The model has performed suspiciously well on the training set.
- As we know, a decision tree will keep growing until it classifies each training point correctly if no restrictions are applied, learning all the patterns (including noise) in the training set.
- Let's check the performance on the test data to see if the model is overfitting.
Checking model performance on test set¶
pred_test_dt = model_dt.predict(X_test)
metrics_score(y_test, pred_test_dt)
              precision    recall  f1-score   support

           0       0.93      0.92      0.92      1191
           1       0.66      0.68      0.67       276

    accuracy                           0.88      1467
   macro avg       0.79      0.80      0.80      1467
weighted avg       0.88      0.88      0.88      1467
Write your Answer here :
- The decision tree model is clearly overfitting. However, it still performs better on the test set than the Logistic Regression and SVM models.
- We will have to tune the decision tree to reduce the overfitting.
Question 6.3: Perform hyperparameter tuning for the decision tree model using GridSearchCV (1 Mark)¶
# Choose the type of classifier.
estimator = DecisionTreeClassifier(random_state=1)
# Grid of parameters to choose from
parameters = {
"max_depth": np.arange(1,100,10),
"max_leaf_nodes": [50, 75, 150, 250],
"min_samples_split": [10, 30, 50, 70],
}
# Run the grid search
grid_obj = GridSearchCV(estimator, parameters, cv=5,scoring='recall',n_jobs=-1)
grid_obj = grid_obj.fit(X_train, y_train)
# Set the clf to the best combination of parameters
estimator = grid_obj.best_estimator_
# Fit the best algorithm to the data.
estimator.fit(X_train, y_train)
DecisionTreeClassifier(max_depth=21, max_leaf_nodes=250, min_samples_split=10, random_state=1)
Question 6.4: Check the performance of the model on the train and test data using the tuned model (2 Mark)¶
Checking performance on the training set¶
# Checking performance on the training dataset
dt_tuned = estimator.predict(X_train)
metrics_score(y_train,dt_tuned)
              precision    recall  f1-score   support

           0       0.96      0.97      0.96      2777
           1       0.85      0.82      0.84       644

    accuracy                           0.94      3421
   macro avg       0.91      0.89      0.90      3421
weighted avg       0.94      0.94      0.94      3421
# Checking performance on the test dataset
y_pred_tuned = estimator.predict(X_test)
metrics_score(y_test,y_pred_tuned)
              precision    recall  f1-score   support

           0       0.90      0.91      0.91      1191
           1       0.61      0.58      0.59       276

    accuracy                           0.85      1467
   macro avg       0.75      0.75      0.75      1467
weighted avg       0.85      0.85      0.85      1467
Write your Answer here :
- The decision tree model with default parameters overfits the training data and does not generalize well.
- The tuned model provides a more generalized performance, with balanced precision and recall values.
- However, there is still some overfitting, and performance on the test data has not improved significantly.
Visualizing the Decision Tree¶
feature_names = list(X_train.columns)
plt.figure(figsize=(20, 10))
out = tree.plot_tree(
estimator,
max_depth=4,
feature_names=feature_names,
filled=True,
fontsize=9,
node_ids=False,
class_names=None,
)
# below code will add arrows to the decision tree split if they are missing
for o in out:
arrow = o.arrow_patch
if arrow is not None:
arrow.set_edgecolor("black")
arrow.set_linewidth(1)
plt.show()
Question 6.5: What are some important features based on the tuned decision tree? (1 Mark)¶
# Importance of features in the tree building
importances = estimator.feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(8, 8))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="violet", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
Write your Answer here :
We can see that the tree has become simpler and its rules are readable.
The model's performance has become more generalized.
We observe that the most important features are:
- Monthly Income
- Age
- Number of trips
Question 7: Random Forest (4 Marks)¶
Question 7.1: Build a Random Forest Model (1 Mark)¶
rf_estimator = RandomForestClassifier( random_state = 1)
rf_estimator.fit(X_train, y_train)
RandomForestClassifier(random_state=1)
Question 7.2: Check the performance of the model on the train and test data (2 Marks)¶
y_pred_train_rf = rf_estimator.predict(X_train)
metrics_score(y_train, y_pred_train_rf)
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2777
           1       1.00      1.00      1.00       644

    accuracy                           1.00      3421
   macro avg       1.00      1.00      1.00      3421
weighted avg       1.00      1.00      1.00      3421
Write your Answer here :
- Zero errors on the training set: every sample has been classified correctly.
- As with the default decision tree, a perfect training score suggests the model may be overfitting.
y_pred_test_rf = rf_estimator.predict(X_test)
metrics_score(y_test, y_pred_test_rf)
              precision    recall  f1-score   support

           0       0.89      0.99      0.94      1191
           1       0.91      0.47      0.62       276

    accuracy                           0.89      1467
   macro avg       0.90      0.73      0.78      1467
weighted avg       0.89      0.89      0.88      1467
Write your Answer here :
- The Random Forest classifier seems to be overfitting: perfect scores on train, much weaker recall on test.
- The test recall is 0.47, which is low compared to the other models.
- We could reduce overfitting and improve recall through hyperparameter tuning.
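A minimal tuning sketch along the lines of the decision-tree grid search, run here on synthetic stand-in data; the parameter grid is an illustrative assumption, not a recommendation:

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Synthetic stand-in for the training data (~20% positives)
X_demo, y_demo = make_classification(
    n_samples=500, n_features=8, weights=[0.8, 0.2], random_state=1
)

rf = RandomForestClassifier(random_state=1)
param_grid = {
    "n_estimators": [50, 100],
    "max_depth": [4, 6, None],
    "min_samples_leaf": [1, 5],
}
# Score on recall, our metric of interest, as with the decision tree
grid = GridSearchCV(rf, param_grid, cv=3, scoring="recall", n_jobs=-1)
grid.fit(X_demo, y_demo)
print(grid.best_params_)
```

Constraining `max_depth` and `min_samples_leaf` limits how closely each tree can fit the training data, which is the usual lever against the overfitting seen above.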
Question 7.3: What are some important features based on the Random Forest? (1 Mark)¶
Let's check the feature importance of the Random Forest
importances = rf_estimator.feature_importances_
columns = X_train.columns
importance_df = pd.DataFrame(importances, index = columns, columns = ['Importance']).sort_values(by = 'Importance', ascending = False)
plt.figure(figsize=(8, 8))
plt.title("Feature Importances")
sns.barplot(x = importance_df.Importance, y = importance_df.index, color="violet")
plt.show()
Write your Answer here :
- The Random Forest confirms the results from the decision tree: the most important features are monthly income, age, and number of trips.
- Monthly income is the most important feature: customers with a higher monthly income are more likely to accept the tour package.
- Age is also a key feature, probably because customers aged between 25 and 50 are the most likely to accept the newly introduced tour package.
Conclusion:¶
- The SVM with the RBF kernel has outperformed the other models and provided balanced metrics.
- We have been able to build a predictive model, with a recall score of 0.68 on the test data, that the tourism company can use to identify customers likely to accept the new package and to formulate marketing policies accordingly.
Question 8: Conclude ANY FOUR key takeaways for business recommendations (4 Marks)¶
Write your Answer here :
- Our analysis shows that very few customers have passports and they are more likely to purchase the travel package. The company should customize more international packages to attract more such customers.
- We have customers from tier 1 and tier 3 cities but very few from tier 2 cities. The company should expand its marketing strategies to increase the number of customers from tier 2 cities.
- We saw in our analysis that people with higher income or at high positions like AVP or VP are less likely to buy the product. The company can offer short-term travel packages and customize the package for higher-income customers with added luxuries to target such customers.
- When implementing a marketing strategy, external factors, such as the number of follow-ups, time of call, should also be carefully considered as our analysis shows that the customers who have been followed up more are the ones buying the package.
- After we identify a potential customer, the company should pitch packages as per the customer's monthly income, for example, do not pitch king packages to a customer with low income and such packages can be pitched more to the higher-income customers.
- We saw in our analysis that young and single people are more likely to buy the offered packages. The company can offer discounts or customize the package to attract more couples, families, and customers above 30 years of age.
# Convert notebook to html
!jupyter nbconvert --to html "/content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week_Five_-_Classification_and_Hypothesis_Testing/Practice_Project_-_Travel_Package_Purchase_Prediction/Practice_Project_Solution_Tourism_Package_Prediction.ipynb"