Credit Card Users Churn Prediction¶
Problem Statement¶
The Thera bank recently saw a steep decline in the number of credit card users. Credit cards are a good source of income for banks because of the several types of fees they charge, such as annual fees, balance transfer fees, cash advance fees, late payment fees, international transaction fees, and others. Some fees are charged to every user regardless of usage, while others are charged only under specific circumstances.
Customers leaving the credit card service would result in a loss for the bank, so the bank wants to analyse customer data to identify customers who are likely to leave its credit card services, and the reasons for doing so, so that it can improve in those areas.
As a data scientist at Thera bank, you need to come up with a classification model that will help the bank improve its services so that customers do not give up their credit cards. You need to identify the best possible model that will give the required performance.
Dataset Description¶
- CLIENTNUM - Client number. Unique identifier for the customer holding the account
- Attrition_Flag - Internal event (customer activity) variable - if the account is closed then "Attrited Customer" else "Existing Customer"
- Customer_Age - Age in Years
- Gender - Gender of the account holder
- Dependent_count - Number of dependents
- Education_Level - Educational Qualification of the account holder - Graduate, High School, Unknown, Uneducated, College (refers to a college student), Post-Graduate, Doctorate
- Marital_Status - Marital Status of the account holder
- Income_Category - Annual Income Category of the account holder
- Card_Category - Type of Card
- Months_on_book - Period of relationship with the bank (in months)
- Total_Relationship_Count - Total no. of products held by the customer
- Months_Inactive_12_mon - No. of months inactive in the last 12 months
- Contacts_Count_12_mon - No. of Contacts in the last 12 months
- Credit_Limit - Credit Limit on the Credit Card
- Total_Revolving_Bal - Total Revolving Balance on the Credit Card
- Avg_Open_To_Buy - Open to Buy Credit Line (Average of last 12 months)
- Total_Amt_Chng_Q4_Q1 - Change in Transaction Amount (Q4 over Q1)
- Total_Trans_Amt - Total Transaction Amount (Last 12 months)
- Total_Trans_Ct - Total Transaction Count (Last 12 months)
- Total_Ct_Chng_Q4_Q1 - Change in Transaction Count (Q4 over Q1)
- Avg_Utilization_Ratio - Average Card Utilization Ratio
What is a Revolving Balance?¶
- If we don't pay the balance of the revolving credit account in full every month, the unpaid portion carries over to the next month. That's called a revolving balance.
What is the Average Open to buy?¶
- 'Open to Buy' means the amount left on your credit card to use. Now, this column represents the average of this value for the last 12 months.
What is the Average utilization Ratio?¶
- The Avg_Utilization_Ratio represents how much of the available credit the customer spent. This is useful for calculating credit scores.
Relation b/w Avg_Open_To_Buy, Credit_Limit and Avg_Utilization_Ratio:
- ( Avg_Open_To_Buy / Credit_Limit ) + Avg_Utilization_Ratio = 1
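As a quick sanity check (a small addition, not part of the original notebook), this identity can be verified directly once the dataset has been loaded into a DataFrame named data, as is done later in the notebook. Small deviations are expected because the columns are rounded 12-month averages.

import pandas as pd

# Assumes the dataset has already been read into a DataFrame named `data`,
# e.g. data = pd.read_csv("BankChurners.csv")
identity = data["Avg_Open_To_Buy"] / data["Credit_Limit"] + data["Avg_Utilization_Ratio"]

# Values should be close to 1 up to rounding in the source data
print(identity.describe())
print(((identity - 1).abs() < 1e-2).mean())  # share of rows where the identity holds approximately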
Importing the libraries and overview of the dataset¶
# Libraries to help with reading and manipulating data
import pandas as pd
import numpy as np
# Libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Removes the limit for the number of displayed columns
pd.set_option("display.max_columns", None)
# Sets the limit for the number of displayed rows
pd.set_option("display.max_rows", 200)
# Setting the precision of floating numbers to 5 decimal points
pd.set_option("display.float_format", lambda x: "%.5f" % x)
# To tune model, get different metric scores, and split data
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    recall_score,
    precision_score,
    confusion_matrix,
    roc_auc_score,
    classification_report,
    precision_recall_curve
)
from sklearn import metrics
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
# To be used for data scaling and one hot encoding
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
# To impute missing values
from sklearn.impute import SimpleImputer
# To help with model building
from sklearn.linear_model import LogisticRegression
# To build classification models
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# For tuning the model
from sklearn.model_selection import GridSearchCV
# To suppress warnings
import warnings
warnings.filterwarnings("ignore")
Loading Data¶
# Connect to google
from google.colab import drive
drive.mount('/content/drive')
# Loading the dataset
churn = pd.read_csv("/content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week_Five_-_Classification_and_Hypothesis_Testing/Hands-on Session:_Classification_and_Hypothesis_Testing/BankChurners.csv")
# Checking the number of rows and columns in the data
churn.shape
(10127, 21)
Observations:
- There are 10127 observations and 21 columns in the dataset.
Data Overview¶
# Let's create a copy of the data
data = churn.copy()
# Let's view the first 5 rows of the data
data.head()
CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 768805383 | Existing Customer | 45 | M | 3 | High School | Married | $60K - $80K | Blue | 39 | 5 | 1 | 3 | 12691.00000 | 777 | 11914.00000 | 1.33500 | 1144 | 42 | 1.62500 | 0.06100 |
1 | 818770008 | Existing Customer | 49 | F | 5 | Graduate | Single | Less than $40K | Blue | 44 | 6 | 1 | 2 | 8256.00000 | 864 | 7392.00000 | 1.54100 | 1291 | 33 | 3.71400 | 0.10500 |
2 | 713982108 | Existing Customer | 51 | M | 3 | Graduate | Married | $80K - $120K | Blue | 36 | 4 | 1 | 0 | 3418.00000 | 0 | 3418.00000 | 2.59400 | 1887 | 20 | 2.33300 | 0.00000 |
3 | 769911858 | Existing Customer | 40 | F | 4 | High School | NaN | Less than $40K | Blue | 34 | 3 | 4 | 1 | 3313.00000 | 2517 | 796.00000 | 1.40500 | 1171 | 20 | 2.33300 | 0.76000 |
4 | 709106358 | Existing Customer | 40 | M | 3 | Uneducated | Married | $60K - $80K | Blue | 21 | 5 | 1 | 0 | 4716.00000 | 0 | 4716.00000 | 2.17500 | 816 | 28 | 2.50000 | 0.00000 |
# Let's view the last 5 rows of the data
data.tail()
CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
10122 | 772366833 | Existing Customer | 50 | M | 2 | Graduate | Single | $40K - $60K | Blue | 40 | 3 | 2 | 3 | 4003.00000 | 1851 | 2152.00000 | 0.70300 | 15476 | 117 | 0.85700 | 0.46200 |
10123 | 710638233 | Attrited Customer | 41 | M | 2 | NaN | Divorced | $40K - $60K | Blue | 25 | 4 | 2 | 3 | 4277.00000 | 2186 | 2091.00000 | 0.80400 | 8764 | 69 | 0.68300 | 0.51100 |
10124 | 716506083 | Attrited Customer | 44 | F | 1 | High School | Married | Less than $40K | Blue | 36 | 5 | 3 | 4 | 5409.00000 | 0 | 5409.00000 | 0.81900 | 10291 | 60 | 0.81800 | 0.00000 |
10125 | 717406983 | Attrited Customer | 30 | M | 2 | Graduate | NaN | $40K - $60K | Blue | 36 | 4 | 3 | 3 | 5281.00000 | 0 | 5281.00000 | 0.53500 | 8395 | 62 | 0.72200 | 0.00000 |
10126 | 714337233 | Attrited Customer | 43 | F | 2 | Graduate | Married | Less than $40K | Silver | 25 | 6 | 2 | 4 | 10388.00000 | 1961 | 8427.00000 | 0.70300 | 10294 | 61 | 0.64900 | 0.18900 |
Checking the info of the dataset¶
# Let's check the data types of the columns in the dataset
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CLIENTNUM 10127 non-null int64 1 Attrition_Flag 10127 non-null object 2 Customer_Age 10127 non-null int64 3 Gender 10127 non-null object 4 Dependent_count 10127 non-null int64 5 Education_Level 8608 non-null object 6 Marital_Status 9378 non-null object 7 Income_Category 10127 non-null object 8 Card_Category 10127 non-null object 9 Months_on_book 10127 non-null int64 10 Total_Relationship_Count 10127 non-null int64 11 Months_Inactive_12_mon 10127 non-null int64 12 Contacts_Count_12_mon 10127 non-null int64 13 Credit_Limit 10127 non-null float64 14 Total_Revolving_Bal 10127 non-null int64 15 Avg_Open_To_Buy 10127 non-null float64 16 Total_Amt_Chng_Q4_Q1 10127 non-null float64 17 Total_Trans_Amt 10127 non-null int64 18 Total_Trans_Ct 10127 non-null int64 19 Total_Ct_Chng_Q4_Q1 10127 non-null float64 20 Avg_Utilization_Ratio 10127 non-null float64 dtypes: float64(5), int64(10), object(6) memory usage: 1.6+ MB
Observations:
- There are missing values in the Education_Level and Marital_Status.
- Six columns are of the object type, and the remaining columns are all numerical.
Checking for Duplicate values and Missing Values¶
# Let's check for duplicate values in the data
data.duplicated().sum()
0
# Let's check for missing values in the data
round(data.isnull().sum() / data.isnull().count() * 100, 2)
CLIENTNUM 0.00000 Attrition_Flag 0.00000 Customer_Age 0.00000 Gender 0.00000 Dependent_count 0.00000 Education_Level 15.00000 Marital_Status 7.40000 Income_Category 0.00000 Card_Category 0.00000 Months_on_book 0.00000 Total_Relationship_Count 0.00000 Months_Inactive_12_mon 0.00000 Contacts_Count_12_mon 0.00000 Credit_Limit 0.00000 Total_Revolving_Bal 0.00000 Avg_Open_To_Buy 0.00000 Total_Amt_Chng_Q4_Q1 0.00000 Total_Trans_Amt 0.00000 Total_Trans_Ct 0.00000 Total_Ct_Chng_Q4_Q1 0.00000 Avg_Utilization_Ratio 0.00000 dtype: float64
Observations:
- Education_Level has 15% missing values
- Marital_Status has about 7.4% missing values
Univariate analysis of numerical columns¶
# Let's view the statistical summary of the numerical columns in the data
data.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
CLIENTNUM | 10127.00000 | 739177606.33366 | 36903783.45023 | 708082083.00000 | 713036770.50000 | 717926358.00000 | 773143533.00000 | 828343083.00000 |
Customer_Age | 10127.00000 | 46.32596 | 8.01681 | 26.00000 | 41.00000 | 46.00000 | 52.00000 | 73.00000 |
Dependent_count | 10127.00000 | 2.34620 | 1.29891 | 0.00000 | 1.00000 | 2.00000 | 3.00000 | 5.00000 |
Months_on_book | 10127.00000 | 35.92841 | 7.98642 | 13.00000 | 31.00000 | 36.00000 | 40.00000 | 56.00000 |
Total_Relationship_Count | 10127.00000 | 3.81258 | 1.55441 | 1.00000 | 3.00000 | 4.00000 | 5.00000 | 6.00000 |
Months_Inactive_12_mon | 10127.00000 | 2.34117 | 1.01062 | 0.00000 | 2.00000 | 2.00000 | 3.00000 | 6.00000 |
Contacts_Count_12_mon | 10127.00000 | 2.45532 | 1.10623 | 0.00000 | 2.00000 | 2.00000 | 3.00000 | 6.00000 |
Credit_Limit | 10127.00000 | 8631.95370 | 9088.77665 | 1438.30000 | 2555.00000 | 4549.00000 | 11067.50000 | 34516.00000 |
Total_Revolving_Bal | 10127.00000 | 1162.81406 | 814.98734 | 0.00000 | 359.00000 | 1276.00000 | 1784.00000 | 2517.00000 |
Avg_Open_To_Buy | 10127.00000 | 7469.13964 | 9090.68532 | 3.00000 | 1324.50000 | 3474.00000 | 9859.00000 | 34516.00000 |
Total_Amt_Chng_Q4_Q1 | 10127.00000 | 0.75994 | 0.21921 | 0.00000 | 0.63100 | 0.73600 | 0.85900 | 3.39700 |
Total_Trans_Amt | 10127.00000 | 4404.08630 | 3397.12925 | 510.00000 | 2155.50000 | 3899.00000 | 4741.00000 | 18484.00000 |
Total_Trans_Ct | 10127.00000 | 64.85869 | 23.47257 | 10.00000 | 45.00000 | 67.00000 | 81.00000 | 139.00000 |
Total_Ct_Chng_Q4_Q1 | 10127.00000 | 0.71222 | 0.23809 | 0.00000 | 0.58200 | 0.70200 | 0.81800 | 3.71400 |
Avg_Utilization_Ratio | 10127.00000 | 0.27489 | 0.27569 | 0.00000 | 0.02300 | 0.17600 | 0.50300 | 0.99900 |
Observations:¶
- CLIENTNUM: It is a unique identifier for customers and can be dropped as it wouldn't add any information to our analysis.
- Customer_Age: Average age of customers is 46 years and it has a wide range from 26 to 73 years.
- Dependent_count: On average the customers in the data have 2 dependents and a maximum of 5 dependents.
- Months_on_book: All the customers of the bank have at least been with them for a year and 50% of the customers for at least 3 years.
- Total_Relationship_Count: All customers use at least one product of the bank, whereas 75% of customers use 5 or fewer products of the bank.
- Months_Inactive_12_mon: On average, customers were inactive for two months in the past 12 months. This suggests that the bank's customers are fairly active in their card usage; it would be interesting to see whether high inactivity leads to churn.
- Contacts_Count_12_mon: On average, the bank and its customers interacted twice in the past 12 months.
- Credit_Limit: There's a huge difference between the third quartile and the maximum value. The range of the credit limit is very wide, from 1438 to 34516; customers with high credit limits might be outliers.
- Total_Revolving_Bal: The average revolving balance of customers is 1162; there's not much difference between the third quartile and the maximum value.
- Avg_Open_To_Buy: The average amount that goes unused by customers is 7469. The range is very wide for this variable, and the extreme values (min and max) might be outliers.
- Total_Amt_Chng_Q4_Q1: For 75% of the customers, the transaction amount in Q4 was less than the transaction amount in Q1 (the third quartile is ~0.86, i.e. below 1).
- Total_Trans_Amt: The average transaction amount over the last 12 months is 4404; some customers spent as little as 510 while others transacted more than 18k.
- Total_Trans_Ct: On average, customers made about 65 transactions, while 75% of customers made 81 or fewer transactions.
- Total_Ct_Chng_Q4_Q1: For 75% of the customers the number of transactions in Q4 was less than the transactions made in Q1.
- Avg_Utilization_Ratio: On average customers used ~27% of the available credit amount of their card, with 75% of the customers utilizing 50% or less of their available credit amount.
Univariate analysis for categorical variables¶
data.describe(include=["object"]).T
count | unique | top | freq | |
---|---|---|---|---|
Attrition_Flag | 10127 | 2 | Existing Customer | 8500 |
Gender | 10127 | 2 | F | 5358 |
Education_Level | 8608 | 6 | Graduate | 3128 |
Marital_Status | 9378 | 3 | Married | 4687 |
Income_Category | 10127 | 6 | Less than $40K | 3561 |
Card_Category | 10127 | 4 | Blue | 9436 |
# Printing the sub-categories and their counts for each categorical column
for i in data.describe(include=["object"]).columns:
    print("Unique values in", i, "are :")
    print(data[i].value_counts())
    print("*" * 50)
Unique values in Attrition_Flag are : Attrition_Flag Existing Customer 8500 Attrited Customer 1627 Name: count, dtype: int64 ************************************************** Unique values in Gender are : Gender F 5358 M 4769 Name: count, dtype: int64 ************************************************** Unique values in Education_Level are : Education_Level Graduate 3128 High School 2013 Uneducated 1487 College 1013 Post-Graduate 516 Doctorate 451 Name: count, dtype: int64 ************************************************** Unique values in Marital_Status are : Marital_Status Married 4687 Single 3943 Divorced 748 Name: count, dtype: int64 ************************************************** Unique values in Income_Category are : Income_Category Less than $40K 3561 $40K - $60K 1790 $80K - $120K 1535 $60K - $80K 1402 abc 1112 $120K + 727 Name: count, dtype: int64 ************************************************** Unique values in Card_Category are : Card_Category Blue 9436 Silver 555 Gold 116 Platinum 20 Name: count, dtype: int64 **************************************************
Observations:
- Most of the records are for existing customers.
- Most of the bank's customers are female.
- Most customers are graduates.
- Most customers are married.
- Most customers are in the income group of less than $40K.
- Most customers have a Blue card.
- The 'abc' value in Income_Category can be considered and treated as a missing value.
Data Preprocessing¶
# CLIENTNUM consists of unique IDs for clients and hence will not add value to the modeling
data.drop(["CLIENTNUM"], axis=1, inplace=True)
## Encoding Existing and Attrited customers to 0 and 1 respectively, for analysis.
data["Attrition_Flag"].replace("Existing Customer", 0, inplace=True)
data["Attrition_Flag"].replace("Attrited Customer", 1, inplace=True)
Exploratory Data Analysis¶
Univariate analysis¶
Let's explore these variables in some more depth by observing their distributions
1.Customer_Age:¶
f, (ax_box, ax_hist) = plt.subplots(2, sharex=True, gridspec_kw={'height_ratios': (.15, .85)}, figsize=(10, 6))
sns.set_style("darkgrid")
# Add a graph in each part
sns.boxplot(data=data, x="Customer_Age", ax=ax_box)
sns.histplot(data=data, x="Customer_Age", kde=True, ax=ax_hist)
ax_box.set(xlabel='Customer Age')
plt.show()
Observations:
- Customer_Age is approximately normally distributed, with the mean and median at around 46 years.
- From the boxplot, we can see that there are a few outliers.
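Since the same boxplot-plus-histogram layout is repeated for every numerical column below, a small helper function (a sketch we are adding, using the same seaborn calls as above; the function name is our own) could reduce the repetition:

def boxplot_histogram(df, column, figsize=(10, 6)):
    """Draw a boxplot above a histogram (with KDE) for a numerical column."""
    fig, (ax_box, ax_hist) = plt.subplots(
        2, sharex=True, gridspec_kw={"height_ratios": (0.15, 0.85)}, figsize=figsize
    )
    sns.boxplot(data=df, x=column, ax=ax_box)
    sns.histplot(data=df, x=column, kde=True, ax=ax_hist)
    ax_box.set(xlabel="")
    ax_hist.set_xlabel(column)
    plt.show()

# Example: reproduces the Customer_Age plot above
boxplot_histogram(data, "Customer_Age")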
2.Months_on_book¶
f, (ax_box, ax_hist) = plt.subplots(2, sharex=True, gridspec_kw={"height_ratios": (.15, .85)})
sns.boxplot(data=data, x="Months_on_book", ax=ax_box)
sns.histplot(data=data, x="Months_on_book", ax=ax_hist, stat='density')
sns.kdeplot(data=data, x="Months_on_book", ax=ax_hist, color='blue')
ax_box.set(xlabel='Months_on_book')
plt.show()
Observations:
- Most customers have been with the bank for about 3 years (the median is 36 months).
- From the boxplot, we can see that there are outliers on both sides of the whiskers.
3.Credit_Limit¶
f, (ax_box, ax_hist) = plt.subplots(2, sharex=True, gridspec_kw={"height_ratios": (.15, .85)})
sns.boxplot(data=data, x="Credit_Limit", ax=ax_box)
sns.histplot(data=data, x="Credit_Limit", ax=ax_hist, stat='density')
sns.kdeplot(data=data, x="Credit_Limit", ax=ax_hist, color='blue')
ax_box.set(xlabel='Credit_Limit')
plt.show()
Observations:
- The distribution of the Credit_Limit is skewed to the right.
- There are quite a few customers at the maximum credit limit of about 35000.
- 50% of the bank's customers have a credit limit of less than 5000.
4.Total_Revolving_Bal¶
f, (ax_box, ax_hist) = plt.subplots(2, sharex=True, gridspec_kw={"height_ratios": (.15, .85)})
sns.boxplot(data=data, x="Total_Revolving_Bal", ax=ax_box)
sns.histplot(data=data, x="Total_Revolving_Bal", ax=ax_hist, stat='density')
sns.kdeplot(data=data, x="Total_Revolving_Bal", ax=ax_hist, color='blue')
ax_box.set(xlabel='Total_Revolving_Bal')
plt.show()
Observations:
- Many customers pay their credit card dues in full and have a revolving balance of 0.
- There are also quite a few customers with a revolving balance close to the maximum of about 2500.
5.Avg_Open_To_Buy¶
f, (ax_box, ax_hist) = plt.subplots(2, sharex=True, gridspec_kw={"height_ratios": (.15, .85)})
sns.boxplot(data=data, x="Avg_Open_To_Buy", ax=ax_box)
sns.histplot(data=data, x="Avg_Open_To_Buy", ax=ax_hist, stat='density')
sns.kdeplot(data=data, x="Avg_Open_To_Buy", ax=ax_hist, color='blue')
ax_box.set(xlabel='')
ax_hist.set_xlabel('Avg_Open_To_Buy')
plt.tight_layout()
plt.show()
Observations:
- The distribution of the Avg_Open_To_Buy column is right-skewed.
- Low values of Avg_Open_To_Buy can reflect either a low credit limit or heavy usage of the limit; as discussed further below, for most customers this is driven by low credit limits, while only a few customers (on the right tail) are left with a large unused credit amount.
6.Total_Trans_Ct¶
f, (ax_box, ax_hist) = plt.subplots(2, sharex=True, gridspec_kw={"height_ratios": (.15, .85)})
sns.boxplot(data=data, x=data["Total_Trans_Ct"], ax=ax_box)
sns.histplot(data["Total_Trans_Ct"], ax=ax_hist, kde=True, stat="density") # Add kde=True and stat="density"
ax_box.set(xlabel='Total_Trans_Ct')
plt.show()
Observations:
- The distribution of Total_Trans_Ct shows two peaks, at around 40 and 80 transactions in a year, which indicates that customers typically used their credit cards roughly 3 to 7 times a month.
7.Total_Amt_Chng_Q4_Q1¶
f, (ax_box, ax_hist) = plt.subplots(2, sharex=True, gridspec_kw={"height_ratios": (.15, .85)})
sns.boxplot(data=data, x=data["Total_Amt_Chng_Q4_Q1"], ax=ax_box)
sns.histplot(data["Total_Amt_Chng_Q4_Q1"], ax=ax_hist, kde=True, stat="density", bins=30) # Adjust the number of bins
ax_box.set(xlabel='Total_Amt_Chng_Q4_Q1')
plt.show()
Observations:
- The distribution of Total_Amt_Chng_Q4_Q1 looks normally distributed but there's a slight skew towards the right.
- From the boxplot, we can see that there are outliers on both sides of the whiskers.
8.Total_Trans_Amt¶
f, (ax_box, ax_hist) = plt.subplots(2, sharex=True, gridspec_kw={"height_ratios": (.15, .85)})
sns.boxplot(data=data, x=data["Total_Trans_Amt"], ax=ax_box)
sns.histplot(data=data["Total_Trans_Amt"], ax=ax_hist, kde=True, stat='density')
ax_box.set(xlabel='Total_Trans_Amt')
plt.show()
Observations:
- The distribution of Total_Trans_Amt is skewed to the right.
- Data shows two peaks at total transaction amounts, one around 2500 and the other around the mean value of 4500.
- From the boxplot, we can see that there are outliers - customers with more than ~8000 total transaction amounts are being considered as outliers.
- It would be interesting to check whether the customers churning are the ones spending less with the card or the ones spending more. If it is the latter, the bank has a problem, as it is losing valuable customers.
9.Total_Ct_Chng_Q4_Q1¶
f, (ax_box, ax_hist) = plt.subplots(2, sharex=True, gridspec_kw={"height_ratios": (.15, .85)})
sns.boxplot(data=data, x=data["Total_Ct_Chng_Q4_Q1"], ax=ax_box)
sns.histplot(data=data["Total_Ct_Chng_Q4_Q1"], ax=ax_hist, kde=True, stat='density')
ax_box.set(xlabel='Total_Ct_Chng_Q4_Q1')
plt.show()
Observations:
- The distribution of Total_Ct_Chng_Q4_Q1 looks normally distributed but there's a slight skew towards the right.
- From the boxplot, we can see that there are outliers on both sides of the whiskers.
10.Avg_Utilization_Ratio¶
f, (ax_box, ax_hist) = plt.subplots(2, sharex=True, gridspec_kw={"height_ratios": (.15, .85)})
sns.boxplot(data=data, x="Avg_Utilization_Ratio", ax=ax_box)
sns.histplot(data=data, x="Avg_Utilization_Ratio", ax=ax_hist, kde=True, stat='density')
sns.kdeplot(data=data, x="Avg_Utilization_Ratio", ax=ax_hist, color='blue')
ax_box.set(xlabel='Avg_Utilization_Ratio')
plt.show()
Observations:
- The distribution of Avg_Utilization_Ratio is skewed to the right.
- This distribution is not a positive sign for the bank as most of the customers are not utilizing their credit amount.
Credit limit, Average open to buy, and Average utilization ratio are all right-skewed.
Open to buy means how much credit a customer has left.
- Low values of Open to buy could represent either:
  - Customers have low credit limits, or
  - Customers are spending a lot, so they have less left open to buy.
Average utilization ratio = (1 - (open to buy / credit limit))
- Low values of the Average utilization ratio represent:
  - (Open to buy / credit limit) is nearly equal to 1 -> Open to buy is nearly equal to the credit limit -> customers are spending less using their credit cards.
Credit limit is also right-skewed, which means most customers have low credit limits.
Based on these three variables, we can conclude that the majority of customers have low credit limits and do not use their credit cards frequently. This justifies the right skewness of all three variables.
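The right skew claimed above can also be quantified directly (a small check we are adding; positive skewness values indicate a right skew):

# Positive values confirm the right (positive) skew of all three columns
print(data[["Credit_Limit", "Avg_Open_To_Buy", "Avg_Utilization_Ratio"]].skew())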
Bivariate Analysis¶
numeric_data = data.select_dtypes(include=['float64', 'int64'])
plt.figure(figsize=(15, 7))
sns.heatmap(numeric_data.corr(), annot=True, vmin=-1, vmax=1, fmt=".2f", cmap="Spectral")
plt.show()
Observations:
- Attrition_Flag shows a bit of a negative correlation with Total_Trans_Ct (total transactions) and Total_Trans_Amt (total transaction amount).
- There are strong positive correlations between the pairs Months_on_book and Customer_Age, Total_Revolving_Bal and Avg_Utilization_Ratio, and Total_Trans_Amt and Total_Trans_Ct.
- There are negative correlations between Total_Relationship_Count and both Total_Trans_Amt and Total_Trans_Ct, and between Avg_Utilization_Ratio and both Avg_Open_To_Buy and Credit_Limit.
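To read the relationship with the target more directly than from the full heatmap, the correlations with the encoded Attrition_Flag can be listed and sorted (a small addition using the same numeric_data frame defined above):

# Correlation of each numeric feature with the encoded target, sorted ascending
print(
    numeric_data.corr()["Attrition_Flag"]
    .drop("Attrition_Flag")
    .sort_values()
)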
Attrition_Flag vs Gender¶
(pd.crosstab(data['Gender'],data['Attrition_Flag'],normalize='index')*100).plot(kind='bar',figsize=(8,4),stacked=True)
plt.ylabel('Percentage Attrition %')
Text(0, 0.5, 'Percentage Attrition %')
Observations:
- There's not much difference in attrition percentages for Males and Females.
- ~20% of both Males and Females attrite.
Attrition_Flag vs Marital_Status¶
(pd.crosstab(data['Marital_Status'],data['Attrition_Flag'],normalize='index')*100).plot(kind='bar',figsize=(8,4),stacked=True)
plt.ylabel('Percentage Attrition %')
Text(0, 0.5, 'Percentage Attrition %')
Observations:
- There's not much difference in attrition percentages for Marital_Status.
- ~20% of Singles, Divorced attrite.
- Married customers attrite the least.
Attrition_Flag vs Education_Level¶
(pd.crosstab(data['Education_Level'],data['Attrition_Flag'],normalize='index')*100).plot(kind='bar',figsize=(8,4),stacked=True)
plt.ylabel('Percentage Attrition %')
Text(0, 0.5, 'Percentage Attrition %')
Observations:
- Customers with higher education - Doctorates and Post Graduates are the ones most (~20% for both education levels) attriting.
Attrition_Flag vs Income_Category¶
(pd.crosstab(data['Income_Category'],data['Attrition_Flag'],normalize='index')*100).plot(kind='bar',figsize=(8,4),stacked=True)
plt.ylabel('Percentage Attrition %')
Text(0, 0.5, 'Percentage Attrition %')
Observations:
- The customers from the two extreme income groups (earning less than $40K and earning $120K+) are the ones attriting the most.
Attrition_Flag vs Card_Category¶
(pd.crosstab(data['Card_Category'],data['Attrition_Flag'],normalize='index')*100).plot(kind='bar',figsize=(8,4),stacked=True)
plt.ylabel('Percentage Attrition %')
Text(0, 0.5, 'Percentage Attrition %')
Observations:
- Attrition is highest (~35%) among customers with Platinum cards, followed by ~30% among customers with Gold cards.
- Customers with Platinum and Gold cards are our premium customers, and the highest attrition being among these customers is alarming, as they are the ones using the premium cards provided by the bank.
Attrition_Flag vs Contacts_Count_12_mon¶
(pd.crosstab(data['Contacts_Count_12_mon'],data['Attrition_Flag'],normalize='index')*100).plot(kind='bar',figsize=(8,4),stacked=True)
plt.ylabel('Percentage Attrition %')
Text(0, 0.5, 'Percentage Attrition %')
Observations:
- Highest attrition is among the customers who interacted the most with the bank.
- This signifies that the bank is not able to resolve the problems faced by customers leading to attrition.
- A preliminary step to identify attriting customers would be to look out for customers who have reached out to them repeatedly.
Attrition_Flag vs Months_Inactive_12_mon¶
(pd.crosstab(data['Months_Inactive_12_mon'],data['Attrition_Flag'],normalize='index')*100).plot(kind='bar',figsize=(8,4),stacked=True)
plt.ylabel('Percentage Attrition %')
Text(0, 0.5, 'Percentage Attrition %')
Observations:
- As inactivity increases, attrition also increases (over the 2-4 month range).
- The pattern at the extremes (0 and 6 months) is harder to interpret: customers who used the card most recently attrited the most, while those who were inactive for 6 months attrited less.
Attrition_Flag vs Total_Relationship_Count¶
(pd.crosstab(data['Total_Relationship_Count'],data['Attrition_Flag'],normalize='index')*100).plot(kind='bar',figsize=(8,4),stacked=True)
plt.ylabel('Percentage Attrition %')
Text(0, 0.5, 'Percentage Attrition %')
Observations:
- Attrition is highest among the customers who are using 1 or 2 products offered by the bank - together they constitute ~55% of the attrition.
- Customers who use more than 3 products are the ones least attriting, such customers might be more financially stable and actively invest in different services provided by the bank.
Attrition_Flag vs Dependent_count¶
(pd.crosstab(data['Dependent_count'],data['Attrition_Flag'],normalize='index')*100).plot(kind='bar',figsize=(8,4),stacked=True)
plt.ylabel('Percentage Attrition %')
Text(0, 0.5, 'Percentage Attrition %')
Observations:
- The more dependents a customer has, the higher the attrition; more responsibilities might lead to financial instability for such customers.
- Attrition is fairly low for customers with 0 or 1 dependents.
Let's find the percentage of outliers, in each column of the data, using IQR.¶
numeric_data = data.select_dtypes(include=['float64', 'int64'])
# Calculate quartiles and IQR
Q1 = numeric_data.quantile(0.25)
Q3 = numeric_data.quantile(0.75)
IQR = Q3 - Q1
# Find lower and upper bounds for outliers
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR
(
(data.select_dtypes(include=["float64", "int64"]) < lower)
| (data.select_dtypes(include=["float64", "int64"]) > upper)
).sum() / len(data) * 100
Attrition_Flag 16.06596 Customer_Age 0.01975 Dependent_count 0.00000 Months_on_book 3.81159 Total_Relationship_Count 0.00000 Months_Inactive_12_mon 3.26849 Contacts_Count_12_mon 6.21112 Credit_Limit 9.71660 Total_Revolving_Bal 0.00000 Avg_Open_To_Buy 9.50923 Total_Amt_Chng_Q4_Q1 3.91034 Total_Trans_Amt 8.84764 Total_Trans_Ct 0.01975 Total_Ct_Chng_Q4_Q1 3.89059 Avg_Utilization_Ratio 0.00000 dtype: float64
Observations:
- After identifying outliers, we can decide whether to remove/treat them or not. It depends on one's approach; here we are not going to treat them, as there will be outliers in real-world scenarios (in age, total transaction amount, number of transactions, etc.), and we would want our model to learn the underlying pattern for such customers.
Missing value imputation¶
- We will first replace 'abc' values with 'np.nan' in Income_Category.
- We will impute missing values in all 3 columns using mode.
data1 = data.copy()
data1["Income_Category"].replace("abc", np.nan, inplace=True)
data1.isna().sum()
Attrition_Flag 0 Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 1519 Marital_Status 749 Income_Category 1112 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64
Separating the independent variables (X) and the dependent variable (Y)
X = data1.drop(["Attrition_Flag"], axis=1)
y = data1["Attrition_Flag"]
Splitting the data into 70% train and 30% test set
Some classification problems can exhibit a large imbalance in the distribution of the target classes: for instance there could be several times more negative samples than positive samples. In such cases it is recommended to use the stratified sampling technique to ensure that relative class frequencies are approximately preserved in each train and validation fold.
# Splitting the data
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=1,stratify=y)
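As a quick check (not part of the original flow), the class proportions can be compared to confirm that stratify=y preserved the class balance in both splits:

# Class proportions should be nearly identical across the full data, train, and test sets
print(y.value_counts(normalize=True))
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))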
Missing Values Imputation
# Columns to impute
reqd_col_for_impute = ["Education_Level", "Marital_Status", "Income_Category"]
# Simple Imputer with Mode
imputer = SimpleImputer(strategy="most_frequent")
# Fit and transform the train data
X_train[reqd_col_for_impute] = imputer.fit_transform(X_train[reqd_col_for_impute])
# Transform the test data
X_test[reqd_col_for_impute] = imputer.transform(X_test[reqd_col_for_impute])
# Checking that no column has missing values in train or test sets
print(X_train.isna().sum())
print("-" * 30)
print(X_test.isna().sum())
Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 0 Marital_Status 0 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64 ------------------------------ Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 0 Marital_Status 0 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64
Observations:
- All missing values have been treated.
cols = X_train.select_dtypes(include=["object", "category"])
for i in cols.columns:
    print(X_train[i].value_counts())
    print("*" * 30)
Gender F 3770 M 3318 Name: count, dtype: int64 ****************************** Education_Level Graduate 3247 High School 1425 Uneducated 1031 College 709 Post-Graduate 364 Doctorate 312 Name: count, dtype: int64 ****************************** Marital_Status Married 3815 Single 2771 Divorced 502 Name: count, dtype: int64 ****************************** Income_Category Less than $40K 3273 $40K - $60K 1254 $80K - $120K 1084 $60K - $80K 974 $120K + 503 Name: count, dtype: int64 ****************************** Card_Category Blue 6621 Silver 375 Gold 78 Platinum 14 Name: count, dtype: int64 ******************************
cols = X_test.select_dtypes(include=["object", "category"])
for i in cols.columns:
    print(X_test[i].value_counts())
    print("*" * 30)
Gender F 3770 M 3318 Name: count, dtype: int64 ****************************** Education_Level Graduate 3247 High School 1425 Uneducated 1031 College 709 Post-Graduate 364 Doctorate 312 Name: count, dtype: int64 ****************************** Marital_Status Married 3815 Single 2771 Divorced 502 Name: count, dtype: int64 ****************************** Income_Category Less than $40K 3273 $40K - $60K 1254 $80K - $120K 1084 $60K - $80K 974 $120K + 503 Name: count, dtype: int64 ****************************** Card_Category Blue 6621 Silver 375 Gold 78 Platinum 14 Name: count, dtype: int64 ******************************
Encoding categorical variables¶
X_train = pd.get_dummies(X_train, drop_first=True)
X_test = pd.get_dummies(X_test, drop_first=True)
print(X_train.shape, X_test.shape)
(7088, 29) (3039, 29)
Observations:
- After encoding there are 29 columns.
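One caveat with calling pd.get_dummies separately on the train and test sets is that a category missing from one split would produce mismatched columns. Here the split is large enough that every category appears in both sets (both frames end up with 29 columns), but a defensive alignment step (a sketch we are adding, not in the original notebook) would look like this:

# Align the test columns to the training columns; any dummy column absent from
# the test split is added and filled with 0
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)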
X_train.head()
Customer_Age | Dependent_count | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | Gender_M | Education_Level_Doctorate | Education_Level_Graduate | Education_Level_High School | Education_Level_Post-Graduate | Education_Level_Uneducated | Marital_Status_Married | Marital_Status_Single | Income_Category_$40K - $60K | Income_Category_$60K - $80K | Income_Category_$80K - $120K | Income_Category_Less than $40K | Card_Category_Gold | Card_Category_Platinum | Card_Category_Silver | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
4124 | 50 | 1 | 43 | 6 | 1 | 2 | 7985.00000 | 0 | 7985.00000 | 1.03200 | 3873 | 72 | 0.67400 | 0.00000 | False | False | True | False | False | False | True | False | False | False | False | True | False | False | False |
4686 | 50 | 0 | 36 | 3 | 3 | 2 | 5444.00000 | 2499 | 2945.00000 | 0.46800 | 4509 | 80 | 0.66700 | 0.45900 | True | False | True | False | False | False | False | False | False | True | False | False | False | False | False |
1276 | 26 | 0 | 13 | 6 | 3 | 4 | 1643.00000 | 1101 | 542.00000 | 0.71300 | 2152 | 50 | 0.47100 | 0.67000 | False | False | True | False | False | False | False | True | True | False | False | False | False | False | False |
6119 | 65 | 0 | 55 | 3 | 3 | 0 | 2022.00000 | 0 | 2022.00000 | 0.57900 | 4623 | 65 | 0.54800 | 0.00000 | False | False | False | False | False | False | False | True | False | False | False | True | False | False | False |
2253 | 46 | 3 | 35 | 6 | 3 | 4 | 4930.00000 | 0 | 4930.00000 | 1.01900 | 3343 | 77 | 0.63800 | 0.00000 | True | False | True | False | False | False | False | True | False | False | True | False | False | False | False |
Model evaluation criterion¶
The model can make two types of wrong predictions:
- Predicting a customer will attrite and the customer doesn't attrite.
- Predicting a customer will not attrite and the customer attrites.
Which case is more important?
- Predicting that a customer will not attrite when the customer actually attrites, i.e. losing a valuable customer or asset.
How to reduce this loss i.e the need to reduce False Negatives?
- The bank would want Recall to be maximized: the greater the Recall, the higher the chances of minimizing false negatives. Hence, the focus should be on increasing Recall, i.e. minimizing the false negatives, or in other words correctly identifying the positive class (Class 1), so that the bank can retain its valuable customers by identifying those at risk of attrition.
Also, let's create a function to calculate and print the classification report and confusion matrix so that we don't have to rewrite the same code repeatedly for each model.
# Creating metric function
def metrics_score(actual, predicted):
    print(classification_report(actual, predicted))
    cm = confusion_matrix(actual, predicted)
    plt.figure(figsize=(8, 5))
    sns.heatmap(cm, annot=True, fmt='.2f', xticklabels=['Not Attrite', 'Attrite'], yticklabels=['Not Attrite', 'Attrite'])
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()
Building the model¶
We will be building 4 different models:
- Logistic Regression
- Support Vector Machine(SVM)
- Decision Tree
- Random Forest
Logistic Regression Model¶
Logistic Regression is a supervised learning algorithm used for binary classification problems, i.e. where the dependent variable is categorical and has only two possible values. In logistic regression, we apply the sigmoid function to a linear combination of the features x to calculate the probability of the event y = 1:
P(y = 1 | x) = 1 / (1 + exp(-(b0 + b1*x1 + ... + bn*xn)))
# Fitting logistic regression model
lg = LogisticRegression()
lg.fit(X_train,y_train)
LogisticRegression()
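To tie the fitted model back to the sigmoid formula above, the class-1 probabilities can be reproduced by hand from the learned intercept and coefficients (a small check we are adding; the cast to float is needed because get_dummies produced boolean columns):

# Linear term: intercept + X · coefficients
z = lg.intercept_[0] + X_train.astype(float).values @ lg.coef_[0]

# The sigmoid of the linear term should match sklearn's predict_proba for class 1
manual_proba = 1 / (1 + np.exp(-z))
print(np.allclose(manual_proba, lg.predict_proba(X_train)[:, 1]))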
Let's check the model performance
# Checking the performance on the training data
y_pred_train = lg.predict(X_train)
metrics_score(y_train, y_pred_train)
precision recall f1-score support 0 0.90 0.96 0.93 5949 1 0.69 0.44 0.53 1139 accuracy 0.88 7088 macro avg 0.79 0.70 0.73 7088 weighted avg 0.87 0.88 0.87 7088
- The reported average includes the macro average which averages the unweighted mean per label, and the weighted average i.e. averaging the support-weighted mean per label.
- In classification, the class of interest is considered the positive class. Here, the class of interest is 1 i.e. identifying the customers who are at risk of attrition.
Reading the confusion matrix (clockwise):
True Negative (Actual=0, Predicted=0): Model predicts that a customer would not attrite and the customer does not attrite
False Positive (Actual=0, Predicted=1): Model predicts that a customer would attrite but the customer does not attrite
False Negative (Actual=1, Predicted=0): Model predicts that a customer would not attrite but the customer attrites
True Positive (Actual=1, Predicted=1): Model predicts that a customer would attrite and the customer attrites
# Checking the performance on the test dataset
y_pred_test = lg.predict(X_test)
metrics_score(y_test, y_pred_test)
precision recall f1-score support 0 0.90 0.96 0.93 2551 1 0.70 0.44 0.54 488 accuracy 0.88 3039 macro avg 0.80 0.70 0.74 3039 weighted avg 0.87 0.88 0.87 3039
Observations:
- We are getting an accuracy of around 88% on both the train and test datasets.
- However, the recall for class 1 is only around 44% on both the train and test datasets.
- As the recall is low, this model will not perform well in identifying the customers who have a high chance of leaving the bank, meaning it will not help in reducing the attrition rate.
- As we can see from the Confusion Matrix, this model fails to identify the majority of customers who will attrite.
Let's check the coefficients and find which variables are leading to attrition and which can help to reduce the attrition:
# Printing the coefficients of logistic regression
cols=X_train.columns
coef_lg=lg.coef_
pd.DataFrame(coef_lg,columns=cols).T.sort_values(by=0,ascending=False)
0 | |
---|---|
Contacts_Count_12_mon | 0.31531 |
Months_Inactive_12_mon | 0.24919 |
Dependent_count | 0.20002 |
Customer_Age | 0.11755 |
Income_Category_Less than $40K | 0.05269 |
Marital_Status_Single | 0.04579 |
Education_Level_Graduate | 0.01085 |
Education_Level_Post-Graduate | 0.00870 |
Education_Level_Doctorate | 0.00737 |
Education_Level_Uneducated | 0.00455 |
Card_Category_Silver | 0.00365 |
Card_Category_Gold | 0.00318 |
Income_Category_$40K - $60K | 0.00264 |
Card_Category_Platinum | 0.00175 |
Avg_Utilization_Ratio | 0.00137 |
Total_Trans_Amt | 0.00038 |
Avg_Open_To_Buy | 0.00034 |
Education_Level_High School | -0.00032 |
Credit_Limit | -0.00035 |
Total_Revolving_Bal | -0.00068 |
Income_Category_$80K - $120K | -0.00609 |
Total_Amt_Chng_Q4_Q1 | -0.01180 |
Income_Category_$60K - $80K | -0.01614 |
Marital_Status_Married | -0.01957 |
Gender_M | -0.03225 |
Total_Ct_Chng_Q4_Q1 | -0.05203 |
Months_on_book | -0.09103 |
Total_Trans_Ct | -0.09617 |
Total_Relationship_Count | -0.21321 |
Observations:
Features which positively affect on the attrition rate are:
- Contacts_Count_12_mon
- Months_Inactive_12_mon
- Dependent_count
- Customer_Age
- Income_Category_Less than $40K
- Marital_Status_Single
- Education_Level_Graduate
- Education_Level_Post-Graduate
- Education_Level_Doctorate
- Avg_Utilization_Ratio
Features which negatively affect on the attrition rate are:
- Total_Relationship_Count
- Total_Trans_Ct
- Months_on_book
- Total_Ct_Chng_Q4_Q1
- Marital_Status_Married
- Income_Category_$60K - $80K
- Total_Amt_Chng_Q4_Q1
Observations:
Based on the Logistic Regression model, Contacts_Count_12_mon is the most important feature in detecting whether a customer will attrite or not. The highest attrition is among the customers who interacted the most with the bank, which suggests that the bank is not able to resolve the problems faced by customers, leading to attrition.
This model also suggests that attrition depends on the customers' activity: as inactivity increases, attrition also increases.
Dependent_count is an important variable in predicting the attrition rate. The more dependents a customer has, the higher the attrition; more responsibilities might lead to financial instability for such customers.
The education level of customers also shows an interesting pattern: customers with higher education (Doctorates and Post-Graduates) are the ones attriting the most.
The customers in the income group earning less than $40K are the ones attriting the most.
Other features which appear to affect the chances of attrition are Marital Status and Avg_Utilization_Ratio.
The model also captures the inverse relation between Total_Relationship_Count and attrition, suggesting that customers who use more of the bank's products are the least likely to attrite; such customers might be more financially stable and actively invest in different services provided by the bank.
Customers who are doing more transactions with the bank have a lower chance of attrition; fewer transactions lead to higher attrition.
From Total_Ct_Chng_Q4_Q1 and Total_Amt_Chng_Q4_Q1, it is clear that customers who didn't attrite showed less variability from Q4 to Q1 than the ones who attrited.
The coefficients of the logistic regression model give us the log of odds, which is hard to interpret in the real world. We can convert the log of odds into real odds by taking its exponential.
# Finding the odds
odds = np.exp(lg.coef_[0])
# Adding the odds to a dataframe and sorting the values
pd.DataFrame(odds, X_train.columns, columns=['odds']).sort_values(by='odds', ascending=False)
odds | |
---|---|
Contacts_Count_12_mon | 1.37069 |
Months_Inactive_12_mon | 1.28299 |
Dependent_count | 1.22143 |
Customer_Age | 1.12474 |
Income_Category_Less than $40K | 1.05410 |
Marital_Status_Single | 1.04686 |
Education_Level_Graduate | 1.01091 |
Education_Level_Post-Graduate | 1.00874 |
Education_Level_Doctorate | 1.00739 |
Education_Level_Uneducated | 1.00456 |
Card_Category_Silver | 1.00366 |
Card_Category_Gold | 1.00319 |
Income_Category_$40K - $60K | 1.00265 |
Card_Category_Platinum | 1.00176 |
Avg_Utilization_Ratio | 1.00137 |
Total_Trans_Amt | 1.00038 |
Avg_Open_To_Buy | 1.00034 |
Education_Level_High School | 0.99968 |
Credit_Limit | 0.99965 |
Total_Revolving_Bal | 0.99932 |
Income_Category_$80K - $120K | 0.99392 |
Total_Amt_Chng_Q4_Q1 | 0.98827 |
Income_Category_$60K - $80K | 0.98399 |
Marital_Status_Married | 0.98062 |
Gender_M | 0.96826 |
Total_Ct_Chng_Q4_Q1 | 0.94930 |
Months_on_book | 0.91299 |
Total_Trans_Ct | 0.90831 |
Total_Relationship_Count | 0.80799 |
Observations
- The odds of a customer who contacts the bank more often attriting are about 1.3 times the odds of one who does not, probably because the bank is not able to resolve the problems faced by customers, leading to attrition.
- The odds of an inactive customer attriting are about 1.2 times the odds of a customer who stays actively in touch with the bank.
- The odds of a customer with more dependents attriting are about 1.2 times the odds of a customer with fewer or no dependents.
Precision-Recall Curve for logistic regression¶
Precision-Recall curves summarize the trade-off between the true positive rate and the positive predictive value for a predictive model using different probability thresholds.
# Predict_proba gives the probability of each observation belonging to each class
y_scores_lg=lg.predict_proba(X_train)
precisions_lg, recalls_lg, thresholds_lg = precision_recall_curve(y_train, y_scores_lg[:,1])
# Plot values of precisions, recalls, and thresholds
plt.figure(figsize=(10,7))
plt.plot(thresholds_lg, precisions_lg[:-1], 'b--', label='precision')
plt.plot(thresholds_lg, recalls_lg[:-1], 'g--', label = 'recall')
plt.xlabel('Threshold')
plt.legend(loc='upper left')
plt.ylim([0,1])
plt.show()
# Calculating the exact threshold where precision and recall are equal.
for i in np.arange(len(thresholds_lg)):
    if precisions_lg[i] == recalls_lg[i]:
        print(thresholds_lg[i])
0.34806960434560535
Observation:
- We can see that precision and recall are balanced for a threshold of about ~0.35.
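The exact-equality loop above happens to find a crossing point here, but in general the sampled precision and recall values may never be exactly equal at any threshold; a more robust variant (our own small addition) picks the threshold where the two are closest:

# Threshold at which |precision - recall| is smallest
idx = np.argmin(np.abs(precisions_lg[:-1] - recalls_lg[:-1]))
print(thresholds_lg[idx])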
Let's find out the performance of the model at this threshold
optimal_threshold=.35
y_pred_train = lg.predict_proba(X_train)
metrics_score(y_train, y_pred_train[:,1]>optimal_threshold)
precision recall f1-score support 0 0.92 0.93 0.92 5949 1 0.61 0.60 0.61 1139 accuracy 0.87 7088 macro avg 0.77 0.76 0.77 7088 weighted avg 0.87 0.87 0.87 7088
Observations:
- The model performance has improved. The recall has increased significantly for class 1.
Let's check the performance on the test data.
optimal_threshold1=.35
y_pred_test = lg.predict_proba(X_test)
metrics_score(y_test, y_pred_test[:,1]>optimal_threshold1)
precision recall f1-score support 0 0.92 0.93 0.93 2551 1 0.62 0.60 0.61 488 accuracy 0.88 3039 macro avg 0.77 0.77 0.77 3039 weighted avg 0.88 0.88 0.88 3039
Observation:
- The model is giving a similar performance on the test and train data i.e. the model is giving a generalized performance.
- The recall of the test data has increased significantly while at the same time, the precision has decreased slightly, which is to be expected while adjusting the threshold.
- The average recall and precision for the model are good but let's see if we can get better performance using other algorithms.
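Since the same adjust-the-threshold evaluation is repeated for the SVM below, a small helper (a convenience wrapper we are adding around the notebook's metrics_score function) keeps the pattern in one place:

def evaluate_at_threshold(model, X, y, threshold):
    """Evaluate a fitted probabilistic classifier at a custom decision threshold."""
    proba = model.predict_proba(X)[:, 1]
    metrics_score(y, proba > threshold)

# Example: reproduces the logistic regression results above
evaluate_at_threshold(lg, X_train, y_train, 0.35)
evaluate_at_threshold(lg, X_test, y_test, 0.35)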
Support Vector Machines¶
# To Speed-Up SVM training.
scaling = MinMaxScaler(feature_range=(-1,1)).fit(X_train)
X_train_scaled = scaling.transform(X_train)
X_test_scaled = scaling.transform(X_test)
Let's build the models using two of the widely used kernel functions:
- Linear Kernel
- RBF Kernel
Linear Kernel¶
# Fitting SVM
svm = SVC(kernel = 'linear') # Linear kernel or linear decision boundary
model = svm.fit(X = X_train_scaled, y = y_train)
y_pred_train_svm = model.predict(X_train_scaled)
metrics_score(y_train, y_pred_train_svm)
precision recall f1-score support 0 0.92 0.97 0.95 5949 1 0.80 0.58 0.67 1139 accuracy 0.91 7088 macro avg 0.86 0.77 0.81 7088 weighted avg 0.90 0.91 0.90 7088
# Checking performance on the test data
y_pred_test_svm = model.predict(X_test_scaled)
metrics_score(y_test, y_pred_test_svm)
precision recall f1-score support 0 0.92 0.97 0.94 2551 1 0.78 0.56 0.65 488 accuracy 0.90 3039 macro avg 0.85 0.76 0.80 3039 weighted avg 0.90 0.90 0.90 3039
- The SVM model with a linear kernel is not overfitting, as the accuracy is around 90% for both the train and test datasets.
- The recall of class 1 is only around 56-58%, which implies the model will miss many of the customers who are likely to attrite.
- The precision is quite good, so the customers the model does flag as likely to attrite are mostly true positives, which saves cost and effort for the bank.
RBF Kernel¶
svm_rbf=SVC(kernel='rbf',probability=True)
svm_rbf.fit(X_train_scaled,y_train)
y_scores_svm=svm_rbf.predict_proba(X_train_scaled) # Predict_proba gives the probability of each observation belonging to each class
precisions_svm, recalls_svm, thresholds_svm = precision_recall_curve(y_train, y_scores_svm[:,1])
# Plot values of precisions, recalls, and thresholds
plt.figure(figsize=(10,7))
plt.plot(thresholds_svm, precisions_svm[:-1], 'b--', label='precision')
plt.plot(thresholds_svm, recalls_svm[:-1], 'g--', label = 'recall')
plt.xlabel('Threshold')
plt.legend(loc='upper left')
plt.ylim([0,1])
plt.show()
# Calculating the exact threshold where precision and recall are equal.
for i in np.arange(len(thresholds_svm)):
    if precisions_svm[i] == recalls_svm[i]:
        print(thresholds_svm[i])
0.3122015685746674
optimal_threshold1=0.31
y_pred_train = svm_rbf.predict_proba(X_train_scaled)
metrics_score(y_train, y_pred_train[:,1]>optimal_threshold1)
precision recall f1-score support 0 0.96 0.96 0.96 5949 1 0.77 0.77 0.77 1139 accuracy 0.93 7088 macro avg 0.86 0.86 0.86 7088 weighted avg 0.93 0.93 0.93 7088
y_pred_test = svm_rbf.predict_proba(X_test_scaled)
metrics_score(y_test, y_pred_test[:,1]>optimal_threshold1)
precision recall f1-score support 0 0.94 0.93 0.94 2551 1 0.67 0.71 0.69 488 accuracy 0.90 3039 macro avg 0.80 0.82 0.81 3039 weighted avg 0.90 0.90 0.90 3039
Observations:
- At the optimal threshold of 0.31, the model performance has improved significantly. The recall for class 1 has improved from around 0.56 (linear kernel) to 0.77 on the train set, roughly a 20-percentage-point increase, and the model is giving reasonably generalized results.
- Moreover, this model uses the RBF kernel, so the model is performing well with a non-linear decision boundary.
- As the recall is good, this model will perform well in identifying those customers who have a high chance of leaving the bank, meaning it will eventually help in reducing the attrition rate.
Decision Tree¶
# Building decision tree model
model_dt= DecisionTreeClassifier(random_state=1,max_depth=8)
model_dt.fit(X_train, y_train)
DecisionTreeClassifier(max_depth=8, random_state=1)
Let's check the model performance of decision tree
# Checking performance on the training dataset
pred_train_dt = model_dt.predict(X_train)
metrics_score(y_train, pred_train_dt)
precision recall f1-score support 0 0.98 0.99 0.99 5949 1 0.93 0.92 0.93 1139 accuracy 0.98 7088 macro avg 0.96 0.95 0.96 7088 weighted avg 0.98 0.98 0.98 7088
Observation:
- Model has performed very well on the training set.
- As we know, a decision tree will continue to grow and classify each data point correctly if no restrictions are applied, since the tree learns all the patterns in the training set.
Let's check the performance on test data to see if the model is overfitting.
pred_test_dt = model_dt.predict(X_test)
metrics_score(y_test, pred_test_dt)
precision recall f1-score support 0 0.97 0.96 0.97 2551 1 0.82 0.82 0.82 488 accuracy 0.94 3039 macro avg 0.89 0.89 0.89 3039 weighted avg 0.94 0.94 0.94 3039
Observation:
- The decision tree model is slightly overfitting the data here.
- We can tune the hyperparameters to increase the performance and reduce overfitting.
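GridSearchCV and StratifiedKFold were imported at the top but are not used in this section; a hedged sketch of how the tree's hyperparameters could be tuned for recall is shown below (the grid values are illustrative examples, not tuned recommendations):

# Illustrative parameter grid - values are examples only
dt_param_grid = {
    "max_depth": [4, 6, 8, 10],
    "min_samples_leaf": [1, 5, 10, 25],
    "class_weight": [None, "balanced"],
}

dt_grid = GridSearchCV(
    DecisionTreeClassifier(random_state=1),
    dt_param_grid,
    scoring="recall",  # recall of class 1 is the metric of interest here
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=1),
    n_jobs=-1,
)
dt_grid.fit(X_train, y_train)
print(dt_grid.best_params_, round(dt_grid.best_score_, 3))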
Let's visualize the decision tree and observe the decision rules:
features = list(X_train.columns)
plt.figure(figsize=(20,20))
from sklearn import tree
tree.plot_tree(model_dt,feature_names=features,max_depth =4, filled=True,fontsize=9,node_ids=True,class_names=True)
plt.show()
# Importance of features in the tree building
feature_names = list(X_train.columns)
importances = model_dt.feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(8, 8))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="violet", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
Observation:
- So, Total_Trans_Ct is the most important feature, followed by Total_Revolving_Bal and Total_Trans_Amt, which makes sense: customers who are doing more transactions with the bank have a lower chance of attrition.
- Total_Ct_Chng_Q4_Q1, Total_Relationship_Count, and Total_Amt_Chng_Q4_Q1 are also important factors.
Random Forest¶
# Fitting the Random Forest classifier on the training data
rf_estimator = RandomForestClassifier(random_state = 1)
rf_estimator.fit(X_train, y_train)
RandomForestClassifier(random_state=1)
# Checking performance on the training data
y_pred_train_rf = rf_estimator.predict(X_train)
metrics_score(y_train, y_pred_train_rf)
precision recall f1-score support 0 1.00 1.00 1.00 5949 1 1.00 1.00 1.00 1139 accuracy 1.00 7088 macro avg 1.00 1.00 1.00 7088 weighted avg 1.00 1.00 1.00 7088
Observation:
- For all the metrics in the training dataset, the Random Forest gives a 100% score.
# Checking performance on the testing data
y_pred_test_rf = rf_estimator.predict(X_test)
metrics_score(y_test, y_pred_test_rf)
precision recall f1-score support 0 0.96 0.99 0.97 2551 1 0.92 0.79 0.85 488 accuracy 0.96 3039 macro avg 0.94 0.89 0.91 3039 weighted avg 0.95 0.96 0.95 3039
Observations:
- The Random Forest classifier seems to be overfitting the training data. The recall on the training data is 1, while the recall on the test data is only ~ 0.80 for class 1.
- Precision is high for the test data as well.
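cross_val_score was also imported earlier; as a quick addition, cross-validated recall on the training folds gives a less optimistic view of the random forest than the in-sample scores above:

# 5-fold cross-validated recall for the positive (attrited) class
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
rf_cv_recall = cross_val_score(
    RandomForestClassifier(random_state=1), X_train, y_train,
    scoring="recall", cv=cv, n_jobs=-1,
)
print(round(rf_cv_recall.mean(), 3), round(rf_cv_recall.std(), 3))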
Let's check the feature importance of the Random Forest
importances = rf_estimator.feature_importances_
columns = X_train.columns
importance_df = pd.DataFrame(importances, index = columns, columns = ['Importance']).sort_values(by = 'Importance', ascending = False)
plt.figure(figsize = (13, 13))
sns.barplot(x = importance_df.Importance, y = importance_df.index)
<Axes: xlabel='Importance', ylabel='None'>
Observations:
- The Random Forest further verifies the result from the decision tree that the most important features are Total_Trans_Ct and Total_Trans_Amt.
Business Recommendations:¶
We have successfully built a predictive model that the bank can:
a) Deploy to identify customers at risk of attrition.
b) Use to determine the key factors driving attrition.
c) Leverage to take appropriate actions and develop better customer retention policies.
Key Factors Driving Attrition:¶
Total_Trans_Ct (Total Transaction Count)
Customers with fewer transactions in a year are more likely to churn. To encourage card usage, the bank can offer incentives such as cashback, special discounts on purchases, and other promotional benefits.

Total_Revolving_Bal (Total Revolving Balance)
Customers with lower revolving balances tend to attrite, likely because they have cleared their dues and opted out of the credit card service. After a customer clears their balance, the bank should request feedback to understand the reasons behind their decision.

Total_Trans_Amt (Total Transaction Amount)
A lower transaction count often results in a lower total transaction amount, ultimately increasing the risk of attrition. The bank can introduce offers on high-value purchases, benefiting both the customer and the bank.

Total_Relationship_Count
Attrition is highest among customers using only one or two bank products, accounting for approximately 55% of total attrition. The bank should investigate the issues customers face with these products. Improved customer support and greater transparency may help retain them.
Additional Insights:¶
Targeting Female Customers
Female customers tend to utilize their credit more, make frequent transactions, and spend higher amounts. However, their credit limits are relatively lower. Increasing their credit limits could be beneficial for both the bank and the customers.

Months_Inactive
The likelihood of attrition increases with inactivity. Customers inactive for 2-4 months contribute the most to attrition. The bank can engage them through automated messages highlighting their monthly activity, new offers, or services.

Customer Support and Attrition
The highest attrition rates are observed among customers who frequently contact the bank, indicating unresolved issues. Implementing a feedback system can help assess customer satisfaction with issue resolution. If dissatisfaction persists, the bank should take corrective action promptly.
# Convert notebook to html
!jupyter nbconvert --to html "/content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week_Five_-_Classification_and_Hypothesis_Testing/Hands-on Session:_Classification_and_Hypothesis_Testing/Credit_Card_Users_Churn_Prediction.ipynb"
[NbConvertApp] Converting notebook /content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week_Five_-_Classification_and_Hypothesis_Testing/Hands-on Session:_Classification_and_Hypothesis_Testing/Credit_Card_Users_Churn_Prediction.ipynb to html [NbConvertApp] WARNING | Alternative text is missing on 37 image(s). [NbConvertApp] Writing 2528924 bytes to /content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week_Five_-_Classification_and_Hypothesis_Testing/Hands-on Session:_Classification_and_Hypothesis_Testing/Credit_Card_Users_Churn_Prediction.html