MLS Case Study: Unsupervised Learning¶
Welcome to the case study on Unsupervised Learning. We will be using the Credit Card Customer Data for this case study.
Problem Statement:¶
AllLife Bank wants to focus on its credit card customer base in the next financial year. They have been advised by their marketing research team that the penetration in the market can be improved. Based on this input, the Marketing team proposes to run personalized campaigns to target new customers as well as upsell to existing customers. Another insight from the market research was that the customers perceive the support services of the bank poorly. Based on this, the Operations team wants to upgrade the service delivery model to ensure that customers' queries are resolved faster. The Head of Marketing and Head of Delivery both decide to reach out to the Data Science team for help.
Objective:¶
Identify different segments in the existing customer base based on their spending patterns as well as past interactions with the bank.
About the data:¶
The data is of various customers of a bank and includes their average credit limit, the total number of credit cards each customer holds, and the different channels through which the customer has contacted the bank with queries. These channels include visiting the bank, contacting it online, and calling a call centre.
- Sl_no - Customer Serial Number
- Customer Key - Customer identification
- Avg_Credit_Limit - Average credit limit (the currency is not specified; you can make an assumption around this)
- Total_Credit_Cards - Total number of credit cards
- Total_visits_bank - Total bank visits
- Total_visits_online - Total online visits
- Total_calls_made - Total calls made
Importing libraries and overview of the dataset¶
# Import all the necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# To scale the data using z-score
from sklearn.preprocessing import StandardScaler
# Importing clustering algorithms
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
# scikit-learn-extra provides K-Medoids, which is not part of core scikit-learn
!pip install scikit-learn-extra
from sklearn_extra.cluster import KMedoids
from sklearn.metrics import silhouette_score
import warnings
warnings.filterwarnings("ignore")
# Connect to Google Drive
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
Loading data¶
data = pd.read_excel('Credit Card Customer Data.xlsx')
data.head()
| | Sl_No | Customer Key | Avg_Credit_Limit | Total_Credit_Cards | Total_visits_bank | Total_visits_online | Total_calls_made |
|---|---|---|---|---|---|---|---|
| 0 | 1 | 87073 | 100000 | 2 | 1 | 1 | 0 |
| 1 | 2 | 38414 | 50000 | 3 | 0 | 10 | 9 |
| 2 | 3 | 17341 | 50000 | 7 | 1 | 3 | 4 |
| 3 | 4 | 40496 | 30000 | 5 | 1 | 1 | 4 |
| 4 | 5 | 47437 | 100000 | 6 | 0 | 12 | 3 |
Check the info of the data¶
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 655 entries, 0 to 654
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   Sl_No                655 non-null    int64
 1   Customer Key         655 non-null    int64
 2   Avg_Credit_Limit     655 non-null    int64
 3   Total_Credit_Cards   655 non-null    int64
 4   Total_visits_bank    655 non-null    int64
 5   Total_visits_online  655 non-null    int64
 6   Total_calls_made     655 non-null    int64
dtypes: int64(7)
memory usage: 35.9 KB
Observations:
- There are 655 observations and 7 columns in the dataset.
- All columns have 655 non-null values i.e. there are no missing values.
- All columns are of int64 data type.
There are no missing values. Let us now check the number of unique values in each column.
data.nunique()
Sl_No                  655
Customer Key           655
Avg_Credit_Limit       109
Total_Credit_Cards      10
Total_visits_bank        6
Total_visits_online     16
Total_calls_made        11
dtype: int64
- Customer Key, which is an identifier, has no repeated values. Identifiers carry no information for clustering, so we should treat this column accordingly (i.e. drop it) before applying any algorithm.
Data Preprocessing and Exploratory Data Analysis¶
First we need to drop the variables that are not required for our analysis.
data.drop(columns = ['Sl_No', 'Customer Key'], inplace = True)
Now that we have dropped the unnecessary columns, we can check for duplicates. Duplicates here mean customers with identical values across all features.
data[data.duplicated()]
| | Avg_Credit_Limit | Total_Credit_Cards | Total_visits_bank | Total_visits_online | Total_calls_made |
|---|---|---|---|---|---|
| 162 | 8000 | 2 | 0 | 3 | 4 |
| 175 | 6000 | 1 | 0 | 2 | 5 |
| 215 | 8000 | 4 | 0 | 4 | 7 |
| 295 | 10000 | 6 | 4 | 2 | 3 |
| 324 | 9000 | 4 | 5 | 0 | 4 |
| 360 | 18000 | 6 | 3 | 1 | 4 |
| 377 | 12000 | 6 | 5 | 2 | 1 |
| 384 | 8000 | 7 | 4 | 2 | 0 |
| 394 | 5000 | 4 | 5 | 0 | 1 |
| 452 | 47000 | 6 | 2 | 0 | 4 |
| 494 | 52000 | 4 | 2 | 1 | 2 |
We can drop these duplicated rows from the data.
data=data[~data.duplicated()]
data.shape
(644, 5)
- After removing the identifier columns and the duplicated rows, there are 644 unique observations and 5 columns in our data.
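Since dropping rows leaves gaps in the DataFrame index (the removed rows kept labels like 162 and 175), an optional housekeeping step is to reset it. A minimal sketch, assuming we want a clean positional index:

# Reset the index so row labels run 0..643 again after dropping duplicates
data = data.reset_index(drop=True)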
Summary Statistics¶
data.describe().T
| | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| Avg_Credit_Limit | 644.0 | 34543.478261 | 37428.704286 | 3000.0 | 11000.0 | 18000.0 | 48000.00 | 200000.0 |
| Total_Credit_Cards | 644.0 | 4.694099 | 2.175338 | 1.0 | 3.0 | 5.0 | 6.00 | 10.0 |
| Total_visits_bank | 644.0 | 2.395963 | 1.626964 | 0.0 | 1.0 | 2.0 | 4.00 | 5.0 |
| Total_visits_online | 644.0 | 2.624224 | 2.957728 | 0.0 | 1.0 | 2.0 | 4.00 | 15.0 |
| Total_calls_made | 644.0 | 3.608696 | 2.880025 | 0.0 | 1.0 | 3.0 | 5.25 | 10.0 |
Observations:
- The average credit limit has a wide range, from a minimum of 3K to a maximum of 200K.
- The mean of the average credit limit is approximately 34.5K, with a large standard deviation of 37.4K (a dispersion check follows below).
- The average number of credit cards per customer is approximately 5 (rounding to the nearest integer).
- On average, a customer makes 2 bank visits, 3 online visits, and 4 calls.
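One way to substantiate how dispersed Avg_Credit_Limit is relative to the other features is the coefficient of variation (standard deviation divided by mean, which is unitless and therefore comparable across features); a quick sketch:

# Coefficient of variation for each feature: higher means more dispersed relative to its mean
print((data.std() / data.mean()).round(2))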
Now let's explore each variable at hand. We will check the distribution and outliers for each variable in the data.¶
for col in data.columns:
    print(col)
    print('Skew :', round(data[col].skew(), 2))
    plt.figure(figsize=(15, 4))
    plt.subplot(1, 2, 1)
    data[col].hist(bins=10, grid=False)
    plt.ylabel('count')
    plt.subplot(1, 2, 2)
    sns.boxplot(x=data[col])
    plt.show()
Avg_Credit_Limit Skew : 2.19
Total_Credit_Cards Skew : 0.17
Total_visits_bank Skew : 0.15
Total_visits_online Skew : 2.21
Total_calls_made Skew : 0.65
Observation:
- The average credit limit and total online visits have right-skewed distributions with some outliers (see the sketch below for one common way to reduce such skew).
- The majority of customers have 4 credit cards, 2 bank visits, and 4 calls.
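As referenced above, a log transform is one common option for taming heavily right-skewed features. A minimal sketch, not used in the rest of this case study (which relies on z-scoring instead):

# np.log1p computes log(1 + x), which is safe for the zero counts in Total_visits_online
skewed_cols = ['Avg_Credit_Limit', 'Total_visits_online']
data_log = data.copy()
data_log[skewed_cols] = np.log1p(data_log[skewed_cols])
print(data_log[skewed_cols].skew().round(2))  # skew should be much closer to 0 than the raw values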
Now, let's check the correlation among different variables.
plt.figure(figsize=(8,6))
sns.heatmap(data.corr(), annot=True, fmt='0.2f')
plt.show()
Observation:
- Avg_Credit_Limit is positively correlated with Total_Credit_Cards and Total_visits_online, which makes sense: customers with higher limits tend to hold more cards and transact online more often.
- Avg_Credit_Limit is negatively correlated with Total_calls_made and Total_visits_bank.
- Total_visits_bank, Total_visits_online, and Total_calls_made are negatively correlated with each other, which suggests that the majority of customers use only one of these channels to contact the bank.
Scaling the data¶
scaler=StandardScaler()
data_scaled=pd.DataFrame(scaler.fit_transform(data), columns=data.columns)
data_scaled.head()
| | Avg_Credit_Limit | Total_Credit_Cards | Total_visits_bank | Total_visits_online | Total_calls_made |
|---|---|---|---|---|---|
| 0 | 1.750192 | -1.239437 | -0.858684 | -0.549573 | -1.253982 |
| 1 | 0.413280 | -0.779381 | -1.473803 | 2.495669 | 1.873420 |
| 2 | 0.413280 | 1.060843 | -0.858684 | 0.127148 | 0.135974 |
| 3 | -0.121485 | 0.140731 | -0.858684 | -0.549573 | 0.135974 |
| 4 | 1.750192 | 0.600787 | -1.473803 | 3.172390 | -0.211515 |
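As a quick sanity check on the scaling, every column of the z-scored data should now have a mean of approximately 0 and a standard deviation of approximately 1:

# Verify the z-scoring: means ~0, standard deviations ~1
print(data_scaled.mean().round(2))
print(data_scaled.std().round(2))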
#Creating copy of the data to store labels from each algorithm
data_scaled_copy = data_scaled.copy(deep=True)
K-Means¶
Let us now fit the k-means algorithm on our scaled data and find the optimum number of clusters to use.
We will do this in 3 steps:
- Initialize a dictionary to store the SSE for each k
- Run for a range of Ks and store SSE for each run
- Plot the SSE vs K and find the elbow
# Step 1: initialize an empty dictionary to store the WCSS (SSE) for each k
WCSS = {}

# Step 2: iterate over a range of k values, fit K-Means on the scaled data,
# and store the inertia (within-cluster sum of squares) for each k
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(data_scaled)
    WCSS[k] = kmeans.inertia_

# Step 3: plot WCSS vs k and look for the elbow
plt.figure()
plt.plot(list(WCSS.keys()), list(WCSS.values()), 'bx-')
plt.xlabel("Number of clusters")
plt.ylabel("WCSS")
plt.show()
Observations:
- Looking at the plot, we can say that the elbow point is achieved at k=3.
- We will fit the k-means again with k=3 to get the labels.
# Empty dictionary to store the silhouette score for each value of k
sc = {}

# Iterate over a range of k values, fit K-Means on the scaled data,
# and store the silhouette score for each k (random_state fixed for reproducibility)
for k in range(2, 10):
    kmeans = KMeans(n_clusters=k, random_state=1).fit(data_scaled)
    labels = kmeans.predict(data_scaled)
    sc[k] = silhouette_score(data_scaled, labels)

# Plot of silhouette score vs k (higher is better)
plt.figure()
plt.plot(list(sc.keys()), list(sc.values()), 'bx-')
plt.xlabel("Number of clusters")
plt.ylabel("Silhouette Score")
plt.show()
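Rather than eyeballing the plot, the best k by silhouette score can also be read off programmatically:

# Pick the k whose silhouette score is highest
best_k = max(sc, key=sc.get)
print(f"k with the highest silhouette score: {best_k} (score = {sc[best_k]:.3f})")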
Fitting the K-Means algorithm on the scaled data with the number of clusters equal to 3, and storing the predictions as 'Labels' in the 'data_scaled_copy' and 'data' dataframes
kmeans = KMeans(n_clusters=3, max_iter= 1000, random_state=1)
kmeans.fit(data_scaled)
#Adding predicted labels to the original data and scaled data
data_scaled_copy['Labels'] = kmeans.predict(data_scaled)
data['Labels'] = kmeans.predict(data_scaled)
We have generated the labels with k-means. Let us look at the various features based on the labels.
Creating cluster profiles using the summary statistics and box plots below for each label¶
#Number of observations in each cluster
data.Labels.value_counts()
1    374
0    221
2     49
Name: Labels, dtype: int64
#Calculating summary statistics of the original data for each label
mean = data.groupby('Labels').mean()
median = data.groupby('Labels').median()
df_kmeans = pd.concat([mean, median], axis=0)
df_kmeans.index = ['group_0 Mean', 'group_1 Mean', 'group_2 Mean', 'group_0 Median', 'group_1 Median', 'group_2 Median']
df_kmeans.T
| | group_0 Mean | group_1 Mean | group_2 Mean | group_0 Median | group_1 Median | group_2 Median |
|---|---|---|---|---|---|---|
| Avg_Credit_Limit | 12239.819005 | 33893.048128 | 140102.040816 | 12000.0 | 31500.0 | 145000.0 |
| Total_Credit_Cards | 2.411765 | 5.508021 | 8.775510 | 2.0 | 6.0 | 9.0 |
| Total_visits_bank | 0.945701 | 3.489305 | 0.591837 | 1.0 | 3.0 | 1.0 |
| Total_visits_online | 3.561086 | 0.975936 | 10.979592 | 4.0 | 1.0 | 11.0 |
| Total_calls_made | 6.891403 | 1.997326 | 1.102041 | 7.0 | 2.0 | 1.0 |
#Visualizing different features w.r.t K-means labels
data_scaled_copy.boxplot(by = 'Labels', layout = (1,5),figsize=(20,7))
plt.show()
Cluster Profiles:
- Cluster 0 - Low Credit Customers: This cluster represents customers with low spending who prefer to interact with the bank via calls, and therefore make fewer bank and online visits.
- Cluster 1 - Medium Credit Customers: This cluster represents customers who spend comparatively more than low credit customers and visit the bank more often, and therefore make fewer calls and online visits.
- Cluster 2 - High Credit Customers: This cluster represents customers with high spending who prefer to contact the bank through the online channel, and therefore make fewer calls and bank visits.
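As a quick numeric check on the final k=3 solution, we can compute its silhouette score on the scaled data (this uses the kmeans object fitted above):

# Silhouette score of the final 3-cluster solution; values closer to 1 indicate better-separated clusters
print(round(silhouette_score(data_scaled, kmeans.labels_), 3))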
K-Medoids¶
kmedo = KMedoids(n_clusters = 3, random_state=1)
kmedo.fit(data_scaled)
data_scaled_copy['kmedoLabels'] = kmedo.predict(data_scaled)
data['kmedoLabels'] = kmedo.predict(data_scaled)
#Number of observations in each cluster
data.kmedoLabels.value_counts()
2    289
0    222
1    133
Name: kmedoLabels, dtype: int64
Creating cluster profiles using the summary statistics and box plots below for each label, and comparing the clusters from the two algorithms - K-Means and K-Medoids
#Calculating summary statistics of the original data for each label
original_features = ["Avg_Credit_Limit","Total_Credit_Cards","Total_visits_bank","Total_visits_online","Total_calls_made"]
mean = data.groupby('kmedoLabels').mean()
median = data.groupby('kmedoLabels').median()
df_kmedoids = pd.concat([mean, median], axis=0)
df_kmedoids.index = ['group_0 Mean', 'group_1 Mean', 'group_2 Mean', 'group_0 Median', 'group_1 Median', 'group_2 Median']
df_kmedoids[original_features].T
| | group_0 Mean | group_1 Mean | group_2 Mean | group_0 Median | group_1 Median | group_2 Median |
|---|---|---|---|---|---|---|
| Avg_Credit_Limit | 12216.216216 | 85052.631579 | 28449.826990 | 12000.0 | 68000.0 | 20000.0 |
| Total_Credit_Cards | 2.423423 | 7.030075 | 5.363322 | 2.0 | 7.0 | 5.0 |
| Total_visits_bank | 0.950450 | 1.691729 | 3.830450 | 1.0 | 2.0 | 4.0 |
| Total_visits_online | 3.554054 | 4.639098 | 0.982699 | 4.0 | 2.0 | 1.0 |
| Total_calls_made | 6.878378 | 1.969925 | 1.851211 | 7.0 | 2.0 | 2.0 |
#plotting boxplots with the new K-Medoids based labels
features_with_labels = ["Avg_Credit_Limit", "Total_Credit_Cards", "Total_visits_bank", "Total_visits_online", "Total_calls_made", "kmedoLabels"]
data_scaled_copy[features_with_labels].boxplot(by='kmedoLabels', layout=(1, 5), figsize=(20, 7))
plt.show()
Let's compare the clusters from K-Means and K-Medoids
# Side-by-side comparison: for each feature, the K-Medoids column appears first, followed by K-Means
comparison = pd.concat([df_kmedoids, df_kmeans], axis=1)[original_features]
comparison
| | Avg_Credit_Limit (K-Medoids) | Avg_Credit_Limit (K-Means) | Total_Credit_Cards (K-Medoids) | Total_Credit_Cards (K-Means) | Total_visits_bank (K-Medoids) | Total_visits_bank (K-Means) | Total_visits_online (K-Medoids) | Total_visits_online (K-Means) | Total_calls_made (K-Medoids) | Total_calls_made (K-Means) |
|---|---|---|---|---|---|---|---|---|---|---|
| group_0 Mean | 12216.216216 | 12239.819005 | 2.423423 | 2.411765 | 0.950450 | 0.945701 | 3.554054 | 3.561086 | 6.878378 | 6.891403 |
| group_1 Mean | 85052.631579 | 33893.048128 | 7.030075 | 5.508021 | 1.691729 | 3.489305 | 4.639098 | 0.975936 | 1.969925 | 1.997326 |
| group_2 Mean | 28449.826990 | 140102.040816 | 5.363322 | 8.775510 | 3.830450 | 0.591837 | 0.982699 | 10.979592 | 1.851211 | 1.102041 |
| group_0 Median | 12000.000000 | 12000.000000 | 2.000000 | 2.000000 | 1.000000 | 1.000000 | 4.000000 | 4.000000 | 7.000000 | 7.000000 |
| group_1 Median | 68000.000000 | 31500.000000 | 7.000000 | 6.000000 | 2.000000 | 3.000000 | 2.000000 | 1.000000 | 2.000000 | 2.000000 |
| group_2 Median | 20000.000000 | 145000.000000 | 5.000000 | 9.000000 | 4.000000 | 1.000000 | 1.000000 | 11.000000 | 2.000000 | 1.000000 |
Cluster Profiles:
- Cluster 0 - Low Credit Customers: This cluster represents customers with low spending who prefer to interact with the bank via calls, and therefore make fewer bank and online visits.
- Cluster 1 - High Credit Customers: This cluster represents customers with high spending who prefer to contact the bank through the online channel, and therefore make fewer calls and bank visits.
- Cluster 2 - Medium Credit Customers: This cluster represents customers who spend comparatively more than low credit customers and visit the bank more often, and therefore make fewer calls and online visits.
Comparing Clusters:
- The cluster sizes (counts of observations) from K-Medoids differ from the K-Means clusters. In K-Medoids, the observations are more evenly distributed among the clusters.
- The K-Medoids clusters are more compact, i.e. they have smaller ranges for all variables. For example, the range of the total number of credit cards is 4 to 7 for medium credit customers from K-Medoids, whereas from K-Means it is 2 to 7.
- K-Medoids is more robust to outliers, which helps to identify outliers within the clusters. For example, the total number of bank visits has an outlier for high credit customers from K-Medoids, whereas from K-Means we could not detect any outliers in this variable.
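To put a number on how much the two partitions agree, one option (not part of the original analysis) is the Adjusted Rand Index, which equals 1 for identical partitions regardless of how the cluster IDs are numbered; a minimal sketch using the label columns created above:

from sklearn.metrics import adjusted_rand_score

# ARI compares the two partitions independently of cluster numbering
ari = adjusted_rand_score(data['Labels'], data['kmedoLabels'])
print(f"Adjusted Rand Index (K-Means vs K-Medoids): {ari:.3f}")

# The cross-tabulation shows exactly where the two algorithms assign customers differently
print(pd.crosstab(data['Labels'], data['kmedoLabels'], rownames=['K-Means'], colnames=['K-Medoids']))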
Additional Content (Optional)¶
Gaussian Mixture¶
Let's create clusters using Gaussian Mixture Models
Finding the optimal k using the silhouette score for the GMM model
# Empty dictionary to store the silhouette score for each value of k
sc = {}

# Iterate over a range of k values, fit a Gaussian Mixture Model on the scaled data,
# and store the silhouette score for each k
for k in range(2, 10):
    GMM = GaussianMixture(n_components=k).fit(data_scaled)
    labels = GMM.predict(data_scaled)
    sc[k] = silhouette_score(data_scaled, labels)

# Plot of silhouette score vs k (higher is better)
plt.figure()
plt.plot(list(sc.keys()), list(sc.values()), 'bx-')
plt.xlabel("Number of clusters")
plt.ylabel("Silhouette Score")
plt.show()
Fitting the GMM model
gmm = GaussianMixture(n_components = 3)
gmm.fit(data_scaled)
data_scaled_copy['GmmLabels'] = gmm.predict(data_scaled)
data['GmmLabels'] = gmm.predict(data_scaled)
#Number of observations in each cluster
data.GmmLabels.value_counts()
0    374
1    221
2     49
Name: GmmLabels, dtype: int64
Creating cluster profiles using the summary statistics and box plots below for each label, and comparing the clusters from the two algorithms - K-Means and Gaussian Mixture
#Calculating summary statistics of the original data for each label
original_features = ["Avg_Credit_Limit","Total_Credit_Cards","Total_visits_bank","Total_visits_online","Total_calls_made"]
mean = data.groupby('GmmLabels').mean()
median = data.groupby('GmmLabels').median()
df_gmm = pd.concat([mean, median], axis=0)
df_gmm.index = ['group_0 Mean', 'group_1 Mean', 'group_2 Mean', 'group_0 Median', 'group_1 Median', 'group_2 Median']
df_gmm[original_features].T
| | group_0 Mean | group_1 Mean | group_2 Mean | group_0 Median | group_1 Median | group_2 Median |
|---|---|---|---|---|---|---|
| Avg_Credit_Limit | 33893.048128 | 12239.819005 | 140102.040816 | 31500.0 | 12000.0 | 145000.0 |
| Total_Credit_Cards | 5.508021 | 2.411765 | 8.775510 | 6.0 | 2.0 | 9.0 |
| Total_visits_bank | 3.489305 | 0.945701 | 0.591837 | 3.0 | 1.0 | 1.0 |
| Total_visits_online | 0.975936 | 3.561086 | 10.979592 | 1.0 | 4.0 | 11.0 |
| Total_calls_made | 1.997326 | 6.891403 | 1.102041 | 2.0 | 7.0 | 1.0 |
# plotting boxplots with the new GMM based labels
features_with_labels = ["Avg_Credit_Limit", "Total_Credit_Cards", "Total_visits_bank", "Total_visits_online", "Total_calls_made", "GmmLabels"]
data_scaled_copy[features_with_labels].boxplot(by='GmmLabels', layout=(1, 5), figsize=(20, 7))
plt.show()
Cluster Profiles:
- Cluster 0 - Medium Credit Customers: This cluster represents customers who spend comparatively more than low credit customers and visit the bank more often, and therefore make fewer calls and online visits.
- Cluster 1 - Low Credit Customers: This cluster represents customers with low spending who prefer to interact with the bank via calls, and therefore make fewer bank and online visits.
- Cluster 2 - High Credit Customers: This cluster represents customers with high spending who prefer to contact the bank through the online channel, and therefore make fewer calls and bank visits.
Comparing Clusters:
- The cluster profiles are the same as the K-Means clusters; only the cluster IDs are permuted (GMM cluster 0 matches K-Means cluster 1 and vice versa), as the cross-tabulation below confirms.
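The claim can be verified with a cross-tabulation of the two label sets; if the profiles really do coincide, each K-Means cluster should map almost entirely onto a single GMM component:

# Each row should concentrate in one column if the partitions match up to relabeling
print(pd.crosstab(data['Labels'], data['GmmLabels'], rownames=['K-Means'], colnames=['GMM']))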
# Convert notebook to html
!jupyter nbconvert --to html "/content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week Three - Making Sense of Unstructured Data/Mentored Learning Session/MLS_Case_Study_Unsupervised_Learning.ipynb"