Socio-economic Factors for Geographic Clustering¶
Context¶
The study of socio-economic factors is foundational to understanding and shaping the future of societies, and hence of great interest to government and non-government institutions alike. While GDP is one of the most widely used measures in economic discourse, it is not the only measure of the growth and state of an economy. This case study takes a deep dive into one such dataset, which contains various socio-economic attributes for countries around the world.
Objective¶
To identify whether there exist clusters of countries that are more similar to each other than to the rest, in terms of certain socio-economic factors.
Data Dictionary¶
The data has the following attributes:
- country: Name of the country
- child_mort: Death of children under 5 years of age per 1000 live births
- exports: Exports, given as a % of GDP per capita
- health: Total spending on health, given as a % of GDP
- imports: Value of imports, given as a % of GDP per capita
- income: Net income per person
- inflation: Inflation rate in %
- life_expec: Average life expectancy in years
- total_fer: Total fertility rate, i.e., the average number of children per woman
- gdpp: GDP per capita
We will not use GDP per capita (gdpp) for clustering. Instead, we will examine how the other factors vary with GDP across the groups we obtain.
Importing the libraries and overview of the dataset¶
Note: Please make sure you have installed the scikit-learn-extra library before running the cells below. If you have not installed it, run the following code to install the library:
!pip install scikit-learn-extra
!pip install scikit-learn-extra
Collecting scikit-learn-extra Downloading scikit_learn_extra-0.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB) Requirement already satisfied: numpy>=1.13.3 in /usr/local/lib/python3.10/dist-packages (from scikit-learn-extra) (1.26.4) Requirement already satisfied: scipy>=0.19.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn-extra) (1.13.1) Requirement already satisfied: scikit-learn>=0.23.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn-extra) (1.3.2) Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.23.0->scikit-learn-extra) (1.4.2) Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.23.0->scikit-learn-extra) (3.5.0) Downloading scikit_learn_extra-0.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.0/2.0 MB 12.5 MB/s eta 0:00:00 Installing collected packages: scikit-learn-extra Successfully installed scikit-learn-extra-0.3.0
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# To scale the data using z-score
from sklearn.preprocessing import StandardScaler
# Importing clustering algorithms
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn_extra.cluster import KMedoids
# Silhouette score
from sklearn.metrics import silhouette_score
import warnings
warnings.filterwarnings("ignore")
# Mount Google Drive in Colab
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
Loading the data¶
data= pd.read_csv("/content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week Three - Making Sense of Unstructured Data/Socio-economic Factors for Geographic Clustering/Country-data.csv")
data.head()
country | child_mort | exports | health | imports | income | inflation | life_expec | total_fer | gdpp | |
---|---|---|---|---|---|---|---|---|---|---|
0 | Afghanistan | 90.2 | 10.0 | 7.58 | 44.9 | 1610 | 9.44 | 56.2 | 5.82 | 553 |
1 | Albania | 16.6 | 28.0 | 6.55 | 48.6 | 9930 | 4.49 | 76.3 | 1.65 | 4090 |
2 | Algeria | 27.3 | 38.4 | 4.17 | 31.4 | 12900 | 16.10 | 76.5 | 2.89 | 4460 |
3 | Angola | 119.0 | 62.3 | 2.85 | 42.9 | 5900 | 22.40 | 60.1 | 6.16 | 3530 |
4 | Antigua and Barbuda | 10.3 | 45.5 | 6.03 | 58.9 | 19100 | 1.44 | 76.8 | 2.13 | 12200 |
Checking the info of the data¶
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 167 entries, 0 to 166 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 country 167 non-null object 1 child_mort 167 non-null float64 2 exports 167 non-null float64 3 health 167 non-null float64 4 imports 167 non-null float64 5 income 167 non-null int64 6 inflation 167 non-null float64 7 life_expec 167 non-null float64 8 total_fer 167 non-null float64 9 gdpp 167 non-null int64 dtypes: float64(7), int64(2), object(1) memory usage: 13.2+ KB
Observations:
- There are 167 observations and 10 columns in the data.
- All columns have 167 non-null values, i.e., there are no missing values.
- All the columns except the country name are numerical.
- Everything looks good; let's move ahead and check for duplicates.
Check duplicate entries¶
data[data.duplicated()]
country | child_mort | exports | health | imports | income | inflation | life_expec | total_fer | gdpp |
---|
- There are no duplicate rows in the data. That's good.
Exploratory Data Analysis¶
Summary Statistics¶
data.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
child_mort | 167.0 | 38.270060 | 40.328931 | 2.6000 | 8.250 | 19.30 | 62.10 | 208.00 |
exports | 167.0 | 41.108976 | 27.412010 | 0.1090 | 23.800 | 35.00 | 51.35 | 200.00 |
health | 167.0 | 6.815689 | 2.746837 | 1.8100 | 4.920 | 6.32 | 8.60 | 17.90 |
imports | 167.0 | 46.890215 | 24.209589 | 0.0659 | 30.200 | 43.30 | 58.75 | 174.00 |
income | 167.0 | 17144.688623 | 19278.067698 | 609.0000 | 3355.000 | 9960.00 | 22800.00 | 125000.00 |
inflation | 167.0 | 7.781832 | 10.570704 | -4.2100 | 1.810 | 5.39 | 10.75 | 104.00 |
life_expec | 167.0 | 70.555689 | 8.893172 | 32.1000 | 65.300 | 73.10 | 76.80 | 82.80 |
total_fer | 167.0 | 2.947964 | 1.513848 | 1.1500 | 1.795 | 2.41 | 3.88 | 7.49 |
gdpp | 167.0 | 12964.155689 | 18328.704809 | 231.0000 | 1330.000 | 4660.00 | 14050.00 | 105000.00 |
Observations:
- The child mortality rate has a wide range, from 2.6 to 208 deaths per 1000 live births, with an average of approximately 38.
- Similarly, exports and imports have a wide range of values. The maximum values for exports and imports are 200% and 174% of GDP, respectively. This can happen when a country's trade volume exceeds its domestic production, as in small trade hubs.
- The total spending on health is much lower than exports and imports for the majority of countries. The average spending on health is approximately 6.8% of GDP.
- The average life expectancy is approximately 70 years, but the minimum value is just 32 years.
- Other variables like gdpp, inflation, and income also show high variability, which is to be expected as they can differ widely between countries.
- Overall, % expenditure on health and average life expectancy have lower standard deviations, reflecting less variability across countries. All the other variables have a very high spread, and these high-spread variables are the ones most likely to help us identify clusters, if they exist (the quick sketch below quantifies this).
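Since these variables are on very different scales, comparing raw standard deviations can be misleading. A minimal sketch (not part of the original analysis) that quantifies spread on a scale-free footing using the coefficient of variation:

# Coefficient of variation (std / mean) for each numeric column, a scale-free measure of spread
numeric = data.select_dtypes(include = "number")
cv = (numeric.std() / numeric.mean()).sort_values(ascending = False)
print(cv.round(2))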
Let's check the distribution and outliers for each column in the data¶
for col in data.columns[1:]:
print(col)
print('Skew :', round(data[col].skew(), 2))
plt.figure(figsize = (15, 4))
plt.subplot(1, 2, 1)
data[col].hist(bins = 10, grid = False)
plt.ylabel('count')
plt.subplot(1, 2, 2)
sns.boxplot(x = data[col])
plt.show()
child_mort Skew : 1.45
exports Skew : 2.45
health Skew : 0.71
imports Skew : 1.91
income Skew : 2.23
inflation Skew : 5.15
life_expec Skew : -0.97
total_fer Skew : 0.97
gdpp Skew : 2.22
Observations:
- As observed earlier, most of the variables have skewed distributions.
- The distribution for the % expenditure on health is relatively less skewed with fewer outliers.
- Life expectancy is the only variable skewed to the left, meaning most countries have already achieved a relatively high life expectancy.
- The distributions of all the other variables are highly skewed to the right, with outliers at the right end.
Let's check the correlation among the variables¶
data.select_dtypes(include = "number").columns.to_list()
['child_mort', 'exports', 'health', 'imports', 'income', 'inflation', 'life_expec', 'total_fer', 'gdpp']
df = data.drop(columns = ["country"])
df.corr()
child_mort | exports | health | imports | income | inflation | life_expec | total_fer | gdpp | |
---|---|---|---|---|---|---|---|---|---|
child_mort | 1.000000 | -0.318093 | -0.200402 | -0.127211 | -0.524315 | 0.288276 | -0.886676 | 0.848478 | -0.483032 |
exports | -0.318093 | 1.000000 | -0.114408 | 0.737381 | 0.516784 | -0.107294 | 0.316313 | -0.320011 | 0.418725 |
health | -0.200402 | -0.114408 | 1.000000 | 0.095717 | 0.129579 | -0.255376 | 0.210692 | -0.196674 | 0.345966 |
imports | -0.127211 | 0.737381 | 0.095717 | 1.000000 | 0.122406 | -0.246994 | 0.054391 | -0.159048 | 0.115498 |
income | -0.524315 | 0.516784 | 0.129579 | 0.122406 | 1.000000 | -0.147756 | 0.611962 | -0.501840 | 0.895571 |
inflation | 0.288276 | -0.107294 | -0.255376 | -0.246994 | -0.147756 | 1.000000 | -0.239705 | 0.316921 | -0.221631 |
life_expec | -0.886676 | 0.316313 | 0.210692 | 0.054391 | 0.611962 | -0.239705 | 1.000000 | -0.760875 | 0.600089 |
total_fer | 0.848478 | -0.320011 | -0.196674 | -0.159048 | -0.501840 | 0.316921 | -0.760875 | 1.000000 | -0.454910 |
gdpp | -0.483032 | 0.418725 | 0.345966 | 0.115498 | 0.895571 | -0.221631 | 0.600089 | -0.454910 | 1.000000 |
plt.figure(figsize = (10, 10))
num_cols = data.select_dtypes(include = "number").columns.to_list()
sns.heatmap(data[num_cols].corr(), annot = True, cmap = "YlGnBu")
plt.show()
Observations:
- There is a strong positive correlation between gdpp and income, which makes sense.
- Life expectancy is positively correlated with gdpp, indicating that people live longer in richer countries.
- There is a strong negative correlation between life expectancy and child mortality, which is understandable.
- Child mortality also has a strong positive correlation with the fertility rate (see the sketch below).
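To list the strongest pairwise correlations programmatically, here is a small sketch built on the correlation matrix computed above:

# Flatten the correlation matrix, drop the diagonal, de-duplicate mirrored pairs
# (which share the same value), and sort by absolute correlation
corr_pairs = df.corr().unstack()
corr_pairs = corr_pairs[corr_pairs < 1].drop_duplicates().sort_values(key = abs, ascending = False)
print(corr_pairs.head(5))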
Scaling the data¶
- Clustering algorithms are distance-based, and all distance-based algorithms are affected by the scale of the variables. Therefore, we will scale the data before applying clustering.
- We will drop the 'country' variable because it is unique for each country and would not add value to clustering.
- We will also drop the 'gdpp' variable for now, because we want to see whether we can identify clusters without relying on GDP, and later check whether these clusters correspond to distinct average GDP values.
data_new = data.drop(columns = ["country", "gdpp"])
# Scaling the data and storing the output as a new DataFrame
scaler = StandardScaler()
data_scaled = pd.DataFrame(scaler.fit_transform(data_new), columns = data_new.columns)
data_scaled.head()
child_mort | exports | health | imports | income | inflation | life_expec | total_fer | |
---|---|---|---|---|---|---|---|---|
0 | 1.291532 | -1.138280 | 0.279088 | -0.082455 | -0.808245 | 0.157336 | -1.619092 | 1.902882 |
1 | -0.538949 | -0.479658 | -0.097016 | 0.070837 | -0.375369 | -0.312347 | 0.647866 | -0.859973 |
2 | -0.272833 | -0.099122 | -0.966073 | -0.641762 | -0.220844 | 0.789274 | 0.670423 | -0.038404 |
3 | 2.007808 | 0.775381 | -1.448071 | -0.165315 | -0.585043 | 1.387054 | -1.179234 | 2.128151 |
4 | -0.695634 | 0.160668 | -0.286894 | 0.497568 | 0.101732 | -0.601749 | 0.704258 | -0.541946 |
# Creating copy of the data to store labels from each algorithm
data_scaled_copy = data_scaled.copy(deep = True)
data_scaled_copy
child_mort | exports | health | imports | income | inflation | life_expec | total_fer | |
---|---|---|---|---|---|---|---|---|
0 | 1.291532 | -1.138280 | 0.279088 | -0.082455 | -0.808245 | 0.157336 | -1.619092 | 1.902882 |
1 | -0.538949 | -0.479658 | -0.097016 | 0.070837 | -0.375369 | -0.312347 | 0.647866 | -0.859973 |
2 | -0.272833 | -0.099122 | -0.966073 | -0.641762 | -0.220844 | 0.789274 | 0.670423 | -0.038404 |
3 | 2.007808 | 0.775381 | -1.448071 | -0.165315 | -0.585043 | 1.387054 | -1.179234 | 2.128151 |
4 | -0.695634 | 0.160668 | -0.286894 | 0.497568 | 0.101732 | -0.601749 | 0.704258 | -0.541946 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
162 | -0.225578 | 0.200917 | -0.571711 | 0.240700 | -0.738527 | -0.489784 | -0.852161 | 0.365754 |
163 | -0.526514 | -0.461363 | -0.695862 | -1.213499 | -0.033542 | 3.616865 | 0.546361 | -0.316678 |
164 | -0.372315 | 1.130305 | 0.008877 | 1.380030 | -0.658404 | 0.409732 | 0.286958 | -0.661206 |
165 | 0.448417 | -0.406478 | -0.597272 | -0.517472 | -0.658924 | 1.500916 | -0.344633 | 1.140944 |
166 | 1.114951 | -0.150348 | -0.338015 | -0.662477 | -0.721358 | 0.590015 | -2.092785 | 1.624609 |
167 rows × 8 columns
K-Means Clustering¶
# Empty dictionary to store the SSE for each value of K
sse = {}
# Iterate for a range of Ks and fit the scaled data to the algorithm.
# Use inertia attribute from the clustering object and store the inertia value for that K
for k in range(1, 10):
kmeans = KMeans(n_clusters = k, random_state = 1).fit(data_scaled)
sse[k] = kmeans.inertia_
# Print actual values
print(sse)
# Elbow plot
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()), 'bx-')
plt.xlabel("Number of cluster")
plt.ylabel("SSE")
plt.show()
{1: 1336.0000000000002, 2: 920.6199002862205, 3: 781.2485957837691, 4: 657.5273613859782, 5: 591.0334329179409, 6: 525.7903876756021, 7: 449.26462385238887, 8: 408.13353009081015, 9: 383.8885689344541}
Observations:
- We can see from the plot that the SSE decreases steadily from 2 to 8 clusters, and there doesn't seem to be a clear 'elbow' here. We may choose any number of clusters from 2 to 8.
- So, let's look at another method to get a 'second opinion'. Let's create a plot with Silhouette scores to see how it varies with K.
# Empty dictionary to store the Silhouette score for each value of K
sc = {}
# Iterate for a range of Ks and fit the scaled data to the algorithm. Store the Silhouette score for that K
for k in range(2, 10):
kmeans = KMeans(n_clusters = k, random_state = 1).fit(data_scaled)
labels = kmeans.predict(data_scaled)
sc[k] = silhouette_score(data_scaled, labels)
# Print actual values
print(sc)
# Silhouette score plot
plt.figure()
plt.plot(list(sc.keys()), list(sc.values()), 'bx-')
plt.xlabel("Number of cluster")
plt.ylabel("Silhouette Score")
plt.show()
{2: 0.3285621671534965, 3: 0.3378060896729783, 4: 0.2318166762659259, 5: 0.21966408510222055, 6: 0.2651996035799795, 7: 0.24238349227710929, 8: 0.2505167995943273, 9: 0.24575086651835348}
Observation:
- We observe from the plot that the silhouette score is the highest for K=3. Let's first understand these 3 clusters.
kmeans = KMeans(n_clusters = 3, random_state = 1)
kmeans.fit(data_scaled)
# Adding predicted labels to the original data and the scaled data
data_scaled_copy['KMeans_Labels'] = kmeans.predict(data_scaled)
data['KMeans_Labels'] = kmeans.predict(data_scaled)
# Let's take a look at which country is in which cluster
data.head()
country | child_mort | exports | health | imports | income | inflation | life_expec | total_fer | gdpp | KMeans_Labels | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | Afghanistan | 90.2 | 10.0 | 7.58 | 44.9 | 1610 | 9.44 | 56.2 | 5.82 | 553 | 2 |
1 | Albania | 16.6 | 28.0 | 6.55 | 48.6 | 9930 | 4.49 | 76.3 | 1.65 | 4090 | 0 |
2 | Algeria | 27.3 | 38.4 | 4.17 | 31.4 | 12900 | 16.10 | 76.5 | 2.89 | 4460 | 0 |
3 | Angola | 119.0 | 62.3 | 2.85 | 42.9 | 5900 | 22.40 | 60.1 | 6.16 | 3530 | 2 |
4 | Antigua and Barbuda | 10.3 | 45.5 | 6.03 | 58.9 | 19100 | 1.44 | 76.8 | 2.13 | 12200 | 0 |
data['KMeans_Labels'].value_counts()
count | |
---|---|
KMeans_Labels | |
0 | 107 |
2 | 51 |
1 | 9 |
Observation:
- This looks like a very skewed clustering, with only nine observations in one cluster and more than a hundred in another. The quick check below lists the countries in the small cluster; then let's look at the profiles of these clusters.
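Before profiling, it may help to see which countries landed in the small cluster. A quick check, assuming (per the value counts above) that cluster 1 is the smallest:

# Countries in the smallest K-Means cluster
print(data.loc[data['KMeans_Labels'] == 1, 'country'].tolist())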
# Calculating the mean and the median of the original data for each label
num_cols = num_cols + ["KMeans_Labels"]
mean = data[num_cols].groupby('KMeans_Labels').mean()
median = data[num_cols].groupby('KMeans_Labels').median()
df_kmeans = pd.concat([mean, median], axis = 0)
df_kmeans.index = ['group_0 Mean', 'group_1 Mean', 'group_2 Mean', 'group_0 Median', 'group_1 Median', 'group_2 Median']
df_kmeans.T
group_0 Mean | group_1 Mean | group_2 Mean | group_0 Median | group_1 Median | group_2 Median | |
---|---|---|---|---|---|---|
child_mort | 16.339252 | 7.766667 | 89.664706 | 14.20 | 8.60 | 88.20 |
exports | 41.208224 | 110.988889 | 28.569000 | 38.40 | 93.80 | 23.80 |
health | 7.228972 | 4.878889 | 6.290392 | 6.85 | 3.66 | 5.66 |
imports | 45.459813 | 90.033333 | 42.277763 | 44.50 | 86.50 | 40.30 |
income | 19338.785047 | 66288.888889 | 3868.882353 | 15400.00 | 72100.00 | 1930.00 |
inflation | 6.093056 | 5.261556 | 11.769706 | 3.80 | 3.83 | 8.92 |
life_expec | 75.021495 | 78.822222 | 59.727451 | 75.40 | 79.50 | 60.40 |
total_fer | 2.129159 | 1.816667 | 4.865490 | 1.98 | 1.87 | 5.02 |
gdpp | 15504.635514 | 45700.000000 | 1857.215686 | 8080.00 | 38500.00 | 967.00 |
Observations:
- Looking at the group means, Cluster 1 consists of high-income countries that also have a high gdpp.
- Cluster 2 consists of low-income countries, with a low mean gdpp as well.
- The remaining countries are in Cluster 0, which also happens to be the biggest cluster. Since there are more developing countries than highly developed ones, this intuitively makes sense.
Let us now visualize the summary statistics of these clusters below.
cols_visualise = ['child_mort', 'exports', 'health', 'imports', 'income', 'inflation', 'life_expec', 'total_fer', 'gdpp']
for col in cols_visualise:
sns.boxplot(x = 'KMeans_Labels', y = col, data = data)
plt.show()
Cluster Profiles:
- Cluster 1 has only 9 observations. As observed from the boxplots, this group consists of outlier high-income countries with the highest percentages of imports and exports relative to GDP.
- Cluster 2 seems to contain countries with less desirable values for many indicators. These countries have the highest inflation rates, the lowest GDP per capita, and the lowest exports as well as imports, all signaling a very poor economic situation. They also have the highest child mortality, the highest fertility rates, and the lowest life expectancies, all traits of underdeveloped or developing countries. These countries also seem to run a trade deficit, i.e., more imports than exports, and as a consequence may be more reliant on borrowing and lines of credit to finance their economies.
- Cluster 0 is the largest cluster, with traits of countries in the middle of the development spectrum. These countries are comparatively better off than those in Cluster 2. However, this cluster has a large range of values, indicating that it is a mix of many different types of countries. Ideally, we do not want a cluster like this: the fundamental idea behind clustering is to 'group similar things', and this cluster contains a lot of 'dissimilarity' within it.
- Overall, this clustering solution gives us good insights into potential clusters of similar countries, but it is not very useful, as it is skewed by outlier countries, resulting in one very small cluster and two very big clusters. We should try other algorithms to see if we can do better.
But before that, let's validate if these clusters relate well with the GDP of the country.
cols_visualise = ['child_mort', 'exports', 'health', 'imports', 'income', 'inflation', 'life_expec', 'total_fer', 'gdpp']
for col in cols_visualise:
sns.scatterplot(x = col, y = 'gdpp', data = data, hue = 'KMeans_Labels', palette = 'Dark2')
plt.show()
Observations:
- Countries with higher fertility rates tend to have lower net income per person and lower GDP per capita.
- Child mortality also appears to be negatively correlated with GDP per capita. The high child mortality in such countries could be due to several factors, such as high poverty, lower net income per person, and a relative lack of health facilities.
Let's try another algorithm.
K-Medoids Clustering¶
kmedo = KMedoids(n_clusters = 3, random_state = 1)
kmedo.fit(data_scaled)
data_scaled_copy['kmedoLabels'] = kmedo.predict(data_scaled)
data['kmedoLabels'] = kmedo.predict(data_scaled)
data.kmedoLabels.value_counts()
count | |
---|---|
kmedoLabels | |
1 | 75 |
2 | 58 |
0 | 34 |
data
country | child_mort | exports | health | imports | income | inflation | life_expec | total_fer | gdpp | KMeans_Labels | kmedoLabels | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Afghanistan | 90.2 | 10.0 | 7.58 | 44.9 | 1610 | 9.44 | 56.2 | 5.82 | 553 | 2 | 2 |
1 | Albania | 16.6 | 28.0 | 6.55 | 48.6 | 9930 | 4.49 | 76.3 | 1.65 | 4090 | 0 | 1 |
2 | Algeria | 27.3 | 38.4 | 4.17 | 31.4 | 12900 | 16.10 | 76.5 | 2.89 | 4460 | 0 | 1 |
3 | Angola | 119.0 | 62.3 | 2.85 | 42.9 | 5900 | 22.40 | 60.1 | 6.16 | 3530 | 2 | 2 |
4 | Antigua and Barbuda | 10.3 | 45.5 | 6.03 | 58.9 | 19100 | 1.44 | 76.8 | 2.13 | 12200 | 0 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
162 | Vanuatu | 29.2 | 46.6 | 5.25 | 52.7 | 2950 | 2.62 | 63.0 | 3.50 | 2970 | 0 | 1 |
163 | Venezuela | 17.1 | 28.5 | 4.91 | 17.6 | 16500 | 45.90 | 75.4 | 2.47 | 13500 | 0 | 2 |
164 | Vietnam | 23.3 | 72.0 | 6.84 | 80.2 | 4490 | 12.10 | 73.1 | 1.95 | 1310 | 0 | 1 |
165 | Yemen | 56.3 | 30.0 | 5.18 | 34.4 | 4480 | 23.60 | 67.5 | 4.67 | 1310 | 2 | 2 |
166 | Zambia | 83.1 | 37.0 | 5.89 | 30.9 | 3280 | 14.00 | 52.0 | 5.40 | 1460 | 2 | 2 |
167 rows × 12 columns
# Calculating the mean and the median of the original data for each label
original_features = ['child_mort', 'exports', 'health', 'imports', 'income', 'inflation', 'life_expec', 'total_fer', 'gdpp' ]
num_cols = original_features + ['kmedoLabels']
mean = data[num_cols].groupby('kmedoLabels').mean()
median = data[num_cols].groupby('kmedoLabels').median()
df_kmedoids = pd.concat([mean, median], axis = 0)
df_kmedoids.index = ['group_0 Mean', 'group_1 Mean', 'group_2 Mean', 'group_0 Median', 'group_1 Median', 'group_2 Median']
df_kmedoids[original_features].T
group_0 Mean | group_1 Mean | group_2 Mean | group_0 Median | group_1 Median | group_2 Median | |
---|---|---|---|---|---|---|
child_mort | 7.085294 | 17.853333 | 82.951724 | 4.700 | 15.70 | 79.600 |
exports | 36.938235 | 52.989333 | 28.191362 | 31.600 | 48.30 | 23.800 |
health | 9.413235 | 6.202133 | 6.086379 | 9.585 | 6.00 | 5.275 |
imports | 33.452941 | 57.694667 | 40.795964 | 30.750 | 55.10 | 36.800 |
income | 38094.705882 | 17592.800000 | 4284.189655 | 36550.000 | 13500.00 | 2145.000 |
inflation | 4.020676 | 5.511573 | 12.922328 | 1.825 | 3.82 | 9.225 |
life_expec | 79.614706 | 73.870667 | 60.958621 | 80.200 | 74.10 | 60.800 |
total_fer | 1.842353 | 2.179600 | 4.589655 | 1.875 | 2.13 | 4.875 |
gdpp | 35925.588235 | 10989.813333 | 2057.034483 | 38700.000 | 6230.00 | 994.000 |
Observations:
- It looks like Cluster 0 consists of high-income countries, Cluster 2 consists of poorer countries with low incomes, and the remaining countries are in Cluster 1, which happens to be the biggest cluster as well.
for col in cols_visualise:
sns.boxplot(x = 'kmedoLabels', y = col, data = data)
plt.show()
Cluster Profiles:
- Cluster 2 countries have the highest average child mortality rate, trade deficit, and inflation rate, and the lowest average GDP per capita and net income per person. However, the large range of values for several variables implies that Cluster 2 contains a variety of countries, from underdeveloped to developing ones.
- Cluster 1 shows traits of developing countries, with a comparatively higher GDP per capita and net income per person and a significantly lower child mortality rate than Cluster 2. The cluster contains some outliers, but it mostly consists of countries with low to medium GDP and a comparatively higher percentage of imports and exports relative to GDP.
- Cluster 0 shows traits of highly developed countries, with a low child mortality rate and a higher net income per person, life expectancy, and GDP per capita. These countries have the highest average expenditure on health as a percentage of GDP.
Observations:
- The number of observations in each cluster from K-Medoids is more evenly distributed than in K-Means clustering.
- This is because K-Medoids clusters are less affected by outliers in the data. The outlier countries from K-Means (in terms of imports and exports) are now included in cluster 1 and do not form a separate cluster, as the quick check below verifies.
- Unlike in K-Means, the cluster of developed countries is much bigger, but it still retains the overall characteristics of developed countries, as reflected in the higher values for income per person, life expectancy, and especially health expenditure as a percentage of GDP.
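As a quick check on the point about outliers, we can compare the K-Means and K-Medoids labels for the three most extreme exporters; a small sketch:

# Compare labels for the countries with the highest exports (% of GDP)
top_exporters = data.nlargest(3, 'exports')
print(top_exporters[['country', 'exports', 'imports', 'KMeans_Labels', 'kmedoLabels']])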
Now, let's see what we get with Gaussian Mixture Model.
Gaussian Mixture Model¶
gmm = GaussianMixture(n_components = 3, random_state = 1)
gmm.fit(data_scaled)
data_scaled_copy['GmmLabels'] = gmm.predict(data_scaled)
data['GmmLabels'] = gmm.predict(data_scaled)
data.GmmLabels.value_counts()
count | |
---|---|
GmmLabels | |
0 | 62 |
2 | 55 |
1 | 50 |
# Calculating the mean and the median of the original data for each label
original_features = ['child_mort', 'exports', 'health', 'imports', 'income', 'inflation', 'life_expec', 'total_fer', 'gdpp']
num_cols = original_features + ['GmmLabels']
mean = data[num_cols].groupby('GmmLabels').mean()
median = data[num_cols].groupby('GmmLabels').median()
df_gmm = pd.concat([mean, median], axis = 0)
df_gmm.index = ['group_0 Mean', 'group_1 Mean', 'group_2 Mean', 'group_0 Median', 'group_1 Median', 'group_2 Median']
df_gmm[original_features].T
group_0 Mean | group_1 Mean | group_2 Mean | group_0 Median | group_1 Median | group_2 Median | |
---|---|---|---|---|---|---|
child_mort | 26.975806 | 87.0720 | 6.636364 | 21.300 | 88.750 | 5.50 |
exports | 37.541597 | 28.3524 | 56.727273 | 34.900 | 23.800 | 50.20 |
health | 5.872903 | 6.6262 | 8.050727 | 5.645 | 5.790 | 8.35 |
imports | 45.718805 | 43.2400 | 51.529091 | 48.250 | 41.500 | 42.10 |
income | 9284.354839 | 4418.2600 | 37574.909091 | 8890.000 | 2145.000 | 33900.00 |
inflation | 7.547419 | 12.9951 | 3.306745 | 6.875 | 9.060 | 1.66 |
life_expec | 71.633871 | 60.4280 | 78.547273 | 71.850 | 60.300 | 79.50 |
total_fer | 2.478226 | 4.7968 | 1.796727 | 2.395 | 5.035 | 1.84 |
gdpp | 4547.403226 | 2254.5000 | 32188.181818 | 3885.000 | 1020.000 | 30600.00 |
Observation:
- Looking at the group means, Cluster 2 consists of high-income countries, Cluster 1 consists of lower-income countries, and the rest of the countries are in Cluster 0.
cols_visualise = ['child_mort', 'exports', 'health', 'imports', 'income', 'inflation', 'life_expec', 'total_fer', 'gdpp']
for col in cols_visualise:
sns.boxplot(x = 'GmmLabels', y = col, data = data)
plt.show()
Cluster Profiles:
This clustering solution looks very similar to the one created using K-Medoids, with one cluster of 'high income' countries, one of 'low income' countries, and one of 'all the others'. On closer inspection, however, we can identify some important differences.
Cluster 2 seems to consist of 'developed' countries. It is larger than the corresponding K-Medoids cluster (55 vs. 34 countries) and now also absorbs countries with high exports and imports, while retaining the characteristics of developed economies: high median income, life expectancy, and GDP per capita, together with low inflation, child mortality, and fertility.
Cluster 1 seems to consist of 'underdeveloped' countries, with the lowest median income and GDP per capita and the highest child mortality and fertility rates, much like the corresponding K-Medoids cluster.
Cluster 0 remains a mix of 'developing' countries in the middle of the spectrum, though it is smaller than the corresponding K-Medoids cluster (62 vs. 75 countries).
Overall, this is a more evenly distributed clustering solution than K-Medoids, with 62, 50, and 55 countries per cluster. The sketch below makes the comparison concrete.
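To compare the two solutions on a single indicator, we can put the median gdpp of each cluster side by side; a minimal sketch using the label columns already added to the data:

# Median GDP per capita per cluster, for K-Medoids and GMM
for label_col in ['kmedoLabels', 'GmmLabels']:
    print(data.groupby(label_col)['gdpp'].median(), '\n')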
Hierarchical Clustering¶
- Let's try to create clusters using Agglomerative Hierarchical Clustering.
- Here, we decide the number of clusters using a dendrogram, a tree-like diagram that records the sequences of merges or splits.
from scipy.cluster.hierarchy import dendrogram, linkage
# The List of all linkage methods to check
methods = ['single',
'average',
'complete']
# Create a subplot image
fig, axs = plt.subplots(len(methods), 1, figsize = (20, 15))
# Enumerate through the list of all methods above, get linkage and plot dendrogram
for i, method in enumerate(methods):
Z = linkage(data_scaled, metric = 'euclidean', method = method)
dendrogram(Z, ax = axs[i]);
axs[i].set_title(f'Dendrogram ({method.capitalize()} Linkage)')
axs[i].set_ylabel('Distance')
Observations:
- We can see that complete linkage gives better-separated clusters. Clusters are considered better separated when the vertical distance connecting them is larger.
- We can now set a threshold distance and draw a horizontal line. The number of clusters is the number of vertical lines intersected by the line drawn at the threshold.
- The branches of the dendrogram should be cut at a level where there is a lot of 'space' to cut them, that is, where the jump between two consecutive merge levels is large.
- Here, we can choose to cut at a distance of ~9, since the space between the two merge levels is largest there. The sketch below verifies this cut.
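Equivalently, we can cut the complete-linkage tree at that threshold with SciPy's fcluster and count the resulting clusters; a small sketch, with t = 9 as the threshold chosen above:

from scipy.cluster.hierarchy import fcluster
# Flat clusters from the complete-linkage tree, cut at cophenetic distance 9
Z_complete = linkage(data_scaled, metric = 'euclidean', method = 'complete')
cut_labels = fcluster(Z_complete, t = 9, criterion = 'distance')
print(pd.Series(cut_labels).value_counts())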
plt.figure(figsize = (20, 7))
plt.title("Dendrograms")
dend = dendrogram(linkage(data_scaled, method = 'complete'))
plt.axhline(y = 9, color = 'r', linestyle = '--')
<matplotlib.lines.Line2D at 0x7f9f9d417e20>
Observations:
- We can see that if we draw a horizontal line at a threshold distance of ~9, it cuts 4 vertical lines, i.e., we get 4 different clusters.
- Let's fit the algorithms using 4 as the number of clusters.
# Clustering with 4 clusters
hierarchical = AgglomerativeClustering(n_clusters = 4, affinity = 'euclidean', linkage = 'complete')
hierarchical.fit(data_scaled)
AgglomerativeClustering(affinity='euclidean', linkage='complete', n_clusters=4)
data_scaled_copy['HCLabels'] = hierarchical.labels_
data['HCLabels'] = hierarchical.labels_
data.HCLabels.value_counts()
count | |
---|---|
HCLabels | |
0 | 103 |
1 | 60 |
2 | 3 |
3 | 1 |
Observations:
- The count of observations in the resulting 4 clusters is unevenly distributed.
- We have two clusters with only 3 countries and 1 country, respectively. Let's check the countries in these clusters.
# Checking 3 countries in cluster 2
data[data.HCLabels == 2]
country | child_mort | exports | health | imports | income | inflation | life_expec | total_fer | gdpp | KMeans_Labels | kmedoLabels | GmmLabels | HCLabels | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
91 | Luxembourg | 2.8 | 175.0 | 7.77 | 142.0 | 91700 | 3.620 | 81.3 | 1.63 | 105000 | 1 | 1 | 2 | 2 |
98 | Malta | 6.8 | 153.0 | 8.65 | 154.0 | 28300 | 3.830 | 80.3 | 1.36 | 21100 | 1 | 1 | 2 | 2 |
133 | Singapore | 2.8 | 200.0 | 3.96 | 174.0 | 72100 | -0.046 | 82.7 | 1.15 | 46600 | 1 | 1 | 2 | 2 |
Observations:
- Similar to K-Means, we got a separate cluster for 3 small countries with the highest values for imports and exports - Luxembourg, Malta, Singapore.
# Checking 1 country in cluster 3
data[data.HCLabels == 3]
country | child_mort | exports | health | imports | income | inflation | life_expec | total_fer | gdpp | KMeans_Labels | kmedoLabels | GmmLabels | HCLabels | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
113 | Nigeria | 130.0 | 25.3 | 5.07 | 17.4 | 5150 | 104.0 | 60.5 | 5.84 | 2330 | 2 | 2 | 1 | 3 |
Observations:
- Cluster 3 consists of just one country - Nigeria.
- Nigeria has an inflation rate of 104%, the highest in this dataset, as the quick check below confirms. This might have made its distance from the other observations significantly higher, not allowing it to merge with any of the other clusters.
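A quick check of the inflation figure:

# Country with the maximum inflation rate
print(data.loc[data['inflation'].idxmax(), ['country', 'inflation']])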
# Calculating the mean and the median of the original data for each label
original_features = ['child_mort', 'exports', 'health', 'imports', 'income', 'inflation', 'life_expec', 'total_fer', 'gdpp']
num_cols = original_features + ['HCLabels']
mean = data[num_cols].groupby('HCLabels').mean()
median = data[num_cols].groupby('HCLabels').median()
df_hierarchical = pd.concat([mean, median], axis = 0)
df_hierarchical.index = ['group_0 Mean', 'group_1 Mean', 'group_2 Mean', 'group_3 Mean', 'group_0 Median', 'group_1 Median', 'group_2 Median', 'group_3 Median']
df_hierarchical[original_features].T
group_0 Mean | group_1 Mean | group_2 Mean | group_3 Mean | group_0 Median | group_1 Median | group_2 Median | group_3 Median | |
---|---|---|---|---|---|---|---|---|
child_mort | 16.678641 | 75.513333 | 4.133333 | 130.00 | 10.80 | 73.300 | 2.80 | 130.00 |
exports | 42.532806 | 32.183667 | 176.000000 | 25.30 | 38.70 | 28.900 | 175.00 | 25.30 |
health | 7.013883 | 6.505667 | 6.793333 | 5.07 | 6.91 | 5.685 | 7.77 | 5.07 |
imports | 42.438504 | 49.535000 | 156.666667 | 17.40 | 38.40 | 47.650 | 154.00 | 17.40 |
income | 23425.533981 | 4218.050000 | 64033.333333 | 5150.00 | 17800.00 | 2500.000 | 72100.00 | 5150.00 |
inflation | 6.723262 | 8.261100 | 2.468000 | 104.00 | 4.49 | 5.860 | 3.62 | 104.00 |
life_expec | 75.471845 | 61.740000 | 81.433333 | 60.50 | 76.10 | 61.300 | 81.30 | 60.50 |
total_fer | 2.074660 | 4.477333 | 1.380000 | 5.84 | 1.93 | 4.710 | 1.36 | 5.84 |
gdpp | 18053.689320 | 2174.233333 | 57566.666667 | 2330.00 | 10700.00 | 1185.000 | 46600.00 | 2330.00 |
Observations:
- It looks like Cluster 2 has only 3 countries, with high income and high gdpp; Cluster 1 has low-income and low-gdpp countries; and the rest of the countries are in Cluster 0, except for the one country (Nigeria) in Cluster 3.
Let's try to visualize the boxplots of different attributes for each cluster to see if we can spot some more granular patterns.
cols_visualise = ['child_mort', 'exports', 'health', 'imports', 'income', 'inflation', 'life_expec', 'total_fer', 'gdpp']
for col in cols_visualise:
sns.boxplot(x = 'HCLabels', y = col, data = data)
plt.show()
Observations:
- The results from hierarchical clustering are difficult to distinguish and comment on, especially because of the one cluster that contains 103 countries.
Let's try the DBSCAN algorithm.
DBSCAN¶
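The key hyperparameter in DBSCAN is eps, the neighborhood radius. A common heuristic for choosing it is the k-distance plot: sort each point's distance to its k-th nearest neighbor and look for a knee. Below is a minimal sketch (not part of the original analysis), with k = 5 to match DBSCAN's default min_samples:

from sklearn.neighbors import NearestNeighbors
# Distance from each point to its 5th-nearest neighbor
# (the first neighbor is the point itself, at distance 0)
nn = NearestNeighbors(n_neighbors = 5).fit(data_scaled)
distances, _ = nn.kneighbors(data_scaled)
plt.plot(np.sort(distances[:, -1]))
plt.xlabel("Points sorted by 5th-nearest-neighbor distance")
plt.ylabel("Distance")
plt.show()

The cell below proceeds with eps = 1 (and the default min_samples = 5).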
dbs = DBSCAN(eps = 1)
data_scaled_copy['DBSLabels'] = dbs.fit_predict(data_scaled)
data['DBSLabels'] = dbs.fit_predict(data_scaled)
data['DBSLabels'].value_counts()
count | |
---|---|
DBSLabels | |
-1 | 90 |
0 | 55 |
1 | 17 |
2 | 5 |
# Calculating the mean and the median of the original data for each label
original_features = ['child_mort', 'exports', 'health', 'imports', 'income', 'inflation', 'life_expec', 'total_fer', 'gdpp']
num_cols = original_features + ['DBSLabels']
mean = data[num_cols].groupby('DBSLabels').mean()
median = data[num_cols].groupby('DBSLabels').median()
df_dbscan = pd.concat([mean, median], axis = 0)
df_dbscan.index = ['group_-1 Mean', 'group_0 Mean', 'group_1 Mean', 'group_2 Mean', 'group_-1 Median', 'group_0 Median', 'group_1 Median', 'group_2 Median']
df_dbscan[original_features].T
group_-1 Mean | group_0 Mean | group_1 Mean | group_2 Mean | group_-1 Median | group_0 Median | group_1 Median | group_2 Median | |
---|---|---|---|---|---|---|---|---|
child_mort | 54.907778 | 17.130909 | 4.147059 | 87.340 | 50.900 | 15.70 | 4.100 | 90.20 |
exports | 42.922211 | 41.525455 | 35.194118 | 24.000 | 36.100 | 37.00 | 29.900 | 22.80 |
health | 6.254556 | 6.709455 | 10.294706 | 6.256 | 5.275 | 6.55 | 10.100 | 6.01 |
imports | 48.265177 | 49.510909 | 33.982353 | 37.200 | 42.400 | 51.30 | 31.000 | 34.90 |
income | 16254.611111 | 13433.090909 | 38382.352941 | 1785.600 | 5170.000 | 11200.00 | 38800.000 | 1610.00 |
inflation | 11.155856 | 4.015527 | 1.309118 | 10.486 | 8.605 | 3.53 | 0.873 | 9.44 |
life_expec | 67.202222 | 74.203636 | 81.076471 | 55.020 | 67.700 | 74.50 | 81.300 | 54.50 |
total_fer | 3.578222 | 2.067455 | 1.708235 | 5.504 | 3.250 | 1.92 | 1.630 | 5.43 |
gdpp | 10940.611111 | 8043.018182 | 43200.000000 | 718.600 | 2775.000 | 6250.00 | 41900.000 | 553.00 |
Observations:
- DBSCAN returns 4 clusters. The countries in 3 of these clusters have similar profiles to the results seen in the other clustering algorithms - high income, low income and moderately developed countries.
- The country profile of the last cluster (cluster -1) seems uncertain. This cluster has a large difference between the mean values and the median values of various attributes implying the presence of outliers in the cluster.
Let's visualize the box plots to comment further on these clusters
for col in cols_visualise:
sns.boxplot(x = 'DBSLabels', y = col, data = data)
plt.show()
Observations
- We can see that while the three clusters (0, 1, and 2) are much more compact across all attributes, the -1 group consists of points that are extreme outliers on at least one attribute.
- Therefore, it does not add value to our cluster analysis as a cluster, but we can explore it further to understand which type of countries it contains, as sketched below.
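For instance, a quick sketch that lists some of the countries DBSCAN marked as noise:

# Countries labeled as noise (-1) by DBSCAN
noise_countries = data.loc[data['DBSLabels'] == -1, 'country']
print(len(noise_countries), "countries labeled as noise")
print(noise_countries.head(10).tolist())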
Conclusion¶
The choice of clustering algorithm here will depend on the context and use case. But purely based on the foundations of 'what good clustering looks like', one can propose K-Medoids, as it produces reasonably sized clusters that are distinct from each other and robust to the outlier countries.
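One way to back this choice with a number is to compare the silhouette score of each labeling on the scaled data. A minimal sketch; DBSCAN's noise points (label -1) are excluded, since they do not form a real cluster:

# Silhouette score for each clustering solution on the scaled features
for col in ['KMeans_Labels', 'kmedoLabels', 'GmmLabels', 'HCLabels']:
    print(col, round(silhouette_score(data_scaled, data[col]), 3))
# For DBSCAN, score only the points assigned to proper clusters
mask = data['DBSLabels'] != -1
print('DBSLabels', round(silhouette_score(data_scaled[mask], data.loc[mask, 'DBSLabels']), 3))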
# Convert this notebook to HTML (replace the path with your notebook's location)
!jupyter nbconvert --to html /content/your_notebook_name.ipynb