!pip install scikit-learn-extra

Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: scikit-learn-extra in c:\users\aditya chaudhary\appdata\roaming\python\python311\site-packages (0.3.0)
Requirement already satisfied: numpy>=1.13.3 in c:\programdata\anaconda3\lib\site-packages (from scikit-learn-extra) (1.26.4)
Requirement already satisfied: scipy>=0.19.1 in c:\programdata\anaconda3\lib\site-packages (from scikit-learn-extra) (1.11.4)
Requirement already satisfied: scikit-learn>=0.23.0 in c:\programdata\anaconda3\lib\site-packages (from scikit-learn-extra) (1.2.2)
Requirement already satisfied: joblib>=1.1.1 in c:\programdata\anaconda3\lib\site-packages (from scikit-learn>=0.23.0->scikit-learn-extra) (1.2.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\programdata\anaconda3\lib\site-packages (from scikit-learn>=0.23.0->scikit-learn-extra) (2.2.0)

# Libraries to help with reading and manipulating data
import numpy as np
import pandas as pd

# Libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme(style='darkgrid')

# Removes the limit for the number of displayed columns
pd.set_option("display.max_columns", None)
# Sets the limit for the number of displayed rows
pd.set_option("display.max_rows", 200)

# NOTE: scikit-learn is imported as `sklearn` and scikit-learn-extra as
# `sklearn_extra`; a hyphenated name is not a valid Python module path.

# To scale the data using z-score
from sklearn.preprocessing import StandardScaler

# To compute distances
from scipy.spatial.distance import cdist, pdist

# To perform K-Means clustering and compute silhouette scores
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# To import K-Medoids
from sklearn_extra.cluster import KMedoids

# To import DBSCAN and Gaussian Mixture
from sklearn.cluster import DBSCAN
from sklearn.mixture import GaussianMixture

# To perform hierarchical clustering, compute cophenetic correlation, and create dendrograms
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage, cophenet

import warnings
warnings.filterwarnings("ignore")

# Connect to Google Drive: mounts the drive at /content/drive.
# `google.colab` is imported here, so this cell presumably only runs inside
# Google Colab — TODO confirm.
# NOTE(review): the dataset is subsequently read from the relative path
# "fpl_data.csv", not from the mounted drive — confirm the mount is needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive

data = pd.read_csv("fpl_data.csv")

data.shape

(476, 13)

# Viewing 10 random rows of the data
data.sample(10, random_state = 1)

# Copying the data to another variable to avoid any changes to original data
df = data.copy()

# Checking datatypes and number of non-null values for each column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 476 entries, 0 to 475
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Player_Name     476 non-null    object 
 1   Club            476 non-null    object 
 2   Position        476 non-null    object 
 3   Goals_Scored    476 non-null    int64  
 4   Assists         476 non-null    int64  
 5   Total_Points    476 non-null    int64  
 6   Minutes         476 non-null    int64  
 7   Goals_Conceded  476 non-null    int64  
 8   Creativity      476 non-null    float64
 9   Influence       476 non-null    float64
 10  Threat          476 non-null    int64  
 11  Bonus           476 non-null    int64  
 12  Clean_Sheets    476 non-null    int64  
dtypes: float64(2), int64(8), object(3)
memory usage: 48.5+ KB

# Checking for duplicate values
df.duplicated().sum()

0

# Checking for missing values
df.isnull().sum()

Player_Name       0
Club              0
Position          0
Goals_Scored      0
Assists           0
Total_Points      0
Minutes           0
Goals_Conceded    0
Creativity        0
Influence         0
Threat            0
Bonus             0
Clean_Sheets      0
dtype: int64

df.describe(include = 'all').T

# Function to plot a boxplot and a histogram along the same scale


def histogram_boxplot(data, feature, figsize = (12, 7), kde = False, bins = None):

    """
    Boxplot and histogram of one column, stacked and sharing the x-axis

    data: dataframe
    feature: dataframe column
    figsize: size of figure (default (12, 7))
    kde: whether to show the density curve (default False)
    bins: number of bins (or bin edges) for the histogram
          (default None, i.e., seaborn's own default binning is used)
    """

    f2, (ax_box2, ax_hist2) = plt.subplots(
        nrows = 2,      # Number of rows of the subplot grid = 2
        sharex = True,  # X-axis will be shared among all subplots
        gridspec_kw = {"height_ratios": (0.25, 0.75)},
        figsize = figsize,
    )  # Creating the 2 subplots

    # Boxplot on top; showmeans = True adds a marker at the mean of the column
    sns.boxplot(
        data = data, x = feature, ax = ax_box2, showmeans = True, color = "violet"
    )

    # Only forward `bins` when the caller supplied it, so seaborn's default
    # binning is kept otherwise. Testing against None (rather than truthiness)
    # also lets array-like bin edges through without an ambiguous-truth error.
    # `palette` is not passed: no `hue` is given here, so it had no effect.
    hist_kwargs = {} if bins is None else {"bins": bins}
    sns.histplot(data = data, x = feature, kde = kde, ax = ax_hist2, **hist_kwargs)

    # Dashed green line at the mean, solid black line at the median
    ax_hist2.axvline(data[feature].mean(), color = "green", linestyle = "--")
    ax_hist2.axvline(data[feature].median(), color = "black", linestyle = "-")

histogram_boxplot(df, 'Goals_Scored')

histogram_boxplot(df, 'Assists')

histogram_boxplot(df, 'Goals_Conceded')

histogram_boxplot(df, 'Clean_Sheets')

histogram_boxplot(df, 'Minutes')

histogram_boxplot(df, 'Total_Points')

histogram_boxplot(df, 'Creativity')

histogram_boxplot(df, 'Influence')

histogram_boxplot(df, 'Threat')

histogram_boxplot(df, 'Bonus')

# Function to create labeled barplots


def labeled_barplot(data, feature, perc = False, n = None):
    """
    Count plot of a categorical column with a label above every bar

    data: dataframe
    feature: dataframe column
    perc: whether to display percentages instead of count (default is False)
    n: displays the top n category levels (default is None, i.e., display all levels)
    """

    total = len(data[feature])       # Number of rows, used as the percentage base
    count = data[feature].nunique()  # Number of distinct levels

    # One unit of figure width per displayed bar, plus a margin
    n_bars = count if n is None else n
    plt.figure(figsize = (n_bars + 1, 5))

    plt.xticks(rotation = 90, fontsize = 15)
    ax = sns.countplot(
        data = data,
        x = feature,
        palette = "Paired",
        # Keep the n most frequent levels, then show them in sorted label order
        order = data[feature].value_counts().index[:n].sort_values(),
    )

    for p in ax.patches:
        height = p.get_height()  # Bar height = count of this level

        if perc:
            # Share of all rows that fall in this level
            label = "{:.1f}%".format(100 * height / total)
        else:
            # Counts are whole numbers; format them so a float-valued bar
            # height is not rendered as e.g. "12.0"
            label = "{:.0f}".format(height)

        x = p.get_x() + p.get_width() / 2  # Horizontal centre of the bar
        y = height                         # Top of the bar

        ax.annotate(
            label,
            (x, y),
            ha = "center",
            va = "center",
            size = 12,
            xytext = (0, 5),
            textcoords = "offset points",
        )  # Place the label 5 points above the bar

    plt.show()  # Show the plot

labeled_barplot(df, 'Club')

labeled_barplot(df, 'Position')

# Correlation check
# Collect the names of all numeric columns of df
cols_list = df.select_dtypes(include = np.number).columns.tolist()

plt.figure(figsize = (15, 7))

# Annotated heatmap of the pairwise correlations between the numeric columns;
# the colour scale is pinned to [-1, 1] and values are printed with 2 decimals
sns.heatmap(
    df[cols_list].corr(numeric_only = True), annot = True, vmin = -1, vmax = 1, fmt = ".2f", cmap = "Spectral"
)

plt.show()

# Bar plots of the per-category mean (seaborn's default estimator).
# errorbar = None switches error bars off; the previous ('ci', False) tuple
# is read as a confidence level of 0 and still triggers the bootstrap.

# Total points by club
plt.figure(figsize = (15, 5))
sns.barplot(data = df, x = 'Club', y = 'Total_Points', errorbar = None)
plt.xticks(rotation = 90)
plt.show()

# Total points by position
plt.figure(figsize = (7, 5))
sns.barplot(data = df, x = 'Position', y = 'Total_Points', errorbar = None)
plt.xticks(rotation = 90)
plt.show()

# Minutes played by position
plt.figure(figsize = (7, 5))
sns.barplot(data = df, x = 'Position', y = 'Minutes', errorbar = None)
plt.xticks(rotation = 90)
plt.show()

# Bonus points by club
plt.figure(figsize = (15, 5))
sns.barplot(data = df, x = 'Club', y = 'Bonus', errorbar = None)
plt.xticks(rotation = 90)
plt.show()

# Positions in order of first appearance in df
pos_list = df.Position.unique().tolist()

# For every position keep the row(s) holding the maximum Total_Points
# (ties are all kept), then stack the per-position results in one concat.
# The leading empty frame mirrors the original accumulator's starting value.
best_df = pd.concat(
    [pd.DataFrame()]
    + [
        df_aux.loc[
            df_aux.Total_Points == df_aux.Total_Points.max(),
            ['Player_Name', 'Club', 'Position', 'Total_Points'],
        ]
        for df_aux in (df[df.Position == pos] for pos in pos_list)
    ]
)

best_df

# Top 10 players by Total_Points within each position
best10_df = pd.DataFrame()

for pos in pos_list:
    df_aux = df[df.Position == pos]
    # head(10) takes exactly ten rows. The former .loc[:10] is a label slice,
    # which is inclusive of label 10 after reset_index and so returned 11 rows.
    top10 = (
        df_aux.sort_values('Total_Points', ascending=False)
        .reset_index(drop=True)
        .head(10)
    )
    best10_df = pd.concat([best10_df, top10[['Player_Name', 'Club', 'Position', 'Total_Points']]])

best10_df

# One boxplot per numeric column, to inspect outliers
numeric_columns = df.select_dtypes(include = np.number).columns.tolist()

# Five plots per row; the number of rows follows the number of columns so
# the grid never runs out of slots (ceil division without importing math)
plots_per_row = 5
n_plot_rows = max(1, -(-len(numeric_columns) // plots_per_row))

# 5 units of height per row gives the former (20, 10) figure for two rows
plt.figure(figsize = (20, 5 * n_plot_rows))

for i, variable in enumerate(numeric_columns):

    plt.subplot(n_plot_rows, plots_per_row, i + 1)

    plt.boxplot(df[variable], whis = 1.5)

    plt.title(variable)

# Lay the figure out once, after every subplot and title exists
plt.tight_layout()

plt.show()

# Scaling the data before clustering
scaler = StandardScaler()
# Keep the columns from position 3 onward, i.e. everything after
# Player_Name, Club and Position in the df.info() listing
subset = df.iloc[:, 3:].copy()
subset_scaled = scaler.fit_transform(subset)

# Creating a dataframe of the scaled data, reusing the original column names
subset_scaled_df = pd.DataFrame(subset_scaled, columns = subset.columns)

# Importing PCA (scikit-learn's import package is `sklearn`)
from sklearn.decomposition import PCA

# Defining the number of principal components to generate:
# one component per input variable, so no dimension is dropped
n = subset.shape[1]

# PCA estimator with n components
pca = PCA(n_components = n, random_state = 1)

# Fitting PCA on the scaled data and storing the transformed data
data_pca = pd.DataFrame(pca.fit_transform(subset_scaled_df))

# The share of variance explained by each principal component
exp_var = pca.explained_variance_ratio_

# Working copy of the PCA-transformed data for K-Means
k_means_df = data_pca.copy()

# Candidate numbers of clusters and the average distortion found for each
clusters = range(1, 15)
meanDistortions = []

for k in clusters:
    # fit() returns the estimator itself, so construction and fitting chain
    model = KMeans(n_clusters = k, random_state = 1, n_init = "auto").fit(data_pca)
    prediction = model.predict(k_means_df)

    # Average distortion: mean Euclidean distance from each point to its
    # nearest cluster centre
    distortion = sum(
        cdist(k_means_df, model.cluster_centers_, "euclidean").min(axis = 1)
    ) / len(k_means_df)

    meanDistortions.append(distortion)

    print("Number of Clusters:", k, "\tAverage Distortion:", distortion)

# Elbow curve: average distortion against k
plt.plot(clusters, meanDistortions, "bx-")
plt.xlabel("k")
plt.ylabel("Average Distortion")
plt.title("Selecting k with the Elbow Method", fontsize = 20)
plt.show()

Number of Clusters: 1 	Average Distortion: 2.7730371100978024
Number of Clusters: 2 	Average Distortion: 1.8635736785898265
Number of Clusters: 3 	Average Distortion: 1.5612774038101602
Number of Clusters: 4 	Average Distortion: 1.3545171820838156
Number of Clusters: 5 	Average Distortion: 1.2931541699741689
Number of Clusters: 6 	Average Distortion: 1.2341231453420074
Number of Clusters: 7 	Average Distortion: 1.1457934035634147
Number of Clusters: 8 	Average Distortion: 1.115307656889079
Number of Clusters: 9 	Average Distortion: 1.0797310475776056
Number of Clusters: 10 	Average Distortion: 1.0174369926410631
Number of Clusters: 11 	Average Distortion: 0.9986112688354987
Number of Clusters: 12 	Average Distortion: 0.9862831494790055
Number of Clusters: 13 	Average Distortion: 0.9602766985773116
Number of Clusters: 14 	Average Distortion: 0.9501453167908899

# Fitting the final K-Means model with 4 clusters on the PCA-transformed copy
kmeans = KMeans(n_clusters = 4, random_state = 1, n_init = "auto")
kmeans.fit(k_means_df)

KMeans(n_clusters=4, n_init='auto', random_state=1)

KMeans(n_clusters=4, n_init='auto', random_state=1)

# Creating a copy of the original data
df1 = df.copy()

# Adding the K-Means cluster labels to both the PCA-space dataframe and the copy of the original data
k_means_df["KM_segments"] = kmeans.labels_
df1["KM_segments"] = kmeans.labels_

# Cluster profile: mean of every numeric column within each K-Means cluster
km_cluster_profile = df1.groupby("KM_segments").mean(numeric_only = True)

# Creating the "count_in_each_segment" feature in K-Means cluster profile
# (number of non-null Total_Points values per cluster)

km_cluster_profile["count_in_each_segment"] = (
    df1.groupby("KM_segments")["Total_Points"].count().values
)

# Highlight the largest value of each column across the clusters
km_cluster_profile.style.highlight_max(color = "lightgreen", axis = 0)

# Listing the distinct player names that fall in each K-Means cluster,
# visiting the clusters in order of first appearance in df1
for cl in df1["KM_segments"].unique():
    in_cluster = df1["KM_segments"] == cl
    print("In cluster {}, the following players are present:".format(cl))
    print(df1.loc[in_cluster, "Player_Name"].unique())
    print()

In cluster 1, the following players are present:
['Alex Runnarsson' 'Cedric Soares' 'Edward Nketiah'
 'Gabriel Teodoro Martinelli Silva' 'Matt Macey' 'Miguel Azeez'
 'Pablo Mari' 'Reiss Nelson' 'Sead Kolasinac' 'Shkodran Mustafi'
 'Sokratis Papastathopoulos' 'William Saliba' 'Ahmed El Mohamady'
 'Carney Chukwuemeka' 'Conor Hourihane' 'Henri Lansbury' 'Jacob Ramsey'
 'Jaden Philogene-Bidace' 'Jose Peleteiro Romallo' 'Keinan Davis'
 'Kortney Hause' 'Marvelous Nakamba' 'Morgan Sanson' 'Orjan Nyland'
 'Wesley Moraes' 'Alireza Jahanbakhsh' 'Andi Zeqiri'
 'Bernardo Fernandes da Silva Junior' 'Davy Propper' 'Jakub Moder'
 'Jason Steele' 'Jayson Molumby' 'Jose Izquierdo' 'Percy Tau'
 'Reda Khadra' 'Anthony Driscoll-Glennon' 'Bailey Peacock-Farrell'
 'Dale Stephens' 'Jimmy Dunne' 'Joel Mumbongo' 'Josh Benson' 'Kevin Long'
 'Lewis Richardson' 'Phil Bardsley' 'Will Norris' 'Billy Gilmour'
 'Emerson Palmieri dos Santos' 'Faustino Anjorin' 'Fikayo Tomori'
 'Karlo Ziger' 'Kepa Arrizabalaga' 'Valentino Livramento'
 'Willy Caballero' 'Connor Wickham' 'Jack Butland' 'James McCarthy'
 'James Tomkins' 'Jean-Philippe Mateta' 'Mamadou Sakho' 'Martin Kelly'
 'Nathan Ferguson' 'Reece Hannam' 'Ryan Inniss' 'Sam Woods'
 'Stephen Henderson' 'Anthony Gordon' 'Bernard Caldeira Duarte'
 'Cenk Tosun' 'Fabian Delph' 'Joao Virginia' 'Jonjoe Kenny' 'Joshua King'
 'Moise Kean' 'Nathan Broadhead' 'Niels Nkounkou' 'Robin Olsen'
 'Adam Forshaw' 'Francisco Casilla' 'Gaetano Berardi'
 'Ian Carlo Poveda-Ocampo' 'Jack Jenkins' 'Jamie Shackleton'
 'Jay-Roy Grot' 'Jordan Stevens' 'Kamil Miazek' 'Leif Davis'
 'Mateusz Bogusz' 'Niall Huggins' 'Pablo Hernandez' 'Cengiz Under'
 'Christian Fuchs' 'Daniel Amartey' 'Demarai Gray' 'Filip Benkovic'
 'Hamza Choudhury' 'Islam Slimani' 'Luke Thomas' 'Sidnei Tavares'
 'Thakgalo Leshabela' 'Vontae Daley-Campbell' 'Wes Morgan'
 'Adrian Castillo' 'Alex Oxlade-Chamberlain' 'Caoimhin Kelleher'
 'Divock Origi' 'Joel Matip' 'Joseph Gomez' 'Naby Keita' 'Neco Williams'
 'Ozan Kabak' 'Rhys Williams' 'Virgil van Dijk' 'Xherdan Shaqiri'
 'Eric Garcia' 'Liam Delap' 'Luke Mbete' 'Nathan Ake' 'Nicolas Otamendi'
 'Scott Carson' 'Taylor Harwood-Bellis' 'Zack Steffen'
 'Alex Nicolao Telles' 'Amad Diallo' 'Anthony Elanga' 'Axel Tuanzebe'
 'Brandon Williams' 'Donny van de Beek' 'Hannibal Mejbri' 'Juan Mata'
 'Nathan Bishop' 'Odion Ighalo' 'Shola Shoretire' 'William Fish'
 'Andy Carroll' 'DeAndre Yedlin' 'Dwight Gayle' 'Elliot Anderson'
 'Florian Lejeune' 'Javier Manquillo' 'Kelland Watts' 'Matthew Longstaff'
 'Yoshinori Muto' 'Caleb Watts' "Daniel N'Lundulu" 'Fraser Forster'
 'Jake Vokins' 'Kgaogelo Chauke' 'Michael Obafemi' 'Mohammed Salisu'
 'Nathan Tella' 'Shane Long' 'William Smallbone' 'Yan Valery'
 'Bamidele Alli' 'Cameron Carter-Vickers' 'Carlos Vinicius Alves Morais'
 'Dane Scarlett' 'Danny Rose' 'Erik Lamela' 'Harry Winks'
 'Japhet Tanganga' 'Joe Rodon' 'Juan Foyth' 'Paulo Gazzaniga'
 'Ryan Sessegnon' 'Ademipo Odubeko' 'Albian Ajeti' 'Andriy Yarmolenko'
 'Ben Johnson' 'Darren Randolph' 'Felipe Anderson Pereira Gomes'
 'Frederik Alves' 'Jamal Baptiste' 'Jordan Hugill' 'Manuel Lanzini'
 'Mark Noble' 'Roberto Jimenez Gago' 'Ryan Fredericks' 'Fernando Marcal'
 'John Ruddy' 'Jonathan Castro Otto' 'Ki-Jana Hoever' 'Morgan Gibbs-White'
 'Oskar Buur' 'Owen Otasowie' 'Patrick Cutrone' 'Ruben Vinagre'
 'Vitor Ferreira']

In cluster 2, the following players are present:
['Alexandre Lacazette' 'Bukayo Saka' 'Nicolas Pepe'
 'Pierre-Emerick Aubameyang' 'Anwar El Ghazi' 'Bertrand Traore'
 'Jack Grealish' 'Ollie Watkins' 'Leandro Trossard' 'Neal Maupay'
 'Pascal Gross' 'Chris Wood' 'Mason Mount' 'Timo Werner'
 'Christian Benteke' 'Eberechi Eze' 'Wilfried Zaha'
 'Dominic Calvert-Lewin' 'Gylfi Sigurdsson' 'James Rodriguez'
 'Lucas Digne' 'Richarlison de Andrade' 'Jack Harrison' 'Patrick Bamford'
 'Raphael Dias Belloli' 'Rodrigo Moreno' 'Stuart Dallas' 'Harvey Barnes'
 'James Maddison' 'Jamie Vardy' 'Kelechi Iheanacho' 'Youri Tielemans'
 'Andrew Robertson' 'Mohamed Salah' 'Roberto Firmino' 'Sadio Mane'
 'Trent Alexander-Arnold' 'Gabriel Fernando de Jesus' 'Ilkay Gundogan'
 'Joao Cancelo' 'Kevin De Bruyne' 'Phil Foden' 'Raheem Sterling'
 'Riyad Mahrez' 'Bruno Fernandes' 'Edinson Cavani' 'Luke Shaw'
 'Marcus Rashford' 'Callum Wilson' 'Che Adams' 'Danny Ings'
 'James Ward-Prowse' 'Gareth Bale' 'Harry Kane' 'Heung-Min Son'
 'Aaron Cresswell' 'Jarrod Bowen' 'Jesse Lingard' 'Michail Antonio'
 'Pablo Fornals' 'Tomas Soucek' 'Pedro Lomba Neto']

In cluster 0, the following players are present:
['Bernd Leno' 'Granit Xhaka' 'Hector Bellerin' 'Kieran Tierney'
 'Rob Holding' 'Douglas Luiz Soares de Paulo' 'Emiliano Martinez'
 'Ezri Konsa Ngoyo' 'John McGinn' 'Matt Targett' 'Matthew Cash'
 'Tyrone Mings' 'Adam Webster' 'Ben White' 'Joel Veltman' 'Lewis Dunk'
 'Robert Sanchez' 'Yves Bissouma' 'Ashley Westwood' 'Ben Mee'
 'Charlie Taylor' 'Dwight McNeil' 'James Tarkowski' 'Josh Brownhill'
 'Matthew Lowton' 'Nick Pope' 'Benjamin Chilwell' 'Cesar Azpilicueta'
 'Edouard Mendy' 'Jorge Luiz Frello Filho' 'Kurt Zouma' 'Reece James'
 'Thiago Silva' 'Andros Townsend' 'Cheikhou Kouyate' 'Joel Ward'
 'Luka Milivojevic' 'Vicente Guaita' 'Abdoulaye Doucoure' 'Ben Godfrey'
 'Jordan Pickford' 'Mason Holgate' 'Michael Keane' 'Ezgjan Alioski'
 'Illan Meslier' 'Kalvin Phillips' 'Liam Cooper' 'Luke Ayling'
 'Mateusz Klich' 'James Justin' 'Jonny Evans' 'Kasper Schmeichel'
 'Timothy Castagne' 'Wesley Fofana' 'Wilfred Ndidi' 'Alisson Becker'
 'Fabio Henrique Tavares' 'Georginio Wijnaldum' 'Bernardo Silva'
 'Ederson Moares' 'John Stones' 'Rodrigo Hernandez' 'Ruben Dias'
 'Aaron Wan-Bissaka' 'David de Gea' 'Frederico Rodrigues de Paula Santos'
 'Harry Maguire' 'Paul Pogba' 'Scott McTominay' 'Victor Lindelof'
 'Jonjo Shelvey' 'Karl Darlow' 'Miguel Almiron' 'Alex McCarthy'
 'Jan Bednarek' 'Jannik Vestergaard' 'Kyle Walker-Peters' 'Ryan Bertrand'
 'Stuart Armstrong' 'Eric Dier' 'Hugo Lloris' 'Pierre-Emile Hojbjerg'
 'Sergio Reguilon' 'Tanguy Ndombele' 'Toby Alderweireld' 'Angelo Ogbonna'
 'Declan Rice' 'Lukasz Fabianski' 'Vladimir Coufal' 'Adama Traore'
 'Conor Coady' 'Joao Santos Moutinho' 'Leander Dendoncker' 'Nelson Semedo'
 'Romain Saiss' 'Ruben Neves' 'Rui Pedro Patricio']

In cluster 3, the following players are present:
['Calum Chambers' 'Daniel Ceballos' 'David Luiz' 'Emile Smith Rowe'
 'Gabriel Maghalaes' 'Martin Odegaard' 'Mohamed Naser El Sayed Elneny'
 'Thomas Partey' 'Willian Borges Da Silva' 'Mahmoud Ahmed Ibrahim Hassan'
 'Ross Barkley' 'Aaron Connolly' 'Adam Lallana' 'Alexis Mac Allister'
 'Dan Burn' 'Danny Welbeck' 'Mathew Ryan' 'Solomon March' 'Steven Alzate'
 'Tariq Lamptey' 'Ashley Barnes' 'Erik Pieters' 'Jack Cork'
 'Jay Rodriguez' 'Jeff Hendrick' 'Johann Berg Gudmundsson' 'Matej Vydra'
 'Robbie Brady' 'Andreas Christensen' 'Antonio Rudiger'
 'Callum Hudson-Odoi' 'Christian Pulisic' 'Hakim Ziyech' 'Kai Havertz'
 'Marcos Alonso' 'Mateo Kovacic' "N'Golo Kante" 'Olivier Giroud'
 'Tammy Abraham' 'Gary Cahill' 'Jairo Riedewald' 'James McArthur'
 'Jeffrey Schlupp' 'Jordan Ayew' 'Michy Batshuayi' 'Nathaniel Clyne'
 'Patrick van Aanholt' 'Scott Dann' 'Tyrick Mitchell' 'Alex Iwobi'
 'Allan Marques Loureiro' 'Andre Tavares Gomes' 'Seamus Coleman'
 'Tom Davies' 'Yerry Mina' 'Diego Llorente' 'Helder Costa'
 'Pascal Struijk' 'Robin Koch' 'Tyler Roberts' 'Ayoze Perez'
 'Calgar Soyuncu' 'Dennis Praet' 'Marc Albrighton' 'Nampalys Mendy'
 'Ricardo Domingos Barbosa Pereira' 'Curtis Jones' 'Dean Henderson'
 'Diogo Jota' 'James Milner' 'Jordan Henderson' 'Nathaniel Phillips'
 'Thiago Alcantara' 'Aymeric Laporte' 'Benjamin Mendy'
 'Fernando Luiz Rosa' 'Ferran Torres' 'Kyle Walker' 'Oleksandr Zinchenko'
 'Sergio Aguero' 'Anthony Martial' 'Daniel James' 'Eric Bailly'
 'Mason Greenwood' 'Nemanja Matic' 'Allan Saint-Maximin' 'Ciaran Clark'
 'Emil Krafth' 'Fabian Schar' 'Federico Fernandez' 'Isaac Hayden'
 'Jacob Murphy' 'Jamaal Lascelles' 'Jamal Lewis' 'Joelinton de Lira'
 'Joseph Willock' 'Martin Dubravka' 'Matt Ritchie' 'Paul Dummett'
 'Ryan Fraser' 'Sean Longstaff' 'Ibrahima Diallo' 'Jack Stephens'
 'Moussa Djenepo' 'Nathan Redmond' 'Oriol Romeu Vidal' 'Takumi Minamino'
 'Theo Walcott' 'Ben Davies' 'Davinson Sanchez' 'Giovani Lo Celso'
 'Lucas Moura' 'Matt Doherty' 'Moussa Sissoko' 'Serge Aurier'
 'Steven Bergwijn' 'Arthur Masuaku' 'Craig Dawson' 'Fabian Balbuena'
 'Issa Diop' 'Said Benrahma' 'Sebastian Haller' 'Daniel Castelo Podence'
 'Fabio Silva' 'Max Kilman' 'Raul Jimenez ' 'Rayan Ait Nouri'
 'Willian Jose' 'Willy Boly']

df1.groupby(["KM_segments", "Position"])['Player_Name'].count()

KM_segments  Position  
0            Defender      48
             Goalkeeper    17
             Midfielder    32
1            Defender      70
             Forward       28
             Goalkeeper    25
             Midfielder    65
2            Defender       7
             Forward       20
             Midfielder    35
3            Defender      47
             Forward       16
             Goalkeeper     3
             Midfielder    63
Name: Player_Name, dtype: int64

# 3 x 4 grid of boxplots: one feature per axis, split by K-Means cluster
fig, axes = plt.subplots(3, 4, figsize = (20, 20))

# Columns 3..12 of df1 (ten features) fill the axes in row-major order;
# the last two axes of the grid are left empty
for ax, column in zip(axes.flat, df1.columns[3:13]):
    sns.boxplot(
        ax = ax,
        data = df1,
        y = column,
        x = "KM_segments",
        showmeans = True,
    )

fig.tight_layout(pad = 3.0)

# Working copy of the PCA-transformed data for K-Medoids
k_med_df = data_pca.copy()

kmed = KMedoids(n_clusters = 4, random_state = 1) # Create K-Medoids with n_clusters = 4
kmed.fit(k_med_df)

KMedoids(n_clusters=4, random_state=1)

KMedoids(n_clusters=4, random_state=1)

# Creating a copy of the original data
df2 = df.copy()

# Add K-Medoids cluster labels to the PCA-space K-Medoids data
k_med_df["KMed_segments"] = kmed.labels_
# Add K-Medoids cluster labels to the copy of the original data
df2["KMed_segments"] =  kmed.labels_

# Cluster profile: mean of every numeric column within each K-Medoids cluster
kmed_cluster_profile = df2.groupby("KMed_segments").mean(numeric_only = True)

# Number of players per (cluster, position) pair
df2.groupby(["KMed_segments", "Position"])['Player_Name'].count()

KMed_segments  Position  
0              Defender      12
               Forward       21
               Midfielder    49
1              Defender      49
               Goalkeeper    17
               Midfielder    21
2              Defender      50
               Forward       16
               Goalkeeper     3
               Midfielder    67
3              Defender      61
               Forward       27
               Goalkeeper    25
               Midfielder    58
Name: Player_Name, dtype: int64

# Creating the "count_in_each_segment" feature in the K-Medoids cluster profile
# (number of non-null Total_Points values per cluster)
kmed_cluster_profile["count_in_each_segment"] = (
    df2.groupby("KMed_segments")["Total_Points"].count().values
)

# Highlight the largest value of each column across the clusters
kmed_cluster_profile.style.highlight_max(color = "lightgreen", axis = 0)

# 3 x 4 grid of boxplots: one feature per axis, split by K-Medoids cluster
fig, axes = plt.subplots(3, 4, figsize = (20, 20))

# Columns 3..12 of df2 (ten features) fill the axes in row-major order;
# the last two axes of the grid are left empty
for ax, column in zip(axes.flat, df2.columns[3:13]):
    sns.boxplot(
        ax = ax,
        data = df2,
        y = column,
        x = "KMed_segments",
        showmeans = True,
    )

fig.tight_layout(pad = 3.0)

# Peek at the PCA-transformed data
data_pca.head()

# hc_df later receives the hierarchical cluster labels as an extra column
hc_df = data_pca.copy()

# hc_df1 is the copy passed to linkage() and to the AgglomerativeClustering fit
hc_df1 = hc_df.copy()

# List of distance metrics
distance_metrics = ["euclidean", "chebyshev", "mahalanobis", "cityblock"]

# List of linkage methods
linkage_methods = ["single", "complete", "average", "weighted"]

# Best cophenetic correlation seen so far and the [metric, linkage] pair that produced it
high_cophenet_corr = 0
high_dm_lm = [0, 0]

for dm in distance_metrics:
    # The cophenetic correlation compares the dendrogram's distances with the
    # pairwise distances the linkage was built from, so the reference
    # distances must use the same metric as the linkage (not always Euclidean).
    # They depend only on the metric, so compute them once per metric.
    # NOTE: results recorded from earlier runs used Euclidean reference
    # distances for every metric and will differ for the non-Euclidean ones.
    pairwise_dists = pdist(hc_df1, metric = dm)

    for lm in linkage_methods:
        Z = linkage(hc_df1, metric = dm, method = lm)
        c, coph_dists = cophenet(Z, pairwise_dists)
        print(
            "Cophenetic correlation for {} distance and {} linkage is {}.".format(
                dm.capitalize(), lm, c
            )
        )
        if high_cophenet_corr < c:
            high_cophenet_corr = c
            high_dm_lm[0] = dm
            high_dm_lm[1] = lm

# Printing the combination of distance metric and linkage method with the highest cophenetic correlation
print('*'*100)
print(
    "Highest cophenetic correlation is {}, which is obtained with {} distance and {} linkage.".format(
        high_cophenet_corr, high_dm_lm[0].capitalize(), high_dm_lm[1]
    )
)

Cophenetic correlation for Euclidean distance and single linkage is 0.8430175514228708.
Cophenetic correlation for Euclidean distance and complete linkage is 0.741204129226176.
Cophenetic correlation for Euclidean distance and average linkage is 0.8476499945585417.
Cophenetic correlation for Euclidean distance and weighted linkage is 0.8624581351067481.
Cophenetic correlation for Chebyshev distance and single linkage is 0.8381223141111798.
Cophenetic correlation for Chebyshev distance and complete linkage is 0.8028394390632132.
Cophenetic correlation for Chebyshev distance and average linkage is 0.8167064931302255.
Cophenetic correlation for Chebyshev distance and weighted linkage is 0.844849787663964.
Cophenetic correlation for Mahalanobis distance and single linkage is 0.8065008904132245.
Cophenetic correlation for Mahalanobis distance and complete linkage is 0.6583135946489013.
Cophenetic correlation for Mahalanobis distance and average linkage is 0.7747800632434059.
Cophenetic correlation for Mahalanobis distance and weighted linkage is 0.6486408054242748.
Cophenetic correlation for Cityblock distance and single linkage is 0.840183551166332.
Cophenetic correlation for Cityblock distance and complete linkage is 0.8241586035407029.
Cophenetic correlation for Cityblock distance and average linkage is 0.8564523087071935.
Cophenetic correlation for Cityblock distance and weighted linkage is 0.8395672301050403.
****************************************************************************************************
Highest cophenetic correlation is 0.8624581351067481, which is obtained with Euclidean distance and weighted linkage.

# Linkage methods to compare under Euclidean distance
linkage_methods = ["single", "complete", "average", "centroid", "ward", "weighted"]

# Best cophenetic correlation seen so far and the [metric, linkage] pair behind it
high_cophenet_corr = 0
high_dm_lm = [0, 0]

for lm in linkage_methods:
    Z = linkage(hc_df1, method = lm, metric = "euclidean")
    c, coph_dists = cophenet(Z, pdist(hc_df))
    print("Cophenetic correlation for {} linkage is {}.".format(lm, c))
    # Remember this linkage when it strictly beats the best so far
    if c > high_cophenet_corr:
        high_cophenet_corr = c
        high_dm_lm[0], high_dm_lm[1] = "euclidean", lm

# Reporting the linkage method with the highest cophenetic correlation
print('*'*100)
print(
    "Highest cophenetic correlation is {}, which is obtained with {} linkage.".format(
        high_cophenet_corr, high_dm_lm[1]
    )
)

Cophenetic correlation for single linkage is 0.8430175514228708.
Cophenetic correlation for complete linkage is 0.741204129226176.
Cophenetic correlation for average linkage is 0.8476499945585417.
Cophenetic correlation for centroid linkage is 0.8068296032280465.
Cophenetic correlation for ward linkage is 0.577773844586155.
Cophenetic correlation for weighted linkage is 0.8624581351067481.
****************************************************************************************************
Highest cophenetic correlation is 0.8624581351067481, which is obtained with weighted linkage.

# Linkage methods to draw dendrograms for
linkage_methods = ["single", "complete", "average", "centroid", "ward", "weighted"]

# Column names and rows of the cophenetic-correlation comparison table
compare_cols = ["Linkage", "Cophenetic Coefficient"]
compare = []

# One stacked subplot per linkage method
fig, axs = plt.subplots(len(linkage_methods), 1, figsize = (15, 30))

# Pair every linkage method with its own axis: draw the dendrogram there,
# annotate it with the cophenetic correlation and record that value
for ax, method in zip(axs, linkage_methods):
    Z = linkage(hc_df1, metric = "euclidean", method = method)

    dendrogram(Z, ax = ax)
    ax.set_title(f"Dendrogram ({method.capitalize()} Linkage)")

    coph_corr, coph_dist = cophenet(Z, pdist(hc_df))
    ax.annotate(
        f"Cophenetic\nCorrelation\n{coph_corr:0.2f}",
        (0.80, 0.80),
        xycoords="axes fraction",
    )

    compare.append([method, coph_corr])

# Comparison table, sorted by ascending cophenetic correlation
df_cc = pd.DataFrame(compare, columns = compare_cols).sort_values(
    by = "Cophenetic Coefficient"
)
df_cc

# Fitting agglomerative clustering with 4 clusters, Euclidean distance and
# average linkage on the PCA-transformed data
HCmodel = AgglomerativeClustering(n_clusters = 4, metric = "euclidean", linkage = "average")
HCmodel.fit(hc_df1)

AgglomerativeClustering(linkage='average', metric='euclidean', n_clusters=4)

AgglomerativeClustering(linkage='average', metric='euclidean', n_clusters=4)

# Creating a copy of the original data
df3 = df.copy()

# Adding hierarchical cluster labels to the PCA-space dataframe and to the copy of the original data
hc_df["HC_segments_L1"] = HCmodel.labels_
df3["HC_segments_L1"] = HCmodel.labels_

# Cluster profile: mean of every numeric column within each hierarchical cluster
hc_cluster_profile = df3.groupby("HC_segments_L1").mean(numeric_only = True)

# Number of non-null Total_Points values per cluster
hc_cluster_profile["count_in_each_segment"] = (
    df3.groupby("HC_segments_L1")["Total_Points"].count().values
)

# Highlight the largest value of each column across the clusters
hc_cluster_profile.style.highlight_max(color = "lightgreen", axis = 0)

# Listing the distinct player names that fall in each hierarchical cluster,
# visiting the clusters in order of first appearance in df3
for cl in df3["HC_segments_L1"].unique():
    in_cluster = df3["HC_segments_L1"] == cl
    print("In cluster {}, the following players are present:".format(cl))
    print(df3.loc[in_cluster, "Player_Name"].unique())
    print()

In cluster 0, the following players are present:
['Alex Runnarsson' 'Bernd Leno' 'Calum Chambers' 'Cedric Soares'
 'Daniel Ceballos' 'David Luiz' 'Edward Nketiah' 'Emile Smith Rowe'
 'Gabriel Maghalaes' 'Gabriel Teodoro Martinelli Silva' 'Granit Xhaka'
 'Hector Bellerin' 'Kieran Tierney' 'Martin Odegaard' 'Matt Macey'
 'Miguel Azeez' 'Mohamed Naser El Sayed Elneny' 'Pablo Mari'
 'Reiss Nelson' 'Rob Holding' 'Sead Kolasinac' 'Shkodran Mustafi'
 'Sokratis Papastathopoulos' 'Thomas Partey' 'William Saliba'
 'Willian Borges Da Silva' 'Ahmed El Mohamady' 'Carney Chukwuemeka'
 'Conor Hourihane' 'Douglas Luiz Soares de Paulo' 'Emiliano Martinez'
 'Ezri Konsa Ngoyo' 'Henri Lansbury' 'Jacob Ramsey'
 'Jaden Philogene-Bidace' 'John McGinn' 'Jose Peleteiro Romallo'
 'Keinan Davis' 'Kortney Hause' 'Mahmoud Ahmed Ibrahim Hassan'
 'Marvelous Nakamba' 'Matt Targett' 'Matthew Cash' 'Morgan Sanson'
 'Orjan Nyland' 'Ross Barkley' 'Tyrone Mings' 'Wesley Moraes'
 'Aaron Connolly' 'Adam Lallana' 'Adam Webster' 'Alexis Mac Allister'
 'Alireza Jahanbakhsh' 'Andi Zeqiri' 'Ben White'
 'Bernardo Fernandes da Silva Junior' 'Dan Burn' 'Davy Propper'
 'Jakub Moder' 'Jason Steele' 'Jayson Molumby' 'Joel Veltman'
 'Jose Izquierdo' 'Lewis Dunk' 'Mathew Ryan' 'Percy Tau' 'Reda Khadra'
 'Robert Sanchez' 'Solomon March' 'Steven Alzate' 'Tariq Lamptey'
 'Yves Bissouma' 'Anthony Driscoll-Glennon' 'Ashley Barnes'
 'Ashley Westwood' 'Bailey Peacock-Farrell' 'Ben Mee' 'Charlie Taylor'
 'Dale Stephens' 'Dwight McNeil' 'Erik Pieters' 'Jack Cork'
 'James Tarkowski' 'Jay Rodriguez' 'Jeff Hendrick' 'Jimmy Dunne'
 'Joel Mumbongo' 'Johann Berg Gudmundsson' 'Josh Benson' 'Josh Brownhill'
 'Kevin Long' 'Lewis Richardson' 'Matej Vydra' 'Matthew Lowton'
 'Nick Pope' 'Phil Bardsley' 'Robbie Brady' 'Will Norris'
 'Andreas Christensen' 'Antonio Rudiger' 'Benjamin Chilwell'
 'Billy Gilmour' 'Callum Hudson-Odoi' 'Cesar Azpilicueta'
 'Christian Pulisic' 'Edouard Mendy' 'Emerson Palmieri dos Santos'
 'Faustino Anjorin' 'Fikayo Tomori' 'Hakim Ziyech'
 'Jorge Luiz Frello Filho' 'Kai Havertz' 'Karlo Ziger' 'Kepa Arrizabalaga'
 'Kurt Zouma' 'Marcos Alonso' 'Mateo Kovacic' "N'Golo Kante"
 'Olivier Giroud' 'Reece James' 'Tammy Abraham' 'Thiago Silva'
 'Valentino Livramento' 'Willy Caballero' 'Andros Townsend'
 'Cheikhou Kouyate' 'Connor Wickham' 'Eberechi Eze' 'Gary Cahill'
 'Jack Butland' 'Jairo Riedewald' 'James McArthur' 'James McCarthy'
 'James Tomkins' 'Jean-Philippe Mateta' 'Jeffrey Schlupp' 'Joel Ward'
 'Jordan Ayew' 'Luka Milivojevic' 'Mamadou Sakho' 'Martin Kelly'
 'Michy Batshuayi' 'Nathan Ferguson' 'Nathaniel Clyne'
 'Patrick van Aanholt' 'Reece Hannam' 'Ryan Inniss' 'Sam Woods'
 'Scott Dann' 'Stephen Henderson' 'Tyrick Mitchell' 'Vicente Guaita'
 'Abdoulaye Doucoure' 'Alex Iwobi' 'Allan Marques Loureiro'
 'Andre Tavares Gomes' 'Anthony Gordon' 'Ben Godfrey'
 'Bernard Caldeira Duarte' 'Cenk Tosun' 'Fabian Delph' 'James Rodriguez'
 'Joao Virginia' 'Jonjoe Kenny' 'Jordan Pickford' 'Joshua King'
 'Lucas Digne' 'Mason Holgate' 'Michael Keane' 'Moise Kean'
 'Nathan Broadhead' 'Niels Nkounkou' 'Robin Olsen' 'Seamus Coleman'
 'Tom Davies' 'Yerry Mina' 'Adam Forshaw' 'Diego Llorente'
 'Ezgjan Alioski' 'Francisco Casilla' 'Gaetano Berardi' 'Helder Costa'
 'Ian Carlo Poveda-Ocampo' 'Illan Meslier' 'Jack Jenkins'
 'Jamie Shackleton' 'Jay-Roy Grot' 'Jordan Stevens' 'Kalvin Phillips'
 'Kamil Miazek' 'Leif Davis' 'Liam Cooper' 'Luke Ayling' 'Mateusz Bogusz'
 'Mateusz Klich' 'Niall Huggins' 'Pablo Hernandez' 'Pascal Struijk'
 'Robin Koch' 'Tyler Roberts' 'Ayoze Perez' 'Calgar Soyuncu'
 'Cengiz Under' 'Christian Fuchs' 'Daniel Amartey' 'Demarai Gray'
 'Dennis Praet' 'Filip Benkovic' 'Hamza Choudhury' 'Islam Slimani'
 'James Justin' 'Jonny Evans' 'Kasper Schmeichel' 'Luke Thomas'
 'Marc Albrighton' 'Nampalys Mendy' 'Ricardo Domingos Barbosa Pereira'
 'Sidnei Tavares' 'Thakgalo Leshabela' 'Timothy Castagne'
 'Vontae Daley-Campbell' 'Wes Morgan' 'Wesley Fofana' 'Wilfred Ndidi'
 'Youri Tielemans' 'Adrian Castillo' 'Alex Oxlade-Chamberlain'
 'Alisson Becker' 'Andrew Robertson' 'Caoimhin Kelleher' 'Curtis Jones'
 'Dean Henderson' 'Diogo Jota' 'Divock Origi' 'Fabio Henrique Tavares'
 'Georginio Wijnaldum' 'James Milner' 'Joel Matip' 'Jordan Henderson'
 'Joseph Gomez' 'Naby Keita' 'Nathaniel Phillips' 'Neco Williams'
 'Ozan Kabak' 'Rhys Williams' 'Thiago Alcantara' 'Trent Alexander-Arnold'
 'Virgil van Dijk' 'Xherdan Shaqiri' 'Aymeric Laporte' 'Benjamin Mendy'
 'Bernardo Silva' 'Ederson Moares' 'Eric Garcia' 'Fernando Luiz Rosa'
 'Ferran Torres' 'Joao Cancelo' 'John Stones' 'Kyle Walker' 'Liam Delap'
 'Luke Mbete' 'Nathan Ake' 'Nicolas Otamendi' 'Oleksandr Zinchenko'
 'Rodrigo Hernandez' 'Ruben Dias' 'Scott Carson' 'Sergio Aguero'
 'Taylor Harwood-Bellis' 'Zack Steffen' 'Aaron Wan-Bissaka'
 'Alex Nicolao Telles' 'Amad Diallo' 'Anthony Elanga' 'Anthony Martial'
 'Axel Tuanzebe' 'Brandon Williams' 'Daniel James' 'David de Gea'
 'Donny van de Beek' 'Eric Bailly' 'Frederico Rodrigues de Paula Santos'
 'Hannibal Mejbri' 'Harry Maguire' 'Juan Mata' 'Luke Shaw' 'Nathan Bishop'
 'Nemanja Matic' 'Odion Ighalo' 'Paul Pogba' 'Scott McTominay'
 'Shola Shoretire' 'Victor Lindelof' 'William Fish' 'Allan Saint-Maximin'
 'Andy Carroll' 'Ciaran Clark' 'DeAndre Yedlin' 'Dwight Gayle'
 'Elliot Anderson' 'Emil Krafth' 'Fabian Schar' 'Federico Fernandez'
 'Florian Lejeune' 'Isaac Hayden' 'Jacob Murphy' 'Jamaal Lascelles'
 'Jamal Lewis' 'Javier Manquillo' 'Joelinton de Lira' 'Jonjo Shelvey'
 'Joseph Willock' 'Karl Darlow' 'Kelland Watts' 'Martin Dubravka'
 'Matt Ritchie' 'Matthew Longstaff' 'Miguel Almiron' 'Paul Dummett'
 'Ryan Fraser' 'Sean Longstaff' 'Yoshinori Muto' 'Alex McCarthy'
 'Caleb Watts' "Daniel N'Lundulu" 'Fraser Forster' 'Ibrahima Diallo'
 'Jack Stephens' 'Jake Vokins' 'James Ward-Prowse' 'Jan Bednarek'
 'Jannik Vestergaard' 'Kgaogelo Chauke' 'Kyle Walker-Peters'
 'Michael Obafemi' 'Mohammed Salisu' 'Moussa Djenepo' 'Nathan Redmond'
 'Nathan Tella' 'Oriol Romeu Vidal' 'Ryan Bertrand' 'Shane Long'
 'Stuart Armstrong' 'Takumi Minamino' 'Theo Walcott' 'William Smallbone'
 'Yan Valery' 'Bamidele Alli' 'Ben Davies' 'Cameron Carter-Vickers'
 'Carlos Vinicius Alves Morais' 'Dane Scarlett' 'Danny Rose'
 'Davinson Sanchez' 'Eric Dier' 'Erik Lamela' 'Giovani Lo Celso'
 'Harry Winks' 'Hugo Lloris' 'Japhet Tanganga' 'Joe Rodon' 'Juan Foyth'
 'Lucas Moura' 'Matt Doherty' 'Moussa Sissoko' 'Paulo Gazzaniga'
 'Pierre-Emile Hojbjerg' 'Ryan Sessegnon' 'Serge Aurier' 'Sergio Reguilon'
 'Steven Bergwijn' 'Tanguy Ndombele' 'Toby Alderweireld' 'Aaron Cresswell'
 'Ademipo Odubeko' 'Albian Ajeti' 'Andriy Yarmolenko' 'Angelo Ogbonna'
 'Arthur Masuaku' 'Ben Johnson' 'Craig Dawson' 'Darren Randolph'
 'Declan Rice' 'Fabian Balbuena' 'Felipe Anderson Pereira Gomes'
 'Frederik Alves' 'Issa Diop' 'Jamal Baptiste' 'Jordan Hugill'
 'Lukasz Fabianski' 'Manuel Lanzini' 'Mark Noble' 'Roberto Jimenez Gago'
 'Ryan Fredericks' 'Said Benrahma' 'Sebastian Haller' 'Vladimir Coufal'
 'Adama Traore' 'Conor Coady' 'Daniel Castelo Podence' 'Fabio Silva'
 'Fernando Marcal' 'Joao Santos Moutinho' 'John Ruddy'
 'Jonathan Castro Otto' 'Ki-Jana Hoever' 'Leander Dendoncker' 'Max Kilman'
 'Morgan Gibbs-White' 'Nelson Semedo' 'Oskar Buur' 'Owen Otasowie'
 'Patrick Cutrone' 'Raul Jimenez ' 'Rayan Ait Nouri' 'Romain Saiss'
 'Ruben Neves' 'Ruben Vinagre' 'Rui Pedro Patricio' 'Vitor Ferreira'
 'Willian Jose' 'Willy Boly']

In cluster 2, the following players are present:
['Alexandre Lacazette' 'Bukayo Saka' 'Nicolas Pepe'
 'Pierre-Emerick Aubameyang' 'Anwar El Ghazi' 'Bertrand Traore'
 'Jack Grealish' 'Danny Welbeck' 'Leandro Trossard' 'Neal Maupay'
 'Pascal Gross' 'Chris Wood' 'Mason Mount' 'Timo Werner'
 'Christian Benteke' 'Wilfried Zaha' 'Gylfi Sigurdsson'
 'Richarlison de Andrade' 'Jack Harrison' 'Raphael Dias Belloli'
 'Rodrigo Moreno' 'Stuart Dallas' 'Harvey Barnes' 'James Maddison'
 'Kelechi Iheanacho' 'Roberto Firmino' 'Sadio Mane'
 'Gabriel Fernando de Jesus' 'Ilkay Gundogan' 'Kevin De Bruyne'
 'Phil Foden' 'Raheem Sterling' 'Riyad Mahrez' 'Edinson Cavani'
 'Marcus Rashford' 'Mason Greenwood' 'Callum Wilson' 'Che Adams'
 'Danny Ings' 'Gareth Bale' 'Jarrod Bowen' 'Jesse Lingard'
 'Michail Antonio' 'Pablo Fornals' 'Tomas Soucek' 'Pedro Lomba Neto']

In cluster 1, the following players are present:
['Ollie Watkins' 'Dominic Calvert-Lewin' 'Patrick Bamford' 'Jamie Vardy'
 'Mohamed Salah']

In cluster 3, the following players are present:
['Bruno Fernandes' 'Harry Kane' 'Heung-Min Son']

# Count how many players of each position fall into every HC_segments_L1 cluster
l1_position_groups = df3.groupby(["HC_segments_L1", "Position"])
l1_position_groups["Player_Name"].count()

HC_segments_L1  Position  
0               Defender      171
                Forward        43
                Goalkeeper     45
                Midfielder    163
1               Forward         4
                Midfielder      1
2               Defender        1
                Forward        16
                Midfielder     29
3               Forward         1
                Midfielder      2
Name: Player_Name, dtype: int64

# Separate copy of the PCA data for this hierarchical model
hc_df2 = data_pca.copy()

# Agglomerative clustering into four groups using Ward linkage on Euclidean distances
HCmodel = AgglomerativeClustering(linkage="ward", metric="euclidean", n_clusters=4)

# Fit on the copy (kept as the last expression so the notebook displays the estimator)
HCmodel.fit(hc_df2)

AgglomerativeClustering(metric='euclidean', n_clusters=4)

AgglomerativeClustering(metric='euclidean', n_clusters=4)

# Creating a copy of the original data
df3 = df.copy()

# Attach the hierarchical cluster labels to the frame the model was fitted on (hc_df2)
# and to the copy of the original data. The labels were previously written to hc_df,
# which is not the frame HCmodel was fitted on in this section, leaving hc_df2 unlabeled.
hc_labels_l2 = HCmodel.labels_
hc_df2["HC_segments_L2"] = hc_labels_l2
df3["HC_segments_L2"] = hc_labels_l2

# Per-cluster mean of every numeric column
hc_cluster_profile = df3.groupby("HC_segments_L2").mean(numeric_only=True)

# Number of players in each cluster; both groupby calls use the same key, so the
# counts line up with the rows of the profile
hc_cluster_profile["count_in_each_segment"] = (
    df3.groupby("HC_segments_L2")["Total_Points"].count().values
)

# Highlight the column-wise maximum (last expression, rendered by the notebook)
hc_cluster_profile.style.highlight_max(color="lightgreen", axis=0)

# List the distinct player names of every HC_segments_L2 cluster, in order of first appearance
for cluster_id in df3["HC_segments_L2"].unique():
    members = df3.loc[df3["HC_segments_L2"] == cluster_id, "Player_Name"]
    print("In cluster {}, the following players are present:".format(cluster_id))
    print(members.unique())
    print()

In cluster 3, the following players are present:
['Alex Runnarsson' 'Calum Chambers' 'Cedric Soares' 'Edward Nketiah'
 'Martin Odegaard' 'Matt Macey' 'Miguel Azeez' 'Pablo Mari' 'Reiss Nelson'
 'Sead Kolasinac' 'Shkodran Mustafi' 'Sokratis Papastathopoulos'
 'William Saliba' 'Ahmed El Mohamady' 'Carney Chukwuemeka'
 'Conor Hourihane' 'Henri Lansbury' 'Jacob Ramsey'
 'Jaden Philogene-Bidace' 'Jose Peleteiro Romallo' 'Keinan Davis'
 'Kortney Hause' 'Marvelous Nakamba' 'Morgan Sanson' 'Orjan Nyland'
 'Wesley Moraes' 'Alireza Jahanbakhsh' 'Andi Zeqiri'
 'Bernardo Fernandes da Silva Junior' 'Davy Propper' 'Jakub Moder'
 'Jason Steele' 'Jayson Molumby' 'Jose Izquierdo' 'Percy Tau'
 'Reda Khadra' 'Tariq Lamptey' 'Anthony Driscoll-Glennon'
 'Bailey Peacock-Farrell' 'Dale Stephens' 'Jimmy Dunne' 'Joel Mumbongo'
 'Josh Benson' 'Kevin Long' 'Lewis Richardson' 'Phil Bardsley'
 'Will Norris' 'Billy Gilmour' 'Emerson Palmieri dos Santos'
 'Faustino Anjorin' 'Fikayo Tomori' 'Karlo Ziger' 'Kepa Arrizabalaga'
 'Valentino Livramento' 'Willy Caballero' 'Connor Wickham' 'Jack Butland'
 'James McCarthy' 'James Tomkins' 'Jean-Philippe Mateta' 'Mamadou Sakho'
 'Martin Kelly' 'Michy Batshuayi' 'Nathan Ferguson' 'Reece Hannam'
 'Ryan Inniss' 'Sam Woods' 'Stephen Henderson' 'Anthony Gordon'
 'Bernard Caldeira Duarte' 'Cenk Tosun' 'Fabian Delph' 'Joao Virginia'
 'Jonjoe Kenny' 'Joshua King' 'Moise Kean' 'Nathan Broadhead'
 'Niels Nkounkou' 'Robin Olsen' 'Adam Forshaw' 'Francisco Casilla'
 'Gaetano Berardi' 'Ian Carlo Poveda-Ocampo' 'Jack Jenkins'
 'Jamie Shackleton' 'Jay-Roy Grot' 'Jordan Stevens' 'Kamil Miazek'
 'Leif Davis' 'Mateusz Bogusz' 'Niall Huggins' 'Pablo Hernandez'
 'Cengiz Under' 'Christian Fuchs' 'Daniel Amartey' 'Demarai Gray'
 'Dennis Praet' 'Filip Benkovic' 'Hamza Choudhury' 'Islam Slimani'
 'Luke Thomas' 'Sidnei Tavares' 'Thakgalo Leshabela'
 'Vontae Daley-Campbell' 'Wes Morgan' 'Adrian Castillo'
 'Alex Oxlade-Chamberlain' 'Caoimhin Kelleher' 'Divock Origi' 'Joel Matip'
 'Joseph Gomez' 'Naby Keita' 'Neco Williams' 'Ozan Kabak' 'Rhys Williams'
 'Virgil van Dijk' 'Xherdan Shaqiri' 'Eric Garcia' 'Liam Delap'
 'Luke Mbete' 'Nathan Ake' 'Nicolas Otamendi' 'Scott Carson'
 'Taylor Harwood-Bellis' 'Zack Steffen' 'Alex Nicolao Telles'
 'Amad Diallo' 'Anthony Elanga' 'Axel Tuanzebe' 'Brandon Williams'
 'Donny van de Beek' 'Hannibal Mejbri' 'Juan Mata' 'Nathan Bishop'
 'Odion Ighalo' 'Shola Shoretire' 'William Fish' 'Andy Carroll'
 'DeAndre Yedlin' 'Dwight Gayle' 'Elliot Anderson' 'Florian Lejeune'
 'Javier Manquillo' 'Kelland Watts' 'Matthew Longstaff' 'Yoshinori Muto'
 'Caleb Watts' "Daniel N'Lundulu" 'Fraser Forster' 'Jake Vokins'
 'Kgaogelo Chauke' 'Michael Obafemi' 'Mohammed Salisu' 'Nathan Tella'
 'Shane Long' 'William Smallbone' 'Yan Valery' 'Bamidele Alli'
 'Cameron Carter-Vickers' 'Carlos Vinicius Alves Morais' 'Dane Scarlett'
 'Danny Rose' 'Harry Winks' 'Japhet Tanganga' 'Joe Rodon' 'Juan Foyth'
 'Paulo Gazzaniga' 'Ryan Sessegnon' 'Ademipo Odubeko' 'Albian Ajeti'
 'Andriy Yarmolenko' 'Ben Johnson' 'Darren Randolph'
 'Felipe Anderson Pereira Gomes' 'Frederik Alves' 'Jamal Baptiste'
 'Jordan Hugill' 'Manuel Lanzini' 'Mark Noble' 'Roberto Jimenez Gago'
 'Ryan Fredericks' 'Fernando Marcal' 'John Ruddy' 'Jonathan Castro Otto'
 'Ki-Jana Hoever' 'Morgan Gibbs-White' 'Oskar Buur' 'Owen Otasowie'
 'Patrick Cutrone' 'Ruben Vinagre' 'Vitor Ferreira']

In cluster 0, the following players are present:
['Alexandre Lacazette' 'Bukayo Saka' 'Nicolas Pepe'
 'Pierre-Emerick Aubameyang' 'Anwar El Ghazi' 'Bertrand Traore'
 'Jack Grealish' 'John McGinn' 'Matt Targett' 'Ollie Watkins'
 'Danny Welbeck' 'Leandro Trossard' 'Neal Maupay' 'Pascal Gross'
 'Ashley Westwood' 'Chris Wood' 'Dwight McNeil' 'Benjamin Chilwell'
 'Mason Mount' 'Timo Werner' 'Christian Benteke' 'Eberechi Eze'
 'Wilfried Zaha' 'Dominic Calvert-Lewin' 'Gylfi Sigurdsson'
 'James Rodriguez' 'Lucas Digne' 'Richarlison de Andrade' 'Jack Harrison'
 'Patrick Bamford' 'Raphael Dias Belloli' 'Rodrigo Moreno' 'Stuart Dallas'
 'Harvey Barnes' 'James Maddison' 'Jamie Vardy' 'Kelechi Iheanacho'
 'Youri Tielemans' 'Andrew Robertson' 'Diogo Jota' 'Mohamed Salah'
 'Roberto Firmino' 'Sadio Mane' 'Trent Alexander-Arnold' 'Bernardo Silva'
 'Gabriel Fernando de Jesus' 'Ilkay Gundogan' 'Kevin De Bruyne'
 'Phil Foden' 'Raheem Sterling' 'Riyad Mahrez' 'Aaron Wan-Bissaka'
 'Bruno Fernandes' 'Edinson Cavani' 'Luke Shaw' 'Marcus Rashford'
 'Mason Greenwood' 'Paul Pogba' 'Callum Wilson' 'Joseph Willock'
 'Che Adams' 'Danny Ings' 'James Ward-Prowse' 'Stuart Armstrong'
 'Gareth Bale' 'Harry Kane' 'Heung-Min Son' 'Pierre-Emile Hojbjerg'
 'Aaron Cresswell' 'Jarrod Bowen' 'Jesse Lingard' 'Michail Antonio'
 'Pablo Fornals' 'Tomas Soucek' 'Vladimir Coufal' 'Pedro Lomba Neto']

In cluster 2, the following players are present:
['Bernd Leno' 'Gabriel Maghalaes' 'Granit Xhaka' 'Hector Bellerin'
 'Kieran Tierney' 'Rob Holding' 'Douglas Luiz Soares de Paulo'
 'Emiliano Martinez' 'Ezri Konsa Ngoyo' 'Matthew Cash' 'Tyrone Mings'
 'Adam Webster' 'Ben White' 'Joel Veltman' 'Lewis Dunk' 'Robert Sanchez'
 'Yves Bissouma' 'Ben Mee' 'Charlie Taylor' 'James Tarkowski'
 'Josh Brownhill' 'Matthew Lowton' 'Nick Pope' 'Andreas Christensen'
 'Antonio Rudiger' 'Cesar Azpilicueta' 'Edouard Mendy'
 'Jorge Luiz Frello Filho' 'Kurt Zouma' 'Mateo Kovacic' "N'Golo Kante"
 'Reece James' 'Thiago Silva' 'Andros Townsend' 'Cheikhou Kouyate'
 'Joel Ward' 'Luka Milivojevic' 'Vicente Guaita' 'Abdoulaye Doucoure'
 'Ben Godfrey' 'Jordan Pickford' 'Mason Holgate' 'Michael Keane'
 'Yerry Mina' 'Ezgjan Alioski' 'Illan Meslier' 'Kalvin Phillips'
 'Liam Cooper' 'Luke Ayling' 'Mateusz Klich' 'James Justin' 'Jonny Evans'
 'Kasper Schmeichel' 'Timothy Castagne' 'Wesley Fofana' 'Wilfred Ndidi'
 'Alisson Becker' 'Fabio Henrique Tavares' 'Georginio Wijnaldum'
 'Thiago Alcantara' 'Ederson Moares' 'Joao Cancelo' 'John Stones'
 'Kyle Walker' 'Oleksandr Zinchenko' 'Rodrigo Hernandez' 'Ruben Dias'
 'David de Gea' 'Frederico Rodrigues de Paula Santos' 'Harry Maguire'
 'Scott McTominay' 'Victor Lindelof' 'Jonjo Shelvey' 'Karl Darlow'
 'Miguel Almiron' 'Alex McCarthy' 'Jan Bednarek' 'Jannik Vestergaard'
 'Kyle Walker-Peters' 'Ryan Bertrand' 'Eric Dier' 'Hugo Lloris'
 'Sergio Reguilon' 'Tanguy Ndombele' 'Toby Alderweireld' 'Angelo Ogbonna'
 'Craig Dawson' 'Declan Rice' 'Lukasz Fabianski' 'Adama Traore'
 'Conor Coady' 'Joao Santos Moutinho' 'Leander Dendoncker' 'Nelson Semedo'
 'Romain Saiss' 'Ruben Neves' 'Rui Pedro Patricio']

In cluster 1, the following players are present:
['Daniel Ceballos' 'David Luiz' 'Emile Smith Rowe'
 'Gabriel Teodoro Martinelli Silva' 'Mohamed Naser El Sayed Elneny'
 'Thomas Partey' 'Willian Borges Da Silva' 'Mahmoud Ahmed Ibrahim Hassan'
 'Ross Barkley' 'Aaron Connolly' 'Adam Lallana' 'Alexis Mac Allister'
 'Dan Burn' 'Mathew Ryan' 'Solomon March' 'Steven Alzate' 'Ashley Barnes'
 'Erik Pieters' 'Jack Cork' 'Jay Rodriguez' 'Jeff Hendrick'
 'Johann Berg Gudmundsson' 'Matej Vydra' 'Robbie Brady'
 'Callum Hudson-Odoi' 'Christian Pulisic' 'Hakim Ziyech' 'Kai Havertz'
 'Marcos Alonso' 'Olivier Giroud' 'Tammy Abraham' 'Gary Cahill'
 'Jairo Riedewald' 'James McArthur' 'Jeffrey Schlupp' 'Jordan Ayew'
 'Nathaniel Clyne' 'Patrick van Aanholt' 'Scott Dann' 'Tyrick Mitchell'
 'Alex Iwobi' 'Allan Marques Loureiro' 'Andre Tavares Gomes'
 'Seamus Coleman' 'Tom Davies' 'Diego Llorente' 'Helder Costa'
 'Pascal Struijk' 'Robin Koch' 'Tyler Roberts' 'Ayoze Perez'
 'Calgar Soyuncu' 'Marc Albrighton' 'Nampalys Mendy'
 'Ricardo Domingos Barbosa Pereira' 'Curtis Jones' 'Dean Henderson'
 'James Milner' 'Jordan Henderson' 'Nathaniel Phillips' 'Aymeric Laporte'
 'Benjamin Mendy' 'Fernando Luiz Rosa' 'Ferran Torres' 'Sergio Aguero'
 'Anthony Martial' 'Daniel James' 'Eric Bailly' 'Nemanja Matic'
 'Allan Saint-Maximin' 'Ciaran Clark' 'Emil Krafth' 'Fabian Schar'
 'Federico Fernandez' 'Isaac Hayden' 'Jacob Murphy' 'Jamaal Lascelles'
 'Jamal Lewis' 'Joelinton de Lira' 'Martin Dubravka' 'Matt Ritchie'
 'Paul Dummett' 'Ryan Fraser' 'Sean Longstaff' 'Ibrahima Diallo'
 'Jack Stephens' 'Moussa Djenepo' 'Nathan Redmond' 'Oriol Romeu Vidal'
 'Takumi Minamino' 'Theo Walcott' 'Ben Davies' 'Davinson Sanchez'
 'Erik Lamela' 'Giovani Lo Celso' 'Lucas Moura' 'Matt Doherty'
 'Moussa Sissoko' 'Serge Aurier' 'Steven Bergwijn' 'Arthur Masuaku'
 'Fabian Balbuena' 'Issa Diop' 'Said Benrahma' 'Sebastian Haller'
 'Daniel Castelo Podence' 'Fabio Silva' 'Max Kilman' 'Raul Jimenez '
 'Rayan Ait Nouri' 'Willian Jose' 'Willy Boly']

# Count how many players of each position fall into every HC_segments_L2 cluster
l2_position_groups = df3.groupby(["HC_segments_L2", "Position"])
l2_position_groups["Player_Name"].count()

HC_segments_L2  Position  
0               Defender      10
                Forward       21
                Midfielder    45
1               Defender      38
                Forward       14
                Goalkeeper     3
                Midfielder    57
2               Defender      52
                Goalkeeper    17
                Midfielder    28
3               Defender      72
                Forward       29
                Goalkeeper    25
                Midfielder    65
Name: Player_Name, dtype: int64

# 3 x 4 grid of axes; only the first ten are used
fig, axes = plt.subplots(3, 4, figsize=(20, 20))

# Walk the grid row by row and draw one boxplot per column, starting at positional
# column index 3 of df3, split by HC_segments_L2 cluster
for offset, axis in enumerate(axes.ravel()[:10]):
    sns.boxplot(
        ax=axis,
        data=df3,
        x="HC_segments_L2",
        y=df3.columns[3 + offset],
    )

fig.tight_layout(pad=3.0)

# Separate copy of the PCA data for the Gaussian Mixture model
gmm_df = data_pca.copy()

# Gaussian Mixture with four components; random_state is fixed at 1
gmm = GaussianMixture(random_state=1, n_components=4)

# Fit on the copy (kept as the last expression so the notebook displays the estimator)
gmm.fit(gmm_df)

GaussianMixture(n_components=4, random_state=1)

GaussianMixture(n_components=4, random_state=1)

# Creating a copy of the original data
df4 = df.copy()

# Predict once on the untouched PCA features and reuse the labels for both frames.
# Predicting on gmm_df itself breaks when this cell is re-run, because gmm_df then
# already carries the extra GMM_segments column that the model was not fitted on.
gmm_labels = gmm.predict(data_pca)
gmm_df["GMM_segments"] = gmm_labels
df4["GMM_segments"] = gmm_labels

# Per-cluster mean of every numeric column
gmm_cluster_profile = df4.groupby("GMM_segments").mean(numeric_only=True)

# Number of players in each cluster; both groupby calls use the same key, so the
# counts line up with the rows of the profile
gmm_cluster_profile["count_in_each_segment"] = (
    df4.groupby("GMM_segments")["Total_Points"].count().values
)

# Highlight the column-wise maximum (last expression, rendered by the notebook)
gmm_cluster_profile.style.highlight_max(color="lightgreen", axis=0)

# List the distinct player names of every GMM cluster, in order of first appearance
for cluster_id in df4["GMM_segments"].unique():
    members = df4.loc[df4["GMM_segments"] == cluster_id, "Player_Name"]
    print("In cluster {}, the following players are present:".format(cluster_id))
    print(members.unique())
    print()

In cluster 1, the following players are present:
['Alex Runnarsson' 'Matt Macey' 'Miguel Azeez' 'Reiss Nelson'
 'Sead Kolasinac' 'Shkodran Mustafi' 'Sokratis Papastathopoulos'
 'William Saliba' 'Carney Chukwuemeka' 'Henri Lansbury'
 'Jaden Philogene-Bidace' 'Jose Peleteiro Romallo' 'Morgan Sanson'
 'Orjan Nyland' 'Wesley Moraes' 'Bernardo Fernandes da Silva Junior'
 'Davy Propper' 'Jason Steele' 'Jayson Molumby' 'Jose Izquierdo'
 'Reda Khadra' 'Anthony Driscoll-Glennon' 'Bailey Peacock-Farrell'
 'Dale Stephens' 'Joel Mumbongo' 'Josh Benson' 'Lewis Richardson'
 'Phil Bardsley' 'Will Norris' 'Billy Gilmour'
 'Emerson Palmieri dos Santos' 'Faustino Anjorin' 'Fikayo Tomori'
 'Karlo Ziger' 'Valentino Livramento' 'Willy Caballero' 'Connor Wickham'
 'Jack Butland' 'James McCarthy' 'Mamadou Sakho' 'Martin Kelly'
 'Nathan Ferguson' 'Reece Hannam' 'Ryan Inniss' 'Sam Woods'
 'Stephen Henderson' 'Anthony Gordon' 'Cenk Tosun' 'Fabian Delph'
 'Joao Virginia' 'Jonjoe Kenny' 'Joshua King' 'Moise Kean'
 'Nathan Broadhead' 'Niels Nkounkou' 'Adam Forshaw' 'Jack Jenkins'
 'Jamie Shackleton' 'Jay-Roy Grot' 'Jordan Stevens' 'Kamil Miazek'
 'Leif Davis' 'Mateusz Bogusz' 'Niall Huggins' 'Demarai Gray'
 'Filip Benkovic' 'Hamza Choudhury' 'Islam Slimani' 'Sidnei Tavares'
 'Thakgalo Leshabela' 'Vontae Daley-Campbell' 'Wes Morgan'
 'Caoimhin Kelleher' 'Divock Origi' 'Joseph Gomez' 'Neco Williams'
 'Eric Garcia' 'Liam Delap' 'Luke Mbete' 'Nicolas Otamendi'
 'Taylor Harwood-Bellis' 'Zack Steffen' 'Axel Tuanzebe' 'Brandon Williams'
 'Hannibal Mejbri' 'Nathan Bishop' 'Odion Ighalo' 'Shola Shoretire'
 'William Fish' 'DeAndre Yedlin' 'Elliot Anderson' 'Florian Lejeune'
 'Javier Manquillo' 'Kelland Watts' 'Matthew Longstaff' 'Yoshinori Muto'
 'Caleb Watts' "Daniel N'Lundulu" 'Jake Vokins' 'Kgaogelo Chauke'
 'Michael Obafemi' 'Shane Long' 'William Smallbone' 'Yan Valery'
 'Cameron Carter-Vickers' 'Dane Scarlett' 'Danny Rose' 'Harry Winks'
 'Japhet Tanganga' 'Juan Foyth' 'Paulo Gazzaniga' 'Ryan Sessegnon'
 'Ademipo Odubeko' 'Albian Ajeti' 'Darren Randolph'
 'Felipe Anderson Pereira Gomes' 'Frederik Alves' 'Jamal Baptiste'
 'Jordan Hugill' 'Roberto Jimenez Gago' 'Fernando Marcal' 'John Ruddy'
 'Ki-Jana Hoever' 'Oskar Buur' 'Patrick Cutrone' 'Ruben Vinagre']

In cluster 2, the following players are present:
['Alexandre Lacazette' 'Bukayo Saka' 'Hector Bellerin' 'Nicolas Pepe'
 'Pierre-Emerick Aubameyang' 'Willian Borges Da Silva' 'Anwar El Ghazi'
 'Bertrand Traore' 'Jack Grealish' 'Mahmoud Ahmed Ibrahim Hassan'
 'Ollie Watkins' 'Ross Barkley' 'Danny Welbeck' 'Leandro Trossard'
 'Neal Maupay' 'Pascal Gross' 'Ashley Barnes' 'Chris Wood' 'Jay Rodriguez'
 'Matej Vydra' 'Callum Hudson-Odoi' 'Christian Pulisic'
 'Jorge Luiz Frello Filho' 'Kai Havertz' 'Mason Mount' 'Olivier Giroud'
 'Reece James' 'Tammy Abraham' 'Timo Werner' 'Andros Townsend'
 'Christian Benteke' 'Eberechi Eze' 'Jairo Riedewald' 'Jordan Ayew'
 'Wilfried Zaha' 'Dominic Calvert-Lewin' 'Gylfi Sigurdsson'
 'James Rodriguez' 'Lucas Digne' 'Richarlison de Andrade' 'Helder Costa'
 'Jack Harrison' 'Mateusz Klich' 'Patrick Bamford' 'Raphael Dias Belloli'
 'Rodrigo Moreno' 'Stuart Dallas' 'Ayoze Perez' 'Harvey Barnes'
 'James Maddison' 'Jamie Vardy' 'Kelechi Iheanacho' 'Marc Albrighton'
 'Andrew Robertson' 'Diogo Jota' 'Mohamed Salah' 'Roberto Firmino'
 'Sadio Mane' 'Trent Alexander-Arnold' 'Bernardo Silva' 'Ferran Torres'
 'Gabriel Fernando de Jesus' 'Ilkay Gundogan' 'Kevin De Bruyne'
 'Phil Foden' 'Raheem Sterling' 'Riyad Mahrez' 'Sergio Aguero'
 'Anthony Martial' 'Bruno Fernandes' 'Edinson Cavani' 'Luke Shaw'
 'Marcus Rashford' 'Mason Greenwood' 'Paul Pogba' 'Allan Saint-Maximin'
 'Callum Wilson' 'Joelinton de Lira' 'Joseph Willock' 'Matt Ritchie'
 'Miguel Almiron' 'Che Adams' 'Danny Ings' 'James Ward-Prowse'
 'Nathan Redmond' 'Stuart Armstrong' 'Theo Walcott' 'Gareth Bale'
 'Harry Kane' 'Heung-Min Son' 'Lucas Moura' 'Sergio Reguilon'
 'Aaron Cresswell' 'Jarrod Bowen' 'Jesse Lingard' 'Michail Antonio'
 'Pablo Fornals' 'Said Benrahma' 'Tomas Soucek' 'Vladimir Coufal'
 'Adama Traore' 'Daniel Castelo Podence' 'Fabio Silva' 'Pedro Lomba Neto'
 'Raul Jimenez ']

In cluster 0, the following players are present:
['Bernd Leno' 'Daniel Ceballos' 'Emile Smith Rowe' 'Gabriel Maghalaes'
 'Granit Xhaka' 'Kieran Tierney' 'Rob Holding'
 'Douglas Luiz Soares de Paulo' 'Emiliano Martinez' 'Ezri Konsa Ngoyo'
 'John McGinn' 'Matt Targett' 'Matthew Cash' 'Tyrone Mings' 'Adam Webster'
 'Ben White' 'Joel Veltman' 'Lewis Dunk' 'Robert Sanchez' 'Solomon March'
 'Yves Bissouma' 'Ashley Westwood' 'Ben Mee' 'Charlie Taylor'
 'Dwight McNeil' 'James Tarkowski' 'Josh Brownhill' 'Matthew Lowton'
 'Nick Pope' 'Andreas Christensen' 'Antonio Rudiger' 'Benjamin Chilwell'
 'Cesar Azpilicueta' 'Edouard Mendy' 'Hakim Ziyech' 'Kurt Zouma'
 'Mateo Kovacic' "N'Golo Kante" 'Thiago Silva' 'Cheikhou Kouyate'
 'Gary Cahill' 'Joel Ward' 'Luka Milivojevic' 'Vicente Guaita'
 'Abdoulaye Doucoure' 'Allan Marques Loureiro' 'Ben Godfrey'
 'Jordan Pickford' 'Mason Holgate' 'Michael Keane' 'Seamus Coleman'
 'Yerry Mina' 'Ezgjan Alioski' 'Illan Meslier' 'Kalvin Phillips'
 'Liam Cooper' 'Luke Ayling' 'Pascal Struijk' 'James Justin' 'Jonny Evans'
 'Kasper Schmeichel' 'Timothy Castagne' 'Wesley Fofana' 'Wilfred Ndidi'
 'Youri Tielemans' 'Alisson Becker' 'Fabio Henrique Tavares'
 'Georginio Wijnaldum' 'Nathaniel Phillips' 'Thiago Alcantara'
 'Aymeric Laporte' 'Benjamin Mendy' 'Ederson Moares' 'Fernando Luiz Rosa'
 'Joao Cancelo' 'John Stones' 'Kyle Walker' 'Oleksandr Zinchenko'
 'Rodrigo Hernandez' 'Ruben Dias' 'Aaron Wan-Bissaka' 'David de Gea'
 'Frederico Rodrigues de Paula Santos' 'Harry Maguire' 'Scott McTominay'
 'Victor Lindelof' 'Federico Fernandez' 'Jacob Murphy' 'Jonjo Shelvey'
 'Karl Darlow' 'Martin Dubravka' 'Alex McCarthy' 'Jack Stephens'
 'Jan Bednarek' 'Jannik Vestergaard' 'Kyle Walker-Peters' 'Ryan Bertrand'
 'Eric Dier' 'Hugo Lloris' 'Pierre-Emile Hojbjerg' 'Serge Aurier'
 'Tanguy Ndombele' 'Toby Alderweireld' 'Angelo Ogbonna' 'Craig Dawson'
 'Declan Rice' 'Lukasz Fabianski' 'Conor Coady' 'Joao Santos Moutinho'
 'Leander Dendoncker' 'Max Kilman' 'Nelson Semedo' 'Romain Saiss'
 'Ruben Neves' 'Rui Pedro Patricio' 'Willy Boly']

In cluster 3, the following players are present:
['Calum Chambers' 'Cedric Soares' 'David Luiz' 'Edward Nketiah'
 'Gabriel Teodoro Martinelli Silva' 'Martin Odegaard'
 'Mohamed Naser El Sayed Elneny' 'Pablo Mari' 'Thomas Partey'
 'Ahmed El Mohamady' 'Conor Hourihane' 'Jacob Ramsey' 'Keinan Davis'
 'Kortney Hause' 'Marvelous Nakamba' 'Aaron Connolly' 'Adam Lallana'
 'Alexis Mac Allister' 'Alireza Jahanbakhsh' 'Andi Zeqiri' 'Dan Burn'
 'Jakub Moder' 'Mathew Ryan' 'Percy Tau' 'Steven Alzate' 'Tariq Lamptey'
 'Erik Pieters' 'Jack Cork' 'Jeff Hendrick' 'Jimmy Dunne'
 'Johann Berg Gudmundsson' 'Kevin Long' 'Robbie Brady' 'Kepa Arrizabalaga'
 'Marcos Alonso' 'James McArthur' 'James Tomkins' 'Jean-Philippe Mateta'
 'Jeffrey Schlupp' 'Michy Batshuayi' 'Nathaniel Clyne'
 'Patrick van Aanholt' 'Scott Dann' 'Tyrick Mitchell' 'Alex Iwobi'
 'Andre Tavares Gomes' 'Bernard Caldeira Duarte' 'Robin Olsen'
 'Tom Davies' 'Diego Llorente' 'Francisco Casilla' 'Gaetano Berardi'
 'Ian Carlo Poveda-Ocampo' 'Pablo Hernandez' 'Robin Koch' 'Tyler Roberts'
 'Calgar Soyuncu' 'Cengiz Under' 'Christian Fuchs' 'Daniel Amartey'
 'Dennis Praet' 'Luke Thomas' 'Nampalys Mendy'
 'Ricardo Domingos Barbosa Pereira' 'Adrian Castillo'
 'Alex Oxlade-Chamberlain' 'Curtis Jones' 'Dean Henderson' 'James Milner'
 'Joel Matip' 'Jordan Henderson' 'Naby Keita' 'Ozan Kabak' 'Rhys Williams'
 'Virgil van Dijk' 'Xherdan Shaqiri' 'Nathan Ake' 'Scott Carson'
 'Alex Nicolao Telles' 'Amad Diallo' 'Anthony Elanga' 'Daniel James'
 'Donny van de Beek' 'Eric Bailly' 'Juan Mata' 'Nemanja Matic'
 'Andy Carroll' 'Ciaran Clark' 'Dwight Gayle' 'Emil Krafth' 'Fabian Schar'
 'Isaac Hayden' 'Jamaal Lascelles' 'Jamal Lewis' 'Paul Dummett'
 'Ryan Fraser' 'Sean Longstaff' 'Fraser Forster' 'Ibrahima Diallo'
 'Mohammed Salisu' 'Moussa Djenepo' 'Nathan Tella' 'Oriol Romeu Vidal'
 'Takumi Minamino' 'Bamidele Alli' 'Ben Davies'
 'Carlos Vinicius Alves Morais' 'Davinson Sanchez' 'Erik Lamela'
 'Giovani Lo Celso' 'Joe Rodon' 'Matt Doherty' 'Moussa Sissoko'
 'Steven Bergwijn' 'Andriy Yarmolenko' 'Arthur Masuaku' 'Ben Johnson'
 'Fabian Balbuena' 'Issa Diop' 'Manuel Lanzini' 'Mark Noble'
 'Ryan Fredericks' 'Sebastian Haller' 'Jonathan Castro Otto'
 'Morgan Gibbs-White' 'Owen Otasowie' 'Rayan Ait Nouri' 'Vitor Ferreira'
 'Willian Jose']

# Count how many players of each position fall into every GMM cluster
gmm_position_groups = df4.groupby(["GMM_segments", "Position"])
gmm_position_groups["Player_Name"].count()

GMM_segments  Position  
0             Defender      64
              Goalkeeper    18
              Midfielder    34
1             Defender      48
              Forward       21
              Goalkeeper    19
              Midfielder    38
2             Defender      10
              Forward       32
              Midfielder    63
3             Defender      50
              Forward       11
              Goalkeeper     8
              Midfielder    60
Name: Player_Name, dtype: int64

# 3 x 4 grid of axes; only the first ten are used
fig, axes = plt.subplots(3, 4, figsize=(20, 20))

# Walk the grid row by row and draw one boxplot per column, starting at positional
# column index 3 of df4, split by GMM cluster; showmeans adds a marker for the mean
for offset, axis in enumerate(axes.ravel()[:10]):
    sns.boxplot(
        ax=axis,
        data=df4,
        x="GMM_segments",
        y=df4.columns[3 + offset],
        showmeans=True,
    )

fig.tight_layout(pad=3.0)

# Two copies of the PCA data for the DBSCAN section
dbscan_df = data_pca.copy()
dbscan_df1 = dbscan_df.copy()

# Candidate hyperparameter values for the search
eps_value = [2, 3]            # eps values to try
min_sample_values = [6, 20]   # min_samples values to try

# Map every eps candidate to the (shared) list of min_samples candidates
res = {eps: min_sample_values for eps in eps_value}

# Finding the silhouette_score for each (eps, min_samples) combination in `res`
# and remembering the best-scoring one.
# NOTE(review): DBSCAN labels noise points as -1 and silhouette_score treats that
# label like any other cluster; confirm this is acceptable for the comparison.

high_silhouette_avg = 0   # Best score seen so far; a combination must score above 0 to be selected
high_i_j = [0, 0]         # [eps, min_samples] of the best-scoring combination
for i, z in res.items():  # i: eps candidate, z: list of min_samples candidates
    for j in z:
        db = DBSCAN(eps=i, min_samples=j).fit(dbscan_df)   # Applying DBSCAN to this combination
        labels = db.labels_

        # silhouette_score raises ValueError unless there are at least two distinct labels
        if len(set(labels)) < 2:
            print(
                "For eps value =" + str(i),
                "For min sample =" + str(j),
                "The silhouette score is undefined (only one label found)",
            )
            continue

        silhouette_avg = silhouette_score(dbscan_df, labels)   # Finding silhouette score
        print(
            "For eps value =" + str(i),
            "For min sample =" + str(j),
            "The average silhoutte_score is :",
            silhouette_avg,
        )

        # Keep the combination only when it beats the best score so far. The best score
        # itself must be updated too; otherwise every positive score overwrites high_i_j
        # and the last combination always wins.
        if high_silhouette_avg < silhouette_avg:
            high_silhouette_avg = silhouette_avg
            high_i_j[0] = i
            high_i_j[1] = j

For eps value =2 For min sample =6 The average silhoutte_score is : 0.5283008912823889
For eps value =2 For min sample =20 The average silhoutte_score is : 0.3647818751696756
For eps value =3 For min sample =6 The average silhoutte_score is : 0.624205189855851
For eps value =3 For min sample =20 The average silhoutte_score is : 0.6188492416303977

# Report the best score recorded by the search together with its eps / min_samples pair
best_eps, best_min_samples = high_i_j
print(
    "Highest_silhoutte_avg is {} for eps = {} and min sample = {}".format(
        high_silhouette_avg, best_eps, best_min_samples
    )
)

Highest_silhoutte_avg is 0 for eps = 3 and min sample = 20

# Applying DBSCAN with eps as 3 and min sample as 20
# NOTE(review): the per-combination scores printed by the search show eps = 3 with
# min sample = 6 scoring slightly higher (0.6242 vs 0.6188); confirm that
# min_samples = 20 is the intended choice.
dbs = DBSCAN(eps=3, min_samples=20)

# Creating a copy of the original data
df5 = df.copy()

# Cluster once on the untouched PCA features and reuse the labels for both frames.
# Fitting on dbscan_df1 itself is unsafe: when this cell is re-run, dbscan_df1 already
# holds the db_segments column, which would then be clustered as if it were a feature.
db_labels = dbs.fit_predict(dbscan_df)

# Add DBSCAN cluster labels to dbscan data
dbscan_df1["db_segments"] = db_labels

# Add DBSCAN cluster labels to whole data
df5["db_segments"] = db_labels

# Per-cluster mean of every numeric column
db_cluster_profile = df5.groupby("db_segments").mean(numeric_only=True)

# Number of players in each cluster; both groupby calls use the same key, so the
# counts line up with the rows of the profile
db_cluster_profile["count_in_each_segment"] = (
    df5.groupby("db_segments")["Total_Points"].count().values
)

# Highlight the column-wise maximum (last expression, rendered by the notebook)
db_cluster_profile.style.highlight_max(color="lightgreen", axis=0)

# K-Means with four clusters, random_state fixed at 1 and n_init set to 'auto'
kmeans = KMeans(n_clusters=4, n_init="auto", random_state=1)

# Fit on data_pca and obtain the cluster assignment of every row
preds = kmeans.fit_predict(data_pca)

# Silhouette score of that assignment
score = silhouette_score(data_pca, preds)
print(score)

0.40411092686635713

# K-Medoids with four clusters and random_state fixed at 1
kmedoids = KMedoids(random_state=1, n_clusters=4)

# Fit on data_pca and obtain the cluster assignment of every row
preds = kmedoids.fit_predict(data_pca)

# Silhouette score of that assignment
score = silhouette_score(data_pca, preds)
print(score)

0.393822499693573

# Agglomerative clustering into four groups: Euclidean distance with Ward linkage
HCmodel = AgglomerativeClustering(linkage="ward", metric="euclidean", n_clusters=4)

# Fit on the PCA data and obtain the cluster assignment of every row
preds = HCmodel.fit_predict(data_pca)

# Silhouette score of that assignment
score = silhouette_score(data_pca, preds)
print(score)

0.3849709986025467

# Gaussian Mixture with four components and random_state fixed at 1
gmm = GaussianMixture(random_state=1, n_components=4)

# Fit on data_pca and obtain the component assignment of every row
preds = gmm.fit_predict(data_pca)

# Silhouette score of that assignment
score = silhouette_score(data_pca, preds)
print(score)

0.28494644297302146

# Convert notebook to html
# The leading "!" runs the command in the system shell (IPython shell escape).
# NOTE(review): the path looks like a Google Colab Drive mount, while the pip
# output at the top of this notebook shows a local Windows install - confirm
# the notebook path before running this cell outside Colab.
!jupyter nbconvert --to html "/content/drive/My Drive/Colab Notebooks/Copy of FDS_Project_LearnerNotebook_FullCode.ipynb"

	count	unique	top	freq	mean	std	min	25%	50%	75%	max
Player_Name	476	476	Alex Runnarsson	1	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Club	476	17	Arsenal	30	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Position	476	4	Midfielder	195	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Goals_Scored	476.0	NaN	NaN	NaN	1.907563	3.455562	0.0	0.0	0.5	2.0	23.0
Assists	476.0	NaN	NaN	NaN	1.752101	2.708563	0.0	0.0	0.0	2.0	14.0
Total_Points	476.0	NaN	NaN	NaN	58.516807	51.293559	0.0	10.0	48.0	94.25	244.0
Minutes	476.0	NaN	NaN	NaN	1336.909664	1073.773995	0.0	268.75	1269.5	2256.25	3420.0
Goals_Conceded	476.0	NaN	NaN	NaN	19.157563	15.946171	0.0	4.0	18.0	31.0	68.0
Creativity	476.0	NaN	NaN	NaN	195.97605	251.478541	0.0	8.3	96.95	296.95	1414.9
Influence	476.0	NaN	NaN	NaN	294.617647	267.779681	0.0	46.5	233.1	499.5	1318.2
Threat	476.0	NaN	NaN	NaN	224.962185	318.240377	0.0	5.75	104.5	298.25	1980.0
Bonus	476.0	NaN	NaN	NaN	4.718487	6.252625	0.0	0.0	2.0	7.0	40.0
Clean_Sheets	476.0	NaN	NaN	NaN	4.745798	4.394312	0.0	0.0	4.0	8.0	19.0

	Goals_Scored	Assists	Total_Points	Minutes	Goals_Conceded	Creativity	Influence	Threat	Bonus	Clean_Sheets	count_in_each_segment
KM_segments
0	1.371134	1.783505	103.000000	2674.288660	37.649485	256.951546	579.144330	198.288660	7.463918	9.989691	97
1	0.148936	0.202128	9.824468	238.750000	3.930851	28.171809	43.164894	30.244681	0.409574	0.558511	188
2	8.919355	6.709677	141.725806	2458.306452	33.451613	625.253226	661.458065	860.677419	16.322581	9.322581	62
3	1.503876	1.604651	56.038760	1392.736434	20.573643	188.358915	270.818605	223.255814	3.356589	4.705426	129

	Goals_Scored	Assists	Total_Points	Minutes	Goals_Conceded	Creativity	Influence	Threat	Bonus	Clean_Sheets	count_in_each_segment
KMed_segments
0	7.512195	6.195122	133.243902	2452.243902	33.853659	602.902439	625.653659	745.402439	14.573171	9.231707	82
1	1.068966	1.091954	99.528736	2638.195402	36.632184	184.582759	575.818391	166.333333	6.988506	9.931034	87
2	1.338235	1.558824	51.073529	1270.051471	18.977941	180.458824	242.588235	203.102941	2.904412	4.205882	136
3	0.099415	0.111111	7.736842	193.187135	3.362573	18.979532	34.188304	22.608187	0.280702	0.385965	171

	0	1	2	3	4	5	6	7	8	9
0	-2.916600	0.569939	-0.041871	0.190663	0.003485	0.008158	-0.042314	0.064757	0.057486	0.006269
1	3.815468	1.999554	-2.216345	0.757341	-0.119000	-0.541975	-0.233941	-0.293053	0.075356	-0.238019
2	1.943396	-2.757446	-0.958238	0.816920	0.041188	0.376978	0.005235	0.138392	-0.306183	-0.136656
3	3.502427	1.043441	0.581995	-0.969567	0.415848	-0.358456	0.661746	0.376272	-0.260200	-0.155934
4	-1.153639	0.422189	0.642307	0.269271	-0.271583	0.405367	-0.033575	0.098886	-0.035234	-0.034877

	Goals_Scored	Assists	Total_Points	Minutes	Goals_Conceded	Creativity	Influence	Threat	Bonus	Clean_Sheets	count_in_each_segment
HC_segments_L1
0	0.881517	1.139810	47.969194	1205.945498	17.580569	148.574408	249.536967	131.753555	3.293839	4.182464	422
1	16.800000	9.200000	189.000000	3033.200000	44.000000	494.340000	860.720000	1591.600000	21.800000	10.800000	5
2	8.565217	5.826087	129.391304	2238.934783	29.760870	543.273913	586.234783	861.739130	14.021739	8.739130	46
3	19.333333	13.000000	238.000000	3101.000000	37.000000	1041.300000	1221.000000	1294.666667	34.000000	12.666667	3

Unsupervised Learning Practice Project Solution: Fantasy Sports Clustering Analysis¶

Context¶

Objective¶

Data Description¶

Importing the necessary libraries and overview of the dataset¶

Exploratory Data Analysis¶

Univariate Analysis¶

Bivariate Analysis¶

Checking Outliers¶

Scaling¶

Applying PCA¶

K-Means Clustering¶

Cluster Profiles¶

Characteristics of each cluster:¶

K-Medoids Clustering¶

Cluster Profiling¶

Comparison of cluster profiles from K-Means and K-Medoids¶

Characteristics of each cluster¶

Hierarchical Clustering¶

Cluster Profiling¶

Cluster Profiling¶

Comparison of cluster profiles from Hierarchical and previous algorithms¶

Characteristics of each cluster¶

Gaussian Mixture Model (GMM) Clustering¶

Cluster Profiling¶

Comparison of cluster profiles from GMM and previous algorithms¶

Characteristics of each cluster¶

Density-Based Spatial Clustering of Applications with Noise (DBSCAN)¶

What is the silhouette score?¶

Choosing the Best Algorithm¶

Conclusion:¶

Recommendations:¶

	Player_Name	Club	Position	Goals_Scored	Assists	Total_Points	Minutes	Goals_Conceded	Creativity	Influence	Threat	Bonus	Clean_Sheets
441	Mark Noble	West Ham United	Midfielder	0	0	27	701	15	88.6	80.4	7	0	0
363	Sean Longstaff	Newcastle United	Midfielder	0	1	41	1405	26	182.8	179.2	148	1	2
31	Anwar El Ghazi	Aston Villa	Midfielder	10	0	111	1604	22	426.1	500.4	726	13	5
132	Olivier Giroud	Chelsea	Forward	4	0	47	740	5	112.0	161.4	403	6	4
90	Chris Wood	Burnley	Forward	12	3	138	2741	43	323.2	595.8	1129	16	9
249	Vontae Daley-Campbell	Leicester City	Defender	0	0	0	0	0	0.0	0.0	0	0	0
65	Danny Welbeck	Brighton and Hove Albion	Forward	6	4	89	1541	18	269.7	319.8	595	15	6
445	Ryan Fredericks	West Ham United	Defender	1	1	28	564	9	166.8	155.2	96	0	1
117	Christian Pulisic	Chelsea	Midfielder	4	3	82	1731	21	378.8	361.4	724	3	7
415	Ryan Sessegnon	Tottenham Hotspurs	Defender	0	0	0	0	0	0.0	0.0	0	0	0

	Player_Name	Club	Position	Total_Points
36	Emiliano Martinez	Aston Villa	Goalkeeper	186
403	Harry Kane	Tottenham Hotspurs	Forward	242
315	Bruno Fernandes	Manchester United	Midfielder	244
223	Stuart Dallas	Leeds United	Defender	171

	Linkage	Cophenetic Coefficient
4	ward	0.577774
1	complete	0.741204
3	centroid	0.806830
0	single	0.843018
2	average	0.847650
5	weighted	0.862458

	Goals_Scored	Assists	Total_Points	Minutes	Goals_Conceded	Creativity	Influence	Threat	Bonus	Clean_Sheets	count_in_each_segment
HC_segments_L2
0	7.960526	6.342105	135.592105	2467.605263	33.802632	594.343421	638.431579	772.302632	14.736842	9.302632	76
1	1.357143	1.669643	53.812500	1367.767857	20.866071	186.037500	259.967857	218.866071	3.000000	4.437500	112
2	1.247423	1.206186	98.453608	2557.814433	35.371134	220.125773	556.723711	185.505155	7.237113	9.721649	97
3	0.157068	0.251309	10.324607	248.863874	4.094241	31.026702	45.018848	30.785340	0.460733	0.586387	191

	Goals_Scored	Assists	Total_Points	Minutes	Goals_Conceded	Creativity	Influence	Threat	Bonus	Clean_Sheets	count_in_each_segment
GMM_segments
0	1.189655	1.344828	94.439655	2453.189655	33.793103	221.881897	535.862069	174.991379	6.793103	9.431034	116
1	0.000000	0.000000	3.547619	107.436508	2.055556	8.590476	15.739683	5.746032	0.000000	0.182540	126
2	6.552381	5.390476	114.876190	2104.809524	29.619048	501.710476	521.451429	683.466667	11.904762	7.685714	105
3	0.635659	0.868217	34.031008	908.968992	14.186047	106.855039	165.444961	110.813953	1.612403	2.596899	129

	Goals_Scored	Assists	Total_Points	Minutes	Goals_Conceded	Creativity	Influence	Threat	Bonus	Clean_Sheets	count_in_each_segment
db_segments
-1	18.666667	11.666667	221.000000	3045.000000	41.000000	779.533333	1067.566667	1447.000000	28.666667	11.166667	6
0	1.693617	1.625532	56.442553	1315.104255	18.878723	188.526383	284.750213	209.361702	4.412766	4.663830	470