Guided Project: Classification and Hypothesis Testing
Context:
Hospital management is a vital area that gained a lot of attention during the COVID-19 pandemic. Inefficient distribution of resources like beds and ventilators can lead to serious complications. This can be mitigated by predicting the length of stay (LOS) of a patient before admission. Once the LOS is estimated, the hospital can plan suitable treatment, resources, and staff to reduce the LOS and increase the chances of recovery, and rooms and beds can be planned accordingly.
HealthPlus hospital has been incurring significant losses, in both revenue and lives, due to its inefficient management system. It has been unable to allocate equipment, beds, and hospital staff effectively. A system that could estimate the length of stay (LOS) of a patient would solve this problem to a great extent.
Objective:
As a Data Scientist, you have been hired by HealthPlus to analyze the data, find out which factors affect the LOS the most, and build a machine learning model that can predict a patient's LOS using the data available at admission and after running a few tests. Also, derive useful insights and policy recommendations from the data that can help the hospital improve its healthcare infrastructure and revenue.
Data Dictionary:
The data contains various information recorded during the time of admission of the patient. It only contains records of patients who were admitted to the hospital. The detailed data dictionary is given below:
- patientid: Patient ID
- Age: Range of age of the patient
- gender: Gender of the patient
- Type of Admission: Trauma, emergency or urgent
- Severity of Illness: Extreme, moderate, or minor
- health_conditions: Any previous health conditions suffered by the patient
- Visitors with Patient: The number of visitors who accompany the patient
- Insurance: Does the patient have health insurance or not?
- Admission_Deposit: The deposit paid by the patient during admission
- Stay (in days): The number of days that the patient has stayed in the hospital. This is the target variable
- Available Extra Rooms in Hospital: The number of rooms available during admission
- Department: The department which will be treating the patient
- Ward_Facility_Code: The code of the ward facility in which the patient will be admitted
- doctor_name: The doctor who will be treating the patient
- staff_available: The number of staff who are not occupied at the moment in the ward
Approach to solving the problem:
- Import the necessary libraries
- Read the dataset and get an overview
- Exploratory data analysis - a. Univariate b. Bivariate
- Data preprocessing if any
- Define the performance metric and build ML models
- Checking for assumptions
- Compare models and determine the best one
- Observations and business insights
Importing Libraries, Mounting Drive, and Reading Data
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Visualization
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# Overwrite default pandas display limits
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', 200)
# pd.set_option('display.max_colwidth', None)
# Connect to Google Drive
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
# Read dataset file
data = pd.read_csv("/content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week_Five_-_Classification_and_Hypothesis_Testing/Guided_Project:_Classification_and_Hypothesis_Testing/Dataset - Hospital LOS Prediction.csv")
# Let's make a backup copy of the data frame
data_backup = data.copy()
Data Overview
# Let's get an idea about the data
# Print the first two rows
data.head(2)
Available Extra Rooms in Hospital | Department | Ward_Facility_Code | doctor_name | staff_available | patientid | Age | gender | Type of Admission | Severity of Illness | health_conditions | Visitors with Patient | Insurance | Admission_Deposit | Stay (in days) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 4 | gynecology | D | Dr Sophia | 0 | 33070 | 41-50 | Female | Trauma | Extreme | Diabetes | 4 | Yes | 2966.408696 | 8 |
1 | 4 | gynecology | B | Dr Sophia | 2 | 34808 | 31-40 | Female | Trauma | Minor | Heart disease | 2 | No | 3554.835677 | 9 |
# Print the last two rows
data.tail(2)
Available Extra Rooms in Hospital | Department | Ward_Facility_Code | doctor_name | staff_available | patientid | Age | gender | Type of Admission | Severity of Illness | health_conditions | Visitors with Patient | Insurance | Admission_Deposit | Stay (in days) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
499998 | 2 | radiotherapy | A | Dr John | 1 | 29957 | 61-70 | Female | Trauma | Extreme | Diabetes | 2 | No | 4694.127772 | 23 |
499999 | 3 | gynecology | F | Dr Sophia | 3 | 45008 | 41-50 | Female | Trauma | Moderate | Heart disease | 4 | Yes | 4713.868519 | 10 |
# What is the shape of our data
data.shape
(500000, 15)
The dataset contains 500,000 rows and 15 columns (features).
# What types of features do we have?
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 500000 entries, 0 to 499999 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Available Extra Rooms in Hospital 500000 non-null int64 1 Department 500000 non-null object 2 Ward_Facility_Code 500000 non-null object 3 doctor_name 500000 non-null object 4 staff_available 500000 non-null int64 5 patientid 500000 non-null int64 6 Age 500000 non-null object 7 gender 500000 non-null object 8 Type of Admission 500000 non-null object 9 Severity of Illness 500000 non-null object 10 health_conditions 348112 non-null object 11 Visitors with Patient 500000 non-null int64 12 Insurance 500000 non-null object 13 Admission_Deposit 500000 non-null float64 14 Stay (in days) 500000 non-null int64 dtypes: float64(1), int64(5), object(9) memory usage: 57.2+ MB
Most features contain no missing values, but health_conditions has only 348,112 non-null entries out of 500,000, so its missing values will need to be addressed.
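A minimal check of the missing-value counts, using the data frame already loaded above:
# Count missing values per column; only health_conditions should be non-zero
data.isnull().sum()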
The data is comprised of the following:
- Continuous Features (numerical data types):
- Available Extra Rooms in Hospital
- staff_available
- patientid
- Visitors with Patient
- Admission_Deposit
- Stay (in days)
- Categorical Features (object data type):
- Department
- Ward_Facility_Code
- doctor_name
- Age
- gender
- Type of Admission
- Severity of Illness
- health_conditions
- Insurance (binary object)
Let's check the unique values in each column
# Checking unique values in each column
data.nunique()
0 | |
---|---|
Available Extra Rooms in Hospital | 18 |
Department | 5 |
Ward_Facility_Code | 6 |
doctor_name | 9 |
staff_available | 11 |
patientid | 126399 |
Age | 10 |
gender | 3 |
Type of Admission | 3 |
Severity of Illness | 3 |
health_conditions | 5 |
Visitors with Patient | 28 |
Insurance | 2 |
Admission_Deposit | 499508 |
Stay (in days) | 49 |
The number of extra rooms available at admission ranges from 0 to 24, so the facility appears to have at least 24 rooms.
There are at most 10 staff members available at any one time, and 9 doctors.
Patients have an average of 3.5 visitors, with the range being between 0 and 32 visitors.
Patients pay an average deposit of $4,722.32 (assumed to be in dollars) on admission, with the range being between $1,654.01 and $10,104.73.
The average length of stay is 12.38 days, with the range being between 3 and 51 days.
General Observations:
- All numerical feature data is slightly skewed (as the median is not the same as the mean).
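As a quick check on that claim (a minimal sketch; positive values indicate a right-leaning tail):
# Skewness of the numerical columns (0 would be perfectly symmetric)
data[['Available Extra Rooms in Hospital', 'staff_available', 'Visitors with Patient',
      'Admission_Deposit', 'Stay (in days)']].skew()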
Data Duplication Checks
data.duplicated()
0 | |
---|---|
0 | False |
1 | False |
2 | False |
3 | False |
4 | False |
... | ... |
499995 | False |
499996 | False |
499997 | False |
499998 | False |
499999 | False |
500000 rows × 1 columns
data.duplicated().sum()
0
There are no duplicate rows to address.
Exploratory Data Analysis
# How many unique patients has the facility served
data['patientid'].nunique()
126399
The facility has served 126,399 unique patients.
Statistical Analysis
# Descriptive statistics for numerical features
data.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
Available Extra Rooms in Hospital | 500000.0 | 3.638800 | 2.698124 | 0.000000 | 2.000000 | 3.000000 | 4.000000 | 24.00000 |
staff_available | 500000.0 | 5.020470 | 3.158103 | 0.000000 | 2.000000 | 5.000000 | 8.000000 | 10.00000 |
patientid | 500000.0 | 63150.519058 | 41689.479956 | -3269.000000 | 25442.000000 | 57864.000000 | 103392.000000 | 134400.00000 |
Visitors with Patient | 500000.0 | 3.549414 | 2.241054 | 0.000000 | 2.000000 | 3.000000 | 4.000000 | 32.00000 |
Admission_Deposit | 500000.0 | 4722.315734 | 1047.324220 | 1654.005148 | 4071.714532 | 4627.003792 | 5091.612717 | 10104.72639 |
Stay (in days) | 500000.0 | 12.381062 | 7.913174 | 3.000000 | 8.000000 | 9.000000 | 11.000000 | 51.00000 |
Let's explore these variables in some more depth by observing their distributions
# Define numerical features
num_cols = ['Available Extra Rooms in Hospital', 'staff_available', 'Visitors with Patient', 'Admission_Deposit', 'Stay (in days)']
# Creating histograms
data[num_cols].hist(figsize=(10,10))
plt.show()
Observations:
Available Extra Rooms in Hospital:
- Most of the time, the facility has fewer than 5 extra rooms available, indicating high occupancy or a limited number of rooms. It's rare for this facility to have more than 10 extra rooms at any given time.
Staff Available:
- The number of staff available at admission is spread fairly evenly across the 0-10 range, so there is no single typical staffing level; the facility is about as likely to have no free staff as a full complement of 10.
Visitors with Patient:
- Most patients have fewer than 5 visitors, with the number of visitors sharply decreasing as the count goes higher. This indicates that it's uncommon for patients to receive many visitors, possibly due to facility policies or visitor preferences.
Admission Deposit:
- The majority of patients are required to deposit between 4000 and 6000 units of currency for admission. Higher deposits are rare, but there are occasional outliers with deposits up to 10,000, possibly reflecting different treatment levels or payment plans.
Stay (in days):
- The typical patient stay is under 10 days, with a sharp drop-off after that. However, there are cases of patients staying for up to 50 days, though these longer stays are uncommon. This could suggest that most treatments or hospitalizations are resolved relatively quickly, with only a few cases requiring prolonged care.
In summary, this facility operates with high patient turnover. Most patients deposit moderate amounts of money and stay for fewer than 10 days; longer stays and higher deposits are outliers.
Feature: Available Rooms
# Define a function to find unique values, counts and percentages associated with a feature
total_patients = data['Department'].count()
def calculate_percentage(df, column_name):
"""Calculates the percentage of each unique value in a given column.
Args:
df: The pandas DataFrame containing the data.
column_name: The name of the column to analyze.
Returns:
A pandas DataFrame with the percentage of each unique value.
"""
return (df[column_name].value_counts(1, dropna=False)
.reset_index(name='Percentage')
.rename(columns={'index': column_name})
.assign(Count=lambda x: (x['Percentage'] * total_patients).astype(int))
.assign(Percentage=lambda x: x['Percentage'].apply(lambda p: f'{p * 100:.2f}%'))
.sort_values(by='Count', ascending=False)
.loc[:, [column_name, 'Count', 'Percentage']]
)
# Find unique values, counts and percentages for 'Available Extra Rooms in Hospital'
calculate_percentage(data, 'Available Extra Rooms in Hospital')
Available Extra Rooms in Hospital | Count | Percentage | |
---|---|---|---|
0 | 3 | 145044 | 29.01% |
1 | 2 | 141205 | 28.24% |
2 | 4 | 114011 | 22.80% |
3 | 5 | 47644 | 9.53% |
4 | 6 | 15561 | 3.11% |
5 | 1 | 12194 | 2.44% |
6 | 7 | 4975 | 1.00% |
7 | 12 | 2844 | 0.57% |
8 | 24 | 2786 | 0.56% |
9 | 21 | 2377 | 0.48% |
10 | 13 | 1918 | 0.38% |
11 | 8 | 1750 | 0.35% |
12 | 11 | 1600 | 0.32% |
13 | 0 | 1573 | 0.31% |
14 | 10 | 1559 | 0.31% |
15 | 14 | 1265 | 0.25% |
16 | 20 | 1020 | 0.20% |
17 | 9 | 674 | 0.13% |
- The facility has zero extra rooms available (maximum occupancy) during only 0.31% of new intakes, and has its maximum observed availability of 24 extra rooms during 0.56% of new intakes.
- The majority of the time (just over 80%) the facility has between 2 and 4 available rooms.
Feature: Departments
# Find unique values, counts and percentages associated with this feature
calculate_percentage(data, 'Department')
Department | Count | Percentage | |
---|---|---|---|
0 | gynecology | 343478 | 68.70% |
1 | radiotherapy | 84315 | 16.86% |
2 | anesthesia | 44179 | 8.84% |
3 | TB & Chest disease | 22890 | 4.58% |
4 | surgery | 5138 | 1.03% |
The Gynecology department sees the most patients, at over 68%, while the Surgery department sees the fewest, at just over 1%.
Feature: Wards
# Find unique values, counts and percentages associated with this feature
calculate_percentage(data, 'Ward_Facility_Code')
Ward_Facility_Code | Count | Percentage | |
---|---|---|---|
0 | F | 120538 | 24.11% |
1 | D | 119055 | 23.81% |
2 | B | 103885 | 20.78% |
3 | E | 95374 | 19.07% |
4 | A | 46551 | 9.31% |
5 | C | 14597 | 2.92% |
Ward F sees the most patients, at just over 24%, while ward C sees the fewest, at just under 3%.
Feature: Doctors
# Find unique values, counts and percentages associated with this feature
calculate_percentage(data, 'doctor_name')
doctor_name | Count | Percentage | |
---|---|---|---|
0 | Dr Sarah | 99596 | 19.92% |
1 | Dr Olivia | 98352 | 19.67% |
2 | Dr Sophia | 74753 | 14.95% |
3 | Dr Nathan | 70777 | 14.16% |
4 | Dr Sam | 55711 | 11.14% |
5 | Dr John | 51263 | 10.25% |
6 | Dr Mark | 44410 | 8.88% |
7 | Dr Isaac | 3359 | 0.67% |
8 | Dr Simon | 1779 | 0.36% |
- Dr. Sarah is the busiest practitioner, seeing almost 20% of all patients.
- Dr. Simon is the least busy, seeing just 0.36%.
- Dr. Mark and Dr. John are the only two practitioners who work in more than one department.
The maximum number of visits for a single patient is 21; the minimum is 1.
Feature: Available Staff
We know from the statistical analysis above that this value ranges from 0 to 10.
# Find unique values, counts and percentages associated with this feature
calculate_percentage(data, 'staff_available')
staff_available | Count | Percentage | |
---|---|---|---|
0 | 6 | 46620 | 9.32% |
1 | 10 | 45768 | 9.15% |
2 | 7 | 45703 | 9.14% |
3 | 3 | 45658 | 9.13% |
4 | 8 | 45583 | 9.12% |
5 | 9 | 45530 | 9.11% |
6 | 4 | 45290 | 9.06% |
7 | 5 | 45215 | 9.04% |
8 | 0 | 45032 | 9.01% |
9 | 1 | 44931 | 8.99% |
10 | 2 | 44670 | 8.93% |
Staff availability at admission is close to uniformly distributed: the facility has its full complement of 10 staff available just over 9% of the time, and no staff available approximately 9% of the time.
Feature: Age Ranges
# Find unique values, counts and percentages associated with this feature
calculate_percentage(data, 'Age')
Age | Count | Percentage | |
---|---|---|---|
0 | 21-30 | 159793 | 31.96% |
1 | 31-40 | 133373 | 26.67% |
2 | 41-50 | 80406 | 16.08% |
3 | 11-20 | 46536 | 9.31% |
4 | 61-70 | 26556 | 5.31% |
5 | 51-60 | 21718 | 4.34% |
6 | 71-80 | 18703 | 3.74% |
7 | 81-90 | 8181 | 1.64% |
8 | 0-10 | 3368 | 0.67% |
9 | 91-100 | 1366 | 0.27% |
- Over 74% of admissions are for patients between the ages of 21 and 50.
- Very young patients (ages 0 to 10) account for only 0.67% of admissions.
- Elderly patients (ages 91 to 100) account for only 0.27% of admissions.
Feature: Gender
# Find unique values, counts and percentages associated with this feature
calculate_percentage(data, 'gender')
gender | Count | Percentage | |
---|---|---|---|
0 | Female | 370810 | 74.16% |
1 | Male | 103480 | 20.70% |
2 | Other | 25710 | 5.14% |
As expected, given that gynecology is by far the largest department, females make up the majority of admissions, at just over 74%.
Feature: Admission Type
# Find unique values, counts and percentages associated with this feature
calculate_percentage(data, 'Type of Admission')
Type of Admission | Count | Percentage | |
---|---|---|---|
0 | Trauma | 310536 | 62.11% |
1 | Emergency | 135784 | 27.16% |
2 | Urgent | 53680 | 10.74% |
Trauma accounts for just over 62% of admissions.
Feature: Severity of Illness
# Find unique values, counts and percentages associated with this feature
calculate_percentage(data, 'Severity of Illness')
Severity of Illness | Count | Percentage | |
---|---|---|---|
0 | Moderate | 280197 | 56.04% |
1 | Minor | 131537 | 26.31% |
2 | Extreme | 88266 | 17.65% |
Moderate illness is the most common severity of illness, at just over 56% of admissions.
Feature: Health Conditions
# Find unique values, counts and percentages associated with this feature
calculate_percentage(data, 'health_conditions')
health_conditions | Count | Percentage | |
---|---|---|---|
0 | NaN | 151888 | 30.38% |
1 | Other | 94411 | 18.88% |
2 | High Blood Pressure | 79402 | 15.88% |
3 | Diabetes | 73644 | 14.73% |
4 | Asthama | 65514 | 13.10% |
5 | Heart disease | 35141 | 7.03% |
- There are 5 health conditions in the dataset.
- Some data-cleanup is necessary to handle the "NaN" values - those would be equivalent to "none", accounting for just over 30% of admissions.
- "Other" conditions accounts for the most number of admissions at nearly 19%.
Feature: Visitors
# Find unique values, counts and percentages associated with this feature
calculate_percentage(data, 'Visitors with Patient')
Visitors with Patient | Count | Percentage | |
---|---|---|---|
0 | 2 | 204716 | 40.94% |
1 | 4 | 171986 | 34.40% |
2 | 3 | 53474 | 10.69% |
3 | 6 | 25930 | 5.19% |
4 | 5 | 12495 | 2.50% |
5 | 8 | 8633 | 1.73% |
6 | 7 | 3922 | 0.78% |
7 | 9 | 2909 | 0.58% |
8 | 10 | 2873 | 0.57% |
9 | 12 | 2729 | 0.55% |
10 | 1 | 2111 | 0.42% |
11 | 14 | 2107 | 0.42% |
12 | 11 | 1647 | 0.33% |
13 | 13 | 851 | 0.17% |
14 | 0 | 822 | 0.16% |
15 | 15 | 714 | 0.14% |
16 | 16 | 412 | 0.08% |
17 | 24 | 313 | 0.06% |
18 | 19 | 209 | 0.04% |
19 | 20 | 203 | 0.04% |
20 | 22 | 190 | 0.04% |
21 | 18 | 168 | 0.03% |
22 | 17 | 146 | 0.03% |
23 | 32 | 127 | 0.03% |
24 | 23 | 126 | 0.03% |
25 | 21 | 110 | 0.02% |
26 | 25 | 45 | 0.01% |
27 | 30 | 32 | 0.01% |
The majority of patients have between 2 and 4 visitors, just over 86%.
Feature: Insurance
# Find unique values, counts and percentages associated with this feature
calculate_percentage(data, 'Insurance')
Insurance | Count | Percentage | |
---|---|---|---|
0 | Yes | 392960 | 78.59% |
1 | No | 107040 | 21.41% |
Nearly 79% of patients have health insurance.
Feature: Admission Deposit
# Find unique values, counts and percentages associated with this feature
data['Admission_Deposit'].describe()
Admission_Deposit | |
---|---|
count | 500000.000000 |
mean | 4722.315734 |
std | 1047.324220 |
min | 1654.005148 |
25% | 4071.714532 |
50% | 4627.003792 |
75% | 5091.612717 |
max | 10104.726390 |
The mean deposit is approximately $4,722.
Feature: Stay (in days)
# Find unique values, counts and percentages associated with this feature
calculate_percentage(data, 'Stay (in days)')
Stay (in days) | Count | Percentage | |
---|---|---|---|
0 | 9 | 124110 | 24.82% |
1 | 8 | 113462 | 22.69% |
2 | 10 | 53854 | 10.77% |
3 | 7 | 42292 | 8.46% |
4 | 6 | 21559 | 4.31% |
5 | 11 | 13577 | 2.72% |
6 | 5 | 9794 | 1.96% |
7 | 22 | 8893 | 1.78% |
8 | 23 | 8713 | 1.74% |
9 | 24 | 8299 | 1.66% |
10 | 21 | 7509 | 1.50% |
11 | 25 | 7128 | 1.43% |
12 | 20 | 6112 | 1.22% |
13 | 26 | 5777 | 1.16% |
14 | 19 | 5157 | 1.03% |
15 | 27 | 4625 | 0.92% |
16 | 18 | 4603 | 0.92% |
17 | 28 | 4203 | 0.84% |
18 | 29 | 3929 | 0.79% |
19 | 30 | 3764 | 0.75% |
20 | 32 | 3761 | 0.75% |
21 | 17 | 3735 | 0.75% |
22 | 31 | 3631 | 0.73% |
23 | 12 | 3431 | 0.69% |
24 | 33 | 3411 | 0.68% |
25 | 34 | 3129 | 0.63% |
26 | 35 | 2768 | 0.55% |
27 | 16 | 2756 | 0.55% |
28 | 36 | 2373 | 0.47% |
29 | 37 | 1933 | 0.39% |
30 | 15 | 1901 | 0.38% |
31 | 13 | 1650 | 0.33% |
32 | 38 | 1465 | 0.29% |
33 | 14 | 1374 | 0.27% |
34 | 4 | 1289 | 0.26% |
35 | 39 | 1122 | 0.22% |
36 | 40 | 868 | 0.17% |
37 | 41 | 648 | 0.13% |
38 | 42 | 433 | 0.09% |
39 | 43 | 328 | 0.07% |
40 | 44 | 218 | 0.04% |
41 | 45 | 172 | 0.03% |
42 | 46 | 119 | 0.02% |
43 | 47 | 55 | 0.01% |
44 | 48 | 36 | 0.01% |
45 | 3 | 18 | 0.00% |
46 | 49 | 11 | 0.00% |
47 | 50 | 3 | 0.00% |
48 | 51 | 2 | 0.00% |
# Bin the 'Stay (in days)' column into labeled ranges
data['Stay_bins'] = pd.cut(data['Stay (in days)'],
bins=[0, 2, 5, 7, 10, 13, 16, 19, 50, np.inf],
labels=['0-2', '3-5', '6-7', '8-10', '11-13', '14-16', '17-19', '20-50', '51+'],
include_lowest=True)
# Calculate the percentage of patients in each bin
bin_percentages = data['Stay_bins'].value_counts(normalize=True) * 100
# Get the counts of patients in each bin
bin_counts = data['Stay_bins'].value_counts()
# Concatenate the counts and percentages
bin_counts = pd.concat([bin_counts, bin_percentages], axis=1)
bin_counts.columns = ['Count', 'Percentage']
# Add percentage symbol to Percentage column
bin_counts['Percentage'] = bin_counts['Percentage'].apply(lambda x: f'{x:.2f}%')
# Order the data by labels
desired_order = ['0-2', '3-5', '6-7', '8-10', '11-13', '14-16', '17-19', '20-50', '51+']
bin_counts = bin_counts.reindex(desired_order)
# Print the results
print(bin_counts)
Count Percentage Stay_bins 0-2 0 0.00% 3-5 11101 2.22% 6-7 63851 12.77% 8-10 291426 58.29% 11-13 18658 3.73% 14-16 6031 1.21% 17-19 13495 2.70% 20-50 95436 19.09% 51+ 2 0.00%
The Length of Stay for the majority of patients is between 8 and 10 days, at just over 58% of patients, with just over 73% staying 10 days or less.
Observations
- Data Cleanup:
- There are a lot of inconsistencies in how the columns have been named, hinting at sloppy design; it might make sense for us to clean that up.
- There are 151,888 values recorded as NaN in the health_conditions field that will need to be investigated and addressed.
- The statistical description suggests there are patient IDs with negative values, but we will not need to worry about that, as patientid is only an identifier, will not be helpful in further analysis, and will be dropped.
- Facility:
- At times the minimum number of available beds is 0, so capacity might need to be examined.
- At times the number of available staff is 0, so staffing might need to be examined.
- There is a wide range in the number of visitors per patient, so in-room guest accommodations, or limiting the number of visitors, might need to be examined.
data.columns
Index(['Available Extra Rooms in Hospital', 'Department', 'Ward_Facility_Code', 'doctor_name', 'staff_available', 'patientid', 'Age', 'gender', 'Type of Admission', 'Severity of Illness', 'health_conditions', 'Visitors with Patient', 'Insurance', 'Admission_Deposit', 'Stay (in days)', 'Stay_bins'], dtype='object')
# We MUST drop the "Stay_bins" as this is something we appended for LOS determinations - else it will mess with our decision tree
#data.drop('Stay_bins', axis=1, inplace=True)
#data.drop('Stay_bins_3-5', axis=1, inplace=True)
#data.drop('Stay_bins_6-7', axis=1, inplace=True)
#data.drop('Stay_bins_8-10', axis=1, inplace=True)
#data.drop('Stay_bins_11-13', axis=1, inplace=True)
#data.drop('Stay_bins_14-16', axis=1, inplace=True)
#data.drop('Stay_bins_17-19', axis=1, inplace=True)
#data.drop('Stay_bins_20-50', axis=1, inplace=True)
#data.drop('Stay_bins_51+', axis=1, inplace=True)
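If we do decide to exclude the binned stay columns before modeling (they are derived from the target, so keeping them leaks LOS information into the models), a pattern-based drop avoids listing every dummy column by hand. This is a sketch only and, like the drops above, is not applied here:
# Sketch (not run here): drop the raw bin column and any one-hot dummies created from it
# data = data.loc[:, ~data.columns.str.startswith('Stay_bins')]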
Data Cleanup
Health Conditions:
# Handle the missing values in the health_conditions column
data['health_conditions'].fillna('None', inplace=True)
# Make sure all missing values have been replaced
data['health_conditions'].isnull().sum()
data['health_conditions'].unique()
array(['Diabetes', 'Heart disease', 'None', 'Other', 'Asthama', 'High Blood Pressure'], dtype=object)
# Let's make sure this corrected the missing value counts
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 500000 entries, 0 to 499999 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Available Extra Rooms in Hospital 500000 non-null int64 1 Department 500000 non-null object 2 Ward_Facility_Code 500000 non-null object 3 doctor_name 500000 non-null object 4 staff_available 500000 non-null int64 5 patientid 500000 non-null int64 6 Age 500000 non-null object 7 gender 500000 non-null object 8 Type of Admission 500000 non-null object 9 Severity of Illness 500000 non-null object 10 health_conditions 500000 non-null object 11 Visitors with Patient 500000 non-null int64 12 Insurance 500000 non-null object 13 Admission_Deposit 500000 non-null float64 14 Stay (in days) 500000 non-null int64 15 Stay_bins 500000 non-null category dtypes: category(1), float64(1), int64(5), object(9) memory usage: 57.7+ MB
PatientID:
# The statistical description suggests there might be a patient id with a negative value, so data cleanup may be needed
# Find any patient ID that is <=0
data[data['patientid'] <= 0]
Available Extra Rooms in Hospital | Department | Ward_Facility_Code | doctor_name | staff_available | patientid | Age | gender | Type of Admission | Severity of Illness | health_conditions | Visitors with Patient | Insurance | Admission_Deposit | Stay (in days) | Stay_bins | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2238 | 1 | gynecology | D | Dr Sarah | 0 | -914 | 11-20 | Female | Trauma | Moderate | High Blood Pressure | 6 | Yes | 6778.130714 | 8 | 8-10 |
2977 | 2 | anesthesia | A | Dr Mark | 6 | -842 | 51-60 | Male | Trauma | Moderate | None | 2 | Yes | 4814.433104 | 41 | 20-50 |
4037 | 4 | anesthesia | A | Dr Mark | 2 | -1046 | 21-30 | Male | Trauma | Extreme | Heart disease | 4 | Yes | 4440.225183 | 19 | 17-19 |
5354 | 3 | gynecology | F | Dr Sophia | 9 | -928 | 31-40 | Female | Trauma | Moderate | Heart disease | 3 | Yes | 8376.323315 | 8 | 8-10 |
5682 | 2 | gynecology | D | Dr Nathan | 8 | -61 | 41-50 | Female | Emergency | Minor | None | 4 | Yes | 4798.440801 | 8 | 8-10 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
496609 | 3 | gynecology | B | Dr Nathan | 2 | -751 | 31-40 | Female | Emergency | Minor | Other | 4 | No | 3674.268067 | 8 | 8-10 |
497094 | 5 | gynecology | B | Dr Sarah | 9 | -569 | 41-50 | Female | Trauma | Extreme | Asthama | 4 | Yes | 3172.891233 | 9 | 8-10 |
497200 | 6 | gynecology | D | Dr Sarah | 3 | -588 | 11-20 | Female | Emergency | Minor | High Blood Pressure | 4 | No | 4581.742646 | 7 | 6-7 |
498190 | 2 | anesthesia | E | Dr John | 10 | -55 | 81-90 | Female | Trauma | Moderate | Diabetes | 2 | No | 4624.389375 | 32 | 20-50 |
498341 | 4 | gynecology | F | Dr Sarah | 8 | -739 | 11-20 | Female | Emergency | Moderate | High Blood Pressure | 2 | Yes | 4042.058382 | 9 | 8-10 |
892 rows × 16 columns
Since patientid is just a record identifier (a database key), it will not be useful in our analysis and should be dropped.
# Drop the patientid column
data.drop('patientid', axis=1, inplace=True)
data.head(2)
Available Extra Rooms in Hospital | Department | Ward_Facility_Code | doctor_name | staff_available | Age | gender | Type of Admission | Severity of Illness | health_conditions | Visitors with Patient | Insurance | Admission_Deposit | Stay (in days) | Stay_bins | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 4 | gynecology | D | Dr Sophia | 0 | 41-50 | Female | Trauma | Extreme | Diabetes | 4 | Yes | 2966.408696 | 8 | 8-10 |
1 | 4 | gynecology | B | Dr Sophia | 2 | 31-40 | Female | Trauma | Minor | Heart disease | 2 | No | 3554.835677 | 9 | 8-10 |
Univariate Analysis - Numerical Features
# Function to plot a boxplot and a histogram along the same scale
def histogram_boxplot(data, feature, figsize=(12, 7), kde=False, bins=None):
"""
Boxplot and histogram combined
data: dataframe
feature: dataframe column
figsize: size of figure (default (12,7))
kde: whether to show the density curve (default False)
bins: number of bins for histogram (default None)
"""
f2, (ax_box2, ax_hist2) = plt.subplots(
nrows = 2, # Number of rows of the subplot grid = 2
sharex = True, # x-axis will be shared among all subplots
gridspec_kw = {"height_ratios": (0.25, 0.75)},
figsize = figsize,
) # Creating the 2 subplots
sns.boxplot(data = data, x = feature, ax = ax_box2, showmeans = True, color = "violet"
) # Boxplot will be created and a star will indicate the mean value of the column
sns.histplot(
data = data, x = feature, kde = kde, ax = ax_hist2, bins = bins, palette = "winter"
) if bins else sns.histplot(
data = data, x = feature, kde = kde, ax = ax_hist2
) # For histogram
ax_hist2.axvline(
data[feature].mean(), color = "green", linestyle = "--"
) # Add mean to the histogram
ax_hist2.axvline(
data[feature].median(), color = "black", linestyle = "-"
) # Add median to the histogram
Feature: Available Rooms
histogram_boxplot(data, "Available Extra Rooms in Hospital", kde = True, bins = 24)
Observations:
Feature: Available Staff
histogram_boxplot(data, "staff_available", kde = True, bins = 5)
Observations:
- The data looks to be fairly evenly distributed about the mean with no outliers.
Feature: Visitors
histogram_boxplot(data, "Visitors with Patient", kde = True, bins = 20)
Observations
- The majority of patients have 5 or fewer visitors.
Feature: Admission Deposit
histogram_boxplot(data, "Admission_Deposit", kde = True, bins = 20)
Observations:
- The majority of admission deposits are clustered around 4,000 to 5,500, with a few exceptionally high values acting as outliers, skewing the distribution slightly to the right.
Feature: Length of Stay
histogram_boxplot(data, "Stay (in days)", kde = True, bins = 30)
Observations:
- Fewer patients are staying more than 10 days in the hospital and very few stay for more than 40 days. This might be because the majority of patients are admitted for moderate or minor illnesses.
- The peak of the distribution shows that most of the patients stay for 8-9 days in the hospital.
Univariate Analysis - Categorical Features
# Define categorical features
cat_cols = ['Department', 'Ward_Facility_Code', 'doctor_name', 'Age', 'gender', 'Type of Admission', 'Severity of Illness', 'health_conditions', 'Insurance']
# Printing the % sub categories of each category
for i in cat_cols:
print(data[i].value_counts(normalize=True))
print('*'*40)
Department gynecology 0.686956 radiotherapy 0.168630 anesthesia 0.088358 TB & Chest disease 0.045780 surgery 0.010276 Name: proportion, dtype: float64 **************************************** Ward_Facility_Code F 0.241076 D 0.238110 B 0.207770 E 0.190748 A 0.093102 C 0.029194 Name: proportion, dtype: float64 **************************************** doctor_name Dr Sarah 0.199192 Dr Olivia 0.196704 Dr Sophia 0.149506 Dr Nathan 0.141554 Dr Sam 0.111422 Dr John 0.102526 Dr Mark 0.088820 Dr Isaac 0.006718 Dr Simon 0.003558 Name: proportion, dtype: float64 **************************************** Age 21-30 0.319586 31-40 0.266746 41-50 0.160812 11-20 0.093072 61-70 0.053112 51-60 0.043436 71-80 0.037406 81-90 0.016362 0-10 0.006736 91-100 0.002732 Name: proportion, dtype: float64 **************************************** gender Female 0.74162 Male 0.20696 Other 0.05142 Name: proportion, dtype: float64 **************************************** Type of Admission Trauma 0.621072 Emergency 0.271568 Urgent 0.107360 Name: proportion, dtype: float64 **************************************** Severity of Illness Moderate 0.560394 Minor 0.263074 Extreme 0.176532 Name: proportion, dtype: float64 **************************************** health_conditions None 0.303776 Other 0.188822 High Blood Pressure 0.158804 Diabetes 0.147288 Asthama 0.131028 Heart disease 0.070282 Name: proportion, dtype: float64 **************************************** Insurance Yes 0.78592 No 0.21408 Name: proportion, dtype: float64 ****************************************
Observations:
Here's an analysis of the categorical data from this facility:
Doctor Distribution:
- The top three doctorsāDr. Sarah, Dr. Olivia, and Dr. Sophiaāaccount for more than 50% of the cases, indicating that they handle the majority of patients.
- The remaining doctors see progressively fewer patients, with Dr. Isaac and Dr. Simon managing the least, suggesting a strong concentration of patient assignments among the top doctors.
Age Distribution:
- Most patients (over 58%) fall within the 21-40 age range, indicating that this is the primary age group served by the facility.
- Older age groups (60 and above) make up a much smaller proportion of the patient base, collectively accounting for less than 15%.
- The very young (0-10) and elderly (91-100) are rare in this facility, suggesting it may focus on adult and middle-aged patients.
Gender Distribution:
- The facility predominantly treats female patients, who account for about 74% of the total, while male patients make up around 21%.
- A small proportion (around 5%) identify as "Other", highlighting inclusivity in gender categories, though they form a minor segment of the patient base.
Type of Admission:
- A significant majority of admissions (62%) are trauma-related, suggesting that the facility specializes in or frequently handles trauma cases.
- Emergency cases make up about 27% of admissions, while urgent, non-emergency cases are the least common at 10%.
Severity of Illness:
- Over half the cases (56%) are categorized as "Moderate" in terms of severity, indicating that the facility deals with a high number of patients with moderately serious conditions.
- Minor cases make up about 26%, while extreme cases account for around 18%. This suggests the facility handles a wide range of illness severities, but most cases are not extremely severe.
Health Conditions:
- About 30% of patients report no health conditions, which could indicate either a focus on acute or trauma care where pre-existing conditions are less common, or that some patients are generally healthy before admission.
- The most common conditions among patients are "Other" unspecified conditions (18.9%), followed by high blood pressure (15.9%) and diabetes (14.7%).
- Asthma (13.1%) and heart disease (7%) are less common but still notable conditions in the patient population.
Insurance Status:
- The vast majority of patients (78.6%) have insurance, which suggests that the facility primarily caters to insured individuals.
- A smaller proportion of patients (21.4%) are uninsured, potentially indicating a reliance on insurance coverage for treatment or a clientele that is mostly able to afford health insurance.
Summary
This facility seems to handle a large number of trauma and emergency cases, mostly for moderately severe illnesses. The patient base is predominantly female and within the 21-40 age group, and most patients have insurance. While many patients have no major pre-existing conditions, high blood pressure, diabetes, and asthma are common among those who do. The workload is concentrated among a few doctors, with most patients treated by a small number of physicians.
Multivariate Analysis
Correlation Matrix of Numerical Features
# Correlation matrix of numerical variables
# Select only numerical columns
numerical_data = data.select_dtypes(include=['float64', 'int64'])
# Calculate the correlation matrix
numerical_data.corr()
Available Extra Rooms in Hospital | staff_available | Visitors with Patient | Admission_Deposit | Stay (in days) | |
---|---|---|---|---|---|
Available Extra Rooms in Hospital | 1.000000 | -0.001784 | 0.070459 | -0.050127 | -0.019219 |
staff_available | -0.001784 | 1.000000 | 0.000578 | 0.000763 | 0.007398 |
Visitors with Patient | 0.070459 | 0.000578 | 1.000000 | -0.069043 | 0.027302 |
Admission_Deposit | -0.050127 | 0.000763 | -0.069043 | 1.000000 | 0.044203 |
Stay (in days) | -0.019219 | 0.007398 | 0.027302 | 0.044203 | 1.000000 |
# Plot the correlation matrix
corr_matrix = numerical_data.corr()
# Plot the heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='vlag')
plt.title('Correlation Matrix of Numerical Variables')
plt.show()
Observations:
Available Extra Rooms in Hospital:
- Has a very slight negative correlation with both Admission_Deposit (-0.05) and Stay in days (-0.019), meaning that an increase in available rooms doesn't significantly influence these variables.
- It has a weak positive correlation with Visitors with Patient (0.07), suggesting a slight relationship between available rooms and the number of visitors.
Staff Available:
- Staff_Available shows almost no correlation with the other variables. The highest correlation is with Stay in days (0.0074), but this is negligible.
- Overall, staff available appears to be mostly independent of the other variables in this matrix.
Visitors with Patient:
- Shows a small negative correlation with Admission_Deposit (-0.069), meaning that more visitors might be associated with lower deposits, but this relationship is very weak.
- The other correlations, like with Stay in days (0.027), are close to zero, indicating almost no significant relationship.
Admission Deposit:
- There's a small positive correlation between Admission_Deposit and Stay in days (0.044). This suggests a slight tendency for higher admission deposits to be associated with longer stays, but it's not a strong relationship.
- The other correlations are either weak or near zero, implying little to no direct relationship with the other factors.
Stay in Days:
- Has a weak positive correlation with Admission_Deposit (0.044) and Visitors with Patient (0.027), though these relationships are minimal.
- It has almost no significant correlation with other variables, like Available Extra Rooms in Hospital (-0.019).
Summary:
- The matrix shows generally weak correlations among the variables, with most values close to 0.
- There are no strong relationships between the numerical variables, meaning these variables are largely independent of each other in terms of linear association.
- The highest correlation, 0.07, between Available Extra Rooms and Visitors with Patient, is still very weak, indicating that no two variables are highly linearly related in this dataset.
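Since Pearson coefficients only capture linear association, a rank-based check is a quick complement (a minimal sketch, not part of the original analysis); if the Spearman coefficients are also near zero, there are no strong monotonic relationships either.
# Spearman (rank) correlation of the same numerical columns
numerical_data.corr(method='spearman')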
# Function to plot stacked bar plots
def stacked_barplot(data, predictor, target):
"""
Print the category counts and plot a stacked bar chart
data: dataframe
predictor: independent variable
target: target variable
"""
count = data[predictor].nunique()
sorter = data[target].value_counts().index[-1]
tab1 = pd.crosstab(data[predictor], data[target], margins = True).sort_values(
by = sorter, ascending = False
)
print(tab1)
print("-" * 120)
tab = pd.crosstab(data[predictor], data[target], normalize = "index").sort_values(
by = sorter, ascending = False
)
tab.plot(kind = "bar", stacked = True, figsize = (count + 1, 5))
plt.legend(loc = "upper left", bbox_to_anchor = (1, 1))
plt.show()
# Function to calculate and plot average LOS by feature
def plot_avg_stay(data, group_by_column, target_column):
"""
Plots the average stay for a target column grouped by a specified column.
Args:
data: The pandas DataFrame containing the data.
group_by_column: The column to group by.
target_column: The target column for calculating average stay.
"""
avg_stay = data.groupby(group_by_column)[target_column].mean().sort_values(ascending=True)
sns.barplot(y=group_by_column, x=target_column, data=data, order=avg_stay.index)
plt.show()
# For cases where the plot needs to have its axis swapped
def plot_avg_stay2(data, group_by_column, target_column):
"""
Plots the average stay for a target column grouped by a specified column.
Args:
data: The pandas DataFrame containing the data.
group_by_column: The column to group by.
target_column: The target column for calculating average stay.
"""
avg_stay = data.groupby(group_by_column)[target_column].mean().sort_values(ascending=True)
sns.barplot(x=group_by_column, y=target_column, data=data, order=avg_stay.index)
plt.show()
Target Variable: Length of Stay
Length of Stay by Department
plot_avg_stay(data, 'Department', 'Stay (in days)')
stacked_barplot(data, 'Department', 'Ward_Facility_Code')
Ward_Facility_Code A B C D E F All Department All 46551 103885 14597 119055 95374 120538 500000 radiotherapy 21093 0 9079 0 54143 0 84315 anesthesia 15611 0 4199 0 24369 0 44179 TB & Chest disease 4709 0 1319 0 16862 0 22890 gynecology 0 103885 0 119055 0 120538 343478 surgery 5138 0 0 0 0 0 5138 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(data, 'Department', 'doctor_name')
doctor_name Dr Isaac Dr John Dr Mark Dr Nathan Dr Olivia Dr Sam \ Department surgery 3359 0 0 0 0 0 All 3359 51263 44410 70777 98352 55711 anesthesia 0 14920 29259 0 0 0 TB & Chest disease 0 7739 15151 0 0 0 radiotherapy 0 28604 0 0 0 55711 gynecology 0 0 0 70777 98352 0 doctor_name Dr Sarah Dr Simon Dr Sophia All Department surgery 0 1779 0 5138 All 99596 1779 74753 500000 anesthesia 0 0 0 44179 TB & Chest disease 0 0 0 22890 radiotherapy 0 0 0 84315 gynecology 99596 0 74753 343478 ------------------------------------------------------------------------------------------------------------------------
Length of Stay by Ward
plot_avg_stay(data, 'Ward_Facility_Code', 'Stay (in days)')
stacked_barplot(data, 'Ward_Facility_Code', 'Department')
Department TB & Chest disease anesthesia gynecology radiotherapy \ Ward_Facility_Code A 4709 15611 0 21093 All 22890 44179 343478 84315 B 0 0 103885 0 C 1319 4199 0 9079 D 0 0 119055 0 E 16862 24369 0 54143 F 0 0 120538 0 Department surgery All Ward_Facility_Code A 5138 46551 All 5138 500000 B 0 103885 C 0 14597 D 0 119055 E 0 95374 F 0 120538 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(data, 'Ward_Facility_Code', 'Severity of Illness')
Severity of Illness Extreme Minor Moderate All Ward_Facility_Code All 88266 131537 280197 500000 D 29549 27220 62286 119055 B 24222 23579 56084 103885 A 13662 7877 25012 46551 E 11488 22254 61632 95374 F 5842 47594 67102 120538 C 3503 3013 8081 14597 ------------------------------------------------------------------------------------------------------------------------
Observations:
- Wards B, D and F all have lower LOS and are used exclusively for the Gynecology department.
- Surgical patients are admitted to ward A only.
- Ward A has the highest number of extreme cases.
Length of Stay by Doctor
plot_avg_stay(data, 'doctor_name', 'Stay (in days)')
stacked_barplot(data, 'doctor_name', 'Department')
Department TB & Chest disease anesthesia gynecology radiotherapy \ doctor_name All 22890 44179 343478 84315 Dr Isaac 0 0 0 0 Dr Simon 0 0 0 0 Dr John 7739 14920 0 28604 Dr Mark 15151 29259 0 0 Dr Nathan 0 0 70777 0 Dr Sam 0 0 0 55711 Dr Olivia 0 0 98352 0 Dr Sarah 0 0 99596 0 Dr Sophia 0 0 74753 0 Department surgery All doctor_name All 5138 500000 Dr Isaac 3359 3359 Dr Simon 1779 1779 Dr John 0 51263 Dr Mark 0 44410 Dr Nathan 0 70777 Dr Sam 0 55711 Dr Olivia 0 98352 Dr Sarah 0 99596 Dr Sophia 0 74753 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(data, 'doctor_name', 'Ward_Facility_Code')
Ward_Facility_Code A B C D E F All doctor_name All 46551 103885 14597 119055 95374 120538 500000 Dr Sam 13889 0 6029 0 35793 0 55711 Dr John 14041 0 4888 0 32334 0 51263 Dr Mark 13483 0 3680 0 27247 0 44410 Dr Isaac 3359 0 0 0 0 0 3359 Dr Nathan 0 17317 0 27035 0 26425 70777 Dr Olivia 0 33761 0 31598 0 32993 98352 Dr Sarah 0 29983 0 34565 0 35048 99596 Dr Simon 1779 0 0 0 0 0 1779 Dr Sophia 0 22824 0 25857 0 26072 74753 ------------------------------------------------------------------------------------------------------------------------
Observations:
Length of Stay by Available Staff
plot_avg_stay(data, 'staff_available', 'Stay (in days)')
# sort x axis desc
Observations:
Length of Stay by Age
# Sort by age range so the categories appear in order on the plot
data.sort_values(by='Age', ascending=False, inplace=True)
sns.barplot(y='Age', x='Stay (in days)', data=data)
plt.show()
Observations:
Length of stay by Gender
plot_avg_stay(data, 'gender', 'Stay (in days)')
Observations:
Length of stay by Type of Admission
plot_avg_stay(data, 'Type of Admission', 'Stay (in days)')
Observations:
Length of stay by Severity of Illness
plot_avg_stay(data, 'Severity of Illness', 'Stay (in days)')
Observations:
Length of stay by Health Conditions
plot_avg_stay(data, 'health_conditions', 'Stay (in days)')
Observations:
Length of stay by Visitors
plot_avg_stay(data, 'Visitors with Patient', 'Stay (in days)')
Observations:
Length of stay by Insurance
plot_avg_stay(data, 'Insurance', 'Stay (in days)')
Observations:
Length of stay by Admission Deposit
# This is too costly to run in terms of compute power
#plot_avg_stay(data, 'Admission_Deposit', 'Stay (in days)')
Observations:
Data Preparation
Encode Categorical Data
# Perform one-hot encoding
data = pd.get_dummies(
data,
columns = data.select_dtypes(include = ['object', 'category']).columns,
drop_first = True,
dtype = int
)
Split Data
x = data.drop('Stay (in days)', axis=1)
y = data['Stay (in days)']
# Import additional libraries
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Perform the data split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=True, random_state=1)
# Verify the split
x_train.shape, x_test.shape, y_train.shape, y_test.shape
((400000, 50), (100000, 50), (400000,), (100000,))
Performance Metrics
# Define a function to calculate the adjusted r2
def adj_r2(predictors, target, predictions):
r2 = r2_score(target, predictions)
n = predictors.shape[0]
k = predictors.shape[1]
return 1-((1-r2)*(n-1)/(n-k-1))
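For reference, the adjusted R² computed above is $\bar{R}^2 = 1 - (1 - R^2)\,\frac{n-1}{n-k-1}$, where $n$ is the number of observations and $k$ the number of predictors; it penalizes R² for features that do not improve the fit.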
# Define a function to calculate the mean absolute percentage error (MAPE)
def mape_score(target, predictions):
return np.mean(np.abs((target - predictions) / target)) * 100
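Likewise, the MAPE computed above is $\mathrm{MAPE} = \frac{100}{n}\sum_{i=1}^{n}\left|\frac{y_i - \hat{y}_i}{y_i}\right|$, i.e. the average absolute error expressed as a percentage of the actual length of stay.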
# Compute regression metrics for a fitted model on the given data and return them as a one-row DataFrame
def model_performance_regression(model, predictors, target):
pred = model.predict(predictors)
r2 = r2_score(target, pred)
adjusted_r2 = adj_r2(predictors, target, pred)
rmse = np.sqrt(mean_squared_error(target, pred))
mae = mean_absolute_error(target, pred)
mape = mape_score(target, pred)
df_perf = pd.DataFrame({
'RMSE': rmse,
'MAE': mae,
'R2': r2,
'Adjusted R2': adjusted_r2,
'MAPE': mape
}, index=[0])
return df_perf
Decision Tree Regression
Decision trees are a popular method in machine learning for both classification and regression tasks. They are known for their interpretability and simplicity: the model predicts the value of the target variable by learning simple decision rules from the input features.
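To make the regression case concrete, here is a minimal sketch (on synthetic values, not the hospital data; the helper best_split_1d is purely illustrative) of how a regression tree chooses a split threshold for a single feature: it tries candidate thresholds and keeps the one that minimizes the weighted MSE (variance) of the two resulting groups.
# Illustrative only: pick the best binary split on one feature by minimizing weighted MSE
def best_split_1d(x, y):
    x, y = np.asarray(x, dtype=float), np.asarray(y, dtype=float)
    order = np.argsort(x)
    x_sorted, y_sorted = x[order], y[order]
    best_thr, best_score = None, np.inf
    for i in range(1, len(x_sorted)):
        if x_sorted[i] == x_sorted[i - 1]:
            continue  # identical values give no usable threshold
        left, right = y_sorted[:i], y_sorted[i:]
        score = (left.var() * len(left) + right.var() * len(right)) / len(y_sorted)
        if score < best_score:
            best_thr, best_score = (x_sorted[i] + x_sorted[i - 1]) / 2, score
    return best_thr, best_score

# Toy example: the target jumps at x = 5, so the best threshold should land near 5
rng = np.random.default_rng(1)
x_demo = rng.uniform(0, 10, 200)
y_demo = np.where(x_demo > 5, 20, 8) + rng.normal(0, 1, 200)
print(best_split_1d(x_demo, y_demo))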
# Create a regressor
dt_regressor = DecisionTreeRegressor(random_state=1)
dt_regressor.fit(x_train, y_train)
DecisionTreeRegressor(random_state=1)
# Evaluate the regressor
dt_regressor_perf_test = model_performance_regression(dt_regressor, x_test, y_test)
dt_regressor_perf_test
RMSE | MAE | R2 | Adjusted R2 | MAPE | |
---|---|---|---|---|---|
0 | 1.588745 | 0.85419 | 0.959936 | 0.959916 | 6.213695 |
# Let's visualize the tree
from sklearn import tree
features = list(x.columns)
features
['Available Extra Rooms in Hospital', 'staff_available', 'Visitors with Patient', 'Admission_Deposit', 'Department_anesthesia', 'Department_gynecology', 'Department_radiotherapy', 'Department_surgery', 'Ward_Facility_Code_B', 'Ward_Facility_Code_C', 'Ward_Facility_Code_D', 'Ward_Facility_Code_E', 'Ward_Facility_Code_F', 'doctor_name_Dr John', 'doctor_name_Dr Mark', 'doctor_name_Dr Nathan', 'doctor_name_Dr Olivia', 'doctor_name_Dr Sam', 'doctor_name_Dr Sarah', 'doctor_name_Dr Simon', 'doctor_name_Dr Sophia', 'Age_11-20', 'Age_21-30', 'Age_31-40', 'Age_41-50', 'Age_51-60', 'Age_61-70', 'Age_71-80', 'Age_81-90', 'Age_91-100', 'gender_Male', 'gender_Other', 'Type of Admission_Trauma', 'Type of Admission_Urgent', 'Severity of Illness_Minor', 'Severity of Illness_Moderate', 'health_conditions_Diabetes', 'health_conditions_Heart disease', 'health_conditions_High Blood Pressure', 'health_conditions_None', 'health_conditions_Other', 'Insurance_Yes', 'Stay_bins_3-5', 'Stay_bins_6-7', 'Stay_bins_8-10', 'Stay_bins_11-13', 'Stay_bins_14-16', 'Stay_bins_17-19', 'Stay_bins_20-50', 'Stay_bins_51+']
# Train a shallow tree to aid in visualization
dt_regressor_shallow = DecisionTreeRegressor(max_depth=3, random_state=1)
dt_regressor_shallow.fit(x_train, y_train)
DecisionTreeRegressor(max_depth=3, random_state=1)
# Evaluate the regressor
dt_regressor_shallow_test = model_performance_regression(dt_regressor_shallow, x_test, y_test)
dt_regressor_shallow_test
RMSE | MAE | R2 | Adjusted R2 | MAPE | |
---|---|---|---|---|---|
0 | 2.117114 | 1.242283 | 0.928857 | 0.928822 | 10.18492 |
# Visualize the tree
plt.figure(figsize=(20, 20))
tree.plot_tree(dt_regressor_shallow, feature_names=features, filled=True, fontsize=12, node_ids=True, class_names=True)
plt.show()
# There is another way to visualize this as text
print(tree.export_text(dt_regressor_shallow, feature_names=features, show_weights=True))
|--- Stay_bins_20-50 <= 0.50 | |--- Stay_bins_17-19 <= 0.50 | | |--- Stay_bins_6-7 <= 0.50 | | | |--- value: [8.93] | | |--- Stay_bins_6-7 > 0.50 | | | |--- value: [6.66] | |--- Stay_bins_17-19 > 0.50 | | |--- Department_radiotherapy <= 0.50 | | | |--- value: [17.94] | | |--- Department_radiotherapy > 0.50 | | | |--- value: [18.37] |--- Stay_bins_20-50 > 0.50 | |--- Department_radiotherapy <= 0.50 | | |--- Age_31-40 <= 0.50 | | | |--- value: [31.35] | | |--- Age_31-40 > 0.50 | | | |--- value: [20.87] | |--- Department_radiotherapy > 0.50 | | |--- Visitors with Patient <= 3.50 | | | |--- value: [22.95] | | |--- Visitors with Patient > 3.50 | | | |--- value: [23.61]
Ensemble Learning Methods
Bagging (Bootstrap Aggregating)
An ensemble learning method that improves the stability and accuracy of machine learning models by combining the predictions of multiple decision trees (10 by default in scikit-learn). It is primarily used to reduce variance and prevent overfitting, especially for high-variance models like decision trees.
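The cell below fits the bagging regressor with its defaults; for reference, the main knobs are the ensemble size and the fraction of rows drawn for each bootstrap sample. A construct-only sketch (the values shown are illustrative, and it is deliberately not fit here to avoid duplicating the training run):
# Illustrative only: 50 trees, each trained on an 80% bootstrap sample of the rows
from sklearn.ensemble import BaggingRegressor
bagging_custom = BaggingRegressor(n_estimators=50, max_samples=0.8, random_state=1)
# bagging_custom.fit(x_train, y_train)  # would be fit exactly like the default version below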
# bagging
bagging_estimator = BaggingRegressor(random_state=1)
bagging_estimator.fit(x_train, y_train)
BaggingRegressor(random_state=1)
# Calculate the performance
bagging_estimator_perf_test = model_performance_regression(bagging_estimator, x_test, y_test)
bagging_estimator_perf_test
RMSE | MAE | R2 | Adjusted R2 | MAPE | |
---|---|---|---|---|---|
0 | 1.196142 | 0.721663 | 0.977291 | 0.977279 | 5.538399 |
Observations:
- We see a significant improvement using bagging over a single decision tree, as we would expect.
Random Forest
An ensemble learning method that builds multiple decision trees (100 by default) independently on random subsets of data and features, and then averages their predictions to create a more robust and accurate model. The random sampling and feature selection help reduce overfitting and improve generalization.
rf_regressor = RandomForestRegressor(random_state=1)
rf_regressor.fit(x_train, y_train)
RandomForestRegressor(random_state=1)
# Calculate the performance
rf_regressor_perf_test = model_performance_regression(rf_regressor, x_test, y_test)
rf_regressor_perf_test
RMSE | MAE | R2 | Adjusted R2 | MAPE | |
---|---|---|---|---|---|
0 | 1.145173 | 0.696898 | 0.979185 | 0.979174 | 5.372642 |
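Since part of the objective is to understand which factors affect the LOS most, here is a quick look at the fitted forest's impurity-based feature importances (a minimal sketch; impurity-based importances can be biased toward high-cardinality features, so treat the ranking as indicative only):
# Top 10 features by impurity-based importance from the fitted random forest
importances = pd.Series(rf_regressor.feature_importances_, index=x_train.columns)
print(importances.sort_values(ascending=False).head(10))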
Adaptive Boosting (AdaBoost)
An ensemble learning method that combines the output of multiple weak learners (usually decision trees with a single split, known as decision stumps) to create a strong predictive model. The key idea behind AdaBoost is to focus more on the hardest instances by adjusting the weights of the data points iteratively during the training process.
In other words, AdaBoost iteratively increases the weight of poorly predicted instances and combines multiple weak models to form a stronger predictive model.
from sklearn.ensemble import AdaBoostRegressor
ada_regressor = AdaBoostRegressor(random_state=1)
ada_regressor.fit(x_train, y_train)
AdaBoostRegressor(random_state=1)
# Measure performance
ada_regressor_perf_test = model_performance_regression(ada_regressor, x_test, y_test)
ada_regressor_perf_test
RMSE | MAE | R2 | Adjusted R2 | MAPE | |
---|---|---|---|---|---|
0 | 2.179979 | 1.461467 | 0.92457 | 0.924532 | 13.377773 |
Gradient Boosting
A powerful ensemble machine learning technique that combines multiple weak models (typically decision trees) to create a strong predictive model. The key idea behind gradient boosting is to build new models that correct the errors made by the previous models, sequentially refining the overall prediction.
It focuses on minimizing a loss function by using the gradients of the loss, which guides how each subsequent model should be constructed.
It is widely used for its high accuracy and versatility but requires careful tuning to avoid overfitting and excessive computation time.
from sklearn.ensemble import GradientBoostingRegressor
grad_regressor = GradientBoostingRegressor(random_state=1)
grad_regressor.fit(x_train, y_train)
GradientBoostingRegressor(random_state=1)
# Measure performance
grad_regressor_perf_test = model_performance_regression(grad_regressor, x_test, y_test)
grad_regressor_perf_test
RMSE | MAE | R2 | Adjusted R2 | MAPE | |
---|---|---|---|---|---|
0 | 1.499456 | 0.900673 | 0.964313 | 0.964295 | 6.687052 |
Extreme Gradient Boosting (XGBoost)
A highly optimized and efficient implementation of gradient boosting, specifically designed for speed and performance. It's widely used for machine learning tasks due to its scalability and high accuracy. XGBoost has gained immense popularity in data science competitions like Kaggle because of its performance advantages over traditional gradient boosting implementations.
It is well-suited for both classification and regression tasks and is known for delivering high accuracy.
XGBoost incorporates several advanced features like regularization and early stopping, making it more robust and efficient than traditional gradient boosting.
!pip install xgboost
Requirement already satisfied: xgboost in /usr/local/lib/python3.11/dist-packages (2.1.4) Requirement already satisfied: numpy in /usr/local/lib/python3.11/dist-packages (from xgboost) (1.26.4) Requirement already satisfied: nvidia-nccl-cu12 in /usr/local/lib/python3.11/dist-packages (from xgboost) (2.21.5) Requirement already satisfied: scipy in /usr/local/lib/python3.11/dist-packages (from xgboost) (1.13.1)
from xgboost import XGBRegressor
# Instantiate the regressor
xgb_regressor = XGBRegressor(random_state=1)
xgb_regressor.fit(x_train, y_train)
XGBRegressor(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, device=None, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, gamma=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, multi_strategy=None, n_estimators=None, n_jobs=None, num_parallel_tree=None, random_state=1, ...)
# Measure performance
xgb_regressor_perf_test = model_performance_regression(xgb_regressor, x_test, y_test)
xgb_regressor_perf_test
| | RMSE | MAE | R2 | Adjusted R2 | MAPE |
|---|---|---|---|---|---|
| 0 | 1.272956 | 0.794048 | 0.974280 | 0.974267 | 6.153615 |
# Compare all regression models
models_test_comp = pd.concat(
[
dt_regressor_perf_test.T,
bagging_estimator_perf_test.T,
rf_regressor_perf_test.T,
ada_regressor_perf_test.T,
grad_regressor_perf_test.T,
xgb_regressor_perf_test.T
],
axis=1)
models_test_comp.columns = [
'Decision Tree',
'Bagging',
'Random Forest',
'AdaBoost',
'Gradient Boosting',
'XGBoost'
]
models_test_comp.T
| | RMSE | MAE | R2 | Adjusted R2 | MAPE |
|---|---|---|---|---|---|
| Decision Tree | 1.588745 | 0.854190 | 0.959936 | 0.959916 | 6.213695 |
| Bagging | 1.196142 | 0.721663 | 0.977291 | 0.977279 | 5.538399 |
| Random Forest | 1.145173 | 0.696898 | 0.979185 | 0.979174 | 5.372642 |
| AdaBoost | 2.179979 | 1.461467 | 0.924570 | 0.924532 | 13.377773 |
| Gradient Boosting | 1.499456 | 0.900673 | 0.964313 | 0.964295 | 6.687052 |
| XGBoost | 1.272956 | 0.794048 | 0.974280 | 0.974267 | 6.153615 |
We can choose the best-performing model and try to improve it further by tuning its hyperparameters.
Tuning Random Forest RegressorĀ¶
# Instantiate a random forest regressor
rf_tuned = RandomForestRegressor(random_state=1)
# Specify the hyperparameter grid (note: include the default values so the untuned configuration is also evaluated).
# An exhaustive grid is computationally costly: 3 * 3 * 2 = 18 candidates x 5 folds = 90 fits
# (see the RandomizedSearchCV sketch further below for a cheaper alternative).
rf_tuned_params = {
'n_estimators': [100, 110, 120],
'max_depth': [None, 5, 7],
'max_features': [0.8, 1],
}
from sklearn.model_selection import GridSearchCV
# Instantiate the grid search (we can choose which metric to optimize via `scoring`)
rf_tuned_grid_obj = GridSearchCV(rf_tuned, rf_tuned_params, scoring='neg_mean_squared_error', cv=5, n_jobs=-1, verbose=3)
rf_tuned_grid_obj.fit(x_train, y_train)
Fitting 5 folds for each of 18 candidates, totalling 90 fits
GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=1), n_jobs=-1, param_grid={'max_depth': [None, 5, 7], 'max_features': [0.8, 1], 'n_estimators': [100, 110, 120]}, scoring='neg_mean_squared_error', verbose=3)
RandomForestRegressor(max_features=0.8, n_estimators=120, random_state=1)
# Measure performance
rf_tuned_grid_obj_perf_test = model_performance_regression(rf_tuned_grid_obj, x_test, y_test)
rf_tuned_grid_obj_perf_test
| | RMSE | MAE | R2 | Adjusted R2 | MAPE |
|---|---|---|---|---|---|
| 0 | 1.140207 | 0.695835 | 0.979365 | 0.979354 | 5.366609 |
# We can use .best_estimator_ to see which hyperparameters worked best and then refit that estimator on the training data
rf_tuned_regressor = rf_tuned_grid_obj.best_estimator_
rf_tuned_regressor.fit(x_train, y_train)
RandomForestRegressor(max_features=0.8, n_estimators=120, random_state=1)
# Measure performance
rf_tuned_regressor_perf_test = model_performance_regression(rf_tuned_regressor, x_test, y_test)
rf_tuned_regressor_perf_test
| | RMSE | MAE | R2 | Adjusted R2 | MAPE |
|---|---|---|---|---|---|
| 0 | 1.140207 | 0.695835 | 0.979365 | 0.979354 | 5.366609 |
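As noted when the grid was defined, an exhaustive search grows multiplicatively (18 candidates x 5 folds = 90 fits here). Where compute is tight, RandomizedSearchCV samples a fixed number of configurations from the same (or a wider) space. The following is a hedged sketch, reusing the x_train/y_train from above; the candidate values and budget are illustrative, not tuned choices.
# Illustrative sketch: random search over a similar space, capped at a fixed budget
from sklearn.model_selection import RandomizedSearchCV
rf_rand_params = {
    'n_estimators': [100, 110, 120, 150, 200],
    'max_depth': [None, 5, 7, 10],
    'max_features': [0.6, 0.8, 1.0],
}
rf_rand_search = RandomizedSearchCV(
    RandomForestRegressor(random_state=1),
    param_distributions=rf_rand_params,
    n_iter=10,                          # only 10 sampled candidates -> 50 fits with cv=5
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    random_state=1,
)
rf_rand_search.fit(x_train, y_train)
print(rf_rand_search.best_params_)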
# Compare all regression models
models_test_comp = pd.concat(
[
dt_regressor_perf_test.T,
bagging_estimator_perf_test.T,
rf_regressor_perf_test.T,
ada_regressor_perf_test.T,
grad_regressor_perf_test.T,
xgb_regressor_perf_test.T,
rf_tuned_regressor_perf_test.T
],
axis=1)
models_test_comp.columns = [
'Decision Tree',
'Bagging',
'Random Forest',
'AdaBoost',
'Gradient Boosting',
'XGBoost',
'Random Forest Tuned'
]
models_test_comp.T
| | RMSE | MAE | R2 | Adjusted R2 | MAPE |
|---|---|---|---|---|---|
| Decision Tree | 1.588745 | 0.854190 | 0.959936 | 0.959916 | 6.213695 |
| Bagging | 1.196142 | 0.721663 | 0.977291 | 0.977279 | 5.538399 |
| Random Forest | 1.145173 | 0.696898 | 0.979185 | 0.979174 | 5.372642 |
| AdaBoost | 2.179979 | 1.461467 | 0.924570 | 0.924532 | 13.377773 |
| Gradient Boosting | 1.499456 | 0.900673 | 0.964313 | 0.964295 | 6.687052 |
| XGBoost | 1.272956 | 0.794048 | 0.974280 | 0.974267 | 6.153615 |
| Random Forest Tuned | 1.140207 | 0.695835 | 0.979365 | 0.979354 | 5.366609 |
# Let's visualize which features were most important
importances = rf_tuned_regressor.feature_importances_  # Unsorted array of impurity-based importances
features = x_train.columns                             # Feature names (may already be defined earlier in the notebook)
indices = np.argsort(importances)                      # Indices sorted from least to most important
plt.figure(figsize=(10, 10))
plt.title('Feature Importances')
plt.barh(range(len(features)), importances[indices], color='b', align='center')
plt.yticks(range(len(features)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()  # Call show() explicitly; `plt.show;` only references the function without calling it
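Impurity-based importances from tree ensembles can overstate high-cardinality or correlated features, so permutation importance on held-out data is a common cross-check. A minimal sketch, assuming the fitted rf_tuned_regressor and that x_test/y_test from above are a DataFrame/Series:
# Cross-check of the impurity-based importances above (illustrative sketch)
from sklearn.inspection import permutation_importance
perm = permutation_importance(
    rf_tuned_regressor, x_test, y_test,
    n_repeats=10, random_state=1, n_jobs=-1
)
perm_series = pd.Series(perm.importances_mean, index=x_test.columns).sort_values()
perm_series.plot(kind='barh', figsize=(10, 10), title='Permutation Importances (test set)')
plt.xlabel('Mean decrease in score')
plt.show()
If both rankings broadly agree, that strengthens the case for the features highlighted in the bar chart above.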
# Convert notebook to html
!jupyter nbconvert --to html "/content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week_Five_-_Classification_and_Hypothesis_Testing/Guided_Project:_Classification_and_Hypothesis_Testing/Guided Project: Classification and Hypothesis Testing.ipynb"
[NbConvertApp] Converting notebook /content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week_Five_-_Classification_and_Hypothesis_Testing/Guided_Project:_Classification_and_Hypothesis_Testing/Guided Project: Classification and Hypothesis Testing.ipynb to html
[NbConvertApp] WARNING | Alternative text is missing on 25 image(s).
[NbConvertApp] Writing 1989593 bytes to /content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week_Five_-_Classification_and_Hypothesis_Testing/Guided_Project:_Classification_and_Hypothesis_Testing/Guided Project: Classification and Hypothesis Testing.html