Hospital Length of Stay (LOS) Prediction¶
Context:¶
Hospital management is a vital area that gained a lot of attention during the COVID-19 pandemic. Inefficient distribution of resources like beds and ventilators can lead to serious complications. This risk can be mitigated by predicting the length of stay (LOS) of a patient at the time of admission. Once the LOS is estimated, the hospital can plan suitable treatment, resources, and staff to reduce the LOS and increase the chances of recovery. Rooms and beds can also be allocated accordingly.
HealthPlus hospital has been incurring heavy losses in revenue, and in lives, due to its inefficient management system. It has been unable to allocate equipment, beds, and hospital staff effectively. A system that could estimate the length of stay (LOS) of a patient could solve this problem to a great extent.
Objective:¶
As a Data Scientist, you have been hired by HealthPlus to analyze the data, find out which factors affect the LOS the most, and build a machine learning model that can predict the LOS of a patient using the data available at admission and after running a few tests. Also, derive useful insights and policies from the data that can help the hospital improve its healthcare infrastructure and revenue.
Data Dictionary:¶
The data contains information recorded at the time of admission of the patient. It only contains records of patients who were admitted to the hospital. The detailed data dictionary is given below:
- patientid: Patient ID
- Age: Range of age of the patient
- gender: Gender of the patient
- Type of Admission: Trauma, Emergency, or Urgent
- Severity of Illness: Extreme, Moderate, or Minor
- health_conditions: Any previous health conditions suffered by the patient
- Visitors with Patient: The number of visitors who accompany the patient
- Insurance: Does the patient have health insurance or not?
- Admission_Deposit: The deposit paid by the patient during admission
- Stay (in days): The number of days that the patient has stayed in the hospital. This is the target variable
- Available Extra Rooms in Hospital: The number of rooms available during admission
- Department: The department which will be treating the patient
- Ward_Facility_Code: The code of the ward facility in which the patient will be admitted
- doctor_name: The doctor who will be treating the patient
- staff_available: The number of staff who are not occupied at the moment in the ward
Approach to solve the problem:¶
- Import the necessary libraries
- Read the dataset and get an overview
- Perform exploratory data analysis: univariate and bivariate
- Preprocess the data, if needed
- Define the performance metric and build ML models
- Check for model assumptions
- Compare models and determine the best one
- Draw observations and business insights
Importing Libraries¶
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
# Removes the limit for the number of displayed columns
pd.set_option("display.max_columns", None)
# Sets the limit for the number of displayed rows
pd.set_option("display.max_rows", 200)
# To build models for prediction
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
# To encode categorical variables
from sklearn.preprocessing import LabelEncoder
# For tuning the model
from sklearn.model_selection import GridSearchCV
# To check model performance
from sklearn.metrics import make_scorer, mean_squared_error, r2_score, mean_absolute_error
from google.colab import drive
drive.mount('/content/Mydrive')
Mounted at /content/Mydrive
# Read the healthcare dataset file
data = pd.read_csv("healthcare_data.csv")
# Copying data to another variable to avoid any changes to original data
same_data = data.copy()
Data Overview¶
# View the first 5 rows of the dataset
data.head()
 | Available Extra Rooms in Hospital | Department | Ward_Facility_Code | doctor_name | staff_available | patientid | Age | gender | Type of Admission | Severity of Illness | health_conditions | Visitors with Patient | Insurance | Admission_Deposit | Stay (in days)
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 4 | gynecology | D | Dr Sophia | 0 | 33070 | 41-50 | Female | Trauma | Extreme | Diabetes | 4 | Yes | 2966.408696 | 8 |
1 | 4 | gynecology | B | Dr Sophia | 2 | 34808 | 31-40 | Female | Trauma | Minor | Heart disease | 2 | No | 3554.835677 | 9 |
2 | 2 | gynecology | B | Dr Sophia | 8 | 44577 | 21-30 | Female | Trauma | Extreme | Diabetes | 2 | Yes | 5624.733654 | 7 |
3 | 4 | gynecology | D | Dr Olivia | 7 | 3695 | 31-40 | Female | Urgent | Moderate | None | 4 | No | 4814.149231 | 8 |
4 | 2 | anesthesia | E | Dr Mark | 10 | 108956 | 71-80 | Male | Trauma | Moderate | Diabetes | 2 | No | 5169.269637 | 34 |
# View the last 5 rows of the dataset
data.tail()
 | Available Extra Rooms in Hospital | Department | Ward_Facility_Code | doctor_name | staff_available | patientid | Age | gender | Type of Admission | Severity of Illness | health_conditions | Visitors with Patient | Insurance | Admission_Deposit | Stay (in days)
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
499995 | 4 | gynecology | F | Dr Sarah | 2 | 43001 | 11-20 | Female | Trauma | Minor | High Blood Pressure | 3 | No | 4105.795901 | 10 |
499996 | 13 | gynecology | F | Dr Olivia | 8 | 85601 | 31-40 | Female | Emergency | Moderate | Other | 2 | No | 4631.550257 | 11 |
499997 | 2 | gynecology | B | Dr Sarah | 3 | 22447 | 11-20 | Female | Emergency | Moderate | High Blood Pressure | 2 | No | 5456.930075 | 8 |
499998 | 2 | radiotherapy | A | Dr John | 1 | 29957 | 61-70 | Female | Trauma | Extreme | Diabetes | 2 | No | 4694.127772 | 23 |
499999 | 3 | gynecology | F | Dr Sophia | 3 | 45008 | 41-50 | Female | Trauma | Moderate | Heart disease | 4 | Yes | 4713.868519 | 10 |
# Understand the shape of the data
data.shape
(500000, 15)
- The dataset has 500,000 rows and 15 columns.
# Checking the info of the data
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 15 columns):
 #   Column                             Non-Null Count   Dtype
---  ------                             --------------   -----
 0   Available Extra Rooms in Hospital  500000 non-null  int64
 1   Department                         500000 non-null  object
 2   Ward_Facility_Code                 500000 non-null  object
 3   doctor_name                        500000 non-null  object
 4   staff_available                    500000 non-null  int64
 5   patientid                          500000 non-null  int64
 6   Age                                500000 non-null  object
 7   gender                             500000 non-null  object
 8   Type of Admission                  500000 non-null  object
 9   Severity of Illness                500000 non-null  object
 10  health_conditions                  500000 non-null  object
 11  Visitors with Patient              500000 non-null  int64
 12  Insurance                          500000 non-null  object
 13  Admission_Deposit                  500000 non-null  float64
 14  Stay (in days)                     500000 non-null  int64
dtypes: float64(1), int64(5), object(9)
memory usage: 57.2+ MB
Observations:
- Available Extra Rooms in Hospital, staff_available, patientid, Visitors with Patient, Admission_Deposit, and Stay (in days) are of numeric data type and the rest of the columns are of object data type.
- The number of non-null values is the same as the total number of entries in the data, i.e., there are no null values.
- The column patientid is an identifier for patients in the data. This column will not help with our analysis so we can drop it.
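As a quick sanity check (a minimal sketch using the data DataFrame loaded above), the absence of missing values can be confirmed directly:
# Count missing values in each column; all zeros matches the info() output above
data.isnull().sum()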
# To view patientid and the number of times they have been admitted to the hospital
data['patientid'].value_counts()
126719    21
125695    21
44572     21
126623    21
125625    19
          ..
37634      1
91436      1
118936     1
52366      1
105506     1
Name: patientid, Length: 126399, dtype: int64
Observation:
- The maximum number of times the same patient was admitted to the hospital is 21, and the minimum is 1. There are 126,399 unique patients in the data.
# Dropping patientid from the data as it is an identifier and will not add value to the analysis
data=data.drop(columns=["patientid"])
Data Preparation for Model Building¶
- Before we proceed to build a model, we'll have to encode the categorical features.
- Separate the independent variables and the dependent variable.
- We'll split the data into train and test sets to be able to evaluate the model that we train on the training data.
# Creating dummy variables for the categorical columns
# drop_first=True drops one dummy per feature to avoid redundant (perfectly collinear) variables
data = pd.get_dummies(
data,
columns = data.select_dtypes(include = ["object", "category"]).columns.tolist(),
drop_first = True,
)
# Check the data after handling categorical data
data
 | Available Extra Rooms in Hospital | staff_available | Visitors with Patient | Admission_Deposit | Stay (in days) | Department_anesthesia | Department_gynecology | Department_radiotherapy | Department_surgery | Ward_Facility_Code_B | Ward_Facility_Code_C | Ward_Facility_Code_D | Ward_Facility_Code_E | Ward_Facility_Code_F | doctor_name_Dr John | doctor_name_Dr Mark | doctor_name_Dr Nathan | doctor_name_Dr Olivia | doctor_name_Dr Sam | doctor_name_Dr Sarah | doctor_name_Dr Simon | doctor_name_Dr Sophia | Age_11-20 | Age_21-30 | Age_31-40 | Age_41-50 | Age_51-60 | Age_61-70 | Age_71-80 | Age_81-90 | Age_91-100 | gender_Male | gender_Other | Type of Admission_Trauma | Type of Admission_Urgent | Severity of Illness_Minor | Severity of Illness_Moderate | health_conditions_Diabetes | health_conditions_Heart disease | health_conditions_High Blood Pressure | health_conditions_None | health_conditions_Other | Insurance_Yes
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 4 | 0 | 4 | 2966.408696 | 8 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
1 | 4 | 2 | 2 | 3554.835677 | 9 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
2 | 2 | 8 | 2 | 5624.733654 | 7 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
3 | 4 | 7 | 4 | 4814.149231 | 8 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
4 | 2 | 10 | 2 | 5169.269637 | 34 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
499995 | 4 | 2 | 3 | 4105.795901 | 10 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
499996 | 13 | 8 | 2 | 4631.550257 | 11 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
499997 | 2 | 3 | 2 | 5456.930075 | 8 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
499998 | 2 | 1 | 2 | 4694.127772 | 23 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
499999 | 3 | 3 | 4 | 4713.868519 | 10 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 |
500000 rows × 43 columns
# Separating independent variables and the target variable
x = data.drop('Stay (in days)',axis=1)
y = data['Stay (in days)']
# Splitting the dataset into train and test datasets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, shuffle = True, random_state = 1)
# Checking the shape of the train and test data
print("Shape of Training set : ", x_train.shape)
print("Shape of test set : ", x_test.shape)
Shape of Training set :  (400000, 42)
Shape of test set :  (100000, 42)
Serialization¶
Serialization is the process of converting a data object (e.g., Python objects, models) into a format that allows us to store or transmit the data, and then recreating the object when needed through the reverse process, deserialization. We are going to discuss two such serialization tools used in Python - Pickle and Joblib.
Pickle¶
What is Pickle?
Pickle is a Python module that can be used to convert a Python object into a byte stream and vice versa. The byte stream can be stored in a file or transmitted over a network.
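For instance, a minimal in-memory round trip (a sketch; the payload dictionary is purely illustrative) looks like this:
import pickle

# Serialize a Python object to a byte stream and restore it
payload = {"model_name": "rf_baseline", "version": 1}  # illustrative object
blob = pickle.dumps(payload)     # object -> bytes
restored = pickle.loads(blob)    # bytes -> object
print(restored == payload)       # True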
Why is Pickle used?
Pickle is used to store Python objects in a format that can be easily retrieved and used later. This is useful when you need to save the state of your program, for example, when you want to save the trained model of a machine learning algorithm so that it can be used later to make predictions on new data.
Advantages of using Pickle:
- It is easy to use.
- It can handle almost any Python object, including custom classes and functions.
- The serialized data can be compressed, making it smaller in size and faster to transmit (see the sketch after this list).
- The deserialized object is guaranteed to have the same type and value as the original object.
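To illustrate the compression point above, the byte stream can be written through gzip (a minimal sketch; the object and file name are illustrative):
import gzip
import pickle

obj = list(range(1_000_000))  # any picklable Python object
# gzip.open returns a file-like object, so pickle can write straight through it
with gzip.open("obj.pkl.gz", "wb") as f:
    pickle.dump(obj, f)
with gzip.open("obj.pkl.gz", "rb") as f:
    obj_restored = pickle.load(f)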
Importing the library¶
import pickle
# Create a model with desired hyperparameters
model = RandomForestRegressor(n_estimators=120, max_depth=None, max_features=0.8, random_state=1)
model.fit(x_train, y_train)
RandomForestRegressor(max_features=0.8, n_estimators=120, random_state=1)
Saving the trained model using Pickle¶
with open('model.pkl', 'wb') as file:
pickle.dump(model, file)
The above code does two things:
- with open('model.pkl', 'wb') as file: opens a new file named model.pkl in write-binary mode. The with statement ensures that the file is closed properly after the data has been written to it.
- pickle.dump(model, file) writes the model object to the file object in binary format.
Loading the trained model using Pickle¶
with open('model.pkl', 'rb') as file:
loaded_model_pkl = pickle.load(file)
This code uses the open() function to open the model.pkl file in read-binary mode ('rb') and assigns the file object to the variable file.
Then, the pickle.load() method is called to load the saved model from the file file into the loaded_model_pkl variable.
To summarize, this code is loading the saved Random Forest Regression model stored in the model.pkl file using the pickle module, and assigning the loaded model to the loaded_model_pkl variable. This loaded model can be used to make predictions on new data.
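As a quick check (a sketch reusing the x_test and y_test splits created earlier), the reloaded model can be evaluated exactly like the original one:
# Predict on the held-out test set with the reloaded model
y_pred = loaded_model_pkl.predict(x_test)

# Compare predictions against the true LOS values
print("MAE:", mean_absolute_error(y_test, y_pred))
print("R2 :", r2_score(y_test, y_pred))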
Joblib¶
Joblib is a Python library that provides tools for lightweight pipelining in Python, as well as utilities for parallel processing. In the context of machine learning, it is primarily used for efficient pickling of large NumPy arrays, as well as for persisting scikit-learn models.
Some advantages of using Joblib for model persistence include:
- Efficiency: Joblib is optimized for efficient processing of large NumPy arrays, making it a good choice for persisting large models and their associated data
- Parallel processing: Joblib can be used to easily parallelize operations across multiple cores, which can greatly speed up the training and evaluation of machine learning models (see the sketch after this list)
- Seamless integration with scikit-learn: Joblib is designed to work seamlessly with the popular scikit-learn machine learning library, making it a natural choice for persisting scikit-learn models
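A minimal sketch of that parallel-processing utility (independent of model persistence; the square worker function is illustrative):
from joblib import Parallel, delayed

def square(n):
    return n * n

# Run the function across 4 worker processes in parallel
results = Parallel(n_jobs=4)(delayed(square)(i) for i in range(10))
print(results)  # [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]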
One disadvantage of using Joblib is that it is not as widely used or well-known as other serialization libraries like Pickle, which can make it more difficult to find resources or support if you encounter issues. Additionally, while Joblib is optimized for large NumPy arrays, it may not be the best choice for persisting other types of data or models.
Importing the library¶
import joblib
Saving the trained model using Joblib¶
joblib.dump(model, 'model.joblib')
This code is using the joblib library to save a trained machine learning model to disk in the file model.joblib.
The joblib.dump() function takes two arguments: the first is the trained machine learning model (model) that needs to be saved, and the second is the filename ('model.joblib') where the model will be saved.
The advantage of using Joblib over Pickle is that it is optimized for dealing with large NumPy arrays, which are common in machine learning. This means that Joblib is often faster and more efficient than Pickle for saving and loading machine learning models.
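joblib.dump() also accepts a compress argument (an integer from 0 to 9, or True); a minimal sketch, with an illustrative file name:
# Save the model with compression (level 3 is a common size/speed trade-off)
joblib.dump(model, 'model_compressed.joblib', compress=3)

# Joblib detects the compression automatically when loading
model_compressed = joblib.load('model_compressed.joblib')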
Loading the trained model using Joblib¶
loaded_model_joblib = joblib.load('model.joblib')
This code uses the joblib.load() function from the joblib library to load a trained machine learning model that was saved in a binary file format with the .joblib extension. The name of the file to be loaded is passed as an argument to the joblib.load() function.
Once the model is loaded from the file, it is stored in the variable loaded_model_joblib and can be used for making predictions on new data.
joblib.load(): This function is used to load a machine learning model that was saved using the joblib.dump() function
'model.joblib': This is the name of the file containing the saved model. It should be in the same directory as the notebook or script that is loading the model.
loaded_model_joblib: This is the variable where the loaded model will be stored. Once the model is loaded, it can be used to make predictions on new data.
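For example (a sketch reusing x_test and the pickle-loaded model from earlier; np is the NumPy alias imported at the top), one can confirm that both reloaded copies behave identically:
# Predictions from the two reloaded copies should match exactly
pred_joblib = loaded_model_joblib.predict(x_test)
pred_pickle = loaded_model_pkl.predict(x_test)
print(np.allclose(pred_joblib, pred_pickle))  # True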