Introduction to Hidden Markov Models¶
A Hidden Markov Model (HMM) is a probabilistic model for sequences in which the system being modelled is assumed to be a Markov chain whose states cannot be observed directly. Instead of seeing the state itself at each step, we see an observation drawn from a probability distribution that depends on the current hidden state.
The primary goal of an HMM is to infer the hidden state sequence of the Markov chain from the observations. For a hidden Markov process X with observations Y, the HMM assumes that the distribution of Y at each time step depends only on the value of X at that time, not on the history of X.
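To make this structure concrete, here is a minimal toy sketch (with hypothetical, hand-picked numbers, not fitted to any data) of a two-state Gaussian HMM built with hmmlearn, the library used later in this notebook: the hidden regime follows a Markov chain defined by a transition matrix, and each regime emits observations from its own Gaussian distribution.
import numpy as np
from hmmlearn.hmm import GaussianHMM

# Toy two-regime HMM with hypothetical parameters set by hand
toy = GaussianHMM(n_components=2, covariance_type="diag")
toy.startprob_ = np.array([0.8, 0.2])        # P(initial hidden state)
toy.transmat_ = np.array([[0.9, 0.1],        # P(next state | current state)
                          [0.3, 0.7]])
toy.means_ = np.array([[4.0], [8.0]])        # emission mean per hidden state
toy.covars_ = np.array([[0.5], [1.0]])       # emission variance per hidden state

observations, hidden_states = toy.sample(10)  # draw a 10-step sequence
print(hidden_states)          # the unobserved regime path
print(observations.ravel())   # the values we actually observe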
Case Study - Context¶
The Current Employment Statistics (CES) program is a monthly survey conducted by the Bureau of Labor Statistics. The program provides employment, hours, and earnings estimates based on payroll records of business establishments. Data produced from the CES survey include nonfarm employment series for all employees, production and nonsupervisory employees, and women employees, as well as average hourly earnings, average weekly hours, the monthly unemployment rate, and average weekly overtime hours (in manufacturing industries) for both all employees and production and nonsupervisory employees. Labor Force Data comes from the 'Current Population Survey (Household Survey)'.
The Unemployment Rate represents the number of unemployed as a percentage of the labor force.
This rate is also defined as the U-3 measure of labor underutilization.
Objective¶
In this notebook, we'll look at how to use Hidden Markov Models (HMMs) to model the Unemployment Rate over the years. Using an HMM, we will predict whether the unemployment rate will rise or fall each year based on the data.
When we examine the relationship between the provided data and the unemployment rate, we discover that the unemployment rate peaks whenever there is a recession or pandemic. So we are interested in how we can build an HMM that uses the data we have to identify these hidden economic states. Furthermore, we want to see whether such a model can predict the unemployment rate accurately.
Dataset¶
We will use Unemployment Rate data from the U.S. Bureau of Labor Statistics (monthly data from 1948 to 2022).
Installing the hmmlearn and stats libraries¶
- hmmlearn is a set of algorithms for unsupervised learning and inference of Hidden Markov Models.
- stats is a pure Python module providing basic statistics functions similar to those found on scientific calculators.
!pip install stats
!pip install hmmlearn
Collecting stats
  Downloading stats-0.1.2a.tar.gz (127 kB)
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: stats
  Building wheel for stats (setup.py) ... done
Successfully built stats
Installing collected packages: stats
Successfully installed stats-0.1.2a0
Collecting hmmlearn
  Downloading hmmlearn-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (161 kB)
Requirement already satisfied: numpy>=1.10 in /usr/local/lib/python3.10/dist-packages (from hmmlearn) (1.25.2)
Requirement already satisfied: scikit-learn!=0.22.0,>=0.16 in /usr/local/lib/python3.10/dist-packages (from hmmlearn) (1.2.2)
Requirement already satisfied: scipy>=0.19 in /usr/local/lib/python3.10/dist-packages (from hmmlearn) (1.11.4)
Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn!=0.22.0,>=0.16->hmmlearn) (1.4.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn!=0.22.0,>=0.16->hmmlearn) (3.5.0)
Installing collected packages: hmmlearn
Successfully installed hmmlearn-0.3.2
Importing the necessary libraries¶
import stats # Library which provides basic statistics functions similar to those found on scientific calculators
import pandas as pd # Library used for data manipulation and analysis
import numpy as np # Library used for working with arrays
import math # Library to use mathematical functions
import datetime # Library for manipulating dates and times.
from copy import deepcopy
import matplotlib.pyplot as plt # Library for plots and visualisations
%matplotlib inline
from pylab import rcParams
import seaborn as sns # Library for advanced visualisations
sns.set()
from tqdm import tqdm
from sklearn.linear_model import LinearRegression
from hmmlearn.hmm import GaussianHMM # Library for unsupervised learning and inference of HMMs
from sklearn.mixture import GaussianMixture
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import warnings
warnings.filterwarnings("ignore")
import logging
logging.getLogger('hmmlearn').setLevel(logging.ERROR)
Reading the Unemployment Rate data¶
dataset=pd.read_csv('UNEMPRATE.csv')
dataset.tail()
|     | DATE       | UNRATE |
|-----|------------|--------|
| 886 | 2021-11-01 | 4.2 |
| 887 | 2021-12-01 | 3.9 |
| 888 | 2022-01-01 | 4.0 |
| 889 | 2022-02-01 | 3.8 |
| 890 | 2022-03-01 | 3.6 |
dataset.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   DATE    891 non-null    object
 1   UNRATE  891 non-null    float64
dtypes: float64(1), object(1)
memory usage: 14.0+ KB
Employment stability over the years¶
dataset['Month']=pd.DatetimeIndex(dataset['DATE']).month
dataset['Year']=pd.DatetimeIndex(dataset['DATE']).year
ma_5_yr=dataset['UNRATE'].rolling(60).mean()
ma_3_yr=dataset['UNRATE'].rolling(36).mean()
plt.figure(figsize=(21,7))
sns.lineplot(x=dataset['Year'],y=ma_3_yr,color='green',label="3-Year Average")
sns.lineplot(x=dataset['Year'],y=ma_5_yr,color='blue',label="5-Year Average")
sns.lineplot(x=dataset['Year'],y=dataset['UNRATE'],color='red',label="Unemployment Rate")
We observe periods of positive and negative growth, which suggests there may be a hidden process influencing the trends.
The portion of the phenomenon that cannot be observed is represented by a hidden process, which is modelled using a Markov process. We can see that the 5-year average is far from the actual series, while the 3-year average tracks it more closely. Let's find the most stable years.
plt.figure(figsize=(21,7))
sns.lineplot(x=dataset['Year'],y=np.abs(ma_3_yr-dataset['UNRATE']),color='green')
Most Stable Years¶
dataset['DATE'][(np.abs(ma_3_yr-dataset['UNRATE'])).argmin()]
'2001-02-01'
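The cell above returns the single month whose rate is closest to its 3-year moving average. As a hypothetical extension (a sketch reusing the variables already defined above), we could rank whole years by their average absolute deviation from the 3-year moving average, so that smaller values mean more stable years:
# Rank years by mean absolute deviation from the 3-year moving average
stability = (np.abs(ma_3_yr - dataset['UNRATE'])
             .groupby(dataset['Year'])
             .mean()
             .sort_values())
print(stability.head())  # the most stable years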
dataset['DATE']=pd.to_datetime(dataset['DATE'])
from datetime import datetime
Year-wise Unemployment Rate¶
dataset.groupby(['Year'])['UNRATE'].median()
Year
1948    3.80
1949    6.30
1950    5.20
1951    3.25
1952    3.00
        ...
2018    3.85
2019    3.60
2020    7.40
2021    5.60
2022    3.80
Name: UNRATE, Length: 75, dtype: float64
Fluctuation in monthly unemployment for each year¶
plt.figure(figsize=(30,15))
sns.boxplot(x=dataset['Year'],y=dataset['UNRATE'])
plt.xticks(rotation=90)
plt.show()
dataset.groupby(['Year'])['UNRATE'].std().reset_index().dropna().sort_values(['UNRATE'],ascending=False)[:10].reset_index(drop=True)
|   | Year | UNRATE |
|---|------|--------|
| 0 | 2020 | 3.616743 |
| 1 | 1949 | 1.031768 |
| 2 | 1950 | 0.894893 |
| 3 | 2021 | 0.831711 |
| 4 | 2008 | 0.780443 |
| 5 | 1983 | 0.761577 |
| 6 | 1982 | 0.719164 |
| 7 | 2009 | 0.696528 |
| 8 | 1974 | 0.674818 |
| 9 | 1970 | 0.664466 |
The years with the most volatile unemployment rate correspond to recession/COVID periods. This is to be expected, as unemployment spikes during times of economic distress. COVID-19 had a dramatic effect on the labor market. Unlike a strike or weather event, the effect was nationwide in the US and, for some industries, sustained over a long period. The Recession of 1949 was also a downturn in the United States, lasting 11 months. Hence, 2020 shows the highest fluctuation in the unemployment rate, followed by 1949.
Fluctuation in unemployment for a particular month¶
plt.figure(figsize=(30,15))
sns.boxplot(x=dataset['Month'],y=dataset['UNRATE'])
plt.xticks(rotation=90)
The lack of month-to-month variation suggests that the calendar month plays a limited role in determining unemployment.
Since each month appears in every year, the distribution is similar at the month level.
Checking for seasonality in the data¶
Seasonality is a characteristic of a time series in which the data experiences regular and predictable changes that recur every calendar year. Any predictable fluctuation or pattern that recurs or repeats over a one-year period is said to be seasonal.
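As a cross-check on the manual decomposition below, a standard multiplicative decomposition with a 12-month period can also be obtained directly. This is a sketch assuming statsmodels is available; it is not used elsewhere in this notebook.
from statsmodels.tsa.seasonal import seasonal_decompose

# Multiplicative decomposition: UNRATE = trend * seasonality * noise
decomposition = seasonal_decompose(dataset['UNRATE'], model='multiplicative', period=12)
decomposition.plot()
plt.show()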
dataset['SEASONALITY AND NOISE'] = dataset['UNRATE']/ma_5_yr
# Estimate the seasonal component: average the detrended series for each calendar month
valid = dataset['SEASONALITY AND NOISE'].notna()
average_seasonal_values = dataset.loc[valid].groupby('Month')['SEASONALITY AND NOISE'].mean()
# Create a new column holding the average seasonal value of the corresponding month
dataset['SEASONALITY'] = np.nan
dataset.loc[valid, 'SEASONALITY'] = dataset.loc[valid, 'Month'].map(average_seasonal_values)
plt.figure(figsize=(21,7))
sns.lineplot(x=dataset['Year'],y=dataset['SEASONALITY AND NOISE'])
plt.figure(figsize=(21,7))
sns.lineplot(x=dataset['Year'],y=dataset['SEASONALITY'])
plt.show()
No seasonality is observed in the data after detrending with the 5-year moving average.
dataset['NOISE'] = dataset['SEASONALITY AND NOISE']/dataset['SEASONALITY']
#plot the noise component
fig = plt.figure(figsize=(21,7))
fig.suptitle('The NOISE component')
plt.ylim(0, 1.3)
sns.lineplot(x=dataset['Year'],y=dataset['NOISE'])
plt.show()
Most of the fluctuation shows up as noise. The absence of a clear trend and seasonality suggests we cannot usefully decompose this time series into trend and seasonality components.
The unemployment rate, as suspected, seems to be determined more by underlying economic states such as recessions, COVID, budgets, and investments in the market. HMMs can be used to identify such hidden states, which can help us predict these patterns better.
HMM Training¶
An HMM can be used to study a phenomenon in which some portion is directly observed while the rest cannot be observed. The effect of the unobserved portion can only be estimated, not exactly computed.
We represent such a phenomenon using a mixture of two random processes.
One of the two processes is a ‘visible process’, used to represent the observable portion of the phenomenon. The visible process is modeled using a suitable regression model, such as ARIMA, a Poisson model, or the ever-popular linear model.
The portion that cannot be observed is represented by a ‘hidden process’, which is modeled using a Markov process.
Reference: https://towardsdatascience.com/a-math-lovers-guide-to-hidden-markov-models-ad718df9fde8
# Drop the final 24 months, then split the remainder into train and test sets
values = dataset['UNRATE'][:-24].values.reshape(-1,1)
N = 650
train = values[:N]
test = values[N:]
Hyperparameter Tuning¶
There are two main hyperparameters to tune in an HMM:
n_components ----> determines the number of hidden states to be used in the model
algorithm ----> the decoder algorithm, either 'viterbi' or 'map'
The Viterbi algorithm is a dynamic programming algorithm for obtaining the maximum a posteriori probability estimate of the most likely sequence of hidden states, called the Viterbi path, that results in a sequence of observed events, especially in the context of Markov information sources and Hidden Markov Models (HMMs).
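For intuition, here is a compact NumPy sketch of the Viterbi recursion for a discrete-emission HMM. This is a simplified stand-in for what hmmlearn does internally (with Gaussian emission log-likelihoods in place of emissionprob); all names here are illustrative.
import numpy as np

def viterbi(obs, startprob, transmat, emissionprob):
    # Work in log space to avoid numerical underflow
    T, n_states = len(obs), transmat.shape[0]
    log_delta = np.log(startprob) + np.log(emissionprob[:, obs[0]])
    backptr = np.zeros((T, n_states), dtype=int)
    for t in range(1, T):
        # scores[i, j]: best log-probability of a path ending in state i, then moving to j
        scores = log_delta[:, None] + np.log(transmat)
        backptr[t] = scores.argmax(axis=0)
        log_delta = scores.max(axis=0) + np.log(emissionprob[:, obs[t]])
    # Backtrack from the best final state to recover the Viterbi path
    path = [int(log_delta.argmax())]
    for t in range(T - 1, 0, -1):
        path.append(backptr[t, path[-1]])
    return path[::-1]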
hyperparameter_grid = {
    'n_components': list(range(4, 52, 2)),
    'covariance_type': ['diag'],
    'algorithm': ['map', 'viterbi'],
    'n_iter': [10000]
}
## Create and evaluate all combinations of n_components and algorithm
n_comp = []
mse = []
mae = []
al = []
for i in range(2, 50, 2):
    for algo in ['map', 'viterbi']:
        model_hmm = GaussianHMM(n_components=i+2, covariance_type="diag", algorithm=algo, n_iter=10000, random_state=42)
        model_hmm.fit(train)
        prediction_hmm = model_hmm.predict(test)  # most likely hidden state for each test observation
        n_comp.append(i+2)
        al.append(algo)
        mse.append(mean_squared_error(test, prediction_hmm) ** (1 / 2))  # stored value is the RMSE
        mae.append(mean_absolute_error(test, prediction_hmm))
model_df = pd.DataFrame({'comp': n_comp, 'mse': mse, 'mae': mae, 'al': al})
model_df[model_df['mae']==model_df['mae'].min()]
|   | comp | mse | mae | al |
|---|------|-----|-----|----|
| 6 | 10 | 3.068691 | 2.593548 | viterbi |
sns.lineplot(x=model_df['comp'],y=model_df['mse'],color='red')
sns.lineplot(x=model_df['comp'],y=model_df['mae'],color='blue')
model_hmm = GaussianHMM(n_components=14, covariance_type="diag", algorithm='viterbi', n_iter=10000, random_state=42)
model_hmm.fit(train)
# Decode the most likely hidden state for each test observation
prediction_hmm = model_hmm.predict(test)
labels = prediction_hmm
# Replace each hidden state label with its Gaussian emission mean
means = np.zeros_like(test)
for i in range(model_hmm.n_components):
    means[labels == i] = model_hmm.means_[i]
plt.figure(figsize=(14, 8))
plt.title(' Hidden Markov Model with Gaussian emissions')
plt.ylabel('Unemployment for the given month')
plt.plot(test, color = 'red', label = 'Original')
plt.plot(means, color = 'blue', label = 'HMM prediction')
plt.legend()
plt.show()
Model Performance Evaluation¶
rmse_hmm = mean_squared_error(test, prediction_hmm) ** (1 / 2)
rmse_hmm
4.193180770841931
mae_hmm = mean_absolute_error(test, prediction_hmm)
mae_hmm
3.473732718894009
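Note that prediction_hmm contains hidden-state labels, so the RMSE and MAE above compare raw state indices against unemployment values. As an alternative sketch (reusing the means array from the plotting cell above), the per-state Gaussian means, i.e. the blue curve in the plot, can be evaluated against the test data instead:
# Evaluate the per-state means against the test observations
rmse_means = mean_squared_error(test, means) ** (1 / 2)
mae_means = mean_absolute_error(test, means)
print(rmse_means, mae_means)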
Conclusions¶
- We found the Hidden Markov Model to follow the actual data very closely.
- The peaks, however, are modelled less accurately than the rest of the series.
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
# Convert notebook to html
!jupyter nbconvert --to html "/content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week_Eight_-_Networking_and_Graphical_Models/Case_Studies/Unemployment_Rate_Prediction/Unemployment_Rate_Prediction.ipynb"