Music Recommendation Systems¶
Context¶
Technology has made societies more efficient, but it has also made individual lives more fast-paced and distracted, leaving little time to explore artistic pursuits. At the same time, technology has advanced alongside art and entertainment, making it easier for people short on time to find and consume good content. One of the key challenges for companies is therefore to figure out what kind of content their customers are most likely to consume. Almost every internet-based company's revenue relies on the time consumers spend on its platform, so these companies need to know what content will increase that time and improve the customer experience. Spotify is one such audio content provider with a huge market base across the world. It has grown significantly because of its ability to recommend the 'best' next song to every customer, drawing on the huge preference database it has gathered over time from millions of customers and billions of songs. This is done using smart recommendation systems that can recommend songs based on users' likes and dislikes.
Objective¶
To recommend songs to a user based on their likelihood of liking those songs.
The key questions¶
- What songs has each user listened to?
- What are the most favored songs and artists?
Problem Formulation¶
Build a recommendation system to propose the top 10 songs for a user based on the likelihood of listening to those songs.
Data Dictionary¶
The core data is the Taste Profile Subset released by The Echo Nest as part of the Million Song Dataset. There are two files in this dataset. One contains the details about the song id, titles, release, artist name, and the year of release. The second file contains the user id, song id, and the play count of users.
song_data
- song_id - A unique id given to every song
- title - Title of the song
- release - Name of the released album
- artist_name - Name of the artist
- year - Year of release
count_data
- user_id - A unique id given to the user
- song_id - A unique id given to the song
- play_count - Number of times the song was played
Data Source¶
Importing libraries and Reading dataset¶
#Mounting the drive
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
import warnings # Used to ignore the warning given as output of the code.
warnings.filterwarnings('ignore')
import numpy as np # Basic libraries of python for numeric and dataframe computations.
import pandas as pd
import matplotlib.pyplot as plt # Basic library for data visualization.
import seaborn as sns # Slightly advanced library for data visualization
from sklearn.metrics.pairwise import cosine_similarity # To compute the cosine similarity between two vectors.
from collections import defaultdict # A dictionary output that does not raise a key error
from sklearn.metrics import mean_squared_error # A performance metrics in sklearn.
#importing the datasets
count_df = pd.read_csv('/content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week_Seven_-_Recommendation_Systems/Music_Recommendation_System/count_data.csv')
song_df = pd.read_csv('/content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week_Seven_-_Recommendation_Systems/Music_Recommendation_System/song_data.csv')
count_df.shape
(2000000, 4)
song_df.shape
(1000000, 5)
Understanding the data by viewing a few observations¶
count_df.head(10)
Unnamed: 0 | user_id | song_id | play_count | |
---|---|---|---|---|
0 | 0 | b80344d063b5ccb3212f76538f3d9e43d87dca9e | SOAKIMP12A8C130995 | 1 |
1 | 1 | b80344d063b5ccb3212f76538f3d9e43d87dca9e | SOBBMDR12A8C13253B | 2 |
2 | 2 | b80344d063b5ccb3212f76538f3d9e43d87dca9e | SOBXHDL12A81C204C0 | 1 |
3 | 3 | b80344d063b5ccb3212f76538f3d9e43d87dca9e | SOBYHAJ12A6701BF1D | 1 |
4 | 4 | b80344d063b5ccb3212f76538f3d9e43d87dca9e | SODACBL12A8C13C273 | 1 |
5 | 5 | b80344d063b5ccb3212f76538f3d9e43d87dca9e | SODDNQT12A6D4F5F7E | 5 |
6 | 6 | b80344d063b5ccb3212f76538f3d9e43d87dca9e | SODXRTY12AB0180F3B | 1 |
7 | 7 | b80344d063b5ccb3212f76538f3d9e43d87dca9e | SOFGUAY12AB017B0A8 | 1 |
8 | 8 | b80344d063b5ccb3212f76538f3d9e43d87dca9e | SOFRQTD12A81C233C0 | 1 |
9 | 9 | b80344d063b5ccb3212f76538f3d9e43d87dca9e | SOHQWYZ12A6D4FA701 | 1 |
song_df
song_id | title | release | artist_name | year | |
---|---|---|---|---|---|
0 | SOQMMHC12AB0180CB8 | Silent Night | Monster Ballads X-Mas | Faster Pussy cat | 2003 |
1 | SOVFVAK12A8C1350D9 | Tanssi vaan | Karkuteillä | Karkkiautomaatti | 1995 |
2 | SOGTUKN12AB017F4F1 | No One Could Ever | Butter | Hudson Mohawke | 2006 |
3 | SOBNYVR12A8C13558C | Si Vos Querés | De Culo | Yerba Brava | 2003 |
4 | SOHSBXH12A8C13B0DF | Tangle Of Aspens | Rene Ablaze Presents Winter Sessions | Der Mystic | 0 |
... | ... | ... | ... | ... | ... |
999995 | SOTXAME12AB018F136 | O Samba Da Vida | Pacha V.I.P. | Kiko Navarro | 0 |
999996 | SOXQYIQ12A8C137FBB | Jago Chhadeo | Naale Baba Lassi Pee Gya | Kuldeep Manak | 0 |
999997 | SOHODZI12A8C137BB3 | Novemba | Dub_Connected: electronic music | Gabriel Le Mar | 0 |
999998 | SOLXGOR12A81C21EB7 | Faraday | The Trance Collection Vol. 2 | Elude | 0 |
999999 | SOWXJXQ12AB0189F43 | Fernweh feat. Sektion Kuchikäschtli | So Oder So | Texta | 2004 |
1000000 rows × 5 columns
Let us check the data types and missing values of each column¶
count_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 0 to 1999999
Data columns (total 4 columns):
 #   Column      Dtype
---  ------      -----
 0   Unnamed: 0  int64
 1   user_id     object
 2   song_id     object
 3   play_count  int64
dtypes: int64(2), object(2)
memory usage: 61.0+ MB
song_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 5 columns):
 #   Column       Non-Null Count    Dtype
---  ------       --------------    -----
 0   song_id      1000000 non-null  object
 1   title        999983 non-null   object
 2   release      999993 non-null   object
 3   artist_name  1000000 non-null  object
 4   year         1000000 non-null  int64
dtypes: int64(1), object(4)
memory usage: 38.1+ MB
Observations and Insights:¶
- The count_df dataframe contains user_id, song_id, and the number of times a particular song has been played by a particular user. There are 4 columns and 2,000,000 observations in the dataset.
- The Unnamed: 0 column appears to be the index of the dataframe. We can drop this column.
- The song_df data has information/features of the song - title, released album, artist name, and year of release. There are 5 columns and 1,000,000 observations in the dataset.
df = pd.merge(count_df, song_df.drop_duplicates(['song_id']), on="song_id", how="left")
df = df.drop(['Unnamed: 0'],axis=1)
df
user_id | song_id | play_count | title | release | artist_name | year | |
---|---|---|---|---|---|---|---|
0 | b80344d063b5ccb3212f76538f3d9e43d87dca9e | SOAKIMP12A8C130995 | 1 | The Cove | Thicker Than Water | Jack Johnson | 0 |
1 | b80344d063b5ccb3212f76538f3d9e43d87dca9e | SOBBMDR12A8C13253B | 2 | Entre Dos Aguas | Flamenco Para Niños | Paco De Lucia | 1976 |
2 | b80344d063b5ccb3212f76538f3d9e43d87dca9e | SOBXHDL12A81C204C0 | 1 | Stronger | Graduation | Kanye West | 2007 |
3 | b80344d063b5ccb3212f76538f3d9e43d87dca9e | SOBYHAJ12A6701BF1D | 1 | Constellations | In Between Dreams | Jack Johnson | 2005 |
4 | b80344d063b5ccb3212f76538f3d9e43d87dca9e | SODACBL12A8C13C273 | 1 | Learn To Fly | There Is Nothing Left To Lose | Foo Fighters | 1999 |
... | ... | ... | ... | ... | ... | ... | ... |
1999995 | d8bfd4ec88f0f3773a9e022e3c1a0f1d3b7b6a92 | SOJEYPO12AAA8C6B0E | 2 | Ignorance (Album Version) | Ignorance | Paramore | 0 |
1999996 | d8bfd4ec88f0f3773a9e022e3c1a0f1d3b7b6a92 | SOJJYDE12AF729FC16 | 4 | Two Is Better Than One | Love Drunk | Boys Like Girls featuring Taylor Swift | 2009 |
1999997 | d8bfd4ec88f0f3773a9e022e3c1a0f1d3b7b6a92 | SOJKQSF12A6D4F5EE9 | 3 | What I've Done (Album Version) | What I've Done | Linkin Park | 2007 |
1999998 | d8bfd4ec88f0f3773a9e022e3c1a0f1d3b7b6a92 | SOJUXGA12AC961885C | 1 | Up | My Worlds | Justin Bieber | 2010 |
1999999 | d8bfd4ec88f0f3773a9e022e3c1a0f1d3b7b6a92 | SOJYOLS12A8C13C06F | 1 | Soil_ Soil (Album Version) | The Con | Tegan And Sara | 2007 |
2000000 rows × 7 columns
df.play_count.describe()
play_count | |
---|---|
count | 2.000000e+06 |
mean | 3.045485e+00 |
std | 6.579720e+00 |
min | 1.000000e+00 |
25% | 1.000000e+00 |
50% | 1.000000e+00 |
75% | 3.000000e+00 |
max | 2.213000e+03 |
Here the columns song_id and user_id are encrypted to provide anonymity. To make the dataset easier to process, we will label-encode these two variables.
#label encoding code
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['user_id'] = le.fit_transform(df['user_id'])
df['song_id'] = le.fit_transform(df['song_id'])
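Note that the snippet above re-fits the same LabelEncoder on both columns, so the fitted user mapping is overwritten by the song mapping. If we later want to map encoded ids back to the original encrypted hashes, a minimal variant (our own sketch; the names user_le and song_le are hypothetical) keeps one encoder per column:
user_le = LabelEncoder()
song_le = LabelEncoder()
df['user_id'] = user_le.fit_transform(df['user_id'])
df['song_id'] = song_le.fit_transform(df['song_id'])
# user_le.inverse_transform([6958]) would then recover the original user hash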
As this dataset is very large (2,000,000 observations), it is computationally impractical to build a model using all of it. Moreover, many users have listened to only a few songs, and some songs have been listened to by very few users. Hence we can reduce the dataset using some logical assumptions.
Here, we will keep only users who have listened to at least 90 songs, and songs that have been listened to by at least 120 users.
# Get the column containing the users
users = df.user_id
# Create a dictionary that maps each user (listener) to the number of songs they have listened to
playing_count = dict()
for user in users:
    # If we already have the user, just add 1 to their playing count
    if user in playing_count:
        playing_count[user] += 1
    # Otherwise, set their playing count to 1
    else:
        playing_count[user] = 1
# We want our users to have listened to at least 90 songs
SONG_COUNT_CUTOFF = 90
# Create a list of users who need to be removed
remove_users = []
for user, num_songs in playing_count.items():
    if num_songs < SONG_COUNT_CUTOFF:
        remove_users.append(user)
df = df.loc[~df.user_id.isin(remove_users)]
# Get the column containing the songs
songs = df.song_id
# Create a dictionary that maps each song to its number of users (listeners)
playing_count = dict()
for song in songs:
    # If we already have the song, just add 1 to its playing count
    if song in playing_count:
        playing_count[song] += 1
    # Otherwise, set its playing count to 1
    else:
        playing_count[song] = 1
# We want our songs to have been listened to by at least 120 users to be considered
LISTENER_COUNT_CUTOFF = 120
remove_songs = []
for song, num_users in playing_count.items():
    if num_users < LISTENER_COUNT_CUTOFF:
        remove_songs.append(song)
df_final = df.loc[~df.song_id.isin(remove_songs)]
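The two counting loops above can also be written more compactly with pandas; a sketch of an equivalent vectorized version (same cutoffs, our own variable names):
# Keep users with at least SONG_COUNT_CUTOFF interactions
user_counts = df.user_id.value_counts()
df = df[df.user_id.isin(user_counts[user_counts >= SONG_COUNT_CUTOFF].index)]
# Keep songs with at least LISTENER_COUNT_CUTOFF listeners
song_counts = df.song_id.value_counts()
df_final = df[df.song_id.isin(song_counts[song_counts >= LISTENER_COUNT_CUTOFF].index)]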
Records with a play_count less than or equal to 5 make up almost 90% of the data. So for building the recommendation system, let us consider only those records.
# Keep only records of songs with play_count less than or equal to (<=) 5
df_final=df_final[df_final.play_count<=5]
df_final.shape
(117876, 7)
df_final.groupby("play_count").count()
user_id | song_id | title | release | artist_name | year | |
---|---|---|---|---|---|---|
play_count | ||||||
1 | 72473 | 72473 | 72473 | 72473 | 72473 | 72473 |
2 | 23890 | 23890 | 23890 | 23890 | 23890 | 23890 |
3 | 10774 | 10774 | 10774 | 10774 | 10774 | 10774 |
4 | 5874 | 5874 | 5874 | 5874 | 5874 | 5874 |
5 | 4865 | 4865 | 4865 | 4865 | 4865 | 4865 |
Exploratory Data Analysis¶
Let's check the total number of unique users, songs, artists in the data¶
Total number of unique user id
df_final['user_id'].nunique()
3155
Total number of unique song id
df_final['song_id'].nunique()
563
Total number of unique artists
df_final['artist_name'].nunique()
232
Observations and Insights:¶
- There are 3155 unique users, 563 unique songs, and 232 artists in the final dataset.
Let's find out the most interacted-with songs and the most active users¶
Most interacted-with songs
df_final['title'].value_counts()
title
Use Somebody                       751
Dog Days Are Over (Radio Edit)     748
Sehr kosmisch                      713
Clocks                             662
The Scientist                      652
                                  ...
Who's Real                         103
Brave The Elements                 102
Creil City                         101
Heaven Must Be Missing An Angel     97
The Big Gundown                     96
Name: count, Length: 561, dtype: int64
Most active users
df_final['user_id'].value_counts()
user_id
61472    243
15733    227
37049    202
9570     184
23337    177
        ...
19776      1
45476      1
17961      1
14439      1
10412      1
Name: count, Length: 3155, dtype: int64
Observations and Insights:¶
- The song 'Use Somebody' appears most often in the interactions data, i.e., it has been played by the most users.
- The user with ID 61472 has interacted with the most songs.
Songs released on a yearly basis
count_songs = song_df.groupby('year').count()['title']
count = pd.DataFrame(count_songs)
count.drop(count.index[0], inplace=True)
count.tail()
title | |
---|---|
year | |
2007 | 39414 |
2008 | 34770 |
2009 | 31051 |
2010 | 9397 |
2011 | 1 |
plt.figure(figsize=(30, 10))
ax = sns.barplot(x=count.index,
                 y='title',
                 data=count,
                 estimator=np.median)
for item in ax.get_xticklabels():
    item.set_rotation(90)
plt.ylabel('number of songs released')
# Show the plot
plt.show()
Observations and Insights:¶
- We can observe that the number of songs released in a year has been increasing over the years.
- As per the data, the highest number of songs was released in 2007 i.e. 39,414
- There is a decrease in the number of songs released in 2010. Since 2010 is the last year, the decrease might be because we have only partial data from the year 2010.
Now that we have explored the data, let's apply different algorithms to build recommendation systems
Popularity Based Recommendation Systems¶
Rank-based recommendation systems provide recommendations based on the most popular songs. This kind of recommendation system is useful when we have a cold start problem. Cold start refers to the issue of a new user entering the system: the machine cannot recommend songs to the new user because that user has no historical interactions in the dataset. In those cases, we can use a rank-based recommendation system to recommend songs to the new user.
To build the rank-based recommendation system, we take the average of the play_counts for each song and then rank the songs by their average play_count.
#Calculating average play_count
average_count = df_final.groupby('song_id')['play_count'].mean()
#Calculating the frequency a song is played.
play_freq = df_final.groupby('song_id')['play_count'].count()
#Making a dataframe with the average_count and play_freq
final_play = pd.DataFrame({'avg_count':average_count, 'play_freq':play_freq})
Now, let's create a function to find the top n songs to recommend, based on the average play_count of each song. We can also add a threshold for the minimum number of play_counts a song needs in order to be considered for recommendation.
def top_n_songs(data, n, min_interactions=100):
    # Finding songs with a minimum number of play_counts
    recommendations = data[data['play_freq'] > min_interactions]
    # Sorting values w.r.t. average count
    recommendations = recommendations.sort_values(by='avg_count', ascending=False)
    return recommendations.index[:n]
We can use this function with different values of n and minimum interactions to get songs to recommend
Recommending top 10 Songs with 100 minimum interactions based on popularity¶
list(top_n_songs(final_play, 10, 100))
[7224, 6450, 9942, 5531, 5653, 8483, 2220, 657, 614, 352]
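These are label-encoded song ids. To present them more readably, a small lookup (our own helper, not part of the original notebook) can pull the corresponding titles from df_final:
top_ids = list(top_n_songs(final_play, 10, 100))
df_final[df_final.song_id.isin(top_ids)][['song_id', 'title', 'artist_name']].drop_duplicates('song_id')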
Collaborative Filtering Based Recommendation System¶
In this type of recommendation system, we do not need any information about the users or songs. We only need user-item interaction data to build a collaborative recommendation system. Examples of such interaction data include:
- Ratings provided by users, e.g., ratings of books on Goodreads or of movies on IMDb
- Likes of users on different Facebook posts, likes on YouTube videos
- Use/purchase of a product by users, e.g., buying different items on e-commerce sites
- Reading of articles by readers on various blogs
Types of Collaborative Filtering¶
- Similarity/Neighborhood based
- User User Similarity Based
- Item Item similarity based
- Model based
Building a baseline user-user similarity based recommendation system¶
- Below we build a similarity-based recommendation system using cosine similarity, with KNN used to find the users that are the nearest neighbors of a given user.
- We will be using a new library, surprise, to build the remaining models; let's first import the necessary classes and functions from this library.
- Please use the following code to install the surprise library. You only need to do this once, when running the code for the first time.
!pip install surprise
Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise (from surprise)
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 154.4/154.4 kB 2.4 MB/s eta 0:00:00
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.10/dist-packages (from scikit-surprise->surprise) (1.4.2)
Requirement already satisfied: numpy>=1.19.5 in /usr/local/lib/python3.10/dist-packages (from scikit-surprise->surprise) (1.25.2)
Requirement already satisfied: scipy>=1.6.0 in /usr/local/lib/python3.10/dist-packages (from scikit-surprise->surprise) (1.11.4)
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... done
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357248 sha256=a99df192f535f9ebc101bac5235e8df41a0cb57803d94bd7d6a50a718c12e5f0
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.4 surprise-0.1
# To compute the accuracy of models
from surprise import accuracy
# class is used to parse a file containing play_counts, data should be in structure - user; item ; play_count
from surprise.reader import Reader
# class for loading datasets
from surprise.dataset import Dataset
# for tuning model hyperparameters
from surprise.model_selection import GridSearchCV
# for splitting the data in train and test dataset
from surprise.model_selection import train_test_split
# for implementing similarity-based recommendation system
from surprise.prediction_algorithms.knns import KNNBasic
# for implementing matrix factorization based recommendation system
from surprise.prediction_algorithms.matrix_factorization import SVD
# for implementing KFold cross-validation
from surprise.model_selection import KFold
#For implementing clustering-based recommendation system
from surprise import CoClustering
Before building the recommendation systems, let's go over some basic terminologies we are going to use:¶
Relevant song - A song whose actual play_count is higher than the threshold (here 1.5) is relevant; if the actual play_count is below the threshold, then it is a non-relevant song.
Recommended song - A song whose predicted play_count is higher than the threshold (here 1.5) is a recommended song; if the predicted play_count is below the threshold, then that song will not be recommended to the user.
False Negative (FN) - It is the frequency of relevant songs that are not recommended to the user. If the relevant songs are not recommended to the user, then the user might not listen to the song. This would result in the loss of opportunity for the service provider which they would like to minimize.
False Positive (FP) - It is the frequency of recommended songs that are actually not relevant. In this case, the recommendation system is not doing a good job of finding and recommending the relevant songs to the user. This would result in loss of resources for the service provider which they would also like to minimize.
Recall - It is the fraction of actually relevant songs that are recommended to the user, i.e., if 6 out of 10 relevant songs are recommended, then recall is 0.60. The higher the recall, the better the model. It is one of the metrics used to assess the performance of classification models.
Precision - It is the fraction of recommended songs that are actually relevant, i.e., if the user finds 6 out of 10 recommended items relevant, then precision is 0.60. The higher the precision, the better the model. It is also one of the metrics used to assess the performance of classification models.
When building a recommendation system, it is customary to look at the model's performance in terms of how many recommendations are relevant and how many relevant items are recommended. Below are the two performance metrics most commonly used to assess recommendation systems.
Precision@k and Recall@k¶
Precision@k - It is the fraction of recommended songs that are relevant in the top k predictions. The value of k is the number of recommendations to be provided to the user. One can choose a variable number of recommendations to be given to a unique user.
Recall@k - It is the fraction of relevant songs that are recommended to the user in the top k predictions.
F1-Score@k - It is the harmonic mean of Precision@k and Recall@k. When precision@k and recall@k are both important, this metric is useful because it is representative of both. A worked example follows.
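To make these definitions concrete, here is a toy example with made-up numbers: suppose k = 3, the top 3 recommendations for a user contain 2 relevant songs, and the user has 4 relevant songs in total.
k = 3
n_rel_and_rec_k = 2  # relevant AND recommended in the top k
n_rec_k = 3          # recommended in the top k
n_rel = 4            # relevant overall
precision_at_k = n_rel_and_rec_k / n_rec_k  # 2/3 ~ 0.67
recall_at_k = n_rel_and_rec_k / n_rel       # 2/4 = 0.50
f1_at_k = 2 * precision_at_k * recall_at_k / (precision_at_k + recall_at_k)  # ~ 0.571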
Some useful functions¶
- Below function takes the recommendation model as input and gives the precision@k and recall@k for that model.
- To compute precision and recall, top k predictions are taken under consideration for each user.
def precision_recall_at_k(model, k=30, threshold=1.5):
    """Return precision and recall at k metrics for each user"""
    # First map the predictions to each user
    user_est_true = defaultdict(list)
    # Making predictions on the test data
    predictions = model.test(testset)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))
    precisions = dict()
    recalls = dict()
    for uid, playing_count in user_est_true.items():
        # Sort play counts by estimated value
        playing_count.sort(key=lambda x: x[0], reverse=True)
        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in playing_count)
        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in playing_count[:k])
        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in playing_count[:k])
        # Precision@K: proportion of recommended items that are relevant
        # When n_rec_k is 0, Precision is undefined, so we set it to 0
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0
        # Recall@K: proportion of relevant items that are recommended
        # When n_rel is 0, Recall is undefined, so we set it to 0
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0
    # Mean of all the per-user precisions
    precision = round((sum(prec for prec in precisions.values()) / len(precisions)), 3)
    # Mean of all the per-user recalls
    recall = round((sum(rec for rec in recalls.values()) / len(recalls)), 3)
    accuracy.rmse(predictions)
    print('Precision: ', precision)  # Print the overall precision
    print('Recall: ', recall)  # Print the overall recall
    print('F_1 score: ', round((2 * precision * recall) / (precision + recall), 3))  # Compute the F_1 score
Below we load the dataset, which is a pandas dataframe, into a different format called surprise.dataset.DatasetAutoFolds, which is required by this library. To do this we use the classes Reader and Dataset.
You will also notice here that we read the dataset by providing a scale of ratings. However, as you would know, we do not have ratings data for the songs. In this case, we are going to use play_count as a proxy for ratings, with the assumption that the more a user listens to a song, the higher the chance that they like it.
# instantiating Reader scale with expected rating scale
reader = Reader(rating_scale=(0, 5))
# loading the dataset
data = Dataset.load_from_df(df_final[['user_id', 'song_id', 'play_count']], reader)
# splitting the data into train and test dataset
trainset, testset = train_test_split(data, test_size=0.40, random_state=42)
- Now we are ready to build the first baseline similarity-based recommendation system using cosine similarity.
- KNNBasic is an algorithm from the surprise package; it is used to find the desired similar items among a given set of items.
- To compute precision and recall, a threshold of 1.5 and a k value of 30 are used for the recommended and relevant play counts.
- The intuition behind the threshold of 1.5 is that if the model predicts that a user will listen to a song more than 1.5 times (think of it as 2 out of 3 if a non-integer value is hard to interpret), then that song should be recommended to that user.
- In the present case, precision and recall both need to be optimized, as the service provider would like to minimize both of the losses discussed above. Hence, the appropriate performance measure is the F_1 score.
#Declaring the similarity options.
sim_options = {'name': 'cosine',
'user_based': True}
#KNN algorithm is used to find desired similar items.
sim_user_user = KNNBasic(sim_options=sim_options, verbose=False, random_state=1)
# Train the algorithm on the trainset, and predict play_count for the testset
sim_user_user.fit(trainset)
# Let us compute precision@k, recall@k, and f_1 score with k =30.
precision_recall_at_k(sim_user_user)
RMSE: 1.0878
Precision:  0.396
Recall:  0.692
F_1 score:  0.504
- We have calculated RMSE to check how far the overall predicted play counts are from the actual play counts.
- Intuition of Recall - We are getting a recall of almost 0.70, which means out of all the relevant songs, 70% are recommended.
- Intuition of Precision - We are getting a precision of almost 0.396, which means out of all the recommended songs, 39.6% are relevant.
- Here the F_1 score of the baseline model is almost 0.504, which indicates that a good share of the recommended songs were relevant and a good share of the relevant songs were recommended. We will try to improve this later by tuning different hyperparameters of this algorithm with GridSearchCV.
Let's now predict the play_count for a user with user_id=6958 and song_id=1671, as shown below.
# predicting play_count for a sample user with a listened song.
sim_user_user.predict(6958, 1671, r_ui=2, verbose=True)
user: 6958 item: 1671 r_ui = 2.00 est = 1.80 {'actual_k': 40, 'was_impossible': False}
Prediction(uid=6958, iid=1671, r_ui=2, est=1.8009387435128914, details={'actual_k': 40, 'was_impossible': False})
- The above output shows that the actual play count for this user-item pair is 2 and the predicted is 1.80 by this user-user-similarity-based baseline model.
Below we are predicting the play_count for the same user_id=6958, but for a song which this user has not listened to yet, i.e., song_id=3232.
#predicting play_count for a sample user with a song not-listened by the user.
sim_user_user.predict(6958, 3232, verbose=True)
user: 6958 item: 3232 r_ui = None est = 1.64 {'actual_k': 40, 'was_impossible': False}
Prediction(uid=6958, iid=3232, r_ui=None, est=1.6386860897998294, details={'actual_k': 40, 'was_impossible': False})
As we can see the predicted play count for this user-item pair is 1.64 based on this user-user-similarity-based baseline model.
Improving similarity-based recommendation system by tuning its hyper-parameters¶
Below we will be tuning hyperparameters for the KNNBasic algorithm. Let's try to understand some of its hyperparameters:
- k (int) – The (max) number of neighbors to take into account for aggregation. Default is 40.
- min_k (int) – The minimum number of neighbors to take into account for aggregation. If there are not enough neighbors, the prediction is set to the global mean of all play_counts. Default is 1.
- sim_options (dict) – A dictionary of options for the similarity measure. There are four similarity measures available in surprise:
- cosine
- msd (default)
- pearson
- pearson_baseline
# setting up parameter grid to tune the hyperparameters
param_grid = {'k': [10, 20, 30], 'min_k': [3, 6, 9],
'sim_options': {'name': ["cosine",'pearson',"pearson_baseline"],
'user_based': [True], "min_support":[2,4]}
}
# performing 3-fold cross validation to tune the hyperparameters
gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse'], cv=3, n_jobs=-1)
# fitting the data
gs.fit(data)
# best RMSE score
print(gs.best_score['rmse'])
# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])
1.0462441381791592 {'k': 30, 'min_k': 9, 'sim_options': {'name': 'pearson_baseline', 'user_based': True, 'min_support': 2}}
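If we want to inspect every combination the grid search tried, not just the best one, surprise's GridSearchCV exposes a cv_results dict that can be loaded into a dataframe (a small sketch; the column selection below is an assumption about which fields are of interest):
# View all tried parameter combinations with their mean RMSE and rank
results_df = pd.DataFrame(gs.cv_results)
results_df[['params', 'mean_test_rmse', 'rank_test_rmse']].head()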
Once the grid search is complete, we can get the optimal values for each of those hyperparameters as shown above
Now let's build the final model by using tuned values of the hyperparameters which we received by using grid search cross-validation
# using the optimal similarity measure for user-user based collaborative filtering
sim_options = {'name': 'pearson_baseline',
'user_based': True, "min_support":2}
# creating an instance of KNNBasic with optimal hyperparameter values
sim_user_user_optimized = KNNBasic(sim_options=sim_options, k=30, min_k=9, random_state=1, verbose=False)
# training the algorithm on the trainset
sim_user_user_optimized.fit(trainset)
# Let us compute precision@k and recall@k also with k =30.
precision_recall_at_k(sim_user_user_optimized)
RMSE: 1.0521
Precision:  0.413
Recall:  0.721
F_1 score:  0.525
- We can see from the above that after tuning hyperparameters, the F_1 score of the tuned model is better than that of the baseline model. Along with this, the RMSE of the model has gone down compared to the model before hyperparameter tuning. Hence, we can say that model performance has improved with hyperparameter tuning.
Let us now predict the play_count for the user with user_id=6958 and song_id=1671 with the optimized model, as shown below.
sim_user_user_optimized.predict(6958, 1671, r_ui=2, verbose=True)
user: 6958 item: 1671 r_ui = 2.00 est = 1.96 {'actual_k': 24, 'was_impossible': False}
Prediction(uid=6958, iid=1671, r_ui=2, est=1.962926073914969, details={'actual_k': 24, 'was_impossible': False})
- Here the model gives a good prediction in comparison to the actual play_count (2).
Below we are predicting the play_count for the same user_id=6958, but for a song which this user has not listened to before, i.e., song_id=3232, using the optimized model -
sim_user_user_optimized.predict(6958,3232, verbose=True)
user: 6958 item: 3232 r_ui = None est = 1.45 {'actual_k': 10, 'was_impossible': False}
Prediction(uid=6958, iid=3232, r_ui=None, est=1.4516261428486725, details={'actual_k': 10, 'was_impossible': False})
Identifying similar users to a given user (nearest neighbors)¶
We can also find the users most similar to a given user, i.e., its nearest neighbors, with this KNNBasic algorithm. Below we find the 5 users most similar to the first user in the list (internal id 0), based on the tuned pearson_baseline similarity measure.
sim_user_user_optimized.get_neighbors(0,5) #Here 0 is the inner id of the above user.
[42, 1131, 17, 186, 249]
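One caveat: get_neighbors returns surprise's inner ids, which are not the same as the label-encoded user_ids in df_final, so indexing df_final directly with these values (as in the next cell) mixes the two id spaces. A sketch (our own) of converting inner ids back to raw user_ids first:
# get_neighbors works with inner ids; convert them to raw user_ids before lookups
inner_neighbors = sim_user_user_optimized.get_neighbors(0, 5)
raw_neighbors = [trainset.to_raw_uid(inner_id) for inner_id in inner_neighbors]
df_final[df_final.user_id.isin(raw_neighbors)].head()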
df_final.iloc[1131,:]
user_id                    51415
song_id                     2115
play_count                     1
title                   Tive Sim
release         Nova Bis-Cartola
artist_name              Cartola
year                        1974
Name: 15513, dtype: object
df_final[df_final.user_id==6958]
user_id | song_id | play_count | title | release | artist_name | year | |
---|---|---|---|---|---|---|---|
200 | 6958 | 447 | 1 | Daisy And Prudence | Distillation | Erin McKeown | 2000 |
202 | 6958 | 512 | 1 | The Ballad of Michael Valentine | Sawdust | The Killers | 2004 |
203 | 6958 | 549 | 1 | I Stand Corrected (Album) | Vampire Weekend | Vampire Weekend | 2007 |
204 | 6958 | 703 | 1 | They Might Follow You | Tiny Vipers | Tiny Vipers | 2007 |
205 | 6958 | 719 | 1 | Monkey Man | You Know I'm No Good | Amy Winehouse | 2007 |
206 | 6958 | 892 | 1 | Bleeding Hearts | Hell Train | Soltero | 0 |
209 | 6958 | 1050 | 5 | Wet Blanket | Old World Underground_ Where Are You Now? | Metric | 2003 |
213 | 6958 | 1480 | 1 | Fast As I Can | Monday Morning Cold | Erin McKeown | 2000 |
215 | 6958 | 1671 | 2 | Sleeping In (Album) | Give Up | Postal Service | 2003 |
216 | 6958 | 1752 | 1 | Gimme Sympathy | Gimme Sympathy | Metric | 2009 |
217 | 6958 | 1756 | 1 | You Mustn't Kick It Around | Distillation | Erin McKeown | 2000 |
218 | 6958 | 1787 | 2 | Help I'm Alive | Fantasies | Metric | 2009 |
219 | 6958 | 1818 | 1 | Teenager | Modapop | Camera Obscura | 0 |
221 | 6958 | 2107 | 1 | Stadium Love | Fantasies | Metric | 2009 |
225 | 6958 | 2289 | 1 | Satellite Mind | Fantasies | Metric | 2009 |
226 | 6958 | 2304 | 1 | Daddy's Eyes | Sawdust | The Killers | 2006 |
227 | 6958 | 2425 | 1 | Señorita | Justified | Justin Timberlake | 2002 |
228 | 6958 | 2501 | 1 | Camaro | Because Of The Times | Kings Of Leon | 2007 |
232 | 6958 | 2701 | 1 | Tron | Antidotes | Foals | 2008 |
235 | 6958 | 2898 | 1 | Twilight Galaxy | Fantasies | Metric | 2009 |
237 | 6958 | 2994 | 1 | Elephant Gun | The Gulag Orkestar | Beirut | 2006 |
239 | 6958 | 3074 | 1 | Catch You Baby (Steve Pitron & Max Sanna Radio... | Catch You Baby | Lonnie Gordon | 0 |
244 | 6958 | 3491 | 1 | Bling (Confession Of A King) | Sam's Town | The Killers | 2006 |
246 | 6958 | 3551 | 1 | You're A Cad | Ray Guns Are Not Just The Future | the bird and the bee | 2009 |
247 | 6958 | 3718 | 2 | The Penalty | The Flying Club Cup | Beirut | 2007 |
249 | 6958 | 3801 | 1 | Baby | Ray Guns Are Not Just The Future | the bird and the bee | 2009 |
251 | 6958 | 3907 | 1 | What's In The Middle | Ray Guns Are Not Just The Future | the bird and the bee | 2009 |
262 | 6958 | 5193 | 1 | Goodnight Bad Morning | Midnight Boom | The Kills | 2008 |
264 | 6958 | 5340 | 1 | Postcards From Italy | The Gulag Orkestar | Beirut | 2005 |
267 | 6958 | 5441 | 1 | Where The White Boys Dance | Sam's Town | The Killers | 2006 |
269 | 6958 | 5566 | 5 | The Bachelor and the Bride | Her Majesty The Decemberists | The Decemberists | 2003 |
271 | 6958 | 5894 | 1 | Caring Is Creepy | Garden State - Music From The Motion Picture | The Shins | 2001 |
272 | 6958 | 6305 | 1 | Rhode Island Is Famous For You | Sing You Sinners | Erin McKeown | 2007 |
280 | 6958 | 7738 | 1 | Nantes | The Flying Club Cup | Beirut | 2007 |
281 | 6958 | 8029 | 1 | I CAN'T GET STARTED | It's The Time | Ron Carter | 0 |
282 | 6958 | 8037 | 1 | Gold Guns Girls | Fantasies | Metric | 2009 |
286 | 6958 | 8425 | 1 | Love Letter To Japan | Ray Guns Are Not Just The Future | the bird and the bee | 2009 |
290 | 6958 | 9065 | 1 | Balloons (Single version) | Balloons | Foals | 2007 |
293 | 6958 | 9351 | 2 | The Police And The Private | Live It Out | Metric | 2005 |
Implementing the recommendation algorithm based on optimized KNNBasic model¶
Below we will be implementing a function where the input parameters are -
- data: a song dataset
- user_id: a user id against which we want the recommendations
- top_n: the number of songs we want to recommend
- algo: the algorithm we want to use for predicting the play_count
- The output of the function is a set of top_n items recommended for the given user_id based on the given algorithm
def get_recommendations(data, user_id, top_n, algo):
    # Creating an empty list to store the recommended song ids
    recommendations = []
    # Creating a user-item interactions matrix
    user_item_interactions_matrix = data.pivot(index='user_id', columns='song_id', values='play_count')
    # Extracting those song ids which the user_id has not played yet
    non_interacted_songs = user_item_interactions_matrix.loc[user_id][user_item_interactions_matrix.loc[user_id].isnull()].index.tolist()
    # Looping through each of the song ids which user_id has not interacted with yet
    for item_id in non_interacted_songs:
        # Predicting the play_count for those non-played song ids by this user
        est = algo.predict(user_id, item_id).est
        # Appending the predicted play_count
        recommendations.append((item_id, est))
    # Sorting the predicted play_counts in descending order
    recommendations.sort(key=lambda x: x[1], reverse=True)
    return recommendations[:top_n]  # returning the top n songs with the highest predicted play_count for this user
Predicted top 5 songs for user_id=6958 with the user-user similarity-based recommendation system, ranked by recommendation score (the model's predicted play_count, which serves as the likelihood that the user will like the song)¶
#Making top 5 recommendations for user_id 6958 with a similarity-based recommendation engine.
recommendations = get_recommendations(df_final,6958, 5, sim_user_user)
#Building the dataframe for above recommendations with columns "song_id" and "recommendation_score"
pd.DataFrame(recommendations, columns=['song_id', 'predicted_play_count'])
song_id | predicted_play_count | |
---|---|---|
0 | 7224 | 3.141147 |
1 | 614 | 2.525000 |
2 | 5653 | 2.514023 |
3 | 352 | 2.425000 |
4 | 6450 | 2.394927 |
Correcting the play_counts and ranking the above songs¶
When comparing two songs, the predicted play_count alone does not fully describe how much the user is likely to like a song; the number of users who have listened to that song also matters. For this reason, we calculate a "corrected_play_count" for each song. Generally, the higher the play_count of a song, the more it is liked by users; but a song with an average play count of 4 from only 3 listeners is less trustworthy than a song with an average play count of 3 from 50 listeners. Since the reliability of a song's predicted play_count grows with the number of listeners, we penalize each prediction by subtracting the inverse of the square root of the song's play frequency, i.e., 1/sqrt(play_freq).
def ranking_songs(recommendations, playing_count):
    # Sort the songs based on play frequency
    ranked_songs = playing_count.loc[[items[0] for items in recommendations]].sort_values('play_freq', ascending=False)[['play_freq']].reset_index()
    # Merge with the recommended songs to get the predicted play_count
    ranked_songs = ranked_songs.merge(pd.DataFrame(recommendations, columns=['song_id', 'predicted_play_count']), on='song_id', how='inner')
    # Rank the songs based on corrected play_counts
    ranked_songs['corrected_play_count'] = ranked_songs['predicted_play_count'] - 1 / np.sqrt(ranked_songs['play_freq'])
    # Sort the songs based on corrected play_counts
    ranked_songs = ranked_songs.sort_values('corrected_play_count', ascending=False)
    return ranked_songs
#Applying the ranking_songs function and sorting it based on corrected play_counts.
ranking_songs(recommendations, final_play)
song_id | play_freq | predicted_play_count | corrected_play_count | |
---|---|---|---|---|
3 | 7224 | 107 | 3.141147 | 3.044473 |
1 | 614 | 373 | 2.525000 | 2.473222 |
2 | 5653 | 108 | 2.514023 | 2.417798 |
0 | 352 | 748 | 2.425000 | 2.388436 |
4 | 6450 | 102 | 2.394927 | 2.295913 |
Item-Item similarity-based collaborative filtering recommendation system¶
- Above we saw similarity-based collaborative filtering where similarity is computed between users. Now let us look at similarity-based collaborative filtering where similarity is computed between songs.
#Declaring the similarity options.
sim_options = {'name': 'pearson',
'user_based': False}
#KNN algorithm is used to find desired similar items.
sim_item_item = KNNBasic(sim_options=sim_options, random_state=1, verbose=False)
# Train the algorithm on the trainset, and predict play_count for the testset
sim_item_item.fit(trainset)
# Let us compute precision@k, recall@k, and f_1 score with k =30.
precision_recall_at_k(sim_item_item)
RMSE: 1.0588
Precision:  0.376
Recall:  0.538
F_1 score:  0.443
- The baseline model gives a reasonable F_1 score. We will try to improve this later by tuning different hyperparameters of this algorithm with GridSearchCV.
Let's now predict the play_count for the user with user_id=6958 and song_id=1671, as shown below. Here the user has already listened to the song with song_id 1671.
#predicting play_count for a sample user with a listened song.
sim_item_item.predict(6958, 1671, r_ui=2, verbose=True)
user: 6958 item: 1671 r_ui = 2.00 est = 1.92 {'actual_k': 10, 'was_impossible': False}
Prediction(uid=6958, iid=1671, r_ui=2, est=1.91669781984001, details={'actual_k': 10, 'was_impossible': False})
- The above output shows that item-item similarity based model is making a good prediction where the actual play_count is 2.
Below we are predicting play count for the same userId=6958 but for a song which this user has not listened to yet i.e. song_id=3232
#predicting play count for a sample user with song not listened to by the user.
sim_item_item.predict(6958,3232, verbose=True)
user: 6958 item: 3232 r_ui = None est = 1.00 {'actual_k': 5, 'was_impossible': False}
Prediction(uid=6958, iid=3232, r_ui=None, est=1.0, details={'actual_k': 5, 'was_impossible': False})
As we can see the predicted play_count for this user-song pair is low based on this item-item similarity-based baseline model.
Improving similarity-based recommendation system by tuning its hyper-parameters¶
Below we again tune the hyperparameters of the KNNBasic algorithm - k, min_k, and sim_options, as described in the user-user section above - this time with item-item similarity (user_based=False).
# setting up parameter grid to tune the hyperparameters
param_grid = {'k': [10, 20, 30], 'min_k': [3, 6, 9],
'sim_options': {'name': ["cosine",'pearson',"pearson_baseline"],
'user_based': [False], "min_support":[2,4]}
}
# performing 3-fold cross validation to tune the hyperparameters
gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse'], cv=3, n_jobs=-1)
# fitting the data
gs.fit(data)
# best RMSE score
print(gs.best_score['rmse'])
# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])
1.0228249658265955 {'k': 30, 'min_k': 6, 'sim_options': {'name': 'pearson_baseline', 'user_based': False, 'min_support': 2}}
Once the grid search is complete, we can get the optimal values for each of those hyperparameters as shown above
Now let's build the final model by using tuned values of the hyperparameters which we received by using grid search cross-validation
# using the optimal similarity measure for item-item based collaborative filtering
sim_options = {'name': 'pearson_baseline',
'user_based': False, "min_support":4}
# creating an instance of KNNBasic with optimal hyperparameter values
sim_item_item_optimized = KNNBasic(sim_options=sim_options, k=30, min_k=6, random_state=1, verbose=False)
# training the algorithm on the trainset
sim_item_item_optimized.fit(trainset)
# Let us compute precision@k and recall@k also with k =30.
precision_recall_at_k(sim_item_item_optimized)
RMSE: 1.0328
Precision:  0.405
Recall:  0.696
F_1 score:  0.512
- We can see from the above that after tuning hyperparameters, the F_1 score of the tuned model is much better than that of the baseline model. There is also a considerable drop in the RMSE value after tuning. Hence the tuned model is doing better than the earlier one.
Let us now predict the play_count for the user with user_id=6958 and song_id=1671 with the optimized model, as shown below.
sim_item_item_optimized.predict(6958, 1671, r_ui=2, verbose=True)
user: 6958 item: 1671 r_ui = 2.00 est = 1.96 {'actual_k': 10, 'was_impossible': False}
Prediction(uid=6958, iid=1671, r_ui=2, est=1.9634957386781853, details={'actual_k': 10, 'was_impossible': False})
- Here the optimized model is predicting a very good play_count (almost 1.96) for the song whose actual play_count is 2.
Below we are predicting the play_count for the same user_id=6958, but for a song which this user has not listened to before, i.e., song_id=3232, using the optimized model -
sim_item_item_optimized.predict(6958, 3232, verbose=True)
user: 6958 item: 3232 r_ui = None est = 1.70 {'was_impossible': True, 'reason': 'Not enough neighbors.'}
Prediction(uid=6958, iid=3232, r_ui=None, est=1.6989607635206787, details={'was_impossible': True, 'reason': 'Not enough neighbors.'})
- For this unseen song there were not enough neighbors, so the model falls back to a default estimate and predicts a play_count of 1.70.
Identifying similar songs to a given song (nearest neighbors)¶
We can also find the songs most similar to a given song, i.e., its nearest neighbors, with this KNNBasic algorithm. Below we find the 5 songs most similar to the song with internal id 0, based on the tuned pearson_baseline similarity measure.
sim_item_item_optimized.get_neighbors(0, k=5)
[124, 523, 173, 205, 65]
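As with the user-user case, these are inner ids; a sketch (ours) of converting them to raw song_ids and titles via surprise's trainset.to_raw_iid:
# Convert inner item ids to raw song_ids before looking up titles
inner_neighbors = sim_item_item_optimized.get_neighbors(0, k=5)
raw_song_ids = [trainset.to_raw_iid(inner_id) for inner_id in inner_neighbors]
df_final[df_final.song_id.isin(raw_song_ids)][['song_id', 'title']].drop_duplicates('song_id')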
Predicted top 5 songs for userId=6958 with similarity based recommendation system¶
#Making top 5 recommendations for user_id 6958 with similarity-based recommendation engine.
recommendations = get_recommendations(df_final, 6958, 5, sim_item_item)
#Building the dataframe for above recommendations with columns "song_id" and "predicted_play_count"
pd.DataFrame(recommendations, columns=['song_id', 'predicted_play_count'])
song_id | predicted_play_count | |
---|---|---|
0 | 750 | 5.000000 |
1 | 4377 | 4.206578 |
2 | 139 | 3.875420 |
3 | 5616 | 3.868549 |
4 | 861 | 3.840408 |
#Applying the ranking_songs function and sorting it based on corrected play_counts.
ranking_songs(recommendations, final_play)
song_id | play_freq | predicted_play_count | corrected_play_count | |
---|---|---|---|---|
2 | 750 | 123 | 5.000000 | 4.909833 |
0 | 4377 | 159 | 4.206578 | 4.127273 |
3 | 139 | 119 | 3.875420 | 3.783750 |
4 | 5616 | 113 | 3.868549 | 3.774477 |
1 | 861 | 126 | 3.840408 | 3.751321 |
- Now that we have seen similarity-based collaborative filtering algorithms, let us move on to model-based collaborative filtering algorithms.
Model Based Collaborative Filtering - Matrix Factorization¶
Model-based collaborative filtering is a personalized recommendation approach: the recommendations are based on the past behavior of the user and do not depend on any additional information. We use latent features to find recommendations for each user.
Singular Value Decomposition (SVD)¶
SVD is used to compute the latent features from the user-song matrix. However, classical SVD cannot be applied directly when the user-item matrix has missing values; the SVD algorithm used here instead learns the latent factors only from the observed entries, as described in the hyperparameter-tuning section below.
Building a baseline matrix factorization recommendation system¶
# using SVD matrix factorization
svd = SVD(random_state=1)
# training the algorithm on the trainset
svd.fit(trainset)
# Let us compute precision@k and recall@k with k =30.
precision_recall_at_k(svd)
RMSE: 1.0252
Precision:  0.41
Recall:  0.633
F_1 score:  0.498
- The baseline model gives a decent F_1 score of almost 0.498 (about 49.8%), indicating good performance by the model. The RMSE of the model is 1.0252.
- Let's now predict the play_count for the user with user_id=6958 and song_id=1671, as shown below. Here the user has already listened to the song.
#Making prediction.
svd.predict(6958, 1671, r_ui=2, verbose=True)
user: 6958 item: 1671 r_ui = 2.00 est = 1.27 {'was_impossible': False}
Prediction(uid=6958, iid=1671, r_ui=2, est=1.267473397214638, details={'was_impossible': False})
As we can see, the actual play_count for this user-song pair is 2 and the predicted play_count is 1.27 from this matrix factorization-based baseline model. It seems we have under-estimated the play_count. We will try to fix this later by tuning the hyperparameters of the model using GridSearchCV.
Below we are predicting the play_count for the same user_id=6958, but for a song which this user has not listened to before, i.e., song_id=3232, as shown below -
#Making prediction.
svd.predict(6958, 3232, verbose=True)
user: 6958 item: 3232 r_ui = None est = 1.56 {'was_impossible': False}
Prediction(uid=6958, iid=3232, r_ui=None, est=1.5561675084403663, details={'was_impossible': False})
We can see that the estimated play_count for this user-song pair is 1.56 based on this matrix factorization based baseline model.
Improving matrix factorization based recommendation system by tuning its hyper-parameters¶
In SVD, play_count is predicted as -
$$\hat{r}_{u i}=\mu+b_{u}+b_{i}+q_{i}^{T} p_{u}$$
If user $u$ is unknown, then the bias $b_{u}$ and the factors $p_{u}$ are assumed to be zero. The same applies for item $i$ with $b_{i}$ and $q_{i}$.
To estimate all the unknown, we minimize the following regularized squared error:
$$\sum_{r_{u i} \in R_{\text {train }}}\left(r_{u i}-\hat{r}_{u i}\right)^{2}+\lambda\left(b_{i}^{2}+b_{u}^{2}+\left\|q_{i}\right\|^{2}+\left\|p_{u}\right\|^{2}\right)$$
The minimization is performed by a very straightforward stochastic gradient descent:
$$\begin{aligned} b_{u} & \leftarrow b_{u}+\gamma\left(e_{u i}-\lambda b_{u}\right) \\ b_{i} & \leftarrow b_{i}+\gamma\left(e_{u i}-\lambda b_{i}\right) \\ p_{u} & \leftarrow p_{u}+\gamma\left(e_{u i} \cdot q_{i}-\lambda p_{u}\right) \\ q_{i} & \leftarrow q_{i}+\gamma\left(e_{u i} \cdot p_{u}-\lambda q_{i}\right) \end{aligned}$$
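To make the update rules concrete, here is a minimal NumPy-style sketch of one SGD pass over the observed play_counts (our own illustration, not the surprise implementation; the function name sgd_epoch and its parameters are hypothetical, with lr and reg standing in for $\gamma$ and $\lambda$):
def sgd_epoch(ratings, mu, bu, bi, P, Q, lr=0.005, reg=0.02):
    # ratings: iterable of (u, i, r_ui) tuples, with u and i as integer indexes
    # mu: global mean; bu, bi: bias arrays; P, Q: user and item factor matrices
    for u, i, r in ratings:
        pred = mu + bu[u] + bi[i] + Q[i] @ P[u]   # r_hat_ui
        e = r - pred                              # e_ui
        bu[u] += lr * (e - reg * bu[u])           # bias updates
        bi[i] += lr * (e - reg * bi[i])
        pu_old = P[u].copy()                      # use the pre-update p_u in the q_i update
        P[u] += lr * (e * Q[i] - reg * P[u])
        Q[i] += lr * (e * pu_old - reg * Q[i])
    return bu, bi, P, Q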
There are many hyperparameters to tune in this algorithm; the full list is available in the surprise documentation.
Below we will be tuning only three hyperparameters -
- n_epochs: The number of iterations of the SGD algorithm
- lr_all: The learning rate for all parameters
- reg_all: The regularization term for all parameters
# set the parameter space to tune
param_grid = {'n_epochs': [10, 20, 30], 'lr_all': [0.001, 0.005, 0.01],
'reg_all': [0.2, 0.4, 0.6]}
# performing 3-fold gridsearch cross validation
gs_ = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3, n_jobs=-1)
# fitting data
gs_.fit(data)
# best RMSE score
print(gs_.best_score['rmse'])
# combination of parameters that gave the best RMSE score
print(gs_.best_params['rmse'])
1.0123682332653112 {'n_epochs': 30, 'lr_all': 0.01, 'reg_all': 0.2}
Once the grid search is complete, we can get the optimal values for each of those hyperparameters as shown above
Now we will build the final model by using tuned values of the hyperparameters which we received by using grid search cross-validation
# building the optimized SVD model using optimal hyperparameter search
svd_optimized = SVD(n_epochs=30, lr_all=0.01, reg_all=0.2, random_state=1)
# training the algorithm on the trainset
svd_optimized=svd_optimized.fit(trainset)
# Let us compute precision@k and recall@k also with k =30.
precision_recall_at_k(svd_optimized)
RMSE: 1.0141
Precision:  0.415
Recall:  0.635
F_1 score:  0.502
- We can see from the above that the tuned model shows a slightly better F_1 score and a slightly lower RMSE than the baseline. Hence the tuned model is doing better than the earlier model.
Let's now predict the play_count for the user with user_id=6958 and song_id=1671 with the optimized model, as shown below.
#Using svd_algo_optimized model to recommend for userId 6958 and song_id 1671.
svd_optimized.predict(6958, 1671, r_ui=2, verbose=True)
user: 6958 item: 1671 r_ui = 2.00 est = 1.34 {'was_impossible': False}
Prediction(uid=6958, iid=1671, r_ui=2, est=1.3432395286125098, details={'was_impossible': False})
Here the predicted play_count is 1.34 for a song whose actual play_count is 2.
#Using svd_algo_optimized model to recommend for userId 6958 and song_id 3232 with unknown baseline play_count.
svd_optimized.predict(6958, 3232, verbose=True)
user: 6958 item: 3232 r_ui = None est = 1.44 {'was_impossible': False}
Prediction(uid=6958, iid=3232, r_ui=None, est=1.4425484461176483, details={'was_impossible': False})
For an unseen song the play_count given by the optimized model is 1.44.
#Getting top 5 recommendations for user_id 6958 using "svd_optimized" algorithm.
svd_recommendations = get_recommendations(df_final, 6958, 5, svd_optimized)
#Ranking songs based on above recommendations
ranking_songs(svd_recommendations, final_play)
song_id | play_freq | predicted_play_count | corrected_play_count | |
---|---|---|---|---|
2 | 7224 | 107 | 2.601899 | 2.505225 |
1 | 5653 | 108 | 2.108728 | 2.012502 |
4 | 8324 | 96 | 2.014091 | 1.912029 |
0 | 9942 | 150 | 1.940115 | 1.858465 |
3 | 6450 | 102 | 1.952493 | 1.853478 |
Cluster Based Recommendation System¶
In clustering-based recommendation systems, we explore the similarities and differences in people's tastes in songs based on how they rate different songs. We cluster similar users together and recommend songs to a user based on play_counts from other users in the same cluster.
Co-clustering is a family of techniques in cluster analysis. Given a matrix A, we want to cluster the rows of A and the columns of A simultaneously; this is a common task for user-item matrices.
Because it clusters both the rows and the columns simultaneously, co-clustering is also called bi-clustering. To understand how the algorithm works, let A be an m x n matrix; the goal is to generate co-clusters: subsets of rows that exhibit similar behavior across a subset of columns, and vice versa.
Co-clustering is defined by two map functions:
- rows -> row cluster indexes
- columns -> column cluster indexes
These map functions are learned simultaneously, unlike other clustering techniques where we first cluster the rows and then the columns. The resulting prediction rule is sketched below.
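For reference, surprise's CoClustering predicts a play_count from the average of the user-item co-cluster, adjusted by how the user and the item deviate from their own cluster averages:
$$\hat{r}_{u i}=\overline{C}_{u i}+\left(\mu_{u}-\overline{C}_{u}\right)+\left(\mu_{i}-\overline{C}_{i}\right)$$
where $\overline{C}_{u i}$ is the average play_count of the co-cluster of $u$ and $i$, $\overline{C}_{u}$ and $\overline{C}_{i}$ are the averages of $u$'s user cluster and $i$'s item cluster, and $\mu_{u}$, $\mu_{i}$ are the user's and item's mean play_counts.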
# using CoClustering algorithm.
clust_baseline = CoClustering(random_state=1)
# training the algorithm on the trainset
clust_baseline.fit(trainset)
# Let us compute precision@k and recall@k with k =30.
precision_recall_at_k(clust_baseline)
RMSE: 1.0487
Precision:  0.397
Recall:  0.582
F_1 score:  0.472
- Here the F_1 score of the baseline model is almost 0.472, which indicates that a good share of the recommended songs were relevant and a good share of the relevant songs were recommended. We will try to improve this later by tuning different hyperparameters of this algorithm with GridSearchCV.
- Let's now predict the play_count for the user with user_id=6958 and song_id=1671, as shown below. Here the user has already listened to the song.
#Making prediction for user_id 6958 and song_id 1671.
clust_baseline.predict(6958,1671, r_ui=2, verbose=True)
user: 6958 item: 1671 r_ui = 2.00 est = 1.29 {'was_impossible': False}
Prediction(uid=6958, iid=1671, r_ui=2, est=1.2941824757363074, details={'was_impossible': False})
As we can see - the actual play_count for this user-song pair is 2 and the predicted play_count is 1.29 by this Co-clustering based baseline model. It seems like we have under-estimated the play_count by a small margin. We will try to fix this later by tuning the hyperparameters of the model using GridSearchCV
Below we are predicting the play_count for the same user_id=6958, but for a song to which this user has not listened before, i.e., song_id=3232, as shown below -
#Making prediction for userid 6958 and song_id 3232.
clust_baseline.predict(6958,3232, verbose=True)
user: 6958 item: 3232 r_ui = None est = 1.48 {'was_impossible': False}
Prediction(uid=6958, iid=3232, r_ui=None, est=1.4785259100797417, details={'was_impossible': False})
We can see that estimated play_count for this user-song pair is 1.48 based on this Co-clustering based baseline model.
Improving clustering-based recommendation system by tuning its hyper-parameters¶
Below we will be tuning hyper-parameters for the CoClustering algorithm. Let's try to understand its different hyperparameters -
- n_cltr_u (int) – Number of user clusters. Default is 3.
- n_cltr_i (int) – Number of item clusters. Default is 3.
- n_epochs (int) – Number of iterations of the optimization loop. Default is 20.
- random_state (int, RandomState instance from NumPy, or None) – Determines the RNG that will be used for initialization. If int, random_state will be used as a seed for a new RNG. This is useful to get the same initialization over multiple calls to fit(). If RandomState instance, this same instance is used as RNG. If None, the current RNG from numpy is used. Default is None.
- verbose (bool) – If True, the current epoch will be printed. Default is False.
# set the parameter space to tune
param_grid = {'n_cltr_u':[5,6,7,8], 'n_cltr_i': [5,6,7,8], 'n_epochs': [10,20,30]}
# performing 3-fold gridsearch cross validation
gs = GridSearchCV(CoClustering, param_grid, measures=['rmse'], cv=3, n_jobs=-1)
# fitting data
gs.fit(data)
# best RMSE score
print(gs.best_score['rmse'])
# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])
1.0613293131139294 {'n_cltr_u': 5, 'n_cltr_i': 5, 'n_epochs': 30}
Once the grid search is complete, we can get the optimal values for each of those hyperparameters as shown above
Now we will build the final model by using tuned values of the hyperparameters, which we received from grid search cross-validation
# using the tuned CoClustering algorithm
clust_tuned = CoClustering(n_cltr_u=3,n_cltr_i=2, n_epochs=60, random_state=1)
# training the algorithm on the trainset
clust_tuned.fit(trainset)
# Let us compute precision@k and recall@k with k = 30.
precision_recall_at_k(clust_tuned)
RMSE: 1.0471 Precision: 0.396 Recall: 0.572 F_1 score: 0.468
- The F_1 score of the tuned Co-clustering model on the test set (0.468) is almost equal to that of the baseline Co-clustering model (0.472), so the two models perform very similarly.
- Let's now predict the play_count for the user with user_id=6958 and song_id=1671, as shown below. Here the user has already listened to the song.
# Using the optimized Co-clustering model to predict for user_id 6958 and song_id 1671.
clust_tuned.predict(6958, 1671, r_ui=2, verbose=True)
user: 6958 item: 1671 r_ui = 2.00 est = 1.59 {'was_impossible': False}
Prediction(uid=6958, iid=1671, r_ui=2, est=1.585941833604144, details={'was_impossible': False})
The optimized model predicted the play_count as 1.59, whereas the actual play_count is 2.
# Using the optimized Co-clustering model to predict for user_id 6958 and song_id 3232, whose actual play_count is unknown.
clust_tuned.predict(6958, 3232, verbose=True)
user: 6958 item: 3232 r_ui = None est = 1.77 {'was_impossible': False}
Prediction(uid=6958, iid=3232, r_ui=None, est=1.7702852679475787, details={'was_impossible': False})
The optimized model predicted the play_count as 1.77.
Implementing the recommendation algorithm based on optimized CoClustering model¶
#Getting top 5 recommendations for user_id 6958 using "Co-clustering based optimized" algorithm.
clustering_recommendations = get_recommendations(df_final, 6958, 5, clust_tuned)
Correcting the play_count and Ranking the above songs¶
#Ranking songs based on above recommendations
ranking_songs(clustering_recommendations, final_play)
| | song_id | play_freq | predicted_play_count | corrected_play_count |
| --- | --- | --- | --- | --- |
| 3 | 6450 | 102 | 2.626819 | 2.527805 |
| 1 | 5653 | 108 | 2.578936 | 2.482711 |
| 2 | 7224 | 107 | 2.525240 | 2.428567 |
| 0 | 9942 | 150 | 2.506799 | 2.425149 |
| 4 | 4831 | 97 | 2.415542 | 2.314008 |
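The corrected_play_count values above are consistent with subtracting 1/sqrt(play_freq) from the predicted play_count, so that songs backed by fewer recorded plays are ranked more conservatively. A minimal sketch of that correction, assuming this is what the ranking_songs helper defined earlier in the notebook applies:
# Illustrative recomputation of the correction for the song_id 5653 row above
predicted_play_count = 2.578936
play_freq = 108
# Penalize predictions backed by fewer plays
corrected_play_count = predicted_play_count - 1 / np.sqrt(play_freq)
print(round(corrected_play_count, 6))  # ≈ 2.482711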
Content-Based Recommendation Systems¶
In a content-based recommendation system, we use a text feature. This dataset has no song reviews, but we can combine the title, release, and artist_name columns to create a text feature, apply the TF-IDF feature extraction technique to it, and later use the extracted features to compute song-to-song similarity.
df_final.shape
(117876, 7)
# Work on a copy so that df_final itself is not modified
df_small = df_final.copy()
# Combining title, release, and artist name into a single text feature
df_small['text'] = df_small['title'] + ' ' + df_small['release'] + ' ' + df_small['artist_name']
df_small.head()
| | user_id | song_id | play_count | title | release | artist_name | year | text |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 200 | 6958 | 447 | 1 | Daisy And Prudence | Distillation | Erin McKeown | 2000 | Daisy And Prudence Distillation Erin McKeown |
| 202 | 6958 | 512 | 1 | The Ballad of Michael Valentine | Sawdust | The Killers | 2004 | The Ballad of Michael Valentine Sawdust The Ki... |
| 203 | 6958 | 549 | 1 | I Stand Corrected (Album) | Vampire Weekend | Vampire Weekend | 2007 | I Stand Corrected (Album) Vampire Weekend Vamp... |
| 204 | 6958 | 703 | 1 | They Might Follow You | Tiny Vipers | Tiny Vipers | 2007 | They Might Follow You Tiny Vipers Tiny Vipers |
| 205 | 6958 | 719 | 1 | Monkey Man | You Know I'm No Good | Amy Winehouse | 2007 | Monkey Man You Know I'm No Good Amy Winehouse |
Now, we keep only five columns - user_id, song_id, play_count, title, and text. We will drop the duplicate titles from the data and make the title column the index of the dataframe.
df_small = df_small[['user_id', 'song_id', 'play_count', 'title', 'text']]
df_small = df_small.drop_duplicates(subset=['title'])
df_small = df_small.set_index('title')
df_small.head()
| title | user_id | song_id | play_count | text |
| --- | --- | --- | --- | --- |
| Daisy And Prudence | 6958 | 447 | 1 | Daisy And Prudence Distillation Erin McKeown |
| The Ballad of Michael Valentine | 6958 | 512 | 1 | The Ballad of Michael Valentine Sawdust The Ki... |
| I Stand Corrected (Album) | 6958 | 549 | 1 | I Stand Corrected (Album) Vampire Weekend Vamp... |
| They Might Follow You | 6958 | 703 | 1 | They Might Follow You Tiny Vipers Tiny Vipers |
| Monkey Man | 6958 | 719 | 1 | Monkey Man You Know I'm No Good Amy Winehouse |
df_small.shape
(561, 4)
indices = pd.Series(df_small.index)
indices[:5]
0                 Daisy And Prudence
1    The Ballad of Michael Valentine
2          I Stand Corrected (Album)
3              They Might Follow You
4                         Monkey Man
Name: title, dtype: object
import nltk
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
True
import re
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
We will create a function to pre-process the text data:
- Stop words: a stop word is a commonly used word (such as “the”, “a”, “an”, “in”) that carries little information on its own and can be ignored.
- Lemmatization: lemmatization in linguistics is the process of grouping together the inflected forms of a word so they can be analyzed as a single item. For example, runs, running, and ran are all forms of the word run, so run is the lemma of all these words.
def tokenize(text):
    # Keep only letters and convert to lowercase
    text = re.sub(r"[^a-zA-Z]", " ", text.lower())
    tokens = word_tokenize(text)
    # Remove English stop words (a set lookup is faster than a list lookup)
    stop_words = set(stopwords.words("english"))
    words = [word for word in tokens if word not in stop_words]
    # Lemmatize the remaining words
    lemmatizer = WordNetLemmatizer()
    text_lems = [lemmatizer.lemmatize(word).strip() for word in words]
    return text_lems
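As an illustrative call (not in the original notebook), stop words such as “the” and “of” are removed and the remaining words are lemmatized:
tokenize('The Ballad of Michael Valentine')
['ballad', 'michael', 'valentine']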
tfidf = TfidfVectorizer(tokenizer=tokenize)
song_tfidf = tfidf.fit_transform(df_small['text'].values).toarray()
We have extracted features from the text data. Now, we can find similarities between songs using these features. We will use cosine similarity to calculate the similarity.
song_tfidf
array([[0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], ..., [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.]])
song_tfidf.shape
(561, 1437)
similar_songs = cosine_similarity(song_tfidf, song_tfidf)
similar_songs
array([[1., 0., 0., ..., 0., 0., 0.], [0., 1., 0., ..., 0., 0., 0.], [0., 0., 1., ..., 0., 0., 0.], ..., [0., 0., 0., ..., 1., 0., 0.], [0., 0., 0., ..., 0., 1., 0.], [0., 0., 0., ..., 0., 0., 1.]])
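Each entry of this matrix is the cosine of the angle between two TF-IDF row vectors, i.e., their dot product divided by the product of their norms. As an illustrative check (not part of the original pipeline), the same value can be computed by hand for the first two songs:
# Manual cosine similarity between the first two songs
u, v = song_tfidf[0], song_tfidf[1]
manual_sim = np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))
print(np.isclose(manual_sim, similar_songs[0, 1]))  # True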
Finally, let's create a function to find the most similar songs to recommend for a given song.
# function that takes in a song title as input and returns the top 10 recommended songs
def recommendations(title, similar_songs):
    recommended_songs = []
    # getting the index of the song that matches the title
    idx = indices[indices == title].index[0]
    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(similar_songs[idx]).sort_values(ascending=False)
    # getting the indexes of the 10 most similar songs (position 0 is the song itself, so it is skipped)
    top_10_indexes = list(score_series.iloc[1:11].index)
    print(top_10_indexes)
    # populating the list with the titles of the 10 best-matching songs
    for i in top_10_indexes:
        recommended_songs.append(list(df_small.index)[i])
    return recommended_songs
Recommending 10 songs similar to 'Learn To Fly':
recommendations('Learn To Fly', similar_songs)
[509, 234, 423, 345, 394, 370, 371, 372, 373, 375]
['Everlong', 'The Pretender', 'Nothing Better (Album)', 'From Left To Right', 'Lifespan Of A Fly', 'Under The Gun', 'I Need A Dollar', 'Feel The Love', 'All The Pretty Faces', 'Bones']
Conclusions¶
In this case study, we built recommendation systems using six different techniques. They are as follows:
- Rank-based using averages
- User-user-similarity-based collaborative filtering
- Item-item-similarity-based collaborative filtering
- Model-based (matrix factorization) collaborative filtering
- Clustering-based recommendation systems
- Content-based recommendation systems
We have seen how these techniques differ from each other and what kind of data is needed to build each of these recommendation systems. The techniques could also be combined into a hybrid recommender, as sketched below.
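A minimal sketch of such a hybrid, blending the play_count estimates of already-fitted surprise models with hypothetical weights (the weights below are illustrative, not tuned):
# Hypothetical weighted hybrid of fitted surprise models
def hybrid_estimate(models_and_weights, user_id, song_id):
    # Blend each model's estimated play_count, weighted by its (illustrative) weight
    total_weight = sum(weight for _, weight in models_and_weights)
    blended = sum(weight * model.predict(user_id, song_id).est
                  for model, weight in models_and_weights)
    return blended / total_weight

# Example usage with the two Co-clustering models built above:
# hybrid_estimate([(clust_tuned, 0.6), (clust_baseline, 0.4)], 6958, 3232)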
To demonstrate user-user-similarity-based collaborative filtering, item-item-similarity-based collaborative filtering, and model-based (matrix factorization) collaborative filtering, the surprise library was used. For these algorithms, grid search cross-validation was used to find the best-performing model, which was then used to make the corresponding predictions.
Proposal for the final solution design:¶
We propose the user-user-similarity-based collaborative filtering recommendation system as the final solution, since it is more robust and achieves a high F_1 score. With it, we have predicted the play counts for all the users for songs they have not yet listened to.
# Convert notebook to html
!jupyter nbconvert --to html "/content/drive/My Drive/Colab Notebooks/Copy of FDS_Project_LearnerNotebook_FullCode.ipynb"