Predicting Retail Spending with Automated Feature Engineering in Featuretools¶
In this notebook, we will implement an automated feature engineering solution with Featuretools for a set of online retail purchases. The dataset (available from the UCI machine learning repository) is a series of time-stamped purchases and does not come with any labels, so we'll have to make our own prediction problem. After defining the problem, we can use automated feature engineering to build a set of features that are used for training a predictive model.
This set of retail spending data is great practice both for defining our own prediction problem (known as prediction engineering) and for using some of the time-based capabilities of Featuretools, notably cutoff times. Whenever we have time-series data, we need to be extra careful not to "leak labels" or use information from the future to predict a past event. This is a common pitfall in manual feature engineering: a system can work well in development but fail utterly when deployed because it was trained on invalid data. Fortunately, Featuretools will take care of the time issue for us, creating a rich set of features that obey the time restrictions.
Roadmap¶
Following is an outline for this notebook:
- Read in data, inspect, and clean
- Develop a prediction problem
- Create a dataframe of labels (what we want to predict) and cutoff times (the point before which all data used to predict a label must occur)
- Create an entityset and add entities
- Normalize the original table to develop new tables
- These new tables can be used for making features
- Run deep feature synthesis on the entityset to make features
- Use the cutoff times to make features using valid data for each label
- Use the features to train a machine learning model
- Measure performance of the model relative to an informed baseline
- Tune deep feature synthesis
- Specify custom primitives
- Adjust maximum depth of features
- Re-evaluate model
This problem is a great display of both the time and feature-creation capabilities of Featuretools. Also, we'll be able to use custom primitives to expand on our domain knowledge. Doing this problem by hand and ensuring we use only valid data for each label is a daunting task (as can be seen in the Manual Retail Spending notebook)!
!pip install featuretools
# Data manipulation
import pandas as pd
import numpy as np
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
%matplotlib inline
# Automated feature engineering
import featuretools as ft
# Machine learning
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, roc_curve
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")
Load in Raw Data¶
The raw data is a collection of purchases from an online retailer collected in 2010-2011. Each row in the original data represents one purchased product, with multiple purchases making up an order. There are a few issues with the data that will need to be addressed (as with most real-world datasets)!
This code loads in the data, converts the price in Pounds to dollars (based on the exchange rate on May 31, 2011), subsets the data to 2011, and creates a column representing the total of the purchase. The original data description can be found on the UCI Machine Learning Repository.
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
data = pd.read_excel("Online Retail.xlsx")
data.head()
InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
---|---|---|---|---|---|---|---|---|
0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 2010-12-01 08:26:00 | 2.55 | 17850.0 | United Kingdom |
1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom |
2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 2010-12-01 08:26:00 | 2.75 | 17850.0 | United Kingdom |
3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom |
4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom |
data.shape
(541909, 8)
# Convert to dollars
data['UnitPrice'] = data['UnitPrice'] * 1.65
data['total'] = data['UnitPrice'] * data['Quantity']
# Restrict data to 2011
data_2011 = data[data['InvoiceDate'].dt.year == 2011]
data_2011.head()
InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | total | |
---|---|---|---|---|---|---|---|---|---|
42481 | 539993 | 22386 | JUMBO BAG PINK POLKADOT | 10 | 2011-01-04 10:00:00 | 3.2175 | 13313.0 | United Kingdom | 32.175 |
42482 | 539993 | 21499 | BLUE POLKADOT WRAP | 25 | 2011-01-04 10:00:00 | 0.6930 | 13313.0 | United Kingdom | 17.325 |
42483 | 539993 | 21498 | RED RETROSPOT WRAP | 25 | 2011-01-04 10:00:00 | 0.6930 | 13313.0 | United Kingdom | 17.325 |
42484 | 539993 | 22379 | RECYCLING BAG RETROSPOT | 5 | 2011-01-04 10:00:00 | 3.4650 | 13313.0 | United Kingdom | 17.325 |
42485 | 539993 | 20718 | RED RETROSPOT SHOPPER BAG | 10 | 2011-01-04 10:00:00 | 2.0625 | 13313.0 | United Kingdom | 20.625 |
data_2011.shape
(499428, 9)
data_2011.info()
<class 'pandas.core.frame.DataFrame'> Index: 499428 entries, 42481 to 541908 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 InvoiceNo 499428 non-null object 1 StockCode 499428 non-null object 2 Description 498099 non-null object 3 Quantity 499428 non-null int64 4 InvoiceDate 499428 non-null datetime64[ns] 5 UnitPrice 499428 non-null float64 6 CustomerID 379979 non-null float64 7 Country 499428 non-null object 8 total 499428 non-null float64 dtypes: datetime64[ns](1), float64(3), int64(1), object(4) memory usage: 38.1+ MB
Data Cleaning¶
There are a few issues we need to address with the data. First, we'll drop the duplicated rows, then we'll drop any rows that contain a `nan`. Finally, we can add a Boolean column indicating whether or not an order is a cancellation.
data_2011.duplicated().sum()
4768
- There are 4768 duplicate rows, which we will drop because they would bias the dataset.
data_2011.isnull().sum()
InvoiceNo 0 StockCode 0 Description 1329 Quantity 0 InvoiceDate 0 UnitPrice 0 CustomerID 119449 Country 0 total 0 dtype: int64
- There are 119,449 missing values in CustomerID, a column that is very difficult to impute because each value is a unique ID belonging to one customer. Therefore we will drop those rows too.
# drop the duplicates
data_2011 = data_2011.drop_duplicates()
# drop rows with any nulls (mostly missing CustomerID, plus some missing Description)
data_2011 = data_2011.dropna(axis=0)
data_2011['cancelled'] = data_2011['InvoiceNo'].str.startswith('C', na = False)
# Size has changed since we removed the duplicates and nulls
data_2011.shape
(375250, 10)
data_2011[data_2011.cancelled==True].head()
InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | total | cancelled | |
---|---|---|---|---|---|---|---|---|---|---|
42557 | C540006 | 21306 | SET/4 DAISY MIRROR MAGNETS | -1 | 2011-01-04 10:48:00 | 3.4650 | 14606.0 | United Kingdom | -3.4650 | True |
42558 | C540006 | 84352 | SILVER CHRISTMAS TREE BAUBLE STAND | -1 | 2011-01-04 10:48:00 | 27.9675 | 14606.0 | United Kingdom | -27.9675 | True |
42559 | C540006 | 22423 | REGENCY CAKESTAND 3 TIER | -1 | 2011-01-04 10:48:00 | 21.0375 | 14606.0 | United Kingdom | -21.0375 | True |
42560 | C540007 | 21055 | TOOL BOX SOFT TOY | -6 | 2011-01-04 11:08:00 | 14.7675 | 15379.0 | United Kingdom | -88.6050 | True |
42561 | C540007 | 22274 | FELTCRAFT DOLL EMILY | -6 | 2011-01-04 11:08:00 | 4.8675 | 15379.0 | United Kingdom | -29.2050 | True |
- All of the cancelled orders have negative quantities, which means they cancel out with the corresponding purchase. We'll leave in the cancelled purchases because, if our goal (defined later in the prediction problem) is to predict the total amount purchased by a customer, we'll need to take their cancelled orders into account.
data_2011['cancelled'].value_counts(normalize =True)
cancelled False 0.978172 True 0.021828 Name: proportion, dtype: float64
- About 2% of the line items belong to cancelled invoices.
# Summary statistics for the cleaned data
data_2011.describe()
Quantity | InvoiceDate | UnitPrice | CustomerID | total | |
---|---|---|---|---|---|
count | 375250.000000 | 375250 | 375250.000000 | 375250.000000 | 375250.000000 |
mean | 12.252296 | 2011-07-25 10:03:49.841812480 | 5.762031 | 15265.136168 | 33.972397 |
min | -80995.000000 | 2011-01-04 10:00:00 | 0.000000 | 12346.000000 | -277974.840000 |
25% | 2.000000 | 2011-05-04 08:40:00 | 2.062500 | 13901.000000 | 7.012500 |
50% | 5.000000 | 2011-08-14 12:55:30 | 3.217500 | 15108.000000 | 19.305000 |
75% | 12.000000 | 2011-10-24 17:07:00 | 6.187500 | 16767.000000 | 32.670000 |
max | 80995.000000 | 2011-12-09 12:50:00 | 64300.500000 | 18287.000000 | 277974.840000 |
std | 258.180877 | NaN | 119.054102 | 1710.927381 | 733.803756 |
- We can see that most total purchase prices are less than $50 (note that this is for a single line item, not a whole order).
- The negative numbers represent cancelled orders. We can plot the purchase total by country (limited to only positive amounts and less than $1000) to see if there are differences between countries.
plt.figure(figsize = (20, 6))
sns.boxplot(x = 'Country', y = 'total', data = data_2011[(data_2011['total'] > 0) & (data_2011['total'] < 1000)]);
plt.title("Total Purchase Amount by Country");
plt.xticks(rotation = 90);
plt.figure(figsize = (20, 6))
sns.boxplot(x = 'Country', y = 'Quantity', data = data_2011[(data_2011['total'] > 0) & (data_2011['total'] < 1000)]);
plt.title("Purchased Quantity by Country");
plt.xticks(rotation = 90);
- Both the purchase total and the quantity are heavily skewed. This occurs often in real-world data and means this might be difficult to frame as a regression problem (predicting the actual spending amount).
- We might want to frame the problem as classification because the large purchase totals could throw off a machine learning algorithm. Our other option would be to remove the outliers, but given that these are probably legitimate, that does not seem like a responsible choice!
Skewness of Data¶
To see the extent of how skewed the data is, we can use an Empirical Cumulative Distribution Function (ECDF) plot.
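Before the plot, a quick numeric check of the skew; this is a small sketch using pandas' built-in sample skewness (values far from 0 indicate heavy skew):

# Quantify the skew of the purchase totals and quantities
# (Fisher-Pearson sample skewness; 0 would mean perfectly symmetric).
print('Skew of total: {:.2f}'.format(data_2011['total'].skew()))
print('Skew of Quantity: {:.2f}'.format(data_2011['Quantity'].skew()))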
def ecdf(values):
    """Return the sorted values and their empirical cumulative percentiles."""
    x = np.sort(values)
    y = np.arange(1, len(x) + 1) / len(x)
    return x, y
plt.figure(figsize = (14, 4))
# Total
plt.subplot(121)
x, y = ecdf(data_2011.loc[data_2011['total'] > 0, 'total'])
plt.plot(x, y, marker = '.')
plt.xlabel('Total'); plt.ylabel('Percentile'); plt.title('ECDF of Purchase Total');
# Quantity
plt.subplot(122)
x, y = ecdf(data_2011.loc[data_2011['total'] > 0, 'Quantity'])
plt.plot(x, y, marker = '.')
plt.xlabel('Quantity'); plt.ylabel('Percentile'); plt.title('ECDF of Purchase Quantity');
- The majority of total purchases are less than $20, but there are a number of outliers.
- Rather than using a regression problem, we therefore might want to try classifying customers based on their spending per month.
Prediction Problem¶
The goal of machine learning is to predict some quantity (regression) or a label (classification).
Our data exploration showed that regression might not be the best approach because of the extreme outliers, so instead we can make the problem classification.
With this dataset, there are an unlimited number of prediction problems because there are no labels (unlike in many machine learning competitions).
Choosing a worthwhile quantity to predict therefore becomes critical. In most real-world situations, we could use a domain expert to frame a problem based on what they know is important in the field, and then it's our objective to make a set of labels and features based on that problem. This is known as prediction engineering.
We'll frame the problem as predicting whether or not a customer will spend more than 500 dollars in the next month.
This could be useful to a business because it will let them market more effectively to those customers who are likely to spend more. Moreover, an online retailer could advertise differently to customers based on their predicted class of spending.
Instead of picking just a single month for predictions, we can use each customer as a label multiple times. In other words, we not only predict whether a given customer will spend more than 500 dollars in May, but we also ask the same question in June, July, and so on.
The thing to note is that for each month, we can't use data from the future to predict the class of spending. Each month we can use information from any previous month which means that our predictions should get more accurate as we advance further in time through the data since we'll be able to use more information. Each label for a customer therefore has a different set of features because there is more or less data available to us depending on the month. Doing this by hand is very tedious and error-prone, but we'll see how Featuretools is able to handle the times associated with each label using cutoff times.
Making Labels¶
The function below takes in a start date and an end date (which we set to 30 days apart) and generates a dataframe of the labels, which depends on how much the customer spent in the period and the threshold. Our threshold will be $500 for this prediction problem.
For customers who appear in the data prior to the start date but do not make a purchase between the start and end date, we set their total to 0. If we simply did not include them in the labels, that would be cheating, because we have no way of knowing ahead of time that they will not spend anything in the next month.
data_2011.columns
Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country', 'total', 'cancelled'], dtype='object')
import datetime as dt
def make_retail_cutoffs_total(start_date, end_date, threshold=500):
# Find customers who exist before start date
customer_pool = data_2011[data_2011['InvoiceDate'] < start_date]['CustomerID'].unique()
tmp = pd.DataFrame({'CustomerID': customer_pool})
# For customers in the customer pool, find their sum between the start and end date
totals = data_2011[data_2011['CustomerID'].isin(customer_pool) &
(data_2011['InvoiceDate'] > start_date) &
(data_2011['InvoiceDate'] < end_date)
].groupby('CustomerID')['total'].sum().reset_index()
# Merge with all the customer ids to record all customers who existed before start date
totals = totals.merge(tmp, on='CustomerID', how='right')
# Set the total for any customer who did not have a purchase in the timeframe equal to 0
totals['total'] = totals['total'].fillna(0)
# Label is based on the threshold
totals['label'] = (totals['total'] > threshold).astype(int)
# The cutoff time is the start date
totals['cutoff_time'] = pd.to_datetime(start_date)
totals = totals[['CustomerID', 'cutoff_time', 'total', 'label']]
return totals
may_spending = make_retail_cutoffs_total(dt.datetime(2011, 5, 1), dt.datetime(2011, 6, 1))
may_spending.head()
CustomerID | cutoff_time | total | label | |
---|---|---|---|---|
0 | 13313.0 | 2011-05-01 | 499.026 | 0 |
1 | 18097.0 | 2011-05-01 | 1318.977 | 1 |
2 | 16656.0 | 2011-05-01 | 1804.176 | 1 |
3 | 16875.0 | 2011-05-01 | 0.000 | 0 |
4 | 13094.0 | 2011-05-01 | 678.942 | 1 |
- For each customer who appeared in the data before May, we have a label for them for the month of May which is the sum of their spending in May converted to a binary label. When we make features for these labels, we can only use data from before May.
- The `cutoff_time` represents the point before which any data we use must occur, and the `label` is based on our threshold of $500.
may_spending['label'].value_counts().plot.bar();
plt.title('Label Distribution for May');
may_spending['label'].value_counts()
label 0 1689 1 483 Name: count, dtype: int64
- This is an imbalanced classification problem which means that we probably don't want to use accuracy as our metric.
Metrics¶
Instead of accuracy, we can measure performance in terms of:
- Precision: the percentage of customers predicted to spend more than $500 that actually did
- Recall : the percentage of customers that actually spent more than $500 that were correctly identified
- F1 score: the harmonic mean of precision and recall
- Receiver Operating Characteristic Area Under the Curve (ROC AUC): a 0 to 1 measure (with 1 being optimal) of a model's performance across a range of thresholds
We'll have to establish a baseline for these metrics (a little later) so we know whether machine learning is actually useful for this task.
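To make these metrics concrete, here is a tiny hand-made example using the Scikit-Learn metric functions imported above (the labels, predictions, and probabilities are invented purely for illustration):

# Toy illustration of the four metrics on 8 invented observations.
y_true = [1, 0, 1, 1, 0, 0, 0, 1]
y_pred = [1, 0, 0, 1, 0, 1, 0, 1]
probs = [0.9, 0.2, 0.4, 0.8, 0.1, 0.6, 0.3, 0.7]
print(precision_score(y_true, y_pred))  # 3 of 4 predicted positives are correct: 0.75
print(recall_score(y_true, y_pred))     # 3 of 4 actual positives are found: 0.75
print(f1_score(y_true, y_pred))         # harmonic mean of 0.75 and 0.75: 0.75
print(roc_auc_score(y_true, probs))     # 15 of 16 positive/negative pairs ranked correctly: 0.9375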
Next we'll go ahead and make labels for the rest of the year. Keep in mind that this will generate one label for each customer for each month. We're used to thinking of a single label per customer, but since we have the data, we might as well use each customer as a training example as many times as possible (as long as we use valid data each time). A greater number of training observations should increase the predictive power of our model.
#making labels for each month
march_spending = make_retail_cutoffs_total('2011-03-01', '2011-04-01', 500)
april_spending = make_retail_cutoffs_total('2011-04-01', '2011-05-01', 500)
june_spending = make_retail_cutoffs_total('2011-06-01', '2011-07-01', 500)
july_spending = make_retail_cutoffs_total('2011-07-01', '2011-08-01', 500)
august_spending = make_retail_cutoffs_total('2011-08-01', '2011-09-01', 500)
september_spending = make_retail_cutoffs_total('2011-09-01', '2011-10-01', 500)
october_spending = make_retail_cutoffs_total('2011-10-01', '2011-11-01', 500)
november_spending = make_retail_cutoffs_total('2011-11-01', '2011-12-01', 500)
december_spending = make_retail_cutoffs_total('2011-12-01', '2012-01-01', 500)
labels = pd.concat([march_spending, april_spending, may_spending, june_spending, july_spending, august_spending,
september_spending, october_spending, november_spending, december_spending], axis = 0)
#labels.to_csv('../input/labels.csv')
print(labels.shape)
labels.label.value_counts(normalize=True)
(28133, 4)
label 0 0.829453 1 0.170547 Name: proportion, dtype: float64
- We have roughly 28,000 labels, with ~17% of them positive.
- The total is very skewed, and several customers record a negative total for some months (they had more cancellations than purchases), as the quick check below shows. By framing this as a classification problem, we don't have to worry about the outlying purchase totals throwing off our model.
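A quick check of those negative totals:

# Count customer-months where cancellations outweighed purchases,
# and look at the most extreme case.
print((labels['total'] < 0).sum())
print(labels['total'].min())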
Just to examine the data, we can plot the total spending distribution by month (with negative totals removed).
plot_labels = labels.copy()
plot_labels['month'] = plot_labels['cutoff_time'].dt.month
plt.figure(figsize = (12, 6))
sns.boxplot(x = 'month', y = 'total',
data = plot_labels[(plot_labels['total'] > 0) & (plot_labels['total'] < 1000)]);
plt.title('Customer Spending Distribution by Month');
Let's zoom in to one customer to make sure we understand the labels.
labels.loc[labels['CustomerID'] == 12347]
CustomerID | cutoff_time | total | label | |
---|---|---|---|---|
666 | 12347.0 | 2011-03-01 | 0.0000 | 0 |
666 | 12347.0 | 2011-04-01 | 1049.8125 | 1 |
666 | 12347.0 | 2011-05-01 | 0.0000 | 0 |
666 | 12347.0 | 2011-06-01 | 631.1580 | 1 |
666 | 12347.0 | 2011-07-01 | 0.0000 | 0 |
666 | 12347.0 | 2011-08-01 | 965.1015 | 1 |
666 | 12347.0 | 2011-09-01 | 0.0000 | 0 |
666 | 12347.0 | 2011-10-01 | 2135.6280 | 1 |
666 | 12347.0 | 2011-11-01 | 0.0000 | 0 |
666 | 12347.0 | 2011-12-01 | 370.9530 | 0 |
labels.loc[labels['CustomerID'] == 12347].set_index('cutoff_time')['total'].plot(figsize = (6, 4), linewidth = 3)
plt.xlabel('Date', size = 16);
plt.ylabel('Spending', size = 16);
plt.title('Monthly Spending for Customer', size = 20);
plt.xticks(size = 16); plt.yticks(size = 16);
- One customer, 10 different labels. It seems like it might be difficult to predict this customer's spending given her fluctuating total spending! We'll have to see if Featuretools is up to the task.
We now have our prediction problem all set. The next step is to start making features we can use in a machine learning model.
Featuretools Implementation¶
The first step in Featuretools is to create an `EntitySet`, which will hold all of our data and the relationships between the multiple tables (which we'll create shortly). Initially we'll add the entire `data` as an entity to the set. Since the data has a time index, we'll add that and specify the variable type of the product description.

The `time_index` represents the first time the information in that row is known. When we build features, Featuretools will use this `time_index` to filter data based on the cutoff time of the label. We can't use any purchases from after the cutoff time of the label to make features for that label.
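To make the time filtering concrete, here is a minimal hand-rolled sketch of the kind of filtering Featuretools performs for a single label (illustrative only, not the library's internal code):

# For a label with a May 1 cutoff, a feature for customer 12347 may only
# use purchases strictly before that date.
cutoff = pd.Timestamp('2011-05-01')
valid = data_2011[(data_2011['CustomerID'] == 12347) &
                  (data_2011['InvoiceDate'] < cutoff)]
# For example, the equivalent of a SUM(purchases.total) feature:
print(valid['total'].sum())

Featuretools repeats this filtering automatically for every label in the cutoff time dataframe.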
data_2011.head()
InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | total | cancelled | |
---|---|---|---|---|---|---|---|---|---|---|
42481 | 539993 | 22386 | JUMBO BAG PINK POLKADOT | 10 | 2011-01-04 10:00:00 | 3.2175 | 13313.0 | United Kingdom | 32.175 | False |
42482 | 539993 | 21499 | BLUE POLKADOT WRAP | 25 | 2011-01-04 10:00:00 | 0.6930 | 13313.0 | United Kingdom | 17.325 | False |
42483 | 539993 | 21498 | RED RETROSPOT WRAP | 25 | 2011-01-04 10:00:00 | 0.6930 | 13313.0 | United Kingdom | 17.325 | False |
42484 | 539993 | 22379 | RECYCLING BAG RETROSPOT | 5 | 2011-01-04 10:00:00 | 3.4650 | 13313.0 | United Kingdom | 17.325 | False |
42485 | 539993 | 20718 | RED RETROSPOT SHOPPER BAG | 10 | 2011-01-04 10:00:00 | 2.0625 | 13313.0 | United Kingdom | 20.625 | False |
es = ft.EntitySet(id="Online Retail Logs")
# Add the entire data table as an entity
es = es.add_dataframe(dataframe=data_2011,
dataframe_name="purchases",
index="purchases_index",
time_index='InvoiceDate')
es["purchases"]["Description"].metadata = {"logical_type": "NaturalLanguage"}
es["purchases"]
purchases_index | InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | total | cancelled | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 539993 | 22386 | JUMBO BAG PINK POLKADOT | 10 | 2011-01-04 10:00:00 | 3.2175 | 13313.0 | United Kingdom | 32.1750 | False |
1 | 1 | 539993 | 21499 | BLUE POLKADOT WRAP | 25 | 2011-01-04 10:00:00 | 0.6930 | 13313.0 | United Kingdom | 17.3250 | False |
2 | 2 | 539993 | 21498 | RED RETROSPOT WRAP | 25 | 2011-01-04 10:00:00 | 0.6930 | 13313.0 | United Kingdom | 17.3250 | False |
3 | 3 | 539993 | 22379 | RECYCLING BAG RETROSPOT | 5 | 2011-01-04 10:00:00 | 3.4650 | 13313.0 | United Kingdom | 17.3250 | False |
4 | 4 | 539993 | 20718 | RED RETROSPOT SHOPPER BAG | 10 | 2011-01-04 10:00:00 | 2.0625 | 13313.0 | United Kingdom | 20.6250 | False |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
375245 | 375245 | 581587 | 22613 | PACK OF 20 SPACEBOY NAPKINS | 12 | 2011-12-09 12:50:00 | 1.4025 | 12680.0 | France | 16.8300 | False |
375246 | 375246 | 581587 | 22899 | CHILDREN'S APRON DOLLY GIRL | 6 | 2011-12-09 12:50:00 | 3.4650 | 12680.0 | France | 20.7900 | False |
375247 | 375247 | 581587 | 23254 | CHILDRENS CUTLERY DOLLY GIRL | 4 | 2011-12-09 12:50:00 | 6.8475 | 12680.0 | France | 27.3900 | False |
375248 | 375248 | 581587 | 23255 | CHILDRENS CUTLERY CIRCUS PARADE | 4 | 2011-12-09 12:50:00 | 6.8475 | 12680.0 | France | 27.3900 | False |
375249 | 375249 | 581587 | 22138 | BAKING SET 9 PIECE RETROSPOT | 3 | 2011-12-09 12:50:00 | 8.1675 | 12680.0 | France | 24.5025 | False |
375250 rows × 11 columns
Normalizing Entities¶
In order to create new tables out of the original table, we can normalize this entity. Normalizing creates a new table with one unique row for every value of the variable we pass in, such as the customer or the product.

The code below creates a new `products` entity, where each row contains one product and the columns describe the product.
# create a new "products" entity
es.normalize_dataframe(new_dataframe_name = "products",
base_dataframe_name = "purchases",
index="StockCode",
additional_columns = ["Description"])
es['products'].head()
StockCode | Description | first_purchases_time | |
---|---|---|---|
22386 | 22386 | JUMBO BAG PINK POLKADOT | 2011-01-04 10:00:00 |
21499 | 21499 | BLUE POLKADOT WRAP | 2011-01-04 10:00:00 |
21498 | 21498 | RED RETROSPOT WRAP | 2011-01-04 10:00:00 |
22379 | 22379 | RECYCLING BAG RETROSPOT | 2011-01-04 10:00:00 |
20718 | 20718 | RED RETROSPOT SHOPPER BAG | 2011-01-04 10:00:00 |
The `first_purchases_time` column is created automatically because the `purchases` table has a time index. It represents the first time the product appears in the purchase data. Again, Featuretools will filter data from this table for each label so that we only build valid features.

We can use this table to create new features; the products table is a parent of the `purchases` table with the linking variable `StockCode`. For each product in `products`, there can be multiple purchases of that product in `purchases`.
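For intuition, an aggregation across this parent-child relationship is essentially a pandas groupby on the linking variable; here is a sketch of what a primitive such as MEAN would compute for each product:

# Hand-rolled equivalent of a MEAN(purchases.total) aggregation for the
# products parent table: group the child rows by the linking variable.
product_mean_total = data_2011.groupby('StockCode')['total'].mean()
product_mean_total.head()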
Additional Tables¶
We'll repeat the process to create tables for both the `customers` and the `orders`. `normalize_dataframe` automatically creates the relationships and time indexes, so we don't have to do that ourselves. If we want to include any additional variables in a new table, we can pass those in, but these variables must be unique to the object we are normalizing on. As an example, each order comes from one country, so we can include the country as an additional column when creating the `orders` table. However, the description is not unique to an order, so it should not be a variable that appears in the orders table.
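Before normalizing, a quick sanity check that these variables really are unique per order (each invoice should map to exactly one country and one cancellation flag):

# Each InvoiceNo should map to a single Country and a single cancelled
# value, so both of these should print 1.
print(data_2011.groupby('InvoiceNo')['Country'].nunique().max())
print(data_2011.groupby('InvoiceNo')['cancelled'].nunique().max())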
es['purchases']
purchases_index | InvoiceNo | StockCode | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | total | cancelled | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 539993 | 22386 | 10 | 2011-01-04 10:00:00 | 3.2175 | 13313.0 | United Kingdom | 32.1750 | False |
1 | 1 | 539993 | 21499 | 25 | 2011-01-04 10:00:00 | 0.6930 | 13313.0 | United Kingdom | 17.3250 | False |
2 | 2 | 539993 | 21498 | 25 | 2011-01-04 10:00:00 | 0.6930 | 13313.0 | United Kingdom | 17.3250 | False |
3 | 3 | 539993 | 22379 | 5 | 2011-01-04 10:00:00 | 3.4650 | 13313.0 | United Kingdom | 17.3250 | False |
4 | 4 | 539993 | 20718 | 10 | 2011-01-04 10:00:00 | 2.0625 | 13313.0 | United Kingdom | 20.6250 | False |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
375245 | 375245 | 581587 | 22613 | 12 | 2011-12-09 12:50:00 | 1.4025 | 12680.0 | France | 16.8300 | False |
375246 | 375246 | 581587 | 22899 | 6 | 2011-12-09 12:50:00 | 3.4650 | 12680.0 | France | 20.7900 | False |
375247 | 375247 | 581587 | 23254 | 4 | 2011-12-09 12:50:00 | 6.8475 | 12680.0 | France | 27.3900 | False |
375248 | 375248 | 581587 | 23255 | 4 | 2011-12-09 12:50:00 | 6.8475 | 12680.0 | France | 27.3900 | False |
375249 | 375249 | 581587 | 22138 | 3 | 2011-12-09 12:50:00 | 8.1675 | 12680.0 | France | 24.5025 | False |
375250 rows × 10 columns
# create a new "customers" entity based on the orders entity
es.normalize_dataframe(new_dataframe_name="customers",
base_dataframe_name="purchases",
index="CustomerID")
# create a new "orders" entity
es.normalize_dataframe(new_dataframe_name="orders",
base_dataframe_name="purchases",
index="InvoiceNo",
additional_columns=["Country", 'cancelled'])
es
Entityset: Online Retail Logs DataFrames: purchases [Rows: 375250, Columns: 8] products [Rows: 3612, Columns: 3] customers [Rows: 4244, Columns: 2] orders [Rows: 20482, Columns: 4] Relationships: purchases.StockCode -> products.StockCode purchases.CustomerID -> customers.CustomerID purchases.InvoiceNo -> orders.InvoiceNo
Deep Feature Synthesis¶
Now that our EntitySet
is defined with the proper relationships between tables, we can perform deep feature synthesis to generate 100s or 1000s of features. We can theoretically make features for any entity, but since our objective is to classify customer spending, we'll make features for each customer for each month.
Using cutoff times¶
To ensure the features are valid for each customer and month, we'll pass in the labels dataframe, which has the cutoff time for each customer for each month. Featuretools will make one row for each customer for each month, with the features for each month derived only from data prior to the cutoff time. This is extremely useful because it means we don't have to worry about using invalid data to make features. If we were doing this by hand, it would be very easy to accidentally build features for a label using information from the future, which is not allowed. When the entire dataset is available to us, we must prevent ourselves from using future data when building training features: a deployed model won't have access to data from the future, and we want models that do well in deployment.
The requirements of the cutoff time dataframe are that the first column must contain the ids corresponding to the index of the target entity and the second column must contain the cutoff times. Featuretools then takes care of the rest, making features for each month using only valid data.
The following call will generate features for each customer, resulting in a feature_matrix
where each row consists of one customer for one month corresponding to a label and each column is one feature.
labels.head()
CustomerID | cutoff_time | total | label | |
---|---|---|---|---|
0 | 13313.0 | 2011-03-01 | 0.000 | 0 |
1 | 18097.0 | 2011-03-01 | 0.000 | 0 |
2 | 16656.0 | 2011-03-01 | 589.248 | 1 |
3 | 16875.0 | 2011-03-01 | 0.000 | 0 |
4 | 13094.0 | 2011-03-01 | 115.434 | 0 |
labels.info()
<class 'pandas.core.frame.DataFrame'> Index: 28133 entries, 0 to 4196 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CustomerID 28133 non-null float64 1 cutoff_time 28133 non-null datetime64[ns] 2 total 28133 non-null float64 3 label 28133 non-null int64 dtypes: datetime64[ns](1), float64(2), int64(1) memory usage: 1.1 MB
labels.rename(columns = {'cutoff_time':'time'}, inplace = True)
es
Entityset: Online Retail Logs DataFrames: purchases [Rows: 375250, Columns: 8] products [Rows: 3612, Columns: 3] customers [Rows: 4244, Columns: 2] orders [Rows: 20482, Columns: 4] Relationships: purchases.StockCode -> products.StockCode purchases.CustomerID -> customers.CustomerID purchases.InvoiceNo -> orders.InvoiceNo
feature_matrix, feature_names = ft.dfs(entityset=es, target_dataframe_name='customers',
cutoff_time = labels, verbose = 2,
cutoff_time_in_index = True,
chunk_size = len(labels), n_jobs = -1,
max_depth = 1)
feature_matrix.head()
Built 27 features
EntitySet scattered to 2 workers in 5 seconds
Elapsed: 00:38 | Progress: 100%|██████████
COUNT(purchases) | MAX(purchases.Quantity) | MAX(purchases.UnitPrice) | MAX(purchases.total) | MEAN(purchases.Quantity) | MEAN(purchases.UnitPrice) | MEAN(purchases.total) | MIN(purchases.Quantity) | MIN(purchases.UnitPrice) | MIN(purchases.total) | ... | STD(purchases.total) | SUM(purchases.Quantity) | SUM(purchases.UnitPrice) | SUM(purchases.total) | DAY(first_purchases_time) | MONTH(first_purchases_time) | WEEKDAY(first_purchases_time) | YEAR(first_purchases_time) | total | label | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
CustomerID | time | |||||||||||||||||||||
13313.0 | 2011-03-01 | 17 | 25.0 | 7.0125 | 58.410 | 10.058824 | 3.692118 | 29.826176 | 4.0 | 0.6930 | 17.3250 | ... | 11.998083 | 171.0 | 62.7660 | 507.0450 | 4 | 1 | 1 | 2011 | 0.000 | 0 |
18097.0 | 2011-03-01 | 9 | 60.0 | 16.4175 | 143.550 | 30.000000 | 4.882167 | 87.021000 | 6.0 | 0.6930 | 33.2640 | ... | 41.359375 | 270.0 | 43.9395 | 783.1890 | 4 | 1 | 1 | 2011 | 0.000 | 0 |
16656.0 | 2011-03-01 | 14 | 216.0 | 14.0250 | 498.960 | 56.285714 | 5.079643 | 109.184036 | -12.0 | 0.6930 | -32.6700 | ... | 144.361356 | 788.0 | 71.1150 | 1528.5765 | 4 | 1 | 1 | 2011 | 589.248 | 1 |
16875.0 | 2011-03-01 | 52 | 24.0 | 27.9675 | 65.670 | 7.250000 | 7.433250 | 24.218192 | -8.0 | 0.4785 | -16.4175 | ... | 17.671397 | 377.0 | 386.5290 | 1259.3460 | 4 | 1 | 1 | 2011 | 0.000 | 0 |
13094.0 | 2011-03-01 | 3 | 72.0 | 1.7490 | 125.928 | 72.000000 | 1.749000 | 125.928000 | 72.0 | 1.7490 | 125.9280 | ... | 0.000000 | 216.0 | 5.2470 | 377.7840 | 4 | 1 | 1 | 2011 | 115.434 | 0 |
5 rows × 29 columns
We will want to drop the `total` and `label` columns before training because these were passed through from the `cutoff_time` data (this happens a little later). We can also remove the `MODE` of the StockCode and InvoiceNo; these are index variables and should not be used for creating features.
feature_matrix = feature_matrix.drop(columns = ['MODE(purchases.StockCode)', 'MODE(purchases.InvoiceNo)'])
feature_matrix.shape
(28133, 27)
Initially we did not generate very many features because we limited the `max_depth` to 1, which means aggregation primitives are not stacked on one another (a depth-2 feature would aggregate an aggregation, such as the mean over a customer's orders of each order's summed total).
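As a quick check, `ft.dfs` can return just the feature definitions without computing the matrix; here is a sketch of what a deeper run would build:

# Inspect feature definitions only (no computation) to see how the feature
# count grows when aggregations are allowed to stack once (max_depth = 2).
defs_depth2 = ft.dfs(entityset=es, target_dataframe_name='customers',
                     features_only=True, max_depth=2)
print(len(defs_depth2))
defs_depth2[:5]

Now let's take a look at some of the actual features. First, we'll zoom back in to a single one of the customers.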
feature_matrix.loc[12347, :].sample(10, axis = 1)
MEAN(purchases.total) | MIN(purchases.Quantity) | NUM_UNIQUE(purchases.InvoiceNo) | STD(purchases.UnitPrice) | MEAN(purchases.Quantity) | SUM(purchases.UnitPrice) | MIN(purchases.UnitPrice) | MIN(purchases.total) | total | MAX(purchases.total) | |
---|---|---|---|---|---|---|---|---|---|---|
time | ||||||||||
2011-03-01 | 27.048052 | 3.0 | 1 | 4.015815 | 10.862069 | 120.7305 | 0.6930 | 8.316 | 0.0000 | 63.1125 |
2011-04-01 | 27.048052 | 3.0 | 1 | 4.015815 | 10.862069 | 120.7305 | 0.6930 | 8.316 | 1049.8125 | 63.1125 |
2011-05-01 | 34.607660 | 3.0 | 2 | 4.088012 | 15.056604 | 223.5090 | 0.4125 | 8.316 | 0.0000 | 411.8400 |
2011-06-01 | 34.607660 | 3.0 | 2 | 4.088012 | 15.056604 | 223.5090 | 0.4125 | 8.316 | 631.1580 | 411.8400 |
2011-07-01 | 34.723437 | 2.0 | 3 | 3.954557 | 14.000000 | 311.9820 | 0.4125 | 8.316 | 0.0000 | 411.8400 |
2011-08-01 | 34.723437 | 2.0 | 3 | 3.954557 | 14.000000 | 311.9820 | 0.4125 | 8.316 | 965.1015 | 411.8400 |
2011-09-01 | 36.886726 | 2.0 | 4 | 4.128061 | 13.666667 | 424.5780 | 0.4125 | 8.316 | 0.0000 | 411.8400 |
2011-10-01 | 36.886726 | 2.0 | 4 | 4.128061 | 13.666667 | 424.5780 | 0.4125 | 8.316 | 2135.6280 | 411.8400 |
2011-11-01 | 39.757811 | 2.0 | 5 | 4.019489 | 13.907143 | 623.8320 | 0.4125 | 8.316 | 0.0000 | 411.8400 |
2011-12-01 | 39.757811 | 2.0 | 5 | 4.019489 | 13.907143 | 623.8320 | 0.4125 | 8.316 | 370.9530 | 411.8400 |
- We see that as we get deeper into the year, the numbers change for this customer because we are using more information to build the features. We would expect our predictions to get more accurate with time because we are incorporating more information.
- However, it's also possible that customer behavior changes over time and therefore using all the previous data might not actually be useful.
# feature_matrix.reset_index(inplace = True)
feature_matrix.groupby('time')['COUNT(purchases)'].mean().plot();
plt.title('Average Monthly Count of Purchases');
plt.ylabel('Purchases Per Customer');
This shows that as we progress through time, we have more purchases per customer to use for prediction.
feature_matrix.groupby('time')['SUM(purchases.Quantity)'].mean().plot();
plt.title('Average Monthly Sum of Purchased Products');
plt.ylabel('Total Purchased Products Per Customer');
Naturally, as we include more information, our forecasts should improve in accuracy. Therefore, if we are predicting purchases in November, we would expect better performance than when predicting purchases in June.
Correlations¶
As a first approximation of useful features, we can see if there are any significant correlations between the features and the total
. We'll one-hot encode the categorical features first.
feature_matrix = pd.get_dummies(feature_matrix).reset_index()
feature_matrix.shape
(28133, 3074)
corrs = feature_matrix.corr().sort_values('total')
corrs['total'].head()
MONTH(first_purchases_time)_3 -0.037102 MONTH(first_purchases_time)_4 -0.030643 MONTH(first_purchases_time)_5 -0.030183 WEEKDAY(first_purchases_time)_6 -0.026694 MIN(purchases.total) -0.026663 Name: total, dtype: float64
corrs['total'].dropna().tail()
label 0.343661 NUM_UNIQUE(purchases.InvoiceNo) 0.351811 SUM(purchases.Quantity) 0.566804 SUM(purchases.total) 0.630549 total 1.000000 Name: total, dtype: float64
A few of the features have a moderate positive correlation with the `total` (ignoring the `label` for now). The number and total of the purchases are clearly related to the total spending! Keep in mind that the features are built using only data from before the cutoff time.
g = sns.FacetGrid(feature_matrix[(feature_matrix['SUM(purchases.total)'] > 0) & (feature_matrix['SUM(purchases.total)'] < 1000)],
hue = 'label', height = 4, aspect = 3)
g.map(sns.kdeplot, 'SUM(purchases.total)')
g.add_legend();
plt.title('Distribution of Purchases Total by Label');
The sum of purchase totals prior to the month of the label is clearly higher for those customers who then went on to spend more than $500 in the next month. We would expect this to be a useful feature in modeling.
Preliminary modeling¶
We can now directly use this feature matrix for training and making predictions with a machine learning model. We'll predict one month at a time, each time training on all the previous observations. The testing features for each testing label are built using all data prior to the month of the testing label.
Model¶
For a model, we will use the RandomForestClassifier
as implemented in Scikit-Learn. We'll keep most of the hyperparameters at the default values but increase the number of trees to 1000. This is not an optimized model but should allow us to tell whether or not our solution is better than a baseline estimate.
model = RandomForestClassifier(n_estimators = 1000,
random_state = 50,
n_jobs = -1)
from sklearn.pipeline import Pipeline
The function below trains and tests for a single month. We pass in the month; the training data is subset to label-observation pairs from before the month, while the testing data comes from the month itself. This ensures that when making predictions for one month, we're only using data from before that month.
def predict_month(month, feature_matrix, return_probs = False):
"""Train and test a machine learning model using a feature set
for one month. Testing labels are from the month."""
feature_matrix['month'] = feature_matrix['time'].dt.month
# Subset labels
test_labels = feature_matrix.loc[feature_matrix['month'] == month, 'label']
train_labels = feature_matrix.loc[feature_matrix['month'] < month, 'label']
# Features
X_train = feature_matrix[feature_matrix['time'].dt.month < month].drop(columns = ['CustomerID', 'time',
'month', 'label', 'total'])
X_test = feature_matrix[feature_matrix['time'].dt.month == month].drop(columns = ['CustomerID', 'time',
'month', 'label', 'total'])
feature_names = list(X_train.columns)
# Impute and scale features
pipeline = Pipeline([('imputer', SimpleImputer(strategy = 'median')),
('scaler', MinMaxScaler())])
# Fit and transform training data
X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)
# Labels
y_train = np.array(train_labels).reshape((-1, ))
y_test = np.array(test_labels).reshape((-1, ))
print('Training on {} observations.'.format(len(X_train)))
print('Testing on {} observations.\n'.format(len(X_test)))
# Train
model.fit(X_train, y_train)
# Make predictions
predictions = model.predict(X_test)
probs = model.predict_proba(X_test)[:, 1]
# Calculate metrics
p = precision_score(y_test, predictions)
r = recall_score(y_test, predictions)
f = f1_score(y_test, predictions)
auc = roc_auc_score(y_test, probs)
print(f'Precision: {round(p, 5)}')
print(f'Recall: {round(r, 5)}')
print(f'F1 Score: {round(f, 5)}')
print(f'ROC AUC: {round(auc, 5)}')
# Feature importances
fi = pd.DataFrame({'feature': feature_names, 'importance': model.feature_importances_})
if return_probs:
return fi, probs
return fi
june_fi = predict_month(6, feature_matrix)
Training on 5266 observations. Testing on 2489 observations. Precision: 0.58214 Recall: 0.38263 F1 Score: 0.46176 ROC AUC: 0.74479
def plot_feature_importances(df, n = 15, threshold = None):
"""Plots n most important features. Also plots the cumulative importance if
threshold is specified and prints the number of features needed to reach threshold cumulative importance.
Intended for use with any tree-based feature importances.
Args:
df (dataframe): Dataframe of feature importances. Columns must be "feature" and "importance".
n (int): Number of most important features to plot. Default is 15.
threshold (float): Threshold for cumulative importance plot. If not provided, no plot is made. Default is None.
Returns:
df (dataframe): Dataframe ordered by feature importances with a normalized column (sums to 1)
and a cumulative importance column
Note:
* Normalization in this case means sums to 1.
* Cumulative importance is calculated by summing features from most to least important
* A threshold of 0.9 will show the most important features needed to reach 90% of cumulative importance
"""
# Sort features with most important at the head
df = df.sort_values('importance', ascending = False).reset_index(drop = True)
# Normalize the feature importances to add up to one and calculate cumulative importance
df['importance_normalized'] = df['importance'] / df['importance'].sum()
df['cumulative_importance'] = np.cumsum(df['importance_normalized'])
plt.rcParams['font.size'] = 12
# Bar plot of n most important features
df.iloc[:n].plot.barh(y = 'importance_normalized',
x = 'feature', color = 'blue',
edgecolor = 'k', figsize = (12, 8),
legend = False)
plt.xlabel('Normalized Importance', size = 18); plt.ylabel('');
plt.title(f'Top {n} Most Important Features', size = 18)
plt.gca().invert_yaxis()
if threshold:
# Cumulative importance plot
plt.figure(figsize = (8, 6))
plt.plot(list(range(len(df))), df['cumulative_importance'], 'b-')
plt.xlabel('Number of Features', size = 16); plt.ylabel('Cumulative Importance', size = 16);
plt.title('Cumulative Feature Importance', size = 18);
# Number of features needed for threshold cumulative importance
# This is the index (will need to add 1 for the actual number)
importance_index = np.min(np.where(df['cumulative_importance'] > threshold))
# Add vertical line to plot
plt.vlines(importance_index + 1, ymin = 0, ymax = 1.05, linestyles = '--', colors = 'red')
plt.show();
print('{} features required for {:.0f}% of cumulative importance.'.format(importance_index + 1,
100 * threshold))
return df
We can plot the feature importances using the function. These should allow us to see what the model considers useful information for predicting spending.
norm_june_fi = plot_feature_importances(june_fi)
- The most important features are those that are most correlated with the total.
- This should give us confidence that our machine learning model is learning the important relationships and that the Featuretools features are useful for the problem.
- If we want to predict future spending next month, the best indicators are the customer's total spending to date and the total number of their purchases to date.
Comparison to Baseline¶
We calculated metrics for our model, but it's possible these numbers are no better than what we could achieve by guessing. The question to ask is how those numbers compare to an informed baseline. If our model can't beat a simple baseline, then we might want to question our approach or even whether machine learning is applicable to the problem.
For an informed baseline, let's use the amount the customer spent in the past month to predict how much they will spend in the next month. We can try this for July 2011. For the probability (used for the ROC AUC), we'll divide the previous month's total by the threshold (so spending of 0 corresponds to 0 probability) and then clip any values greater than 1 to 1.
labels.head()
CustomerID | time | total | label | |
---|---|---|---|---|
0 | 13313.0 | 2011-03-01 | 0.000 | 0 |
1 | 18097.0 | 2011-03-01 | 0.000 | 0 |
2 | 16656.0 | 2011-03-01 | 589.248 | 1 |
3 | 16875.0 | 2011-03-01 | 0.000 | 0 |
4 | 13094.0 | 2011-03-01 | 115.434 | 0 |
labels['month'] = labels['time'].dt.month
july_labels = labels[labels['month'] == 7]
june_labels = labels[labels['month'] == 6]
july_labels = july_labels.rename(columns = {'total': 'july_total'})
june_labels = june_labels.rename(columns = {'total': 'june_total'})
# Merge the current month with the previous
july_labels = july_labels.merge(june_labels[['CustomerID', 'june_total']], on = 'CustomerID', how = 'left')
july_labels['june_total'] = july_labels['june_total'].fillna(0)
july_labels['predicted_label'] = (july_labels['june_total'] > 500).astype(int)
july_labels['probability'] = july_labels['june_total'] / 500
# Set probabilities greater than 1 equal to 1
july_labels.loc[july_labels['probability'] > 1, 'probability'] = 1
july_labels.sample(10, random_state=50)
CustomerID | time | july_total | label | month | june_total | predicted_label | probability | |
---|---|---|---|---|---|---|---|---|
1647 | 14711.0 | 2011-07-01 | 0.0000 | 0 | 7 | 778.8165 | 1 | 1.00000 |
627 | 14849.0 | 2011-07-01 | 1342.9185 | 1 | 7 | 1624.8210 | 1 | 1.00000 |
2434 | 13213.0 | 2011-07-01 | 0.0000 | 0 | 7 | 221.2650 | 0 | 0.44253 |
2236 | 15258.0 | 2011-07-01 | 0.0000 | 0 | 7 | 507.3750 | 1 | 1.00000 |
1852 | 14747.0 | 2011-07-01 | 0.0000 | 0 | 7 | 0.0000 | 0 | 0.00000 |
1547 | 13946.0 | 2011-07-01 | 0.0000 | 0 | 7 | 0.0000 | 0 | 0.00000 |
241 | 13004.0 | 2011-07-01 | 508.9260 | 1 | 7 | 1121.6040 | 1 | 1.00000 |
437 | 13982.0 | 2011-07-01 | 0.0000 | 0 | 7 | 520.8060 | 1 | 1.00000 |
2043 | 14323.0 | 2011-07-01 | 0.0000 | 0 | 7 | -29.2050 | 0 | -0.05841 |
1972 | 17827.0 | 2011-07-01 | 0.0000 | 0 | 7 | 0.0000 | 0 | 0.00000 |
To test whether this is reasonable, we can find the correlation between the previous month's total and the current month's total.
july_labels['july_total'].corr(july_labels['june_total'])
0.6212795006643788
There is a moderate correlation between spending from one month to the next.
sns.lmplot(x = 'june_total', y = 'july_total', data = july_labels, fit_reg = False)
plt.title('July vs June Spending');
Let's look at the four performance metrics.
print('Precision: {:.5f}.'.format(precision_score(july_labels['label'], july_labels['predicted_label'])))
print('Recall: {:.5f}.'.format(recall_score(july_labels['label'], july_labels['predicted_label'])))
print('F1 Score: {:.5f}.'.format(f1_score(july_labels['label'], july_labels['predicted_label'])))
print('ROC AUC Score: {:.5f}.'.format(roc_auc_score(july_labels['label'], july_labels['probability'])))
Precision: 0.41784. Recall: 0.39644. F1 Score: 0.40686. ROC AUC Score: 0.64355.
We can now compare this performance to that from the model.
july_fi, july_probs = predict_month(7, feature_matrix, True)
Training on 7755 observations. Testing on 2752 observations. Precision: 0.55172 Recall: 0.32071 F1 Score: 0.40563 ROC AUC: 0.75266
For a classifier, the most important metric is the ROC AUC because it accounts for performance across all possible thresholds. We can then adjust the threshold to maximize recall, precision, or the F1 score depending on our preferences.
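As a sketch of that threshold adjustment, we can sweep candidate thresholds over the July probabilities computed above and report the one that maximizes the F1 score (the grid values are arbitrary):

# Sweep thresholds over July's predicted probabilities and pick the best F1.
y_true = feature_matrix.loc[feature_matrix['month'] == 7, 'label']
best_f1, best_t = max((f1_score(y_true, (july_probs > t).astype(int)), t)
                      for t in np.arange(0.05, 0.95, 0.05))
print('Best F1: {:.4f} at threshold {:.2f}'.format(best_f1, best_t))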
To make sure that our model is really outperforming the baseline, we can plot the Receiver Operating Characteristic Curve for the two sets of predictions.
# Calculate false positive rates and true positive rates
base_fpr, base_tpr, _ = roc_curve(july_labels['label'], july_labels['probability'])
model_fpr, model_tpr, _ = roc_curve(feature_matrix[feature_matrix['month'] == 7]['label'], july_probs)
plt.figure(figsize = (8, 6))
plt.rcParams['font.size'] = 16
# Plot both curves
plt.plot(base_fpr, base_tpr, 'b', label = 'baseline')
plt.plot(model_fpr, model_tpr, 'r', label = 'model')
plt.legend();
plt.xlabel('False Positive Rate'); plt.ylabel('True Positive Rate'); plt.title('ROC Curves');
Based on the metrics and this plot, we can say that our model does indeed outperform the informed baseline guess. Machine learning, with the features from Featuretools, yields a better solution than a decent approximation.
If we were an online retailer currently using an informed guess for advertising, machine learning could make our advertising campaigns more efficient.
Let's take all of the code for comparing the baseline to the model and put it in a single function. First we need to calculate the informed baseline in a function.
def informed_baseline(month_number, threshold = 500):
"""Calculate an informed baseline for a given month.
The informed baseline is guessing the previous month's spending
for the next month. The probability is assessed by dividing
the previous month's total by the threshold and setting
any values greater than 1 to 1."""
# Subset to the months
month = labels[labels['month'] == month_number]
previous_month = labels[labels['month'] == (month_number - 1)]
previous_month = previous_month.rename(columns = {'total': 'previous_total'})
# Merge the current month with the previous month
month = month.merge(previous_month[['CustomerID', 'previous_total']], on = 'CustomerID', how = 'left')
# For customers who had no spending in the previous month, set their spending to 0
month['previous_total'] = month['previous_total'].fillna(0)
# Calculate a probability based on the previous months spending and the threshold
month['probability'] = month['previous_total'] / threshold
# Set probabilities greater than 1 equal to 1
month.loc[month['probability'] > 1, 'probability'] = 1
# Make the predicted label
month['prediction'] = (month['previous_total'] > threshold).astype(int)
# Calculate metrics
print('Precision: {:.5f}.'.format(precision_score(month['label'], month['prediction'])))
print('Recall: {:.5f}.'.format(recall_score(month['label'], month['prediction'])))
print('F1 Score: {:.5f}.'.format(f1_score(month['label'], month['prediction'])))
print('ROC AUC Score: {:.5f}.'.format(roc_auc_score(month['label'], month['probability'])))
return month
The next function compares a machine learning model trained on a set of features to the informed baseline.
def compare(month, feature_matrix):
"""Compare machine learning model to baseline performance.
Computes statistics and shows ROC curve."""
print('Baseline Performance')
baseline = informed_baseline(month)
print('\nModel Performance')
fi, probs = predict_month(month, feature_matrix, return_probs=True)
# Calculate false positive rates and true positive rates
base_fpr, base_tpr, _ = roc_curve(baseline['label'], baseline['probability'])
model_fpr, model_tpr, _ = roc_curve(feature_matrix[feature_matrix['month'] == month]['label'], probs)
plt.figure(figsize = (8, 6))
plt.rcParams['font.size'] = 16
# Plot both curves
plt.plot(base_fpr, base_tpr, 'b', label = 'baseline')
plt.plot(model_fpr, model_tpr, 'r', label = 'model')
plt.legend();
plt.xlabel('False Positive Rate'); plt.ylabel('True Positive Rate'); plt.title('ROC Curves');
compare(6, feature_matrix)
Baseline Performance Precision: 0.41201. Recall: 0.46714. F1 Score: 0.43784. ROC AUC Score: 0.67322. Model Performance Training on 5266 observations. Testing on 2489 observations. Precision: 0.58214 Recall: 0.38263 F1 Score: 0.46176 ROC AUC: 0.74479
- We clearly see our model outperforming the baseline. In practice, we would optimize the model threshold for Precision/Recall/F1 Score.
Let's test December, when theoretically our model should do the best because it is training on the most data.
compare(12, feature_matrix)
Baseline Performance Precision: 0.21606. Recall: 0.57547. F1 Score: 0.31416. ROC AUC Score: 0.69498. Model Performance Training on 23936 observations. Testing on 4197 observations. Precision: 0.33038 Recall: 0.46855 F1 Score: 0.38752 ROC AUC: 0.79129
We get a slightly better ROC AUC. It's possible that the data does not stay consistent over the course of the year, so using more data might not necessarily create a better model. In other words, consumer behavior can shift (potentially due to seasonality), so the older data might not be relevant.
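One way to test that hypothesis is the `training_window` argument to `ft.dfs`, which restricts each label's features to a fixed window of data before its cutoff time; here is a minimal sketch (the 90-day window is an arbitrary choice):

# Rebuild the features using only the 90 days before each cutoff time,
# so stale behavior from early in the year is excluded.
fm_windowed, _ = ft.dfs(entityset=es, target_dataframe_name='customers',
                        cutoff_time=labels[['CustomerID', 'time', 'total', 'label']],
                        cutoff_time_in_index=True,
                        training_window='90 days',
                        max_depth=1)

We could then rerun the monthly comparison on `fm_windowed` to see whether a shorter memory helps.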
Feel free to check all of the months! It's clear that Featuretools and machine learning are able to outperform the baseline for this problem. Even without concentrating on the model, we were able to perform significantly better than a well-informed guess. With some minor hyperparameter tuning (using random search) we could achieve even better performance. We won't implement hyperparameter tuning here; it usually delivers a small performance gain relative to feature engineering (as can be seen in the other two projects in this repository).
Precision Recall Curve¶
Our model used the default threshold of 0.5 to make predictions. We can use the precision-recall curve to identify a better threshold for our model. For example, we may care more about identifying customers who will spend more than $500 than about false positives, so we would choose a threshold that yields high recall at the cost of lower precision. Conversely, we may want to limit false positives, even if that means missing some potential high-spending customers, so we would aim for higher precision. The ideal threshold for a classifier depends on the problem.
We use the ROC AUC to compare models across a range of thresholds, so we know which model is strictly better, but then we can use a precision recall curve to adjust the threshold for our business needs. This is usually a process done with the help of domain experts and validated in cross validation (or on a separate validation set of data).
def precision_recall(month, feature_matrix):
"Show the precision vs recall curve for a month"
# Find the probability
fi, probs = predict_month(month, feature_matrix,
return_probs = True)
# Calculate metrics across thresholds
precision, recall, t = precision_recall_curve(labels.loc[labels['time'].dt.month == month, 'label'],
probs)
# Plot the curve
plt.step(recall, precision, color='b', alpha=0.5,
where='post')
# Fill in the curve
plt.fill_between(recall, precision, step='post', alpha=0.5,
color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title("Precision vs. Recall Curve"); plt.show();
precision_recall(12, feature_matrix)
Training on 23936 observations. Testing on 4197 observations. Precision: 0.33038 Recall: 0.46855 F1 Score: 0.38752 ROC AUC: 0.79129
precision_recall(10, feature_matrix)
Training on 16631 observations. Testing on 3467 observations. Precision: 0.55786 Recall: 0.29841 F1 Score: 0.38883 ROC AUC: 0.72191
Depending on what we want to maximize, we can use these plots to select the appropriate threshold. If we want a recall around 80%, we would have to accept a precision near 25%.
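To turn that reading into a concrete threshold, we can search the `precision_recall_curve` output directly; here is a sketch for October (it recomputes the probabilities, so it retrains the model):

# Highest threshold that still achieves at least 80% recall for October,
# along with the precision we pay for it.
_, probs = predict_month(10, feature_matrix, return_probs=True)
y_true = feature_matrix.loc[feature_matrix['month'] == 10, 'label']
precision, recall, thresholds = precision_recall_curve(y_true, probs)
mask = recall[:-1] >= 0.80  # recall is non-increasing as the threshold rises
print('Threshold {:.3f}: precision {:.3f}, recall {:.3f}'.format(
    thresholds[mask][-1], precision[:-1][mask][-1], recall[:-1][mask][-1]))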
Below we generate probabilities for each month to plot the ROC curve compared to both the baseline and the manually engineered features. The graphs can be seen in the conclusions section. The code used to make the graphs is included in the Manual Retail Spending notebook.
%%capture
# Record probabilities for each month
for month in range(4, 13):
_, probs = predict_month(month, feature_matrix, return_probs=True)
temp_df = pd.DataFrame({'Automated': probs, 'month': month})
if month == 4:
probs_df = temp_df.copy()
else:
probs_df = pd.concat([probs_df, temp_df], ignore_index=True)
probs_df.head()
Automated | month | |
---|---|---|
0 | 0.037 | 4 |
1 | 0.080 | 4 |
2 | 0.799 | 4 |
3 | 0.176 | 4 |
4 | 0.209 | 4 |
probs_df.shape
(26851, 2)
probs_df['month'].unique()
array([ 4, 5, 6, 7, 8, 9, 10, 11, 12])
Conclusions¶
In this notebook we saw that automated feature engineering with Featuretools, used in a machine learning pipeline, results in better prediction of future customer behavior than an informed baseline guess. There are many advantages to using Featuretools in a time-series problem with multiple tables of data, such as we often encounter in real life. The key takeaways:
- Featuretools creates a rich set of relevant features from a set of related tables
- These features can deliver effective machine learning model performance
- Featuretools handles the issue of filtering data in a time-series problem
- Cutoff times for each label filter data
- All features for each label use only valid data
- Even using the default settings for Featuretools, we can create an effective machine learning model
- The Featuretools features and a random forest classifier outperformed an informed guess in terms of ROC AUC and F1 score
One could try manual feature engineering for this problem, but it would take significantly more time than with Featuretools, and it is unlikely to produce a better model.
Data science is about using the right methods to get the job done as efficiently as possible, and for this retail dataset, that method is Featuretools in a predictive modeling pipeline. The implementation with automated feature engineering is faster, safer, and just as interpretable as manual engineering work, and it delivers an effective machine learning model.
# Convert notebook to html
!jupyter nbconvert --to html "/content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week_Nine_Predictive_Engineering/Case_Studies/Automated_Retail_Spending/Automated_Retail_Spending.ipynb"