Concepts Covered:¶
- One Sample T-test for Population Mean $\mu$
- Two Independent Sample Z-test for Equality of Means
- Two Independent Sample T-test for Equality of Means - Equal Std Dev
- Two Independent Sample T-test for Equality of Means - Unequal Std Dev
- Paired Sample T-test for Equality of Means
- One Proportion Z-test
- Two Proportion Z-test
- Chi-Square Test for Variance
- F-test for Equality of Variances
- Chi-Square Test for Independence
- One-way ANOVA Test
Import the required packages¶
# import the important packages
import pandas as pd # library used for data manipulation and analysis
import numpy as np # library used for working with arrays
import matplotlib.pyplot as plt # library for plots and visualizations
import seaborn as sns # library for visualizations
%matplotlib inline
import scipy.stats as stats # this library contains a large number of probability distributions as well as a growing library of statistical functions
# to suppress warnings
import warnings
warnings.filterwarnings('ignore')
# Mount Google Colab drive
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
One Sample T-test for Population Mean¶
Let's revisit the example¶
A certain food aggregator ZYX is facing stiff competition from its main rival SWG during the COVID-19 period. To retain business, ZYX is advertising that, within a radius of 5 km from the restaurant where the order is placed, it can still deliver in 40 minutes or less (i.e., that the changed conditions have had no impact on its delivery times).
The delivery times in minutes of 25 randomly selected deliveries are given in a CSV file.
Assuming the delivery distribution is approximately normal and at a 0.05 level of significance, is there enough evidence that ZYX’s claim is false?
Let's write the null and alternative hypothesis¶
Let $\mu$ be the mean delivery time of the ZYX food aggregator.
We will test the null hypothesis
$H_0:\mu = 40$
against the alternate hypothesis
$H_a:\mu > 40$
Let's have a look at the sample data¶
fastfood = pd.read_csv('/content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week Two - Statistics for Data Science/Optional Content/FastFood1.csv')
fastfood.head()
|   | Delivery | Time |
|---|---|---|
| 0 | 1 | 39.4 |
| 1 | 2 | 39.5 |
| 2 | 3 | 39.7 |
| 3 | 4 | 40.7 |
| 4 | 5 | 40.4 |
Let's test whether the T-test assumptions are satisfied or not¶
Continuous data - Yes, the delivery time is measured on a continuous scale.
Normally distributed population and Sample size < 30 - Yes, it is assumed that the population is normal and the sample size is 25, which is less than 30.
Observations are from a simple random sample - Yes, we are informed that the collected sample is a simple random sample.
Population standard deviation is known - No, the population standard deviation is not given, which is why a T-test (rather than a Z-test) is appropriate.
Voila! We can use a T-test for this problem.
Let's find the p-value¶
# import the required functions
from scipy.stats import ttest_1samp
# calculate the test statistic and p-value
test_stat, p_value = ttest_1samp(fastfood['Time'], popmean = 40, alternative = 'greater')
print('The p-value is ', p_value)
The p-value is 1.4822680927543513e-05
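As a quick sanity check, the same result can be reproduced by computing the test statistic directly from the formula $t = \frac{\bar{x} - \mu_0}{s/\sqrt{n}}$. A minimal sketch, assuming the fastfood data frame loaded above:
# manual computation of the one-sample t statistic (sanity check)
n = fastfood['Time'].count()                # sample size
x_bar = fastfood['Time'].mean()             # sample mean
s = fastfood['Time'].std()                  # sample standard deviation (ddof = 1)
t_manual = (x_bar - 40) / (s / np.sqrt(n))  # test statistic against mu_0 = 40
p_manual = 1 - stats.t.cdf(t_manual, df = n - 1)  # right-tailed p-value with n-1 degrees of freedom
print('The manually computed test statistic is', t_manual)
print('The manually computed p-value is', p_manual)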
# another standalone example: a one-sample t-test on daily intake data
import numpy as np
import scipy.stats as stats
daily_intake = np.array([5260, 5070, 3640, 1180, 1390, 6515, 6805, 2015, 4515, 1230, 1770])
# two-sided test of H0: population mean daily intake equals 7752
stats.ttest_1samp(daily_intake, popmean = 7752)
TtestResult(statistic=-6.398908019819394, pvalue=7.843455779613616e-05, df=10)
Insight¶
As the p-value is much less than the level of significance, we can reject the null hypothesis. Hence, we have enough statistical evidence to conclude that the mean delivery time within a 5 km radius is greater than 40 minutes, contrary to ZYX's claim in the advertisement.
Two Independent Sample Z-test for Equality of Means¶
Let's revisit the example¶
To compare customer satisfaction levels of two competing media channels, 150 customers of Channel 1 and 300 customers of Channel 2 were randomly selected and asked to rate their channels on a scale of 1 to 5, with 1 being least satisfied and 5 being most satisfied (the survey results are summarized in a CSV file). Suppose we know that $\sigma_1 = 0.48$ and $\sigma_2 = 0.49$.
Test at a 0.05 level of significance whether the data provide sufficient evidence to conclude that channel 1 has a higher mean satisfaction rating than channel 2.
Let's write the null hypothesis and alternative hypothesis¶
Let $\mu_1, \mu_2$ be the mean customer rating of channel 1 and channel 2 respectively.
We will test the null hypothesis
$H_0:\mu_1=\mu_2$
against the alternate hypothesis
$H_a:\mu_1>\mu_2$
Let's have a look at the sample data¶
rating = pd.read_csv('/content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week Two - Statistics for Data Science/Optional Content/rating.csv')
rating.head()
|   | channel1_rating | channel2_rating |
|---|---|---|
| 0 | 3.5 | 3.3 |
| 1 | 3.7 | 3.3 |
| 2 | 3.0 | 2.9 |
| 3 | 3.8 | 3.7 |
| 4 | 3.3 | 4.0 |
# find the sample means and sample standard deviations for the two samples
print('The mean rating for channel 1 is ' + str(round(rating['channel1_rating'].mean(), 1)))
print('The mean rating for channel 2 is ' + str(round(rating['channel2_rating'].mean(), 1)))
The mean rating for channel 1 is 3.5
The mean rating for channel 2 is 3.2
Let's test whether the Z-test assumptions are satisfied or not¶
- Continuous data - Yes, the ratings are measured on a continuous scale.
- Normally distributed populations or Sample sizes > 30 - Since both sample sizes are greater than 30, the Central Limit Theorem ensures that the distributions of the sample means are approximately normal.
- Independent populations - As we are taking samples for two different channels, the two samples are from two independent populations.
- Known population standard deviations $\sigma_1$ and $\sigma_2$ - Yes, we know the population standard deviations of both populations.
- Random sampling from the population - Yes, we are informed that the collected sample is a simple random sample.
Voila! We can use a two-sample Z-test for this problem.
Let's find the p-value¶
# user-defined function to compare the equality of two means from two independent populations, where population standard deviations are known
# this function returns the p-value for one-tailed test
# for two-tailed test, multiply the p-value by 2
def ztest_2samp(X1, X2, pop_sd1, pop_sd2, n1, n2):
    '''
    X1 - first of the two independent samples (sample 1)
    X2 - second of the two independent samples (sample 2)
    pop_sd1 - Population standard deviation of sample 1
    pop_sd2 - Population standard deviation of sample 2
    n1 - the size of sample 1
    n2 - the size of sample 2
    '''
    from numpy import sqrt, abs
    from scipy.stats import norm
    se = sqrt(pop_sd1**2/n1 + pop_sd2**2/n2)
    test_stat = ((X1.mean() - X2.mean()) - 0)/ se
    pval = 1 - norm.cdf(abs(test_stat))
    return pval
# find the p-value
p_value = ztest_2samp(rating['channel1_rating'].dropna(), rating['channel2_rating'], 0.48, 0.49, 150 ,300)
print('The p-value is', p_value)
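For comparison, statsmodels also provides a two-sample z-test via statsmodels.stats.weightstats.ztest. Note that it estimates the standard error from the sample standard deviations rather than the known population values of 0.48 and 0.49, so its p-value can differ slightly from the one above. A minimal sketch:
# alternative: statsmodels two-sample z-test (uses sample standard deviations, not the known sigmas)
from statsmodels.stats.weightstats import ztest
test_stat_sm, p_value_sm = ztest(rating['channel1_rating'].dropna(), rating['channel2_rating'], alternative = 'larger')
print('The p-value from the statsmodels z-test is', p_value_sm)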
Insight¶
As the p-value is much less than the level of significance of 0.05, we reject the null hypothesis. Thus, we have enough statistical evidence to conclude that channel 1 has a higher mean satisfaction rating than channel 2.
Two Independent Sample T-test for Equality of Means - Equal Std Dev¶
Let's revisit the example¶
During the lockdown period, because of working from home and increased screen time, many people opted for listening to FM Radio for entertainment rather than watching Cable TV. An advertising agency randomly collected daily usage time data (in minutes) from both types of users and stored it in a CSV file.
Assuming daily Radio and TV usage time are normally distributed, do we have enough evidence to conclude that there is any difference between daily TV and Radio usage time at a 0.05 significance level?
Let's write the null hypothesis and alternative hypothesis¶
Let $\mu_1, \mu_2$ be the daily mean Radio usage time and TV usage time respectively.
We will test the null hypothesis
$H_0:\mu_1=\mu_2$
against the alternate hypothesis
$H_a:\mu_1\neq\mu_2$
Let's have a look at the sample data¶
tvradio = pd.read_csv('TVRadio.csv')
tvradio.head()
|   | Cable_TV | FM_Radio |
|---|---|---|
| 0 | 22 | 25 |
| 1 | 8 | 10 |
| 2 | 25 | 29 |
| 3 | 22 | 19 |
| 4 | 12 | 13 |
# find the sample means and sample standard deviations for the two samples
print('The mean usage time of cable TV is ' + str(tvradio['Cable_TV'].mean()))
print('The mean usage time of FM radio is ' + str(tvradio['FM_Radio'].mean()))
print('The standard deviation of usage time of cable TV is ' + str(round(tvradio['Cable_TV'].std(),2)))
print('The standard deviation of usage time of FM radio is ' + str(round(tvradio['FM_Radio'].std(),2)))
The mean usage time of cable TV is 18.8
The mean usage time of FM radio is 20.0
The standard deviation of usage time of cable TV is 5.41
The standard deviation of usage time of FM radio is 5.42
Let's test whether the T-test assumptions are satisfied or not¶
- Continuous data - Yes, the usage time is measured on a continuous scale.
- Normally distributed populations - Yes, we are informed that the populations are assumed to be normal.
- Independent populations - As we are taking random samples for two different types of users, the two samples are from two independent populations.
- Equal population standard deviations - As the sample standard deviations are almost equal, the population standard deviations can be assumed to be equal.
- Random sampling from the population - Yes, we are informed that the collected sample is a simple random sample.
Voila! We can use a two-sample T-test for this problem.
Let's find the p-value¶
# import the required functions
from scipy.stats import ttest_ind
# find the p-value
test_stat, p_value = ttest_ind(tvradio['Cable_TV'], tvradio['FM_Radio'], equal_var = True, alternative = 'two-sided')
print('The p-value is ' + str(p_value))
The p-value is 0.5491435225558506
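The same statistic can also be reproduced by hand with the pooled standard deviation $s_p = \sqrt{\frac{(n_1-1)s_1^2 + (n_2-1)s_2^2}{n_1+n_2-2}}$. A minimal sketch, assuming the tvradio data frame loaded above:
# manual pooled two-sample t statistic (sanity check)
x1, x2 = tvradio['Cable_TV'], tvradio['FM_Radio']
n1, n2 = x1.count(), x2.count()
# pooled standard deviation
sp = np.sqrt(((n1 - 1) * x1.std()**2 + (n2 - 1) * x2.std()**2) / (n1 + n2 - 2))
t_manual = (x1.mean() - x2.mean()) / (sp * np.sqrt(1/n1 + 1/n2))
# two-sided p-value with n1 + n2 - 2 degrees of freedom
p_manual = 2 * (1 - stats.t.cdf(abs(t_manual), df = n1 + n2 - 2))
print('The manually computed test statistic is', t_manual)
print('The manually computed p-value is', p_manual)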
Insight¶
As the p-value (~0.55) is much greater than the level of significance, we cannot reject the null hypothesis. Hence, we do not have enough statistical evidence to conclude that there is any difference between daily TV and Radio usage time at a 0.05 significance level.
Two Independent Sample T-test for Equality of Means - Unequal Std Dev¶
Let's revisit the example¶
SAT verbal scores of two groups of students are given in a CSV file. The first group, College, contains scores of students whose parents have at least a bachelor’s degree, and the second group, High School, contains scores of students whose parents do not have any college degree.
The Education Department is interested in knowing whether the sample data support the theory that students show a higher population mean verbal score on the SAT if their parents attain a higher level of education.
Assuming SAT verbal scores for two populations are normally distributed, do we have enough statistical evidence for this at a 5% significance level?
Let's write the null hypothesis and alternative hypothesis¶
Let $\mu_1, \mu_2$ be the mean SAT verbal scores of College and High School groups respectively.
We will test the null hypothesis
$H_0:\mu_1=\mu_2$
against the alternate hypothesis
$H_a:\mu_1>\mu_2$
Let's have a look at the sample data¶
# import the data
satscore = pd.read_csv('SATVerbal1.csv')
satscore.head()
|   | College | High School |
|---|---|---|
| 0 | 485 | 489.0 |
| 1 | 534 | 507.0 |
| 2 | 650 | 447.0 |
| 3 | 554 | 515.0 |
| 4 | 550 | 470.0 |
# find the sample means and sample standard deviations for the two samples
print('The mean SAT verbal score for College group is ' + str(satscore['College'].mean()))
print('The mean SAT verbal score for High School group is ' + str(satscore['High School'].mean()))
print('The standard deviation of SAT verbal score for College group is ' + str(round(satscore['College'].std(), 2)))
print('The standard deviation of SAT verbal score for High School group is ' + str(round(satscore['High School'].std(), 2)))
The mean SAT verbal score for College group is 525.0
The mean SAT verbal score for High School group is 480.0
The standard deviation of SAT verbal score for College group is 59.42
The standard deviation of SAT verbal score for High School group is 31.1
Let's test whether the T-test assumptions are satisfied or not¶
- Continuous data - Yes, the SAT score is measured on a continuous scale.
- Normally distributed populations - Yes, we are informed that the populations are assumed to be normal.
- Independent populations - As we are taking random samples for two different groups, the two samples are from two independent populations.
- Unequal population standard deviations - As the sample standard deviations are different, the population standard deviations can be assumed to be different.
- Random sampling from the population - Yes, we are informed that the collected sample is a simple random sample.
Voila! We can use a two-sample T-test for this problem.
Let's find the p-value¶
# import the required functions
from scipy.stats import ttest_ind
# find the p-value
test_stat, p_value = ttest_ind(satscore['College'], satscore['High School'].dropna(), equal_var = False, alternative = 'greater')
print('The p-value is ', p_value)
The p-value is 0.008034956026490195
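For the unequal-variance case, the degrees of freedom used by the test come from the Welch-Satterthwaite approximation $\nu = \frac{(s_1^2/n_1 + s_2^2/n_2)^2}{\frac{(s_1^2/n_1)^2}{n_1-1} + \frac{(s_2^2/n_2)^2}{n_2-1}}$. A minimal sketch computing it for this data, assuming the satscore data frame loaded above:
# Welch-Satterthwaite degrees of freedom (sanity check)
college = satscore['College'].dropna()
highschool = satscore['High School'].dropna()
v1 = college.var() / college.count()        # s1^2 / n1
v2 = highschool.var() / highschool.count()  # s2^2 / n2
df_welch = (v1 + v2)**2 / (v1**2 / (college.count() - 1) + v2**2 / (highschool.count() - 1))
print('The Welch-Satterthwaite degrees of freedom are', df_welch)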
Insight¶
As the p-value (~0.008) is less than the level of significance, we can reject the null hypothesis. Hence, we do have enough evidence to support the claim that students show a higher population mean verbal score on SAT if their parents attain a higher level of education.
import scipy.stats as stats
import numpy as np
# scores of two groups (0 and 1) of students
energ = np.array([[49, 0],[53, 1],[56, 1],[43, 1],[45, 1],[53, 1],[45, 1],[56, 1],[51, 0],[43, 0],[47, 1],[44, 0],[51, 0],[53, 1],[50, 0],[54, 0],[51, 0],[51, 1],[46, 0],[47, 1]])
# Separating the data into 2 groups
group1 = energ[:, 1] == 0 # Extracting the elements of the array where group == 0
group1 = energ[group1][:, 0] # Extracting the scores of group 0 students
group2 = energ[:, 1] == 1 # Extracting the elements of the array where group == 1
group2 = energ[group2][:, 0] # Extracting the scores of group 1 students
#applying t-test to check if the scores of group 0 and group 1 students are indeed different
stats.ttest_ind(group1, group2, equal_var = False)
TtestResult(statistic=-0.607694899652158, pvalue=0.5509860214402561, df=17.98688825272457)
Paired Sample T-test for Equality of Means¶
Let's revisit the example¶
Typical prices of single-family homes in Florida are given for a sample of 15 metropolitan areas (in 1000 USD) for 2002 and 2003 in a CSV file.
Assuming the house prices are normally distributed, do we have enough statistical evidence to say that there is an increase in the house price in one year at a 0.05 significance level?
Let's write the null hypothesis and alternative hypothesis¶
Let $\mu_1, \mu_2$ be the mean price of single-family homes in metropolitan areas of Florida for 2002 and 2003 respectively.
We want to test whether there is an increase in the house price from 2002 to 2003.
We will test the null hypothesis
$H_0:\mu_1=\mu_2$
against the alternate hypothesis
$H_a:\mu_1<\mu_2$
Let's have a look at the sample data¶
# import the data
houseprice = pd.read_csv('Florida.csv')
houseprice.head()
|   | Metropolitan Area | Jan_2003 | Jan_2002 |
|---|---|---|---|
| 0 | Daytona Beach | 117 | 96 |
| 1 | Fort Lauderdale | 207 | 169 |
| 2 | Fort Myers | 143 | 129 |
| 3 | Fort Walton Beach | 139 | 134 |
| 4 | Gainesville | 131 | 119 |
# find the mean difference between the house prices in year 2002 and 2003
diff = np.mean(houseprice['Jan_2003'] - houseprice['Jan_2002'])
print('The mean of the differences between the house prices from 2002 to 2003 is', diff)
The mean of the differences between the house prices from 2002 to 2003 is 15.0
Let's test whether the paired T-test assumptions are satisfied or not¶
- Continuous data - Yes, the house price is measured on a continuous scale.
- Normally distributed populations - Yes, we are informed that the populations are assumed to be normal.
- Independent observations - As the sampled units (metropolitan areas) are selected randomly, the paired observations are independent of each other.
- Random sampling from the population - Yes, we are informed that the collected sample is a simple random sample.
Voila! We can use a paired sample T-test for this problem.
Let's find the p-value¶
# import the required functions
from scipy.stats import ttest_rel
# find the p-value
test_stat, p_value = ttest_rel(houseprice['Jan_2002'], houseprice['Jan_2003'], alternative = 'less')
print('The p-value is ', p_value)
The p-value is 8.282698151615477e-05
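A paired t-test is equivalent to a one-sample t-test on the within-pair differences, which gives a quick cross-check of the result above. A minimal sketch, assuming the houseprice data frame loaded above:
# equivalent one-sample t-test on the paired differences (cross-check)
from scipy.stats import ttest_1samp
differences = houseprice['Jan_2002'] - houseprice['Jan_2003']
test_stat_d, p_value_d = ttest_1samp(differences, popmean = 0, alternative = 'less')
print('The p-value from the one-sample test on the differences is', p_value_d)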
Insight¶
As the p-value is much less than the level of significance, the null hypothesis can be rejected. Thus, we have enough statistical evidence to conclude that house prices increased from 2002 to 2003.
One Proportion Z-test¶
Let's revisit the example¶
A researcher claims that the Democratic party will win the next United States Presidential election.
To test her belief, the researcher randomly surveyed 90 people, and 24 of them said that they would vote for the Democratic party.
Is there enough evidence at $\alpha$ = 0.05 to support this claim?
Let's write the null hypothesis and alternative hypothesis¶
Let $p$ be the proportion of people who voted for the Democratic party.
The researcher will test the null hypothesis
$H_0:p \leq 0.5$
against the alternate hypothesis
$H_a:p > 0.5$
Let's test whether the Z-test assumptions are satisfied or not¶
- Binomially distributed population - Yes, people either vote for the Democratic party or the Republican party.
- Random sampling from the population - Yes, the researcher conducted a random survey
- Can the binomial distribution be approximated by a normal distribution - Yes. For binary data, the CLT kicks in more slowly than usual, so the standard check is whether $np$ and $n(1-p)$ are both at least 10.
$np = 90\cdot \frac{24}{90} =24 \geq 10\\ n(1-p) = 90 \cdot \frac{90-24}{90} =66 \geq 10$
Let's find the p-value¶
# import the required function
from statsmodels.stats.proportion import proportions_ztest
# find the p-value
test_stat, p_value = proportions_ztest(24, 90, value = 0.5, alternative = 'larger')
print('The p-value is ' + str(p_value))
The p-value is 0.9999997216721324
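The textbook form of the statistic uses the hypothesized proportion in the standard error, $z = \frac{\hat{p} - p_0}{\sqrt{p_0(1-p_0)/n}}$. Note that proportions_ztest uses the sample proportion in the standard error by default, so the two p-values can differ slightly. A minimal sketch:
# manual one-proportion z statistic using the hypothesized proportion p0 = 0.5
from scipy.stats import norm
n = 90            # sample size
p_hat = 24 / 90   # sample proportion
p0 = 0.5          # hypothesized proportion
z_manual = (p_hat - p0) / np.sqrt(p0 * (1 - p0) / n)
p_manual = 1 - norm.cdf(z_manual)  # right-tailed p-value
print('The manually computed test statistic is', z_manual)
print('The manually computed p-value is', p_manual)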
Insight¶
As the p-value is much greater than the significance level of 0.05, we cannot reject the null hypothesis. Thus, the researcher does not have enough statistical evidence to claim that the Democratic party will win the next United States Presidential election at a 5% significance level.
Two Proportion Z-test¶
Let's revisit the example¶
A car manufacturer aims to improve its products’ quality by reducing defects. So, the manufacturer randomly checks the efficiency of two assembly lines on the shop floor. In line 1, there are 20 defects out of 200 samples, and in line 2, there are 25 defects out of 400 samples.
At a 5% level of significance, do we have enough statistical evidence to conclude that the two assembly procedures are different?
Let's write the null hypothesis and alternative hypothesis¶
Let $p_1$ and $p_2$ be the proportions of defects in assembly line 1 and line 2 respectively.
The manufacturer will test the null hypothesis
$H_0:p_1 =p_2$
against the alternate hypothesis
$H_a:p_1 \neq p_2$
Let's test whether the Z-test assumptions are satisfied or not¶
- Binomially distributed population - Yes, a product is either defective or non-defective.
- Random sampling from the population - Yes, we are informed that the collected sample is a simple random sample.
- Can the binomial distribution be approximated by a normal distribution - Yes. For binary data, the CLT kicks in more slowly than usual, so the standard check is whether $np$ and $n(1-p)$ are both at least 10.
$n_1p_1 = 200\cdot \frac{20}{200} = 20 \geq 10\\ n_1(1-p_1) = 200 \cdot \frac{200-20}{200} = 180 \geq 10 \\ n_2p_2 = 400\cdot \frac{25}{400} = 25 \geq 10\\ n_2(1-p_2) = 400 \cdot \frac{400-25}{400} = 375 \geq 10$
Let's find the p-value¶
# import the required function
from statsmodels.stats.proportion import proportions_ztest
# set the counts of defective items
defect_count = np.array([20, 25])
# set the sample sizes
nobs = np.array([200, 400])
# find the p-value
test_stat, p_value = proportions_ztest(defect_count, nobs)
print('The p-value is ' + str(p_value))
The p-value is 0.10017829422626796
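Under the null hypothesis of equal proportions, the statistic uses a pooled estimate $\hat{p} = \frac{x_1 + x_2}{n_1 + n_2}$ in the standard error. A minimal sketch reproducing the calculation by hand:
# manual two-proportion z statistic with a pooled proportion (sanity check)
from scipy.stats import norm
x1, x2 = 20, 25    # defect counts
n1, n2 = 200, 400  # sample sizes
p_pool = (x1 + x2) / (n1 + n2)                       # pooled proportion under H0
se = np.sqrt(p_pool * (1 - p_pool) * (1/n1 + 1/n2))  # standard error under H0
z_manual = (x1/n1 - x2/n2) / se
p_manual = 2 * (1 - norm.cdf(abs(z_manual)))  # two-sided p-value
print('The manually computed test statistic is', z_manual)
print('The manually computed p-value is', p_manual)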
Insight¶
As the p-value is greater than the significance level of 0.05, we cannot reject the null hypothesis. Thus, the manufacturer does not have enough statistical evidence to conclude that the two assembly procedures are different.
Chi-Square Test for Variance¶
Let's revisit an example¶
It is conjectured that the standard deviation of the annual return of mid-cap mutual funds, considering all such funds over a long period, is 22.4%. The sample standard deviation of a certain mid-cap mutual fund, based on a random sample of size 32, is observed to be 26.4%.
Do we have enough evidence to claim that the standard deviation of the chosen mutual fund is greater than the conjectured standard deviation for mid-cap mutual funds at a 0.05 level of significance?
Let's write the null hypothesis and alternative hypothesis¶
Let $\sigma$ be the standard deviation of the annual return of the chosen mutual fund.
We will test the null hypothesis
$H_0:\sigma^2 = 22.4^2$
against the alternate hypothesis
$H_a:\sigma^2 > 22.4^2$
Let's test whether the assumptions are satisfied or not¶
- Continuous data - Yes
- Normally distributed population - Yes, we assume that the annual returns of the fund are approximately normally distributed (the Chi-Square test for variance requires population normality and, unlike tests for means, cannot rely on the Central Limit Theorem).
- Random sampling from the population - Yes, we are informed that the collected sample is a simple random sample.
Let's find the p-value¶
# import the required function
from scipy.stats import chi2
# user-defined function to get the test stat and p-value
def chi_var(pop_var, sample_var, n):
    # calculate the test statistic
    test_stat = (n - 1) * sample_var / pop_var
    # calculate the p-value
    p_value = 1 - chi2.cdf(test_stat, n-1)
    return (test_stat, p_value)
# set the value of sample size
n = 32
# set the values of population and sample variance
sigma_2, s_2 = 22.4**2, 26.4**2
test_stat, p_value = chi_var(sigma_2, s_2, n)
print('The p-value is ', p_value)
The p-value is 0.0733923626973344
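Equivalently, for this right-tailed test the statistic can be compared with the upper critical value of the Chi-Square distribution with $n-1$ degrees of freedom. A minimal sketch, reusing the test_stat and n computed above:
# critical-value approach: compare the test statistic with the upper 5% critical value
crit_value = chi2.ppf(1 - 0.05, n - 1)  # upper critical value with n-1 degrees of freedom
print('The test statistic is', test_stat)
print('The critical value is', crit_value)
print('Reject the null hypothesis:', test_stat > crit_value)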
Let's plot the Chi-Square distribution with the test statistic¶
# plot the Chi-Square distribution (degrees of freedom n-1) with test statistic
x = np.linspace(0, 100, 100)
plt.plot(x, chi2.pdf(x, n-1))
plt.axvline(x = test_stat, c = 'r')
plt.show()
Insight¶
As the p-value is greater than the significance level, we cannot reject the null hypothesis. Hence, we do not have enough statistical evidence to conclude that the standard deviation of the chosen mutual fund is greater than the conjectured standard deviation for mid-cap mutual funds at a 0.05 level of significance.
F-test for Equality of Variances¶
Let's revisit the example¶
The variance of a process is an important quality of the process. A large variance implies that the process needs better control and there is an opportunity to improve.
The data (Bags.csv) includes weights for two different sets of bags manufactured from two different machines. It is assumed that the weights for two sets of bags follow a normal distribution.
Do we have enough statistical evidence at a 5% significance level to conclude that there is a significant difference between the variances of the bag weights for the two machines?
Let's write the null hypothesis and alternative hypothesis¶
Let $\sigma_1^2, \sigma_2^2$ be the variances of weights of the bags produced by two different machines.
We will test the null hypothesis
$H_0:\sigma_1^2 = \sigma_2^2$
against the alternate hypothesis
$H_a:\sigma_1^2 \neq \sigma_2^2$
Let's test whether the assumptions of the F-test are satisfied or not¶
- Continuous data - Yes, the weight is measured on a continuous scale.
- Normally distributed populations - Yes, it is assumed that the populations are normally distributed.
- Independent populations - As the two sets of bags are manufactured from two different machines, the populations are independent.
- Random sampling from the population - Yes, we are informed that the collected sample is a simple random sample.
Let's have a look at the sample data¶
bagweight = pd.read_csv('Bags1.csv')
bagweight.head()
|   | Machine 1 | Machine 2 |
|---|---|---|
| 0 | 2.95 | 3.22 |
| 1 | 3.45 | 3.30 |
| 2 | 3.50 | 3.34 |
| 3 | 3.75 | 3.28 |
| 4 | 3.48 | 3.29 |
Let's find the p-value¶
# import the required function
from scipy.stats import f
# user-defined function to perform F-test
def f_test(x, y):
    x = np.array(x)
    y = np.array(y)
    test_stat = np.var(x, ddof = 1)/np.var(y, ddof = 1) # calculate the F-test statistic (ratio of sample variances)
    dfn = x.size-1 # define degrees of freedom numerator
    dfd = y.size-1 # define degrees of freedom denominator
    # two-tailed p-value: twice the smaller of the two tail probabilities
    p = 2 * min(f.cdf(test_stat, dfn, dfd), 1 - f.cdf(test_stat, dfn, dfd))
    return(print("The p_value is {}".format(round(p, 8))))
# perform f-test
f_test(bagweight.dropna()['Machine 1'], bagweight.dropna()['Machine 2'])
The p_value is 5.1e-06
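As a cross-check, scipy.stats.bartlett tests the same null hypothesis of equal variances and also assumes normal populations. A minimal sketch, assuming the bagweight data frame loaded above:
# cross-check with Bartlett's test for equality of variances (also assumes normality)
from scipy.stats import bartlett
test_stat_b, p_value_b = bartlett(bagweight['Machine 1'].dropna(), bagweight['Machine 2'].dropna())
print("The p-value from Bartlett's test is", p_value_b)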
Insight¶
As the p-value is much smaller than the level of significance, the null hypothesis can be rejected. Hence, we have enough statistical evidence to conclude that there is a difference between the variances of the bag weights for the two machines at a 0.05 significance level.
Chi-Square Test for Independence¶
Let's revisit the example¶
The beverage preference data for different age groups is given in the Beverage.csv file.
Do we have enough statistical evidence to conclude that beverage preference depends on age?
Let's have a look at the sample data¶
beverage = pd.read_csv('Beverage.csv')
beverage.head()
|   | Age | Tea/Coffee | Soft Drink | Others |
|---|---|---|---|---|
| 0 | 21 - 34 | 25 | 90 | 20 |
| 1 | 35 - 55 | 40 | 35 | 25 |
| 2 | > 55 | 24 | 15 | 30 |
Let's write the null hypothesis and alternative hypothesis¶
We will test the null hypothesis
$H_0:$ Beverage preference is independent of age.
against the alternate hypothesis
$H_a:$ Beverage preference depends on age.
Let's test whether the assumptions of the Chi-Square test are satisfied or not¶
- Categorical variables - Yes
- The expected frequency of observations in each cell is at least 5 - Yes, the observed counts in each cell are well above 5, and the expected frequencies under independence (shown after the test below) are as well.
- Random sampling from the population - Yes, we are informed that the collected sample is a simple random sample.
Let's find the p-value¶
# import the required function
from scipy.stats import chi2_contingency
# find the p-value
chi, p_value, dof, expected = chi2_contingency(beverage.drop('Age', axis = 1))
print('The p-value is', p_value)
The p-value is 5.410957050304089e-10
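The expected array returned by chi2_contingency contains the frequencies we would expect under independence; inspecting it is also a convenient way to verify the assumption that every expected cell count is at least 5. A minimal sketch, reusing the variables from the cell above:
# expected frequencies under the null hypothesis of independence
expected_df = pd.DataFrame(expected, index = beverage['Age'], columns = beverage.drop('Age', axis = 1).columns)
print(expected_df)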
Insight¶
As the p-value is much less than the significance level, we can reject the null hypothesis. Hence, we do have enough statistical evidence to conclude that beverage preference is not independent of age at a 5% significance level.
One-way ANOVA Test (Analysis of Variance)¶
Let's revisit the example¶
A traffic management inspector in a certain city wants to understand whether carbon emissions from different cars are different. The inspector has reasons to believe that fuel type may be the factor responsible for differences in carbon emission.
For this purpose, the inspector has taken random samples from all registered cars on the road in that city and would like to test if the amount of carbon emission released depends on fuel type at a 5% significance level.
Let's have a look at the sample data¶
aovdata = pd.read_csv('AOVData.csv')
aovdata.head()
|   | Car_ID | manufacturer | fuel_type | co_emissions |
|---|---|---|---|---|
| 0 | 1 | Audi | Petrol | 441.55 |
| 1 | 2 | BMW | E85 | 376.47 |
| 2 | 3 | BMW | E85 | 414.12 |
| 3 | 4 | BMW | E85 | 351.41 |
| 4 | 5 | Volvo | E85 | 284.59 |
Here, co_emissions is the response and fuel_type is the factor.
# get the levels of factor fuel_type
aovdata['fuel_type'].value_counts()
fuel_type
Petrol    179
LPG       170
E85       161
Name: count, dtype: int64
Let's write the null hypothesis and alternative hypothesis¶
Let $\mu_1, \mu_2, \mu_3$ be the means of carbon dioxide emissions for fuel type E85, LPG, and Petrol respectively.
We will test the null hypothesis
$H_0: \mu_1 = \mu_2 = \mu_3$
against the alternative hypothesis
$H_a: $ At least one mean carbon emission is different from the rest.
# mean of the carbon emission at different levels of the fuel_type factor
print(aovdata.groupby("fuel_type")["co_emissions"].mean())
# draw the boxplot for visualization
fig, ax = plt.subplots(figsize = (6,6))
a = sns.boxplot(x = "fuel_type", y = 'co_emissions' , data = aovdata, hue = 'fuel_type')
a.set_title("Carbon Emission w.r.t. Fuel type (3 levels)", fontsize = 15)
plt.show()
fuel_type
E85       338.124534
LPG       363.744412
Petrol    371.722961
Name: co_emissions, dtype: float64
Now, the normality and equality of variance assumptions need to be checked.
For testing normality, Shapiro-Wilk's test is applied to the response variable.
For equality of variances, Levene's test is applied to the response variable.
Shapiro-Wilk’s test¶
We will test the null hypothesis
$H_0:$ Carbon emission follows a normal distribution
against the alternative hypothesis
$H_a:$ Carbon emission does not follow a normal distribution
# Assumption 1: Normality
# import the required function
from scipy import stats
# find the p-value
w, p_value = stats.shapiro(aovdata['co_emissions'])
print('The p-value is', p_value)
The p-value is 0.49750789999961853
Since the p-value of the test is very large, we fail to reject the null hypothesis that the response variable follows the normal distribution.
Levene’s test¶
We will test the null hypothesis
$H_0$: All the population variances are equal
against the alternative hypothesis
$H_a$: At least one variance is different from the rest
# Assumption 2: Homogeneity of Variance
# import the required function
from scipy.stats import levene
statistic, p_value = levene(aovdata['co_emissions'][aovdata['fuel_type'] == "Petrol"],
aovdata['co_emissions'][aovdata['fuel_type'] == "E85"],
aovdata['co_emissions'][aovdata['fuel_type'] == "LPG"])
# find the p-value
print('The p-value is', p_value)
The p-value is 0.19437768490117
Since the p-value is large, we fail to reject the null hypothesis of homogeneity of variances.
Let's test whether the assumptions of Anova are satisfied or not¶
- The populations are normally distributed - Yes, the normality assumption is verified using the Shapiro-Wilk’s test.
- Samples are independent simple random samples - Yes, we are informed that the collected sample is a simple random sample.
- Population variances are equal - Yes, the homogeneity of variance assumption is verified using Levene's test.
Let's find the p-value¶
# import the required function
from scipy.stats import f_oneway
# perform one-way Anova test
test_stat, p_value = f_oneway(aovdata.loc[aovdata['fuel_type'] == 'Petrol', 'co_emissions'],
aovdata.loc[aovdata['fuel_type'] == 'E85', 'co_emissions'],
aovdata.loc[aovdata['fuel_type'] == 'LPG', 'co_emissions'])
print('The p-value is ' + str(p_value))
The p-value is 8.274439764368132e-06
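The same one-way ANOVA can also be run through statsmodels, which produces a full ANOVA table with the between-group and residual sums of squares. A minimal sketch, assuming the aovdata data frame loaded above:
# one-way ANOVA via statsmodels (equivalent to f_oneway, but with a full ANOVA table)
import statsmodels.api as sm
from statsmodels.formula.api import ols
model = ols('co_emissions ~ C(fuel_type)', data = aovdata).fit()
print(sm.stats.anova_lm(model, typ = 1))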
Insight¶
As the p-value is much less than the significance level, we can reject the null hypothesis. Hence, we do have enough statistical evidence to conclude that at least one mean carbon emission is different from the rest at a 5% significance level.
However, we don't know which mean is different from the rest or whether all pairs of means are different. Multiple comparison tests are used to test the differences between all pairs of means.
Multiple Comparison test (Tukey HSD)¶
To identify which fuel type mean carbon emission is different from other groups, the null hypothesis is
$H_0: \mu_1=\mu_2 \text{ and } \mu_1=\mu_3 \text{ and } \mu_2=\mu_3$
against the alternative hypothesis
$H_a: \mu_1\neq\mu_2 \text{ or } \mu_1\neq\mu_3 \text{ or } \mu_2\neq\mu_3$
# import the required function
from statsmodels.stats.multicomp import pairwise_tukeyhsd
# perform multiple pairwise comparison (Tukey HSD)
m_comp = pairwise_tukeyhsd(endog = aovdata['co_emissions'], groups = aovdata['fuel_type'], alpha = 0.05)
print(m_comp)
Multiple Comparison of Means - Tukey HSD, FWER=0.05
====================================================
group1 group2 meandiff  p-adj    lower    upper  reject
----------------------------------------------------
   E85    LPG  25.6199  0.0012   8.6843  42.5554   True
   E85 Petrol  33.5984  0.0     16.8712  50.3256   True
   LPG Petrol   7.9785  0.4916  -8.5139  24.471   False
----------------------------------------------------
Insight¶
As the p-values for comparing the mean carbon emissions for the pairs E85-LPG and E85-Petrol are less than the significance level, the null hypothesis of equality of all population means can be rejected.
Thus, we can say that the mean carbon emissions for Petrol and LPG are similar, but the mean emission for fuel type E85 is significantly different from both LPG and Petrol.