# import the important libraries
import pandas as pd  # library used for data manipulation and analysis
import numpy as np  # library used for working with arrays
import matplotlib.pyplot as plt  # library for visualization
import seaborn as sns  # library for visualization
%matplotlib inline
import scipy.stats as stats  # this library contains a large number of probability distributions as well as a growing library of statistical functions

# Connect to google
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive

# declare the sample size in variable n that represents the number of doses given to the volunteers
n=100

# declare p that represents the probability of success i.e. the probability that a dose will not do a satisfactory job
p=0.09

# declare the different number of doses selected in a Numpy array
k=np.arange(0,101)
k

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100])

# import the binomial function from the scipy library
from scipy.stats import binom

# use the binom.pmf() function to generate the probability distribution
binomial = binom.pmf(k = k,n = n, p = p)

binomial

array([8.01935118e-005, 7.93122644e-004, 3.88281470e-003, 1.25444783e-002,
       3.00860701e-002, 5.71304716e-002, 8.94625517e-002, 1.18815257e-001,
       1.36604904e-001, 1.38106057e-001, 1.24295451e-001, 1.00578737e-001,
       7.37761613e-002, 4.93919862e-002, 3.03562992e-002, 1.72130224e-002,
       9.04393005e-003, 4.41965812e-003, 2.01555838e-003, 8.60313523e-004,
       3.44598109e-004, 1.29832882e-004, 4.61094802e-005, 1.54652915e-005,
       4.90725596e-006, 1.47541234e-006, 4.20922794e-007, 1.14096288e-007,
       2.94196474e-008, 7.22392251e-009, 1.69087417e-009, 3.77614579e-010,
       8.05283837e-011, 1.64113789e-011, 3.19846848e-012, 5.96511861e-013,
       1.06519975e-013, 1.82226034e-014, 2.98791675e-015, 4.69782347e-016,
       7.08545353e-017, 1.02550118e-017, 1.42475439e-018, 1.90064347e-019,
       2.43514011e-020, 2.99709552e-021, 3.54410483e-022, 4.02720353e-023,
       4.39783902e-024, 4.61580772e-025, 4.65638625e-026, 4.51491879e-027,
       4.20769059e-028, 3.76886240e-029, 3.24425884e-030, 2.68356276e-031,
       2.13273728e-032, 1.62823205e-033, 1.19387230e-034, 8.40535911e-036,
       5.68054489e-037, 3.68401398e-038, 2.29189810e-039, 1.36722336e-040,
       7.81740003e-042, 4.28205851e-043, 2.24583488e-044, 1.12715348e-045,
       5.40989952e-047, 2.48136815e-048, 1.08681588e-049, 4.54171625e-051,
       1.80920015e-052, 6.86314073e-054, 2.47660113e-055, 8.49120386e-057,
       2.76246511e-058, 8.51566239e-060, 2.48343493e-061, 6.83989590e-063,
       1.77574220e-064, 4.33636680e-066, 9.93726512e-068, 2.13138746e-069,
       4.26612090e-071, 7.94209967e-073, 1.37002742e-074, 2.18041498e-076,
       3.18567124e-078, 4.24808611e-080, 5.13504915e-082, 5.58090114e-084,
       5.39958185e-086, 4.59375981e-088, 3.38329282e-090, 2.11333502e-092,
       1.08859977e-094, 4.43974076e-097, 1.34416910e-099, 2.68565255e-102,
       2.65613989e-105])

# plot the distribution
plt.bar(k,binomial)
plt.xlim(-10,50)
plt.title('Binomial: n=%i , p=%.2f' % (n,p), fontsize=15)
plt.xlabel('Number of Doses')
plt.ylabel('Probability that doses will not do a satisfactory job')
plt.show()

binomial[3]

0.012544478265088905

# plot the probability distribution
# we are plotting the distributions here to better visualize the calculations
# ofcourse you do not 'need' to create the following visualization to answer the question above
# you can directly use the cdf function for probability calculations
barl=plt.bar(k,binomial)
plt.xlim(-10,50)
plt.title('Binomial: n=%i , p=%.2f' % (n,p), fontsize=15)
plt.xlabel('Number of Doses')
plt.ylabel('Probability that doses will not do a satisfactory job')
for i in range(0,4):
    barl[i].set_color('r')
plt.show()

# Calculate the cdf
from scipy.stats import binom
prob_atmost3 = binom.cdf(k = 3 ,n = n, p = p)
print(prob_atmost3)

0.017300609121718068

# declare the sample size in variable n which represents the number of doses selected randomly
n = 200

# declare p which represents the probability of success, i.e., the probability that a dose will not do a satisfactory job
p = 0.09

# declare the different number of doses selected in a Numpy array
k = np.arange(0,201)
k

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181,
       182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
       195, 196, 197, 198, 199, 200])

# import the required function
from scipy.stats import binom
# use the binom.pmf() function to generate the probability distribution
binomial = binom.pmf(k = k,n = n, p = p)

# plot the probability distribution
# we are plotting the distributions here to better visualize the calculations
# ofcourse you do not 'need' to create the following visualization to answer the question above
# you can directly use the cdf function for probability calculations
barl=plt.bar(k,binomial,width=0.7)
plt.xlim(-10,50)
plt.title('Binomial: n=%i , p=%.2f' % (n,p), fontsize=15)
plt.xlabel('Number of Doses')
plt.ylabel('Probability that doses will not do a satisfactory job')
for i in range(30,201):
    barl[i].set_color('r')
plt.show()

# calculate cdf
prob_atleast30 = 1-binom.cdf(k = 29 , n = 200, p = 0.09)
print(prob_atleast30)

0.003963863923503452

drug = pd.read_csv('doses.csv')
drug.head()

drug.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   drug_serial_number  50 non-null     int64  
 1   time_of_effect      50 non-null     float64
dtypes: float64(1), int64(1)
memory usage: 928.0 bytes

# import the required function
from scipy.stats import norm
# estimate the mean and standard deviation of the time of effect for the doses
mu = drug['time_of_effect'].mean()
sigma = drug['time_of_effect'].std()
print('The estimated mean is', round(mu,2))
print('The estimated standard deviation is', round(sigma,2))

The estimated mean is 13.44
The estimated standard deviation is 4.75

import seaborn as sns
sns.displot(drug['time_of_effect'],kind="kde")

<seaborn.axisgrid.FacetGrid at 0x1285beb61f0>

# find the cumulative probability
# norm.cdf() calculates the cumulative probability
prob = norm.cdf(11.5, mu, sigma)
print('The probability that a dose will take less than 11.5 hours is', round(prob, 4))

The probability that a dose will take less than 11.5 hours is 0.3412

perc_90th = norm.ppf(0.90, mu, sigma)

print('The 90th percentile of the time of effect for doses is', round(perc_90th, 2))

The 90th percentile of the time of effect for doses is 19.52

# import the required function
from scipy.stats import t

# set the values of the sample mean and sample standard deviation
x_bar, s = mu, sigma

# set the value of sample size and degrees of freedom
n = 50
k = n - 1

# construct the confidence interval
np.round(t.interval(0.95, df = k, loc = x_bar, scale = s/np.sqrt(n)), 2)

array([12.09, 14.79])

# Convert notebook to html
!jupyter nbconvert --to html "/content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week Two - Statistics for Data Science/Mentored Session/Notebook - Medicon Dose Testing.ipynb"

[NbConvertApp] Converting notebook /content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week Two - Statistics for Data Science/Mentored Session/Notebook - Medicon Dose Testing.ipynb to html
[NbConvertApp] WARNING | Alternative text is missing on 4 image(s).
[NbConvertApp] Writing 398138 bytes to /content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week Two - Statistics for Data Science/Mentored Session/Notebook - Medicon Dose Testing.html

	drug_serial_number	time_of_effect
0	672	5.8
1	895	17.3
2	518	16.7
3	448	13.1
4	402	13.6

Context¶

Problem Statement¶

Import the Libraries¶

Question 1 Solution¶

Let's estimate the probability distribution of doses¶

Question 2 Solution¶

Question 3 Solution¶

Data Dictionary:¶

Reading the Sample Data¶

Overview of the dataset¶

Plotting the Distribution¶