# Core Python libraries for numerical and DataFrame computations
import pandas as pd
import numpy as np

# Mount Google Drive into the runtime at /content/drive so that files stored
# on Drive can be accessed through the local filesystem.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime - confirm before running this cell elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive

# Load the Challenger observations from the CSV file in the working directory.
data = pd.read_csv('challenger-data.csv')

# Preview the first rows to check that the file parsed as expected.
data.head()

# Report the column names, dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   Observation  120 non-null    int64
 1   Y            120 non-null    int64
 2   X            120 non-null    int64
dtypes: int64(3)
memory usage: 2.9 KB

# Summary statistics (count, mean, std, min, quartiles, max) for each column.
data.describe()

# Visualize the failures against temperature using Matplotlib.
import matplotlib as mpl
from matplotlib import pyplot as plt

# Split the observations by outcome: Y == 1 rows are failures, Y == 0 are not.
failures = data[data['Y'] == 1]
no_failures = data[data['Y'] == 0]

# Count how many observations fall at each distinct X value in each subset.
failures_freq = failures['X'].value_counts()
no_failures_freq = no_failures['X'].value_counts()

# Failures are drawn at their per-temperature count (red); temperatures that
# have non-failure observations are drawn on the zero line (blue).
plt.scatter(failures_freq.index, failures_freq, c='red', s=40)
plt.scatter(
    no_failures_freq.index,
    np.zeros(len(no_failures_freq)),
    c='blue',
    s=40,
)
plt.xlabel('X: Temperature')
plt.ylabel('Number of Failures')
# Legend labels are assigned to the scatter series in the order they were drawn.
plt.legend(['failures', 'No failures'])
plt.show()

# The statsmodels package must be installed before running this cell; its
# formula API lets the model be specified as a formula string.
import statsmodels.formula.api as SM

# Specify a logistic regression of Y on X and fit it to the loaded data.
model = SM.logit(formula='Y~X', data=data)
result = model.fit()

# Print the summary table of the fitted model (coefficients, standard errors,
# p-values and confidence intervals).
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.242411
         Iterations 7
                           Logit Regression Results                           
==============================================================================
Dep. Variable:                      Y   No. Observations:                  120
Model:                          Logit   Df Residuals:                      118
Method:                           MLE   Df Model:                            1
Date:                Tue, 27 Jul 2021   Pseudo R-squ.:                  0.1549
Time:                        20:25:44   Log-Likelihood:                -29.089
converged:                       True   LL-Null:                       -34.420
Covariance Type:            nonrobust   LLR p-value:                  0.001094
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      7.4049      3.041      2.435      0.015       1.445      13.365
X             -0.1466      0.047     -3.104      0.002      -0.239      -0.054
==============================================================================

# Export the notebook at the given Drive path to HTML with nbconvert.
# The leading `!` is IPython/Colab syntax that runs the line as a shell
# command (it is not valid in a plain Python script); the path is quoted
# because it contains spaces.
!jupyter nbconvert --to html "/content/drive/My Drive/Colab Notebooks/Copy of FDS_Project_LearnerNotebook_FullCode.ipynb"

	Observation	Y	X
0	1	1	53
1	2	1	53
2	3	1	53
3	4	0	53
4	5	0	53

	Observation	Y	X
count	120.000000	120.000000	120.000000
mean	60.500000	0.083333	70.000000
std	34.785054	0.277544	7.100716
min	1.000000	0.000000	53.000000
25%	30.750000	0.000000	67.000000
50%	60.500000	0.000000	70.000000
75%	90.250000	0.000000	75.250000
max	120.000000	1.000000	81.000000

Case Study - Challenger Launch

Importing the necessary libraries

Loading the data

Visualizing the Data

Logistic Regression