Note: While the case studies so far have applied machine learning techniques to various problem contexts from the business domain, this case study aims to demonstrate their applicability to problem statements from the natural sciences as well.
Astronomy is one such discipline, with an abundance of vast datasets; machine learning algorithms are often a necessity there, because analyzing all of this data and deriving insights and conclusions in a more manual fashion would be prohibitively labor-intensive.
Problem Context¶
Classifying celestial objects observed through telescopes as stars, galaxies, or quasars is an important classification scheme in astronomy. Stars have been known to humanity since time immemorial, but the idea that whole galaxies of stars exist outside our own galaxy (the Milky Way) was first theorized by the philosopher Immanuel Kant in 1755 and conclusively observed by the American astronomer Edwin Hubble in 1925. Quasars are a more recent discovery, made possible in significant part by the emergence of radio astronomy in the 1950s.
Descriptions of these three celestial objects are provided below:
Star: A star is an astronomical object consisting of a luminous spheroid of plasma held together by the force of its own gravity. The nuclear fusion reactions taking place at a star's core are exoergic (there is a net release of energy) and are hence responsible for the light emitted by the star. The closest star to Earth is, of course, the Sun. The next nearest star is Proxima Centauri, which is around 4.25 light years away (a light year is the distance travelled by light in one year, around 9.46 trillion kilometers). Countless stars are visible to us in the night sky, but they are so far away that they appear as mere points of light here on Earth. There are an estimated 10^22 to 10^24 stars in the observable universe, but the only ones visible to the unaided eye are those in the Milky Way, our home galaxy.
Galaxy: Galaxies are gravitationally bound groupings or systems of stars that additionally contain other matter such as stellar remnants, interstellar gas, cosmic dust, and even dark matter. Galaxies may contain anywhere from the order of 10^8 to 10^14 stars, which orbit the galaxy's center of mass.
Quasar: Quasars, also called quasi-stellar objects (abbreviated QSO), are a kind of highly luminous "active galactic nucleus". Quasars emit an enormous amount of energy because they have supermassive black holes at their centers. (A black hole is an astronomical object whose gravitational pull is so strong that not even light can escape once it comes within a certain distance of it.) The gravitational pull of the black hole causes gas to spiral and fall into an "accretion disc" around it, emitting energy in the form of electromagnetic radiation in the process.
Quasars were understood to be different from other stars and galaxies because their spectral measurements (which indicate their chemical composition) and their changes in luminosity were strange and initially defied explanation based on conventional knowledge: they were observed to be far more luminous than galaxies, yet far more compact, indicating tremendous power density. Crucially, however, it was the extreme "redshift" observed in the spectral readings of quasars that stood out and gave rise to the realization that they were entities separate from other, less luminous stars and galaxies.
Note: In astronomy, redshift refers to an increase in the wavelength, and hence a decrease in the energy and frequency, of observed electromagnetic radiation such as light. Whatever causes the radiation to lose energy on its way to the observer shows up as a redshift in the observed signal. Redshift is a specific example of what is called the Doppler Effect in physics.
An everyday example of the Doppler Effect is the change in the wailing sound of an ambulance siren as it passes us: the siren appears to fall in pitch as the ambulance drives away, compared to the higher pitch we hear while it is approaching.
While redshift may occur for relativistic or gravitational reasons, the most significant reason for redshift of any sufficiently-far astronomical object is that the universe is expanding - this causes the radiation to travel a greater distance through the expanding space and hence lose energy.
For cosmological reasons, quasars are more common in the early universe, which corresponds to the part of the observable universe that is furthest away from us here on Earth. It is also known from astrophysics that not only is the universe expanding, but the expansion is accelerating (attributed to the existence of "dark energy"), and the further away an astronomical object is, the faster it appears to be receding from Earth (similar to points on an expanding balloon); this causes the redshift of far-away galaxies and quasars to be much higher than that of galaxies closer to Earth.
This high redshift is one of the defining traits of Quasars, as we will see from the insights in this case study.
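For reference, redshift is quantified as the fractional increase in wavelength, z = (λ_observed − λ_emitted) / λ_emitted. The tiny snippet below is purely illustrative (the wavelength values are made up for the example and are not part of the dataset):
# Illustrative only: computing redshift from a hypothetical emitted (rest-frame) and observed wavelength
lambda_emitted = 121.6    # hypothetical rest-frame wavelength of an emission line, in nanometers
lambda_observed = 364.8   # wavelength at which that same line is (hypothetically) observed
z = (lambda_observed - lambda_emitted) / lambda_emitted
print(f"Redshift z = {z:.2f}")   # prints z = 2.00, i.e. the wavelength has been stretched to three times its original value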
Problem Statement¶
The objective of the problem is to use the tabular features available for every astronomical object to predict whether the object is a star, a galaxy, or a quasar, through the use of supervised machine learning methods.
In this notebook, we will use simple non-linear methods such as k-Nearest Neighbors and Decision Trees to perform this classification.
Data Description¶
The source for this dataset is the Sloan Digital Sky Survey (SDSS), one of the most comprehensive public sources of astronomical datasets available on the web today. SDSS has been one of the most successful surveys in astronomy history, having created highly detailed three-dimensional maps of the universe and curated spectroscopic and photometric information on over three million astronomical objects in the night sky. SDSS uses a dedicated 2.5 m wide-angle optical telescope which is located at the Apache Point Observatory in New Mexico, USA.
The survey was named after the Alfred P. Sloan Foundation (established by Alfred P. Sloan, ex-president of General Motors), a major donor to this initiative and among others, the MIT Sloan School of Management.
The dataset consists of 250,000 celestial object observations taken by the SDSS. Each observation is described by 17 feature columns and 1 class column that identifies the observed object as a star, a galaxy, or a quasar.
- objid: Object Identifier, the unique value that identifies the object in the image catalog used by the CAS
- u: Ultraviolet filter in the photometric system
- ra: Right Ascension angle (at J2000 epoch)
- dec: Declination angle (at J2000 epoch)
- g: Green filter in the photometric system
- r: Red filter in the photometric system
- i: Near-Infrared filter in the photometric system
- z: Infrared filter in the photometric system
- run: Run Number used to identify the specific scan
- rerun: Rerun Number to specify how the image was processed
- camcol: Camera column to identify the scanline within the run
- field: Field number to identify each field
- specobjid: Unique ID used for optical spectroscopic objects (two different observations with the same specobjid must share the same output class)
- class: Object class (galaxy, star, or quasar)
- redshift: Redshift value based on the increase in wavelength
- plate: Plate ID, identifies each plate in SDSS
- mjd: Modified Julian Date, used to indicate when a given piece of SDSS data was taken
- fiberid: Fiber ID that identifies the fiber that pointed the light at the focal plane in each observation
Importing the libraries required¶
# Importing the basic libraries we will require for the project
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import csv, json
import os
# Importing the Machine Learning models we require from Scikit-Learn
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# Importing the other functions we may require from Scikit-Learn
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import recall_score, roc_curve, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn import metrics, model_selection
# Setting the random seed to 1 for reproducibility of results
import random
random.seed(1)
np.random.seed(1)
# Code to ignore warnings from function usage
import warnings
warnings.filterwarnings('ignore')
# Mounting Google Drive (only needed if you are running this notebook on Google Colab)
from google.colab import drive
drive.mount('/content/drive')
Loading the dataset¶
df_astro = pd.read_csv('/content/drive/MyDrive/Skyserver250k.csv')
df_astro.head()
objid | ra | dec | u | g | r | i | z | run | rerun | camcol | field | specobjid | class | redshift | plate | mjd | fiberid | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1237661976015274033 | 196.36207 | 7.66702 | 19.32757 | 19.20759 | 19.16249 | 19.07652 | 18.86196 | 3842 | 301 | 4 | 102 | 2020027785916999680 | QSO | 1.98442 | 1794 | 54504 | 594 |
1 | 1237661362373066810 | 206.61466 | 45.92428 | 18.95918 | 17.09173 | 16.25019 | 15.83413 | 15.55686 | 3699 | 301 | 5 | 121 | 1649585252231833600 | GALAXY | 0.06446 | 1465 | 53082 | 516 |
2 | 1237661360767238272 | 220.29473 | 40.89458 | 17.75587 | 16.54700 | 16.67694 | 16.77780 | 16.88097 | 3699 | 301 | 2 | 194 | 3812387877359296512 | STAR | -0.00051 | 3386 | 54952 | 330 |
3 | 1237665440983416884 | 206.31535 | 27.43815 | 19.29195 | 19.12720 | 19.03992 | 18.76714 | 18.73874 | 4649 | 301 | 2 | 152 | 6762291282364878848 | QSO | 1.88289 | 6006 | 56105 | 496 |
4 | 1237665531717812262 | 228.09265 | 20.80737 | 19.19731 | 18.26143 | 17.89954 | 17.76130 | 17.68726 | 4670 | 301 | 3 | 201 | 4454292673071960064 | STAR | -0.00030 | 3956 | 55656 | 846 |
df_astro.shape
(250000, 18)
- This dataset has 250,000 rows and 18 columns.
- The dataset is quite voluminous, and has a high rows-to-columns ratio. This is quite typical of astronomical datasets, due to the vast number of celestial objects in the universe that can be detected today by modern telescopes and observatories.
df_astro['class'].value_counts()
class GALAXY 127117 STAR 96116 QSO 26767 Name: count, dtype: int64
Data Overview¶
First 5 & Last 5 Rows of the Dataset¶
Let's view the first few rows and last few rows of the dataset in order to understand its structure a little better.
We will use the head() and tail() methods from Pandas to do this.
# Let's view the first 5 rows of the dataset
df_astro.head()
objid | ra | dec | u | g | r | i | z | run | rerun | camcol | field | specobjid | class | redshift | plate | mjd | fiberid | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1237661976015274033 | 196.36207 | 7.66702 | 19.32757 | 19.20759 | 19.16249 | 19.07652 | 18.86196 | 3842 | 301 | 4 | 102 | 2020027785916999680 | QSO | 1.98442 | 1794 | 54504 | 594 |
1 | 1237661362373066810 | 206.61466 | 45.92428 | 18.95918 | 17.09173 | 16.25019 | 15.83413 | 15.55686 | 3699 | 301 | 5 | 121 | 1649585252231833600 | GALAXY | 0.06446 | 1465 | 53082 | 516 |
2 | 1237661360767238272 | 220.29473 | 40.89458 | 17.75587 | 16.54700 | 16.67694 | 16.77780 | 16.88097 | 3699 | 301 | 2 | 194 | 3812387877359296512 | STAR | -0.00051 | 3386 | 54952 | 330 |
3 | 1237665440983416884 | 206.31535 | 27.43815 | 19.29195 | 19.12720 | 19.03992 | 18.76714 | 18.73874 | 4649 | 301 | 2 | 152 | 6762291282364878848 | QSO | 1.88289 | 6006 | 56105 | 496 |
4 | 1237665531717812262 | 228.09265 | 20.80737 | 19.19731 | 18.26143 | 17.89954 | 17.76130 | 17.68726 | 4670 | 301 | 3 | 201 | 4454292673071960064 | STAR | -0.00030 | 3956 | 55656 | 846 |
# Now let's view the last 5 rows of the dataset
df_astro.tail()
objid | ra | dec | u | g | r | i | z | run | rerun | camcol | field | specobjid | class | redshift | plate | mjd | fiberid | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
249995 | 1237661360767565997 | 221.10171 | 40.64105 | 19.30748 | 18.22145 | 17.61426 | 17.32240 | 17.02841 | 3699 | 301 | 2 | 199 | 1572955889466370048 | GALAXY | 0.15881 | 1397 | 53119 | 268 |
249996 | 1237667783903084693 | 171.64509 | 22.79755 | 19.19911 | 17.79553 | 17.03988 | 16.63705 | 16.31786 | 5194 | 301 | 6 | 381 | 2814852916416899072 | GALAXY | 0.03443 | 2500 | 54178 | 375 |
249997 | 1237648704591233226 | 215.75110 | 0.04449 | 18.88386 | 17.51738 | 16.89393 | 16.39914 | 16.07888 | 752 | 301 | 4 | 482 | 342430828837496832 | GALAXY | 0.07868 | 304 | 51609 | 572 |
249998 | 1237660634386923630 | 163.50134 | 44.47039 | 18.49867 | 16.73666 | 15.88036 | 15.48524 | 15.14631 | 3530 | 301 | 1 | 286 | 1614541342266910720 | GALAXY | 0.04852 | 1434 | 53053 | 3 |
249999 | 1237668271376236954 | 236.49564 | 12.14243 | 19.39894 | 18.40550 | 18.06466 | 17.93771 | 17.87524 | 5308 | 301 | 2 | 295 | 5517031135849500672 | STAR | 0.00086 | 4900 | 55739 | 442 |
Datatypes of the Features¶
Next, let's check the datatypes of the columns in the dataset.
We are interested to know how many numerical and how many categorical features this dataset possesses.
# Let's check the datatypes of the columns in the dataset
df_astro.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 250000 entries, 0 to 249999 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 objid 250000 non-null int64 1 ra 250000 non-null float64 2 dec 250000 non-null float64 3 u 250000 non-null float64 4 g 250000 non-null float64 5 r 250000 non-null float64 6 i 250000 non-null float64 7 z 250000 non-null float64 8 run 250000 non-null int64 9 rerun 250000 non-null int64 10 camcol 250000 non-null int64 11 field 250000 non-null int64 12 specobjid 250000 non-null uint64 13 class 250000 non-null object 14 redshift 250000 non-null float64 15 plate 250000 non-null int64 16 mjd 250000 non-null int64 17 fiberid 250000 non-null int64 dtypes: float64(8), int64(8), object(1), uint64(1) memory usage: 34.3+ MB
- As we can see above, apart from the class variable (the target variable), which is of the object datatype and is categorical in nature, all the other predictor variables are numerical, with int64 and float64 datatypes.
- So this is a classification problem whose original feature set is entirely numerical. Datasets of numerical measurement values like this are very common in astronomy and lend themselves well to machine learning, given computers' affinity for numerical calculation.
- The above table also confirms what we found earlier: there are 250,000 rows and 18 columns in the original dataset. Since every column has the same number (250,000) of non-null values, we can conclude that there is no missing data in the table (thanks to the high quality of the data source), and we can proceed without worrying about missing-value imputation techniques.
# Sampling 50,000 rows from the dataset to keep the subsequent analysis and model training computationally manageable
df_astro = df_astro.sample(n=50000)
Missing Values¶
# Checking for any missing values just in case
df_astro.isnull().sum()
objid 0 ra 0 dec 0 u 0 g 0 r 0 i 0 z 0 run 0 rerun 0 camcol 0 field 0 specobjid 0 class 0 redshift 0 plate 0 mjd 0 fiberid 0 dtype: int64
- It is hence confirmed that there are no missing values in this dataset.
Duplicate Rows¶
Let's also do a quick check to see if any of the rows in this dataset may be duplicates of each other,
even though we know that will not be the case given the source of this data.
# Let's also check for duplicate rows in the dataset
df_astro.duplicated().sum()
0
As seen above, there are no duplicate rows in the dataset either.
Class Distribution¶
Let's now look at the percentage class distribution of the target variable class
in this classification dataset.
### Percentage class distribution of the target variable "class"
df_astro['class'].value_counts(1)*100
class GALAXY 51.07400 STAR 38.36800 QSO 10.55800 Name: proportion, dtype: float64
- More than 50% of the rows in this dataset are Galaxies.
- Over 38% of the instances are Stars, and just over 10% of the rows belong to the QSO (Quasar) class.
- As mentioned in the problem context, although quasars are among the most luminous objects in the universe, they are comparatively rare for astronomers to observe, so it makes sense that they comprise the smallest percentage of the data points in the class variable.
- This can hence be considered a somewhat imbalanced classification problem, but given the size of the dataset, even the smallest class (QSO) still has over 5,000 examples in our 50,000-row sample (over 26,000 in the full dataset). Even after the train-test split, that should be enough training data for a machine learning algorithm to learn the patterns leading to that class.
# Encoding the categorical target variable 'class' into integers so that the models can work with it
le = LabelEncoder()
df_astro["class"] = le.fit_transform(df_astro["class"])
df_astro["class"] = df_astro["class"].astype(int)
df_astro['class']
240208 2 18744 0 207175 0 18669 0 189086 2 .. 12026 0 101461 2 146611 1 152140 0 168265 1 Name: class, Length: 50000, dtype: int64
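To avoid any ambiguity about which integer corresponds to which original label, we can inspect the fitted encoder (a small sketch using the le object from above; LabelEncoder assigns integers in alphabetical order of the class labels):
# Checking the label encoding: le.classes_ lists the original labels in the order of their assigned integers
print(dict(zip(le.classes_, range(len(le.classes_)))))   # expected: {'GALAXY': 0, 'QSO': 1, 'STAR': 2}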
Statistical Summary¶
Since the predictor variables in this machine learning problem are all numerical, a statistical summary is definitely required so that we can understand some of the statistical properties of the features of our dataset.
# We would like the format of the values in the table to be simple float numbers with 5 decimal places, hence the code below
pd.set_option('display.float_format', lambda x: '%.5f' % x)
# Let's view the statistical summary of the columns in the dataset
df_astro.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
objid | 50000.00000 | 1237662592402745856.00000 | 7207093862089.51660 | 1237645942905438464.00000 | 1237657629514236160.00000 | 1237662268074393856.00000 | 1237667211599904768.00000 | 1237680530812895744.00000 |
ra | 50000.00000 | 178.38919 | 77.87886 | 0.01518 | 138.12116 | 181.08311 | 224.55207 | 359.99357 |
dec | 50000.00000 | 24.46484 | 20.08817 | -19.50182 | 6.84518 | 23.12350 | 39.70158 | 84.79483 |
u | 50000.00000 | 18.63623 | 0.82798 | 11.41754 | 18.20821 | 18.86979 | 19.26691 | 19.59998 |
g | 50000.00000 | 17.40655 | 0.98268 | 9.66834 | 16.84567 | 17.51230 | 18.05566 | 19.99148 |
r | 50000.00000 | 16.88019 | 1.12646 | 9.05049 | 16.19581 | 16.88970 | 17.58307 | 31.41264 |
i | 50000.00000 | 16.62614 | 1.20586 | 8.80997 | 15.86300 | 16.59721 | 17.34320 | 29.09998 |
z | 50000.00000 | 16.46675 | 1.27357 | 9.22884 | 15.62477 | 16.42961 | 17.23214 | 28.75626 |
run | 50000.00000 | 3985.58622 | 1678.04277 | 109.00000 | 2830.00000 | 3910.00000 | 5061.00000 | 8162.00000 |
rerun | 50000.00000 | 301.00000 | 0.00000 | 301.00000 | 301.00000 | 301.00000 | 301.00000 | 301.00000 |
camcol | 50000.00000 | 3.41080 | 1.60723 | 1.00000 | 2.00000 | 3.00000 | 5.00000 | 6.00000 |
field | 50000.00000 | 188.63092 | 142.56631 | 11.00000 | 84.00000 | 154.00000 | 251.00000 | 986.00000 |
specobjid | 50000.00000 | 2932821382617439744.00000 | 2500435815384516608.00000 | 299493525735630848.00000 | 1337623353455831040.00000 | 2356649444004358144.00000 | 3277520682476398592.00000 | 13177760526313476096.00000 |
class | 50000.00000 | 0.87294 | 0.93717 | 0.00000 | 0.00000 | 0.00000 | 2.00000 | 2.00000 |
redshift | 50000.00000 | 0.16859 | 0.43125 | -0.00414 | 0.00001 | 0.04558 | 0.09553 | 6.40423 |
plate | 50000.00000 | 2604.78306 | 2220.81942 | 266.00000 | 1188.00000 | 2093.00000 | 2911.00000 | 11704.00000 |
mjd | 50000.00000 | 53927.35574 | 1551.03867 | 51608.00000 | 52733.00000 | 53732.00000 | 54589.00000 | 58543.00000 |
fiberid | 50000.00000 | 350.38148 | 215.46894 | 1.00000 | 172.00000 | 342.00000 | 508.00000 | 1000.00000 |
Observations:
- The maximum value of redshift is about 6.40, the minimum is about -0.004, and the mean is only about 0.17, indicating a strongly right-skewed distribution.
- The mean of alpha (ra) is about 178.4 with a standard deviation of about 77.9, whereas the delta (dec) variable has a mean of about 24.5 and a standard deviation of about 20.1.
- The statistical summaries of the r, i and z variables are broadly similar, and their ranges of values are comparable.
- The dec and redshift features contain negative values.
Unique Values in each Column¶
Let's make sure we also understand the number of unique values in each feature of the dataset.
# Number of unique values in each column
df_astro.nunique()
objid 50000 ra 50000 dec 50000 u 44423 g 46340 r 46771 i 47078 z 47286 run 527 rerun 1 camcol 6 field 817 specobjid 50000 class 3 redshift 49713 plate 5726 mjd 2134 fiberid 996 dtype: int64
- The objid and specobjid columns are clearly unique IDs, which is why they each have as many unique values as there are rows in the dataset.
Data Preprocessing - Removal of ID columns¶
Since the objid and specobjid columns are unique IDs, they will not add any predictive power to the machine learning model, and they can hence be removed.
# Removing the objid and specobjid columns from the dataset
df_astro.drop(columns=['objid', 'specobjid'], inplace=True)
Exploratory Data Analysis¶
Univariate Analysis¶
We will first define a hist_box() function that provides both a boxplot and a histogram in the same visual, with which we can perform univariate analysis on the columns of this dataset.
# Defining the hist_box() function
def hist_box(col):
f, (ax_box, ax_hist) = plt.subplots(2, sharex=True, gridspec_kw={'height_ratios': (0.15, 0.85)}, figsize=(15,10))
sns.set(style='darkgrid')
# Adding a graph in each part
sns.boxplot(x=df_astro[col], ax=ax_box, showmeans=True, color='skyblue') # Boxplot color
sns.histplot(df_astro[col], kde=True, ax=ax_hist, stat='density', color='skyblue', fill=True) # Histogram with KDE and density on y-axis
sns.kdeplot(df_astro[col], ax=ax_hist, color='blue', linestyle='-') # KDE plot color and style
ax_hist.axvline(df_astro[col].mean(), color='green', linestyle='--') # Mean line color
ax_hist.axvline(df_astro[col].median(), color='orange', linestyle='-') # Median line color
plt.show()
hist_box('redshift')
hist_box('ra')
hist_box('dec')
hist_box('u')
hist_box('g')
hist_box('i')
hist_box('r')
hist_box('z')
hist_box('run')
hist_box('rerun')
hist_box('camcol')
hist_box('field')
hist_box('plate')
hist_box('mjd')
hist_box('fiberid')
Observations:
- The distribution plots show that the plate, field, dec and redshift variables are right-skewed. It is evident from the boxplots that all these variables have outliers.
- The camcol, rerun, run, fiberid and delta variables do not show outliers in their boxplots.
- rerun is the only variable with a single unique value.
- The variables z, i and r have similar distributions.
- The variables g and u have slightly left-skewed distributions.
Bivariate Analysis¶
Categorical and Continuous variables¶
sns.boxplot(x=df_astro['class'], y=df_astro['redshift'], hue=df_astro['class'], palette="PuBu", legend=False)
<Axes: xlabel='class', ylabel='redshift'>
# class vs [u,g,r,i,z]
cols = df_astro[['u','g','r','i','z']].columns.tolist()
plt.figure(figsize=(10,10))
for i, variable in enumerate(cols):
plt.subplot(3,2,i+1)
sns.boxplot(x=df_astro["class"], y=df_astro[variable], hue=df_astro["class"], palette="PuBu", legend=False)
plt.tight_layout()
plt.title(variable)
plt.show()
cols = df_astro[['run', 'rerun', 'camcol', 'field']].columns.tolist()
plt.figure(figsize=(10,10))
for i, variable in enumerate(cols):
plt.subplot(3,2,i+1)
sns.boxplot(x=df_astro["class"], y=df_astro[variable], hue=df_astro["class"], palette="PuBu", legend=False)
plt.tight_layout()
plt.title(variable)
plt.show()
cols = df_astro[['plate', 'mjd', 'fiberid']].columns.tolist()
plt.figure(figsize=(10,10))
for i, variable in enumerate(cols):
plt.subplot(3,2,i+1)
sns.boxplot(x=df_astro["class"], y=df_astro[variable], hue=df_astro["class"], palette="PuBu", legend=False)
plt.tight_layout()
plt.title(variable)
plt.show()
kdeplot¶
The kdeplot shows the estimated probability density of a numerical variable.
# Plots the kde (density) of a column: one curve per class plus one for all rows combined
def plot(column):
    for i in range(3):
        sns.kdeplot(data=df_astro[df_astro["class"] == i][column], label=le.inverse_transform([i]))
    sns.kdeplot(data=df_astro[column], label=["All"])
    plt.legend();
# Same as plot(), but on the log of the column, which helps when the values span several orders of magnitude
def log_plot(column):
    for i in range(3):
        sns.kdeplot(data=np.log(df_astro[df_astro["class"] == i][column]), label=le.inverse_transform([i]))
    sns.kdeplot(data=np.log(df_astro[column]), label=["All"])
    plt.legend();
rerun¶
Rerun Number to specify how the image was processed
df_astro["rerun"].nunique()
1
A column with only one unique value does not help in training a predictive model, so we can drop this column.
df_astro = df_astro.drop("rerun",axis=1)
alpha¶
Right Ascension angle (at J2000 epoch)
plot("ra")
There is not much difference in the distribution according to class, but we can see some characteristics that help distinguish the STAR class here.
delta¶
Declination angle (at J2000 epoch)
plot("dec")
Although there is no significant difference in distribution according to class, we can see some characteristics that help distinguish the QSO class.
r¶
The red filter in the photometric system
plot("r")
It can be seen that the distribution of the QSO class for this variable is characterized by a different pattern from the other categories.
i¶
Near Infrared filter in the photometric system
plot("i")
We can see that the distribution of the QSO class shows a somewhat distinct pattern for this variable as well.
run¶
Run Number used to identify the specific scan
plot("run")
There is no significant difference in distribution by class, so we can drop this column.
df_astro = df_astro.drop("run",axis=1)
field¶
Field number to identify each field
plot("field")
There is no significant difference in distribution by class, so we can drop this column.
df_astro = df_astro.drop("field",axis=1)
redshift¶
redshift value based on the increase in wavelength
plot("redshift")
It is difficult to read the graph due to extreme values, so let's apply a log transformation (note that non-positive redshift values have no defined logarithm and are effectively excluded from this view).
log_plot("redshift")
We can now see that the redshift distributions of the three classes are clearly distinguishable from one another.
plate¶
plate ID, identifies each plate in SDSS
plot("plate")
We can see that the distribution of plate differs noticeably between the classes, so this column may carry useful signal.
mjd¶
Modified Julian Date, used to indicate when a given piece of SDSS data was taken
plot("mjd")
We can see that the distribution of mjd differs noticeably between the classes, so this column may carry useful signal.
fiberid¶
fiber ID that identifies the fiber that pointed the light at the focal plane in each observation
plot("fiberid")
There is no significant difference in distribution by class. We can drop this column.
df_astro = df_astro.drop("fiberid",axis=1)
camcol¶
Camera column to identify the scanline within the run
sns.countplot(x=df_astro["camcol"]);
We can see that camcol is distributed fairly evenly.
sns.countplot(x=df_astro["camcol"],hue=df_astro["class"]);
It seems difficult to distinguish the classes based on camcol, so we can drop this column.
df_astro = df_astro.drop("camcol",axis=1)
class¶
object class (galaxy, star or quasar object)
sns.countplot(x=df_astro["class"]);
We can see that the distribution of the class variable is imbalanced.
Multivariate Analysis¶
Pairwise Correlation
plt.figure(figsize=(20,10))
sns.heatmap(df_astro.corr(numeric_only=True),annot=True,fmt=".2f")
plt.show()
Observations:
- We can see a high positive correlation among the following variable pairs:
- z and g
- mjd and plate
- z and r
- z and i
- i and g
- i and r
- r and g
- In other words, the photometric filter magnitudes g, r, i and z are all strongly correlated with one another, and mjd is strongly correlated with plate.
- There is a negative correlation between redshift and the encoded class variable; redshift is clearly going to be a highly influential feature in determining the class of the celestial object (see the quick check below).
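To make the link between redshift and the object class more concrete, here is a quick supplementary check (a sketch, not part of the original notebook flow) summarizing redshift for each encoded class:
# Sketch: redshift summary statistics per encoded class (0 = GALAXY, 1 = QSO, 2 = STAR)
print(df_astro.groupby('class')['redshift'].describe())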
Data Preparation¶
# Separating the dependent and independent columns in the dataset
X = df_astro.drop(['class'], axis=1);
Y = df_astro[['class']];
df_astro[['class']]
class | |
---|---|
240208 | 2 |
18744 | 0 |
207175 | 0 |
18669 | 0 |
189086 | 2 |
... | ... |
12026 | 0 |
101461 | 2 |
146611 | 1 |
152140 | 0 |
168265 | 1 |
50000 rows × 1 columns
# Splitting the dataset into the Training and Testing set
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=42, stratify=Y);
# Checking the shape of the Train and Test sets
print('X Train Shape:', X_train.shape);
print('X Test Shape:', X_test.shape);
print('Y Train Shape:', y_train.shape);
print('Y Test Shape:', y_test.shape);
X Train Shape: (45000, 10) X Test Shape: (5000, 10) Y Train Shape: (45000, 1) Y Test Shape: (5000, 1)
Model Building¶
# Helper function: prints the classification report and plots the confusion matrix with readable class names
def metrics_score(actual, predicted):
    print(classification_report(actual, predicted))
    cm = confusion_matrix(actual, predicted)
    plt.figure(figsize=(8, 5))
    sns.heatmap(cm, annot=True, fmt='.2f', xticklabels=['Galaxy', 'Quasar', 'Star'], yticklabels=['Galaxy', 'Quasar', 'Star'])
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()
The k-Nearest Neighbors Model¶
Before Scaling and PCA¶
from sklearn.neighbors import KNeighborsClassifier
knn_model= KNeighborsClassifier()
knn_model.fit(X_train,y_train)
knn_train_predictions = knn_model.predict(X_train)
metrics_score(y_train,knn_train_predictions)
precision recall f1-score support 0 0.84 0.97 0.90 22983 1 0.78 0.51 0.62 4751 2 0.91 0.81 0.85 17266 accuracy 0.86 45000 macro avg 0.84 0.76 0.79 45000 weighted avg 0.86 0.86 0.85 45000
y_test_pred_knn = knn_model.predict(X_test);
metrics_score(y_test, y_test_pred_knn)
precision recall f1-score support 0 0.81 0.95 0.87 2554 1 0.59 0.35 0.44 528 2 0.87 0.77 0.81 1918 accuracy 0.81 5000 macro avg 0.75 0.69 0.71 5000 weighted avg 0.81 0.81 0.80 5000
After Scaling and PCA¶
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Fit the scaler on the training data only, and apply the same transformation to the test data
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)
# Fit PCA on the scaled training data only, and apply the same projection to the test data
pca = PCA()
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)
knn_model= KNeighborsClassifier()
knn_model.fit(X_train_pca,y_train)
knn_train_predictions = knn_model.predict(X_train_pca)
metrics_score(y_train,knn_train_predictions)
precision recall f1-score support 0 0.95 0.97 0.96 22983 1 0.99 0.93 0.96 4751 2 0.96 0.94 0.95 17266 accuracy 0.95 45000 macro avg 0.96 0.95 0.95 45000 weighted avg 0.95 0.95 0.95 45000
y_test_pred_knn = knn_model.predict(X_test_pca);
metrics_score(y_test, y_test_pred_knn)
precision recall f1-score support 0 0.87 0.86 0.86 2554 1 0.99 0.89 0.94 528 2 0.81 0.85 0.83 1918 accuracy 0.86 5000 macro avg 0.89 0.86 0.88 5000 weighted avg 0.86 0.86 0.86 5000
Observations:
While the performance of the k-Nearest Neighbors algorithm on the test dataset was quite good, it doesn't achieve the 90%+ accuracies and F1-scores we expect from a high-performing Machine Learning model on this dataset, and there is clear scope for improvement there.
In addition, k-Nearest Neighbors also does not computationally scale well with a large amount of data, and can be infeasible to run on big datasets. For these reasons, we need a more efficient and elegant algorithm that is capable of non-linear classification, and we can turn to Decision Trees for that.
Tree-Based Models¶
The Decision Tree Classifier¶
Before Scaling¶
dt = DecisionTreeClassifier(random_state=1);
dt.fit(X_train, y_train)
y_train_pred_dt = dt.predict(X_train)
metrics_score(y_train, y_train_pred_dt)
precision recall f1-score support 0 1.00 1.00 1.00 22983 1 1.00 1.00 1.00 4751 2 1.00 1.00 1.00 17266 accuracy 1.00 45000 macro avg 1.00 1.00 1.00 45000 weighted avg 1.00 1.00 1.00 45000
y_test_pred_dt = dt.predict(X_test);
metrics_score(y_test, y_test_pred_dt)
precision recall f1-score support 0 0.99 0.99 0.99 2554 1 0.94 0.95 0.95 528 2 1.00 1.00 1.00 1918 accuracy 0.99 5000 macro avg 0.98 0.98 0.98 5000 weighted avg 0.99 0.99 0.99 5000
Model Evaluation using K-Fold Cross Validation¶
from sklearn.model_selection import cross_val_score
scores = cross_val_score(dt, X_test, y_test, cv=5)
print(f"The average score of the model with K-5 Cross validation is {np.average(scores)} ")
The average score of the model with K-5 Cross validation is 0.9836
After Scaling¶
dt = DecisionTreeClassifier(random_state=1)
# Scale using statistics learned from the training data only
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)
dt.fit(X_train_std, y_train)
y_train_pred_dt = dt.predict(X_train_std)
metrics_score(y_train, y_train_pred_dt)
precision recall f1-score support 0 1.00 1.00 1.00 22983 1 1.00 1.00 1.00 4751 2 1.00 1.00 1.00 17266 accuracy 1.00 45000 macro avg 1.00 1.00 1.00 45000 weighted avg 1.00 1.00 1.00 45000
y_test_pred_dt = dt.predict(X_test_std);
metrics_score(y_test, y_test_pred_dt)
precision recall f1-score support 0 0.89 0.98 0.93 2554 1 0.96 0.93 0.94 528 2 0.99 0.85 0.92 1918 accuracy 0.93 5000 macro avg 0.94 0.92 0.93 5000 weighted avg 0.93 0.93 0.93 5000
As expected, scaling does not make much of a difference to the performance of the Decision Tree model, since it is not a distance-based algorithm; it separates instances with axis-aligned (orthogonal) splits in feature space.
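As a small illustrative check (a self-contained sketch on synthetic data, not part of the SDSS pipeline), rescaling a feature only rescales the learned split thresholds, leaving the tree's predictions unchanged:
# Sketch: a decision tree's splits adapt to any rescaling of a feature, so its predictions do not change
import numpy as np
from sklearn.tree import DecisionTreeClassifier
rng = np.random.RandomState(0)
X_demo = rng.rand(200, 1) * 100            # one synthetic feature on an arbitrary scale
y_demo = (X_demo[:, 0] > 40).astype(int)   # labels determined by a simple threshold
tree_raw = DecisionTreeClassifier(random_state=0).fit(X_demo, y_demo)
tree_scaled = DecisionTreeClassifier(random_state=0).fit(X_demo / 100.0, y_demo)  # same feature, rescaled
# Both trees make identical predictions on correspondingly scaled inputs
print((tree_raw.predict(X_demo) == tree_scaled.predict(X_demo / 100.0)).all())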
Model Evaluation using K-Fold Cross Validation¶
from sklearn.model_selection import cross_val_score
scores = cross_val_score(dt, X_test_std, y_test, cv=5)
print(f"The average score of the model with K-5 Cross validation is {np.average(scores)} ")
The average score of the model with K-5 Cross validation is 0.9836
features = list(X.columns)
plt.figure(figsize=(30,20))
tree.plot_tree(dt, max_depth=3, feature_names=features, filled=True, fontsize=12, node_ids=True, class_names=None);
plt.show()
Observation:
- The first split in the decision tree is on the redshift feature, which implies that it is clearly the most important factor in deciding the class of the celestial object (this is confirmed programmatically in the sketch after these observations).
- While this may be common knowledge in astronomy, the fact that our model is able to provide this domain-specific knowledge purely from the data, without us having any background in the field, is a hint towards the value of Machine Learning and Data Science.
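We can verify the root split programmatically with a quick sketch, using the fitted dt and the features list defined above (tree_.feature and tree_.threshold are standard attributes of a fitted scikit-learn tree):
# Quick check: the feature and threshold used at the root node of the fitted decision tree
root_feature = features[dt.tree_.feature[0]]   # node index 0 is the root node
root_threshold = dt.tree_.threshold[0]
print(f"Root split: {root_feature} <= {root_threshold:.5f}")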
Feature importance¶
Let's look at the feature importance of the decision tree model
# Plotting the feature importance
importances = dt.feature_importances_
columns = X.columns;
importance_df_astro = pd.DataFrame(importances, index=columns, columns=['Importance']).sort_values(by='Importance', ascending=False);
plt.figure(figsize=(13,13));
sns.barplot(x=importance_df_astro.Importance,y= importance_df_astro.index)
plt.show()
print(importance_df_astro)
Importance redshift 0.96467 u 0.00958 g 0.00894 z 0.00403 plate 0.00258 ra 0.00254 i 0.00247 dec 0.00234 r 0.00183 mjd 0.00103
- Redshift is by far the most important variable, with an importance of about 0.96, far outweighing all the other features.
Conclusions and Recommendations¶
Algorithmic Insights¶
- It is apparent from the work above that Decision Trees have clear advantages for non-linear modeling and classification when learning a mapping from inputs to outputs.
- Decision Trees are simple to understand and explain, and they mirror the human pattern of if-then-else decision making. They are also more computationally efficient than kNN.
- These advantages enable them to outperform the k-Nearest Neighbors algorithm, which is itself a popular non-linear modeling technique.
Dataset Insights¶
- From a dataset perspective, the fact that the redshift variable is clearly the most important feature in determining the class of a celestial object, makes it tailor-made for a Decision Tree's hierarchical nature of decision-making. As we see in the case study, the Decision Tree prioritizes that feature as the root node of decision splits before moving on to other features.
- Another potential reason for the improved performance of the Decision Tree on this dataset may have to do with the nature of the observations. In astronomical observations such as these, the value ranges of the features of naturally occurring objects such as stars, galaxies, and quasars should, for the most part, lie within certain limits outside of a few exceptions. Those exceptions would be difficult to detect purely through the values of the neighbors of that datapoint in vector space, and would rather need to be detected through fine orthogonal decision boundaries. This nuanced point could be the reason why Decision Trees perform relatively better on this dataset.
- Although there are more advanced ML techniques that use an ensemble of Decision Trees, such as Random Forests and Boosting methods, they are computationally more expensive, and the 90%+ performance of Decision Trees means they would be our first recommendation to an astronomy team looking to use this Machine Learning model purely as a second opinion to make quick decisions on Celestial Object Detection.
# Convert notebook to html
!jupyter nbconvert --to html "/content/drive/My Drive/Colab Notebooks/Copy of FDS_Project_LearnerNotebook_FullCode.ipynb"