Genomic Data Clustering¶
Background¶
The discovery of DNA (Deoxyribonucleic Acid), and the critical role it plays in information storage for all biological beings, was a seminal moment for the biological sciences. All the information that is needed for the functioning of a living cell is encoded in and ultimately derived from the DNA of that cell, and this holds true for all biological organisms on the planet.
DNA can be represented as a text sequence, with an alphabet that has only four letters - A (Adenine), C (Cytosine), G (Guanine) and T (Thymine). The diversity of living organisms and their complex properties is hidden in their genomic sequences. One of the most exciting problems in modern science is to understand the organization of living matter by reading genomic sequences.
One distinctive type of message in a genomic sequence is a piece of text called a gene. Genes can be oriented in the sequence in either the forward or backward direction. In higher organisms (humans, for example), the notion of a gene is more complex.
It was one of the great discoveries of the twentieth century that biological information is encoded in genes by means of triplets of letters, called codons in the biological literature.
In this exercise, we will see that it is possible to verify the validity of the discovery of three-letter codons, simply by performing unsupervised machine learning on the genetic sequence.
Problem Statement¶
The work starts with a fragment of the genomic sequence of the bacterium Caulobacter crescentus. This sequence is given as a long text file (about 300 kb), and the first task is to look at the file and confirm that the text uses only the four-letter alphabet (A, C, G and T) and that the letters appear without spaces. Although the text seems random, it is in fact well organized, but we cannot understand it without special tools; statistical methods may help us do so.
In this case study we take data from a genome with the goal of distinguishing information-carrying genes from noise. Unfortunately, we don't know in advance which sequences are informative, so we have to use Unsupervised Learning to infer this.
In this notebook we walk through the following series of steps:
- First, the data is imported and prepared. The sequence, a single string, is split into non-overlapping substrings of length 300, and we then count the occurrences of every distinct word of length 1, 2, 3, and 4 within each substring.
- PCA is performed to try to identify the internal structure of the data.
- Finally, if PCA reveals some internal structure then we'll apply Clustering techniques to the dataset.
Mounting the Drive for Google Colab¶
Let us start by mounting the drive and importing the necessary libraries.
# Connect to google
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
Importing the Required Libraries¶
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
Data Preparation¶
The file format often used for bioinformatics and genomic data is called FASTA. It is a plain-text format in which header lines separate the individual sequences. We read the file, strip it of unwanted characters, and store the sequence as a single string.
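For illustration, a FASTA file starts with a header line beginning with '>' followed by lines of raw sequence. The snippet below is a made-up example of the layout, not the actual contents of ccrescentus.fa:

>example_sequence optional description
acgtgcaaccgttagcgatt
gcaatcgtacgtacgtacca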
# we open the file and get a list of its lines
with open("ccrescentus.fa", "r") as inputFile:
    data = inputFile.readlines()
# we concatenate every line from the second onwards (the first line is a description header), stripped of whitespace
geneticCode = ''
for line in data[1:]:
    geneticCode += line.strip()
# we count the occurrences of each nucleotide (a, g, t, c)
aCount = geneticCode.count('a')
gCount = geneticCode.count('g')
tCount = geneticCode.count('t')
cCount = geneticCode.count('c')
# for testing we print a sample of the string and check if there are only wanted characters
print(geneticCode[0:30])
print('Test: only a,g,t,c ?')
print(aCount + gCount + tCount + cCount == len(geneticCode))
gccgatagcctatgatccccggcaggcccg
Test: only a,g,t,c ?
True
Converting Text to a Numerical Table¶
Creating Frequency Tables¶
Now we have a single string containing the sequence. A word is any contiguous piece of text made up of several consecutive letters. As there are no spaces in the text, the separation into words is not unique. The next step is to split the string into smaller sub-strings and create a frequency table for each word length. We start by splitting the string into sub-strings of a reasonable sample size, which will become the rows of our table.
What is a "reasonable size" for a data point?
Knowing that our string has roughly 300,000 characters, and that we are looking at words of at most 4–5 letters, we take a fragment size of 300. This yields about 1,000 data points and allows for 60 non-overlapping 5-letter words per fragment. It seems reasonable, but do feel free to change it and see the impact. We clip the whole text into fragments 300 letters in length and calculate the frequencies of short words (of length 1–4) inside every fragment. This gives a description of the text in the form of a numerical table. There will be four such tables, one for each short word length from 1 to 4.
As there are only four letters, there are four possible words of length 1 (singlets), 16 = 4^2 possible words of length 2 (duplets), 64 = 4^3 possible words of length 3 (triplets) and 256 = 4^4 possible words of length 4 (quadruplets). The first table contains four columns (frequency of every singlet) and the number of rows equals the number of fragments. The second table has 16 columns and the same number of rows, and so on.
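As a quick arithmetic check (a minimal sketch), the number of distinct words of length k over a four-letter alphabet is 4^k:

# number of possible words of each length over the 4-letter alphabet
for k in range(1, 5):
    print(k, 'letters:', 4 ** k)   # 4, 16, 64, 256 possible words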
# size of the sub-strings (data points)
size = 300
dataPoints = []
# we copy the entire code into a temporary string, whose leading characters will be removed iteratively
tempString = geneticCode
# we iteratively cut a chunk off the left of the string and append it to our list
while len(tempString) > 0:
    dataPoints.append(tempString[0:size])
    tempString = tempString[size:]
print(dataPoints[0])
gccgatagcctatgatccccggcaggcccggggcttggagccgtctggtttggatggaaccctccaaaccagatcaagaggctcctagaacgccgcccgcagggtcacgccccaggtgcgcgggtcgcccggctggccggcgatcaggccggtgttgctgggacccacggccagttgctcgaaatagttctcgtcgaaggcgttgcggacccaggcatagaggttcagcccctcaggcgtgcggaagccggcccggaagttagcgatcgtgtagccgtcaacccaggtgtagatcgaggg
Now let's create a list of all possible words of each length.
import itertools

iterables = ['a','g','t','c']
wordsDict = {}
# for words of length 1 to 4, we take the cartesian product of the letters to get all possibilities
for i in range(1,5):
    words = []
    for word in itertools.product(iterables, repeat = i):
        # each product is a tuple of letters; join it into a single word string
        words.append(''.join(word))
    wordsDict[i] = words
# print the dictionary for 3 letter words
print(wordsDict[3])
['aaa', 'aag', 'aat', 'aac', 'aga', 'agg', 'agt', 'agc', 'ata', 'atg', 'att', 'atc', 'aca', 'acg', 'act', 'acc', 'gaa', 'gag', 'gat', 'gac', 'gga', 'ggg', 'ggt', 'ggc', 'gta', 'gtg', 'gtt', 'gtc', 'gca', 'gcg', 'gct', 'gcc', 'taa', 'tag', 'tat', 'tac', 'tga', 'tgg', 'tgt', 'tgc', 'tta', 'ttg', 'ttt', 'ttc', 'tca', 'tcg', 'tct', 'tcc', 'caa', 'cag', 'cat', 'cac', 'cga', 'cgg', 'cgt', 'cgc', 'cta', 'ctg', 'ctt', 'ctc', 'cca', 'ccg', 'cct', 'ccc']
Now, we create a dataframe to begin the frequency matrix construction. For each data point (row), we walk through it in non-overlapping chunks of length i and increment the count in the corresponding word column. We do this for all 4 word sizes.
# dictionary that will contain the frequency table for each word size
freqTables = {}
for i in range(1,5):
    # create an empty dataFrame whose columns are the words in the dictionary
    df = pd.DataFrame(columns = wordsDict[i])
    for index, dataP in enumerate(dataPoints):
        # we create a row of zeros corresponding to a data point
        df.loc[index] = np.zeros(len(wordsDict[i]))
        while len(dataP) > 0:
            # take the leftmost chunk of the data point (i characters)
            left = dataP[0:i]
            # count it in the matching column, skipping any incomplete final chunk
            if len(left) == i:
                df.loc[index, left] += 1
            dataP = dataP[i:]
    freqTables[i] = df
freqTables[3].head()
aaa | aag | aat | aac | aga | agg | agt | agc | ata | atg | ... | cgt | cgc | cta | ctg | ctt | ctc | cca | ccg | cct | ccc | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 1.0 | 0.0 | 3.0 | 0.0 | 2.0 | 0.0 | 2.0 | 2.0 | 0.0 | ... | 2.0 | 2.0 | 2.0 | 1.0 | 1.0 | 4.0 | 4.0 | 4.0 | 1.0 | 1.0 |
1 | 1.0 | 0.0 | 0.0 | 3.0 | 0.0 | 1.0 | 0.0 | 0.0 | 5.0 | 0.0 | ... | 2.0 | 4.0 | 0.0 | 2.0 | 4.0 | 2.0 | 1.0 | 1.0 | 0.0 | 1.0 |
2 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | ... | 2.0 | 4.0 | 0.0 | 4.0 | 5.0 | 4.0 | 1.0 | 0.0 | 0.0 | 0.0 |
3 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | ... | 0.0 | 4.0 | 0.0 | 5.0 | 5.0 | 2.0 | 2.0 | 0.0 | 0.0 | 0.0 |
4 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | ... | 2.0 | 2.0 | 0.0 | 8.0 | 3.0 | 1.0 | 5.0 | 1.0 | 0.0 | 0.0 |
5 rows × 64 columns
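As an aside, filling the DataFrame cell by cell with .loc is slow for the larger word sizes. A minimal alternative sketch using collections.Counter (assuming the dataPoints list and wordsDict from above) builds the same tables more quickly, with counts as integers rather than floats:

from collections import Counter

def frequencyTable(fragments, words, k):
    # count the non-overlapping, complete words of length k in every fragment
    rows = []
    for frag in fragments:
        counts = Counter(frag[j:j + k] for j in range(0, len(frag) - k + 1, k))
        rows.append([counts.get(w, 0) for w in words])
    return pd.DataFrame(rows, columns=words)

# e.g. frequencyTable(dataPoints, wordsDict[3], 3) should match freqTables[3]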
Now, we have our data in the optimal format to run PCA.
Principal Component Analysis¶
- One reason we try multiple word lengths is that, without additional domain knowledge, it is not clear whether there are more meaningful units to work with than individual letters.
- So we calculate frequency tables to see whether certain combinations occur more frequently (and more frequently together) than others, and thus might be meaningful.
- PCA helps by revealing natural groupings of fragments in which combinations frequently co-occur, and by reducing the dimensions it also enables us to visualize our dataset in a way that is otherwise not possible.
- The first step to run PCA is to standardize the data across columns.
- We do this so that the columns have the same averages and dispersions, which is important for both PCA and K-means, as they are distance-based algorithms.
- The StandardScaler package is used to achieve this.
normFreqTables = {}
for i in range(1,5):
    # standardize each frequency column to zero mean and unit variance
    data = freqTables[i]
    data = StandardScaler(copy=True, with_mean=True, with_std=True).fit_transform(data)
    normFreqTables[i] = pd.DataFrame(data, columns = wordsDict[i])
# for testing, we check that the mean of a column is close to zero and the standard deviation close to 1 (i.e. the standardization worked)
print(normFreqTables[2].loc[:, 'gt'].mean())
print(normFreqTables[2].loc[:, 'gt'].std())
-7.352361918240343e-17 1.0004920049498138
pca = PCA(n_components = 2)
pCompTables = {}
for i in range(1,5):
    pca.fit(normFreqTables[i])
    pComponents = pca.transform(normFreqTables[i])
    # for each word size, we store the result of the PCA in a table containing only the 2 principal components
    pCompTables[i] = pd.DataFrame(pComponents[:, [0,1]], columns = ['pc1', 'pc2'])
    print('Explained variance for ' + str(i) + ' letters: ' + str(pca.explained_variance_ratio_.sum()))
print(pCompTables[2].head())
Explained variance for 1 letters: 0.7489363490534271
Explained variance for 2 letters: 0.22793202847749075
Explained variance for 3 letters: 0.31670201938180154
Explained variance for 4 letters: 0.02934302869285814
        pc1       pc2
0 -0.990391 -0.432029
1 -0.097711 -0.872844
2  1.179999  0.323746
3  0.552561  1.090515
4  0.079871  1.709785
# now we finally need to plot these tables to try to find correlations visually
import matplotlib.pyplot as plt

plt.figure(figsize=(10,10))
for i in range(1,5):
    plt.subplot(2,2,i)
    x = pCompTables[i].loc[:,'pc1']
    y = pCompTables[i].loc[:,'pc2']
    plt.scatter(x, y, s = 1)
    plt.xlabel('pc1')
    plt.ylabel('pc2')
    plt.title(str(i) + ' letter words')
plt.show()
From the plots we can clearly see that the 3-letter words generate identifiable visual structure among the data points in the first two principal components. These 3-letter words are called codons in biology.
What do these point clouds (clusters) mean for the 3-letter words?
- We can identify 6 distinct poles, or centroids, in the graph. If the genetic information is coded in 3-letter words, the fragments within each of these clusters have similar distributions of word frequencies.
Why 6 poles then?
- Well, since we don't know where a word starts, reading a data point with a shift of 1 letter (i.e. ignoring its first letter) still contains essentially the same information, but it decomposes into a different set of codons. Under the 3-letter-word hypothesis this gives 3 distinct poles, one for each reading shift. The remaining 3 poles are explained by the fact that, as the article points out, genetic information is stored on complementary strands (i.e. readable either from left to right or in the opposite direction). There are actually 7 clusters, the last one being the points in the center of the figure, which represent the parts of the sequence that do not carry coding information. The sketch below illustrates the effect of a reading-frame shift.
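A minimal sketch (using the geneticCode string from above) shows that the same fragment, read with shifts of 0, 1 and 2 letters, decomposes into different codons, which is why each reading shift forms its own pole:

from collections import Counter

fragment = geneticCode[0:300]
for shift in range(3):
    # split the fragment into complete codons starting at the given shift
    codons = [fragment[j:j + 3] for j in range(shift, len(fragment) - 2, 3)]
    # show the five most frequent codons in this reading frame
    print('shift', shift, Counter(codons).most_common(5))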
Clustering¶
We will now cluster the 3-letter-word frequency table using the K-means unsupervised clustering algorithm. From the previous section, we can detect 6 or 7 clusters. Knowing that some parts of the sequence do not carry coding information, we expect the central points, far from the 6 distinct poles, to correspond to those. We therefore assume that there are 7 clusters (this could be checked by comparing the clustering quality for 6 versus 7 clusters, as sketched below).
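Before committing to 7 clusters, one simple check is to compare silhouette scores for 6 and 7 clusters; a minimal sketch (assuming normFreqTables from above, with random_state added only for reproducibility), where a higher score indicates better-separated clusters:

from sklearn.metrics import silhouette_score

for k in (6, 7):
    labels = KMeans(n_clusters=k, random_state=42).fit_predict(normFreqTables[3])
    print(k, 'clusters -> silhouette score:', silhouette_score(normFreqTables[3], labels))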
kmeans = KMeans(n_clusters = 7)
kmeans.fit(normFreqTables[3])
KMeans(n_clusters=7)
Results in a Graph¶
Using our clustering results, we can visualize the clusters in different colors on top of the principal components.
plt.figure(figsize=(8,8))
x = pCompTables[3].loc[:,'pc1']
y = pCompTables[3].loc[:,'pc2']
plt.scatter(x,y, s = 20, c=kmeans.labels_, cmap = 'rainbow')
plt.xlabel('pc1')
plt.ylabel('pc2')
plt.title('K-Means clustering shown on top of the principal components')
plt.show()
Hence, Unsupervised Learning through clustering (K-means) and dimensionality reduction (PCA) has allowed us to visualize and provide supporting evidence for the biological discovery that the DNA sequence is organized into three-letter words called codons, which code for the amino acids and hence the proteins produced by living cells.