This project builds machine learning models for classifying DNA sequences. K-Nearest Neighbors, Support Vector Machines, and several other classification algorithms are used. The original data comes from the UCI Machine Learning Repository.
import sys
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import sklearn
# Use an 8x8 default canvas for every matplotlib figure.
plt.rcParams['figure.figsize'] = (8, 8)

# Report the interpreter and library versions used for this run.
for label, version in (('Python', sys.version),
                       ('Numpy', np.__version__),
                       ('Pandas', pd.__version__)):
    print('{}: {}'.format(label, version))
Python: 3.7.6 (default, Jan 8 2020, 20:23:39) [MSC v.1916 64 bit (AMD64)] Numpy: 1.18.1 Pandas: 1.0.1
The original data comes from the UCI Machine Learning Repository.
# UCI molecular biology collection: promoter gene sequences dataset.
url = ('https://archive.ics.uci.edu/ml/machine-learning-databases/'
       'molecular-biology/promoter-gene-sequences/promoters.data')

# Column labels are passed explicitly; read_csv then treats every line of the
# file as data.
names = ['Class', 'id', 'Sequence']
data = pd.read_csv(url, names=names)

# Preview the first rows.
data.head()
Class | id | Sequence | |
---|---|---|---|
0 | + | S10 | \t\ttactagcaatacgcttgcgttcggtggttaagtatgtataat... |
1 | + | AMPC | \t\ttgctatcctgacagttgtcacgctgattggtgtcgttacaat... |
2 | + | AROH | \t\tgtactagagaactagtgcattagcttatttttttgttatcat... |
3 | + | DEOP2 | \taattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaa... |
4 | + | LEU1_TRNA | \ttcgataattaactattgacgaaaagctgaaaaccactagaatgc... |
# Pull the class labels out of the raw table as a pandas Series (a single
# DataFrame column), then tally how many samples carry each label.
classes = data['Class']
classes.value_counts()
+ 53 - 53 Name: Class, dtype: int64
# Collect the raw DNA strings as a plain Python list.
sequences = data['Sequence'].tolist()

# Map each sample index to its nucleotides, one character per entry, with
# the sample's class label appended as the final element.
dataset = {}
for i, seq in enumerate(sequences):
    # The raw strings carry leading tab characters; drop them before
    # splitting the sequence into single characters.
    nucleotides = list(seq.replace('\t', ''))
    # Append the class assignment for this sample.
    nucleotides.append(classes[i])
    dataset[i] = nucleotides

print(dataset[0])
['t', 'a', 'c', 't', 'a', 'g', 'c', 'a', 'a', 't', 'a', 'c', 'g', 'c', 't', 't', 'g', 'c', 'g', 't', 't', 'c', 'g', 'g', 't', 'g', 'g', 't', 't', 'a', 'a', 'g', 't', 'a', 't', 'g', 't', 'a', 't', 'a', 'a', 't', 'g', 'c', 'g', 'c', 'g', 'g', 'g', 'c', 't', 't', 'g', 't', 'c', 'g', 't', '+']
# Build a DataFrame from the dict (one column per sample), then transpose it
# so each row is a sample and each column a sequence position.
df = pd.DataFrame(dataset).transpose()
df
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | t | a | c | t | a | g | c | a | a | t | ... | g | c | t | t | g | t | c | g | t | + |
1 | t | g | c | t | a | t | c | c | t | g | ... | c | a | t | c | g | c | c | a | a | + |
2 | g | t | a | c | t | a | g | a | g | a | ... | c | a | c | c | c | g | g | c | g | + |
3 | a | a | t | t | g | t | g | a | t | g | ... | a | a | c | a | a | a | c | t | c | + |
4 | t | c | g | a | t | a | a | t | t | a | ... | c | c | g | t | g | g | t | a | g | + |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
101 | c | c | t | c | a | a | t | g | g | c | ... | g | a | a | c | t | a | t | a | t | - |
102 | g | t | a | t | t | c | t | c | a | a | ... | t | c | a | a | c | a | t | t | g | - |
103 | c | g | c | g | a | c | t | a | c | g | ... | a | a | g | g | c | t | t | c | c | - |
104 | c | t | c | g | t | c | c | t | c | a | ... | a | g | g | a | g | g | a | a | c | - |
105 | t | a | a | c | a | t | t | a | a | t | ... | t | c | a | a | g | a | a | c | t | - |
106 rows × 58 columns
# Rename the last column (the label appended after the nucleotides) to
# 'Class'. The column is looked up by position rather than by the hard-coded
# label 57, so this stays correct if the sequence length ever changes.
df = df.rename(columns={df.columns[-1]: 'Class'})
df
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | Class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | t | a | c | t | a | g | c | a | a | t | ... | g | c | t | t | g | t | c | g | t | + |
1 | t | g | c | t | a | t | c | c | t | g | ... | c | a | t | c | g | c | c | a | a | + |
2 | g | t | a | c | t | a | g | a | g | a | ... | c | a | c | c | c | g | g | c | g | + |
3 | a | a | t | t | g | t | g | a | t | g | ... | a | a | c | a | a | a | c | t | c | + |
4 | t | c | g | a | t | a | a | t | t | a | ... | c | c | g | t | g | g | t | a | g | + |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
101 | c | c | t | c | a | a | t | g | g | c | ... | g | a | a | c | t | a | t | a | t | - |
102 | g | t | a | t | t | c | t | c | a | a | ... | t | c | a | a | c | a | t | t | g | - |
103 | c | g | c | g | a | c | t | a | c | g | ... | a | a | g | g | c | t | t | c | c | - |
104 | c | t | c | g | t | c | c | t | c | a | ... | a | g | g | a | g | g | a | a | c | - |
105 | t | a | a | c | a | t | t | a | a | t | ... | t | c | a | a | g | a | a | c | t | - |
106 rows × 58 columns
# Summarise every column: number of entries, number of distinct values,
# the most frequent value, and how often that value occurs.
df.describe()
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | Class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 106 | 106 | 106 | 106 | 106 | 106 | 106 | 106 | 106 | 106 | ... | 106 | 106 | 106 | 106 | 106 | 106 | 106 | 106 | 106 | 106 |
unique | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | ... | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 2 |
top | t | a | a | c | a | a | a | a | a | a | ... | c | c | c | t | t | c | c | c | t | + |
freq | 38 | 34 | 30 | 30 | 36 | 42 | 38 | 34 | 33 | 36 | ... | 36 | 42 | 31 | 33 | 35 | 32 | 29 | 29 | 34 | 53 |
4 rows × 58 columns
# Count how often each symbol occurs in every column (each sequence position
# plus the 'Class' column).
series = [df[column].value_counts() for column in df.columns]

# One row per column of df at this point; transpose so the symbols become the
# rows and the original columns stay as columns.
info = pd.DataFrame(series)
details = info.T
details
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | Class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
t | 38.0 | 26.0 | 27.0 | 26.0 | 22.0 | 24.0 | 30.0 | 32.0 | 32.0 | 28.0 | ... | 21.0 | 22.0 | 23.0 | 33.0 | 35.0 | 30.0 | 23.0 | 29.0 | 34.0 | NaN |
c | 27.0 | 22.0 | 21.0 | 30.0 | 19.0 | 18.0 | 21.0 | 20.0 | 22.0 | 22.0 | ... | 36.0 | 42.0 | 31.0 | 32.0 | 21.0 | 32.0 | 29.0 | 29.0 | 17.0 | NaN |
a | 26.0 | 34.0 | 30.0 | 22.0 | 36.0 | 42.0 | 38.0 | 34.0 | 33.0 | 36.0 | ... | 23.0 | 24.0 | 28.0 | 27.0 | 25.0 | 22.0 | 26.0 | 24.0 | 27.0 | NaN |
g | 15.0 | 24.0 | 28.0 | 28.0 | 29.0 | 22.0 | 17.0 | 20.0 | 19.0 | 20.0 | ... | 26.0 | 18.0 | 24.0 | 14.0 | 25.0 | 22.0 | 28.0 | 24.0 | 28.0 | NaN |
+ | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 53.0 |
- | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 53.0 |
6 rows × 58 columns
# Switch to numerical data using pd.get_dummies(): each categorical column is
# expanded into one indicator column per distinct value (e.g. 0_a, 0_c, 0_g,
# 0_t for position 0, and Class_+ / Class_- for the label).
numerical_df = pd.get_dummies(df)
numerical_df.head()
0_a | 0_c | 0_g | 0_t | 1_a | 1_c | 1_g | 1_t | 2_a | 2_c | ... | 55_a | 55_c | 55_g | 55_t | 56_a | 56_c | 56_g | 56_t | Class_+ | Class_- | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 |
1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | ... | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
3 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
4 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
5 rows × 230 columns
# The two class indicator columns are redundant (one is the complement of the
# other): drop 'Class_-' and keep 'Class_+' under the plain name 'Class'.
df = (
    numerical_df
    .drop(columns=['Class_-'])
    .rename(columns={'Class_+': 'Class'})
)
df
0_a | 0_c | 0_g | 0_t | 1_a | 1_c | 1_g | 1_t | 2_a | 2_c | ... | 54_t | 55_a | 55_c | 55_g | 55_t | 56_a | 56_c | 56_g | 56_t | Class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 |
1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 |
3 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 |
4 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
101 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
102 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
103 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | ... | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
104 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
105 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
106 rows × 229 columns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import KFold, train_test_split, cross_val_score
# Separate the one-hot encoded nucleotide features (X) from the binary label
# (y), where 'Class' is the former 'Class_+' indicator column.
X = df.drop(columns=['Class']).to_numpy()
y = df['Class'].to_numpy()

# Hold out 25% of the samples as a test set.
# - random_state pins the shuffle so the split (and every score derived from
#   it) is reproducible from run to run.
# - stratify=y keeps the class proportions the same in both parts, which
#   matters with only 106 samples.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, shuffle=True, stratify=y, random_state=1)
# Metric passed to cross_val_score for every model.
scoring = 'accuracy'

# Models to train; names[i] is the display label for classifiers[i].
names = ['K Nearest Neighbors', 'Gaussian Process', 'Decision Tree', 'Random Forest',
         'Neural Network', 'AdaBoost', 'Naive Bayes', 'SVM Linear', 'SVM RBF', 'SVM Sigmoid']

# Estimators with internal randomness get a fixed random_state so repeated
# runs produce the same scores.
classifiers = [
    KNeighborsClassifier(n_neighbors=3),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5, random_state=1),
    # NOTE(review): max_features=1 lets each split consider a single one-hot
    # column out of 228; this may be too restrictive - confirm it is intended.
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1, random_state=1),
    MLPClassifier(alpha=1, max_iter=500, random_state=1),
    AdaBoostClassifier(random_state=1),
    GaussianNB(),
    SVC(kernel='linear'),
    SVC(kernel='rbf'),
    SVC(kernel='sigmoid'),
]

# Materialise the (name, classifier) pairs as a list: a bare zip() is a
# one-shot iterator that would be empty after the first loop over it.
models = list(zip(names, classifiers))
# Evaluate each model in turn with 10-fold cross-validation on the training set.
# A single seeded splitter is shared by all models, so every model is scored
# on exactly the same folds and the numbers are reproducible and comparable.
kfold = KFold(n_splits=10, shuffle=True, random_state=1)

results = []  # per-model arrays of fold scores
names = []    # model labels, in evaluation order
for name, model in models:
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # Report mean score with the standard deviation across folds.
    msg = '{0}: {1} ({2})'.format(name, cv_results.mean(), cv_results.std())
    print(msg)
K Nearest Neighbors: 0.7946428571428571 (0.1569499627789906) Gaussian Process: 0.9125 (0.08003905296791061) Decision Tree: 0.7839285714285714 (0.15980895801307413) Random Forest: 0.6339285714285714 (0.187329854774395) Neural Network: 0.8732142857142857 (0.0969726713027533) AdaBoost: 0.8482142857142858 (0.12222689256176861) Naive Bayes: 0.8607142857142858 (0.11785714285714285) SVM Linear: 0.8964285714285714 (0.08253014291636673) SVM RBF: 0.8607142857142858 (0.13044273119821195) SVM Sigmoid: 0.95 (0.09999999999999999)
# Pair the labels with the classifiers again for the final evaluation pass.
models = zip(names, classifiers)

# Fit each model on the training data and score it on the held-out test set.
for name, model in models:
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    # Print, one per line: model label, overall accuracy, per-class report.
    print(
        name,
        accuracy_score(y_test, predictions),
        classification_report(y_test, predictions),
        sep='\n',
    )
K Nearest Neighbors 0.8518518518518519 precision recall f1-score support 0 1.00 0.73 0.85 15 1 0.75 1.00 0.86 12 accuracy 0.85 27 macro avg 0.88 0.87 0.85 27 weighted avg 0.89 0.85 0.85 27 Gaussian Process 0.9259259259259259 precision recall f1-score support 0 0.88 1.00 0.94 15 1 1.00 0.83 0.91 12 accuracy 0.93 27 macro avg 0.94 0.92 0.92 27 weighted avg 0.93 0.93 0.92 27 Decision Tree 0.8148148148148148 precision recall f1-score support 0 0.86 0.80 0.83 15 1 0.77 0.83 0.80 12 accuracy 0.81 27 macro avg 0.81 0.82 0.81 27 weighted avg 0.82 0.81 0.82 27 Random Forest 0.8148148148148148 precision recall f1-score support 0 0.92 0.73 0.81 15 1 0.73 0.92 0.81 12 accuracy 0.81 27 macro avg 0.82 0.82 0.81 27 weighted avg 0.84 0.81 0.81 27 Neural Network 0.9259259259259259 precision recall f1-score support 0 0.88 1.00 0.94 15 1 1.00 0.83 0.91 12 accuracy 0.93 27 macro avg 0.94 0.92 0.92 27 weighted avg 0.93 0.93 0.92 27 AdaBoost 0.9259259259259259 precision recall f1-score support 0 1.00 0.87 0.93 15 1 0.86 1.00 0.92 12 accuracy 0.93 27 macro avg 0.93 0.93 0.93 27 weighted avg 0.94 0.93 0.93 27 Naive Bayes 0.9259259259259259 precision recall f1-score support 0 0.88 1.00 0.94 15 1 1.00 0.83 0.91 12 accuracy 0.93 27 macro avg 0.94 0.92 0.92 27 weighted avg 0.93 0.93 0.92 27 SVM Linear 0.8888888888888888 precision recall f1-score support 0 0.88 0.93 0.90 15 1 0.91 0.83 0.87 12 accuracy 0.89 27 macro avg 0.89 0.88 0.89 27 weighted avg 0.89 0.89 0.89 27 SVM RBF 0.9259259259259259 precision recall f1-score support 0 0.88 1.00 0.94 15 1 1.00 0.83 0.91 12 accuracy 0.93 27 macro avg 0.94 0.92 0.92 27 weighted avg 0.93 0.93 0.92 27 SVM Sigmoid 0.8518518518518519 precision recall f1-score support 0 0.87 0.87 0.87 15 1 0.83 0.83 0.83 12 accuracy 0.85 27 macro avg 0.85 0.85 0.85 27 weighted avg 0.85 0.85 0.85 27