In [38]:

import re
import pandas as pd

Parse Clustal Omega alignment output¶

In [37]:

def read_clustal_omega(file_name: str) -> dict:
    with open(file_name, 'r') as reader:
        # skip the first 3 lines
        for _ in range(3):
            next(reader)

        res = {}
        for line in reader:
            line = line.strip()
            if not line.startswith(' ') and not line.startswith('\t') and line and not '*' in line:
                tokens = re.split(r'[ ]+|\t', line)

                if tokens[0] not in res:
                    res[tokens[0]] = list(tokens[1])
                else:
                    res[tokens[0]] = res[tokens[0]] + list(tokens[1])

        return res

In [41]:

seq = read_clustal_omega('data/VARFREQ_2021/clustalo-I20210309-103503-0178-93081341-p1m.clustal_num')
seq_df = pd.DataFrame.from_dict(seq, orient='index')
seq_df.head()

Out[41]:

	0	1	2	3	4	5	6	7	8	9	...	1059	1060	1061	1062	1063	1064	1065	1066	1067	1068
KM355915.1	A	T	G	G	A	T	T	A	T	C	...	-	-	-	-	-	-	-	-	-	-
KM355864.1	A	T	G	G	A	T	T	A	T	C	...	-	-	-	-	-	-	-	-	-	-
KM355862.1	A	T	G	G	A	T	T	A	T	C	...	-	-	-	-	-	-	-	-	-	-
KM355863.1	A	T	G	G	A	T	T	A	T	C	...	-	-	-	-	-	-	-	-	-	-
KM355861.1	A	T	G	G	A	T	T	A	T	C	...	-	-	-	-	-	-	-	-	-	-

5 rows × 1069 columns

Count nt occurrences per position¶

In [83]:

counts = pd.DataFrame(0, index=['A','C','G','T','-','R','S','K','Y','M','N'], columns=seq_df.columns)

for colname, col in seq_df.iteritems():
    temp = seq_df[colname].value_counts()
       
    for tidx in temp.index:
        counts.at[tidx, colname] = temp[tidx]

counts.transpose().head(10)

Out[83]:

	A	C	G	T
0	76	0	0	0
1	0	0	0	76
2	0	0	76	0
3	0	0	76	0
4	76	0	0	0
5	0	0	0	76
6	0	0	0	76
7	76	0	0	0
8	0	0	0	76
9	0	76	0	0

In [88]:

counts.transpose().to_excel("nt_frequency.xlsx", sheet_name='nt frequency')

Count missmatches¶

In [109]:

mm = counts.transpose().sum(axis=1) - counts.transpose().max(axis=1)
mm.sum()

Out[109]:

In [ ]:

	A	C	G	T
0	76	0	0	0
1	0	0	0	76
2	0	0	76	0
3	0	0	76	0
4	76	0	0	0
5	0	0	0	76
6	0	0	0	76
7	76	0	0	0
8	0	0	0	76
9	0	76	0	0

	A	C	G	T
0	76	0	0	0
1	0	0	0	76
2	0	0	76	0
3	0	0	76	0
4	76	0	0	0
5	0	0	0	76
6	0	0	0	76
7	76	0	0	0
8	0	0	0	76
9	0	76	0	0

	A	C	G	T
0	76	0	0	0
1	0	0	0	76
2	0	0	76	0
3	0	0	76	0
4	76	0	0	0
5	0	0	0	76
6	0	0	0	76
7	76	0	0	0
8	0	0	0	76
9	0	76	0	0