import re
import pandas as pd
def read_clustal_omega(file_name: str) -> dict:
with open(file_name, 'r') as reader:
# skip the first 3 lines
for _ in range(3):
next(reader)
res = {}
for line in reader:
line = line.strip()
if not line.startswith(' ') and not line.startswith('\t') and line and not '*' in line:
tokens = re.split(r'[ ]+|\t', line)
if tokens[0] not in res:
res[tokens[0]] = list(tokens[1])
else:
res[tokens[0]] = res[tokens[0]] + list(tokens[1])
return res
seq = read_clustal_omega('data/VARFREQ_2021/clustalo-I20210309-103503-0178-93081341-p1m.clustal_num')
seq_df = pd.DataFrame.from_dict(seq, orient='index')
seq_df.head()
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 1059 | 1060 | 1061 | 1062 | 1063 | 1064 | 1065 | 1066 | 1067 | 1068 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
KM355915.1 | A | T | G | G | A | T | T | A | T | C | ... | - | - | - | - | - | - | - | - | - | - |
KM355864.1 | A | T | G | G | A | T | T | A | T | C | ... | - | - | - | - | - | - | - | - | - | - |
KM355862.1 | A | T | G | G | A | T | T | A | T | C | ... | - | - | - | - | - | - | - | - | - | - |
KM355863.1 | A | T | G | G | A | T | T | A | T | C | ... | - | - | - | - | - | - | - | - | - | - |
KM355861.1 | A | T | G | G | A | T | T | A | T | C | ... | - | - | - | - | - | - | - | - | - | - |
5 rows × 1069 columns
counts = pd.DataFrame(0, index=['A','C','G','T','-','R','S','K','Y','M','N'], columns=seq_df.columns)
for colname, col in seq_df.iteritems():
temp = seq_df[colname].value_counts()
for tidx in temp.index:
counts.at[tidx, colname] = temp[tidx]
counts.transpose().head(10)
A | C | G | T | - | R | S | K | Y | M | N | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 76 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 76 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 0 | 76 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 0 | 0 | 76 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 76 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 | 0 | 0 | 0 | 76 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
6 | 0 | 0 | 0 | 76 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
7 | 76 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
8 | 0 | 0 | 0 | 76 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
9 | 0 | 76 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
counts.transpose().to_excel("nt_frequency.xlsx", sheet_name='nt frequency')
mm = counts.transpose().sum(axis=1) - counts.transpose().max(axis=1)
mm.sum()
241