#!/usr/bin/env python # coding: utf-8 # In[38]: import re import pandas as pd # # Parse Clustal Omega alignment output # In[37]: def read_clustal_omega(file_name: str) -> dict: with open(file_name, 'r') as reader: # skip the first 3 lines for _ in range(3): next(reader) res = {} for line in reader: line = line.strip() if not line.startswith(' ') and not line.startswith('\t') and line and not '*' in line: tokens = re.split(r'[ ]+|\t', line) if tokens[0] not in res: res[tokens[0]] = list(tokens[1]) else: res[tokens[0]] = res[tokens[0]] + list(tokens[1]) return res # In[41]: seq = read_clustal_omega('data/VARFREQ_2021/clustalo-I20210309-103503-0178-93081341-p1m.clustal_num') seq_df = pd.DataFrame.from_dict(seq, orient='index') seq_df.head() # ## Count nt occurrences per position # In[83]: counts = pd.DataFrame(0, index=['A','C','G','T','-','R','S','K','Y','M','N'], columns=seq_df.columns) for colname, col in seq_df.iteritems(): temp = seq_df[colname].value_counts() for tidx in temp.index: counts.at[tidx, colname] = temp[tidx] counts.transpose().head(10) # In[88]: counts.transpose().to_excel("nt_frequency.xlsx", sheet_name='nt frequency') # ## Count missmatches # In[109]: mm = counts.transpose().sum(axis=1) - counts.transpose().max(axis=1) mm.sum() # In[ ]: