#!/usr/bin/env python
# coding: utf-8

# In[38]:


import re
import pandas as pd


# # Parse Clustal Omega alignment output

# In[37]:


def read_clustal_omega(file_name: str) -> dict:
    with open(file_name, 'r') as reader:
        # skip the first 3 lines
        for _ in range(3):
            next(reader)

        res = {}
        for line in reader:
            line = line.strip()
            if not line.startswith(' ') and not line.startswith('\t') and line and not '*' in line:
                tokens = re.split(r'[ ]+|\t', line)

                if tokens[0] not in res:
                    res[tokens[0]] = list(tokens[1])
                else:
                    res[tokens[0]] = res[tokens[0]] + list(tokens[1])

        return res


# In[41]:


seq = read_clustal_omega('data/VARFREQ_2021/clustalo-I20210309-103503-0178-93081341-p1m.clustal_num')
seq_df = pd.DataFrame.from_dict(seq, orient='index')
seq_df.head()


# ## Count nt occurrences per position 

# In[83]:


counts = pd.DataFrame(0, index=['A','C','G','T','-','R','S','K','Y','M','N'], columns=seq_df.columns)

for colname, col in seq_df.iteritems():
    temp = seq_df[colname].value_counts()
       
    for tidx in temp.index:
        counts.at[tidx, colname] = temp[tidx]

counts.transpose().head(10)


# In[88]:


counts.transpose().to_excel("nt_frequency.xlsx", sheet_name='nt frequency')


# ## Count missmatches

# In[109]:


mm = counts.transpose().sum(axis=1) - counts.transpose().max(axis=1)
mm.sum()


# In[ ]: