#!/usr/bin/env python
# coding: utf-8

# # Star alleles
# 
# ## Table of contents
# 
# 1. [Non-rsID records](#Non-rsID-records)
# 2. [Genotype/allele annotations](#Genotype/allele-annotations)
# 3. [Allele definition tables](#Allele-definition-tables)
#     * [Comparison with PharmVar](#Comparison-with-PharmVar)
#     * [Informativeness](#Informativeness)
# 4. [Summary and questions](#Summary)

# In[1]:


import pandas as pd
import requests

from opentargets_pharmgkb.pandas_utils import read_tsv_to_df


# In[2]:


work_dir = '/home/april/projects/opentargets/pharmgkb/star-alleles'


# In[ ]:


# Rerun to refresh data
get_ipython().system('cd {work_dir}')
get_ipython().system('wget -q https://api.pharmgkb.org/v1/download/file/data/clinicalAnnotations.zip')
get_ipython().system('unzip -qj clinicalAnnotations.zip "*.tsv" -d {work_dir}')
get_ipython().system('rm clinicalAnnotations.zip')


# ## Non-rsID records
# 
# [Top of page](#Table-of-contents)

# In[3]:


annotations_df = read_tsv_to_df(f'{work_dir}/clinical_annotations.tsv')
alleles_df = read_tsv_to_df(f'{work_dir}/clinical_ann_alleles.tsv')


# In[4]:


len(annotations_df)


# In[5]:


no_rs_annotations = annotations_df[~annotations_df['Variant/Haplotypes'].str.contains('rs')]


# In[6]:


len(no_rs_annotations)


# In[7]:


# Check names to see if there's anything truly bizarre
names = no_rs_annotations['Variant/Haplotypes'].unique()


# In[8]:


# Note that the "variant/haplotype name" is a listing of which alleles are annotated in the specific record
names[:50]


# In[9]:


# Not necessarily an important distinction, but just to check...
star_allele_names = [n for n in names if '*' in n]
no_star_names = [n for n in names if '*' not in n]


# In[10]:


no_star_names


# No star allele observations:
# * [G6PD](https://www.pharmgkb.org/gene/PA28469/haplotype) seems well-defined though the naming is idiosyncratic (e.g. is it safe to just comma-split these strings?)
#   * may be a bit clearer in the alleles tables, e.g. [here](https://www.pharmgkb.org/clinicalAnnotation/1183621000)
# * [GSTT1](https://www.pharmgkb.org/gene/PA183/haplotype), [GSTM1](https://www.pharmgkb.org/gene/PA182/haplotype) null/non-null are just absence or presence of the entire gene, if this naming convention is standard we can work with it
# * [SLC6A4](https://www.pharmgkb.org/gene/PA312/haplotype) seems to be just... special
# 
# Note we can clearly get affected genes for all of these alleles though, from PGKB directly.

# In[11]:


# Confirming there are no missing genes in any of these
no_rs_annotations['Gene'].isna().any()


# ## Genotype/allele annotations
# 
# [Top of page](#Table-of-contents)

# In[12]:


pd.set_option('display.max_colwidth', None)


# In[13]:


joined_df = alleles_df.merge(no_rs_annotations, on='Clinical Annotation ID')


# In[14]:


# Remove some columns to make things easier to read...
joined_df = joined_df[['Clinical Annotation ID', 'Genotype/Allele', 'Annotation Text',
       'Allele Function', 'Variant/Haplotypes', 'Gene', 'Level of Evidence',
       'Phenotype Category', 'Drug(s)', 'Phenotype(s)']]


# In[15]:


# https://www.pharmgkb.org/clinicalAnnotation/1451259580
joined_df[joined_df['Clinical Annotation ID'] == '1451259580']


# In[16]:


# https://www.pharmgkb.org/clinicalAnnotation/1448427588
joined_df[joined_df['Clinical Annotation ID'] == '1448427588']


# In[17]:


# https://www.pharmgkb.org/clinicalAnnotation/981419263
joined_df[joined_df['Clinical Annotation ID'] == '981419263']


# In[18]:


# https://www.pharmgkb.org/clinicalAnnotation/1183621000
joined_df[joined_df['Clinical Annotation ID'] == '1183621000']


# Notes:
# * These are mostly allele-level annotations, though some are genotype-level (as opposed to rsID records which are nearly all genotype-level)
#     * Genotype-specific information is sometimes buried in the annotation text for each allele...
# * `*1xN` means `N` copies of the `*1` version of the gene

# ## Allele definition tables
# 
# [Top of page](#Table-of-contents)

# In[19]:


# Try to automatically get PGKB spreadsheet definitions
allele_definition_url = 'https://api.pharmgkb.org/v1/download/file/attachment/{gene}_allele_definition_table.xlsx'


# In[20]:


genes = no_rs_annotations['Gene'].unique()


# In[21]:


genes


# In[38]:


allele_def_tables = {}
for gene in genes:
    try:
        allele_def_tables[gene] = pd.read_excel(allele_definition_url.format(gene=gene), 
                                                storage_options={'User-Agent': 'Mozilla/5.0'},
                                                header=None)
    except Exception as e:
        print(f'Error for {gene}: {e}')


# In[23]:


# Genes with allele tables
pharmgkb_genes = set(allele_def_tables.keys())
pharmgkb_genes


# In[24]:


no_allele_def_table_genes = set(genes) - pharmgkb_genes


# In[25]:


no_allele_def_table_genes


# Checked a few of this list and they indeed don't have definition tables in PharmGKB, categories I see:
# * Refer to another resource: [some (but not all) CYP](https://www.pharmgkb.org/gene/PA129), [NAT](https://www.pharmgkb.org/gene/PA18/haplotype), [UGT](https://www.pharmgkb.org/gene/PA37179/haplotype)
# * Null/non-null: [GSTT1](https://www.pharmgkb.org/gene/PA183/haplotype), [GSTM1](https://www.pharmgkb.org/gene/PA182/haplotype)
# * Special: [SLC6A4](https://www.pharmgkb.org/gene/PA312/haplotype)
# 
# For now we'll skip these and look at those with allele definition tables (covers about 90% of no-RS records in PGKB).

# In[26]:


# What do we lose if we skip these?
len(no_rs_annotations[no_rs_annotations['Gene'].isin(no_allele_def_table_genes)])


# Note that the allele definition tables vary in informativeness, so just because one is present doesn't mean we can necessarily use it.
# * More informative example: [CYP2D6](https://docs.google.com/spreadsheets/d/1tIovgq2w7FXv6g2ASiQd5EtYjxEbSzYrCRpg9dIlrEY/edit?usp=sharing)
# * Less informative example: [HLA-A](https://docs.google.com/spreadsheets/d/1Wz0F74sdY-hG0LJP1Rg5ePsZddVQ1pZrHDySOYgrOhI/edit?usp=sharing)
# 
# Understanding the allele definition table:
# * First few rows give various definitions of variants: protein/chromosome/gene-level HGVS, and rsID if present
# * Each subsequent row gives what alleles are present for each of these variants for a particular named allele
#     * In theory should be able to use the "Genotype/Allele" column from the clinical allele annotations to index into this table
# * The final column is "structural variation" and contains text describing the nature of the variant, e.g. `CYP2D7::CYP2D6 hybrid gene`
# * Missing values = reference? Or is e.g. *1/first row the reference? If so what does missing value mean?

# ### Comparison with PharmVar
# 
# [Top of page](#Table-of-contents)

# In[27]:


# Compare with what we would get from PharmVar
pharmvar_url = 'https://www.pharmvar.org/api-service/alleles?exclude-sub-alleles=false&include-reference-variants=false&include-retired-alleles=false&include-retired-reference-sequences=false'
response = requests.get(pharmvar_url)
pharmvar_data = response.json()


# In[28]:


# 1 per allele
len(pharmvar_data)


# In[29]:


pharmvar_data[0]


# In[30]:


pharmvar_genes = {d['geneSymbol'] for d in pharmvar_data}


# In[31]:


pharmgkb_genes - pharmvar_genes


# In[32]:


pharmvar_genes - pharmgkb_genes


# In[33]:


len(no_rs_annotations[no_rs_annotations['Gene'].isin(pharmvar_genes - pharmgkb_genes)])


# Conclusion from this is that PharmVar probably has less information than PharmGKB; though most of the genes covered by PGKB and not by PV are "uninformative" tables, there are at least 2 exceptions (G6PD and UGT1A1). In contrast PV genes not covered by PGKB are not present in PGKB data.
# 
# I haven't compared the actual content of the PV vs. PGKB data but I'm assuming it's similar since it's sourced directly from PV.
# 
# Implementation-wise, PV does have the advantage in that it has an actual API with JSON responses rather than spreadsheets.

# ### Informativeness
# 
# [Top of page](#Table-of-contents)

# In[41]:


# What we get "out of the box" - note first 7 rows & first column are headers
allele_def_tables['CYP2D6'].head(10)


# In[42]:


allele_def_tables['HLA-A'].head(10)


# In[53]:


# If there are more than 2 columns we'll assume the table is informative
allele_def_metrics = []
informative_tables = []
for gene, table in allele_def_tables.items():
    allele_def_metrics.append({
        'gene': gene,
        'num_alleles': table.shape[0]-7,
        'num_variants': table.shape[1]-1
    })
    if table.shape[1] > 2:
        informative_tables.append(gene)


# In[54]:


allele_def_metrics


# In[55]:


len(informative_tables) / len(allele_def_metrics)


# In[56]:


# Count of non-rs clinical annotation records involving genes with informative allele definition tables
len(no_rs_annotations[no_rs_annotations['Gene'].isin(informative_tables)])


# ## Summary
# 
# * Non-rsID containing records represent 596 / 5101 = 12% of the clinical annotations
# * Affected gene is easy to get for all named alleles - we can rely on the "Gene" column in PGKB data
# * Annotations tend to be per-allele rather than per-genotype
# * Most records have an allele definition table from PGKB that we can download
#     * 53 / 596 = 8.9% do not
#     * Not all of these tables contain specific variants - see [CYP2D6](https://docs.google.com/spreadsheets/d/1tIovgq2w7FXv6g2ASiQd5EtYjxEbSzYrCRpg9dIlrEY/edit?usp=sharing) vs. [HLA-A](https://docs.google.com/spreadsheets/d/1Wz0F74sdY-hG0LJP1Rg5ePsZddVQ1pZrHDySOYgrOhI/edit?usp=sharing)
#         * 381 / 596 = 64% of records have an informative and discoverable allele definition table from PGKB
# * PGKB alleles tables mostly (but not entirely) come from PharmVar
# 
# ### Questions
# 
# * Identifier for these?
#     * PGKB basically uses a list of haplotypes being annotated as the "variant" identifier in their annotations table
#     * Note that our PGx schema uses genotype IDs not variant IDs
# * Do we want to resolve named alleles to variants, and if so how to convey this information?
# * Are we interested in functional consequences or is affected gene enough?
#     * Could try using [Haplosaurus](https://www.ensembl.org/info/docs/tools/vep/haplo/index.html) - use allele definition table & annotated allele to create VCF file input
# 
# [Top of page](#Table-of-contents)

# In[ ]: