#!/usr/bin/env python # coding: utf-8 # # Star alleles # # ## Table of contents # # 1. [Non-rsID records](#Non-rsID-records) # 2. [Genotype/allele annotations](#Genotype/allele-annotations) # 3. [Allele definition tables](#Allele-definition-tables) # * [Comparison with PharmVar](#Comparison-with-PharmVar) # * [Informativeness](#Informativeness) # 4. [Summary and questions](#Summary) # In[1]: import pandas as pd import requests from opentargets_pharmgkb.pandas_utils import read_tsv_to_df # In[2]: work_dir = '/home/april/projects/opentargets/pharmgkb/star-alleles' # In[ ]: # Rerun to refresh data get_ipython().system('cd {work_dir}') get_ipython().system('wget -q https://api.pharmgkb.org/v1/download/file/data/clinicalAnnotations.zip') get_ipython().system('unzip -qj clinicalAnnotations.zip "*.tsv" -d {work_dir}') get_ipython().system('rm clinicalAnnotations.zip') # ## Non-rsID records # # [Top of page](#Table-of-contents) # In[3]: annotations_df = read_tsv_to_df(f'{work_dir}/clinical_annotations.tsv') alleles_df = read_tsv_to_df(f'{work_dir}/clinical_ann_alleles.tsv') # In[4]: len(annotations_df) # In[5]: no_rs_annotations = annotations_df[~annotations_df['Variant/Haplotypes'].str.contains('rs')] # In[6]: len(no_rs_annotations) # In[7]: # Check names to see if there's anything truly bizarre names = no_rs_annotations['Variant/Haplotypes'].unique() # In[8]: # Note that the "variant/haplotype name" is a listing of which alleles are annotated in the specific record names[:50] # In[9]: # Not necessarily an important distinction, but just to check... star_allele_names = [n for n in names if '*' in n] no_star_names = [n for n in names if '*' not in n] # In[10]: no_star_names # No star allele observations: # * [G6PD](https://www.pharmgkb.org/gene/PA28469/haplotype) seems well-defined though the naming is idiosyncratic (e.g. is it safe to just comma-split these strings?) # * may be a bit clearer in the alleles tables, e.g. [here](https://www.pharmgkb.org/clinicalAnnotation/1183621000) # * [GSTT1](https://www.pharmgkb.org/gene/PA183/haplotype), [GSTM1](https://www.pharmgkb.org/gene/PA182/haplotype) null/non-null are just absence or presence of the entire gene, if this naming convention is standard we can work with it # * [SLC6A4](https://www.pharmgkb.org/gene/PA312/haplotype) seems to be just... special # # Note we can clearly get affected genes for all of these alleles though, from PGKB directly. # In[11]: # Confirming there are no missing genes in any of these no_rs_annotations['Gene'].isna().any() # ## Genotype/allele annotations # # [Top of page](#Table-of-contents) # In[12]: pd.set_option('display.max_colwidth', None) # In[13]: joined_df = alleles_df.merge(no_rs_annotations, on='Clinical Annotation ID') # In[14]: # Remove some columns to make things easier to read... joined_df = joined_df[['Clinical Annotation ID', 'Genotype/Allele', 'Annotation Text', 'Allele Function', 'Variant/Haplotypes', 'Gene', 'Level of Evidence', 'Phenotype Category', 'Drug(s)', 'Phenotype(s)']] # In[15]: # https://www.pharmgkb.org/clinicalAnnotation/1451259580 joined_df[joined_df['Clinical Annotation ID'] == '1451259580'] # In[16]: # https://www.pharmgkb.org/clinicalAnnotation/1448427588 joined_df[joined_df['Clinical Annotation ID'] == '1448427588'] # In[17]: # https://www.pharmgkb.org/clinicalAnnotation/981419263 joined_df[joined_df['Clinical Annotation ID'] == '981419263'] # In[18]: # https://www.pharmgkb.org/clinicalAnnotation/1183621000 joined_df[joined_df['Clinical Annotation ID'] == '1183621000'] # Notes: # * These are mostly allele-level annotations, though some are genotype-level (as opposed to rsID records which are nearly all genotype-level) # * Genotype-specific information is sometimes buried in the annotation text for each allele... # * `*1xN` means `N` copies of the `*1` version of the gene # ## Allele definition tables # # [Top of page](#Table-of-contents) # In[19]: # Try to automatically get PGKB spreadsheet definitions allele_definition_url = 'https://api.pharmgkb.org/v1/download/file/attachment/{gene}_allele_definition_table.xlsx' # In[20]: genes = no_rs_annotations['Gene'].unique() # In[21]: genes # In[38]: allele_def_tables = {} for gene in genes: try: allele_def_tables[gene] = pd.read_excel(allele_definition_url.format(gene=gene), storage_options={'User-Agent': 'Mozilla/5.0'}, header=None) except Exception as e: print(f'Error for {gene}: {e}') # In[23]: # Genes with allele tables pharmgkb_genes = set(allele_def_tables.keys()) pharmgkb_genes # In[24]: no_allele_def_table_genes = set(genes) - pharmgkb_genes # In[25]: no_allele_def_table_genes # Checked a few of this list and they indeed don't have definition tables in PharmGKB, categories I see: # * Refer to another resource: [some (but not all) CYP](https://www.pharmgkb.org/gene/PA129), [NAT](https://www.pharmgkb.org/gene/PA18/haplotype), [UGT](https://www.pharmgkb.org/gene/PA37179/haplotype) # * Null/non-null: [GSTT1](https://www.pharmgkb.org/gene/PA183/haplotype), [GSTM1](https://www.pharmgkb.org/gene/PA182/haplotype) # * Special: [SLC6A4](https://www.pharmgkb.org/gene/PA312/haplotype) # # For now we'll skip these and look at those with allele definition tables (covers about 90% of no-RS records in PGKB). # In[26]: # What do we lose if we skip these? len(no_rs_annotations[no_rs_annotations['Gene'].isin(no_allele_def_table_genes)]) # Note that the allele definition tables vary in informativeness, so just because one is present doesn't mean we can necessarily use it. # * More informative example: [CYP2D6](https://docs.google.com/spreadsheets/d/1tIovgq2w7FXv6g2ASiQd5EtYjxEbSzYrCRpg9dIlrEY/edit?usp=sharing) # * Less informative example: [HLA-A](https://docs.google.com/spreadsheets/d/1Wz0F74sdY-hG0LJP1Rg5ePsZddVQ1pZrHDySOYgrOhI/edit?usp=sharing) # # Understanding the allele definition table: # * First few rows give various definitions of variants: protein/chromosome/gene-level HGVS, and rsID if present # * Each subsequent row gives what alleles are present for each of these variants for a particular named allele # * In theory should be able to use the "Genotype/Allele" column from the clinical allele annotations to index into this table # * The final column is "structural variation" and contains text describing the nature of the variant, e.g. `CYP2D7::CYP2D6 hybrid gene` # * Missing values = reference? Or is e.g. *1/first row the reference? If so what does missing value mean? # ### Comparison with PharmVar # # [Top of page](#Table-of-contents) # In[27]: # Compare with what we would get from PharmVar pharmvar_url = 'https://www.pharmvar.org/api-service/alleles?exclude-sub-alleles=false&include-reference-variants=false&include-retired-alleles=false&include-retired-reference-sequences=false' response = requests.get(pharmvar_url) pharmvar_data = response.json() # In[28]: # 1 per allele len(pharmvar_data) # In[29]: pharmvar_data[0] # In[30]: pharmvar_genes = {d['geneSymbol'] for d in pharmvar_data} # In[31]: pharmgkb_genes - pharmvar_genes # In[32]: pharmvar_genes - pharmgkb_genes # In[33]: len(no_rs_annotations[no_rs_annotations['Gene'].isin(pharmvar_genes - pharmgkb_genes)]) # Conclusion from this is that PharmVar probably has less information than PharmGKB; though most of the genes covered by PGKB and not by PV are "uninformative" tables, there are at least 2 exceptions (G6PD and UGT1A1). In contrast PV genes not covered by PGKB are not present in PGKB data. # # I haven't compared the actual content of the PV vs. PGKB data but I'm assuming it's similar since it's sourced directly from PV. # # Implementation-wise, PV does have the advantage in that it has an actual API with JSON responses rather than spreadsheets. # ### Informativeness # # [Top of page](#Table-of-contents) # In[41]: # What we get "out of the box" - note first 7 rows & first column are headers allele_def_tables['CYP2D6'].head(10) # In[42]: allele_def_tables['HLA-A'].head(10) # In[53]: # If there are more than 2 columns we'll assume the table is informative allele_def_metrics = [] informative_tables = [] for gene, table in allele_def_tables.items(): allele_def_metrics.append({ 'gene': gene, 'num_alleles': table.shape[0]-7, 'num_variants': table.shape[1]-1 }) if table.shape[1] > 2: informative_tables.append(gene) # In[54]: allele_def_metrics # In[55]: len(informative_tables) / len(allele_def_metrics) # In[56]: # Count of non-rs clinical annotation records involving genes with informative allele definition tables len(no_rs_annotations[no_rs_annotations['Gene'].isin(informative_tables)]) # ## Summary # # * Non-rsID containing records represent 596 / 5101 = 12% of the clinical annotations # * Affected gene is easy to get for all named alleles - we can rely on the "Gene" column in PGKB data # * Annotations tend to be per-allele rather than per-genotype # * Most records have an allele definition table from PGKB that we can download # * 53 / 596 = 8.9% do not # * Not all of these tables contain specific variants - see [CYP2D6](https://docs.google.com/spreadsheets/d/1tIovgq2w7FXv6g2ASiQd5EtYjxEbSzYrCRpg9dIlrEY/edit?usp=sharing) vs. [HLA-A](https://docs.google.com/spreadsheets/d/1Wz0F74sdY-hG0LJP1Rg5ePsZddVQ1pZrHDySOYgrOhI/edit?usp=sharing) # * 381 / 596 = 64% of records have an informative and discoverable allele definition table from PGKB # * PGKB alleles tables mostly (but not entirely) come from PharmVar # # ### Questions # # * Identifier for these? # * PGKB basically uses a list of haplotypes being annotated as the "variant" identifier in their annotations table # * Note that our PGx schema uses genotype IDs not variant IDs # * Do we want to resolve named alleles to variants, and if so how to convey this information? # * Are we interested in functional consequences or is affected gene enough? # * Could try using [Haplosaurus](https://www.ensembl.org/info/docs/tools/vep/haplo/index.html) - use allele definition table & annotated allele to create VCF file input # # [Top of page](#Table-of-contents) # In[ ]: