import pandas as pd
import requests
from opentargets_pharmgkb.pandas_utils import read_tsv_to_df
work_dir = '/home/april/projects/opentargets/pharmgkb/star-alleles'
# Rerun to refresh data
!cd {work_dir}
!wget -q https://api.pharmgkb.org/v1/download/file/data/clinicalAnnotations.zip
!unzip -qj clinicalAnnotations.zip "*.tsv" -d {work_dir}
!rm clinicalAnnotations.zip
annotations_df = read_tsv_to_df(f'{work_dir}/clinical_annotations.tsv')
alleles_df = read_tsv_to_df(f'{work_dir}/clinical_ann_alleles.tsv')
len(annotations_df)
5101
# Keep only the annotations whose variant/haplotype listing contains no rsID.
# The pattern requires "rs" followed by digits and not preceded by a letter, so a
# haplotype name that merely contains the letters "rs" is not mistaken for an rsID.
has_rs_id = annotations_df['Variant/Haplotypes'].str.contains(r'(?<![A-Za-z])rs\d+', regex=True)
no_rs_annotations = annotations_df[~has_rs_id]
len(no_rs_annotations)
596
# Check names to see if there's anything truly bizarre
names = no_rs_annotations['Variant/Haplotypes'].unique()
# Note that the "variant/haplotype name" is a listing of which alleles are annotated in the specific record
names[:50]
array(['HLA-B*15:02', 'CYP2D6*1, CYP2D6*1xN, CYP2D6*2, CYP2D6*3, CYP2D6*4, CYP2D6*5, CYP2D6*6, CYP2D6*10, CYP2D6*41', 'CYP2D6*1, CYP2D6*1xN, CYP2D6*2xN, CYP2D6*4, CYP2D6*5', 'CYP2D6*1, CYP2D6*1xN, CYP2D6*2, CYP2D6*2xN', 'CYP2D6*1, CYP2D6*3, CYP2D6*4, CYP2D6*4xN, CYP2D6*5, CYP2D6*6', 'CYP2D6*1, CYP2D6*2, CYP2D6*3, CYP2D6*4, CYP2D6*5, CYP2D6*6, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D6*10x2, CYP2D6*11, CYP2D6*17, CYP2D6*21, CYP2D6*36, CYP2D6*41', 'UGT1A3*1, UGT1A3*2, UGT1A3*3', 'HLA-B*55:01', 'CYP2C19*1, CYP2C19*17', 'NAT2*4, NAT2*5, NAT2*6, NAT2*7, NAT2*12, NAT2*13', 'CYP3A5*1, CYP3A5*3', 'CYP2C9*1, CYP2C9*3', 'CYP2C19*1, CYP2C19*2, CYP2C19*3', 'UGT1A1*1, UGT1A1*28', 'CYP2B6*1, CYP2B6*6', 'NUDT15*1, NUDT15*4, NUDT15*5, NUDT15*6', 'NUDT15*1, NUDT15*6', 'CYP2D6*1, CYP2D6*10', 'UGT1A1*1, UGT1A1*6', 'CYP2C9*1, CYP2C9*2, CYP2C9*3', 'HLA-B*48:01', 'CYP2C19*1, CYP2C19*2, CYP2C19*17', 'HLA-B*15:12', 'CYP2D6*1, CYP2D6*2, CYP2D6*2xN, CYP2D6*3, CYP2D6*4, CYP2D6*6', 'CYP2C19*1, CYP2C19*2', 'CYP2C19*1, CYP2C19*2, CYP2C19*3, CYP2C19*8, CYP2C19*9, CYP2C19*17', 'CYP2C8*1, CYP2C8*2, CYP2C8*3, CYP2C8*4', 'CYP2D6*1, CYP2D6*1xN, CYP2D6*2, CYP2D6*2xN, CYP2D6*3, CYP2D6*4, CYP2D6*5, CYP2D6*6, CYP2D6*7', 'HLA-B*38:02', 'CYP2C19*2', 'HLA-B*13:01', 'HLA-B*58:01', 'CYP2D6*2, CYP2D6*10', 'CYP2D6*1, CYP2D6*4', 'HLA-B*51:01', 'CYP2D6*1, CYP2D6*2, CYP2D6*3, CYP2D6*4, CYP2D6*5, CYP2D6*6, CYP2D6*10, CYP2D6*17, CYP2D6*29, CYP2D6*35, CYP2D6*41', 'CYP2D6*1, CYP2D6*4, CYP2D6*5, CYP2D6*6, CYP2D6*10, CYP2D6*14', 'CYP2D6*1, CYP2D6*1xN, CYP2D6*2, CYP2D6*2xN, CYP2D6*3, CYP2D6*4, CYP2D6*5, CYP2D6*6, CYP2D6*10, CYP2D6*17, CYP2D6*29, CYP2D6*36, CYP2D6*41', 'CYP3A4*1, CYP3A4*18, CYP3A4*20, CYP3A4*22', 'CYP2D6*1, CYP2D6*3, CYP2D6*4, CYP2D6*5, CYP2D6*10', 'CYP2B6*1, CYP2B6*5', 'NAT2*4, NAT2*5A, NAT2*5B, NAT2*5C, NAT2*6A, NAT2*6B, NAT2*6J, NAT2*6O, NAT2*7A, NAT2*7B, NAT2*7G, NAT2*12A, NAT2*13A, NAT2*14A', 'CYP2D6*1, CYP2D6*3, CYP2D6*4', 'HLA-A*33:03', 'CYP2D6*1, CYP2D6*3, CYP2D6*4, CYP2D6*5', 'CYP2C9*1, CYP2C9*2, 
CYP2C9*3, CYP2C9*5, CYP2C9*6, CYP2C9*8, CYP2C9*11, CYP2C9*13, CYP2C9*14, CYP2C9*16, CYP2C9*29, CYP2C9*31, CYP2C9*33, CYP2C9*37, CYP2C9*39, CYP2C9*42, CYP2C9*43, CYP2C9*45, CYP2C9*50, CYP2C9*52, CYP2C9*55', 'CYP2B6*1, CYP2B6*4, CYP2B6*5, CYP2B6*6, CYP2B6*7', 'G6PD A- 202A_376G, G6PD B (reference)', 'CYP2D6*1, CYP2D6*5, CYP2D6*10', 'CYP2D6*1, CYP2D6*4, CYP2D6*4xN, CYP2D6*5, CYP2D6*10, CYP2D6*17, CYP2D6*92, CYP2D6*96'], dtype=object)
# Not necessarily an important distinction, but just to check...
# Split the names into those that use star (*) nomenclature and those that do not.
star_allele_names, no_star_names = [], []
for haplotype_name in names:
    bucket = star_allele_names if '*' in haplotype_name else no_star_names
    bucket.append(haplotype_name)
no_star_names
['G6PD A- 202A_376G, G6PD B (reference)', 'GSTT1 non-null, GSTT1 null', 'GSTM1 non-null, GSTM1 null', 'G6PD A- 202A_376G, G6PD B (reference), G6PD Mediterranean, Dallas, Panama, Sassari, Cagliari, Birmingham', 'SLC6A4 HTTLPR long form (L allele), SLC6A4 HTTLPR short form (S allele)', 'G6PD B (reference), G6PD Mediterranean Haplotype', 'G6PD B (reference), G6PD Mediterranean, Dallas, Panama, Sassari, Cagliari, Birmingham', 'G6PD B (reference), G6PD Canton, Taiwan-Hakka, Gifu-like, Agrigento-like', 'G6PD B (reference), G6PD Mediterranean Haplotype, G6PD Mediterranean, Dallas, Panama, Sassari, Cagliari, Birmingham', 'G6PD A- 202A_376G, G6PD B (reference), G6PD Mediterranean Haplotype, G6PD Mediterranean, Dallas, Panama, Sassari, Cagliari, Birmingham', 'G6PD A- 202A_376G']
No star allele observations:
Note that we can still get the affected genes for all of these alleles directly from PGKB.
# Confirming there are no missing genes in any of these
no_rs_annotations['Gene'].isna().any()
False
# Do not truncate cell contents when displaying dataframes.
pd.set_option('display.max_colwidth', None)
# Pair each per-allele row with its clinical annotation (restricted to the no-rsID annotations).
joined_df = pd.merge(alleles_df, no_rs_annotations, on='Clinical Annotation ID')
# Remove some columns to make things easier to read...
columns_to_show = [
    'Clinical Annotation ID', 'Genotype/Allele', 'Annotation Text',
    'Allele Function', 'Variant/Haplotypes', 'Gene', 'Level of Evidence',
    'Phenotype Category', 'Drug(s)', 'Phenotype(s)',
]
joined_df = joined_df[columns_to_show]
# https://www.pharmgkb.org/clinicalAnnotation/1451259580
joined_df[joined_df['Clinical Annotation ID'] == '1451259580']
Clinical Annotation ID | Genotype/Allele | Annotation Text | Allele Function | Variant/Haplotypes | Gene | Level of Evidence | Phenotype Category | Drug(s) | Phenotype(s) | |
---|---|---|---|---|---|---|---|---|---|---|
1 | 1451259580 | *1 | The CYP2D6*1 allele is assigned as a normal function allele by CPIC. Patients carrying the CYP2D6*1 allele in combination with alleles that result in a normal metabolizer phenotype who are treated with amitriptyline may have decreased likelihood of side effects as compared to patients with a combination of alleles that result in intermediate or poor metabolizer phenotype. Other genetic and clinical factors may also influence response to amitriptyline. | Normal function | CYP2D6*1, CYP2D6*1xN, CYP2D6*2, CYP2D6*3, CYP2D6*4, CYP2D6*5, CYP2D6*6, CYP2D6*10, CYP2D6*41 | CYP2D6 | 1A | Toxicity | amitriptyline | Depressive Disorder |
2 | 1451259580 | *1xN | The CYP2D6*1xN alleles (*1x2 and *1x≥3) have been assigned as increased function alleles by CPIC. Patients carrying the CYP2D6*1xN allele in combination with alleles that result in a normal metabolizer phenotype who are treated with amitriptyline may have decreased likelihood of side effects as compared to patients with a combination of alleles that result in intermediate or poor metabolizer phenotype. Other genetic and clinical factors may also influence response to amitriptyline. | Increased function | CYP2D6*1, CYP2D6*1xN, CYP2D6*2, CYP2D6*3, CYP2D6*4, CYP2D6*5, CYP2D6*6, CYP2D6*10, CYP2D6*41 | CYP2D6 | 1A | Toxicity | amitriptyline | Depressive Disorder |
3 | 1451259580 | *2 | The CYP2D6*2 allele is assigned as a normal function allele by CPIC. Patients carrying the CYP2D6*2 allele in combination with alleles that result in a normal metabolizer phenotype who are treated with amitriptyline may have decreased likelihood of side effects as compared to patients with a combination of alleles that result in intermediate or poor metabolizer phenotype. Other genetic and clinical factors may also influence response to amitriptyline. | Normal function | CYP2D6*1, CYP2D6*1xN, CYP2D6*2, CYP2D6*3, CYP2D6*4, CYP2D6*5, CYP2D6*6, CYP2D6*10, CYP2D6*41 | CYP2D6 | 1A | Toxicity | amitriptyline | Depressive Disorder |
4 | 1451259580 | *3 | The CYP2D6*3 allele is assigned as a no function allele by CPIC. Patients carrying the CYP2D6*3 allele in combination with with alleles that result in intermediate or poor metabolizer phenotype who are treated with amitriptyline may have increased likelihood of side effects as compared to patients with alleles that result in a normal metabolizer phenotype. Other genetic and clinical factors may also influence response to amitriptyline. | No function | CYP2D6*1, CYP2D6*1xN, CYP2D6*2, CYP2D6*3, CYP2D6*4, CYP2D6*5, CYP2D6*6, CYP2D6*10, CYP2D6*41 | CYP2D6 | 1A | Toxicity | amitriptyline | Depressive Disorder |
5 | 1451259580 | *4 | The CYP2D6*4 allele is assigned as a no function allele by CPIC. Patients carrying the CYP2D6*4 allele in combination with with alleles that result in intermediate or poor metabolizer phenotype who are treated with amitriptyline may have increased likelihood of side effects as compared to patients with alleles that result in a normal metabolizer phenotype. Other genetic and clinical factors may also influence response to amitriptyline. | No function | CYP2D6*1, CYP2D6*1xN, CYP2D6*2, CYP2D6*3, CYP2D6*4, CYP2D6*5, CYP2D6*6, CYP2D6*10, CYP2D6*41 | CYP2D6 | 1A | Toxicity | amitriptyline | Depressive Disorder |
6 | 1451259580 | *5 | The CYP2D6*5 allele is assigned as a no function allele by CPIC. Patients carrying the CYP2D6*5 allele in combination with with alleles that result in intermediate or poor metabolizer phenotype who are treated with amitriptyline may have increased likelihood of side effects as compared to patients with alleles that result in a normal metabolizer phenotype. Other genetic and clinical factors may also influence response to amitriptyline. | No function | CYP2D6*1, CYP2D6*1xN, CYP2D6*2, CYP2D6*3, CYP2D6*4, CYP2D6*5, CYP2D6*6, CYP2D6*10, CYP2D6*41 | CYP2D6 | 1A | Toxicity | amitriptyline | Depressive Disorder |
7 | 1451259580 | *6 | The CYP2D6*6 allele is assigned as a no function allele by CPIC. Patients carrying the CYP2D6*6 allele in combination with with alleles that result in intermediate or poor metabolizer phenotype who are treated with amitriptyline may have increased likelihood of side effects as compared to patients with alleles that result in a normal metabolizer phenotype. Other genetic and clinical factors may also influence response to amitriptyline. | No function | CYP2D6*1, CYP2D6*1xN, CYP2D6*2, CYP2D6*3, CYP2D6*4, CYP2D6*5, CYP2D6*6, CYP2D6*10, CYP2D6*41 | CYP2D6 | 1A | Toxicity | amitriptyline | Depressive Disorder |
8 | 1451259580 | *10 | The CYP2D6*10 allele is assigned as a decreased function allele with an activity value of 0.25 by CPIC. Patients carrying the CYP2D6*10 allele in combination with with alleles that result in intermediate or poor metabolizer phenotype who are treated with amitriptyline may have increased likelihood of side effects as compared to patients with alleles that result in a normal metabolizer phenotype. Other genetic and clinical factors may also influence response to amitriptyline. | Decreased function | CYP2D6*1, CYP2D6*1xN, CYP2D6*2, CYP2D6*3, CYP2D6*4, CYP2D6*5, CYP2D6*6, CYP2D6*10, CYP2D6*41 | CYP2D6 | 1A | Toxicity | amitriptyline | Depressive Disorder |
9 | 1451259580 | *41 | The CYP2D6*41 allele is assigned as a decreased function allele with an activity value of 0.5 by CPIC. Patients carrying the CYP2D6*41 allele in combination with alleles that result in intermediate or poor metabolizer phenotype who are treated with amitriptyline may have increased likelihood of side effects as compared to patients with alleles that result in a normal metabolizer phenotype. Other genetic and clinical factors may also influence response to amitriptyline. | Decreased function | CYP2D6*1, CYP2D6*1xN, CYP2D6*2, CYP2D6*3, CYP2D6*4, CYP2D6*5, CYP2D6*6, CYP2D6*10, CYP2D6*41 | CYP2D6 | 1A | Toxicity | amitriptyline | Depressive Disorder |
# https://www.pharmgkb.org/clinicalAnnotation/1448427588
joined_df[joined_df['Clinical Annotation ID'] == '1448427588']
Clinical Annotation ID | Genotype/Allele | Annotation Text | Allele Function | Variant/Haplotypes | Gene | Level of Evidence | Phenotype Category | Drug(s) | Phenotype(s) | |
---|---|---|---|---|---|---|---|---|---|---|
1551 | 1448427588 | non-null/non-null | Patients with the non-null/non-null genotype may have a decreased risk for neutropenia when treated with clozapine as compared to patients with the null/null genotype. Other genetic and clinical factors may also influence neutropenia risk. | NaN | GSTT1 non-null, GSTT1 null | GSTT1 | 3 | Toxicity | clozapine | NaN |
1552 | 1448427588 | null/non-null | Patients with the null/non-null genotype may have a decreased risk for neutropenia when treated with clozapine as compared to patients with the null/null genotype. Other genetic and clinical factors may also influence neutropenia risk. | NaN | GSTT1 non-null, GSTT1 null | GSTT1 | 3 | Toxicity | clozapine | NaN |
1553 | 1448427588 | null/null | Patients with the null/null genotype may have an increased risk for neutropenia when treated with clozapine as compared to patients with the null/non-null or non-null/non-null genotype. Other genetic and clinical factors may also influence neutropenia risk. | NaN | GSTT1 non-null, GSTT1 null | GSTT1 | 3 | Toxicity | clozapine | NaN |
# https://www.pharmgkb.org/clinicalAnnotation/981419263
joined_df[joined_df['Clinical Annotation ID'] == '981419263']
Clinical Annotation ID | Genotype/Allele | Annotation Text | Allele Function | Variant/Haplotypes | Gene | Level of Evidence | Phenotype Category | Drug(s) | Phenotype(s) | |
---|---|---|---|---|---|---|---|---|---|---|
498 | 981419263 | *15:02 | Patients with one or two copies of the HLA-B*15:02 allele may have an increased risk of Severe Cutaneous Adverse Reactions when treated with carbamazepine as compared to patients with no HLA-B*15:02 alleles or negative for the HLA-B*15:02 test. However, conflicting evidence has been reported. Other genetic and clinical factors may also influence risk of carbamazepine-induced adverse reactions. | Presence | HLA-B*15:02, HLA-B*15:11 | HLA-B | 1A | Toxicity | carbamazepine | drug reaction with eosinophilia and systemic symptoms;Epidermal Necrolysis, Toxic;Maculopapular Exanthema;severe cutaneous adverse reactions;Stevens-Johnson Syndrome |
499 | 981419263 | *15:11 | Patients with one or two copies of the HLA-B*15:11 allele may have an increased risk of Severe Cutaneous Adverse Reactions, such as Stevens-Johnson Syndrome and Toxic Epidermal Necrolysis, when treated with carbamazepine as compared to patients with no HLA-B*15:11 alleles or negative for the HLA-B*15:11 test. However, conflicting evidence has been reported. Other genetic and clinical factors may also influence risk of carbamazepine-induced adverse reactions. | NaN | HLA-B*15:02, HLA-B*15:11 | HLA-B | 1A | Toxicity | carbamazepine | drug reaction with eosinophilia and systemic symptoms;Epidermal Necrolysis, Toxic;Maculopapular Exanthema;severe cutaneous adverse reactions;Stevens-Johnson Syndrome |
# https://www.pharmgkb.org/clinicalAnnotation/1183621000
joined_df[joined_df['Clinical Annotation ID'] == '1183621000']
Clinical Annotation ID | Genotype/Allele | Annotation Text | Allele Function | Variant/Haplotypes | Gene | Level of Evidence | Phenotype Category | Drug(s) | Phenotype(s) | |
---|---|---|---|---|---|---|---|---|---|---|
524 | 1183621000 | A- 202A_376G | Patients with one X-chromosome and the A- 202A_376G allele who are treated with rasburicase may have an increased risk of methemoglobinemia and/or hemolysis as compared to patients with the reference B allele (non-deficient, class IV). Patients with two X-chromosomes and the A- 202A_376G allele in combination with another deficient class I-III allele who are treated with rasburicase may have an increased risk of methemoglobinemia and/or hemolysis as compared to patients with two copies of the reference B allele (non-deficient, class IV). Patients with two X-chromosomes and the A- 202A_376G allele in combination with a non-deficient allele who are treated with rasburicase have an unknown risk of methemoglobinemia and/or hemolysis as compared to patients with two copies of the reference B allele (non-deficient, class IV). Other genetic and clinical factors may also influence risk of drug-induced hemolysis. | III/Deficient | G6PD A- 202A_376G, G6PD B (reference), G6PD Mediterranean, Dallas, Panama, Sassari, Cagliari, Birmingham | G6PD | 1A | Toxicity | rasburicase | Hemolysis;Methemoglobinemia |
525 | 1183621000 | B (reference) | Patients with one X-chromosome and the reference B (reference) allele (non-deficient, class IV) who are treated with rasburicase may have a decreased risk of methemoglobinemia and/or hemolysis as compared to patients with a deficient class I-III allele. Patients with two X-chromosomes and two copies of the reference B allele (non-deficient, class IV) who are treated with rasburicase may have a decreased risk of methemoglobinemia and/or hemolysis as compared to patients with a deficient class I-III allele. Patients with two X-chromosomes, one copy of the reference B allele (non-deficient, class IV) and one deficient class I-III allele who are treated with rasburicase have an unknown risk of methemoglobinemia and/or hemolysis as compared to patients with two copies of the reference B allele (non-deficient, class IV). Other genetic and clinical factors may also influence risk of drug-induced hemolysis. | IV/Normal | G6PD A- 202A_376G, G6PD B (reference), G6PD Mediterranean, Dallas, Panama, Sassari, Cagliari, Birmingham | G6PD | 1A | Toxicity | rasburicase | Hemolysis;Methemoglobinemia |
526 | 1183621000 | Mediterranean, Dallas, Panama, Sassari, Cagliari, Birmingham | Patients with one X-chromosome and the Mediterranean, Dallas, Panama, Sassari, Cagliari, Birmingham allele (rs5030868 allele A) who are treated with rasburicase may have an increased risk of methemoglobinemia and/or hemolysis as compared to patients with the reference B allele (non-deficient, class IV)(rs5030868 allele G). Patients with two X-chromosomes and the Mediterranean, Dallas, Panama' Sassari, Cagliari, Birmingham variant (rs5030868 allele A) in combination with another deficient class I-III allele who are treated with rasburicase may have an increased risk of methemoglobinemia and/or hemolysis as compared to patients with two copies of the reference B allele (non-deficient, class IV)(rs5030868 allele G). Patients with two X-chromosomes and the Mediterranean, Dallas, Panama' Sassari, Cagliari, Birmingham variant (rs5030868 allele A) in combination with a non-deficient allele who are treated with rasburicase have an unknown risk of methemoglobinemia and/or hemolysis as compared to patients with two copies of the reference B allele (non-deficient, class IV). Other genetic and clinical factors may also influence risk of drug-induced hemolysis. | II/Deficient | G6PD A- 202A_376G, G6PD B (reference), G6PD Mediterranean, Dallas, Panama, Sassari, Cagliari, Birmingham | G6PD | 1A | Toxicity | rasburicase | Hemolysis;Methemoglobinemia |
Notes:
`*1xN` means N copies of the `*1` version of the gene.
# Try to automatically get PGKB spreadsheet definitions
allele_definition_url = 'https://api.pharmgkb.org/v1/download/file/attachment/{gene}_allele_definition_table.xlsx'
genes = no_rs_annotations['Gene'].unique()
genes
array(['HLA-B', 'CYP2D6', 'UGT1A3', 'CYP2C19', 'NAT2', 'CYP3A5', 'CYP2C9', 'UGT1A1', 'CYP2B6', 'NUDT15', 'CYP2C8', 'CYP3A4', 'HLA-A', 'G6PD', 'UGT2B15', 'SLCO1B1', 'GSTT1', 'GSTM1', 'TPMT', 'SLC6A4', 'HLA-C', 'HLA-DRB1', 'HLA-DQB1', 'HLA-DPB1', 'CYP3A7', 'CYP2A6', 'HLA-DRB3', 'CYP1A2', 'UGT1A6', 'CYP2E1', 'UGT1A7', 'HLA-DQA1', 'UGT1A4', 'CYP1A1', 'CYP4F2'], dtype=object)
# Fetch the allele definition spreadsheet of every gene, keyed by gene symbol.
# A gene whose table cannot be fetched or parsed is reported and left out of the dict.
allele_def_tables = {}
for gene in genes:
    try:
        allele_def_tables[gene] = pd.read_excel(
            allele_definition_url.format(gene=gene),
            storage_options={'User-Agent': 'Mozilla/5.0'},
            # Keep the spreadsheet's own header rows as ordinary data rows.
            header=None,
        )
    except Exception as e:
        # Best effort: report the failure and carry on with the remaining genes.
        print(f'Error for {gene}: {e}')
/home/april/projects/opentargets-pharmgkb/venv/lib/python3.8/site-packages/openpyxl/styles/stylesheet.py:226: UserWarning: Workbook contains no default style, apply openpyxl's default warn("Workbook contains no default style, apply openpyxl's default") /home/april/projects/opentargets-pharmgkb/venv/lib/python3.8/site-packages/openpyxl/styles/stylesheet.py:226: UserWarning: Workbook contains no default style, apply openpyxl's default warn("Workbook contains no default style, apply openpyxl's default")
Error for UGT1A3: HTTP Error 404:
/home/april/projects/opentargets-pharmgkb/venv/lib/python3.8/site-packages/openpyxl/styles/stylesheet.py:226: UserWarning: Workbook contains no default style, apply openpyxl's default warn("Workbook contains no default style, apply openpyxl's default")
Error for NAT2: HTTP Error 404:
/home/april/projects/opentargets-pharmgkb/venv/lib/python3.8/site-packages/openpyxl/styles/stylesheet.py:226: UserWarning: Workbook contains no default style, apply openpyxl's default warn("Workbook contains no default style, apply openpyxl's default") /home/april/projects/opentargets-pharmgkb/venv/lib/python3.8/site-packages/openpyxl/styles/stylesheet.py:226: UserWarning: Workbook contains no default style, apply openpyxl's default warn("Workbook contains no default style, apply openpyxl's default") /home/april/projects/opentargets-pharmgkb/venv/lib/python3.8/site-packages/openpyxl/styles/stylesheet.py:226: UserWarning: Workbook contains no default style, apply openpyxl's default warn("Workbook contains no default style, apply openpyxl's default") /home/april/projects/opentargets-pharmgkb/venv/lib/python3.8/site-packages/openpyxl/styles/stylesheet.py:226: UserWarning: Workbook contains no default style, apply openpyxl's default warn("Workbook contains no default style, apply openpyxl's default") /home/april/projects/opentargets-pharmgkb/venv/lib/python3.8/site-packages/openpyxl/styles/stylesheet.py:226: UserWarning: Workbook contains no default style, apply openpyxl's default warn("Workbook contains no default style, apply openpyxl's default") /home/april/projects/opentargets-pharmgkb/venv/lib/python3.8/site-packages/openpyxl/styles/stylesheet.py:226: UserWarning: Workbook contains no default style, apply openpyxl's default warn("Workbook contains no default style, apply openpyxl's default") /home/april/projects/opentargets-pharmgkb/venv/lib/python3.8/site-packages/openpyxl/styles/stylesheet.py:226: UserWarning: Workbook contains no default style, apply openpyxl's default warn("Workbook contains no default style, apply openpyxl's default") /home/april/projects/opentargets-pharmgkb/venv/lib/python3.8/site-packages/openpyxl/styles/stylesheet.py:226: UserWarning: Workbook contains no default style, apply openpyxl's default warn("Workbook contains no default style, 
apply openpyxl's default") /home/april/projects/opentargets-pharmgkb/venv/lib/python3.8/site-packages/openpyxl/styles/stylesheet.py:226: UserWarning: Workbook contains no default style, apply openpyxl's default warn("Workbook contains no default style, apply openpyxl's default")
Error for UGT2B15: HTTP Error 404:
/home/april/projects/opentargets-pharmgkb/venv/lib/python3.8/site-packages/openpyxl/styles/stylesheet.py:226: UserWarning: Workbook contains no default style, apply openpyxl's default warn("Workbook contains no default style, apply openpyxl's default")
Error for GSTT1: HTTP Error 404: Error for GSTM1: HTTP Error 404:
/home/april/projects/opentargets-pharmgkb/venv/lib/python3.8/site-packages/openpyxl/styles/stylesheet.py:226: UserWarning: Workbook contains no default style, apply openpyxl's default warn("Workbook contains no default style, apply openpyxl's default")
Error for SLC6A4: HTTP Error 404:
/home/april/projects/opentargets-pharmgkb/venv/lib/python3.8/site-packages/openpyxl/styles/stylesheet.py:226: UserWarning: Workbook contains no default style, apply openpyxl's default warn("Workbook contains no default style, apply openpyxl's default") /home/april/projects/opentargets-pharmgkb/venv/lib/python3.8/site-packages/openpyxl/styles/stylesheet.py:226: UserWarning: Workbook contains no default style, apply openpyxl's default warn("Workbook contains no default style, apply openpyxl's default") /home/april/projects/opentargets-pharmgkb/venv/lib/python3.8/site-packages/openpyxl/styles/stylesheet.py:226: UserWarning: Workbook contains no default style, apply openpyxl's default warn("Workbook contains no default style, apply openpyxl's default") /home/april/projects/opentargets-pharmgkb/venv/lib/python3.8/site-packages/openpyxl/styles/stylesheet.py:226: UserWarning: Workbook contains no default style, apply openpyxl's default warn("Workbook contains no default style, apply openpyxl's default")
Error for CYP3A7: HTTP Error 404:
/home/april/projects/opentargets-pharmgkb/venv/lib/python3.8/site-packages/openpyxl/styles/stylesheet.py:226: UserWarning: Workbook contains no default style, apply openpyxl's default warn("Workbook contains no default style, apply openpyxl's default") /home/april/projects/opentargets-pharmgkb/venv/lib/python3.8/site-packages/openpyxl/styles/stylesheet.py:226: UserWarning: Workbook contains no default style, apply openpyxl's default warn("Workbook contains no default style, apply openpyxl's default")
Error for CYP1A2: HTTP Error 404: Error for UGT1A6: HTTP Error 404: Error for CYP2E1: HTTP Error 404: Error for UGT1A7: HTTP Error 404:
/home/april/projects/opentargets-pharmgkb/venv/lib/python3.8/site-packages/openpyxl/styles/stylesheet.py:226: UserWarning: Workbook contains no default style, apply openpyxl's default warn("Workbook contains no default style, apply openpyxl's default")
Error for UGT1A4: HTTP Error 404: Error for CYP1A1: HTTP Error 404:
/home/april/projects/opentargets-pharmgkb/venv/lib/python3.8/site-packages/openpyxl/styles/stylesheet.py:226: UserWarning: Workbook contains no default style, apply openpyxl's default warn("Workbook contains no default style, apply openpyxl's default")
# Genes with allele tables (i.e. those whose download succeeded)
pharmgkb_genes = set(allele_def_tables)
pharmgkb_genes
{'CYP2A6', 'CYP2B6', 'CYP2C19', 'CYP2C8', 'CYP2C9', 'CYP2D6', 'CYP3A4', 'CYP3A5', 'CYP4F2', 'G6PD', 'HLA-A', 'HLA-B', 'HLA-C', 'HLA-DPB1', 'HLA-DQA1', 'HLA-DQB1', 'HLA-DRB1', 'HLA-DRB3', 'NUDT15', 'SLCO1B1', 'TPMT', 'UGT1A1'}
no_allele_def_table_genes = set(genes) - pharmgkb_genes
no_allele_def_table_genes
{'CYP1A1', 'CYP1A2', 'CYP2E1', 'CYP3A7', 'GSTM1', 'GSTT1', 'NAT2', 'SLC6A4', 'UGT1A3', 'UGT1A4', 'UGT1A6', 'UGT1A7', 'UGT2B15'}
I checked a few genes from this list and they indeed don't have definition tables in PharmGKB. Categories I see:
For now we'll skip these and look at those with allele definition tables (covers about 90% of no-RS records in PGKB).
# What do we lose if we skip these?
len(no_rs_annotations[no_rs_annotations['Gene'].isin(no_allele_def_table_genes)])
53
Note that the allele definition tables vary in informativeness, so just because one is present doesn't mean we can necessarily use it.
Understanding the allele definition table:
CYP2D7::CYP2D6 hybrid gene
# Compare with what we would get from PharmVar
pharmvar_url = 'https://www.pharmvar.org/api-service/alleles?exclude-sub-alleles=false&include-reference-variants=false&include-retired-alleles=false&include-retired-reference-sequences=false'
# Bound the wait, and fail on an HTTP error status instead of trying to parse
# an error response as JSON.
response = requests.get(pharmvar_url, timeout=60)
response.raise_for_status()
pharmvar_data = response.json()
# 1 per allele
len(pharmvar_data)
1945
pharmvar_data[0]
{'geneSymbol': 'CYP2C9', 'alleleName': 'CYP2C9*49.001', 'pvId': 'PV00001', 'legacyLabel': 'CYP2C9*49', 'coreAllele': 'CYP2C9*49', 'evidenceLevel': 'L', 'description': None, 'function': 'uncertain function', 'activeInd': True, 'references': [{'citation': 'Dai et al. 2013', 'url': 'http://www.ncbi.nlm.nih.gov/pubmed/23400009'}], 'variants': [{'referenceSequence': 'NG_008385.2', 'referenceLocation': 'ATG Start', 'referenceCollections': ['RefSeqGene'], 'hgvs': 'NG_008385.2:g.15972A>G', 'rsId': None, 'impact': 'I222V', 'variantFrequency': [], 'url': 'https://www.pharmvar.org/variant/13610', 'variantId': '49', 'position': 'NG_008385.2:g.10447A>G'}, {'referenceSequence': 'NC_000010.10', 'referenceLocation': 'Sequence Start', 'referenceCollections': ['GRCh37'], 'hgvs': 'NC_000010.10:g.96708886A>G', 'rsId': None, 'impact': 'I222V', 'variantFrequency': [], 'url': 'https://www.pharmvar.org/variant/195', 'variantId': '49', 'position': 'NC_000010.10:g.96708886A>G'}, {'referenceSequence': 'NM_000771.4', 'referenceLocation': 'ATG Start', 'referenceCollections': ['RefSeqTranscript'], 'hgvs': 'NM_000771.4:c.664A>G', 'rsId': None, 'impact': 'I222V', 'variantFrequency': [], 'url': 'https://www.pharmvar.org/variant/13765', 'variantId': '49', 'position': 'NM_000771.4:c.664A>G'}, {'referenceSequence': 'NC_000010.11', 'referenceLocation': 'Sequence Start', 'referenceCollections': ['GRCh38'], 'hgvs': 'NC_000010.11:g.94949129A>G', 'rsId': None, 'impact': 'I222V', 'variantFrequency': [], 'url': 'https://www.pharmvar.org/variant/193', 'variantId': '49', 'position': 'NC_000010.11:g.94949129A>G'}, {'referenceSequence': 'NG_008385.2', 'referenceLocation': 'Sequence Start', 'referenceCollections': ['RefSeqGene'], 'hgvs': 'NG_008385.2:g.15972A>G', 'rsId': None, 'impact': 'I222V', 'variantFrequency': [], 'url': 'https://www.pharmvar.org/variant/13609', 'variantId': '49', 'position': 'NG_008385.2:g.15972A>G'}, {'referenceSequence': 'NM_000771.4', 'referenceLocation': 'Sequence Start', 
'referenceCollections': ['RefSeqTranscript'], 'hgvs': 'NM_000771.4:c.664A>G', 'rsId': None, 'impact': 'I222V', 'variantFrequency': [], 'url': 'https://www.pharmvar.org/variant/13766', 'variantId': '49', 'position': 'NM_000771.4:c.689A>G'}], 'alleleType': 'Sub', 'url': 'https://www.pharmvar.org/haplotype/PV00001', 'hgvs': 'NG_008385.2:g.15972A>G', 'variantGroups': []}
pharmvar_genes = {d['geneSymbol'] for d in pharmvar_data}
pharmgkb_genes - pharmvar_genes
{'G6PD', 'HLA-A', 'HLA-B', 'HLA-C', 'HLA-DPB1', 'HLA-DQA1', 'HLA-DQB1', 'HLA-DRB1', 'HLA-DRB3', 'TPMT', 'UGT1A1'}
pharmvar_genes - pharmgkb_genes
{'CYP2A13', 'DPYD'}
len(no_rs_annotations[no_rs_annotations['Gene'].isin(pharmvar_genes - pharmgkb_genes)])
0
Conclusion from this is that PharmVar probably has less information than PharmGKB; though most of the genes covered by PGKB and not by PV are "uninformative" tables, there are at least 2 exceptions (G6PD and UGT1A1). In contrast PV genes not covered by PGKB are not present in PGKB data.
I haven't compared the actual content of the PV vs. PGKB data, but I'm assuming it's similar since the PGKB tables are sourced directly from PV.
Implementation-wise, PV does have the advantage in that it has an actual API with JSON responses rather than spreadsheets.
# What we get "out of the box" - note first 7 rows & first column are headers
allele_def_tables['CYP2D6'].head(10)
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | GENE: CYP2D6 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | NG_008376.4 (ATG start) | 14C>T | 19G>A | 31G>A | 64delC | 73C>T | 77G>A | 82C>T | 100C>T | 122C>T | ... | 4165T>G | 4167T>C | 4168G>A | 4169C>G | 4170T>C | 4173C>T | 4181G>C | 4187C>T | 4214G>A | Structural Variation |
2 | Effect on protein (NP_000097.3) | p.A5V | p.V7M | p.V11M | p.L22X | p.R25W | p.R26H | p.R28C | p.P34S | p.P41L | ... | p.F481V | NaN | p.A482T | p.A482G | NaN | NaN | p.S486T | p.S488F | p.R497H | NaN |
3 | Position at NC_000022.11 (Homo sapiens chromosome 22, GRCh38.p13) | g.42130778G>A | g.42130773C>T | g.42130761C>T | g.42130729del | g.42130719G>A | g.42130715C>T | g.42130710G>A | g.42130692G>A | g.42130670G>A | ... | g.42126627A>C | g.42126625A>G | g.42126624C>T | g.42126623G>C | g.42126622A>G | g.42126619G>A | g.42126611C>G | g.42126605G>A | g.42126578C>T | NaN |
4 | Position at NG_008376.4 (CYP2D6 RefSeqGene; reverse relative to chromosome) | g.5033C>T | g.5038G>A | g.5050G>A | g.5083del | g.5092C>T | g.5096G>A | g.5101C>T | g.5119C>T | g.5141C>T | ... | g.9184T>G | g.9186T>C | g.9187G>A | g.9188C>G | g.9189T>C | g.9192C>T | g.9200G>C | g.9206C>T | g.9233G>A | NaN |
5 | rsID | rs773790593 | rs72549358 | rs769258 | NaN | rs267608313 | rs28371696 | rs138100349 | rs1065852 | rs373243894 | ... | NaN | NaN | rs74478221 | rs75467367 | rs747998333 | rs28371736 | rs1135840 | rs568495591 | rs1440526469 | NaN |
6 | CYP2D6 Allele | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
7 | *1 | G | C | C | G | G | C | G | G | G | ... | A | A | C | G | A | G | C | G | C | NaN |
8 | *2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | G | NaN | NaN | NaN |
9 | *3 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
10 rows × 151 columns
allele_def_tables['HLA-A'].head(10)
0 | 1 | |
---|---|---|
0 | GENE: HLA-A | NaN |
1 | NaN | NaN |
2 | Effect on protein | NaN |
3 | Position on chromosomal sequence | NaN |
4 | Position on gene sequence | NaN |
5 | rsID | NaN |
6 | HLA-A Allele | NaN |
7 | *01:01 | Not Callable |
8 | *01:02 | Not Callable |
9 | *01:03 | Not Callable |
# Table layout (see the CYP2D6 and HLA-A previews): the first 7 rows and the first
# column are headers, so the remaining rows are alleles and the remaining columns
# are variants.
NUM_HEADER_ROWS = 7
NUM_HEADER_COLS = 1

# If there are more than 2 columns we'll assume the table is informative
allele_def_metrics = []
informative_tables = []
for gene, table in allele_def_tables.items():
    num_rows, num_cols = table.shape
    allele_def_metrics.append({
        'gene': gene,
        'num_alleles': num_rows - NUM_HEADER_ROWS,
        'num_variants': num_cols - NUM_HEADER_COLS,
    })
    if num_cols > 2:
        informative_tables.append(gene)
allele_def_metrics
[{'gene': 'HLA-B', 'num_alleles': 1793, 'num_variants': 1}, {'gene': 'CYP2D6', 'num_alleles': 163, 'num_variants': 150}, {'gene': 'CYP2C19', 'num_alleles': 36, 'num_variants': 35}, {'gene': 'CYP3A5', 'num_alleles': 6, 'num_variants': 5}, {'gene': 'CYP2C9', 'num_alleles': 85, 'num_variants': 80}, {'gene': 'UGT1A1', 'num_alleles': 9, 'num_variants': 4}, {'gene': 'CYP2B6', 'num_alleles': 48, 'num_variants': 48}, {'gene': 'NUDT15', 'num_alleles': 20, 'num_variants': 17}, {'gene': 'CYP2C8', 'num_alleles': 18, 'num_variants': 17}, {'gene': 'CYP3A4', 'num_alleles': 45, 'num_variants': 42}, {'gene': 'HLA-A', 'num_alleles': 1332, 'num_variants': 1}, {'gene': 'G6PD', 'num_alleles': 187, 'num_variants': 173}, {'gene': 'SLCO1B1', 'num_alleles': 44, 'num_variants': 32}, {'gene': 'TPMT', 'num_alleles': 46, 'num_variants': 43}, {'gene': 'HLA-C', 'num_alleles': 955, 'num_variants': 1}, {'gene': 'HLA-DRB1', 'num_alleles': 763, 'num_variants': 1}, {'gene': 'HLA-DQB1', 'num_alleles': 106, 'num_variants': 1}, {'gene': 'HLA-DPB1', 'num_alleles': 127, 'num_variants': 1}, {'gene': 'CYP2A6', 'num_alleles': 51, 'num_variants': 64}, {'gene': 'HLA-DRB3', 'num_alleles': 45, 'num_variants': 1}, {'gene': 'HLA-DQA1', 'num_alleles': 24, 'num_variants': 1}, {'gene': 'CYP4F2', 'num_alleles': 16, 'num_variants': 14}]
len(informative_tables) / len(allele_def_metrics)
0.6363636363636364
# Count of non-rs clinical annotation records involving genes with informative allele definition tables
len(no_rs_annotations[no_rs_annotations['Gene'].isin(informative_tables)])
381