Author: Moshe Silverstein
Date: 03-2018
Data Source Home: http://www.drugbank.ca/
Data Source Download: https://www.drugbank.ca/releases/latest
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import untility_functions as uf
%matplotlib inline
from clustergrammer_widget import *
net = Network(clustergrammer_widget)
importlib.reload(uf)
<module 'untility_functions' from '/Users/moshesilverstein/Documents/Harmonizome/Drugbank/untility_functions.py'>
%load_ext version_information
%version_information numpy, pandas, clustergrammer_widget
Software | Version |
---|---|
Python | 3.5.2 64bit [GCC 4.2.1 Compatible Apple LLVM 4.2 (clang-425.0.28)] |
IPython | 5.3.0 |
OS | Darwin 17.2.0 x86_64 i386 64bit |
numpy | 1.13.1 |
pandas | 0.21.0 |
clustergrammer_widget | 1.9.0 |
Wed Mar 28 17:58:10 2018 EDT |
# Output directory: every result file written by this notebook is built as
# path + <file name>. NOTE(review): this is a machine-specific absolute path
# and must be adjusted before running elsewhere.
path = '/Users/moshesilverstein/Documents/Harmonizome/Drugbank/Output/'
# Enzyme polypeptide table; read relative to the current working directory.
# The 'Gene Name', 'Species' and 'Drug IDs' columns are the ones used later.
df = pd.read_csv('Input/drugbank_all_enzyme_polypeptide_ids.csv/all.csv')
df.head()
ID | Name | Gene Name | GenBank Protein ID | GenBank Gene ID | UniProt ID | Uniprot Title | PDB ID | GeneCard ID | GenAtlas ID | HGNC ID | Species | Drug IDs | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Q9UI32 | Glutaminase liver isoform, mitochondrial | GLS2 | 6650606.0 | AF110330 | Q9UI32 | GLSL_HUMAN | 4BQM | NaN | GLS2 | HGNC:29570 | Human | DB00130 |
1 | P00488 | Coagulation factor XIII A chain | F13A1 | 182309.0 | M22001 | P00488 | F13A_HUMAN | 1EVU; 1EX0; 1F13; 1FIE; 1GGT; 1GGU; 1GGY; 1QRK... | NaN | F13A1 | HGNC:3531 | Human | DB00130 |
2 | P35228 | Nitric oxide synthase, inducible | NOS2 | 292242.0 | L09210 | P35228 | NOS2_HUMAN | 1NSI; 2LL6; 2NSI; 3E7G; 3EJ8; 3HR4; 4CX7; 4NOS | NaN | NOS2A | HGNC:7873 | Human | DB00997 |
3 | P11766 | Alcohol dehydrogenase class-3 | ADH5 | 178134.0 | M30471 | P11766 | ADHX_HUMAN | 1M6H; 1M6W; 1MA0; 1MC5; 1MP0; 1TEH; 2FZE; 2FZW... | NaN | ADH5 | HGNC:253 | Human | DB00157; DB00898 |
4 | P00451 | Coagulation factor VIII | F8 | 182818.0 | M14113 | P00451 | FA8_HUMAN | 1CFG; 1D7P; 1FAC; 1IQD; 2R7E; 3CDZ; 3HNB; 3HNY... | NaN | F8 | HGNC:3546 | Human | DB13151 |
df.shape
(305, 13)
# Drug metadata table. index_col=0 turns the first CSV column into the index,
# so rows can be looked up by DrugBank ID (e.g. drug_meta.loc['DB00001', 'Name']).
drug_meta = pd.read_csv('Input/drugbank_all_drug_links.csv.zip', index_col=0)
drug_meta.head()
Name | CAS Number | Drug Type | KEGG Compound ID | KEGG Drug ID | PubChem Compound ID | PubChem Substance ID | ChEBI ID | PharmGKB ID | HET ID | ... | GenBank ID | DPD ID | RxList Link | Pdrhealth Link | Wikipedia ID | Drugs.com Link | NDC ID | ChemSpider ID | BindingDB ID | TTD ID | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
DrugBank ID | |||||||||||||||||||||
DB00001 | Lepirudin | 138068-37-8 | BiotechDrug | NaN | D06880 | NaN | 46507011.0 | NaN | PA450195 | NaN | ... | NaN | 11916 | http://www.rxlist.com/cgi/generic/lepirudin.htm | NaN | Lepirudin | http://www.drugs.com/cdi/lepirudin.html | NaN | NaN | NaN | DAP000541 |
DB00002 | Cetuximab | 205923-56-4 | BiotechDrug | NaN | D03455 | NaN | 46507042.0 | NaN | PA10040 | NaN | ... | J00228 | 13175 | http://www.rxlist.com/cgi/generic3/erbitux.htm | NaN | Cetuximab | http://www.drugs.com/cdi/cetuximab.html | NaN | NaN | NaN | DNC000788 |
DB00003 | Dornase alfa | 143831-71-4 | BiotechDrug | NaN | NaN | NaN | 46507792.0 | NaN | PA10318 | NaN | ... | M55983 | 650 | http://www.rxlist.com/cgi/generic/pulmozyme.htm | NaN | Dornase_alfa | http://www.drugs.com/cdi/dornase-alfa.html | NaN | NaN | NaN | DAP000981 |
DB00004 | Denileukin diftitox | 173146-27-5 | BiotechDrug | NaN | NaN | NaN | 46506950.0 | NaN | PA164750594 | NaN | ... | V01536 | NaN | http://www.rxlist.com/cgi/generic2/denileukin.htm | NaN | Denileukin_diftitox | http://www.drugs.com/cdi/denileukin-diftitox.html | NaN | NaN | NaN | DAP001098 |
DB00005 | Etanercept | 185243-69-0 | BiotechDrug | C07897 | D00742 | NaN | 46506732.0 | NaN | PA449515 | NaN | ... | M32315 | 12032 | http://www.rxlist.com/cgi/generic/etanercept.htm | NaN | Etanercept | http://www.drugs.com/cdi/etanercept.html | NaN | NaN | NaN | DNC000605 |
5 rows × 22 columns
drug_meta.shape
(10562, 22)
# Keep only the relevant species, stacked in the order human, mouse, rat,
# and retain just the gene name and its associated DrugBank IDs.
human, mouse, rat = (
    df[df['Species'] == species].copy()
    for species in ('Human', 'Mouse', 'Rat')
)
df = pd.concat([human, mouse, rat])[['Gene Name', 'Drug IDs']]
df.shape
(278, 2)
df.head()
Gene Name | Drug IDs | |
---|---|---|
0 | GLS2 | DB00130 |
1 | F13A1 | DB00130 |
2 | NOS2 | DB00997 |
3 | ADH5 | DB00157; DB00898 |
4 | F8 | DB13151 |
# Expand the semicolon-separated 'Drug IDs' field so that df_interactions
# holds one (Drug, Gene Name) row per drug-enzyme pair.
frames = []
n_rows = len(df.index)
for i, index in enumerate(df.index):
    progressPercent = ((i+1)/n_rows)*100
    sys.stdout.write("Progeres: %d%% %d Out of %d \r" % (progressPercent, (i+1), n_rows))
    sys.stdout.flush()
    gene_name = df.loc[index, 'Gene Name']
    # A missing gene name is read by pandas as a float NaN; skip those rows.
    if isinstance(gene_name, float):
        continue
    # 'DB00157; DB00898' splits into ['DB00157', ' DB00898']; strip the blank
    # so every ID is stored in its clean form.
    drug_ids = [drug_id.strip() for drug_id in df.loc[index, 'Drug IDs'].split(';')]
    temp = pd.DataFrame()
    temp['Drug'] = drug_ids
    # Keep only the part of the gene name before any '(' and repeat it once
    # per drug.
    temp['Gene Name'] = [gene_name.split('(')[0]]*len(drug_ids)
    frames.append(temp)
# Concatenate once at the end: calling pd.concat inside the loop re-copies all
# accumulated rows on every iteration (quadratic in the number of rows).
df_interactions = pd.concat(frames) if frames else pd.DataFrame()
Progeres: 100% 278 Out of 278
df_interactions.head()
Drug | Gene Name | |
---|---|---|
0 | DB00130 | GLS2 |
0 | DB00130 | F13A1 |
0 | DB00997 | NOS2 |
0 | DB00157 | ADH5 |
1 | DB00898 | ADH5 |
df_interactions.shape
(2441, 2)
# The concatenated frame carries repeated per-piece labels (0, 0, 0, 0, 1, ...);
# replace them with a fresh 0..n-1 index and discard the old labels.
df_interactions.reset_index(drop=True, inplace=True)
# Replace each DrugBank ID in df_interactions['Drug'] with the drug name stored
# under that ID in drug_meta (which is indexed by DrugBank ID).
# Raises KeyError if an ID is not present in drug_meta.
lst = []
n_interactions = len(df_interactions.index)
for i, index in enumerate(df_interactions.index):
    progressPercent = ((i+1)/n_interactions)*100
    sys.stdout.write("Progeres: %d%% %d Out of %d \r" % (progressPercent, (i+1), n_interactions))
    sys.stdout.flush()
    # IDs split out of a 'DB00157; DB00898' style field can carry surrounding
    # blanks. strip() handles leading and trailing blanks alike, whereas
    # split(' ')[1] only coped with exactly one leading blank and produced ''
    # for a trailing one.
    drug_id = df_interactions.loc[index, 'Drug'].strip()
    lst.append(drug_meta.loc[drug_id, 'Name'])
df_interactions['Drug'] = lst
Progeres: 100% 2441 Out of 2441
# Move the gene names into the index before handing the frame to
# uf.mapgenesymbols. Its return value is discarded, so whatever it does must
# happen in place — presumably it maps the index to approved gene symbols;
# TODO confirm against untility_functions.
df_interactions.set_index('Gene Name', inplace=True)
uf.mapgenesymbols(df_interactions)
Progeres: 100% 2441 Out of 2441
# Turn the index back into a regular column, then drop rows that are exact
# duplicates so each gene-drug pair appears once.
df_interactions.reset_index(inplace=True)
df_interactions.drop_duplicates(inplace=True)
df_interactions.shape
(2429, 2)
# Build the matrix that all later steps consume from the de-duplicated pairs.
# The displayed result has gene symbols as rows, drug names as columns and 0
# entries; presumably 1 marks a recorded gene-drug association — TODO confirm
# in untility_functions.
binary_matrix = uf.createBinaryMatrix(df_interactions)
Progeres: 100% 261 Out of 261
binary_matrix.head()
Raloxifene | Cimetidine | Adefovir Dipivoxil | Edetic Acid | Lorcaserin | Etodolac | Erdosteine | Oxazepam | Ifosfamide | Nalmefene | ... | Ephedrine | Glycine betaine | Granisetron | Oxycodone | Citalopram | Ticlopidine | Desipramine | Doxacurium chloride | Etomidate | Bupivacaine | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
TGM1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
HIF1AN | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
TDO2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
KYAT1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
MSRB1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 869 columns
binary_matrix.shape
(261, 869)
# Write the binary matrix as a gzip-compressed, tab-separated file stamped with
# the current year and month (YYYY_MM).
# The suffix is '.tsv.gz' (previously '.tsv.zip'): the data is written with
# gzip compression, and a '.zip' name makes tools that infer the format from
# the extension try to open it as a zip archive and fail.
filename = path+'drugbank_enzyme_binary_matrix_%s.tsv.gz'% datetime.date.today().strftime('%Y_%m')
binary_matrix.to_csv(filename, sep='\t', compression='gzip')
# Hand the binary matrix, the output directory and a base name to the helper.
# Presumably it writes a gene-set library file under path — the behaviour
# lives in untility_functions; TODO confirm.
name = 'drugbank_enzyme_gene_set'
uf.createUpGeneSetLib(binary_matrix, path, name)
Progeres: 100% 869 Out of 869
# Hand the binary matrix, the output directory and a base name to the helper.
# Presumably it writes an attribute-set library file under path — the
# behaviour lives in untility_functions; TODO confirm.
name = 'drugbank_enzyme_attribute_set'
uf.createUpAttributeSetLib(binary_matrix, path, name)
Progeres: 100% 261 Out of 261
# Similarity between the rows (genes) of binary_matrix; 'jaccard' is passed as
# the metric name. The displayed result is square, with gene symbols on both
# axes and 1.0 on the diagonal.
gene_similarity_matix = uf.createSimilarityMatrix(binary_matrix, 'jaccard')
gene_similarity_matix.head()
TGM1 | HIF1AN | TDO2 | KYAT1 | MSRB1 | SULT2A1 | GGH | TXNRD1 | ACO1 | CYP3A5 | ... | CMPK1 | MTR | PLA2G4A | NQO1 | CYP2D6 | C1R | TGM7 | GSR | ADA | SOD2 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
TGM1 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
HIF1AN | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
TDO2 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
KYAT1 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
MSRB1 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 261 columns
# Write the gene similarity matrix as a gzip-compressed, tab-separated file
# stamped with the current year and month (YYYY_MM).
# The suffix is '.tsv.gz' (previously '.tsv.zip'): the data is written with
# gzip compression, and a '.zip' name makes tools that infer the format from
# the extension try to open it as a zip archive and fail.
filename = path+'drugbank_enzyme_gene_similarity_matix_%s.tsv.gz'% datetime.date.today().strftime('%Y_%m')
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')
# Same helper applied to the transposed matrix, so the rows being compared are
# the drugs; the displayed result is square with drug names on both axes.
attribute_similarity_matix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard')
attribute_similarity_matix.head()
Raloxifene | Cimetidine | Adefovir Dipivoxil | Edetic Acid | Lorcaserin | Etodolac | Erdosteine | Oxazepam | Ifosfamide | Nalmefene | ... | Ephedrine | Glycine betaine | Granisetron | Oxycodone | Citalopram | Ticlopidine | Desipramine | Doxacurium chloride | Etomidate | Bupivacaine | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Raloxifene | 1.000000 | 0.071429 | 0.0 | 0.166667 | 0.142857 | 0.0 | 0.000000 | 0.000000 | 0.125000 | 0.0 | ... | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.111111 | 0.166667 | 0.142857 | 0.0 | 0.166667 | 0.00 |
Cimetidine | 0.071429 | 1.000000 | 0.0 | 0.000000 | 0.071429 | 0.0 | 0.000000 | 0.133333 | 0.066667 | 0.0 | ... | 0.0 | 0.0 | 0.083333 | 0.272727 | 0.307692 | 0.400000 | 0.071429 | 0.0 | 0.076923 | 0.25 |
Adefovir Dipivoxil | 0.000000 | 0.000000 | 1.0 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.0 | ... | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.00 |
Edetic Acid | 0.166667 | 0.000000 | 0.0 | 1.000000 | 0.000000 | 0.0 | 0.333333 | 0.000000 | 0.000000 | 0.0 | ... | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.200000 | 0.00 |
Lorcaserin | 0.142857 | 0.071429 | 0.0 | 0.000000 | 1.000000 | 0.0 | 0.000000 | 0.000000 | 0.285714 | 0.0 | ... | 0.0 | 0.0 | 0.200000 | 0.000000 | 0.111111 | 0.166667 | 0.333333 | 0.0 | 0.000000 | 0.00 |
5 rows × 869 columns
# Write the drug (attribute) similarity matrix as a gzip-compressed,
# tab-separated file stamped with the current year and month (YYYY_MM).
# The suffix is '.tsv.gz' (previously '.tsv.zip'): the data is written with
# gzip compression, and a '.zip' name makes tools that infer the format from
# the extension try to open it as a zip archive and fail.
filename = path+'drugbank_enzyme_attribute_similarity_matix_%s.tsv.gz'% datetime.date.today().strftime('%Y_%m')
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')
# net.load_df(attribute_similarity_matix.iloc[:,:].copy())
# # net.filter_N_top('row', rank_type='sum', N_top=300)
# net.cluster()
# net.widget()
# Gene table derived from binary_matrix; the displayed result has one row per
# gene with columns 'GeneSym' and 'GeneID'. NOTE(review): how GeneID is looked
# up is defined in untility_functions and not visible here.
gene_list = uf.createGeneList(binary_matrix)
Progeres: 100% 261 Out of 261
gene_list.head()
GeneSym | GeneID | |
---|---|---|
0 | TGM1 | 7051 |
1 | HIF1AN | 55662 |
2 | TDO2 | 6999 |
3 | KYAT1 | 883 |
4 | MSRB1 | 51734 |
gene_list.shape
(261, 2)
# Write the gene list (GeneSym, GeneID columns, no index) as a gzip-compressed,
# tab-separated file stamped with the current year and month (YYYY_MM).
# The suffix is '.tsv.gz' (previously '.tsv.zip'): the data is written with
# gzip compression, and a '.zip' name makes tools that infer the format from
# the extension try to open it as a zip archive and fail.
filename = path+'drugbank_enzyme_gene_list_%s.tsv.gz'% datetime.date.today().strftime('%Y_%m')
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')
# Attribute (drug) table derived from binary_matrix. NOTE(review): the
# displayed shape is (869, 0) — the drug names sit in the index (named
# 'Attributes') and the frame has no data columns.
attribute_list = uf.createAttributeList(binary_matrix)
attribute_list.head()
Attributes |
---|
Raloxifene |
Cimetidine |
Adefovir Dipivoxil |
Edetic Acid |
Lorcaserin |
attribute_list.shape
(869, 0)
# Write the attribute (drug) list as a gzip-compressed, tab-separated file
# stamped with the current year and month (YYYY_MM).
# The suffix is '.tsv.gz' (previously '.tsv.zip'): the data is written with
# gzip compression, and a '.zip' name makes tools that infer the format from
# the extension try to open it as a zip archive and fail.
filename = path+'drugbank_enzyme_attribute_list_%s.tsv.gz'% datetime.date.today().strftime('%Y_%m')
# attribute_list has shape (869, 0): the drug names are held in the index
# ('Attributes') and there are no data columns, so the index must be written.
# With index=False the file contained no attribute names at all.
attribute_list.to_csv(filename, sep='\t', index=True, compression='gzip')
# Final step: pass the binary matrix together with the attribute and gene
# tables, the output directory and a base name to the helper. It reports the
# number of gene-attribute associations when done; presumably it also writes
# the edge list under path — TODO confirm in untility_functions.
name = 'drugbank_enzyme_gene_attribute_edge_list'
uf.createGeneAttributeEdgeList(binary_matrix, attribute_list, gene_list, path, name)
Progeres: 100% 869 Out of 869 The number of statisticaly relevent gene-attribute associations is: 2429