Source data (`All_Data.gene_info`) downloaded: Jan 2, 2019
import csv
import pandas as pd
# Build two tab-separated files from a gene_info dump:
#   * gene_sym_ids : one line per kept gene  -> taxon ID, symbol, gene ID
#   * mapping_file : one line per gene name  -> taxon ID, name, symbol
#     (the symbol itself is written first, followed by each of its synonyms)
file = "/Users/maayan/sigsets/Harmonizome/Data/All_Data.gene_info_010219"
mapping_file = "/Users/maayan/sigsets/Harmonizome/Data/mappingFileHMR_2019.tsv"
gene_sym_ids = "/Users/maayan/sigsets/Harmonizome/Data/GeneSymbolsAndIDSHMR_2019.tsv"

# Human, Mouse, and Rat only. Kept as strings because csv.reader yields text.
HMR_TAX_IDS = {"9606", "10090", "10116"}

# taxon ID -> set of names (symbols and synonyms) already written to mapping_file.
mapping_dict = {}

with open(file) as o, open(gene_sym_ids, "w") as w, open(mapping_file, "w") as m:
    csv_file = csv.reader(o, delimiter="\t")
    # The header must end with a newline; without it the first gene record is
    # appended to the header text and is lost as a data row.
    # NOTE(review): the header is a single field while data rows have three
    # (taxon ID, symbol, gene ID) - consider per-column names.
    w.write("Human, Mouse, and Rat Approved Symbol Entrez Gene ID(supplied by NCBI)\n")
    for row in csv_file:
        if not row:
            # csv.reader yields [] for blank lines; nothing to index into.
            continue
        tax_id = row[0]
        if tax_id not in HMR_TAX_IDS:
            continue
        gene_id = row[1]
        sym = row[2]
        if sym == "NEWENTRY":
            # Placeholder symbol, not a usable gene name.
            continue
        if row[4] == "-":
            # "-" marks an empty synonyms field.
            # NOTE(review): such genes are dropped from BOTH output files, as in
            # the original logic - confirm this is intended.
            continue
        synonyms = [sym] + row[4].split("|")
        w.write("\t".join([tax_id, sym, gene_id]) + "\n")  # Add Taxon ID
        seen = mapping_dict.setdefault(tax_id, set())
        for syn in synonyms:
            # Only the first occurrence of a name within a taxon is written, so
            # a name shared by several genes maps to the first gene encountered.
            if syn not in seen:
                seen.add(syn)
                m.write("\t".join([tax_id, syn, sym]) + "\n")
# Load the symbol/ID table, indexed by (taxon ID, symbol) and sorted for lookups.
# keep_default_na=False keeps names that look like pandas NA tokens ("NA",
# "NaN", "null", ...) as literal strings instead of turning them into missing
# values inside the index.
getGeneIDsHMR_updated = pd.read_csv(
    '/Users/maayan/sigsets/Harmonizome/Data/GeneSymbolsAndIDSHMR_2019.tsv',
    sep='\t',
    index_col=[0, 1],
    keep_default_na=False,
).sort_index()
# Spot check: the taxon ID level is looked up as an integer.
(9606,"A2MP1") in getGeneIDsHMR_updated.index
True
# Load the headerless name -> symbol mapping, indexed by (taxon ID, name) and
# sorted for lookups.
# keep_default_na=False is required here: some gene synonyms are spelled exactly
# like pandas NA tokens (e.g. the synonym "NaN"), and with the default settings
# they are parsed as missing values, which corrupts the index key.
mappingDFHMR_updated = pd.read_csv(
    '/Users/maayan/sigsets/Harmonizome/Data/mappingFileHMR_2019.tsv',
    sep='\t',
    header=None,
    index_col=[0, 1],
    keep_default_na=False,
).sort_index()
# Spot check: the taxon ID level is looked up as an integer.
(9606,"(FM-3)") in mappingDFHMR_updated.index
True
# Display the (taxon ID, name) -> symbol mapping table as the cell output.
mappingDFHMR_updated
2 | ||
---|---|---|
0 | 1 | |
9606 | (FM-3) | NMUR1 |
(IV)-44 | IGHVIV-44-1 | |
(ppGpp)ase | HDDC3 | |
0610011B16Rik | CORO7 | |
0610037N12Rik | POP7 | |
0710008D09Rik | UQCR11 | |
0808y08y | NFYC-AS1 | |
1-12P | IGHV1-12 | |
1-14P | IGHV1-14 | |
1-17P | IGHV1-17 | |
1-67P | IGHV1-67 | |
1-68P | IGHV1-68 | |
1-8D | IFITM2 | |
1-8U | IFITM3 | |
1-AGPAT 6 | GPAT4 | |
1-AGPAT1 | AGPAT1 | |
1-AGPAT2 | AGPAT2 | |
1-AGPAT4 | AGPAT4 | |
1-Cys | PRDX6 | |
1/2-SBSRNA4 | SEC24B-AS1 | |
10-FTHFDH | ALDH1L1 | |
10-fTHF | ALDH1L1 | |
101F10.1 | KNOP1 | |
101F6 | CYB561D2 | |
104p | TUBGCP3 | |
105A | SNORA73B | |
105B | RNU105B | |
10C | ARHGAP9 | |
10q23del | BMPR1A | |
11-DH | HSD11B1 | |
... | ... | ... |
10116 | tpcr07 | Olr1398 |
tpcr09 | Olr737 | |
tpcr10 | Olr1404 | |
tpcr13 | Olr1366 | |
tpcr18 | Olr1307 | |
tpcr19 | Olr1226 | |
tpcr21 | Olr1283 | |
tpcr38 | Olr1606 | |
trk-B | Ntrk2 | |
trkB | Ntrk2 | |
trkC | Ntrk3 | |
try1 | Prss58 | |
uKATP-1 | Kcnj8 | |
uPAR | Plaur | |
uPAR-2 | Plaur | |
uPAR-3 | Plaur | |
ufc1-s | Ufc1 | |
ufd2a | Ube4b | |
upf0227 | Abhd17a | |
vip/phi27 | Vip | |
vms-tm2 | Cd99l2 | |
wbp-11 | Wbp11 | |
x85 | Dus1l | |
xylt-II | Xylt2 | |
y+LAT1 | Slc7a7 | |
zbs559 | Map1lc3b | |
zgc:101121 | Zfand6 | |
zgc:66482 | Pik3ip1 | |
zif-268 | Egr1 | |
NaN | Scn11a |
329092 rows × 1 columns