#!/usr/bin/env python
# coding: utf-8

# # New Mapping File from NCBI Gene Dataset
# Downloaded: Jan 2, 2019
#
# Reads a tab-delimited gene_info dump and writes two TSV files restricted to
# human, mouse, and rat records:
#   * gene_sym_ids  -- one row per kept gene: taxon ID, symbol, gene ID
#   * mapping_file  -- one row per (taxon ID, name): taxon ID, name, symbol,
#                      where "name" is the symbol itself or one of its synonyms
# Both files are then loaded back into pandas DataFrames for inspection.

# In[1]:

import csv

import pandas as pd

# In[2]:

# Input dump and the two output files produced by the cell below.
file = "/Users/maayan/sigsets/Harmonizome/Data/All_Data.gene_info_010219"
mapping_file = "/Users/maayan/sigsets/Harmonizome/Data/mappingFileHMR_2019.tsv"
gene_sym_ids = "/Users/maayan/sigsets/Harmonizome/Data/GeneSymbolsAndIDSHMR_2019.tsv"

# In[3]:

# Human, Mouse, and Rat only
hmr_tax_ids = {"9606", "10090", "10116"}

# Column names for the gene symbol / ID file. They are tab-separated and
# newline-terminated so the header occupies its own line and lines up with the
# three data columns when the file is read back with sep='\t'.
gene_sym_ids_header = [
    "Human, Mouse, and Rat",
    "Approved Symbol",
    "Entrez Gene ID(supplied by NCBI)",
]

# Maps taxon ID -> set of names (symbols and synonyms) already written to the
# mapping file, so each (taxon ID, name) pair is emitted at most once.
mapping_dict = {}

with open(file) as o, open(gene_sym_ids, "w") as w, open(mapping_file, "w") as m:
    # QUOTE_NONE: treat quote characters as literal text so a stray '"' at the
    # start of a field cannot swallow the tabs that follow it.
    csv_file = csv.reader(o, delimiter="\t", quoting=csv.QUOTE_NONE)
    w.write("\t".join(gene_sym_ids_header) + "\n")

    for row in csv_file:
        # Skip blank or truncated lines; columns 0-4 are indexed below.
        if len(row) < 5:
            continue

        tax_id = row[0]
        if tax_id not in hmr_tax_ids:
            continue

        gene_id = row[1]
        sym = row[2]
        # Rows whose symbol is the "NEWENTRY" placeholder are dropped.
        if sym == "NEWENTRY":
            continue

        # NOTE(review): rows with no synonyms ("-") are skipped entirely, so
        # those genes appear in neither output file. This matches the original
        # behavior; confirm whether it is intended.
        if row[4] == "-":
            continue
        synonyms = [sym] + row[4].split("|")

        w.write("\t".join([tax_id, sym, gene_id]) + "\n")

        # Add Taxon ID
        seen = mapping_dict.setdefault(tax_id, set())
        for syn in synonyms:
            # First occurrence wins: a name already mapped for this taxon is
            # not remapped to a later symbol.
            if syn not in seen:
                seen.add(syn)
                m.write("\t".join([tax_id, syn, sym]) + "\n")

# In[4]:

# Gene symbol / ID table indexed by (taxon ID, symbol); first line is the header.
getGeneIDsHMR_updated = pd.read_csv(
    gene_sym_ids, sep='\t', index_col=[0, 1]
).sort_index()

# In[14]:

(9606, "A2MP1") in getGeneIDsHMR_updated.index

# In[17]:

# Mapping table indexed by (taxon ID, name); the file has no header row.
mappingDFHMR_updated = pd.read_csv(
    mapping_file, sep='\t', header=None, index_col=[0, 1]
).sort_index()

# In[18]:

(9606, "(FM-3)") in mappingDFHMR_updated.index

# In[8]:

mappingDFHMR_updated

# In[ ]: