import os
import pickle
import gzip
import hetio.readwrite
from hetmech.xarray import graph_to_xarray
# Load the Hetionet v1.0 hetnet and convert it to an xarray Dataset.
# The URL is pinned to a specific commit of dhimmel/hetionet, so the same
# bz2-compressed JSON file is fetched on every run.
url = 'https://github.com/dhimmel/hetionet/raw/76550e6c93fbe92124edc71725e8c7dd4ca8b1f5/hetnet/json/hetionet-v1.0.json.bz2'
graph = hetio.readwrite.read_graph(url)
# NOTE(review): metagraph is not used anywhere in the visible part of this file.
metagraph = graph.metagraph
# The resulting Dataset has one coordinate per node type and one boolean
# data variable per metaedge abbreviation (e.g. AdG, CrC), as the repr shows.
dataset = graph_to_xarray(graph)
# Bare expression: displays the Dataset repr when run as a notebook cell.
dataset
<xarray.Dataset> Dimensions: (Anatomy: 402, Biological Process: 11381, Cellular Component: 1391, Compound: 1552, Disease: 137, Gene: 20945, Molecular Function: 2884, Pathway: 1822, Pharmacologic Class: 345, Side Effect: 5734, Symptom: 438) Coordinates: * Anatomy (Anatomy) <U14 'UBERON:0000002' 'UBERON:0000004' ... * Gene (Gene) int64 1 2 9 10 12 13 14 15 16 18 19 20 21 22 ... * Compound (Compound) <U7 'DB00014' 'DB00035' 'DB00050' ... * Disease (Disease) <U12 'DOID:0050156' 'DOID:0050425' ... * Side Effect (Side Effect) <U8 'C0000727' 'C0000729' 'C0000731' ... * Symptom (Symptom) <U7 'D000006' 'D000270' 'D000326' ... * Biological Process (Biological Process) <U10 'GO:0000002' 'GO:0000012' ... * Cellular Component (Cellular Component) <U10 'GO:0000015' 'GO:0000109' ... * Molecular Function (Molecular Function) <U10 'GO:0000010' 'GO:0000014' ... * Pathway (Pathway) <U13 'PC7_10399' 'PC7_10400' 'PC7_10442' ... * Pharmacologic Class (Pharmacologic Class) <U11 'N0000000069' ... Data variables: AdG (Anatomy, Gene) bool False False False False False ... AeG (Anatomy, Gene) bool False True True False False ... AuG (Anatomy, Gene) bool False True False False False ... CrC (Compound, Compound) bool False False True False ... CpD (Compound, Disease) bool False False False False ... CtD (Compound, Disease) bool False False False False ... CbG (Compound, Gene) bool False False False False False ... CdG (Compound, Gene) bool False False False False False ... CuG (Compound, Gene) bool False False False False False ... CcSE (Compound, Side Effect) bool False False False ... DlA (Disease, Anatomy) bool False False False False ... DrD (Disease, Disease) bool False False False False ... DaG (Disease, Gene) bool False False False False False ... DdG (Disease, Gene) bool False False False False False ... DuG (Disease, Gene) bool False False False False False ... DpS (Disease, Symptom) bool False False False False ... GpBP (Gene, Biological Process) bool False False False ... 
GpCC (Gene, Cellular Component) bool False False False ... GcG (Gene, Gene) bool False False False False False ... GiG (Gene, Gene) bool False False False False False ... Gr>G (Gene, Gene) bool False False False False False ... GpMF (Gene, Molecular Function) bool False False False ... GpPW (Gene, Pathway) bool False False False False False ... PCiC (Pharmacologic Class, Compound) bool False False ...
# Look up a coordinate by name; names containing a space need item access.
# The values are the node identifiers (GO terms) for this node type.
dataset['Biological Process']
<xarray.DataArray 'Biological Process' (Biological Process: 11381)> array(['GO:0000002', 'GO:0000012', 'GO:0000018', ..., 'GO:2001301', 'GO:2001302', 'GO:2001303'], dtype='<U10') Coordinates: * Biological Process (Biological Process) <U10 'GO:0000002' 'GO:0000012' ...
# Look up a single metaedge by its abbreviation. 'Gr>G' is a boolean
# (Gene, Gene) array; the '>' in the name requires item access.
dataset['Gr>G']
<xarray.DataArray 'Gr>G' (Gene: 20945)> array([[False, False, False, ..., False, False, False], [False, False, False, ..., False, False, False], [False, False, False, ..., False, False, False], ..., [False, False, False, ..., False, False, False], [False, False, False, ..., False, False, False], [False, False, False, ..., False, False, False]]) Coordinates: * Gene (Gene) int64 1 2 9 10 12 13 14 15 16 18 19 20 21 22 23 24 25 26 ...
# Attribute-style access works when the name is a valid Python identifier.
# The Gene coordinate holds integer gene identifiers.
dataset.Gene
<xarray.DataArray 'Gene' (Gene: 20945)> array([ 1, 2, 9, ..., 105379874, 105379878, 105379886]) Coordinates: * Gene (Gene) int64 1 2 9 10 12 13 14 15 16 18 19 20 21 22 23 24 25 26 ...
# Density of each metaedge: every data variable is a boolean adjacency
# array, so its mean is the fraction of True entries.
dataset.mean()
<xarray.Dataset> Dimensions: () Data variables: AdG float64 0.01214 AeG float64 0.06252 AuG float64 0.01162 CrC float64 0.005385 CpD float64 0.001834 CtD float64 0.003551 CbG float64 0.000356 CdG float64 0.0006492 CuG float64 0.000577 CcSE float64 0.01561 DlA float64 0.0654 DrD float64 0.05786 DaG float64 0.004399 DdG float64 0.002657 DuG float64 0.002694 DpS float64 0.05594 GpBP float64 0.002347 GpCC float64 0.002525 GcG float64 0.0002812 GiG float64 0.0006709 Gr>G float64 0.0006056 GpMF float64 0.001609 GpPW float64 0.002211 PCiC float64 0.001922
# The NetCDF export of this dataset was extremely large, so that format is
# avoided in favour of a gzip-compressed pickle:
# dataset.to_netcdf('xarray_dataset.nc')
path = os.path.join('data', 'xarray_dataset.pkl.gz')
# gzip.open does not create missing parent directories, so make sure the
# output directory exists before writing.
os.makedirs(os.path.dirname(path), exist_ok=True)
# Save as a gzip-compressed pickle
with gzip.open(path, 'wb') as write_file:
    pickle.dump(dataset, write_file, protocol=pickle.HIGHEST_PROTOCOL)
# Read the pickle back, replacing the in-memory dataset with the reloaded one.
# NOTE: unpickling can execute arbitrary code; only load files you trust
# (here, the file written just above).
with gzip.open(path, 'rb') as read_file:
    dataset = pickle.load(read_file)