#!/usr/bin/env python # coding: utf-8 # In[1]: import numpy as np import h5py #f = h5py.File("/oak/stanford/groups/akundaje/avsec/basepair/data/" # +"processed/comparison/output/nexus,peaks,OSNK,0,10" # +",1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE" # +",FALSE,1/deeplift.imp_score.h5", "r") f = h5py.File("deeplift.imp_score.h5","r") nanog_mask = np.array(f['metadata']['interval_from_task'][:]=='Nanog') nanog_profile_wn_hypimp = np.array(f["hyp_imp/Nanog/profile/wn"][:])[nanog_mask] onehot_seq = np.array(f["inputs/seq"][:])[nanog_mask] nanog_profile_wn_contribs = nanog_profile_wn_hypimp*onehot_seq # In[2]: import modisco #track_set = modisco.tfmodisco_workflow.workflow.prep_track_set( # task_names=["Nanog_profile_wn"], # contrib_scores={'Nanog_profile_wn': nanog_profile_wn_contribs}, # hypothetical_contribs={'Nanog_profile_wn': nanog_profile_wn_hypimp}, # one_hot=onehot_seq) #grp = h5py.File("/oak/stanford/groups/akundaje/avsec/basepair/data/processed/comparison/output/nexus" # +",peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE,FALSE,1/deeplift" # +"/Nanog/out/profile/wn/modisco.h5","r") #grp = h5py.File("modisco.h5","r") #loaded_tfmodisco_results =\ # modisco.tfmodisco_workflow.workflow.TfModiscoResults.from_hdf5(grp, track_set=track_set) #grp.close() #patterns = (loaded_tfmodisco_results # .metacluster_idx_to_submetacluster_results["metacluster_0"] # .seqlets_to_patterns_result.patterns) # In[3]: #Saving the seqlets """extracted_contrib_scores = [] extracted_hypothetical_scores = [] extracted_onehot_seqs = [] seqlets_list = loaded_tfmodisco_results.multitask_seqlet_creation_results.final_seqlets window_around = 50 #extract +/- 50bp around each seqlet for seqlet in seqlets_list: example_idx = seqlet.coor.example_idx start = seqlet.coor.start end = seqlet.coor.end if ((start>=window_around) and (end<=1000-window_around)): extracted_contrib_scores.append( nanog_profile_wn_contribs[example_idx,start-window_around:end+window_around]) extracted_hypothetical_scores.append( nanog_profile_wn_hypimp[example_idx,start-window_around:end+window_around]) extracted_onehot_seqs.append( onehot_seq[example_idx,start-window_around:end+window_around]) np.save("extracted_contrib_scores.npy", np.array(extracted_contrib_scores)) np.save("extracted_hypothetical_scores.npy", np.array(extracted_hypothetical_scores)) np.save("extracted_onehot.npy", np.array(extracted_onehot_seqs))""" # In[4]: #for i in range(10): # seqlets_list = loaded_tfmodisco_results.multitask_seqlet_creation_results.final_seqlets # modisco.visualization.viz_sequence.plot_weights(seqlets_list[i]["Nanog_profile_wn_contrib_scores"].fwd) # In[5]: #visualize the saved patterns: """%matplotlib inline from modisco.visualization import viz_sequence for idx,pattern in enumerate(patterns): print("pattern idx",idx) print(len(pattern.seqlets)) viz_sequence.plot_weights( pattern["Nanog_profile_wn_contrib_scores"].fwd) viz_sequence.plot_weights(pattern["sequence"].fwd)""" # In[6]: #print modisco commit hash get_ipython().run_line_magic('cd', '/users/avanti/tfmodisco') get_ipython().system('git log -n 1') get_ipython().run_line_magic('cd', '/users/avanti/tfmodisco_bio_experiments/bpnet/trial1') from importlib import reload get_ipython().run_line_magic('matplotlib', 'inline') import h5py import numpy as np import modisco import modisco.seqlet_embedding.advanced_gapped_kmer reload(modisco.seqlet_embedding.advanced_gapped_kmer) import modisco.seqlet_embedding reload(modisco.seqlet_embedding) import modisco reload(modisco) reload(modisco.util) import modisco.cluster.phenograph.core reload(modisco.cluster.phenograph.core) import modisco.cluster.phenograph.cluster reload(modisco.cluster.phenograph.cluster) import modisco.cluster.phenograph reload(modisco.cluster.phenograph) import modisco.cluster.core reload(modisco.cluster.core) import modisco.cluster reload(modisco.cluster) import modisco.affinitymat.core reload(modisco.affinitymat.core) import modisco.affinitymat.transformers reload(modisco.affinitymat.transformers) import modisco.tfmodisco_workflow.seqlets_to_patterns reload(modisco.tfmodisco_workflow.seqlets_to_patterns) import modisco.tfmodisco_workflow.workflow reload(modisco.tfmodisco_workflow.workflow) import modisco.nearest_neighbors reload(modisco.nearest_neighbors) import modisco.affinitymat reload(modisco.affinitymat) import modisco.aggregator reload(modisco.aggregator) import modisco.value_provider reload(modisco.value_provider) import modisco.core reload(modisco.core) import modisco.coordproducers reload(modisco.coordproducers) import modisco.metaclusterers reload(modisco.metaclusterers) import modisco.clusterinit.memeinit reload(modisco.clusterinit.memeinit) get_ipython().run_line_magic('matplotlib', 'inline') N_CORES = 10 workflow = modisco.tfmodisco_workflow.workflow.TfModiscoWorkflow( sliding_window_size=21,#[5,9,13,17,21], flank_size=10, target_seqlet_fdr=0.01, min_passing_windows_frac=0.03, max_passing_windows_frac=0.03, min_metacluster_size=2000, min_metacluster_size_frac=0.02, max_seqlets_per_metacluster=50000, seqlets_to_patterns_factory= modisco.tfmodisco_workflow.seqlets_to_patterns.TfModiscoSeqletsToPatternsFactory( #initclusterer_factory=modisco.clusterinit.memeinit.MemeInitClustererFactory( # meme_command="/software/meme/5.0.1/bin/meme", # base_outdir="meme_out", # num_seqlets_to_use=10000, # nmotifs=20, n_jobs=4), use_louvain=False, trim_to_window_size=30, initial_flank_to_add=10, embedder_factory=modisco.seqlet_embedding .advanced_gapped_kmer .AdvancedGappedKmerEmbedderFactory(n_jobs=N_CORES), #kmer_len=6, #num_gaps=2, #num_mismatches=0, n_cores=N_CORES, final_min_cluster_size=60 ) ) results = workflow( task_names=["Nanog_profile_wn"], contrib_scores={'Nanog_profile_wn': nanog_profile_wn_contribs}, hypothetical_contribs={'Nanog_profile_wn': nanog_profile_wn_hypimp}, one_hot=onehot_seq) # In[7]: import h5py import modisco.util reload(modisco.util) import os file_path = "v0.5.13.0.hdf5" if (os.path.exists(file_path)): os.remove(file_path) grp = h5py.File(file_path, "w") results.save_hdf5(grp) grp.close() # In[8]: from modisco.visualization import viz_sequence get_ipython().run_line_magic('matplotlib', 'inline') hdf5_results = h5py.File(file_path,"r") metacluster_names = [ x.decode("utf-8") for x in list(hdf5_results["metaclustering_results"] ["all_metacluster_names"][:])] all_patterns = [] background = np.array([0.27, 0.23, 0.23, 0.27]) for metacluster_name in metacluster_names: print(metacluster_name) metacluster_grp = (hdf5_results["metacluster_idx_to_submetacluster_results"] [metacluster_name]) print("activity pattern:",metacluster_grp["activity_pattern"][:]) all_pattern_names = [x.decode("utf-8") for x in list(metacluster_grp["seqlets_to_patterns_result"] ["patterns"]["all_pattern_names"][:])] if (len(all_pattern_names)==0): print("No motifs found for this activity pattern") for pattern_name in all_pattern_names: print(metacluster_name, pattern_name) all_patterns.append((metacluster_name, pattern_name)) pattern = metacluster_grp["seqlets_to_patterns_result"]["patterns"][pattern_name] print("total seqlets:",len(pattern["seqlets_and_alnmts"]["seqlets"])) print("Task 0 hypothetical scores:") viz_sequence.plot_weights(pattern["Nanog_profile_wn_hypothetical_contribs"]["fwd"]) print("Task 0 actual importance scores:") viz_sequence.plot_weights(pattern["Nanog_profile_wn_contrib_scores"]["fwd"]) print("onehot, fwd and rev:") viz_sequence.plot_weights(np.array(pattern["sequence"]["fwd"])) viz_sequence.plot_weights(np.array(pattern["sequence"]["rev"])) hdf5_results.close() # In[ ]: