#!/usr/bin/env python
# coding: utf-8

# In[1]:


import keras
from keras.models import Sequential, Model, load_model
from keras.layers import Dense, Dropout, Activation, Flatten, Input, Lambda
from keras.layers import Conv2D, MaxPooling2D, Conv1D, MaxPooling1D, LSTM, ConvLSTM2D, GRU, BatchNormalization, LocallyConnected2D, Permute
from keras.layers import Concatenate, Reshape, Softmax, Conv2DTranspose, Embedding, Multiply
from keras.callbacks import ModelCheckpoint, EarlyStopping, Callback
from keras import regularizers
from keras import backend as K
import keras.losses

import tensorflow as tf
from tensorflow.python.framework import ops

import numpy as np

import logging
logging.getLogger('tensorflow').setLevel(logging.ERROR)

import pandas as pd

import os
import pickle

import scipy.sparse as sp
import scipy.io as spio

import matplotlib.pyplot as plt

import isolearn.io as isoio
import isolearn.keras as isol

from genesis.visualization import *
from genesis.generator import *
from genesis.predictor import *
from genesis.optimizer import *

from definitions.generator.aparent_deconv_conv_generator_concat_alllayers import load_generator_network
from definitions.predictor.aparent import load_saved_predictor

import sklearn
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE


# In[2]:


#Define target cleavage loss function

def get_cleavage_loss(target_poses, use_start=0, use_end=70, use_target_bits=1.8, cse_start=70, cse_end=76, cse_target_bits=1.8, dse_start=76, dse_end=125, dse_target_bits=1.8, entropy_weight=0.0, similarity_weight=0.0, punish_dn_cse=0.0, punish_up_c=0.0, punish_dn_c=0.0, reward_dn_ggcc=0.0, punish_up_aa=0.0, punish_dn_aa=0.0) :
    
    #One-hot target cut distribution for each objective class
    target_cuts = np.zeros((len(target_poses), 206))
    for i, target_pos in enumerate(target_poses) :
        target_cuts[i, target_pos] = 1.0
    
    masked_use_entropy_mse = get_target_entropy_sme_masked(pwm_start=use_start, pwm_end=use_end, target_bits=use_target_bits)
    cse_entropy_mse = get_target_entropy_sme(pwm_start=cse_start, pwm_end=cse_end, target_bits=cse_target_bits)
    masked_dse_entropy_mse = get_target_entropy_sme_masked(pwm_start=dse_start, pwm_end=dse_end, target_bits=dse_target_bits)
    
    punish_dn_cse_func = get_punish_cse(pwm_start=74, pwm_end=dse_end)
    punish_up_c_func = get_punish_c(pwm_start=use_start, pwm_end=use_end)
    punish_dn_c_func = get_punish_c(pwm_start=dse_start, pwm_end=dse_end)
    punish_dn_gg_func = get_punish_gg(pwm_start=80, pwm_end=110)
    punish_dn_cc_func = get_punish_cc(pwm_start=80, pwm_end=110)
    punish_up_aa_func = get_punish_aa(pwm_start=use_start, pwm_end=use_end)
    punish_dn_aa_func = get_punish_aa(pwm_start=dse_start, pwm_end=dse_end)
    
    pwm_sample_entropy_func = get_pwm_margin_sample_entropy_masked(pwm_start=70-60, pwm_end=76+60, margin=0.5, shift_1_nt=True)
    #pwm_sample_entropy_func = get_pwm_sample_entropy_masked(pwm_start=70-60, pwm_end=76+60)
    #pwm_sample_entropy_up_func = get_pwm_margin_sample_entropy_masked(pwm_start=70-60, pwm_end=70-4, margin=0.5, shift_1_nt=True)
    #pwm_sample_entropy_dn_func = get_pwm_margin_sample_entropy_masked(pwm_start=76, pwm_end=76+60, margin=0.5, shift_1_nt=True)
    
    extra_sim = np.ones((len(target_poses), 1, 205, 4, 1))
    for i in range(len(target_poses)) :
        extra_sim[i, 0, 70-4:76, :, 0] = 0.0
    
    def loss_func(loss_tensors) :
        _, _, _, sequence_class, pwm_logits_1, pwm_logits_2, pwm_1, pwm_2, sampled_pwm_1, sampled_pwm_2, mask, sampled_mask, iso_pred, cut_pred, iso_score_pred, cut_score_pred = loss_tensors
        
        #Create target cuts with sample axis (ax=1)
        cut_targets = K.constant(target_cuts)
        cut_true = K.gather(cut_targets, sequence_class[:, 0])
        cut_true = K.tile(K.expand_dims(cut_true, axis=1), (1, K.shape(sampled_pwm_1)[1], 1))
        
        #Specify costs
        cut_loss = 1.0 * K.mean(kl_divergence(cut_true, cut_pred), axis=1)
        
        seq_loss = 0.0
        seq_loss += punish_dn_cse * K.mean(punish_dn_cse_func(sampled_pwm_1), axis=1)
        seq_loss += punish_up_c * K.mean(punish_up_c_func(sampled_pwm_1), axis=1)
        seq_loss += punish_dn_c * K.mean(punish_dn_c_func(sampled_pwm_1), axis=1)
        seq_loss += punish_up_aa * K.mean(punish_up_aa_func(sampled_pwm_1), axis=1)
        seq_loss += punish_dn_aa * K.mean(punish_dn_aa_func(sampled_pwm_1), axis=1)
        seq_loss += reward_dn_ggcc * (-1.) * punish_dn_gg_func(pwm_1) * punish_dn_cc_func(pwm_1)
        
        extra_sims = K.constant(extra_sim)
        extra_sim_mask = K.gather(extra_sims, sequence_class[:, 0])
        extra_sim_mask = K.tile(extra_sim_mask, (1, K.shape(sampled_pwm_1)[1], 1, 1, 1))
        
        entropy_loss = entropy_weight * (masked_use_entropy_mse(pwm_1, mask) + cse_entropy_mse(pwm_1) + masked_dse_entropy_mse(pwm_1, mask))
        entropy_loss += similarity_weight * K.mean(pwm_sample_entropy_func(sampled_pwm_1, sampled_pwm_2, sampled_mask * extra_sim_mask), axis=1)
        #entropy_loss += similarity_weight * K.mean(pwm_sample_entropy_up_func(sampled_pwm_1, sampled_pwm_2, sampled_mask), axis=1) / 2.0
        #entropy_loss += similarity_weight * K.mean(pwm_sample_entropy_dn_func(sampled_pwm_1, sampled_pwm_2, sampled_mask), axis=1) / 2.0
        
        #Compute total loss
        total_loss = cut_loss + seq_loss + entropy_loss
        
        return total_loss
    
    return loss_func

class EpochVariableCallback(Callback):
    
    def __init__(self, my_variable, my_func):
        self.my_variable = my_variable
        self.my_func = my_func
    
    def on_epoch_end(self, epoch, logs={}):
        K.set_value(self.my_variable, self.my_func(K.get_value(self.my_variable), epoch))

#Function for running GENESIS
def run_genesis(sequence_templates, loss_func, library_contexts, model_path, batch_size=32, n_samples=1, n_epochs=10, steps_per_epoch=100) :
    
    #Build Generator Network
    _, generator = build_generator(batch_size, len(sequence_templates[0]), load_generator_network, n_classes=len(sequence_templates), n_samples=n_samples, sequence_templates=sequence_templates, batch_normalize_pwm=False)
    
    #Build Predictor Network and hook it on the generator PWM output tensor
    _, pwm_predictor = build_predictor(generator, load_saved_predictor(model_path, library_contexts=library_contexts), batch_size, n_samples=1, eval_mode='pwm')
    _, sample_predictor = build_predictor(generator, load_saved_predictor(model_path, library_contexts=library_contexts), batch_size, n_samples=n_samples, eval_mode='sample')
    
    #Rename the PWM-mode predictor layers so they do not clash with the sample-mode predictor's layer names
    for layer in pwm_predictor.layers :
        if 'aparent' in layer.name :
            layer.name += "_pwmversion"
    
    #Build Loss Model (In: Generator seed, Out: Loss function)
    _, pwm_loss_model = build_loss_model(pwm_predictor, loss_func)
    _, sample_loss_model = build_loss_model(sample_predictor, loss_func)
    
    dual_loss_out = Lambda(lambda x: 0.5 * x[0] + 0.5 * x[1])([pwm_loss_model.outputs[0], sample_loss_model.outputs[0]])
    
    loss_model = Model(inputs=pwm_loss_model.inputs, outputs=dual_loss_out)
    
    #Specify Optimizer to use
    #opt = keras.optimizers.SGD(lr=0.1)
    opt = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999)
    
    #Compile Loss Model (Minimize self)
    loss_model.compile(loss=lambda true, pred: pred, optimizer=opt)
    
    #Fit Loss Model
    train_history = loss_model.fit(
        [], np.ones((1, 1)), #Dummy training example
        epochs=n_epochs,
        steps_per_epoch=steps_per_epoch
    )
    
    return generator, sample_predictor, train_history
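# In[ ]:


#Illustrative sketch (not part of the original notebook): the cut_loss term above
#minimizes the KL divergence between a one-hot target cut distribution and the
#predicted cut distribution. The toy numbers below only illustrate that behaviour
#with plain NumPy; the actual loss is computed on Keras tensors via kl_divergence.

toy_target_pos = 80
toy_target = np.zeros(206)
toy_target[toy_target_pos] = 1.0

#A diffuse prediction vs. a prediction concentrated on the target position
toy_pred_diffuse = np.full(206, 1.0 / 206.0)
toy_pred_peaked = np.full(206, 0.001 / 205.0)
toy_pred_peaked[toy_target_pos] = 0.999

def toy_kl(p, q, eps=1e-12) :
    p = np.clip(p, eps, 1.0)
    q = np.clip(q, eps, 1.0)
    return np.sum(p * np.log(p / q))

print("KL(target || diffuse) = " + str(round(toy_kl(toy_target, toy_pred_diffuse), 4)))
print("KL(target || peaked)  = " + str(round(toy_kl(toy_target, toy_pred_peaked), 4)))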
# In[3]:


#Specify file path to pre-trained predictor network

save_dir = os.path.join(os.getcwd(), '../../../aparent/saved_models')
saved_predictor_model_name = 'aparent_plasmid_iso_cut_distalpas_all_libs_no_sampleweights_sgd.h5'
saved_predictor_model_path = os.path.join(save_dir, saved_predictor_model_name)


# In[4]:


#Maximize isoform proportions for all native minigene libraries
sequence_templates = [
    'CCCTACACGACGCTCTTCCGATCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATAAANNNNNATNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGTGTCCTGCCCGGTCGGCTTGAGCGCGTGGGTCTCGTTTAGATGCTGCGCCTAACCCTAAGCAGATTCTTCATGCAATTGT',
    'CCCTACACGACGCTCTTCCGATCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATAAANNNNNNNNNNATNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGTGTCCTGCCCGGTCGGCTTGAGCGCGTGGGTCTCGTTTAGATGCTGCGCCTAACCCTAAGCAGATTCTTCATGCAATTGT',
    'CCCTACACGACGCTCTTCCGATCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATAAANNNNNNNNNNNNNNNATNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGTGTCCTGCCCGGTCGGCTTGAGCGCGTGGGTCTCGTTTAGATGCTGCGCCTAACCCTAAGCAGATTCTTCATGCAATTGT',
    'CCCTACACGACGCTCTTCCGATCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATAAANNNNNNNNNNNNNNNNNNNNATNNNNNNNNNNNNNNNNNNNNNNNNNNNGTGTCCTGCCCGGTCGGCTTGAGCGCGTGGGTCTCGTTTAGATGCTGCGCCTAACCCTAAGCAGATTCTTCATGCAATTGT',
    'CCCTACACGACGCTCTTCCGATCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATAAANNNNNNNNNNNNNNNNNNNNNNNNNATNNNNNNNNNNNNNNNNNNNNNNGTGTCCTGCCCGGTCGGCTTGAGCGCGTGGGTCTCGTTTAGATGCTGCGCCTAACCCTAAGCAGATTCTTCATGCAATTGT',
    'CCCTACACGACGCTCTTCCGATCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATAAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNATNNNNNNNNNNNNNNNNNGTGTCCTGCCCGGTCGGCTTGAGCGCGTGGGTCTCGTTTAGATGCTGCGCCTAACCCTAAGCAGATTCTTCATGCAATTGT',
    'CCCTACACGACGCTCTTCCGATCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATAAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNATNNNNNNNNNNNNGTGTCCTGCCCGGTCGGCTTGAGCGCGTGGGTCTCGTTTAGATGCTGCGCCTAACCCTAAGCAGATTCTTCATGCAATTGT',
    'CCCTACACGACGCTCTTCCGATCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATAAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNATNNNNNNNGTGTCCTGCCCGGTCGGCTTGAGCGCGTGGGTCTCGTTTAGATGCTGCGCCTAACCCTAAGCAGATTCTTCATGCAATTGT',
    'CCCTACACGACGCTCTTCCGATCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATAAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNATNNGTGTCCTGCCCGGTCGGCTTGAGCGCGTGGGTCTCGTTTAGATGCTGCGCCTAACCCTAAGCAGATTCTTCATGCAATTGT'
]

library_contexts = [
    'simple',
    'simple',
    'simple',
    'simple',
    'simple',
    'simple',
    'simple',
    'simple',
    'simple'
]

cut_positions = [
    80, 85, 90, 95, 100, 105, 110, 115, 120
]


# In[5]:


#Train APA Cleavage GENESIS Network

print("Training GENESIS (multi-class)")

#Number of PWMs to generate per objective
batch_size = 36

#Number of One-hot sequences to sample from the PWM at each grad step
n_samples = 10

#Number of epochs per objective to optimize
n_epochs = 50

#Number of steps (grad updates) per epoch
steps_per_epoch = 500

loss = get_cleavage_loss(
    cut_positions,
    use_start=22,
    use_end=70,
    use_target_bits=1.95,
    cse_start=70,
    cse_end=76,
    cse_target_bits=1.95,
    dse_start=76,
    dse_end=125,
    dse_target_bits=1.95,
    entropy_weight=1.0,
    similarity_weight=5.0,
    punish_dn_cse=1.0,
    punish_up_c=0.007,
    punish_dn_c=0.010,
    reward_dn_ggcc=0.0,
    punish_up_aa=0.0,
    punish_dn_aa=0.025
)

genesis_generator, genesis_predictor, train_history = run_genesis(sequence_templates, loss, library_contexts, saved_predictor_model_path, batch_size, n_samples, n_epochs, steps_per_epoch)

#Make the 'lambda_rand_*' layers pass their inputs through unchanged (use the supplied class/noise instead of resampling)
genesis_generator.get_layer('lambda_rand_sequence_class').function = lambda inp: inp
genesis_generator.get_layer('lambda_rand_input_1').function = lambda inp: inp
genesis_generator.get_layer('lambda_rand_input_2').function = lambda inp: inp

genesis_predictor.get_layer('lambda_rand_sequence_class').function = lambda inp: inp
genesis_predictor.get_layer('lambda_rand_input_1').function = lambda inp: inp
genesis_predictor.get_layer('lambda_rand_input_2').function = lambda inp: inp

# Save model and weights
save_dir = 'saved_models'

if not os.path.isdir(save_dir):
    os.makedirs(save_dir)

model_name = 'genesis_cleavage_pwm_and_multisample_marginsimilarity_hardersimilarity_simple_sym_kl_concat_retry_1_generator.h5'
model_path = os.path.join(save_dir, model_name)
genesis_generator.save(model_path)
print('Saved trained model at %s ' % model_path)

model_name = 'genesis_cleavage_pwm_and_multisample_marginsimilarity_hardersimilarity_simple_sym_kl_concat_retry_1_predictor.h5'
model_path = os.path.join(save_dir, model_name)
genesis_predictor.save(model_path)
print('Saved trained model at %s ' % model_path)


# In[4]:


#(Re-)Load GENESIS model

save_dir = os.path.join(os.getcwd(), 'saved_models/target_cleavage')
model_name = 'genesis_cleavage_pwm_and_multisample_marginsimilarity_hardersimilarity_simple_sym_kl_concat_retry_1_generator.h5'
model_path = os.path.join(save_dir, model_name)

generator = load_model(model_path, custom_objects={'st_sampled_softmax': st_sampled_softmax, 'st_hardmax_softmax': st_hardmax_softmax})

save_dir = os.path.join(os.getcwd(), 'saved_models/target_cleavage')
model_name = 'genesis_cleavage_pwm_and_multisample_marginsimilarity_hardersimilarity_simple_sym_kl_concat_retry_1_predictor.h5'
model_path = os.path.join(save_dir, model_name)

predictor = load_model(model_path, custom_objects={'st_sampled_softmax': st_sampled_softmax, 'st_hardmax_softmax': st_hardmax_softmax})


# In[5]:


n = 36 * 6

sequence_class = np.array([j for i in [[k] * 20 for k in range(9)] for j in i] + [0] * 200)[:n].reshape(-1, 1)

noise_1 = np.random.uniform(-1, 1, (n, 100))
noise_2 = np.random.uniform(-1, 1, (n, 100))

pred_outputs = predictor.predict([sequence_class, noise_1, noise_2], batch_size=36)

_, _, _, optimized_pwm, _, _, _, _, _, iso_pred, cut_pred, _, _ = pred_outputs


# In[6]:


#Plot one PWM sequence logo per optimized objective (Experiment 'Punish A-runs')
for pwm_index in range(9 * 20) :
    
    sequence_template = sequence_templates[sequence_class[pwm_index, 0]]
    target_cut = cut_positions[sequence_class[pwm_index, 0]]
    
    pwm = np.expand_dims(optimized_pwm[pwm_index, :, :, 0], axis=0)
    cut = np.expand_dims(cut_pred[pwm_index, 0, :], axis=0)
    iso = np.expand_dims(np.sum(cut[:, target_cut:target_cut + 1], axis=-1), axis=-1)
    
    plot_seqprop_logo(pwm, iso, cut, annotate_peaks='max', sequence_template=sequence_template, figsize=(12.5, 1.5), width_ratios=[1, 8], logo_height=0.8, usage_unit='fraction', plot_start=70-49, plot_end=76+49, save_figs=False, fig_name='cleavage_genesis_target_cut_' + str(target_cut) + "_pwm_index_" + str(pwm_index), fig_dpi=150)
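# In[ ]:


#Optional sanity check (not part of the original notebook): summarize how much predicted
#cleavage probability mass the generated sequences place within +/- 1 nt of each
#objective's target cut position, re-using the predictions computed above.

for class_k, target_cut in enumerate(cut_positions) :
    class_rows = np.nonzero(sequence_class[:, 0] == class_k)[0]
    targeted_mass = np.sum(cut_pred[class_rows, 0, target_cut-1:target_cut+2], axis=-1)
    print("Class " + str(class_k) + " (cut pos " + str(target_cut) + ") : mean targeted cut mass = " + str(round(np.mean(targeted_mass), 4)))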
# In[10]:


#Estimate duplication rates

def get_consensus_sequence(pwm) :
    consensus = ''
    for j in range(pwm.shape[0]) :
        nt_ix = np.argmax(pwm[j, :])
        if nt_ix == 0 :
            consensus += 'A'
        elif nt_ix == 1 :
            consensus += 'C'
        elif nt_ix == 2 :
            consensus += 'G'
        elif nt_ix == 3 :
            consensus += 'T'
    return consensus

f = plt.figure(figsize=(5, 4))

n_classes = 9
n_sequences_large_list = [10, 100, 1000, 10000, 100000]

dup_rates = []

save_figs = False

for n_sequences_large in n_sequences_large_list :
    n_sequences_ceil_large = int(n_sequences_large / 36) * 36 + 36
    
    print("N Sequences = " + str(n_sequences_large))
    
    sequence_class = np.array([np.random.randint(n_classes)] * n_sequences_ceil_large).reshape(-1, 1)
    
    noise_1 = np.random.uniform(-1, 1, (n_sequences_ceil_large, 100))
    noise_2 = np.random.uniform(-1, 1, (n_sequences_ceil_large, 100))
    
    pred_outputs = predictor.predict([sequence_class, noise_1, noise_2], batch_size=36)
    
    _, _, _, optimized_pwm_large, _, sampled_pwm_large, _, _, _, _, _, _, _ = pred_outputs
    
    onehots_large = sampled_pwm_large[:, 0, :, :, :]
    
    consensus_seqs_large = []
    for i in range(onehots_large.shape[0]) :
        consensus_seqs_large.append(get_consensus_sequence(onehots_large[i, :, :, 0]))
    consensus_seqs_large = np.array(consensus_seqs_large, dtype=np.object)
    
    #Sample first n_sequences
    onehots_large_kept = onehots_large[:n_sequences_large, :, :]
    consensus_large_seqs_kept = consensus_seqs_large[:n_sequences_large]
    
    n_unique_seqs_kept = len(np.unique(consensus_large_seqs_kept))
    print("Number of unique sequences = " + str(n_unique_seqs_kept))
    
    dup_rate = 1. - n_unique_seqs_kept / n_sequences_large
    dup_rates.append(dup_rate)
    
    print("Duplication rate = " + str(round(dup_rate, 4)))

plt.plot(np.arange(len(n_sequences_large_list)), dup_rates, linewidth=3, linestyle='-', color='black')
plt.scatter(np.arange(len(n_sequences_large_list)), dup_rates, c='blue', s=45)

plt.xlabel("# Sequences", fontsize=14)
plt.ylabel("Duplication Rate", fontsize=14)

plt.xticks(np.arange(len(n_sequences_large_list)), n_sequences_large_list, fontsize=14, rotation=45)
plt.yticks(fontsize=14)

plt.ylim(0, np.max(dup_rates) * 1.10 + 0.001)

plt.title("Duplication rate curve", fontsize=14)

plt.tight_layout()

if save_figs :
    plt.savefig("apa_cleavage_dup_rate_curve.eps")
    plt.savefig("apa_cleavage_dup_rate_curve.svg")
    plt.savefig("apa_cleavage_dup_rate_curve.png", transparent=True, dpi=150)

plt.show()


# In[17]:


import scipy
import scipy.sparse as sp

n = 36 * 100

sequence_class = np.array([np.random.randint(n_classes)] * n).reshape(-1, 1)

noise_1 = np.random.uniform(-1, 1, (n, 100))
noise_2 = np.random.uniform(-1, 1, (n, 100))

pred_outputs = predictor.predict([sequence_class, noise_1, noise_2], batch_size=36)

_, _, _, optimized_pwm, _, sampled_pwm, _, _, _, _, _, _, _ = pred_outputs

onehots = sampled_pwm[:, 0, :, :, :]

#Evaluate Sequence diversity
def get_consensus_sequence(pwm) :
    consensus = ''
    for j in range(pwm.shape[0]) :
        nt_ix = np.argmax(pwm[j, :])
        if nt_ix == 0 :
            consensus += 'A'
        elif nt_ix == 1 :
            consensus += 'C'
        elif nt_ix == 2 :
            consensus += 'G'
        elif nt_ix == 3 :
            consensus += 'T'
    return consensus

n_sequences = 1000

consensus_seqs = []
for i in range(onehots.shape[0]) :
    consensus_seqs.append(get_consensus_sequence(onehots[i, :, :, 0]))
consensus_seqs = np.array(consensus_seqs, dtype=np.object)

#Sample first n_sequences
onehots_kept = onehots[:n_sequences, :, :]
consensus_seqs_kept = consensus_seqs[:n_sequences]

print("Number of unique sequences = " + str(len(np.unique(consensus_seqs_kept))))

#Calculate average/std nucleotide entropy
nt_entropies = []
for j in range(onehots_kept.shape[1]) :
    if sequence_templates[0][j] == 'N' :
        p_A = np.sum(onehots_kept[:, j, 0, 0]) / n_sequences
        p_C = np.sum(onehots_kept[:, j, 1, 0]) / n_sequences
        p_G = np.sum(onehots_kept[:, j, 2, 0]) / n_sequences
        p_T = np.sum(onehots_kept[:, j, 3, 0]) / n_sequences
        
        nt_entropy = 0
        if p_A * p_C * p_G * p_T > 0. :
            nt_entropy = - (p_A * np.log2(p_A) + p_C * np.log2(p_C) + p_G * np.log2(p_G) + p_T * np.log2(p_T))
        
        nt_entropies.append(nt_entropy)

nt_entropies = np.array(nt_entropies)

print("Mean NT Entropy = " + str(round(np.mean(nt_entropies), 4)))
print("Std NT Entropy = " + str(round(np.std(nt_entropies), 4)))

#Calculate hexamer entropies
hexamer_encoder = isol.NMerEncoder(n_mer_len=6, count_n_mers=True)
hexamers = isol.SparseBatchEncoder(encoder=hexamer_encoder)([x[20:128] for x in consensus_seqs_kept])

print(hexamers.shape)

hexamer_sum = np.ravel(hexamers.sum(axis=0))
hexamers_probs = hexamer_sum / np.sum(hexamer_sum)

n_nonzero_hexamers = len(np.nonzero(hexamer_sum > 0)[0])
print("Number of unique hexamers = " + str(n_nonzero_hexamers))

hexamer_entropy = -1. * np.sum(hexamers_probs[hexamer_sum > 0] * np.log2(hexamers_probs[hexamer_sum > 0]))
print("Hexamer Entropy = " + str(hexamer_entropy))

#Calculate average/std hexamer entropy
nonzero_index = np.nonzero(hexamer_sum > 0)[0]

hexamer_entropies = []
for j in range(n_nonzero_hexamers) :
    p_on = len(np.nonzero(hexamers[:, nonzero_index[j]] > 0)[0]) / hexamers.shape[0]
    p_off = 1. - p_on
    
    hexamer_entropy = 0
    if p_on * p_off > 0. :
        hexamer_entropy = -(p_on * np.log2(p_on) + p_off * np.log2(p_off))
    
    hexamer_entropies.append(hexamer_entropy)

hexamer_entropies = np.array(hexamer_entropies)

print("Mean Binary Hexamer Entropy = " + str(round(np.mean(hexamer_entropies), 4)))
print("Std Binary Hexamer Entropy = " + str(round(np.std(hexamer_entropies), 4)))


# In[18]:


#Helper function to get the consensus sequence from a PWM
def get_consensus_sequence(pwm) :
    consensus = ''
    for j in range(pwm.shape[0]) :
        nt_ix = np.argmax(pwm[j, :])
        if nt_ix == 0 :
            consensus += 'A'
        elif nt_ix == 1 :
            consensus += 'C'
        elif nt_ix == 2 :
            consensus += 'G'
        elif nt_ix == 3 :
            consensus += 'T'
    return consensus

n_classes = 9
n_sequences = 1000
n_sequences_ceil = int(n_sequences / 36) * 36 + 36

pwms = []
consensus_seqs = []
onehot_seqs = []
iso_preds = []
cut_preds = []
objectives = []

onehot_encoder = isol.OneHotEncoder(205)

for k in range(n_classes) :
    print('Predicting sequences for objective ' + str(k) + '...')
    
    sequence_class = np.tile(np.array([[k]]), (n_sequences_ceil, 1))
    
    noise_1 = np.random.uniform(-1, 1, (n_sequences_ceil, 100))
    noise_2 = np.random.uniform(-1, 1, (n_sequences_ceil, 100))
    
    pred_outputs = predictor.predict([sequence_class, noise_1, noise_2], batch_size=36)
    
    _, _, _, optimized_pwm, _, _, _, _, _, iso_pred, cut_pred, _, _ = pred_outputs
    
    optimized_pwm = np.expand_dims(optimized_pwm[:n_sequences, :, :, 0], axis=0)
    cut_pred = cut_pred[:n_sequences, 0, :]
    
    obj_ix = cut_positions[k]
    
    iso_pred = np.expand_dims(np.sum(cut_pred[:, obj_ix-1:obj_ix+2], axis=-1), axis=0)
    cut_pred = np.expand_dims(cut_pred, axis=0)
    
    pwms.append(optimized_pwm)
    iso_preds.append(iso_pred)
    cut_preds.append(cut_pred)
    
    consensus_seqs.append([])
    onehot_seqs.append([])
    for i in range(n_sequences) :
        consensus_seqs[k].append(get_consensus_sequence(optimized_pwm[0, i, :, :]))
        onehot_seqs[k].append(onehot_encoder(get_consensus_sequence(optimized_pwm[0, i, :, :])))
    
    consensus_seqs[k] = np.expand_dims(np.array(consensus_seqs[k], dtype=np.object), axis=0)
    onehot_seqs[k] = np.expand_dims(onehot_seqs[k], axis=0)
    
    objectives.append(np.zeros((1, n_sequences)) + k)

pwms = np.concatenate(pwms, axis=0)
iso_preds = np.concatenate(iso_preds, axis=0)
cut_preds = np.concatenate(cut_preds, axis=0)
consensus_seqs = np.concatenate(consensus_seqs, axis=0)
onehot_seqs = np.concatenate(onehot_seqs, axis=0)
objectives = np.concatenate(objectives, axis=0)

print('pwms.shape = ' + str(pwms.shape))
print('iso_preds.shape = ' + str(iso_preds.shape))
print('cut_preds.shape = ' + str(cut_preds.shape))
print('consensus_seqs.shape = ' + str(consensus_seqs.shape))
print('onehot_seqs.shape = ' + str(onehot_seqs.shape))
print('objectives.shape = ' + str(objectives.shape))


# In[19]:


flat_onehot_seqs = np.reshape(onehot_seqs, (n_classes * n_sequences, 205 * 4))
flat_objectives = np.ravel(objectives)

obj_poses = cut_positions

flat_labels = np.array(['CSE+' + str(obj_poses[int(k)] - 75) for k in flat_objectives.tolist()], dtype=np.object)

flat_onehot_seqs_opt = flat_onehot_seqs[:, 24*4:124*4]
flat_onehot_seqs_dse = flat_onehot_seqs[:, 75*4:124*4]


# In[36]:


from scipy.stats import pearsonr
import seaborn as sns
#sns.set()

#Uniquely generated sequences per objective
unique_counts = np.array([len(np.unique(consensus_seqs[k, :])) for k in range(n_classes)])

f = plt.figure(figsize=(6, 4))

plt.bar(np.arange(n_classes), unique_counts)#, edgecolor='darkgrey', linewidth=2, color='black'

plt.xticks(np.arange(n_classes), ['CSE+' + str(obj_poses[int(k)] - 75) for k in range(n_classes)], fontsize=14, rotation=45)
plt.yticks(fontsize=14)

plt.xlabel('Objective', fontsize=18)
plt.ylabel('Unique sequences', fontsize=18)

plt.tight_layout()
plt.show()

#Objective cut log odds violin plot
save_figs = False

cut_logodds = [np.log(iso_preds[k, :] / (1.0 - iso_preds[k, :])) for k in range(n_classes)]

for k in range(n_classes) :
    print("Mean Cut Log Odds (CSE+" + str(obj_poses[int(k)] - 75) + ") = " + str(round(np.mean(cut_logodds[k]), 4)))
    print("Std Cut Log Odds (CSE+" + str(obj_poses[int(k)] - 75) + ") = " + str(round(np.std(cut_logodds[k]), 4)))

f = plt.figure(figsize=(6, 4))

sns.violinplot(data=cut_logodds)

plt.xticks(np.arange(n_classes), ['CSE+' + str(obj_poses[int(k)] - 75) for k in range(n_classes)], fontsize=14, rotation=45)
plt.yticks(fontsize=14)

plt.xlabel('Objective', fontsize=18)
plt.ylabel('Objective cut odds (log)', fontsize=18)

plt.tight_layout()

fig_name = 'cleavage_genesis_logodds_violin'
if save_figs :
    plt.savefig(fig_name + '.png', transparent=True, dpi=150)
    plt.savefig(fig_name + '.svg')
    plt.savefig(fig_name + '.eps')

plt.show()

#Average cut profile per objective
cut_preds_prox = cut_preds[:, :, :-1] / np.expand_dims(np.sum(cut_preds[:, :, :-1], axis=-1), axis=-1)

flat_cut_preds = np.reshape(cut_preds_prox, (n_classes * n_sequences, 205))

avg_cut_poses = np.sum(np.arange(205).reshape(1, -1) * flat_cut_preds, axis=-1)
targ_cut_poses = np.array([obj_poses[int(k)] for k in flat_objectives])

avg_pos_r, avg_pos_p_val = pearsonr(avg_cut_poses, targ_cut_poses)
avg_pos_r2 = avg_pos_r**2

print("Avg pos R^2 = " + str(round(avg_pos_r2, 4)) + ", p = " + str(round(avg_pos_p_val, 10)))

avgcut = np.mean(cut_preds[:, :, :-1] / np.expand_dims(np.sum(cut_preds[:, :, :-1], axis=-1), axis=-1), axis=1)

f = plt.figure(figsize=(6, 4))

for k in range(n_classes) :
    plt.plot(np.arange(205-75), avgcut[k, 75:], linewidth=2)

plt.xticks([obj_poses[int(k)] - 75 for k in range(n_classes)], ['CSE+' + str(obj_poses[int(k)] - 75) for k in range(n_classes)], fontsize=14, rotation=45)
plt.yticks(fontsize=14)

plt.xlim(0, 50)

plt.xlabel('Objective', fontsize=18)
plt.ylabel('Objective cut proportion (%)', fontsize=18)

plt.tight_layout()

fig_name = 'cleavage_genesis_average_cut'
if save_figs :
    plt.savefig(fig_name + '.png', transparent=True, dpi=150)
    plt.savefig(fig_name + '.svg')
    plt.savefig(fig_name + '.eps')

plt.show()
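# In[ ]:


#Illustrative sketch (not part of the original notebook): the "average cut position"
#used for the R^2 check above is the expectation of the position index under each
#sequence's proximal-normalized cut distribution. A toy distribution centered on
#position 90 recovers 90 as its expected position.

toy_cut_prob = np.zeros(205)
toy_cut_prob[89:92] = [0.2, 0.6, 0.2]

toy_avg_pos = np.sum(np.arange(205) * toy_cut_prob)
print("Toy expected cut position = " + str(round(toy_avg_pos, 2)))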
# In[10]:


#PCA visualization, full sequence
save_figs = False

print('Starting PCA...')
pca = PCA(n_components=50)
flat_onehot_seqs_pca = pca.fit_transform(flat_onehot_seqs_opt)
print('Done with PCA.')

#tSNE visualization
print('Starting tSNE...')
tsne = TSNE(n_components=2)
flat_onehot_seqs_tsne = tsne.fit_transform(flat_onehot_seqs_pca)
print('Done with tSNE.')

f = plt.figure(figsize = (8, 8))

for label in ['CSE+5', 'CSE+10', 'CSE+15', 'CSE+20', 'CSE+25', 'CSE+30', 'CSE+35', 'CSE+40', 'CSE+45'] :
    plt.scatter(flat_onehot_seqs_tsne[flat_labels == label, 0], flat_onehot_seqs_tsne[flat_labels == label, 1], label=label)

plt.xlabel('tSNE 1', fontsize=18)
plt.ylabel('tSNE 2', fontsize=18)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)

plt.legend(fontsize=14)

plt.tight_layout()

fig_name = 'cleavage_genesis_tsne'
if save_figs :
    plt.savefig(fig_name + '.png', transparent=True, dpi=150)
    plt.savefig(fig_name + '.svg')
    plt.savefig(fig_name + '.eps')

plt.show()


# In[11]:


#PCA visualization, DSE only
save_figs = False

print('Starting PCA...')
pca = PCA(n_components=50)
flat_onehot_seqs_pca = pca.fit_transform(flat_onehot_seqs_dse)
print('Done with PCA.')

#tSNE visualization
print('Starting tSNE...')
tsne = TSNE(n_components=2)
flat_onehot_seqs_tsne = tsne.fit_transform(flat_onehot_seqs_pca)
print('Done with tSNE.')

f = plt.figure(figsize = (8, 8))

for label in ['CSE+5', 'CSE+10', 'CSE+15', 'CSE+20', 'CSE+25', 'CSE+30', 'CSE+35', 'CSE+40', 'CSE+45'] :
    plt.scatter(flat_onehot_seqs_tsne[flat_labels == label, 0], flat_onehot_seqs_tsne[flat_labels == label, 1], label=label)

plt.xlabel('tSNE 1', fontsize=18)
plt.ylabel('tSNE 2', fontsize=18)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)

plt.legend(fontsize=14)

plt.tight_layout()

fig_name = 'cleavage_genesis_tsne_dse_only'
if save_figs :
    plt.savefig(fig_name + '.png', transparent=True, dpi=150)
    plt.savefig(fig_name + '.svg')
    plt.savefig(fig_name + '.eps')

plt.show()


# In[37]:


#Load designed MPRA data (Collapsed over experiment replicates)
seq_dict = isoio.load('~/aparent/data/prepared_data/apa_array_data/apa_array_data_master_seq')

print("n = " + str(len(seq_dict['array_df'])))
print('Avg read count (avg over barcodes) = ' + str(np.mean(seq_dict['array_df'].query("n_barcodes >= 2")['mean_total_count'])))
print('Pooled read count (pooled over barcodes) = ' + str(np.mean(seq_dict['array_df'].query("n_barcodes >= 2")['pooled_total_count'])))


# In[38]:


seq_df = seq_dict['array_df']
seq_cuts = seq_dict['pooled_cuts']


# In[39]:


#Get observed cleavage count matrix
cut_true = np.concatenate([np.array(seq_cuts[:, 180 + 20: 180 + 205].todense()), np.array(seq_cuts[:, -1].todense()).reshape(-1, 1)], axis=-1)

#Add small pseudo count to true cuts
cut_true += 0.0005
cut_true = cut_true / np.sum(cut_true, axis=-1).reshape(-1, 1)

seq_df['cut_prob_true'] = [cut_true[i, :] for i in range(len(seq_df))]


# In[40]:


unique_experiments = seq_df.query("experiment == 'max_cut'")['subexperiment'].unique()

seq_df_filtered = seq_df.query("mean_total_count >= 100 and n_barcodes >= 5")
cut_df_filtered = seq_df.query("experiment == 'max_cut' and mean_total_count >= 10").copy()

#Map and evaluate sequences
def eval_logloss(cut_prob, target_pos) :
    target = np.zeros(cut_prob.shape[0])
    target[target_pos] = 0.8
    target[target_pos - 1] = 0.10
    target[target_pos + 1] = 0.10
    
    ent = -np.dot(target, np.log((cut_prob + 0.001) / np.sum(cut_prob + 0.001)))
    
    return ent

start_pos = 58
end_pos = 108
cut_poses = np.arange(end_pos - start_pos) + (start_pos - 56)

true_cut_column = 'cut_prob_true'

avg_position_true = []
cut_probs_true = []
log_loss = []
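#Quick illustration (not part of the original notebook): eval_logloss defined above scores
#a cleavage profile against a soft target that places 80% of the mass on the intended cut
#position and 10% on each neighbor. An on-target toy profile scores much lower than an
#off-target one. Toy variables only; the real profiles are filled in by the loop below.
toy_window_len = end_pos - start_pos
toy_on_target = np.zeros(toy_window_len)
toy_on_target[22:25] = [0.1, 0.8, 0.1]
toy_off_target = np.zeros(toy_window_len)
toy_off_target[40] = 1.0
print("Toy on-target logloss  = " + str(round(eval_logloss(np.copy(toy_on_target), 23), 4)))
print("Toy off-target logloss = " + str(round(eval_logloss(np.copy(toy_off_target), 23), 4)))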
for index, row in cut_df_filtered.iterrows() :
    cut_prob_true = np.ravel(row[true_cut_column])
    cut_prob_true = cut_prob_true[start_pos:end_pos] / np.sum(cut_prob_true[start_pos:end_pos])
    cut_probs_true.append(cut_prob_true)
    
    avg_pos_true = np.dot(cut_prob_true, (np.arange(cut_prob_true.shape[0]) + start_pos))
    avg_position_true.append(avg_pos_true)
    
    cut_pos_caught = False
    for cut_pos in [60, 65, 70, 75, 80, 85, 90, 95, 100] :
        if str(cut_pos) in row['subexperiment'] :
            log_loss.append(eval_logloss(np.copy(cut_prob_true), cut_pos - 55 - (start_pos - 56)))
            cut_pos_caught = True
            break
    
    if not cut_pos_caught :
        log_loss.append(0)

cut_df_filtered['proxcut_prob_true'] = cut_probs_true
cut_df_filtered['avgpos_true'] = avg_position_true
cut_df_filtered['loss_logloss'] = log_loss

cut_snv_df_filtered = cut_df_filtered.query("variant == 'snv'").copy()
cut_df_filtered = cut_df_filtered.query("variant != 'snv'").copy()

print("Total number of Max Cut sequences = " + str(len(cut_df_filtered)))

df_human = seq_df_filtered.query("experiment == 'clinvar_wt' and variant == 'wt'")

human_cutprobs = np.array(df_human[true_cut_column].values.tolist())[:, start_pos:end_pos]
human_cutprobs = human_cutprobs / np.sum(human_cutprobs, axis=1).reshape(-1, 1)
human_cutprob = np.mean(human_cutprobs, axis=0)

objective_poses = [5, 10, 15, 20, 25, 30, 35, 40, 45]

seq_objective_df = cut_df_filtered.copy().reset_index()[['master_seq', 'experiment', 'subexperiment', 'variant', 'gene', 'cut_prob_true', 'avgpos_true']]


# In[41]:


#Map and collect array data

cut_nt = 'AT'
aruns_objective = ''#_aruns
ggcc_objective = ''#_GGCC

subexperiments = {
    0 : [cut_nt + aruns_objective + '_60', cut_nt + ggcc_objective + aruns_objective + '_ent_60'],#, cut_nt + aruns_objective + '_ent_60'
    1 : [cut_nt + aruns_objective + '_65', cut_nt + ggcc_objective + aruns_objective + '_ent_65'],#, cut_nt + aruns_objective + '_ent_65'],
    2 : [cut_nt + aruns_objective + '_70', cut_nt + ggcc_objective + aruns_objective + '_ent_70'],#, cut_nt + aruns_objective + '_ent_70'],
    3 : [cut_nt + aruns_objective + '_75', cut_nt + ggcc_objective + aruns_objective + '_ent_75'],#, cut_nt + aruns_objective + '_ent_75'],
    4 : [cut_nt + aruns_objective + '_80', cut_nt + ggcc_objective + aruns_objective + '_ent_80'],#, cut_nt + aruns_objective + '_ent_80'],
    5 : [cut_nt + ggcc_objective + aruns_objective + '_85', cut_nt + ggcc_objective + aruns_objective + '_ent_85'],#, cut_nt + ggcc_objective + aruns_objective + '_ent_85'],
    6 : [cut_nt + ggcc_objective + aruns_objective + '_90', cut_nt + ggcc_objective + aruns_objective + '_ent_90'],#, cut_nt + ggcc_objective + aruns_objective + '_ent_90'],
    7 : [cut_nt + ggcc_objective + aruns_objective + '_95', cut_nt + ggcc_objective + aruns_objective + '_ent_95'],#, cut_nt + ggcc_objective + aruns_objective + '_ent_95'],
    8 : [cut_nt + ggcc_objective + aruns_objective + '_100', cut_nt + ggcc_objective + aruns_objective + '_ent_100'],#, cut_nt + ggcc_objective + aruns_objective + '_ent_100'],
}

array_seqs = []
array_onehot_seqs = []
array_cut_preds = []
array_avg_poses = []
array_objectives = []
array_seqtypes = []

onehot_encoder = isol.OneHotEncoder(163)

for k in range(n_classes) :
    print('Collecting sequences for objective ' + str(k) + '...')
    
    sel_df = seq_objective_df.loc[seq_objective_df.subexperiment.isin(subexperiments[k])].copy()
    sel_df['master_seq'] = sel_df.master_seq.str.slice(1, 164)
    
    print('n_sequences = ' + str(len(sel_df)))
    
    array_seqs.append(list(sel_df.master_seq.values))
    array_seqs[k] = np.array(array_seqs[k], dtype=np.object)
    array_onehot_seqs.append([])
    for i in range(len(sel_df)) :
        array_onehot_seqs[k].append(onehot_encoder(array_seqs[k][i]))
    array_onehot_seqs[k] = np.array(array_onehot_seqs[k])
    
    cut_pred = np.array(list(sel_df['cut_prob_true'].values))
    array_cut_preds.append(cut_pred)
    
    avg_pos = np.array(list(sel_df['avgpos_true'].values))
    array_avg_poses.append(avg_pos)
    
    array_objectives.append(np.zeros(len(sel_df)) + k)
    
    array_seqtypes.append([])
    array_seqtypes[k] = np.array(list(sel_df['variant'].values), dtype=np.object)


# In[47]:


#One-hot L1-norm cluster centroid confusion matrix, DSE only
save_figs = False

cluster_means = np.mean(onehot_seqs[:, :, 76:125, :], axis=1).reshape(n_classes, -1)

l2_min_assignments = np.zeros((n_classes, n_classes))

n_correct = 0
n_tot = 0
for k in range(0, n_classes) :
    for i in range(0, array_onehot_seqs[k].shape[0]) :
        l2_distances = np.linalg.norm(cluster_means - array_onehot_seqs[k][i, 56:105, :].reshape(1, -1), ord=1, axis=-1)
        
        l2_min_ix = np.argmin(l2_distances)
        l2_min_assignments[k, l2_min_ix] += 1
        if l2_min_ix == k :
            n_correct += 1
        n_tot += 1

print("Accuracy = " + str(n_correct) + " / " + str(n_tot) + " = " + str(round(n_correct / n_tot, 2)))

l2_min_assignments = l2_min_assignments / np.sum(l2_min_assignments, axis=1).reshape(-1, 1)

f = plt.figure(figsize=(6, 6))

plt.pcolor(l2_min_assignments, cmap='Reds', vmin=0, vmax=1)

plt.xticks(np.arange(n_classes) + 0.5, ['GENESIS+' + str(obj_poses[int(k)] - 75) for k in range(n_classes)], fontsize=14, rotation=45)
plt.yticks(np.arange(n_classes) + 0.5, ['ARRAY+' + str(obj_poses[int(k)] - 75) for k in range(n_classes)], fontsize=14, rotation=45)

plt.title('Mean L1-norm assignments (DSE only)', fontsize=18)
plt.xlabel('GENESIS', fontsize=18)
plt.ylabel('ARRAY', fontsize=18)

plt.tight_layout()

fig_name = 'cleavage_genesis_array_knn'
if save_figs :
    plt.savefig(fig_name + '.png', transparent=True, dpi=150)
    plt.savefig(fig_name + '.svg')
    plt.savefig(fig_name + '.eps')

plt.show()


# In[15]:


from scipy.stats import pearsonr
import seaborn as sns
from matplotlib import colors

flat_pwms_collection = []
for class_i in range(0, n_classes) :
    print("Target cut pos = " + str(obj_poses[class_i] - 75))
    
    flat_pwms = np.zeros((pwms.shape[1], 205))
    for i in range(pwms.shape[1]) :
        for j in range(205) :
            max_nt_ix = np.argmax(pwms[class_i, i, j, :])
            flat_pwms[i, j] = max_nt_ix + 1
    
    flat_pwms = flat_pwms[:20, 70-50: 76+50]#flat_pwms[:20, 70-45: 76+45]
    
    print(flat_pwms.shape)
    
    flat_pwms_collection.append(flat_pwms)

flat_pwms = np.concatenate(flat_pwms_collection[::-1], axis=0)


# In[17]:


#Plot diversity grid
save_figs = False

cmap = colors.ListedColormap(['red', 'blue', 'orange', 'darkgreen'])
bounds=[0, 1, 2, 3, 4, 5]
norm = colors.BoundaryNorm(bounds, cmap.N)

w = flat_pwms.shape[1]

f = plt.figure(figsize=(4, 12))

plt.imshow(flat_pwms[:20 * n_classes, :], aspect='equal', interpolation='nearest', origin='lower', cmap=cmap, norm=norm)

for class_i in range(1, n_classes) :
    plt.plot([0, w], [class_i * 20, class_i * 20], linewidth=2, color='black', linestyle='--')

plt.xticks([], [])
plt.yticks([], [])

plt.xlim(0, w)
plt.ylim(0, 20 * n_classes)

plt.tight_layout()

if save_figs :
    plt.savefig('genesis_apa_cleavage_simple_image_seqs.png', transparent=True, dpi=150)
    plt.savefig('genesis_apa_cleavage_simple_image_seqs.svg')
    plt.savefig('genesis_apa_cleavage_simple_image_seqs.eps')

plt.show()


# In[ ]:
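#Minimal usage sketch (not part of the original notebook): design a handful of new
#sequences for a single chosen objective with the loaded predictor. This re-uses the
#13-output unpacking order and the get_consensus_sequence helper from the cells above;
#the batch size of 36 matches the size the models were built with.

chosen_class = 4  #target cleavage at cut_positions[4] = 100
n_new = 36

new_class = np.tile(np.array([[chosen_class]]), (n_new, 1))
new_noise_1 = np.random.uniform(-1, 1, (n_new, 100))
new_noise_2 = np.random.uniform(-1, 1, (n_new, 100))

new_outputs = predictor.predict([new_class, new_noise_1, new_noise_2], batch_size=36)
_, _, _, new_pwm, _, _, _, _, _, _, new_cut, _, _ = new_outputs

for i in range(5) :
    designed_seq = get_consensus_sequence(new_pwm[i, :, :, 0])
    target_mass = np.sum(new_cut[i, 0, cut_positions[chosen_class]-1:cut_positions[chosen_class]+2])
    #Print the region around the CSE plus the predicted cut mass at the target position
    print(designed_seq[70:126] + " (targeted cut mass = " + str(round(target_mass, 3)) + ")")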