#!/usr/bin/env python
# coding: utf-8
"""Prune the raw FASTQ folder down to the samples used by the study.

Notebook export; it must run inside an IPython session because it calls
``get_ipython()``.  The script loads a flotilla study, lists the
``*R1.fastq.gz`` files in ``geo_folder`` and deletes every R1/R2 pair whose
sample ID is not in the index of the study's expression data.
"""

# In[9] (hoisted): a ``from __future__`` import is only legal at the top of a
# module, so it cannot stay in the middle of the exported script.
from __future__ import print_function

import glob
import os

# In[1]:

get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
sns.set(style='ticks', context='paper',
        rc={'font.sans-serif': 'Arial', 'pdf.fonttype': 42})

get_ipython().run_line_magic('matplotlib', 'inline')

# Imported after the inline-backend magic, matching the original cell order.
import flotilla
flotilla_dir = '/projects/ps-yeolab/obotvinnik/flotilla_projects'

study = flotilla.embark(
    'singlecell_pnm_figure1_supplementary_post_splicing_filtering',
    flotilla_dir=flotilla_dir)

not_outliers = study.splicing.singles.index.difference(
    study.splicing.outliers.index)

folder = 'pdfs' #'/home/obotvinnik/Dropbox/figures2/singlecell_pnm/figure2_modalities/bayesian'
# Was ``! mkdir $folder``, which reports an error on every re-run once the
# folder exists.
if not os.path.isdir(folder):
    os.makedirs(folder)


# In[2]:

geo_folder = '/home/obotvinnik/projects/singlecell_pnms/data/geneyeo'


# In[3]:

# Was the bare automagic ``cd $geo_folder``, which is not Python syntax in an
# exported script.
os.chdir(geo_folder)


# In[4]:

r1s = pd.Series(glob.glob('*R1.fastq.gz'), name='R1').to_frame()
r1s.head()


# In[5]:

# Sample ID = the first two underscore-separated fields of the file name.
r1s['sample_id'] = r1s['R1'].map(lambda x: '_'.join(x.split('_')[:2]))
r1s.head()


# In[6]:

study.expression.data.shape


# In[7]:

# One membership mask serves both the "valid" and the "invalid" selection.
is_valid = r1s['sample_id'].isin(study.expression.data.index)
r1s_valid = r1s.loc[is_valid]
print(r1s_valid.shape)
r1s_valid.head()


# In[8]:

r1s_invalid = r1s.loc[~is_valid]
r1s_invalid.shape


# In[10]:

# Was the bare automagic ``ls``.
get_ipython().system('ls')


# In[11]:

# DESTRUCTIVE: deletes the read pairs of samples that are not in the study.
r1_suffix = 'R1.fastq.gz'
r2_suffix = 'R2.fastq.gz'
for r1 in r1s_invalid['R1']:
    print(r1)
    # Swap only the trailing suffix (the glob above guarantees it is there).
    # ``r1.replace('R1', 'R2')`` would also rewrite an 'R1' that occurs
    # inside the sample ID and so point at the wrong mate file.
    r2 = r1[:-len(r1_suffix)] + r2_suffix
    for path in (r1, r2):
        # Was ``! rm -rf $path``: avoids passing unquoted names to a shell.
        # ``lexists`` keeps the "ignore missing files" behaviour of ``-f``
        # and still removes dangling symlinks.  Unlike ``rm -rf`` this will
        # not recursively delete a directory.
        if os.path.lexists(path):
            os.remove(path)


# In[12]:

# Was the bare automagic ``pwd``.
print(os.getcwd())


# In[13]:

# cd ..
# In[14]:

# cd olga.botvinnik@gmail.com/


# In[15]:

# Imports are repeated here so this section can run on its own; the cell
# counter restarts at In[2] further down, i.e. it was run in a fresh session.
import glob
import hashlib
import os

import pandas as pd


def _md5sum(path, chunk_size=1 << 20):
    """Return the hexadecimal MD5 digest of the file at ``path``.

    The file is read in ``chunk_size``-byte pieces so that large archives
    are never held in memory as a whole.
    """
    digest = hashlib.md5()
    with open(path, 'rb') as handle:
        for chunk in iter(lambda: handle.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()


all_fastqs = pd.Series(glob.glob('*.fastq.gz'), name='fastq').to_frame()
all_fastqs = all_fastqs.sort_values('fastq')
all_fastqs.head()


# In[16]:

# Exploratory probe of the ``md5sum`` output format on one file.
md5 = get_ipython().getoutput(' md5sum CVN_01_R1.fastq.gz')


# In[17]:

md5[0].split()


# In[19]:

r2s = pd.Series(glob.glob('*R2.fastq.gz')).to_frame()
r2s.head()


# In[18] + In[20]:

# Was: pre-fill the column with None, then one ``! md5sum $filename`` shell
# call per row inside ``iterrows``.  That passed unquoted file names to a
# shell and took ``md5[0].split()[0]`` from whatever the shell printed,
# including an error message.  Hashing in-process raises on an unreadable
# file instead of recording part of an error message as its checksum.
all_fastqs['md5'] = all_fastqs['fastq'].map(_md5sum)
all_fastqs.head()


# In[ ]:

all_fastqs.to_csv('fastq_md5.csv', index=False)


# In[ ]:

all_fastqs.to_clipboard()


# ## Get checksums of processed data

# In[2]:

# Was the bare automagic ``cd <path>``, which is not Python syntax in an
# exported script.
os.chdir('/projects/ps-yeolab/obotvinnik/flotilla_projects/singlecell_pnm_figure4_voyages/')


# In[3]:

get_ipython().system(' md5sum *csv')


# ## Make soft links to everything

# In[5]:

# ``os.chdir`` does not expand ``~`` the way the ``cd`` magic does.
os.chdir(os.path.expanduser('~/projects/singlecell_pnms/data/geneyeo/'))


# In[6]:

get_ipython().system(' ln -s /projects/ps-yeolab/obotvinnik/flotilla_projects/singlecell_pnm_figure4_voyages/*csv .')


# In[8]:

get_ipython().system(' ln -s /projects/ps-yeolab/genomes/hg19/hg19_phastcons_placental_mammal.wig .')


# In[9]:

get_ipython().system(' md5sum hg19_phastcons_placental_mammal.wig')


# In[10]:

get_ipython().system(' ln -s /projects/ps-yeolab/obotvinnik/singlecell_pnms/csvs_for_paper/splicing_feature_data/alternative/exons.bed alternative_exons.bed')


# In[11]:

get_ipython().system(' ln -s /projects/ps-yeolab/obotvinnik/singlecell_pnms/csvs_for_paper/splicing_feature_data/constitutive/exons.bed constitutive_exons.bed')


# In[12]:

get_ipython().system(' md5sum *.bed')


# In[13]:

# Count the entries now present in the folder.
get_ipython().system(' ls -1 | wc -l')


# In[ ]: