#!/usr/bin/env python
# coding: utf-8

# ## Create _P.generosa_ tissue-specific lncRNA Expression Matrices
#
# Use the lncRNA GTF ([from 20230502](https://robertslab.github.io/sams-notebook/2023/05/02/lncRNA-Identification-P.generosa-lncRNAs-using-CPC2-and-bedtools.html)) to determine lncRNA expression.
#
# #### Notebook relies on:
#
# - [`stringtie`](https://ccb.jhu.edu/software/stringtie/index.shtml?t=manual)
#
# NOTE: This script is a notebook export. It must be run under IPython/Jupyter,
# because `get_ipython()` is only defined there. Each bash cell is passed as
# adjacent string literals (one per shell line) that Python concatenates into
# the exact script text.

# ### List computer specs

# In[1]:


# Record date, OS release, hostname, CPU and memory details for provenance.
get_ipython().run_cell_magic(
    'bash',
    '',
    'echo "TODAY\'S DATE:"\n'
    'date\n'
    'echo "------------"\n'
    'echo ""\n'
    '#Display operating system info\n'
    'lsb_release -a\n'
    'echo ""\n'
    'echo "------------"\n'
    'echo "HOSTNAME: "; hostname \n'
    'echo ""\n'
    'echo "------------"\n'
    'echo "Computer Specs:"\n'
    'echo ""\n'
    'lscpu\n'
    'echo ""\n'
    'echo "------------"\n'
    'echo ""\n'
    'echo "Memory Specs"\n'
    'echo ""\n'
    'free -mh\n'
)


# ### Set variables
# - `%env` indicates a bash variable
#
# - without `%env` is Python variable

# In[3]:


# Set directories
get_ipython().run_line_magic('env', 'transcriptomes_dir=/home/shared/8TB_HDD_01/sam/data/P_generosa/transcriptomes')
get_ipython().run_line_magic('env', 'genomes_dir=/home/shared/8TB_HDD_01/sam/data/P_generosa/genomes')
get_ipython().run_line_magic('env', 'analysis_dir=/home/shared/8TB_HDD_01/sam/analyses/20230504-pgen-lncRNA-expression')
# NOTE(review): this Python variable holds only the directory *name*, while the
# `%env analysis_dir` above is the absolute path. None of the bash cells in this
# script read the Python variable; confirm whether anything else still needs it.
analysis_dir="20230504-pgen-lncRNA-expression"

# CPU threads
get_ipython().run_line_magic('env', 'threads=40')

# Average read length
get_ipython().run_line_magic('env', 'read_length=130')

# Input files
get_ipython().run_line_magic('env', 'lncRNA_gtf=20230502-pgen-lncRNA-IDs.gtf')

## lncRNA directory URL
## https://gannet.fish.washington.edu/Atumefaciens/20230502-pgen-lncRNA-identification/
get_ipython().run_line_magic('env', 'lncRNA_url=gannet:/volume2/web/Atumefaciens/20230502-pgen-lncRNA-identification')

## Tissue-specific sorted BAM files directory URL
## https://gannet.fish.washington.edu/Atumefaciens/20230426-pgen-HISAT2-stringtie-gffcompare-RNAseq/
get_ipython().run_line_magic('env', 'sorted_bams_url=gannet:/volume2/web/Atumefaciens/20230426-pgen-HISAT2-stringtie-gffcompare-RNAseq')

# Output file(s)
get_ipython().run_line_magic('env', 'lncRNA_stringtie_gtf=pgen-lncRNA-stringtie.gtf')

# Set program locations
get_ipython().run_line_magic('env', 'stringtie=/home/shared/stringtie-2.2.1.Linux_x86_64/stringtie')
get_ipython().run_line_magic('env', 'prepDE=/home/shared/stringtie-2.2.1.Linux_x86_64/prepDE.py3')

# Line for formatting
get_ipython().run_line_magic('env', 'line=-------------------------------------------------------------------------------------')


# ## Create analysis directories

# In[4]:


# One sub-directory per sample under ${analysis_dir}; the StringTie cell below
# writes each sample's output into its own sub-directory.
get_ipython().run_cell_magic(
    'bash',
    '',
    '\n'
    'declare -a sample_names_array=(ctenidia gonad heart juvenile larvae)\n'
    '\n'
    'for sample in "${sample_names_array[@]}"\n'
    'do\n'
    ' # Make analysis and data directory, if doesn\'t exist\n'
    ' mkdir --parents "${analysis_dir}/${sample}"\n'
    'done\n'
    '\n'
    'ls -l "${analysis_dir}"\n'
)


# ## Download lncRNA GTF

# In[5]:


# Copy the lncRNA GTF from the remote host into ${transcriptomes_dir}.
get_ipython().run_cell_magic(
    'bash',
    '',
    'cd "${transcriptomes_dir}"\n'
    '\n'
    'rsync "${lncRNA_url}/${lncRNA_gtf}" .\n'
    '\n'
    '\n'
    'ls -ltrh "${lncRNA_gtf}"\n'
)


# ### Inspect GTF

# In[6]:


# Show the head of the lncRNA GTF, then line counts.
# NOTE: the `wc -l` glob counts every *.gtf in ${transcriptomes_dir}, not only
# the lncRNA GTF.
get_ipython().run_cell_magic(
    'bash',
    '',
    'head "${transcriptomes_dir}/${lncRNA_gtf}"\n'
    '\n'
    'echo ""\n'
    'echo "${line}"\n'
    'echo ""\n'
    '\n'
    'echo "Number of lines:"\n'
    'wc -l "${transcriptomes_dir}/"*.gtf\n'
)


# ## Download tissue-specific BAM files

# In[7]:


# Pull each sample's BAM file(s) into ${transcriptomes_dir}/<sample>/.
# The `*.bam` wildcard is quoted so the local shell passes it to rsync
# unexpanded; it is meant to match files on the remote side.
get_ipython().run_cell_magic(
    'bash',
    '',
    'declare -a sample_names_array=(ctenidia gonad heart juvenile larvae)\n'
    'cd "${transcriptomes_dir}"\n'
    '\n'
    'for sample in "${sample_names_array[@]}"\n'
    'do\n'
    ' rsync -avP "${sorted_bams_url}/${sample}/*.bam" ./"${sample}"/\n'
    'done\n'
    '\n'
    '\n'
    'tree -h\n'
)


# ## Run StringTie to calculate expression values

# In[8]:


# Run StringTie once per sample against the lncRNA GTF (-G), timing the whole
# loop. Per the StringTie manual, -e restricts estimation to the reference
# transcripts and -B writes the Ballgown table files (*.ctab); the inspection
# cell below reads t_data.ctab from ${analysis_dir}/<sample>/, next to the
# -o output GTF.
get_ipython().run_cell_magic(
    'bash',
    '',
    'declare -a sample_names_array=(ctenidia gonad heart juvenile larvae)\n'
    'cd "${transcriptomes_dir}"\n'
    '\n'
    'time \\\n'
    'for sample in "${sample_names_array[@]}"\n'
    'do\n'
    ' "${stringtie}" \\\n'
    ' -G "${lncRNA_gtf}" \\\n'
    ' -e "${sample}/${sample}.sorted.bam" \\\n'
    ' -B \\\n'
    ' -o "${analysis_dir}/${sample}/${sample}-${lncRNA_stringtie_gtf}" \\\n'
    ' -p "${threads}"\n'
    'done\n'
)


# ### Inspect Ballgown `t_data.ctab` files

# In[9]:


# Print the first lines of each sample's t_data.ctab as an aligned table.
get_ipython().run_cell_magic(
    'bash',
    '',
    'declare -a sample_names_array=(ctenidia gonad heart juvenile larvae)\n'
    'cd "${analysis_dir}"\n'
    '\n'
    'for sample in "${sample_names_array[@]}"\n'
    'do\n'
    ' head --verbose "${sample}/t_data.ctab" | column -t\n'
    'done\n'
)


# ## Create count matrix

# In[10]:


# Run prepDE from inside ${analysis_dir} with the average read length (-l).
# No input option is given, so prepDE falls back to its default input
# location; presumably that is the current directory's per-sample
# sub-directories -- verify against the prepDE usage text.
get_ipython().run_cell_magic(
    'bash',
    '',
    'cd "${analysis_dir}"\n'
    '\n'
    '"${prepDE}" \\\n'
    '-l "${read_length}"\n'
    '\n'
    'ls -ltrh\n'
)


# ### Inspect transcript counts

# In[11]:


# Preview the transcript count matrix and report its line count.
get_ipython().run_cell_magic(
    'bash',
    '',
    'cd "${analysis_dir}"\n'
    '\n'
    'head transcript_count_matrix.csv | column -t -s ","\n'
    '\n'
    'echo ""\n'
    'echo "${line}"\n'
    'echo ""\n'
    '\n'
    'wc -l transcript_count_matrix.csv\n'
)