#!/usr/bin/env python
# coding: utf-8

# # 1. Get data & packages

# Data was downloaded from Globus (globus.org). It was sent in two zipped files, one from each lane, containing separate files for each sample (demultiplexed).
#
# I then checked that the md5 values were the same using `md5 [filename].tar.gz >> [filename].md5` (this appended the new value to the .md5 file, so I could compare).
#
# ### A note on data accessibility & my working environment
#
# I originally downloaded data to Ostrich, thinking that I could work on that computer remotely using Remote Desktop and Jupyter Notebook. However, my internet connection is too slow to work productively that way. I therefore also downloaded the data to my external hard drive, and worked locally. This also allows me to use all the packages that I had installed on my personal computer in 2018 (which is helpful).
#
# Canonical versions of the data are saved to Owl/nightingales in zipped format here: [nightingales/O_lurida/2020-04-21_QuantSeq-data/](http://owl.fish.washington.edu/nightingales/O_lurida/2020-04-21_QuantSeq-data/) and as individual fastq files per sample here [nightingales/O_lurida/](http://owl.fish.washington.edu/nightingales/O_lurida/), and to Gannet as individual fastq files per sample here [Atumefaciens/20200426_olur_fastqc_quantseq/](https://gannet.fish.washington.edu/Atumefaciens/20200426_olur_fastqc_quantseq/).
#
# Sam also ran MultiQC on my samples; check out his [notebook entry](https://robertslab.github.io/sams-notebook/2020/04/26/FastQC-MultiQC-Laura-Spencer's-QuantSeq-Data.html), and the [MultiQC report](https://gannet.fish.washington.edu/Atumefaciens/20200426_olur_fastqc_quantseq/multiqc_report.html).

# ### Create path variables, i.e. shortcuts to certain directories
#
# To start, create some variables for commonly accessed paths. NOTE: many of the steps in this workflow require me to be located within a specific directory to access files. So, while I try to use these path variables, I often have to hard-code my paths.

# In[11]:

# create path variable to raw data, saved on my external hard drive
workingdir = "/Volumes/Bumblebee/O.lurida_QuantSeq-2020/"

# In[2]:

cd {workingdir}

# In[3]:

# create path variable to fastqc directory
fastqc = "/Applications/bioinformatics/FastQC/"

# In[5]:

# test fastqc
get_ipython().system(' {fastqc}fastqc --version')

# ### I installed MultiQC using git clone via the following:
#
#     git clone https://github.com/ewels/MultiQC.git
#     cd MultiQC
#     pip install .

# In[6]:

# test multiqc
get_ipython().system(' multiqc --version')

# ### Install Cutadapt
#
# The [Cutadapt program](https://cutadapt.readthedocs.io/en/stable/installation.html) is used in the tagseq processing pipeline. I installed `cutadapt` using `python3 -m pip install --user --upgrade cutadapt`. During install, I received this warning:
#
#     WARNING: The script cutadapt is installed in '/Users/laura/.local/bin' which is not on PATH.
#     Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.
#
# So, I added that path to my PATH using the following in Terminal:
# `PATH=$PATH:/Users/laura/.local/bin`
#
# For some reason Jupyter Notebook doesn't recognize `cutadapt`, even though I can access it via the Terminal.
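# A possible workaround for future reference (a minimal sketch, not run as part of this workflow): each `!` command runs in its own subshell, so exporting PATH that way doesn't persist between cells, but updating `os.environ` from Python does persist for later `!` calls in the same kernel. This assumes cutadapt was installed to /Users/laura/.local/bin (per the pip warning above).

# In[ ]:

import os

# prepend the pip --user bin directory so that subsequent shell (!) calls can find cutadapt
os.environ["PATH"] = "/Users/laura/.local/bin" + os.pathsep + os.environ["PATH"]

# check whether the shell spawned by IPython now sees cutadapt
get_ipython().system(' cutadapt --version')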
# In[109]:

# Try to add the path here
get_ipython().system(' PATH=$PATH:/Users/laura/.local/bin')

# In[113]:

# Still doesn't work
get_ipython().system(' cutadapt --version')

# In[7]:

# Hard-coding the cutadapt path works, though
get_ipython().system(' /Users/laura/.local/bin/cutadapt --version')

# In[10]:

# test fastq_quality_filter (see if it's correctly added to my PATH)
get_ipython().system(' fastq_quality_filter -h')

# ## Unpack raw data
#
# Currently, the data is still zipped (that's how it arrived via Globus). I thus need to extract (tar/gunzip) the lane files before I can gunzip the individual library files.

# In[12]:

cd raw-data/

# In[15]:

ls

# In[21]:

# check md5 for Batch1_69plex_lane1.tar.gz against the md5 that the sequencing facility sent
get_ipython().system(' cat Batch1_69plex_lane1.md5')
get_ipython().system(' md5 Batch1_69plex_lane1.tar.gz')

# In[20]:

# check md5 for Batch2_77plex_lane2_tar.gz against the md5 that the sequencing facility sent
get_ipython().system(' cat Batch2_77plex_lane2_md5')
get_ipython().system(' md5 Batch2_77plex_lane2_tar.gz')

# In[22]:

# extract batch/lane 1 data
get_ipython().system(' gunzip -c Batch1_69plex_lane1.tar.gz | tar xopf -')

# In[23]:

# extract batch/lane 2 data
get_ipython().system(' gunzip -c Batch2_77plex_lane2_tar.gz | tar xopf -')

# In[24]:

# check out resulting file structure
get_ipython().system(' ls')

# In[25]:

get_ipython().system(' ls Batch1_69plex_lane1_done/')

# ### Move to each batch's directory containing demultiplexed library files, and gunzip all fastq files in that folder

# In[26]:

cd Batch1_69plex_lane1_done/

# In[27]:

get_ipython().system(' gunzip *.fastq.gz')

# In[28]:

cd ../Batch2_77plex_lane2_done/

# In[29]:

get_ipython().system(' gunzip *.fastq.gz')

# In[30]:

# Check out contents after gunzip
get_ipython().system(' ls')

# # 2. Initial QC

# In[45]:

get_ipython().system(' mkdir {workingdir}qc-processing/fastqc/')

# In[31]:

get_ipython().system(' mkdir {workingdir}qc-processing/fastqc/untrimmed/')

# In[49]:

# test fastqc on one sample file
get_ipython().system(' {fastqc}fastqc 506_S47_L002_R1_001.fastq --outdir ../fastqc/untrimmed/')

# ### Run fastqc on all .fastq files in Batch/Lane 2, the larval data (current directory)

# In[32]:

get_ipython().system(' {fastqc}fastqc *.fastq --outdir {workingdir}qc-processing/fastqc/untrimmed/ --quiet')

# In[33]:

# check out resulting fastqc files
get_ipython().system(' ls {workingdir}qc-processing/fastqc/untrimmed/')

# ### Run fastqc on all fastq files in Batch/Lane 1, the adult ctenidia + juvenile samples

# In[34]:

cd {workingdir}raw-data/Batch1_69plex_lane1_done/

# In[35]:

get_ipython().system(' {fastqc}fastqc *.fastq --outdir {workingdir}qc-processing/fastqc/untrimmed/ --quiet')

# ### Generate a MultiQC report on all untrimmed data files

# In[36]:

get_ipython().system(' multiqc {workingdir}qc-processing/fastqc/untrimmed/ --filename {workingdir}qc-processing/fastqc/untrimmed/multiqc_report_untrimmed.html')

# ### Inspect MultiQC report

# In[38]:

import IPython
IPython.display.HTML(filename='/Volumes/Bumblebee/O.lurida_QuantSeq-2020/qc-processing/fastqc/untrimmed/multiqc_report_untrimmed.html')

# # 3. Trim & filter
#
# ### What adapters do I need to trim off?
#
# According to the QuantSeq FAQ section ["1.13 What sequence should be trimmed?"](https://www.lexogen.com/quantseq-3mrna-sequencing/#c873a9746c24c41e7):
#
# _The reads should be trimmed to remove adapter sequences, poly(A) / poly(T) sequences, and low quality nucleotides.
# Reads that are too short (i.e., <20 nt) or have generally low quality scores should be removed from the set._
#
# _As second strand synthesis is based on random priming, there may be a higher proportion of errors at the first nucleotides of the insert due to non-specific hybridization of the random primer to the cDNA template. For QuantSeq FWD data we therefore recommend using an aligner that can perform soft-clipping of the read ends (e.g., STAR aligner) during alignment, or increasing the number of allowed mismatches to 14. Alternatively, trimming the first 12 nt of Read 1 can be performed prior to alignment when using a more stringent aligner (e.g., HISAT2). While trimming the read can decrease the number of reads of suitable length for alignment, the absolute number of mapping reads may increase due to the improved read quality._
#
# As a reminder, here is how the QuantSeq libraries were generated:
# ![read_orientation_schematic](https://github.com/fish546-2018/laura-quantseq/blob/master/references/QuantSeq_library_generation_schematic.png?raw=true)
#
# It looks like I added Illumina sequencing adapters (green) during library prep. This is confirmed in the protocol, which states that during the **First Strand cDNA Synthesis (reverse transcription) step**, "An oligodT primer containing an Illumina-compatible sequence at its 5’ end is hybridized to the RNA and reverse transcription is performed." It's not clear exactly what the Illumina-compatible sequence is, but the manual does provide this adapter sequence:
#
# "Read 1: Multiplexing Read 1 Sequencing Primer (not supplied): 5’ ACACTCTTTCCCTACACGACGCTCTTCCGATCT 3’"
#
# ### Universal Illumina Adapter sequence (according to fastqc)
#
# The following adapter sequences are listed in the 'FastQC/Configuration/adapter_list.txt' file:
# - Illumina Universal Adapter: AGATCGGAAGAG
# - Illumina Small RNA 3' Adapter: TGGAATTCTCGG
# - Illumina Small RNA 5' Adapter: GATCGTCGGACT
# - Nextera Transposase Sequence: CTGTCTCTTATA
# - SOLID Small RNA Adapter: CGCCTTGGCCGT
#
# ### Explore data: do I need to remove indices?
#
# Each entry in a FASTQ file consists of four lines:
# 1. Sequence identifier
# 2. Sequence
# 3. Quality score identifier line (consisting only of a +)
# 4. Quality score
#
# First, I will check out a few fastq files - do reads all have the index number included in the sequence, or have those been omitted?

# In[41]:

# Sample 34 index is GACATC.
get_ipython().system(' head -n 10 {workingdir}/raw-data/Batch2_77plex_lane2_done/34_S68_L002_R1_001.fastq')

# In[42]:

# Sample 302 index is GTCAGG.
get_ipython().system(' head -n 10 {workingdir}/raw-data/Batch1_69plex_lane1_done/302_S15_L001_R1_001.fastq')

# #### Looks like all index sequences are included in the Sequence Identifier line for each read (line #1), but not in the actual sequences. Good - now I need to figure out which Illumina adapter sequences I need to trim.
#
# ### Poly-A tails, etc.
#
# I inspected some fastq files to see what is happening at the 3' end of my reads. There are lots of poly-A tails of up to ~18 bp, but I also found many poly-G tails.
#
# ![poly tails](https://github.com/fish546-2018/laura-quantseq/blob/master/notebooks/screen-shots/2020-05-07_untrimmed-reads-poly-tails.png?raw=true)
#
# The Cutadapt poly-N trimming is quite flexible. From their manual:
#
# _If you want to trim a poly-A tail from the 3’ end of your reads, use the 3’ adapter type (-a) with an adapter sequence of many repeated A nucleotides.
# Starting with version 1.8 of Cutadapt, you can use the following notation to specify a sequence that consists of 100 A:_
#
#     cutadapt -a "A{100}" -o output.fastq input.fastq
#
# _This also works when there are sequencing errors in the poly-A tail. So this read_
#
#     TACGTACGTACGTACGAAATAAAAAAAAAAA
#
# _will be trimmed to:_
#
#     TACGTACGTACGTACG
#
# _If for some reason you would like to use a shorter sequence of A, you can do so: The matching algorithm always picks the leftmost match that it can find, so Cutadapt will do the right thing even when the tail has more A than you used in the adapter sequence. However, sequencing errors may result in shorter matches than desired. For example, using -a "A{10}", the read above (where the AAAT is followed by eleven A) would be trimmed to:_
#
#     TACGTACGTACGTACGAAAT

# ## Adapter trimming, deduplicating, and quality filtering
#
# Run `cutadapt` to trim reads for 1) poly-A tails, 2) poly-G tails, and 3) Illumina universal adapter sequences. I also filter for quality 15 and greater, and for read lengths 20 and greater.
#
# Note: I am not hard-trimming the first 12 bp. That is one of the QuantSeq manual's recommended options, but only if I don't use a soft-clipping option (which I do, with Bowtie2 using the --local setting). Also, in my [2020-QuantSeq-Testing-Pipelines jupyter notebook](https://nbviewer.jupyter.org/github/laurahspencer/O.lurida_QuantSeq-2020/blob/master/notebooks/2020-QuantSeq-Testing-Pipelines.ipynb) I tested a pipeline that compared alignment rates with and without hard-trimming. The highest alignment rate was achieved by NOT hard-trimming (and using local alignment).

# In[44]:

# start by trimming batch 1
get_ipython().system(' cd {workingdir}/raw-data/Batch1_69plex_lane1_done/')

# In[47]:

get_ipython().system(' ls | head')

# In[48]:

mkdir {workingdir}qc-processing/cutadapt/

# In[56]:

pwd

# In[62]:

get_ipython().run_cell_magic('bash', '', '\n# create output file names (with location in relative path)\nfor file in *_L001_R1_001.fastq\ndo\nsample="$(basename -a $file | cut -d "_" -f 1)"\ntrimmed_file="$sample.trim.fastq"\n\n# run cutadapt on each file\n/Users/laura/.local/bin/cutadapt $file -a A{8} -a G{8} -a AGATCGG -q 15 -m 20 \\\n-o ../../qc-processing/cutadapt/$trimmed_file\ndone\n')

# In[63]:

# Move to the other batch to trim
cd {workingdir}raw-data/Batch2_77plex_lane2_done/

# In[64]:

get_ipython().run_cell_magic('bash', '', '\n# create output file names (with location in relative path)\nfor file in *_L002_R1_001.fastq\ndo\nsample="$(basename -a $file | cut -d "_" -f 1)"\ntrimmed_file="$sample.trim.fastq"\n\n# run cutadapt on each file\n/Users/laura/.local/bin/cutadapt $file -a A{8} -a G{8} -a AGATCGG -q 15 -m 20 \\\n-o ../../qc-processing/cutadapt/$trimmed_file\ndone\n')

# In[66]:

# Check the total number of trimmed files
get_ipython().system(' ls {workingdir}qc-processing/cutadapt/ | wc -l')

# ## Assess reads after trimming

# In[67]:

get_ipython().system(' mkdir {workingdir}qc-processing/fastqc/trimmed/')

# In[68]:

get_ipython().system('{fastqc}fastqc {workingdir}qc-processing/cutadapt/*.fastq --outdir {workingdir}qc-processing/fastqc/trimmed/ --quiet')

# In[69]:

get_ipython().system(' multiqc {workingdir}qc-processing/fastqc/trimmed/ --filename {workingdir}qc-processing/fastqc/trimmed/multiqc_report_trimmed.html')

# In[70]:

import IPython
IPython.display.HTML(filename='/Volumes/Bumblebee/O.lurida_QuantSeq-2020/qc-processing/fastqc/trimmed/multiqc_report_trimmed.html')

# ### Count total no. reads before and after trimming
# In[96]:

# Before trimming
get_ipython().system(' wc -l {workingdir}raw-data/Batch1_69plex_lane1_done/*.fastq {workingdir}raw-data/Batch2_77plex_lane2_done/*.fastq > {workingdir}raw-data/read-count-untrimmed.txt')

# In[97]:

get_ipython().system(' cat {workingdir}raw-data/read-count-untrimmed.txt')

# In[98]:

# After trimming
get_ipython().system(' wc -l {workingdir}qc-processing/cutadapt/*.fastq > {workingdir}qc-processing/cutadapt/read-count-trimmed.txt')
get_ipython().system(' cat {workingdir}qc-processing/cutadapt/read-count-trimmed.txt')

# ## 95.44% of reads remain after trimming and filtering
#
# - 3,915,145,660 lines after trimming
# - 4,102,334,876 lines before trimming
#
# (Note: `wc -l` reports line counts, and each FASTQ read spans 4 lines, so divide these totals by 4 for read counts; the percentage retained is the same either way.)

# # 4. Align reads

# ### Download O. lurida genome
# (If needed)

# In[88]:

get_ipython().system(' curl http://owl.fish.washington.edu/halfshell/genomic-databank/Olurida_v081.fa > {workingdir}/references/Olurida_v081.fa')

# In[5]:

# MD5 should = 3ac56372bd62038f264d27eef0883bd3
get_ipython().system(' md5 {workingdir}references/Olurida_v081.fa')

# ## Align reads using Bowtie2
#
# See the [Bowtie2 manual](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml) for details.
#
# Updated Bowtie2 using homebrew:
#
#     brew upgrade bowtie2
#
# Version before update: 2.3.4.3
# Version after update: 2.4.1

# In[74]:

# make sure bowtie2 is in my PATH and working
get_ipython().system(' bowtie2 --version')

# In[30]:

get_ipython().system(' mkdir {workingdir}qc-processing/bowtie/')

# In[75]:

cd {workingdir}qc-processing/bowtie/

# In[262]:

### creating bowtie2 index for Oly genome v081:
get_ipython().system(' bowtie2-build {workingdir}references/Olurida_v081.fa Olurida_v081.fa')

# In[78]:

ls {workingdir}qc-processing/bowtie/

# In[77]:

get_ipython().system(' mkdir {workingdir}qc-processing/bowtie/mapped/')

# ### Candidate options from the Bowtie2 manual
#
# ### Local alignment
#
# When the `--local` option is specified, Bowtie 2 performs local read alignment. In this mode, Bowtie 2 might “trim” or “clip” some read characters from one or both ends of the alignment if doing so maximizes the alignment score.
#
# The following is a “local” alignment because some of the characters at the ends of the read do not participate. In this case, 4 characters are omitted (or “soft trimmed” or “soft clipped”) from the beginning and 3 characters are omitted from the end. This sort of alignment can be produced by Bowtie 2 only in local mode.
#
#     Read:      ACGGTTGCGTTAATCCGCCACG
#     Reference: TAACTTGCGTTAAATCCGCCTGG
#
#     Alignment:
#     Read:      ACGGTTGCGTTAA-TCCGCCACG
#                    ||||||||| ||||||
#     Reference: TAACTTGCGTTAAATCCGCCTGG
#
# ### Distinct alignments map a read to different places
#
# Two alignments for the same individual read are “distinct” if they map the same read to different places. Default mode: search for multiple alignments, report the best one. **In -k mode, Bowtie 2 searches for up to N distinct, valid alignments for each read**, where N equals the integer specified with the -k parameter. That is, if -k 2 is specified, Bowtie 2 will search for at most 2 distinct alignments. It reports all alignments found, in descending order by alignment score.
#
# Bowtie 2 does not “find” alignments in any specific order, so for reads that have more than N distinct, valid alignments, Bowtie 2 does not guarantee that the N alignments reported are the best possible in terms of alignment score.
# Still, this mode can be effective and fast in situations where the user cares more about whether a read aligns (or aligns a certain number of times) than where exactly it originated.
#
# ### Using multiple cores
#
# If your computer has multiple processors/cores, use `-p/--threads N`.
#
# The `-p/--threads` option causes Bowtie 2 to launch a specified number of parallel search threads. Each thread runs on a different processor/core and all threads find alignments in parallel, increasing alignment throughput by approximately a multiple of the number of threads (though in practice, speedup is somewhat worse than linear).
#
# _note: my computer is a quad-core, so I can use up to 8 threads_
#
# `-x`: The basename of the index for the reference genome.
# `-U`: Comma-separated list of files containing unpaired reads to be aligned, e.g. lane1.fq,lane2.fq,lane3.fq,lane4.fq.
# `-S`: File to write SAM alignments to. By default, alignments are written to the “standard out” or “stdout” filehandle (i.e. the console).
# `-q`: Reads are FASTQ files. FASTQ files usually have extension .fq or .fastq. FASTQ is the default format.
# `--trim5`: Trim N bases from 5’ (left) end of each read before alignment (default: 0).
# `--trim3`: Trim N bases from 3’ (right) end of each read before alignment (default: 0).
#
# ### Presets: setting many settings at once
#
# Bowtie 2 comes with some useful combinations of parameters packaged into shorter “preset” parameters. For example, running Bowtie 2 with the --very-sensitive option is the same as running with options: -D 20 -R 3 -N 0 -L 20 -i S,1,0.50. The preset options that come with Bowtie 2 are designed to cover a wide area of the speed/sensitivity/accuracy trade-off space, with the presets ending in fast generally being faster but less sensitive and less accurate, and the presets ending in sensitive generally being slower but more sensitive and more accurate. See the documentation for the preset options for details.
#
# #### Preset options in `--local` mode
# `--very-fast-local` - Same as: -D 5 -R 1 -N 0 -L 25 -i S,1,2.00
# `--fast-local` - Same as: -D 10 -R 2 -N 0 -L 22 -i S,1,1.75
# `--sensitive-local` - Same as: -D 15 -R 2 -N 0 -L 20 -i S,1,0.75 (default in --local mode)
# `--very-sensitive-local` - Same as: -D 20 -R 3 -N 0 -L 20 -i S,1,0.50
#
# ##### What the preset parameters mean
# `-D` - Up to N consecutive seed extension attempts can “fail” before Bowtie 2 moves on, using the alignments found so far. A seed extension “fails” if it does not yield a new best or a new second-best alignment. This limit is automatically adjusted up when -k or -a are specified.
#
# `-R` - The maximum number of times Bowtie 2 will “re-seed” reads with repetitive seeds. When “re-seeding,” Bowtie 2 simply chooses a new set of reads (same length, same number of mismatches allowed) at different offsets and searches for more alignments. A read is considered to have repetitive seeds if the total number of seed hits divided by the number of seeds that aligned at least once is greater than 300.
#
# `-N` - No. mismatches allowed in seed alignment.
#
# `-L` - Sets the length of the seed substrings to align during multiseed alignment. Smaller values make alignment slower but more sensitive.
#
# `-i S,1,0.75` - Sets a function governing the interval between seed substrings to use during multiseed alignment.
# For instance, if the read has 30 characters, and seed length is 10, and the seed interval is 6, the seeds extracted will be:
#
#     Read:      TAGCTACGCTCTACGCTATCATGCATAAAC
#     Seed 1 fw: TAGCTACGCT
#     Seed 1 rc: AGCGTAGCTA
#     Seed 2 fw: CGCTCTACGC
#     Seed 2 rc: GCGTAGAGCG
#     Seed 3 fw: ACGCTATCAT
#     Seed 3 rc: ATGATAGCGT
#     Seed 4 fw: TCATGCATAA
#     Seed 4 rc: TTATGCATGA
#
# Since it’s best to use longer intervals for longer reads, this parameter sets the interval as a function of the read length, rather than a single one-size-fits-all number.
#
# ### SAM output
#
# The following settings were recommended by the TagSeq pipeline:
#
#     --no-unal = suppress SAM records for unaligned reads
#     --no-sq = suppress @SQ header lines
#     --no-hd = suppress SAM header lines (starting with @)
#
# When I aligned my preliminary QuantSeq data I used the `--no-unal` option, so I'll do that again here. That will also make the downstream processing step easier (I don't have to filter out unaligned reads).

# In[91]:

# confirm that current directory is bowtie/
get_ipython().system(' pwd')

# ## Run Bowtie on trimmed reads, preset option = `--sensitive-local`

# In[92]:

get_ipython().run_cell_magic('bash', '', '\nfor file in ../cutadapt/*.trim.fastq\ndo\nsample="$(basename -a $file | cut -d "." -f 1)"\nmap_file="$sample.bowtie.sam"\n\n# run Bowtie2 on each file\nbowtie2 \\\n--local \\\n-x Olurida_v081.fa \\\n--sensitive-local \\\n--threads 8 \\\n--no-unal \\\n-k 5 \\\n-U $file \\\n-S mapped/$map_file; \\\ndone >> mapped/bowtieout.txt 2>&1\n')

# In[94]:

get_ipython().system(' cat {workingdir}qc-processing/bowtie/mapped/bowtieout.txt')

# In[99]:

# alignment rates:
get_ipython().system(' grep "overall alignment rate" {workingdir}qc-processing/bowtie/mapped/bowtieout.txt')

# ### External hard drive may have run out of space at the very end of the Bowtie run. Re-align sample 571 (2nd-to-last sample that ran, had 32.23% overall alignment - the last one is the "undetermined" reads)

# In[101]:

# run Bowtie2 on 571 again to see if the file size changes
get_ipython().system(' bowtie2 --local -x Olurida_v081.fa --sensitive-local --threads 8 --no-unal -k 5 -U {workingdir}qc-processing/cutadapt/571.trim.fastq -S {workingdir}qc-processing/bowtie/mapped/571.bowtie.sam')

# ### Same alignment rate as the full run. Good to move forward with .sam --> .bam conversion

# ### Convert .sam files to .bam files & sort

# In[104]:

cd {workingdir}

# In[105]:

# Create new bowtie/ subdirectory for sorted .bam files
mkdir {workingdir}qc-processing/bowtie/mapped-sorted/

# In[121]:

get_ipython().run_cell_magic('bash', '', '\nfor file in qc-processing/bowtie/mapped/*.bowtie.sam\ndo\nsample="$(basename -a $file | cut -d "." -f 1)"\nsorted_file="$sample.sorted.bam"\nsamtools view -b $file | samtools sort -o qc-processing/bowtie/mapped-sorted/$sorted_file\ndone\n')

# # Deduplicate reads (optional)
#
# After running the entire pipeline and analyzing the juvenile samples with and without deduplicating, I have decided to deduplicate reads: I saw only slight differences in the number of DEGs among groups with vs. without deduplicating, and when I spot-checked DEGs in IGV, the reads that had not been deduplicated clearly had some PCR duplication (tons of reads stacked up at exactly the same start & stop). Here I will use the `Picard` tool within the `gatk` tool library to deduplicate (since I already have that program, which I use to identify SNPs).
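# Once MarkDuplicates (below) has run, the per-sample duplication levels can be pulled out of the metrics files it writes. This is a minimal sketch (not part of the original run), assuming the standard Picard metrics layout (a '#'-commented header followed by one metrics row) and the output paths used in the next cells.

# In[ ]:

import glob
import os
import pandas as pd

# collect PERCENT_DUPLICATION from each sample's Picard metrics file
dup_rates = {}
for path in sorted(glob.glob("../qc-processing/bowtie/deduplicated/*.dup_metrics.txt")):
    sample = os.path.basename(path).split(".")[0]
    # skip the '#'-prefixed header lines; the first remaining row holds the metrics
    metrics = pd.read_csv(path, sep="\t", comment="#", nrows=1)
    dup_rates[sample] = float(metrics["PERCENT_DUPLICATION"].iloc[0])

# show the ten most duplicated libraries
pd.Series(dup_rates, name="percent_duplication").sort_values(ascending=False).head(10)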
# In[2]:

# Create directory to house deduplicated .bam files
get_ipython().system(' mkdir ../qc-processing/bowtie/deduplicated/')

# In[4]:

get_ipython().run_cell_magic('bash', '', '\nfor file in ../qc-processing/bowtie/mapped-sorted/*.bam\ndo\nsample="$(basename -a $file | cut -b 1,2,3)"\ndedup_file="$sample.dedup.bam"\n\n# deduplicate using picard, output will have duplicates removed \n/Applications/bioinformatics/gatk-4.1.9.0/gatk MarkDuplicates \\\nI=$file \\\nO="../qc-processing/bowtie/deduplicated/$dedup_file" \\\nM="../qc-processing/bowtie/deduplicated/$sample.dup_metrics.txt" \\\nREMOVE_DUPLICATES=true\ndone >> "../qc-processing/bowtie/deduplicated/dedup_stout.txt" 2>&1\n')

# In[5]:

get_ipython().system(' cat ../qc-processing/bowtie/deduplicated/dedup_stout.txt')

# ## Create bam indices

# In[7]:

get_ipython().run_cell_magic('bash', '', '# create .bam indexes\nfor file in ../qc-processing/bowtie/deduplicated/*.bam\ndo\nsamtools index $file\ndone\n')

# # 5. Generate counts
#
# - Use `featureCounts` on aligned reads (from Bowtie2) to count the number of reads that uniquely map to each gene. Works on BAM/SAM alignment files.

# ## featureCounts
#
# Using [this tutorial](https://hbctraining.github.io/Intro-to-rnaseq-hpc-O2/lessons/05_counting_reads.html) as a guide to generate counts using `featureCounts`.
#
# #### Notes:
# First I need to install the `subread` package, which contains `featureCounts`. Following [these installation directions](http://bioinf.wehi.edu.au/subread-package/) I downloaded the subread v2.0.0 binary for MacOS from SourceForge, then untarred the contents into my Applications/bioinformatics directory on my laptop, and renamed the folder to "subread" (from "subread-2.0.0-MacOS-x86_64"). The executables are in the bin/ directory.
#
# Multi-mapping reads: A multi-mapping read is a read that maps to more than one location in the reference genome. By default featureCounts does not count these, and the authors recommend that RNASeq experiments do not count them. However, I could try specifying `-M --fraction` to count each alignment fractionally (each alignment carries a 1/x count, where x is the total number of alignments reported for the read); a sketch of that alternative appears after the options list below.
#
# Options:
#
# `-T`: Specify the number of threads.
# `-t`: Specify the feature type. Only rows which have the matched feature type in the provided GTF annotation file will be included for read counting. 'exon' by default.
# `-g`: Specify the attribute type used to group features (e.g. exons) into meta-features (e.g. genes) when GTF annotation is provided. 'gene_id' by default. This attribute type is usually the gene identifier. This argument is useful for meta-feature-level summarization.
# `-a`: Provide the name of an annotation file.
# `-o`: Give the name of the output file. The output file contains the number of reads assigned to each meta-feature (or each feature if -f is specified). Note that the featureCounts function in Rsubread does not use this parameter; it returns a list object including read summarization results and other data.
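# If I do want to include multi-mapping reads, the fractional-counting alternative mentioned above would look roughly like the cell below. This is a sketch only (not run in this workflow); it assumes the `featureCounts` path variable defined in the next section, the 2kb-extended gene GFF generated below, and a placeholder output file name.

# In[ ]:

# hypothetical alternative: count multi-mapping reads fractionally (-M --fraction); not used here
get_ipython().system(' {featureCounts} -T 5 -s 1 -t gene -g ID -M --fraction -a {workingdir}references/Olurida_v081-20190709.gene.2kbdown.gff -o {workingdir}qc-processing/featureCounts/featurecounts-gene2kbdown-multimapped.txt {workingdir}qc-processing/bowtie/deduplicated/*.bam')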
# In[8]:

# create path variable for the featureCounts program
featureCounts = "/Applications/bioinformatics/subread/bin/featureCounts"

# In[9]:

# test path and path variable
get_ipython().system(' {featureCounts} -v')

# In[125]:

# create directory for featureCounts output
get_ipython().system(' mkdir {workingdir}qc-processing/featureCounts/')

# ## Generate gene annotation file with +2kb on 3' end
#
# In [this issue](https://github.com/z0on/tag-based_RNAseq/issues/3) Mikhail Matz recommended the following when using featureCounts: "adjust your genome’s GFF file extend gene regions 1-2kb towards 3’ ; otherwise gene annotations are often missing the non-coding 3’regions where our reads are mapping". featureCounts uses GFF/GTF files as input, so I need to generate a GFF file that includes the expanded gene regions.
#
# ### Use bedtools `slop` to extend gene regions by 2kb on the 3' end
#
# I want to count reads that map within genes, but also up to 2kb downstream (i.e. off the 3' ends) of gene regions. Therefore, before counting, I need to create a gene region + 2kb file. I will do this using `slopBed`.
#
# First, generate a "genome file", which defines the size of each chromosome. This is so the `slop` function restricts results to within a contig. I can use the indexed FASTA file that I created for a blast.

# In[12]:

cd {workingdir}

# In[13]:

# Extract column 1 (contig name) and column 2 (# bases in the contig)
get_ipython().system(' cut -f 1,2 references/Olurida_v081.fa.fai > references/Olurida_chrom.sizes')
get_ipython().system(' ls references/')

# In[14]:

# check out format of resulting chromosome size file
get_ipython().system(' head -n 2 references/Olurida_chrom.sizes')

# In[129]:

# Run slopBed with the gene feature file to create gene+2kb regions
get_ipython().system(' slopBed -i {workingdir}references/Olurida_v081-20190709.gene.gff -g {workingdir}references/Olurida_chrom.sizes -r 2000 -l 0 > {workingdir}references/Olurida_v081-20190709.gene.2kbdown.gff')

# In[17]:

### Check out the first couple of genes before extension
get_ipython().system(' head -n 5 {workingdir}references/Olurida_v081-20190709.gene.gff')

# In[19]:

### Check out the first couple of genes + 2kb
get_ipython().system(' head -n 4 {workingdir}references/Olurida_v081-20190709.gene.2kbdown.gff')

# ## Generate counts using `featureCounts`
#
# Using the gene GFF that HAS been extended by 2kb past the 3' end.
#
# featureCounts options:
# `-T 5` - use 5 threads
# `-s 1` - these data are "forward" stranded
# `-t exon`
# `-g gene_id`
# `-a [annotation file]`
# `-o [output file]`
#
# featureCounts can also take into account whether your data are stranded or not. If strandedness is specified, then in addition to considering the genomic coordinates it will also take the strand into account for counting. If your data are stranded, always specify it.
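# Before generating counts, a quick sanity check that the `slopBed` step above extended gene spans by up to 2 kb (less where a gene sits near the end of its contig). This is a minimal sketch (not part of the original run); it assumes both GFFs keep genes in the same order (slopBed preserves input order) and a standard 9-column GFF layout.

# In[ ]:

import pandas as pd

gff_cols = ["seqid", "source", "type", "start", "end", "score", "strand", "phase", "attributes"]

# load the original and the 3'-extended gene GFFs
genes = pd.read_csv(workingdir + "references/Olurida_v081-20190709.gene.gff",
                    sep="\t", comment="#", header=None, names=gff_cols)
genes2kb = pd.read_csv(workingdir + "references/Olurida_v081-20190709.gene.2kbdown.gff",
                       sep="\t", comment="#", header=None, names=gff_cols)

# per-gene increase in span; expected to be 2000 except where a contig end truncates the extension
extension = (genes2kb["end"] - genes2kb["start"]) - (genes["end"] - genes["start"])
extension.describe()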
# In[20]:

# 1) gene GFF with 2kb 3' extension
get_ipython().system(' {featureCounts} -T 5 -s 1 -t gene -g ID -a {workingdir}references/Olurida_v081-20190709.gene.2kbdown.gff -o {workingdir}qc-processing/featureCounts/featurecounts-gene2kbdown-deduplicated.txt {workingdir}qc-processing/bowtie/deduplicated/*.bam')

# In[21]:

ls {workingdir}qc-processing/featureCounts/

# In[22]:

get_ipython().system(' cat {workingdir}qc-processing/featureCounts/featurecounts-gene2kbdown-deduplicated.txt.summary')

# ### Clean up the featureCounts matrix
#
# The matrix includes information about the genomic coordinates and the length of each gene; we don't need this for the next step, so we will extract only the columns we are interested in.

# In[23]:

get_ipython().system(' head {workingdir}qc-processing/featureCounts/featurecounts-gene2kbdown-deduplicated.txt')

# In[26]:

# remove columns 2-6, which contain the (merged) contig coordinates, strand, and length info
get_ipython().system(' cut -f-1,7- {workingdir}qc-processing/featureCounts/featurecounts-gene2kbdown-deduplicated.txt > {workingdir}qc-processing/featureCounts/featurecounts-gene2kbdown-deduplicated.Rmatrix.txt')

# In[27]:

get_ipython().system(' head {workingdir}qc-processing/featureCounts/featurecounts-gene2kbdown-deduplicated.Rmatrix.txt')

# ### Create a .txt file that lists file names in order of run

# In[36]:

get_ipython().run_cell_magic('bash', '', '\nfor file in qc-processing/bowtie/deduplicated/*.bam\ndo\nsample="$(basename -a $file | cut -d "." -f 1)"\necho $sample >> qc-processing/featureCounts/filenames-deduplicated.txt\ndone\n')

# In[37]:

get_ipython().system(' head {workingdir}qc-processing/featureCounts/filenames-deduplicated.txt')

# In[38]:

# No. genes identified
get_ipython().system(' wc -l {workingdir}qc-processing/featureCounts/featurecounts-gene2kbdown-deduplicated.Rmatrix.txt')

# In[39]:

# No. samples
get_ipython().system(' wc -l {workingdir}qc-processing/featureCounts/filenames-deduplicated.txt')

# ## RESULT: the Bowtie --> featureCounts pipeline resulted in counts for 32,212 genes
#
# Counts can now be read into R and manipulated there. Location of the resulting count matrix file: {workingdir}qc-processing/featureCounts/featurecounts-gene2kbdown-deduplicated.Rmatrix.txt
#
# I pulled the count dataframe into R and generated some initial summary stats:
#
# - Number of sample files (including "Undetermined"): 147
# - Total number of genes in dataframe: 110,784

# In[ ]:
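# For reference, a rough Python equivalent of those initial checks on the cleaned-up count matrix (a minimal sketch; in practice this was done in R). It assumes the Rmatrix file produced above, in which the first column is the gene ID and each remaining column is one sample.

# In[ ]:

import pandas as pd

# featureCounts writes a '#' comment line (the command used) above the header row; skip it
counts = pd.read_csv(workingdir + "qc-processing/featureCounts/featurecounts-gene2kbdown-deduplicated.Rmatrix.txt",
                     sep="\t", comment="#")

print("Number of sample files:", counts.shape[1] - 1)  # first column is the gene ID
print("Number of genes:", counts.shape[0])

# total assigned counts per sample (first few samples)
print(counts.iloc[:, 1:].sum().head())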