#!/usr/bin/env python # coding: utf-8 # # An introductory TRGT/TRVZ tutorial # In this tutorial, you will learn the basics of TRGT and TRVZ by analyzing a tiny example dataset included in this repository. # ## Overview of the input files # Our example dataset consists of a reference genome (`reference.fasta`), a file with aligned reads (`sample.bam`), and the repeat definition file (`repeat.bed`) all stored in the `input/` directory. # In[1]: genome = "input/reference.fasta" repeats = "input/repeat.bed" reads = "input/sample.bam" # The repeat definitions are stored in a BED file that, among other information, contains repeat coordinates, repeat identifiers, and motifs: # In[2]: get_ipython().system(' cat {repeats}') # ## Genotype repeats with TRGT # We can genotype the repeat by runing TRGT as shown below. Note that TRGT also has a `--karyotype` parameter that is used to specify sex of the input sample. # In[ ]: get_ipython().system(' trgt --genome {genome} --reads {reads} --repeats {repeats} --output-prefix output/sample') # The output consists of files `sample.vcf.gz` and `sample.spanning.bam`. The VCF file contains repeat genotypes while the BAM file contains pieces of HiFi reads that fully span the repeat sequences. # For example, here is the first entry of the VCF file: # In[3]: get_ipython().system(' bcftools view --no-header output/sample.vcf.gz | head -n 1') # It says that: # - There is a tandem repeat starting at position 10001 of chrA # - The reference sequence of this repeat is CCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAG # - This repeat has two alleles spanning 16 CAG motifs each # ## Sort and index the outputs # TRGT outputs are not sorted. So we first sort and index the VCF: # In[ ]: get_ipython().system(' bcftools sort -Ob -o output/sample.sorted.vcf.gz output/sample.vcf.gz') get_ipython().system(' bcftools index output/sample.sorted.vcf.gz') # and then the BAM: # In[ ]: get_ipython().system(' samtools sort output/sample.spanning.bam -o output/sample.bam') get_ipython().system(' samtools index output/sample.bam') # And that's it! The output files are now ready for downstream analysis. # ## Visualize a repeat with TRVZ # To visualize the repeat with the identifier "TR1", run: # In[ ]: get_ipython().system(' trvz --genome {genome} --repeats {repeats} --vcf output/sample.sorted.vcf.gz --spanning-reads output/sample.bam --repeat-id TR1 --image output/TR1.png') # In[4]: from IPython.display import Image Image(filename="output/TR1.png") # In[ ]: