Use CPC2 Standalone to idenitfy P.generosa lncRNAs with no coding potential from the Panopea-generosa-v1.0 annotated GTF generated by gffcompare.
Notebook uses an aribtrary minimum length cutoff of 200bp.
%%bash
echo "TODAY'S DATE:"
date
echo "------------"
echo ""
#Display operating system info
lsb_release -a
echo ""
echo "------------"
echo "HOSTNAME: "; hostname
echo ""
echo "------------"
echo "Computer Specs:"
echo ""
lscpu
echo ""
echo "------------"
echo ""
echo "Memory Specs"
echo ""
free -mh
TODAY'S DATE: Tue May 2 06:45:03 AM PDT 2023 ------------
No LSB modules are available.
Distributor ID: Ubuntu Description: Ubuntu 22.04.2 LTS Release: 22.04 Codename: jammy ------------ HOSTNAME: computer ------------ Computer Specs: Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Address sizes: 45 bits physical, 48 bits virtual Byte Order: Little Endian CPU(s): 4 On-line CPU(s) list: 0-3 Vendor ID: GenuineIntel Model name: Intel(R) Core(TM) i9-10885H CPU @ 2.40GHz CPU family: 6 Model: 165 Thread(s) per core: 1 Core(s) per socket: 1 Socket(s): 4 Stepping: 2 BogoMIPS: 4800.01 Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon nopl xtopology tsc_reliable nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single pti ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 avx2 smep bmi2 invpcid rdseed adx smap clflushopt xsaveopt xsavec xgetbv1 xsaves arat flush_l1d arch_capabilities Hypervisor vendor: VMware Virtualization type: full L1d cache: 128 KiB (4 instances) L1i cache: 128 KiB (4 instances) L2 cache: 1 MiB (4 instances) L3 cache: 64 MiB (4 instances) NUMA node(s): 1 NUMA node0 CPU(s): 0-3 Vulnerability Itlb multihit: KVM: Mitigation: VMX unsupported Vulnerability L1tf: Mitigation; PTE Inversion Vulnerability Mds: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown Vulnerability Meltdown: Mitigation; PTI Vulnerability Mmio stale data: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown Vulnerability Retbleed: Mitigation; IBRS Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl Vulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization Vulnerability Spectre v2: Mitigation; IBRS, IBPB conditional, RSB filling, PBRSB-eIBRS Not affected Vulnerability Srbds: Unknown: Dependent on hypervisor status Vulnerability Tsx async abort: Not affected ------------ Memory Specs total used free shared buff/cache available Mem: 54Gi 6.0Gi 35Gi 322Mi 12Gi 47Gi Swap: 2.0Gi 0B 2.0Gi
%env
indicates a bash variable
without %env
is Python variable
# Set directories
%env transcriptomes_dir=/home/sam/data/P_generosa/transcriptomes
%env genomes_dir=/home/sam/data/P_generosa/genomes
%env analysis_dir=/home/sam/analyses/20230502-pgen-lncRNA-identification
analysis_dir="20230502-pgen-lncRNA-identification"
# Set lncRNA minimum length
%env min_lncRNA_length=200
# Input files
%env gffcompare_gtf=Panopea-generosa-v1.0-gffcmp.annotated.gtf
%env genome_fasta=Panopea-generosa-v1.0.fasta
# Genome FastA URL
# https://gannet.fish.washington.edu/Atumefaciens/20191105_swoose_pgen_v074_renaming/Panopea-generosa-v1.0.fa
%env genome_url=gannet:/volume2/web/Atumefaciens/20191105_swoose_pgen_v074_renaming/Panopea-generosa-v1.0.fa
# URL of file directory
# https://gannet.fish.washington.edu/Atumefaciens/20230426-pgen-HISAT2-stringtie-gffcompare-RNAseq/gffcompare
%env gff_comp_gtf_url=gannet:/volume2/web/Atumefaciens/20230426-pgen-HISAT2-stringtie-gffcompare-RNAseq/gffcompare
# Output file(s)
%env lncRNAs_gtf=20230502-pgen-lncRNA-IDs.gtf
%env lncRNA_candidates=lncRNA_candidates.gtf
%env lncRNA_candidates_bed=lncRNA_candidates.bed
%env lncRNA_candidates_fasta=lncRNA_candidates.fasta
%env lncRNA_ids=lncRNA-IDs.txt
%env cpc2_table=cpc2_output_table
# Set program locations
%env cpc2=/home/sam/programs/CPC2_standalone-1.0.1/bin/CPC2.py
%env bedtools=/home/sam/programs/bedtools-2.29.1/bin/bedtools
# Line for formatting
%env line=-------------------------------------------------------------------------------------
env: transcriptomes_dir=/home/sam/data/P_generosa/transcriptomes env: genomes_dir=/home/sam/data/P_generosa/genomes env: analysis_dir=/home/sam/analyses/20230502-pgen-lncRNA-identification env: min_lncRNA_length=200 env: gffcompare_gtf=Panopea-generosa-v1.0-gffcmp.annotated.gtf env: genome_fasta=Panopea-generosa-v1.0.fasta env: genome_url=gannet:/volume2/web/Atumefaciens/20191105_swoose_pgen_v074_renaming/Panopea-generosa-v1.0.fa env: gff_comp_gtf_url=gannet:/volume2/web/Atumefaciens/20230426-pgen-HISAT2-stringtie-gffcompare-RNAseq/gffcompare env: lncRNAs_gtf=20230502-pgen-lncRNA-IDs.gtf env: lncRNA_candidates=lncRNA_candidates.gtf env: lncRNA_candidates_bed=lncRNA_candidates.bed env: lncRNA_candidates_fasta=lncRNA_candidates.fasta env: lncRNA_ids=lncRNA-IDs.txt env: cpc2_table=cpc2_output_table env: cpc2=/home/sam/programs/CPC2_standalone-1.0.1/bin/CPC2.py env: bedtools=/home/sam/programs/bedtools-2.29.1/bin/bedtools env: line=-------------------------------------------------------------------------------------
%%bash
# Make analysis and data directory, if doesn't exist
mkdir --parents "${analysis_dir}"
mkdir --parents "${transcriptomes_dir}"
%%bash
cd "${transcriptomes_dir}"
rsync "${gff_comp_gtf_url}/${gffcompare_gtf}" .
ls -ltrh "${gffcompare_gtf}"
-rw-rw-r-- 1 sam sam 74M Apr 28 15:44 Panopea-generosa-v1.0-gffcmp.annotated.gtf
%%bash
head ${transcriptomes_dir}/"${gffcompare_gtf}"
echo ""
echo "${line}"
echo ""
echo "Number of lines:"
wc -l ${transcriptomes_dir}/*.gtf
echo ""
echo "${line}"
echo ""
echo "Number of transcripts in ${gffcompare_gtf}:"
awk '$3 == "transcript" {print}' ${transcriptomes_dir}/"${gffcompare_gtf}" | wc -l
Scaffold_01 StringTie transcript 2 4719 . + . transcript_id "PGEN_.00g000010.m01"; gene_id "MSTRG.4"; gene_name "PGEN_.00g000010"; xloc "XLOC_000001"; ref_gene_id "PGEN_.00g000010"; cmp_ref "PGEN_.00g000010.m01"; class_code "="; tss_id "TSS1"; Scaffold_01 StringTie exon 2 125 . + . transcript_id "PGEN_.00g000010.m01"; gene_id "MSTRG.4"; exon_number "1"; Scaffold_01 StringTie exon 1995 2095 . + . transcript_id "PGEN_.00g000010.m01"; gene_id "MSTRG.4"; exon_number "2"; Scaffold_01 StringTie exon 3325 3495 . + . transcript_id "PGEN_.00g000010.m01"; gene_id "MSTRG.4"; exon_number "3"; Scaffold_01 StringTie exon 4651 4719 . + . transcript_id "PGEN_.00g000010.m01"; gene_id "MSTRG.4"; exon_number "4"; Scaffold_01 StringTie transcript 55792 67546 . + . transcript_id "PGEN_.00g000040.m01"; gene_id "MSTRG.5"; gene_name "PGEN_.00g000040"; xloc "XLOC_000002"; ref_gene_id "PGEN_.00g000040"; cmp_ref "PGEN_.00g000040.m01"; class_code "="; tss_id "TSS2"; Scaffold_01 StringTie exon 55792 55972 . + . transcript_id "PGEN_.00g000040.m01"; gene_id "MSTRG.5"; exon_number "1"; Scaffold_01 StringTie exon 58912 59276 . + . transcript_id "PGEN_.00g000040.m01"; gene_id "MSTRG.5"; exon_number "2"; Scaffold_01 StringTie exon 61676 62119 . + . transcript_id "PGEN_.00g000040.m01"; gene_id "MSTRG.5"; exon_number "3"; Scaffold_01 StringTie exon 65608 65740 . + . transcript_id "PGEN_.00g000040.m01"; gene_id "MSTRG.5"; exon_number "4"; ------------------------------------------------------------------------------------- Number of lines: 539407 /home/sam/data/P_generosa/transcriptomes/Panopea-generosa-v1.0-gffcmp.annotated.gtf ------------------------------------------------------------------------------------- Number of transcripts in Panopea-generosa-v1.0-gffcmp.annotated.gtf: 79269
%%bash
cd "${genomes_dir}"
rsync "${genome_url}" .
ls -ltrh "${genome_fasta}"
-rwxr-xr-x 1 sam sam 914M Nov 5 2019 Panopea-generosa-v1.0.fasta
%%bash
cd "${genomes_dir}"
echo "Number of sequences:"
grep "^>" -c "${genome_fasta}"
Number of sequences: 18
string class_code āuā indicates no overlap
%%bash
cd "${transcriptomes_dir}"
awk '$3 == "transcript" {print}' "${gffcompare_gtf}" | grep 'class_code "u"' \
| awk -v min_lncRNA_length="${min_lncRNA_length}" '$5 - $4 >= min_lncRNA_length {print}' \
> "${analysis_dir}/${lncRNA_candidates}"
wc -l "${analysis_dir}/${lncRNA_candidates}"
echo ""
echo "${line}"
echo ""
head "${analysis_dir}/${lncRNA_candidates}"
14076 /home/sam/analyses/20230502-pgen-lncRNA-identification/lncRNA_candidates.gtf ------------------------------------------------------------------------------------- Scaffold_01 StringTie transcript 656906 657583 . + . transcript_id "MSTRG.38.5"; gene_id "MSTRG.38"; xloc "XLOC_000013"; class_code "u"; tss_id "TSS19"; Scaffold_01 StringTie transcript 648204 649326 . + . transcript_id "MSTRG.39.1"; gene_id "MSTRG.39"; xloc "XLOC_000014"; class_code "u"; tss_id "TSS20"; Scaffold_01 StringTie transcript 849165 854552 . + . transcript_id "MSTRG.63.1"; gene_id "MSTRG.63"; xloc "XLOC_000019"; class_code "u"; tss_id "TSS25"; Scaffold_01 StringTie transcript 852049 854552 . + . transcript_id "MSTRG.63.2"; gene_id "MSTRG.63"; xloc "XLOC_000019"; class_code "u"; tss_id "TSS26"; Scaffold_01 StringTie transcript 862415 867481 . + . transcript_id "MSTRG.66.1"; gene_id "MSTRG.66"; xloc "XLOC_000020"; class_code "u"; tss_id "TSS27"; Scaffold_01 StringTie transcript 1824775 1828291 . + . transcript_id "MSTRG.109.1"; gene_id "MSTRG.109"; xloc "XLOC_000040"; class_code "u"; tss_id "TSS56"; Scaffold_01 StringTie transcript 1966694 1970033 . + . transcript_id "MSTRG.121.2"; gene_id "MSTRG.121"; xloc "XLOC_000044"; class_code "u"; tss_id "TSS62"; Scaffold_01 StringTie transcript 1966694 1970033 . + . transcript_id "MSTRG.121.1"; gene_id "MSTRG.121"; xloc "XLOC_000044"; class_code "u"; tss_id "TSS62"; Scaffold_01 StringTie transcript 2318317 2328097 . + . transcript_id "MSTRG.137.3"; gene_id "MSTRG.137"; xloc "XLOC_000053"; class_code "u"; tss_id "TSS72"; Scaffold_01 StringTie transcript 2843989 2844204 . + . transcript_id "MSTRG.169.1"; gene_id "MSTRG.169"; xloc "XLOC_000070"; class_code "u"; tss_id "TSS92";
This is needed so the subsequent bedtools getfasta
step will have a unique name for each transcript.
Otherwise, the generated names generate duplicates because they're based off of the scaffold name and the start/stop sites. As such, when there are multiple transcripts identified for the same locations, they end up with the same names.
%%bash
while read -r line
do
stringtie_transcript=$(echo "${line}" | awk -F "\"" '{print $2}')
chr=$(echo "${line}" | awk '{print $1}')
start=$(echo "${line}" | awk '{print $4}')
end=$(echo "${line}" | awk '{print $5}')
score="0"
strand=$(echo "${line}" | awk '{print $7}')
printf "%s\t%s\t%s\t%s\t%s\t%s\n" "${chr}" "${start}" "${end}" "${stringtie_transcript}" "${score}" "${strand}"
done < "${analysis_dir}/${lncRNA_candidates}" > "${analysis_dir}/${lncRNA_candidates_bed}"
%%bash
head "${analysis_dir}/${lncRNA_candidates_bed}"
echo ""
echo "${line}"
echo ""
wc -l "${analysis_dir}/${lncRNA_candidates_bed}"
Scaffold_01 656906 657583 MSTRG.38.5 0 + Scaffold_01 648204 649326 MSTRG.39.1 0 + Scaffold_01 849165 854552 MSTRG.63.1 0 + Scaffold_01 852049 854552 MSTRG.63.2 0 + Scaffold_01 862415 867481 MSTRG.66.1 0 + Scaffold_01 1824775 1828291 MSTRG.109.1 0 + Scaffold_01 1966694 1970033 MSTRG.121.2 0 + Scaffold_01 1966694 1970033 MSTRG.121.1 0 + Scaffold_01 2318317 2328097 MSTRG.137.3 0 + Scaffold_01 2843989 2844204 MSTRG.169.1 0 + ------------------------------------------------------------------------------------- 14076 /home/sam/analyses/20230502-pgen-lncRNA-identification/lncRNA_candidates.bed
Use bedtools
to extract lncRNA sequences as FastA.
-name
option to use FastA sequence ID and coordinates as the names of the output sequences%%bash
${bedtools} getfasta -fi "${genomes_dir}/${genome_fasta}" \
-bed "${analysis_dir}/${lncRNA_candidates_bed}" \
-fo "${analysis_dir}"/"${lncRNA_candidates_fasta}" \
-name
Number of sequences should match number of transcripts from above (14076
)
%%bash
echo "Number of sequences:"
grep -c "^>" "${analysis_dir}"/"${lncRNA_candidates_fasta}"
echo ""
echo "${line}"
echo ""
echo "Definition lines formatting:"
grep "^>" "${analysis_dir}"/"${lncRNA_candidates_fasta}" | head
Number of sequences: 14076 ------------------------------------------------------------------------------------- Definition lines formatting: >MSTRG.38.5::Scaffold_01:656906-657583 >MSTRG.39.1::Scaffold_01:648204-649326 >MSTRG.63.1::Scaffold_01:849165-854552 >MSTRG.63.2::Scaffold_01:852049-854552 >MSTRG.66.1::Scaffold_01:862415-867481 >MSTRG.109.1::Scaffold_01:1824775-1828291 >MSTRG.121.2::Scaffold_01:1966694-1970033 >MSTRG.121.1::Scaffold_01:1966694-1970033 >MSTRG.137.3::Scaffold_01:2318317-2328097 >MSTRG.169.1::Scaffold_01:2843989-2844204
%%bash
"${cpc2}" \
-i "${analysis_dir}"/"${lncRNA_candidates_fasta}" \
-o "${analysis_dir}/${cpc2_table}"
[INFO] read file '/home/sam/analyses/20230502-pgen-lncRNA-identification/lncRNA_candidates.fasta' [INFO] Predicting coding potential, please wait ... [INFO] Running Done! [INFO] cost time: 10s
%%bash
head "${analysis_dir}/${cpc2_table}.txt"
echo ""
echo "${line}"
echo ""
echo "Number of entries:"
# Skips header line
tail -n +2 "${analysis_dir}/${cpc2_table}.txt" | wc -l
# Count number of columns
echo ""
echo "${line}"
echo ""
echo "Number of columns in ${cpc2_table}.txt:"
awk '{print NF}' "${analysis_dir}/${cpc2_table}.txt" | sort --unique
#ID transcript_length peptide_length Fickett_score pI ORF_integrity coding_probability label MSTRG.38.5::Scaffold_01:656906-657583 677 72 0.30393 8.63104343414307 1 0.0370878 noncoding MSTRG.39.1::Scaffold_01:648204-649326 1122 48 0.32127 10.358344841003415 1 0.0232355 noncoding MSTRG.63.1::Scaffold_01:849165-854552 5387 64 0.30411 7.6276449203491214 1 0.0265552 noncoding MSTRG.63.2::Scaffold_01:852049-854552 2503 54 0.32878 6.805923652648925 1 0.0188306 noncoding MSTRG.66.1::Scaffold_01:862415-867481 5066 98 0.26407 9.644486427307129 1 0.0769712 noncoding MSTRG.109.1::Scaffold_01:1824775-1828291 3516 69 0.22328 10.963251686096193 1 0.0907481 noncoding MSTRG.121.2::Scaffold_01:1966694-1970033 3339 51 0.23537 9.184117698669432 1 0.0353791 noncoding MSTRG.121.1::Scaffold_01:1966694-1970033 3339 51 0.23537 9.184117698669432 1 0.0353791 noncoding MSTRG.137.3::Scaffold_01:2318317-2328097 9780 85 0.23588 8.743476295471194 1 0.0716077 noncoding ------------------------------------------------------------------------------------- Number of entries: 14076 ------------------------------------------------------------------------------------- Number of columns in cpc2_output_table.txt: 8
The label
column, which is column 8 ($8
), will be used to pull out all lncRNA IDs.
%%bash
awk '$8 == "noncoding" {print $1}' "${analysis_dir}/${cpc2_table}.txt" |
awk -F":" '{print $1}' \
> "${analysis_dir}/${lncRNA_ids}"
wc -l "${analysis_dir}/${lncRNA_ids}"
echo ""
head "${analysis_dir}/${lncRNA_ids}"
13606 /home/sam/analyses/20230502-pgen-lncRNA-identification/lncRNA-IDs.txt MSTRG.38.5 MSTRG.39.1 MSTRG.63.1 MSTRG.63.2 MSTRG.66.1 MSTRG.109.1 MSTRG.121.2 MSTRG.121.1 MSTRG.137.3 MSTRG.169.1
%%bash
grep --fixed-strings --file="${analysis_dir}/${lncRNA_ids}" "${analysis_dir}/${lncRNA_candidates}" \
> "${analysis_dir}/${lncRNAs_gtf}"
wc -l "${analysis_dir}/${lncRNAs_gtf}"
13606 /home/sam/analyses/20230502-pgen-lncRNA-identification/20230502-pgen-lncRNA-IDs.gtf
%%bash
head "${analysis_dir}/${lncRNAs_gtf}"
Scaffold_01 StringTie transcript 656906 657583 . + . transcript_id "MSTRG.38.5"; gene_id "MSTRG.38"; xloc "XLOC_000013"; class_code "u"; tss_id "TSS19"; Scaffold_01 StringTie transcript 648204 649326 . + . transcript_id "MSTRG.39.1"; gene_id "MSTRG.39"; xloc "XLOC_000014"; class_code "u"; tss_id "TSS20"; Scaffold_01 StringTie transcript 849165 854552 . + . transcript_id "MSTRG.63.1"; gene_id "MSTRG.63"; xloc "XLOC_000019"; class_code "u"; tss_id "TSS25"; Scaffold_01 StringTie transcript 852049 854552 . + . transcript_id "MSTRG.63.2"; gene_id "MSTRG.63"; xloc "XLOC_000019"; class_code "u"; tss_id "TSS26"; Scaffold_01 StringTie transcript 862415 867481 . + . transcript_id "MSTRG.66.1"; gene_id "MSTRG.66"; xloc "XLOC_000020"; class_code "u"; tss_id "TSS27"; Scaffold_01 StringTie transcript 1824775 1828291 . + . transcript_id "MSTRG.109.1"; gene_id "MSTRG.109"; xloc "XLOC_000040"; class_code "u"; tss_id "TSS56"; Scaffold_01 StringTie transcript 1966694 1970033 . + . transcript_id "MSTRG.121.2"; gene_id "MSTRG.121"; xloc "XLOC_000044"; class_code "u"; tss_id "TSS62"; Scaffold_01 StringTie transcript 1966694 1970033 . + . transcript_id "MSTRG.121.1"; gene_id "MSTRG.121"; xloc "XLOC_000044"; class_code "u"; tss_id "TSS62"; Scaffold_01 StringTie transcript 2318317 2328097 . + . transcript_id "MSTRG.137.3"; gene_id "MSTRG.137"; xloc "XLOC_000053"; class_code "u"; tss_id "TSS72"; Scaffold_01 StringTie transcript 2843989 2844204 . + . transcript_id "MSTRG.169.1"; gene_id "MSTRG.169"; xloc "XLOC_000070"; class_code "u"; tss_id "TSS92";
%%bash
cd "${analysis_dir}"
for file in *
do
md5sum "${file}" | tee --append checksums.md5
done
9adb7efc18fe1bfedcad24c86da1161f 20230502-pgen-lncRNA-IDs.gtf 4978cedf268a87715a9b9b41d94461e3 cpc2_output_table.txt 59c3bb6819262856eb1a154de115f172 lncRNA_candidates.bed 712c72cb22be748babc44bff9fc5704a lncRNA_candidates.fasta a01efb7f2322802da94d4c038712230a lncRNA_candidates.gtf 6bd15d3625538c2c1d47f7219deddeed lncRNA-IDs.txt
%%bash
${cpc2} -h
Usage: CPC2.py [options] -i input.fasta -o output_file Contact: Kang Yujian <kangyj@mail.cbi.pku.edu.cn> Options: --version show program's version number and exit -h, --help show this help message and exit Common Options: -i FILE input sequence in fasta format [Required] -o FILE output file [Default: cpc2output.txt] -r also check the reverse strand [Default: FALSE] --ORF output the start position of longest ORF [Default: FALSE]