See this GitHub Issue
This notebook relies on GFFutils being installed and available in your `$PATH`.
%%bash
# Record when and where this notebook was run: date, OS release,
# hostname, CPU details and memory.
printf '%s\n' "TODAY'S DATE:"
date
printf '%s\n' "------------" ""

# Operating system release information
lsb_release -a
printf '%s\n' "" "------------"

printf '%s\n' "HOSTNAME: "
hostname
printf '%s\n' "" "------------" "Computer Specs:" ""

# CPU details
lscpu
printf '%s\n' "" "------------" "" "Memory Specs" ""

# Memory and swap usage
free -mh
TODAY'S DATE: Tue 01 Jun 2021 07:37:57 AM PDT ------------ Distributor ID: Ubuntu Description: Ubuntu 20.04.2 LTS Release: 20.04 Codename: focal ------------ HOSTNAME: computer ------------ Computer Specs: Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian Address sizes: 45 bits physical, 48 bits virtual CPU(s): 8 On-line CPU(s) list: 0-7 Thread(s) per core: 1 Core(s) per socket: 1 Socket(s): 8 NUMA node(s): 1 Vendor ID: GenuineIntel CPU family: 6 Model: 165 Model name: Intel(R) Core(TM) i9-10885H CPU @ 2.40GHz Stepping: 2 CPU MHz: 2400.000 BogoMIPS: 4800.00 Hypervisor vendor: VMware Virtualization type: full L1d cache: 256 KiB L1i cache: 256 KiB L2 cache: 2 MiB L3 cache: 128 MiB NUMA node0 CPU(s): 0-7 Vulnerability Itlb multihit: KVM: Mitigation: VMX unsupported Vulnerability L1tf: Not affected Vulnerability Mds: Not affected Vulnerability Meltdown: Not affected Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp Vulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization Vulnerability Spectre v2: Mitigation; Enhanced IBRS, IBPB conditional, RSB filling Vulnerability Srbds: Not affected Vulnerability Tsx async abort: Not affected Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon nopl xtopology tsc_reliable nonstop_tsc cpuid pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch cpuid_fault invpcid_single ssbd ibrs ibpb stibp ibrs_enhanced fsgsbase tsc_adjust bmi1 avx2 smep bmi2 invpcid rdseed adx smap clflushopt xsaveopt xsavec xgetbv1 xsaves arat pku ospke md_clear flush_l1d arch_capabilities ------------ Memory Specs total used free shared buff/cache available Mem: 53Gi 3.6Gi 45Gi 428Mi 4.1Gi 48Gi Swap: 2.0Gi 0B 2.0Gi
No LSB modules are available.
`%env` indicates a bash (environment) variable; a name assigned without `%env` is a Python variable.
# Set directories, input/output files
# Notebook-wide settings. Each %env line sets an environment variable that
# the %%bash cells read as ${name}. Keep comments on their own lines:
# NOTE(review): text after a %env value is presumably taken as part of the
# value - confirm before adding trailing comments.

# Directory the original GFF is downloaded into and read from
%env data_dir=/home/samb/data/S_salar/genomes
# Directory that receives every output file written by this notebook
%env analysis_dir=/home/samb/analyses/20210601_ssal_gff-annotations
# Same path as a plain Python variable.
# NOTE(review): no visible cell reads this Python variable - confirm it is still needed.
analysis_dir="/home/samb/analyses/20210601_ssal_gff-annotations"
# Input GFF: file name, and the URL of the directory it is downloaded from
# (the download cell joins them as ${orig_gff_url}/${orig_gff})
%env orig_gff=GCF_000233375.1_ICSASG_v2_genomic.gff
%env orig_gff_url=https://gannet.fish.washington.edu/metacarcinus/Salmo_Calig/GENOMES/v2/RefSeq
# UniProt batch output (stdout of uniprot_mapping.pl)
%env perl_output=20210601_ssal_uniprot_batch_results.txt
# GTF extractor output (comma-separated: chrom,start,end,NCBI gene ID)
%env gtf_extractor_output=20210601_ssal_chrom-start-end-Dbxref.csv
# Gene name list for UniProt batch submission (one NCBI gene ID per line)
%env gene_list=20210601_ssal_gene-list.txt
# Parsed UniProt (comma-separated: accession,gene ID,gene,description,GO IDs)
%env parsed_uniprot=20210601_ssal_accession-gene_id-gene-gene_description-go_ids.csv
# Final output (join of the gene table and the parsed UniProt table;
# written tab-separated by the join cell despite the .csv extension)
%env joined_output=20210601_ssal_chrom-gene_id_start-end-acc-gene-gene_description-go_ids.csv
env: data_dir=/home/samb/data/S_salar/genomes env: analysis_dir=/home/samb/analyses/20210601_ssal_gff-annotations env: orig_gff=GCF_000233375.1_ICSASG_v2_genomic.gff env: orig_gff_url=https://gannet.fish.washington.edu/metacarcinus/Salmo_Calig/GENOMES/v2/RefSeq env: perl_output=20210601_ssal_uniprot_batch_results.txt env: gtf_extractor_output=20210601_ssal_chrom-start-end-Dbxref.csv env: gene_list=20210601_ssal_gene-list.txt env: parsed_uniprot=20210601_ssal_accession-gene_id-gene-gene_description-go_ids.csv env: joined_output=20210601_ssal_chrom-gene_id_start-end-acc-gene-gene_description-go_ids.csv
%%bash
# Download the original GFF into the genome data directory.
# Stop if the directory is unavailable; otherwise wget would silently save
# the file into whatever directory the cell happens to start in.
cd "${data_dir}" || exit 1

# Download with wget.
# --no-check-certificate: avoids issues with the Gannet certificate
# --quiet: prevents wget output from printing too many lines to the notebook
# The URL is quoted so it is always passed to wget as a single argument.
wget --no-check-certificate --quiet "${orig_gff_url}/${orig_gff}"

# Confirm the file is present and show its size
ls -ltrh "${orig_gff}"
-rw-rw-r-- 1 samb samb 828M Sep 30 2020 GCF_000233375.1_ICSASG_v2_genomic.gff
%%bash
# Preview the GFF header and the first few feature rows
head -n 20 "${data_dir}/${orig_gff}"
##gff-version 3 #!gff-spec-version 1.21 #!processor NCBI annotwriter #!genome-build ICSASG_v2 #!genome-build-accession NCBI_Assembly:GCF_000233375.1 #!annotation-source NCBI Salmo salar Annotation Release 100 ##sequence-region NC_027300.1 1 159038749 ##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=8030 NC_027300.1 RefSeq region 1 159038749 . + . ID=NC_027300.1:1..159038749;Dbxref=taxon:8030;Name=ssa01;breed=double haploid;chromosome=ssa01;dev-stage=adult;gbkey=Src;genome=chromosome;isolate=Sally;mol_type=genomic DNA;sex=female;tissue-type=muscle NC_027300.1 Gnomon gene 5501 62139 . - . ID=gene-LOC106560212;Dbxref=GeneID:106560212;Name=LOC106560212;gbkey=Gene;gene=LOC106560212;gene_biotype=protein_coding NC_027300.1 Gnomon mRNA 5501 62139 . - . ID=rna-XM_014160784.1;Parent=gene-LOC106560212;Dbxref=GeneID:106560212,Genbank:XM_014160784.1;Name=XM_014160784.1;gbkey=mRNA;gene=LOC106560212;model_evidence=Supporting evidence includes similarity to: 99%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 8 samples with support for all annotated introns;product=fibroblast growth factor receptor 3-like;transcript_id=XM_014160784.1 NC_027300.1 Gnomon exon 61647 62139 . - . ID=exon-XM_014160784.1-1;Parent=rna-XM_014160784.1;Dbxref=GeneID:106560212,Genbank:XM_014160784.1;gbkey=mRNA;gene=LOC106560212;product=fibroblast growth factor receptor 3-like;transcript_id=XM_014160784.1 NC_027300.1 Gnomon exon 43486 43714 . - . ID=exon-XM_014160784.1-2;Parent=rna-XM_014160784.1;Dbxref=GeneID:106560212,Genbank:XM_014160784.1;gbkey=mRNA;gene=LOC106560212;product=fibroblast growth factor receptor 3-like;transcript_id=XM_014160784.1 NC_027300.1 Gnomon exon 23978 24241 . - . ID=exon-XM_014160784.1-3;Parent=rna-XM_014160784.1;Dbxref=GeneID:106560212,Genbank:XM_014160784.1;gbkey=mRNA;gene=LOC106560212;product=fibroblast growth factor receptor 3-like;transcript_id=XM_014160784.1 NC_027300.1 Gnomon exon 16966 17019 . - . 
ID=exon-XM_014160784.1-4;Parent=rna-XM_014160784.1;Dbxref=GeneID:106560212,Genbank:XM_014160784.1;gbkey=mRNA;gene=LOC106560212;product=fibroblast growth factor receptor 3-like;transcript_id=XM_014160784.1 NC_027300.1 Gnomon exon 5501 5691 . - . ID=exon-XM_014160784.1-5;Parent=rna-XM_014160784.1;Dbxref=GeneID:106560212,Genbank:XM_014160784.1;gbkey=mRNA;gene=LOC106560212;product=fibroblast growth factor receptor 3-like;transcript_id=XM_014160784.1 NC_027300.1 Gnomon CDS 43486 43633 . - 0 ID=cds-XP_014016259.1;Parent=rna-XM_014160784.1;Dbxref=GeneID:106560212,Genbank:XP_014016259.1;Name=XP_014016259.1;gbkey=CDS;gene=LOC106560212;product=fibroblast growth factor receptor 3-like;protein_id=XP_014016259.1 NC_027300.1 Gnomon CDS 23978 24241 . - 2 ID=cds-XP_014016259.1;Parent=rna-XM_014160784.1;Dbxref=GeneID:106560212,Genbank:XP_014016259.1;Name=XP_014016259.1;gbkey=CDS;gene=LOC106560212;product=fibroblast growth factor receptor 3-like;protein_id=XP_014016259.1 NC_027300.1 Gnomon CDS 16966 17019 . - 2 ID=cds-XP_014016259.1;Parent=rna-XM_014160784.1;Dbxref=GeneID:106560212,Genbank:XP_014016259.1;Name=XP_014016259.1;gbkey=CDS;gene=LOC106560212;product=fibroblast growth factor receptor 3-like;protein_id=XP_014016259.1 NC_027300.1 Gnomon CDS 5501 5691 . - 2 ID=cds-XP_014016259.1;Parent=rna-XM_014160784.1;Dbxref=GeneID:106560212,Genbank:XP_014016259.1;Name=XP_014016259.1;gbkey=CDS;gene=LOC106560212;product=fibroblast growth factor receptor 3-like;protein_id=XP_014016259.1
%%bash
# Extract just gene features from the GFF and write one comma-separated row
# per gene: chrom,start,end,NCBI gene ID.

#######################################
# Convert tab-separated gtf_extract rows (chrom, start, end, Dbxref) to CSV.
# Only the numeric ID of the GeneID entry is kept from the Dbxref column, so
# a Dbxref holding several comma-separated entries cannot add extra columns.
# Inputs:  tab-separated rows on stdin
# Outputs: comma-separated rows on stdout
#######################################
format_gene_rows() {
  awk '
    BEGIN { FS = "\t"; OFS = "," }
    {
      id = $4
      if (match(id, /GeneID:[0-9]+/)) {
        # Skip the 7-character "GeneID:" prefix and keep the digits
        id = substr(id, RSTART + 7, RLENGTH - 7)
      } else if (sub(/^[^:]*:/, "", id)) {
        # No GeneID entry: keep the value of the first entry only
        sub(/[,:].*$/, "", id)
      } else {
        id = ""
      }
      print $1, $2, $3, id
    }
  '
}

# --feature gene: extract just gene features
# --fields: chromosome name, start, end, and Dbxref
#           (Dbxref is the NCBI gene ID, in this particular instance)
# --gff: specify input as GFF
# Paths are quoted so they survive spaces or glob characters.
time \
gtf_extract \
  --feature gene \
  --fields=chrom,start,end,Dbxref \
  --gff "${data_dir}/${orig_gff}" \
  | format_gene_rows \
  > "${analysis_dir}/${gtf_extractor_output}"
real 2m26.807s user 2m26.279s sys 0m0.468s
%%bash
# Show the size of the gene table and preview its first rows.
cd "${analysis_dir}" || exit 1

# The file name is quoted: if the variable were empty, an unquoted expansion
# would make ls list the whole directory and head wait on stdin.
ls -ltrh "${gtf_extractor_output}"
echo ""
head "${gtf_extractor_output}"
-rw-rw-r-- 1 samb samb 2.9M Jun 1 09:49 20210601_ssal_chrom-start-end-Dbxref.csv NC_027300.1,5501,62139,106560212 NC_027300.1,160437,198815,106607996 NC_027300.1,228330,231471,106601976 NC_027300.1,296031,297111,106560213 NC_027300.1,306942,310878,106566220 NC_027300.1,331369,346454,106571988 NC_027300.1,355675,362950,106578259 NC_027300.1,401623,416794,106583877 NC_027300.1,431662,432555,106589664 NC_027300.1,449112,490663,106596642
%%bash
# Cross-check the number of gene features reported by GFFutils against a
# direct count of the GFF rows.

#######################################
# Count rows whose third tab-separated column is exactly "gene".
# Splitting on tabs (not any whitespace) keeps column 3 in place even when
# an earlier column contains a space.
# Arguments: $1 - path to a GFF file
# Outputs:   the count on stdout (0 for a file with no gene rows)
#######################################
count_gene_features() {
  awk -F'\t' '$3 == "gene" { n++ } END { print n + 0 }' "$1"
}

# Count gene features via GFFutils
echo "GFFutils number of extracted genes:"
gtf_extract -f gene --fields=Dbxref --gff "${data_dir}/${orig_gff}" | wc -l
echo ""
# Count gene features via awk
echo "awk number of extracted genes:"
count_gene_features "${data_dir}/${orig_gff}"
GFFutils number of extracted genes: 79030 awk number of extracted genes: 79030
%%bash
# Build the gene list for the UniProt batch submission: the fourth
# comma-separated column of the gene table, one value per line.
cd "${analysis_dir}"
awk 'BEGIN { FS = "," } { print $4 }' "${gtf_extractor_output}" > "${gene_list}"
%%bash
# Preview the first ten entries of the gene list
cd "${analysis_dir}"
head -n 10 "${gene_list}"
106560212 106607996 106601976 106560213 106566220 106571988 106578259 106583877 106589664 106596642
Perl script obtained from UniProt: https://www.uniprot.org/help/api_batch_retrieval
Modified to map NCBI gene ID to UniProt accession.
%%bash
# Display the UniProt mapping script so its contents are recorded in the notebook
mapping_script="/home/samb/programs/uniprot_mapping.pl"
cat "${mapping_script}"
use strict; use warnings; use LWP::UserAgent; my $list = $ARGV[0]; # File containg list of UniProt identifiers. my $base = 'https://www.uniprot.org'; my $tool = 'uploadlists'; my $contact = 'samwhite@uw.edu'; # Please set a contact email address here to help us debug in case of problems (see https://www.uniprot.org/help/privacy). my $agent = LWP::UserAgent->new(agent => "libwww-perl $contact"); push @{$agent->requests_redirectable}, 'POST'; my $response = $agent->post("$base/$tool/", [ 'file' => [$list], 'format' => 'txt', 'from' => 'P_ENTREZGENEID', 'to' => 'ACC', ], 'Content_Type' => 'form-data'); while (my $wait = $response->header('Retry-After')) { print STDERR "Waiting ($wait)...\n"; sleep $wait; $response = $agent->get($response->base); } $response->is_success ? print $response->content : die 'Failed, got ' . $response->status_line . ' for ' . $response->request->uri . "\n";
%%bash
# Submit the gene list to UniProt with the mapping script and save the
# returned records.
cd "${analysis_dir}" || exit 1

time \
perl /home/samb/programs/uniprot_mapping.pl "${gene_list}" > "${perl_output}"
mapping_status=$?

# The script dies on a failed request; stop here so a missing or partial
# result file is not parsed by the later cells.
if (( mapping_status != 0 )); then
  printf 'uniprot_mapping.pl failed (exit status %s); %s is incomplete\n' \
    "${mapping_status}" "${perl_output}" >&2
  exit "${mapping_status}"
fi

ls -ltrh
echo ""
echo ""
echo "--------------------------------------------------"
echo ""
echo "Line count:"
wc -l "${perl_output}"
total 354M -rw-rw-r-- 1 samb samb 2.9M Jun 1 09:49 20210601_ssal_chrom-start-end-Dbxref.csv -rw-rw-r-- 1 samb samb 772K Jun 1 09:58 20210601_ssal_gene-list.txt -rw-rw-r-- 1 samb samb 350M Jun 1 10:04 20210601_ssal_uniprot_batch_results.txt -------------------------------------------------- Line count: 7273462 20210601_ssal_uniprot_batch_results.txt
real 3m32.207s user 0m3.099s sys 0m9.446s
%%bash
# Preview the beginning of the UniProt batch results
cd "${analysis_dir}"
preview_lines=30
head -n "${preview_lines}" "${perl_output}"
ID A0A1S3NLH1_SALSA Unreviewed; 218 AA. AC A0A1S3NLH1; DT 12-APR-2017, integrated into UniProtKB/TrEMBL. DT 12-APR-2017, sequence version 1. DT 07-APR-2021, entry version 14. DE SubName: Full=fibroblast growth factor receptor 3-like {ECO:0000313|RefSeq:XP_014016259.1}; GN Name=LOC106560212 {ECO:0000313|RefSeq:XP_014016259.1}; OS Salmo salar (Atlantic salmon). OC Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; OC Actinopterygii; Neopterygii; Teleostei; Protacanthopterygii; Salmoniformes; OC Salmonidae; Salmoninae; Salmo. OX NCBI_TaxID=8030 {ECO:0000313|Proteomes:UP000087266, ECO:0000313|RefSeq:XP_014016259.1}; RN [1] {ECO:0000313|RefSeq:XP_014016259.1} RP IDENTIFICATION. RC TISSUE=Muscle {ECO:0000313|RefSeq:XP_014016259.1}; RG RefSeq; RL Submitted (OCT-2020) to UniProtKB. CC --------------------------------------------------------------------------- CC Copyrighted by the UniProt Consortium, see https://www.uniprot.org/terms CC Distributed under the Creative Commons Attribution (CC BY 4.0) License CC --------------------------------------------------------------------------- DR RefSeq; XP_014016259.1; XM_014160784.1. DR GeneID; 106560212; -. DR KEGG; sasa:106560212; -. DR OrthoDB; 1497110at2759; -. DR Proteomes; UP000087266; Genome assembly. DR GO; GO:0016021; C:integral component of membrane; IEA:UniProtKB-KW. DR Gene3D; 2.60.40.10; -; 2. DR InterPro; IPR007110; Ig-like_dom. DR InterPro; IPR036179; Ig-like_dom_sf.
UniProt accession
Gene ID (NCBI gene ID)
Gene name/abbreviation
Gene description
GO terms
%%bash
# Parse the UniProt batch results into one comma-separated row per record:
#   accession,NCBI gene ID,gene name,gene description,GOID1;GOID2;GOIDn

#######################################
# Parse UniProt flat-file records in a single awk pass.
#
# Every field is cleared at each "//" record terminator, so a record that
# lacks a line type gets an empty column instead of inheriting the value of
# the previous record. A single pass also avoids starting several processes
# for every input line.
#
# Fields taken from each record:
#   AC  first accession on the first AC line (trailing ";" removed)
#   DE  text after "Full=" of a SubName line (a later SubName overwrites an
#       earlier one) or, when nothing is set yet, of a RecName line;
#       evidence tag, trailing ";" and commas are removed
#   GN  text after "Name=" up to the evidence tag or ";"
#   DR  GeneID; -> NCBI gene ID (the last one in the record is kept)
#   DR  GO;     -> GO IDs, joined with ";"
#
# Arguments: $1 - path to the UniProt flat-file text
# Outputs:   comma-separated rows on stdout
#######################################
parse_uniprot_flatfile() {
  awk '
    function reset_record() {
      accession = ""
      gene_id = ""
      gene = ""
      gene_description = ""
      go_ids = ""
    }

    BEGIN { OFS = ","; reset_record() }

    # GO cross-reference: third field is the GO ID followed by ";"
    $1 == "DR" && $2 == "GO;" {
      go_id = $3
      sub(/;$/, "", go_id)
      go_ids = (go_ids == "" ? go_id : go_ids ";" go_id)
      next
    }

    # NCBI gene ID cross-reference
    $1 == "DR" && $2 == "GeneID;" {
      gene_id = $3
      sub(/;$/, "", gene_id)
      next
    }

    # UniProt accession: keep only the first one seen in the record
    $1 == "AC" && accession == "" {
      accession = $2
      sub(/;$/, "", accession)
      next
    }

    # Gene description
    $1 == "DE" && ($2 == "SubName:" || ($2 == "RecName:" && gene_description == "")) {
      description = $0
      sub(/^[^=]*=/, "", description)
      sub(/[{].*$/, "", description)
      # Commas would break the comma-separated output
      gsub(/,/, "", description)
      sub(/[ \t]*;?[ \t]*$/, "", description)
      gene_description = description
      next
    }

    # Gene name
    $1 == "GN" && $2 ~ /^Name=/ {
      name = $0
      sub(/^GN[ \t]+Name=/, "", name)
      sub(/[{;].*$/, "", name)
      sub(/[ \t]+$/, "", name)
      gene = name
      next
    }

    # End of record: print the row and start over with empty fields
    $1 == "//" {
      print accession, gene_id, gene, gene_description, go_ids
      reset_record()
    }
  ' "$1"
}

# The output file is overwritten (not appended to) so that re-running this
# cell does not duplicate rows.
time \
parse_uniprot_flatfile "${analysis_dir}/${perl_output}" \
  > "${analysis_dir}/${parsed_uniprot}"
IOPub data rate exceeded. The Jupyter server will temporarily stop sending output to the client in order to avoid crashing it. To change this limit, set the config variable `--ServerApp.iopub_data_rate_limit`. Current values: ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec) ServerApp.rate_limit_window=3.0 (secs)
Despite notebook error message, if you check the time stamps on the files below, it looks like this took nearly 6.5hrs!!
%%bash
# List the analysis directory, preview the parsed UniProt table and report
# how many rows it holds.
cd "${analysis_dir}"
ls -ltrh
printf '%s\n' ""
head "${parsed_uniprot}"
printf '%s\n' "" "" "--------------------------------------------------" "" "Line count:"
wc -l "${parsed_uniprot}"
total 362M -rw-rw-r-- 1 samb samb 2.9M Jun 1 09:49 20210601_ssal_chrom-start-end-Dbxref.csv -rw-rw-r-- 1 samb samb 772K Jun 1 09:58 20210601_ssal_gene-list.txt -rw-rw-r-- 1 samb samb 350M Jun 1 10:04 20210601_ssal_uniprot_batch_results.txt -rw-rw-r-- 1 samb samb 8.0M Jun 1 16:31 20210601_ssal_accession-gene_id-gene-gene_description-go_ids.csv A0A1S3NLH1,106560212,LOC106560212,fibroblast growth factor receptor 3-like,GO:0016021 A0A1S3SK04,106607996,LOC106607996,histone-lysine N-methyltransferase NSD2-like,GO:0005694;GO:0005634;GO:0018024;GO:0046872 A0A1S3RMY2,106601976,LOC106601976,fibroblast growth factor receptor 3-like,GO:0005524;GO:0004672 A0A1S3KV69,106560213,LOC106560213,phospholipase B1 membrane-associated-like,GO:0004620 A0A1S3LSJ1,106566220,LOC106566220,forkhead box protein I1c-like,GO:0005634;GO:0003700;GO:0043565 A0A1S3MPI8,106571988,LOC106571988,GDNF family receptor alpha-4-like,GO:0005886;GO:0038023 A0A1S3NLZ6,106578259,LOC106578259,attractin-like, A0A1S3PJR5,106583877,LOC106583877,sodium bicarbonate transporter-like protein 11,GO:0016020;GO:0005452 A0A1S3QEI4,106589664,LOC106589664,G-protein coupled receptor 4-like,GO:0016021;GO:0004930 A0A1S3QYM9,106596642,LOC106596642,sodium bicarbonate transporter-like protein 11,GO:0016021;GO:0005452 -------------------------------------------------- Line count: 82393 20210601_ssal_accession-gene_id-gene-gene_description-go_ids.csv
Line count looks reasonable, as I know that some NCBI gene IDs are associated with multiple UniProt accessions, so we should end up with more results than were submitted.
This will sort both files on the column containing the NCBI gene ID so that they can be joined.
Then, it will replace the commas with tabs and re-order the columns so that the NCBI chromosome is in the first column.
%%bash
# Join the gene table (field 4) with the parsed UniProt table (field 2) on
# the NCBI gene ID. Both inputs are sorted on their join field first.
# join prints the join field first, so awk swaps the first two columns to
# put the chromosome first, and writes the result tab-separated.
cd "${analysis_dir}"
join -t ',' -1 4 -2 2 \
  <(sort --field-separator=',' --key=4,4 "${gtf_extractor_output}") \
  <(sort --field-separator=',' --key=2,2 "${parsed_uniprot}") \
  | awk -F ',' -v OFS='\t' '{ print $2, $1, $3, $4, $5, $6, $7, $8 }' \
  > "${joined_output}"
%%bash
# List the analysis directory and report the row count of the joined table
cd "${analysis_dir}"
ls -ltrh
printf '%s\n' "" "Line count:"
wc -l "${joined_output}"
total 372M -rw-rw-r-- 1 samb samb 2.9M Jun 1 09:49 20210601_ssal_chrom-start-end-Dbxref.csv -rw-rw-r-- 1 samb samb 772K Jun 1 09:58 20210601_ssal_gene-list.txt -rw-rw-r-- 1 samb samb 350M Jun 1 10:04 20210601_ssal_uniprot_batch_results.txt -rw-rw-r-- 1 samb samb 8.0M Jun 1 16:31 20210601_ssal_accession-gene_id-gene-gene_description-go_ids.csv -rw-rw-r-- 1 samb samb 11M Jun 1 20:21 20210601_ssal_chrom-gene_id_start-end-acc-gene-gene_description-go_ids.csv Line count: 82570 20210601_ssal_chrom-gene_id_start-end-acc-gene-gene_description-go_ids.csv
Hmmm, does this line count make sense?
Submitted to UniProt: 79030
Returned from UniProt: 82393
Joined: 82570
Look at the file. Columns will be tab-separated in this order:
chromosome | NCBI gene ID | start | end | UniProt accession | gene abbreviation/name | gene description | GO IDs |
---|---|---|---|---|---|---|---|
%%bash
# Preview the first ten rows of the joined table
cd "${analysis_dir}"
head -n 10 "${joined_output}"
NC_027327.1 100135779 7914675 7922535 Q7T2G8 fsh-R neurexin-1a-like isoform X17 GO:0016323;GO:0016021;GO:0016020;GO:0004963;GO:0004996;GO:0032354;GO:0034699 NC_027321.1 100136349 24148576 24153099 Q8QHK5 dj-1 28S ribosomal protein S16 mitochondrial-like GO:0045121;GO:0005886;GO:0006914;GO:0007338 NC_027321.1 100136351 63087382 63155499 Q2V2G8 RAF1c Serine/threonine protein kinase RAF1c GO:0005524;GO:0030553;GO:0106310;GO:0106311;GO:0035556 NC_027302.1 100136352 92389729 92396132 B5X240 ACTB actin cytoplasmic 1 isoform X2 GO:0005856;GO:0097433;GO:0005925;GO:0005886 NC_027302.1 100136352 92389729 92396132 O42161 actb; caskin-1-like GO:0015629;GO:0005737;GO:0005634;GO:0005886;GO:0005524 NC_027318.1 100136353 42266727 42274733 O57560 LOC100136353 MBT domain-containing protein 1-like GO:0005524;GO:0004550;GO:0006241;GO:0006183;GO:0006228 NC_027306.1 100136354 58238908 58240915 A0A1S2WYK4 LOC100136354 LIM domain only protein 3 isoform X1 GO:0005840;GO:0003735;GO:0006412 NC_027306.1 100136354 58238908 58240915 O57561 rpl18a; LIM domain only protein 3 isoform X1 GO:0005840;GO:0003735;GO:0006412 NC_027321.1 100136355 8772597 8774381 A8YTA4 tshb thyrotropin subunit beta precursor GO:0005576;GO:0005179 NC_027321.1 100136355 8772597 8774381 O73824 tshb; mitochondrial glutamate carrier 1-like isoform X1 GO:0005576;GO:0005179
Forgot to do this before closing notebook.
%%bash
#######################################
# Write MD5 checksums for every regular file in a directory to
# checksums.md5 inside that directory, echoing them to stdout as well.
# The checksum file is rewritten on each run and is never hashed itself, so
# running this again does not add duplicate or self-referencing entries.
# Arguments: $1 - directory to checksum
# Outputs:   checksum lines on stdout
# Returns:   non-zero if $1 is not a directory
#######################################
write_checksums() {
  local target_dir=$1
  local checksum_file="checksums.md5"

  if [[ ! -d "${target_dir}" ]]; then
    printf 'write_checksums: not a directory: %s\n' "${target_dir}" >&2
    return 1
  fi

  # Subshell keeps the directory change local to this function
  (
    cd -- "${target_dir}" || exit 1
    for file in *; do
      [[ -f "${file}" ]] || continue
      [[ "${file}" == "${checksum_file}" ]] && continue
      md5sum -- "${file}"
    done | tee "${checksum_file}"
  )
}

write_checksums "${analysis_dir}"
e7d970782d7f531967dbfce01e5df549 20210601_ssal_accession-gene_id-gene-gene_description-go_ids.csv 5288ed387a6b1155cca11f25b9a9e3ca 20210601_ssal_chrom-gene_id_start-end-acc-gene-gene_description-go_ids.csv f4182e5129978328b0e9ae2b07d0bbf7 20210601_ssal_chrom-start-end-Dbxref.csv 0d330da91260189090ba2fac1ca0340f 20210601_ssal_gene-list.txt 81f63345d2f2cfbabdc8d60c3326ba66 20210601_ssal_uniprot_batch_results.txt