!pip install oceania-query-fasta
!pip install openpyxl # required to load supplementary info into pandas
Collecting oceania-query-fasta Using cached oceania_query_fasta-0.1.7-py3-none-any.whl (14 kB) Requirement already satisfied: pandas==1.* in /opt/conda/lib/python3.9/site-packages (from oceania-query-fasta) (1.2.5) Requirement already satisfied: requests==2.* in /opt/conda/lib/python3.9/site-packages (from oceania-query-fasta) (2.25.1) Collecting click==7.* Using cached click-7.1.2-py2.py3-none-any.whl (82 kB) Requirement already satisfied: python-dateutil>=2.7.3 in /opt/conda/lib/python3.9/site-packages (from pandas==1.*->oceania-query-fasta) (2.8.1) Requirement already satisfied: pytz>=2017.3 in /opt/conda/lib/python3.9/site-packages (from pandas==1.*->oceania-query-fasta) (2021.1) Requirement already satisfied: numpy>=1.16.5 in /opt/conda/lib/python3.9/site-packages (from pandas==1.*->oceania-query-fasta) (1.21.0) Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/conda/lib/python3.9/site-packages (from requests==2.*->oceania-query-fasta) (1.26.5) Requirement already satisfied: chardet<5,>=3.0.2 in /opt/conda/lib/python3.9/site-packages (from requests==2.*->oceania-query-fasta) (4.0.0) Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.9/site-packages (from requests==2.*->oceania-query-fasta) (2021.5.30) Requirement already satisfied: idna<3,>=2.5 in /opt/conda/lib/python3.9/site-packages (from requests==2.*->oceania-query-fasta) (2.10) Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.9/site-packages (from python-dateutil>=2.7.3->pandas==1.*->oceania-query-fasta) (1.16.0) Installing collected packages: click, oceania-query-fasta Attempting uninstall: click Found existing installation: click 8.0.1 Uninstalling click-8.0.1: Successfully uninstalled click-8.0.1 Successfully installed click-7.1.2 oceania-query-fasta-0.1.7 Collecting openpyxl Using cached openpyxl-3.0.7-py2.py3-none-any.whl (243 kB) Collecting et-xmlfile Using cached et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB) Installing collected packages: 
et-xmlfile, openpyxl Successfully installed et-xmlfile-1.1.0 openpyxl-3.0.7
#@title Double click to see the cell of the Python program
import pandas
# Supplementary-information workbook (Salazar et al. 2019) hosted on Zenodo.
SUPP_INFO_LINK = "https://zenodo.org/record/3539258/files/Salazar_et_al_2019_Suppl_Info.xlsx?download=1"

# Load only the "Table_W1" sheet of the workbook into a DataFrame.
table_W1 = pandas.read_excel(SUPP_INFO_LINK, sheet_name="Table_W1")

# Keep the rows whose Layer is "SRF" (surface water layer) and take the
# first of them; the result is still a DataFrame so it prints with headers.
is_surface_layer = table_W1["Layer"] == "SRF"
selected_sample = table_W1.loc[is_surface_layer].iloc[:1]
print(selected_sample)

# Extract the sample id as a scalar from the single selected row.
sample_id = selected_sample["PANGAEA sample id"].item()
PANGAEA sample id BioSamples_ID ENA_ID ENA_Run_ID MetaG/MetaT Station \ 0 TARA_Y100000004 SAMEA2619888 ERS488658 ERR594328 MetaG 34 Layer Size_fraction Size_fraction_name \ 0 SRF 0.1-0.22 Girus/Prokaryote enriched Used_in_OM-RGC.v1 (Sunagawa_et_al_2015) Used_for_OM-RGC.v2 (current work) \ 0 Used_in_OM-RGC.v1 Used_for_OM-RGC.v2 (current work) Used_for_profiling (current work) Polar \ 0 Not_used_for_profiling (current work) Non polar Sample ID (registered at the BioSamples ...) \ 0 SAMEA2619888 Sample ID (registered at the European Nu...) Date/Time \ 0 ERS488658 2010-01-20T04:27:00Z Latitude Longitude Depth, nominal OS region 0 18.3967 39.875 5 [RS] Red Sea (MRGID:4264)
/opt/conda/lib/python3.9/site-packages/openpyxl/worksheet/_reader.py:312: UserWarning: Unknown extension is not supported and will be removed warn(msg)
#@title Double click to see the cell of the Python program
from oceania import list_intergenic_regions
# Request the first page (10 entries per page) of intergenic regions for the
# selected sample.
# NOTE(review): min_length presumably filters out regions shorter than 100 --
# confirm against the oceania documentation.
intergenic_regions_metadata = list_intergenic_regions(
    sample_id,
    min_length=100,
    page=1,
    page_size=10,
)
print(intergenic_regions_metadata)
sequence start stop length \ 0 TARA_Y100000004_G_scaffold1_1 509 811 302 1 TARA_Y100000004_G_scaffold5_1 0 319 319 2 TARA_Y100000004_G_scaffold16_1 0 114 114 3 TARA_Y100000004_G_scaffold37_1 8888 9021 133 4 TARA_Y100000004_G_scaffold37_1 9311 9554 243 5 TARA_Y100000004_G_scaffold54_1 2013 2386 373 6 TARA_Y100000004_G_scaffold54_1 3939 4083 144 7 TARA_Y100000004_G_scaffold55_1 373 482 109 8 TARA_Y100000004_G_scaffold60_1 3036 3232 196 9 TARA_Y100000004_G_scaffold76_1 257 465 208 gen_before \ 0 TARA_Y100000004_G_scaffold1_1_gene1 1 TARA_Y100000004_G_scaffold4_1_gene12 2 TARA_Y100000004_G_scaffold15_1_gene28 3 TARA_Y100000004_G_scaffold37_1_gene73 4 TARA_Y100000004_G_scaffold37_1_gene74 5 TARA_Y100000004_G_scaffold54_1_gene87 6 TARA_Y100000004_G_scaffold54_1_gene91 7 TARA_Y100000004_G_scaffold55_1_gene104 8 TARA_Y100000004_G_scaffold60_1_gene120 9 TARA_Y100000004_G_scaffold76_1_gene130 gen_after 0 TARA_Y100000004_G_scaffold1_1_gene2 1 TARA_Y100000004_G_scaffold5_1_gene13 2 TARA_Y100000004_G_scaffold16_1_gene30 3 TARA_Y100000004_G_scaffold37_1_gene74 4 TARA_Y100000004_G_scaffold37_1_gene75 5 TARA_Y100000004_G_scaffold54_1_gene88 6 TARA_Y100000004_G_scaffold54_1_gene92 7 TARA_Y100000004_G_scaffold55_1_gene105 8 TARA_Y100000004_G_scaffold60_1_gene121 9 TARA_Y100000004_G_scaffold76_1_gene131
#@title Double click to see the cell of the Python program
# Keep only the columns needed to address a region: the sequence identifier
# and its start/stop coordinates.
request_regions = intergenic_regions_metadata[["sequence", "start", "stop"]].copy()

# Build one (sequence, start, stop) tuple per row.
# itertuples(index=False, name=None) yields plain tuples without the index,
# so the three fields can be unpacked by name instead of by magic position.
# NOTE(review): int() presumably turns the DataFrame's integer values into
# plain Python ints before they are sent in the request -- confirm.
request_params = [
    (sequence, int(start), int(stop))
    for sequence, start, stop in request_regions.itertuples(index=False, name=None)
]
print(request_params)
[('TARA_Y100000004_G_scaffold1_1', 509, 811), ('TARA_Y100000004_G_scaffold5_1', 0, 319), ('TARA_Y100000004_G_scaffold16_1', 0, 114), ('TARA_Y100000004_G_scaffold37_1', 8888, 9021), ('TARA_Y100000004_G_scaffold37_1', 9311, 9554), ('TARA_Y100000004_G_scaffold54_1', 2013, 2386), ('TARA_Y100000004_G_scaffold54_1', 3939, 4083), ('TARA_Y100000004_G_scaffold55_1', 373, 482), ('TARA_Y100000004_G_scaffold60_1', 3036, 3232), ('TARA_Y100000004_G_scaffold76_1', 257, 465)]
from oceania import get_sequences_from_fasta
# Fetch the sequences for every (sequence, start, stop) tuple in
# request_params from the selected sample.
request_result = get_sequences_from_fasta(sample_id, request_params)

# The result is loaded as a pandas.DataFrame, so printing it shows a table.
print(request_result)
[30-06-2021 14:09:33] Sending request for fasta sequences [30-06-2021 14:09:34] Request accepted [30-06-2021 14:09:34] Waiting for results... [30-06-2021 14:12:24] Done. Elapsed time: 170.82166524301283 seconds
id start end type \ 0 TARA_Y100000004_G_scaffold1_1 509 811 raw 1 TARA_Y100000004_G_scaffold5_1 0 319 raw 2 TARA_Y100000004_G_scaffold16_1 0 114 raw 3 TARA_Y100000004_G_scaffold37_1 8888 9021 raw 4 TARA_Y100000004_G_scaffold37_1 9311 9554 raw 5 TARA_Y100000004_G_scaffold54_1 2013 2386 raw 6 TARA_Y100000004_G_scaffold54_1 3939 4083 raw 7 TARA_Y100000004_G_scaffold55_1 373 482 raw 8 TARA_Y100000004_G_scaffold60_1 3036 3232 raw 9 TARA_Y100000004_G_scaffold76_1 257 465 raw sequence 0 ATTGTATAGAATGTAGATCTTCGTTATTGGAGATTCAATGATGTGG... 1 TCTGCTGTGCCTTGCATCCTACCTGCCACGCTGTAAGGCTGACAAG... 2 TAATTATACAGGAGGCACCTCACTACGAGCTAAACGAGGTGCAAGA... 3 TCTGTAGACCAGAATAAGAAAGGGAGCCTTCGGGCTCCCTTTTTTT... 4 ATTGTGTGTATTATACAGATATAAACAAAAAATGTCAAGCGTTAAA... 5 GCTGTTGACTATGCTTTGGCTTCTTCATCCTGAAAAGGGCGAAGGA... 6 TGCTGCCTTCGTTGAGCGTTGTAGAACGCTTTTTCTAATGCAGTCA... 7 TCATGCCACTTAAAGGAAAACAGTACAAACTAGATGTTGATGGTGA... 8 TACATCCAAATCTTATATAGATATTAAATTTATTAACATCATCTCT... 9 ATCATTTTTAATTCTATACAGATATGCTCTAGCTTTTGTTTTAGAC...