#!/usr/bin/env python
# coding: utf-8

# Let's start out by searching iDigBio for all of the records that have an
# "associatedSequences" field.

# In[2]:

import idigbio

api = idigbio.json()
seq_query = {"data.dwc:associatedSequences": {"type": "exists"}}
seq_count = api.count_records(rq=seq_query)
print(seq_count)

# It looks like only 22,161 records (out of 29,342,343 total records) have an
# entry for the "associatedSequences" field.
#
# Now let's try and grab all of the entries so that we can start checking out
# counts and formats.

# In[4]:

import csv
import sys

PAGE_SIZE = 1000
# Single source of truth for the dump file: written here, read back below.
TSV_PATH = 'idigbio_seqs_and_recordsets.tsv'
fields = ["data.dwc:associatedSequences", "recordset"]
seq_query = {"data.dwc:associatedSequences": {"type": "exists"}}

# The csv module wants a binary handle on Python 2, but a text handle opened
# with newline='' on Python 3; pick the right one so the cell runs on both.
if sys.version_info[0] >= 3:
    outfile = open(TSV_PATH, 'w', newline='', encoding='utf-8')
else:
    outfile = open(TSV_PATH, 'wb')

with outfile:
    tsvwriter = csv.writer(outfile, delimiter="\t")
    # Page through the result set PAGE_SIZE records at a time, using the
    # count obtained above as the upper bound for the offset.
    for offset in range(0, seq_count, PAGE_SIZE):
        seq_records = api.search_records(
            rq=seq_query,
            limit=PAGE_SIZE,
            offset=offset,
            fields=fields,
            fields_exclude=[],
        )
        for seq_record in seq_records['items']:
            tsvwriter.writerow([
                seq_record['uuid'],
                seq_record['indexTerms']['recordset'],
                seq_record['data']['dwc:associatedSequences'],
            ])

# In[7]:

import pandas as pd
import numpy as np

columns = ["specimen uuid", "recordset uuid", "associatedSequence field"]
idigbio_sequences = pd.read_csv(TSV_PATH, names=columns, sep="\t", index_col=0)
print(idigbio_sequences.head())

# In[8]:

import re

# One or two letters, an optional hyphen, an optional underscore, then five or
# six digits. Compiled once at module level (it is applied to every row) and
# written as a raw string so the backslashes reach the regex engine untouched.
# The pattern is unanchored, so it can also match inside a longer token.
GB_ACCESSION_PATTERN = re.compile(r"[a-zA-Z]{1,2}\-?_?\d{5,6}")


def pull_out_gb_accessions_and_count(seq_string):
    """Extract accession-like tokens from an associatedSequences value.

    Args:
        seq_string: Raw text of the "associatedSequence field" column.
            Non-string values (e.g. the NaN that pandas produces for an
            empty cell) are treated as containing no accessions.

    Returns:
        A ``(accessions, count)`` tuple. ``accessions`` is the matches joined
        with ``"|"`` in order of appearance, or ``np.nan`` when there are
        none; ``count`` is the number of matches.
    """
    try:
        matching_accessions = GB_ACCESSION_PATTERN.findall(seq_string)
    except TypeError:
        # findall() rejects non-string input such as float NaN.
        return np.nan, 0
    if not matching_accessions:
        return np.nan, 0
    return "|".join(matching_accessions), len(matching_accessions)


# In[9]:

# map() yields one (accessions, count) pair per row; zip(*...) transposes the
# pairs into two parallel sequences, one per new column.
idigbio_sequences['extracted_accessions'], idigbio_sequences['sequence_count'] = \
    zip(*idigbio_sequences['associatedSequence field'].map(pull_out_gb_accessions_and_count))

# In[10]:

print(idigbio_sequences.head())

# In[16]:

print(idigbio_sequences['sequence_count'].sum())

# In[15]:
# Render figures inline in the notebook (requires an IPython session).
get_ipython().run_line_magic('matplotlib', 'inline')

from matplotlib import pyplot as plt

plt.rcParams['figure.figsize'] = (12, 5)
plt.style.use('ggplot')

# Number of records for each per-record sequence count, ordered by the count
# itself rather than by frequency, drawn as a bar chart.
vc = idigbio_sequences['sequence_count'].value_counts().sort_index()
vc.plot(kind='bar')

# In[ ]: