#!/usr/bin/env python
# coding: utf-8

# ## XBRL US API - Python example
# This sample Python code completes a full-text search of SEC reports in the
# XBRL US Public Filings Database and returns results from 2018 to the present.
#
# **The document endpoint is a benefit of XBRL US Membership - join XBRL US
# for comprehensive access - https://xbrl.us/join.**

# ### Authenticate for access token
# XBRL US limits records returned for a query to improve efficiency; this
# script loops to collect all data from the Public Filings Database for a
# query. Run the cell below, then type your XBRL US Web account email, account
# password, Client ID, and secret (get these from
# https://xbrl.us/access-token), pressing the Enter key on the keyboard after
# each entry.

# In[ ]:

print('Enter your XBRL US Web account email: ')

import os, re, sys, json
import requests
import pandas as pd
from IPython.display import display, HTML
import numpy as np
import getpass
from datetime import datetime
import urllib
from urllib.parse import urlencode

# Credentials are read interactively; getpass hides everything but the email.
email = input()
password = getpass.getpass(prompt='Password: ')
clientid = getpass.getpass(prompt='Client ID: ')
secret = getpass.getpass(prompt='Secret: ')

# OAuth2 "password" grant body expected by the XBRL US token endpoint.
body_auth = {'username': ''.join(email),
             'client_id': ''.join(clientid),
             'client_secret': ''.join(secret),
             'password': ''.join(password),
             'grant_type': 'password',
             'platform': 'ipynb'}

payload = urlencode(body_auth)
url = 'https://api.xbrl.us/oauth2/token'
headers = {"Content-Type": "application/x-www-form-urlencoded"}

res = requests.request("POST", url, data=payload, headers=headers)
auth_json = res.json()

if 'error' in auth_json:
    # FIX: previously the token assignments below ran even on an auth error,
    # raising KeyError on the missing 'access_token' key. Only unpack tokens
    # when authentication succeeded.
    print("\n\nThere was a problem generating an access token with these credentials. Run the first cell again to enter credentials.")
else:
    print("\n\nYour access token expires in 60 minutes. After it expires, run the cell immediately below this one to generate a new token and continue to use the query cell. \n\nFor now, skip ahead to the section 'Make a Query'.")
    access_token = auth_json['access_token']
    refresh_token = auth_json['refresh_token']

# Placeholders for tokens produced by the refresh cell below.
newaccess = ''
newrefresh = ''
#print('access token: ' + access_token + ' refresh token: ' + refresh_token)

# #### Refresh token
# The cell below is only needed to refresh an expired access token after 60
# minutes. When the access token no longer returns results, run the cell below
# to refresh the access token or re-enter credentials by running the cell
# above. Until the refresh token process is needed, skip ahead to
# **Make a Query**.

# In[ ]:

# FIX: the original read `token = token if newrefresh != '' else refresh_token`,
# which raises NameError because `token` is undefined on first use; the
# intended fallback chain is: use the refreshed token if we have one,
# otherwise the one from the original authentication.
token = newrefresh if newrefresh != '' else refresh_token

# OAuth2 "refresh_token" grant to obtain a fresh access token.
refresh_auth = {'client_id': ''.join(clientid),
                'client_secret': ''.join(secret),
                'grant_type': 'refresh_token',
                'platform': 'ipynb',
                'refresh_token': ''.join(token)}

refreshres = requests.post(url, data=refresh_auth)
refresh_json = refreshres.json()
access_token = refresh_json['access_token']
refresh_token = refresh_json['refresh_token']
#print('access token: ' + access_token + 'refresh token: ' + refresh_token)
print('Your access token is refreshed for 60 minutes. If it expires again, run this cell to generate a new token and continue to use the query cells below.')
print(access_token)

# ### Make a query
# After the access token confirmation appears above, you can modify the query
# below, then use the **_Cell >> Run_** menu option from the cell
# **immediately below this text** to run the entire query for results.
#
# The sample results are from 10+ years of data for companies in an SIC code,
# and may take several minutes to recreate. **The document endpoint is a
# benefit of XBRL US Membership** - see https://xbrl.us/membership to get
# started.
#
# Refer to XBRL API documentation at
# https://xbrlus.github.io/xbrl-api/#/Facts/getFactDetails for other endpoints
# and parameters to filter and return.
# In[2]:

# Define the parameters for the filter and fields to be returned,
# then run the loop to page through all results.

offset_value = 0   # running record offset, advanced by the page limit each loop
res_df = []        # accumulates the 'data' rows from every page

# Define the parameters of the query. This query returns all SEC reports
# where 'Microsoft' and 'internet of things' appear within 300 words of
# each other.
text_search = ["Microsoft NEAR/300 \"internet of things\""]
report_source = ["SEC"]

# Define data fields to return (multi-sort based on order).
fields = [  # the characteristics of the data being returned by the query
    'entity.name.sort(ASC)',
    'dts.id.sort(DESC)',
    'document.type',
    'document.uri',
    'document.example'
]

params = {  # what's queried against the search endpoint, and the fields returned
    'document.text-search': ','.join(text_search),
    'report.source-name': ','.join(report_source),
    'fields': ','.join(fields)
}

# Execute the query with a loop for all results.
search_endpoint = 'https://api.xbrl.us/api/v1/document/search'
orig_fields = params['fields']  # kept pristine; the offset is re-appended each page
query_start = datetime.now()
printed = False

while True:
    if not printed:
        print("On", query_start.strftime("%c"), email, "(client ID:", str(clientid.split('-')[0]), "...) started the query and ...\n")
        printed = True
    res = requests.get(search_endpoint, params=params,
                       headers={'Authorization': 'Bearer {}'.format(access_token)})
    res_json = res.json()
    if 'error' in res_json:
        print('There was an error: {}'.format(res_json['error_description']))
        break
    page_limit = res_json['paging']['limit']
    print("up to", str(offset_value + page_limit), "records are found so far ...")
    res_df += res_json['data']
    if res_json['paging']['count'] < page_limit:
        # A short page means this was the last page of results.
        print(" - this set contained fewer than the", page_limit,
              "possible, only", str(res_json['paging']['count']), "records.")
        break
    offset_value += page_limit
    # Account-tier caps: non-Member accounts (limit 100) get at most 10 pages,
    # Basic Individual Member accounts (limit 500) at most 4 pages.
    # FIX: the original assigned params['fields'] in each branch and then
    # again unconditionally; the branch assignments were dead code because
    # they were immediately followed by either `break` or the identical
    # unconditional assignment. One assignment after the cap check suffices.
    if (page_limit == 100 and offset_value == 10 * page_limit) or \
       (page_limit == 500 and offset_value == 4 * page_limit):
        break
    # NOTE(review): 'fact.offset' looks carried over from a facts-endpoint
    # example even though this is the document endpoint — confirm the field
    # name against the XBRL API documentation. Preserved as-is here.
    params['fields'] = orig_fields + ',fact.offset({})'.format(offset_value)

# FIX: df is now built unconditionally so the export cell below cannot hit a
# NameError after a failed query (it simply writes an empty CSV instead).
df = pd.DataFrame(res_df)

if not 'error' in res_json:
    current_datetime = datetime.now().replace(microsecond=0)
    time_taken = current_datetime - query_start
    total_rows = len(df.index)
    your_limit = res_json['paging']['limit']
    limit_message = "If the results below match the limit noted above, you might not be seeing all rows, and should consider upgrading (https://xbrl.us/access-token).\n"
    if your_limit == 100:
        print("\nThis non-Member account has a limit of ", 10 * your_limit, " rows per query from our Public Filings Database. " + limit_message)
    elif your_limit == 500:
        print("\nThis Basic Individual Member account has a limit of ", 4 * your_limit, " rows per query from our Public Filings Database. " + limit_message)
    print("\nAt " + current_datetime.strftime("%c") + ", the query finished with " +
          str(total_rows) + " rows returned in " + str(time_taken) + " " +
          urllib.parse.unquote(res.url))

    # The format truncates the HTML display of numerical values to two
    # decimals; .csv data is unaffected.
    pd.options.display.float_format = '{:,.2f}'.format
    display(HTML(df.to_html()))

# In[8]:

# If you run this program locally, you can save the output to a file
# on your computer (modify D:\results.csv to your system).
df.to_csv(r"D:\results.csv", sep=",")