#default_exp page
Parallel and serial pagination
#export
from fastcore.utils import *
from fastcore.foundation import *
from ghapi.core import *
import re
from urllib.parse import parse_qs,urlsplit
Some GitHub API operations return their results one page at a time. For instance, there are many thousands of gists, but if we call `list_public` we only see the first 30:
api = GhApi()
gists = api.gists.list_public()
len(gists)
30
That's because this operation takes two optional parameters, `per_page` and `page`:
api.gists.list_public
gists.list_public(since, per_page, page): List public gists
This is a common pattern for `list_*` operations in the GitHub API. One way to get more results is to increase `per_page`:
len(api.gists.list_public(per_page=100))
100
However, `per_page` has a maximum of 100, so if you want more you'll have to pass `page=` to get pages beyond the first. An easy way to iterate through all the pages is to use `paged`, which returns a generator:
#export
def paged(oper, *args, per_page=30, max_pages=9999, **kwargs):
"Convert operation `oper(*args,**kwargs)` into an iterator"
yield from itertools.takewhile(noop, (oper(*args, per_page=per_page, page=i, **kwargs) for i in range(1,max_pages+1)))
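The `takewhile(noop, ...)` is what ends the iteration: `noop` simply returns its argument, so pages keep being yielded until an operation returns an empty (falsey) page, or `max_pages` is reached. Here's a minimal sketch of that behaviour using plain lists in place of API results (the `fake_pages` data is made up purely for illustration):
fake_pages = [[1,2,3], [4,5], [], [6]]       # an empty page marks the end of the results
list(itertools.takewhile(noop, fake_pages))  # -> [[1, 2, 3], [4, 5]]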
We'll demonstrate this using the `repos.list_for_org` method:
api.repos.list_for_org
repos.list_for_org(org, type, sort, direction, per_page, page): List organization repositories
repos = api.repos.list_for_org(org='fastai')
len(repos),repos[0].name
(30, 'docs')
To convert this operation into a Python iterator, pass the operation itself, along with any arguments (either keyword or positional), to `paged`. Note how the function and its arguments are passed separately:
repos = paged(api.repos.list_for_org, org='fastai')
Note that the object returned from `paged` is a generator. You can iterate through this generator, `repos`, in the normal way:
for page in repos: print(len(page), page[0].name)
30 docs
30 fastscript
25 wireguard-fast
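Note that a generator can only be consumed once, so the loop above has exhausted `repos`. If you need to iterate over the pages again, call `paged` again to get a fresh generator; a quick sketch:
repos = paged(api.repos.list_for_org, org='fastai')  # a fresh generator
len(next(repos))                                     # the first page again: 30 repos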
GitHub tells us how many pages are available using the `Link` header. Unfortunately the PyPI `LinkHeader` library appears to no longer be maintained, so we've put a refactored version of it here.
#export
class _Scanner:
def __init__(self, buf): self.buf,self.match = buf,None
def __getitem__(self, key): return self.match.group(key)
def scan(self, pattern):
self.match = re.compile(pattern).match(self.buf)
if self.match: self.buf = self.buf[self.match.end():]
return self.match
_QUOTED = r'"((?:[^"\\]|\\.)*)"'
_TOKEN = r'([^()<>@,;:\"\[\]?={}\s]+)'
_RE_COMMA_HREF = r' *,? *< *([^>]*) *> *'
_RE_ATTR = rf'{_TOKEN} *(?:= *({_TOKEN}|{_QUOTED}))? *'
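To get a feel for how the scanner and these patterns work, here's a small illustrative run (not part of the exported module) that pulls two attributes out of a fragment of a link header:
s = _Scanner('type=text/html; rel="next"')
s.scan(_RE_ATTR)
print(s[1], s[3], s[4])  # type text/html None -- a plain token value
s.scan('; *')            # consume the separator
s.scan(_RE_ATTR)
print(s[1], s[3], s[4])  # rel None next -- a quoted value this time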
#export
def _parse_link_hdr(header):
"Parse an RFC 5988 link header, returning a `list` of `tuple`s of URL and attr `dict`"
scanner,links = _Scanner(header),[]
while scanner.scan(_RE_COMMA_HREF):
href,attrs = scanner[1],[]
while scanner.scan('; *'):
if scanner.scan(_RE_ATTR):
attr_name, token, quoted = scanner[1], scanner[3], scanner[4]
if quoted is not None: attrs.append([attr_name, quoted.replace(r'\"', '"')])
elif token is not None: attrs.append([attr_name, token])
else: attrs.append([attr_name, None])
links.append((href,dict(attrs)))
if scanner.buf: raise Exception(f"parse() failed at {scanner.buf!r}")
return links
#export
def parse_link_hdr(header):
"Parse an RFC 5988 link header, returning a `dict` from rels to a `tuple` of URL and attrs `dict`"
return {a.pop('rel'):(u,a) for u,a in _parse_link_hdr(header)}
Here's an example of a link header with just one link:
parse_link_hdr('<http://example.com>; rel="foo bar"; type=text/html')
{'foo bar': ('http://example.com', {'type': 'text/html'})}
links = parse_link_hdr('<http://example.com>; rel="foo bar"; type=text/html')
link = links['foo bar']
test_eq(link[0], 'http://example.com')
test_eq(link[1]['type'], 'text/html')
Let's test it on the headers we received on our last call to GitHub. You can access the last call's headers in `recv_hdrs`:
api.recv_hdrs['Link']
'<https://api.github.com/organizations/20547620/repos?per_page=30&page=4>; rel="prev", <https://api.github.com/organizations/20547620/repos?per_page=30&page=4>; rel="last", <https://api.github.com/organizations/20547620/repos?per_page=30&page=1>; rel="first"'
Here's what happens when we parse that:
parse_link_hdr(api.recv_hdrs['Link'])
{'prev': ('https://api.github.com/organizations/20547620/repos?per_page=30&page=4', {}), 'last': ('https://api.github.com/organizations/20547620/repos?per_page=30&page=4', {}), 'first': ('https://api.github.com/organizations/20547620/repos?per_page=30&page=1', {})}
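For example, we can pull the last page number out of that header by hand, using `urlsplit` and `parse_qs` on the `last` link's URL; this is essentially what `last_page`, defined below, does for us:
last_url = parse_link_hdr(api.recv_hdrs['Link'])['last'][0]
int(parse_qs(urlsplit(last_url).query)['page'][0])  # -> 4 for the header above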
Rather than requesting each page one at a time, we can save some time by getting all the pages we need in parallel.
#export
@patch
def last_page(self:GhApi):
"Parse RFC 5988 link header from most recent operation, and extract the last page"
header = self.recv_hdrs.get('Link', '')
last = nested_idx(parse_link_hdr(header), 'last', 0) or ''
qs = parse_qs(urlsplit(last).query)
return int(nested_idx(qs,'page',0) or 0)
To help us know the number of pages needed, we can use `last_page`, which uses the `Link` header we just looked at to grab the last page number from GitHub.
We will need multiple pages to get all the repos in the `github` organization, even if we get 100 at a time:
api.repos.list_for_org('github', per_page=100)
api.last_page()
4
#export
def _call_page(i, oper, args, kwargs, per_page):
return oper(*args, per_page=per_page, page=i, **kwargs)
#export
def pages(oper, n_pages, *args, n_workers=None, per_page=100, **kwargs):
"Get `n_pages` pages from `oper(*args,**kwargs)`"
return parallel(_call_page, range(1,n_pages+1), oper=oper, per_page=per_page, args=args, kwargs=kwargs,
progress=False, n_workers=ifnone(n_workers,n_pages), threadpool=True)
`pages` by default passes `per_page=100` to the operation.
Let's look at some examples. To get all the pages for the repos in the `github` organization in parallel, we can use this:
gh_repos = pages(api.repos.list_for_org, api.last_page(), 'github').concat()
len(gh_repos)
367
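By default `pages` uses one worker per page (since `n_workers` defaults to `n_pages`). If you'd rather limit concurrency, for instance to put less simultaneous load on the API, pass `n_workers` explicitly. A hypothetical variant of the call above:
gh_repos = pages(api.repos.list_for_org, 4, 'github', n_workers=2).concat()  # at most 2 requests in flight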
If you already know ahead of time the number of pages required, there's no need to call `last_page`. For instance, the GitHub docs specify that we can get at most 3000 gists:
gists = pages(api.gists.list_public, 30).concat()
len(gists)
3000
GitHub ignores the `per_page` parameter for some API calls, such as listing public events, which it limits to 8 pages of 30 items per page. To retrieve all pages in these cases, you need to explicitly pass the lower per-page limit:
api.activity.list_public_events()
api.last_page()
8
evts = pages(api.activity.list_public_events, api.last_page(), per_page=30).concat()
len(evts)
232
#hide
from nbdev.export import notebook2script
notebook2script()
Converted 00_core.ipynb.
Converted 01_actions.ipynb.
Converted 02_auth.ipynb.
Converted 03_page.ipynb.
Converted 04_event.ipynb.
Converted 10_cli.ipynb.
Converted 50_fullapi.ipynb.
Converted 80_tutorial_actions.ipynb.
Converted 90_build_lib.ipynb.
Converted Untitled.ipynb.
Converted ghapi demo.ipynb.
Converted index.ipynb.