#default_exp linkcheck
API for fast local and online link checking
#export
from fastcore.all import *
from html.parser import HTMLParser
from urllib.parse import urlparse,urlunparse
from fastcore.script import SCRIPT_INFO
import sys
#export
class _HTMLParseAttrs(HTMLParser):
def reset(self):
super().reset()
self.found = set()
def handle_starttag(self, tag, attrs):
a = first(v for k,v in attrs if k in ("src","href"))
if a: self.found.add(a)
handle_startendtag = handle_starttag
#export
def get_links(fn):
    "All `src`/`href` values found in the HTML file `fn`, as an `L`"
    parser = _HTMLParseAttrs()
    html = Path(fn).read_text()
    parser.feed(html)
    # `found` is a set, so duplicates within the file are already collapsed
    return L(parser.found)
We can use get_links
to parse an HTML file for different types of links. For example, these are the contents of ./_example/test.html
:
example = Path('_example/test.html')
print(example.read_text())
<a href="//somecdn.com/doesntexist.html"></a> <a href="http://www.bing.com"></a> <script src="test.js"></script> <img src="http://fastlinkcheck.com/test.html" />
Calling get_links
with the above file path will return a list of links:
links = get_links(example)
test_eq(set(links), {'test.js',
'//somecdn.com/doesntexist.html',
'http://www.bing.com','http://fastlinkcheck.com/test.html'})
#export
def _local_url(u, root, host, fname):
"Change url `u` to local path if it is a local link"
fpath = Path(fname).parent
islocal=False
# remove `host` prefix
for o in 'http://','https://','http://www.','https://www.':
if u.startswith(o+host): u,islocal = remove_prefix(u, o+host),True
# remove params, querystring, and fragment
p = list(urlparse(u))[:5]+['']
# local prefix, or no protocol or host
if islocal or (not p[0] and not p[1]):
u = p[2]
if u and u[0]=='/': return (root/u[1:]).resolve()
else: return (fpath/u).resolve()
# URLs without a protocol are "protocol relative"
if not p[0]: p[0]='http'
# mailto etc are not checked
if p[0] not in ('http','https'): return ''
return urlunparse(p)
#export
class _LinkMap(dict):
"""A dict that pretty prints Links and their associated locations."""
def _repr_locs(self, k): return '\n'.join(f' - `{p}`' for p in self[k])
def __repr__(self):
rstr = L(f'- {k!r} was found in the following pages:\n{self._repr_locs(k)}' for k in self).concat()
return '\n'.join(rstr)
_repr_markdown_ = __repr__
#export
def local_urls(path:Path, host:str):
    "returns a `dict` mapping all HTML files in `path` to a list of locally-resolved links in that file"
    path = Path(path)
    # `.html` files first, then `.htm`, both searched recursively
    html_files = list(path.glob('**/*.html')) + list(path.glob('**/*.htm'))
    link_map = {}
    for fn in html_files:
        page = fn.resolve()
        for link in get_links(fn):
            # key: resolved link (a `Path` or a URL string); value: pages containing it
            target = _local_url(link, root=path, host=host, fname=fn)
            link_map.setdefault(target, []).append(page)
    return _LinkMap(link_map)
The keys of the dict
returned by local_urls
are links found in HTML files, and the values of this dict
are a list of paths that those links are found in.
Furthermore, local links are returned as Path
objects, whereas external URLs are strings. For example, notice how the link:
http://fastlinkcheck.com/test.html
is resolved to a local path, because the host
parameter supplied to local_urls
, fastlinkcheck.com
matches the url in the link:
path = Path('./_example')
links = local_urls(path, host='fastlinkcheck.com')
links
/Users/hamelsmu/github/fastlinkcheck/_example/test.html
/Users/hamelsmu/github/fastlinkcheck/_example/test.html
/Users/hamelsmu/github/fastlinkcheck/_example/test.html
/Users/hamelsmu/github/fastlinkcheck/_example/test.html
#export
def broken_local(links, ignore_paths=None):
    "Keys of `links` that are `Path`s, are not in `ignore_paths`, and do not exist on disk"
    skip = setify(ignore_paths)
    def _is_missing(o):
        # Only `Path` keys are local links; string keys are URLs
        if not isinstance(o, Path): return False
        if o in skip: return False
        return not o.exists()
    return L(filter(_is_missing, links))
Since test.js
does not exist in the _example/
directory, broken_local
returns this path:
broken_local(links)
(#1) [Path('/Users/hamelsmu/github/fastlinkcheck/_example/test.js')]
# Every path reported by `broken_local` must be missing, and at least one must
# be reported. `not all(...)` would also pass if some reported paths existed.
broken = broken_local(links)
assert broken and not any(x.exists() for x in broken)
#export
def broken_urls(links, ignore_urls=None):
    "Keys of `links` that are URL strings, are not in `ignore_urls`, and fail `urlcheck`"
    skip = setify(ignore_urls)
    # Only string keys are URLs; `Path` keys are local links
    candidates = L(filter(lambda o: isinstance(o, str) and o not in skip, links))
    # Check the URLs concurrently on a pool of 32 threads
    statuses = parallel(urlcheck, candidates, n_workers=32, threadpool=True)
    return L(url for url,ok in zip(candidates, statuses) if not ok)
Similarly the url http://somecdn.com/doesntexist.html
doesn't exist, which is why it is returned by broken_urls
assert broken_urls(links) == ['http://somecdn.com/doesntexist.html']
#export
@call_parse
def link_check(path:Param("Root directory searched recursively for HTML files", str),
               host:Param("Host and path (without protocol) of web server", str)='',
               config_file:Param("Location of file with urls to ignore",str)=None,
               actions_output:Param("Toggle GitHub Actions output on/off",store_true)=False,
               exit_on_found:Param("(CLI Only) Exit with status code 1 if broken links are found", store_true)=False,
               print_logs:Param("Toggle printing logs to stdout.", store_true)=False):
    """Check for broken links recursively in `path`."""
    path = Path(path)
    # True when the function was invoked as the `link_check` console script
    is_cli = (SCRIPT_INFO.func == 'link_check')
    assert path.exists(), f"{path.absolute()} does not exist."
    if config_file:
        assert Path(config_file).is_file(), f"{config_file} is either not a file or doesn't exist."
    # One ignore entry per line of the config file; nothing to ignore without a config
    raw_lines = Path(config_file).readlines() if config_file else ''
    ignore = L(line.strip() for line in raw_lines)
    links = local_urls(path, host=host)
    # Entries that are not valid URLs are taken as paths relative to `path`
    ignore_paths = {(path/entry).resolve() for entry in ignore if not urlvalid(entry)}
    ignore_urls = set(ignore.filter(urlvalid))
    broken = broken_urls(links, ignore_urls) + broken_local(links, ignore_paths)
    lm = _LinkMap({k:links[k] for k in broken})
    if actions_output:
        print(f"::set-output name=broken_links::{bool(lm)}")
    if lm: msg = f'\nERROR: The Following Broken Links or Paths were found:\n{lm}'
    else: msg = 'No Broken Links Found!'
    if is_cli or print_logs: print(msg)
    # A non-zero exit status is only produced when running from the command line
    if is_cli and lm and exit_on_found: sys.exit(1)
    return lm
link_check(path='_example', host='fastlinkcheck.com')
/Users/hamelsmu/github/fastlinkcheck/_example/test.html
/Users/hamelsmu/github/fastlinkcheck/_example/test.html
You can choose to ignore certain paths and urls by supplying a plain-text file that lists them. For example, the file linkcheck.rc
contains a list of urls I want to ignore:
print((path/'linkcheck.rc').read_text())
test.js https://www.google.com
In this case _example/test.js
will be filtered out from the list:
link_check(path='_example', host='fastlinkcheck.com', config_file='_example/linkcheck.rc')
/Users/hamelsmu/github/fastlinkcheck/_example/test.html
You can optionally emit the variable broken_links
for GitHub Actions as discussed here. This variable will be either True
or False
depending on if broken links are found or not:
link_check(path="_example", host="fastlinkcheck.com", actions_output=True)
::set-output name=broken_links::True
/Users/hamelsmu/github/fastlinkcheck/_example/test.html
/Users/hamelsmu/github/fastlinkcheck/_example/test.html
link_check
can also be called from the command line like this:
Note: the
!
command in Jupyter allows you to run shell commands
The -h
or --help
flag will allow you to see the command line docs:
!link_check -h
usage: link_check [-h] [--host HOST] [--config_file CONFIG_FILE] [--actions_output] [--exit_on_found] [--print_logs] [--pdb] [--xtra XTRA] path Check for broken links recursively in `path`. positional arguments: path Root directory searched recursively for HTML files optional arguments: -h, --help show this help message and exit --host HOST Host and path (without protocol) of web server (default: ) --config_file CONFIG_FILE Location of file with urls to ignore --actions_output Toggle GitHub Actions output on/off (default: False) --exit_on_found (CLI Only) Exit with status code 1 if broken links are found (default: False) --print_logs Toggle printing logs to stdout. (default: False) --pdb Run in pdb debugger (default: False) --xtra XTRA Parse for additional args (default: '')
#hide
from nbdev.export import *
notebook2script()
Converted index.ipynb. Converted linkcheck.ipynb.