In [ ]:

#default_exp linkcheck

fastlinkcheck API¶

API for fast local and online link checking

In [ ]:

#export
from fastcore.all import *
from html.parser import HTMLParser
from urllib.parse import urlparse,urlunparse
from fastcore.script import SCRIPT_INFO
import sys

Find links in an HTML file¶

In [ ]:

#export
class _HTMLParseAttrs(HTMLParser):
    def reset(self):
        super().reset()
        self.found = set()
    def handle_starttag(self, tag, attrs):
        a = first(v for k,v in attrs if k in ("src","href"))
        if a: self.found.add(a)
    handle_startendtag = handle_starttag

In [ ]:

#export
def get_links(fn):
    "Return an `L` of every `src`/`href` value found in the HTML file `fn`"
    parser = _HTMLParseAttrs()
    html = Path(fn).read_text()
    parser.feed(html)
    # `found` is a set, so duplicates within the file are already collapsed
    return L(parser.found)

We can use get_links to parse an HTML file for different types of links. For example, these are the contents of ./_example/test.html:

In [ ]:

example = Path('_example/test.html')
print(example.read_text())

<a href="//somecdn.com/doesntexist.html"></a>
<a href="http://www.bing.com"></a>
<script src="test.js"></script>
<img src="http://fastlinkcheck.com/test.html" />

Calling get_links with the above file path will return a list of links:

In [ ]:

links = get_links(example)
test_eq(set(links), {'test.js',
                     '//somecdn.com/doesntexist.html',
                     'http://www.bing.com','http://fastlinkcheck.com/test.html'})

In [ ]:

#export
def _local_url(u, root, host, fname):
    "Change url `u` to local path if it is a local link"
    fpath = Path(fname).parent
    islocal=False
    # remove `host` prefix
    for o in 'http://','https://','http://www.','https://www.':
        if u.startswith(o+host): u,islocal = remove_prefix(u, o+host),True
    # remove params, querystring, and fragment
    p = list(urlparse(u))[:5]+['']
    # local prefix, or no protocol or host
    if islocal or (not p[0] and not p[1]):
        u = p[2]
        if u and u[0]=='/': return (root/u[1:]).resolve()
        else: return (fpath/u).resolve()
    # URLs without a protocol are "protocol relative"
    if not p[0]: p[0]='http'
    # mailto etc are not checked
    if p[0] not in ('http','https'): return ''
    return urlunparse(p)

In [ ]:

#export
class _LinkMap(dict):
    """A dict that pretty prints Links and their associated locations."""
    def _repr_locs(self, k): return '\n'.join(f'  - `{p}`' for p in self[k])
    def __repr__(self):
        rstr = L(f'- {k!r} was found in the following pages:\n{self._repr_locs(k)}' for k in self).concat()
        return '\n'.join(rstr)
    _repr_markdown_ = __repr__

In [ ]:

#export
def local_urls(path:Path, host:str):
    "Return a `_LinkMap` whose keys are the locally-resolved links found under `path` and whose values list the HTML files containing each link"
    path = Path(path)
    html_files = L(path.glob('**/*.html')) + L(path.glob('**/*.htm'))
    pairs = []
    for html_file in html_files:
        src = html_file.resolve()
        for link in get_links(html_file):
            pairs.append((src, _local_url(link, root=path, host=host, fname=html_file)))
    # group on the resolved link (item 1) and collect the files (item 0) that contain it
    return _LinkMap(groupby(pairs, 1, 0))

The keys of the dict returned by local_urls are links found in HTML files, and the values of this dict are a list of paths that those links are found in.

Furthermore, local links are returned as Path objects, whereas external URLs are strings. For example, notice how the link:

http://fastlinkcheck.com/test.html

is resolved to a local path, because the host parameter supplied to local_urls, fastlinkcheck.com matches the url in the link:

In [ ]:

path = Path('./_example')
links = local_urls(path, host='fastlinkcheck.com')
links

Out[ ]:

'http://somecdn.com/doesntexist.html' was found in the following pages:
- /Users/hamelsmu/github/fastlinkcheck/_example/test.html
Path('/Users/hamelsmu/github/fastlinkcheck/_example/test.html') was found in the following pages:
- /Users/hamelsmu/github/fastlinkcheck/_example/test.html
'http://www.bing.com' was found in the following pages:
- /Users/hamelsmu/github/fastlinkcheck/_example/test.html
Path('/Users/hamelsmu/github/fastlinkcheck/_example/test.js') was found in the following pages:
- /Users/hamelsmu/github/fastlinkcheck/_example/test.html

Finding broken links¶

In [ ]:

#export
def broken_local(links, ignore_paths=None):
    "Return the `Path` keys of `links` that are not in `ignore_paths` and do not exist on disk"
    skip = setify(ignore_paths)
    def _is_broken(o): return isinstance(o,Path) and o not in skip and not o.exists()
    return L(filter(_is_broken, links))

Since test.js does not exist in the _example/ directory, broken_local returns this path:

In [ ]:

broken_local(links)

Out[ ]:

(#1) [Path('/Users/hamelsmu/github/fastlinkcheck/_example/test.js')]

In [ ]:

# Every path reported as broken must be missing on disk (holds for an empty result too)
assert not any(x.exists() for x in broken_local(links))

In [ ]:

#export
def broken_urls(links, ignore_urls=None):
    "Return the string (URL) keys of `links` that are not in `ignore_urls` and for which `urlcheck` gives a falsy result"
    skip = setify(ignore_urls)
    urls = L(o for o in links if isinstance(o, str) and o not in skip)
    # check the URLs concurrently on a thread pool
    statuses = parallel(urlcheck, urls, n_workers=32, threadpool=True)
    return L(url for url,ok in zip(urls,statuses) if not ok)

Similarly the url http://somecdn.com/doesntexist.html doesn't exist, which is why it is returned by broken_urls

In [ ]:

assert broken_urls(links) == ['http://somecdn.com/doesntexist.html']

In [ ]:

#export
@call_parse
def link_check(path:Param("Root directory searched recursively for HTML files", str),
               host:Param("Host and path (without protocol) of web server", str)='',
               config_file:Param("Location of file with urls to ignore",str)=None,
               actions_output:Param("Toggle GitHub Actions output on/off",store_true)=False,
               exit_on_found:Param("(CLI Only) Exit with status code 1 if broken links are found", store_true)=False,
               print_logs:Param("Toggle printing logs to stdout.", store_true)=False):
    """Check for broken links recursively in `path`."""
    path = Path(path)
    # NOTE(review): presumably `SCRIPT_INFO.func` names the console script being run, if any - confirm
    is_cli = (SCRIPT_INFO.func == 'link_check')
    assert path.exists(), f"{path.absolute()} does not exist."
    # Each line of the config file is one entry to ignore; no config file means nothing is ignored
    if config_file:
        assert Path(config_file).is_file(), f"{config_file} is either not a file or doesn't exist."
        ignore = L(x.strip() for x in Path(config_file).readlines())
    else: ignore = L()
    links = local_urls(path, host=host)
    # Entries that are valid URLs are ignored as-is; the rest are paths relative to `path`
    ignore_urls = set(ignore.filter(urlvalid))
    ignore_paths = set((path/o).resolve() for o in ignore if not urlvalid(o))
    broken = broken_urls(links, ignore_urls) + broken_local(links, ignore_paths)
    lm = _LinkMap({k:links[k] for k in broken})
    if actions_output: print(f"::set-output name=broken_links::{bool(lm)}")
    if lm: msg = f'\nERROR: The Following Broken Links or Paths were found:\n{lm}'
    else: msg = 'No Broken Links Found!'
    if print_logs or is_cli: print(msg)
    # Only the command line entry point exits; library callers always get the map back
    if is_cli and lm and exit_on_found: sys.exit(1)
    return lm

In [ ]:

link_check(path='_example', host='fastlinkcheck.com')

Out[ ]:

'http://somecdn.com/doesntexist.html' was found in the following pages:
- /Users/hamelsmu/github/fastlinkcheck/_example/test.html
Path('/Users/hamelsmu/github/fastlinkcheck/_example/test.js') was found in the following pages:
- /Users/hamelsmu/github/fastlinkcheck/_example/test.html

Ignore links with a configuration file¶

You can choose to ignore links by supplying a plain-text file that lists the paths and urls to ignore, one per line. For example, the file linkcheck.rc contains a list of entries I want to ignore:

In [ ]:

print((path/'linkcheck.rc').read_text())

test.js
https://www.google.com

In this case _example/test.js will be filtered out from the list:

In [ ]:

link_check(path='_example', host='fastlinkcheck.com', config_file='_example/linkcheck.rc')

Out[ ]:

'http://somecdn.com/doesntexist.html' was found in the following pages:
- /Users/hamelsmu/github/fastlinkcheck/_example/test.html

You can optionally emit the variable broken_links for GitHub Actions as discussed here. This variable will be either True or False depending on if broken links are found or not:

In [ ]:

link_check(path="_example", host="fastlinkcheck.com", actions_output=True)

::set-output name=broken_links::True

Out[ ]:

'http://somecdn.com/doesntexist.html' was found in the following pages:
- /Users/hamelsmu/github/fastlinkcheck/_example/test.html
Path('/Users/hamelsmu/github/fastlinkcheck/_example/test.js') was found in the following pages:
- /Users/hamelsmu/github/fastlinkcheck/_example/test.html

link_check can also be called from the command line like this:

Note: the ! command in Jupyter allows you to run shell commands

The -h or --help flag will allow you to see the command line docs:

In [ ]:

!link_check -h

usage: link_check [-h] [--host HOST] [--config_file CONFIG_FILE]
                  [--actions_output] [--exit_on_found] [--print_logs] [--pdb]
                  [--xtra XTRA]
                  path

Check for broken links recursively in `path`.

positional arguments:
  path                  Root directory searched recursively for HTML files

optional arguments:
  -h, --help            show this help message and exit
  --host HOST           Host and path (without protocol) of web server
                        (default: )
  --config_file CONFIG_FILE
                        Location of file with urls to ignore
  --actions_output      Toggle GitHub Actions output on/off (default: False)
  --exit_on_found       (CLI Only) Exit with status code 1 if broken links are
                        found (default: False)
  --print_logs          Toggle printing logs to stdout. (default: False)
  --pdb                 Run in pdb debugger (default: False)
  --xtra XTRA           Parse for additional args (default: '')

In [ ]:

#hide
from nbdev.export import *
notebook2script()

Converted index.ipynb.
Converted linkcheck.ipynb.

In [ ]: