#default_exp linkcheck

#export
from fastcore.all import *
from html.parser import HTMLParser
from urllib.parse import urlparse,urlunparse
from fastcore.script import SCRIPT_INFO
import sys

#export
class _HTMLParseAttrs(HTMLParser):
    "HTML parser that collects `src`/`href` attribute values of the tags it sees into the set `self.found`"
    def reset(self):
        # `reset` is also the place where the result set is (re)initialised,
        # so every reset of the parser starts with an empty `found`.
        super().reset()
        self.found = set()
    def handle_starttag(self, tag, attrs):
        # Only the first `src`/`href` attribute of a tag is considered;
        # falsy values (e.g. a missing or empty attribute value) are skipped.
        a = first(v for k,v in attrs if k in ("src","href"))
        if a: self.found.add(a)
    # Self-closing tags (`<img ... />`) are handled exactly like opening tags
    handle_startendtag = handle_starttag

#export
def get_links(fn):
    "List of all links in file `fn`"
    h = _HTMLParseAttrs()
    h.feed(Path(fn).read_text())
    # `found` is a set, so each distinct link appears once
    return L(h.found)

example = Path('_example/test.html')
print(example.read_text())

links = get_links(example)
test_eq(set(links), {'test.js',
                     '//somecdn.com/doesntexist.html',
                     'http://www.bing.com','http://fastlinkcheck.com/test.html'})

#export
def _local_url(u, root, host, fname):
    """Change url `u` to local path if it is a local link

    Returns a resolved `Path` for local links (links prefixed with `host`, or
    links with neither scheme nor network location), `''` for links whose
    scheme is not http/https, and otherwise the URL as a string with its
    fragment removed. Root-relative local links (`/x`) are resolved against
    `root`; all other local links against the directory containing `fname`.
    """
    fpath = Path(fname).parent
    islocal=False
    # remove `host` prefix
    # An empty `host` must be skipped: `u.startswith('http://'+'')` is true for
    # every absolute URL, which would turn all remote links into local paths.
    if host:
        for o in 'http://','https://','http://www.','https://www.':
            if u.startswith(o+host): u,islocal = remove_prefix(u, o+host),True
    # drop the fragment (scheme, netloc, path, params and query are kept here;
    # local links below keep only the path component)
    p = list(urlparse(u))[:5]+['']
    # local prefix, or no protocol or host
    if islocal or (not p[0] and not p[1]):
        u = p[2]
        if u and u[0]=='/': return (root/u[1:]).resolve()
        else: return (fpath/u).resolve()
    # URLs without a protocol are "protocol relative"
    if not p[0]: p[0]='http'
    # mailto etc are not checked
    if p[0] not in ('http','https'): return ''
    return urlunparse(p)

#export
class _LinkMap(dict):
    """A dict that pretty prints Links and their associated locations."""
    def _repr_locs(self, k):
        "One markdown bullet per location stored under key `k`"
        return '\n'.join(f'  - `{p}`' for p in self[k])
    def __repr__(self):
        rstr = L(f'- {k!r} was found in the following pages:\n{self._repr_locs(k)}' for k in self).concat()
        return '\n'.join(rstr)
    # Same text is used for rich (markdown) display in notebooks
    _repr_markdown_ = __repr__

#export
def local_urls(path:Path, host:str):
    "returns a `dict` mapping each locally-resolved link found in the HTML files under `path` to the list of files containing it"
    path=Path(path)
    fns = L(path.glob('**/*.html'))+L(path.glob('**/*.htm'))
    # (file, resolved link) pairs for every link of every HTML file
    found = [(fn.resolve(),_local_url(link, root=path, host=host, fname=fn))
             for fn in fns for link in get_links(fn)]
    # group by the resolved link (index 1), collecting the files (index 0)
    return _LinkMap(groupby(found, 1, 0))

path = Path('./_example')
links = local_urls(path, host='fastlinkcheck.com')
links

#export
def broken_local(links, ignore_paths=None):
    "List of items in keys of `links` that are `Path`s that do not exist"
    ignore_paths = setify(ignore_paths)
    return L(o for o in links if isinstance(o,Path) and o not in ignore_paths and not o.exists())

broken_local(links)

assert not all([x.exists() for x in broken_local(links)])

#export
def broken_urls(links, ignore_urls=None):
    "List of items in keys of `links` that are URLs that return a failure status code"
    ignore_urls = setify(ignore_urls)
    # string keys are remote URLs; `Path` keys are handled by `broken_local`
    its = L(o for o in links if isinstance(o, str) and o not in ignore_urls)
    # check the URLs concurrently in a thread pool
    working_urls = parallel(urlcheck, its, n_workers=32, threadpool=True)
    return L(o for o,p in zip(its,working_urls) if not p)

assert broken_urls(links) == ['http://somecdn.com/doesntexist.html']

#export
@call_parse
def link_check(path:Param("Root directory searched recursively for HTML files", str),
               host:Param("Host and path (without protocol) of web server", str)='',
               config_file:Param("Location of file with urls to ignore",str)=None,
               actions_output:Param("Toggle GitHub Actions output on/off",store_true)=False,
               exit_on_found:Param("(CLI Only) Exit with status code 1 if broken links are found", store_true)=False,
               print_logs:Param("Toggle printing logs to stdout.", store_true)=False):
    """Check for broken links recursively in `path`."""
    path = Path(path)
    # NOTE(review): presumably `SCRIPT_INFO.func` is set when run as a console script -- confirm
    is_cli = (SCRIPT_INFO.func == 'link_check')
    assert path.exists(), f"{path.absolute()} does not exist."
    if config_file: assert Path(config_file).is_file(), f"{config_file} is either not a file or doesn't exist."
    # one ignore entry per line of the config file (none when no file is given)
    ignore = L(x.strip() for x in (Path(config_file).readlines() if config_file else ''))
    links = local_urls(path, host=host)
    # entries that are not valid URLs are treated as paths relative to `path`
    ignore_paths = set((path/o).resolve() for o in ignore if not urlvalid(o))
    ignore_urls = set(ignore.filter(urlvalid))
    # keep only the broken links, together with the pages they were found in
    lm = _LinkMap({k:links[k] for k in (broken_urls(links, ignore_urls) + broken_local(links, ignore_paths))})
    # NOTE(review): `::set-output` is a GitHub Actions workflow command that GitHub has deprecated -- confirm whether to migrate
    if actions_output: print(f"::set-output name=broken_links::{bool(lm)}")
    msg = f'\nERROR: The Following Broken Links or Paths were found:\n{lm}' if lm else 'No Broken Links Found!'
    if print_logs or is_cli: print(msg)
    # only the CLI exits with a status code; library callers get the map back
    if is_cli and lm and exit_on_found: sys.exit(1)
    else: return lm

link_check(path='_example', host='fastlinkcheck.com')

print((path/'linkcheck.rc').read_text())

link_check(path='_example', host='fastlinkcheck.com', config_file='_example/linkcheck.rc')

link_check(path="_example", host="fastlinkcheck.com", actions_output=True)

# IPython shell escape from the source notebook; not valid in a plain .py file,
# so it is kept as a comment. Run it from a notebook or a shell.
# !link_check -h

#hide
from nbdev.export import *
notebook2script()