API for fast local and online link checking
#export
from fastcore.all import *
from html.parser import HTMLParser
from urllib.parse import urlparse,urlunparse
#export
class _HTMLParseAttrs(HTMLParser):
    def reset(self):
        super().reset()
        self.found = set()
    def handle_starttag(self, tag, attrs):
        a = first(v for k,v in attrs if k in ("src","href"))
        if a: self.found.add(a)
    handle_startendtag = handle_starttag
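As a quick illustration of what this parser collects (the HTML snippet below is made up for this example), feeding it some markup and inspecting `found` shows the deduplicated set of `src` and `href` values, with self-closing tags handled by the `handle_startendtag` alias:

p = _HTMLParseAttrs()
p.feed('<a href="a.html"></a><img src="logo.png"/><a href="a.html"></a>')
p.found
# {'a.html', 'logo.png'}  (set order may vary)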
#export
def get_links(fn):
    "List of all links in file `fn`"
    h = _HTMLParseAttrs()
    h.feed(Path(fn).read_text())
    return L(h.found)
We can use `get_links` to parse an HTML file for different types of links. For example, this is the contents of `./example/test.html`:
!cat ./example/test.html
<a href="//somecdn.com/doesntexist.html"></a> <a href="http://www.bing.com"></a> <script src="test.js"></script> <img src="http://fastlinkcheck.com/test.html" />
Calling `get_links` with the above file path will return a list of links:
links = get_links('./example/test.html')
links
(#4) ['http://fastlinkcheck.com/test.html','http://www.bing.com','//somecdn.com/doesntexist.html','test.js']
test_eq(set(links), {'test.js',
                     '//somecdn.com/doesntexist.html',
                     'http://www.bing.com',
                     'http://fastlinkcheck.com/test.html'})
#export
def _local_url(u, root, host, fname):
    "Change url `u` to local path if it is a local link"
    fpath = Path(fname).parent
    islocal=False
    # remove `host` prefix
    for o in 'http://','https://','http://www.','https://www.':
        if u.startswith(o+host): u,islocal = remove_prefix(u, o+host),True
    # remove params, querystring, and fragment
    p = list(urlparse(u))[:3] + ['','','']
    # local prefix, or no protocol or host
    if islocal or (not p[0] and not p[1]):
        u = p[2]
        if u and u[0]=='/': return (root/u[1:]).resolve()
        else: return (fpath/u).resolve()
    # URLs without a protocol are "protocol relative"
    if not p[0]: p[0]='http'
    # mailto etc are not checked
    if p[0] not in ('http','https'): return ''
    return urlunparse(p)
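To make the behaviour of this helper concrete, here is a sketch (the arguments below are made up for illustration) of how the three kinds of urls are treated: a link on the checked host becomes a local `Path`, an external url is returned as a cleaned-up string, and non-http schemes such as `mailto:` are skipped by returning an empty string:

root = Path('./example')
fname = root/'test.html'
_local_url('http://fastlinkcheck.com/test.html', root=root, host='fastlinkcheck.com', fname=fname)  # -> local Path to example/test.html
_local_url('http://www.bing.com', root=root, host='fastlinkcheck.com', fname=fname)                 # -> 'http://www.bing.com'
_local_url('mailto:someone@example.com', root=root, host='fastlinkcheck.com', fname=fname)          # -> ''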
class LinkMap(dict):
    """A dict that pretty prints Links and their associated locations."""
    def __repr__(self):
        rstr=''
        for k in self:
            rstr+=f'Link: {repr(k)}\n Locations found:\n'
            for p in self[k]:
                rstr+=f' - {p}\n'
            rstr+='\n'
        return rstr
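`LinkMap` is just a plain `dict` with a friendlier `__repr__`; building one by hand (with made-up data) shows the format used in the rest of this page:

LinkMap({'http://example.com/missing': ['/site/index.html', '/site/about.html']})
# Link: 'http://example.com/missing'
#  Locations found:
#  - /site/index.html
#  - /site/about.html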
#export
def local_urls(path:Path, host:str):
    "returns a `dict` mapping all HTML files in `path` to a list of locally-resolved links in that file"
    path=Path(path)
    fns = L(path.glob('**/*.html'))+L(path.glob('**/*.htm'))
    found = [(fn.resolve(),_local_url(link, root=path, host=host, fname=fn))
             for fn in fns for link in get_links(fn)]
    return LinkMap(groupby(found, 1, 0))
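The `groupby(found, 1, 0)` call at the end uses fastcore's `groupby` with integer indexes: group the `(file, link)` tuples by element 1 (the link), keeping element 0 (the file) as the values. A small sketch with made-up pairs shows the shape of the result:

pairs = [('/site/a.html', 'x.js'), ('/site/b.html', 'x.js'), ('/site/a.html', 'http://example.com')]
groupby(pairs, 1, 0)
# {'x.js': ['/site/a.html', '/site/b.html'], 'http://example.com': ['/site/a.html']}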
The keys of the `dict` returned by `local_urls` are the links found in HTML files, and the values are lists of the paths of the files each link was found in. Furthermore, local links are returned as `Path` objects, whereas external URLs are strings. For example, notice how the link:
<img src="http://fastlinkcheck.com/test.html" />
is resolved to a local path, because the `host` parameter supplied to `local_urls`, `fastlinkcheck.com`, matches the url in the link:
path = Path('./example')
links = local_urls(path, host='fastlinkcheck.com')
links
Link: Path('/Users/hamelsmu/github/fastlinkcheck/example/test.html')
 Locations found:
 - /Users/hamelsmu/github/fastlinkcheck/example/test.html

Link: 'http://www.bing.com'
 Locations found:
 - /Users/hamelsmu/github/fastlinkcheck/example/test.html

Link: 'http://somecdn.com/doesntexist.html'
 Locations found:
 - /Users/hamelsmu/github/fastlinkcheck/example/test.html

Link: Path('/Users/hamelsmu/github/fastlinkcheck/example/test.js')
 Locations found:
 - /Users/hamelsmu/github/fastlinkcheck/example/test.html
def broken_local(links) -> L:
    "List of items in keys of `links` that are `Path`s that do not exist"
    return L(o for o in links if isinstance(o,Path) and not o.exists())
Since `test.js` does not exist in the `example/` directory, `broken_local` returns this path:
broken_local(links)
(#1) [Path('/Users/hamelsmu/github/fastlinkcheck/example/test.js')]
assert not any(x.exists() for x in broken_local(links))
def broken_urls(links):
    "List of items in keys of `links` that are URLs that return a failure status code"
    its = L(links).filter(risinstance(str))
    working_urls = parallel(urlcheck, its, n_workers=32, threadpool=True)
    return L(o for o,p in zip(its,working_urls) if not p)
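`broken_urls` relies on fastcore's `urlcheck`, which returns a truthy value when a url can be fetched successfully and a falsy one otherwise, and on `parallel` to check many urls at once over a thread pool. A quick sketch, using the two external urls from the example page (actual results depend on network access):

urlcheck('http://www.bing.com')                  # expected truthy: the url responds
urlcheck('http://somecdn.com/doesntexist.html')  # expected falsy: the url can't be fetched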
Similarly, the url `http://somecdn.com/doesntexist.html` doesn't exist, which is why it is returned by `broken_urls`:
assert broken_urls(links) == ['http://somecdn.com/doesntexist.html']
@call_parse
def fastlinkcheck(path:Param("Root directory searched recursively for HTML files", str),
                  host:Param("Host and path (without protocol) of web server", str)='',
                  config_file:Param("Location of file with urls to ignore", str)=None):
    "Check for broken links in HTML files under `path`"
    if config_file: assert Path(config_file).is_file(), f"{config_file} is either not a file or doesn't exist."
    ignore = [] if not config_file else [x.strip() for x in Path(config_file).readlines()]
    links = local_urls(path, host=host)
    return LinkMap({k:links[k] for k in (broken_urls(links) + broken_local(links)) if str(k) not in ignore})
fastlinkcheck(path='./example', host='fastlinkcheck.com')
Link: 'http://somecdn.com/doesntexist.html'
 Locations found:
 - /Users/hamelsmu/github/fastlinkcheck/example/test.html

Link: Path('/Users/hamelsmu/github/fastlinkcheck/example/test.js')
 Locations found:
 - /Users/hamelsmu/github/fastlinkcheck/example/test.html
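Because `fastlinkcheck` is decorated with `@call_parse`, the same check can be run from a shell. Assuming the package installs a console script under the same name (the exact script name depends on the project's settings), the invocation mirrors the keyword arguments above:

! fastlinkcheck ./example --host fastlinkcheck.com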
You can choose to ignore links by providing a plain-text file containing a list of urls to ignore. For example, the file `linkcheck.rc` contains a list of urls I want to ignore:
! cat linkcheck.rc
/Users/hamelsmu/github/fastlinkcheck/example/test.js
https://www.google.com
In this case `example/test.js` will be filtered out of the list:
fastlinkcheck(path='./example', host='fastlinkcheck.com', config_file='linkcheck.rc')
Link: 'http://somecdn.com/doesntexist.html'
 Locations found:
 - /Users/hamelsmu/github/fastlinkcheck/example/test.html
with ExceptionExpected(ex=AssertionError, regex="not a file or doesn't exist"):
    fastlinkcheck(path='./example/', config_file='doesnt_exist')