# default_exp core
# export
from fastprogress.fastprogress import progress_bar
from fastcore.all import *
import hashlib,shutil
from pprint import pformat
#hide
from nbdev.showdoc import show_doc
import tempfile,fastdownload
These helper functions provide the functionality that `FastDownload` relies on. Most users should use `FastDownload` rather than calling these helpers directly.
# Demo setup: a scratch directory and a small sample archive to download
dest = Path('tmp')
url = 'https://s3.amazonaws.com/fast-ai-sample/mnist_tiny.tgz'
#hide
# Remove leftovers from earlier runs; display paths relative to the home directory
shutil.rmtree(dest, ignore_errors=True)
Path.BASE_PATH = Path.home()
#export
def download_url(url, dest=None, timeout=None, show_progress=True):
    "Download `url` to `dest` and show progress"
    bar = progress_bar([])
    def _report(count=1, bsize=1, tsize=None):
        # urllib-style reporthook: (blocks transferred, block size, total size)
        bar.total = tsize
        bar.update(count*bsize)
    hook = _report if show_progress else None
    return urlsave(url, dest, reporthook=hook, timeout=timeout)
# Download the sample archive into `dest` and show the resulting path
dest.mkdir(exist_ok=True)
fpath = download_url(url, dest)
fpath
Path('tmp/mnist_tiny.tgz')
# export
def path_stats(fpath):
    "Return `(size, hash)` for `fpath`, where hash is the md5 hexdigest of its first 1MB"
    size = os.path.getsize(fpath)
    # Just use first 1MB of file for performance
    with open(fpath, "rb") as f: hashed = hashlib.md5(f.read(2**20)).hexdigest()
    return size,hashed
path_stats(fpath)
(342207, '56143e8f24db90d925d82a5a74141875')
#export
def checks_module(module):
    "Location of `download_checks.py`"
    # With no module there is no checks file; `{}` is understood by `read_checks` as "no stored checks"
    return Path(module.__file__).parent/'download_checks.py' if module else {}
The `download_checks.py` file containing sizes and hashes will be located next to `module`:
# Checks-file location for the `fastdownload` package itself
mod = checks_module(fastdownload)
mod
Path('git/fastdownload/fastdownload/download_checks.py')
#export
def read_checks(fmod):
    "Evaluated contents of `download_checks.py`"
    # `fmod` may be `{}` when no module was supplied (see `checks_module`)
    if fmod == {} or not fmod.exists():
        return {}
    contents = fmod.read_text()
    if not contents:
        return {}
    # NOTE: `eval` of a local, pformat-generated file; do not point this at untrusted paths
    return eval(contents)
assert read_checks({}) == {}
#export
def check(fmod, url, fpath):
    "Check whether size and hash of `fpath` matches stored data for `url` or data is missing"
    expected = read_checks(fmod).get(url)
    # Nothing stored for this url -- nothing to verify
    if not expected: return True
    return path_stats(fpath)==expected
# export
def update_checks(fpath, url, fmod):
    "Store the hash and size of `fpath` for `url` in `download_checks.py`"
    stored = read_checks(fmod)
    stored[url] = path_stats(fpath)
    # Persist as a pretty-printed dict literal, readable back via `read_checks`
    fmod.write_text(pformat(stored))
# Start from a clean checks file, record the archive's stats, then read them back
if mod.exists(): mod.unlink()
update_checks(fpath, url, mod)
read_checks(mod)
{'https://s3.amazonaws.com/fast-ai-sample/mnist_tiny.tgz': (342207, '56143e8f24db90d925d82a5a74141875')}
#export
def download_and_check(url, fpath, fmod, force):
    "Download `url` to `fpath`, unless exists and `check` fails and not `force`"
    if not force and fpath.exists():
        # Reuse the existing file when it still matches the stored checks
        if check(fmod, url, fpath): return fpath
        print("Downloading a new version of this dataset...")
    res = download_url(url, fpath)
    # Verify the fresh download as well; a mismatch here means corruption or stale checks
    if not check(fmod, url, fpath): raise Exception("Downloaded file is corrupt or not latest version")
    return res
# export
class FastDownload:
    "Download, validate, and extract archives under a config-driven base path"
    def __init__(self, cfg=None, base='~/.fastdownload', archive=None, data=None, module=None):
        root = Path(base).expanduser().absolute()
        defaults = {'data':(data or 'data'), 'archive':(archive or 'archive')}
        if cfg is None: cfg = Config(root, 'config.ini', create=defaults)
        self.cfg = cfg
        self.module = checks_module(module)
        # Explicit `data`/`archive` args override whatever the config currently holds
        if data is not None: self.cfg['data'] = data
        if archive is not None: self.cfg['archive'] = archive

    def arch_path(self):
        "Path to archives"
        return self.cfg.path('archive')

    def data_path(self, extract_key='data', arch=None):
        "Path to extracted data"
        base = self.cfg.path(extract_key)
        if arch is None: return base
        # e.g. `foo.tar.gz` -> stem `foo.tar` -> directory `foo`
        return base/remove_suffix(arch.stem, '.tar')

    def check(self, url, fpath):
        "Check whether size and hash of `fpath` matches stored data for `url` or data is missing"
        expected = read_checks(self.module).get(url)
        return (not expected) or path_stats(fpath)==expected

    def download(self, url, force=False):
        "Download `url` to archive path, unless exists and `self.check` fails and not `force`"
        self.arch_path().mkdir(exist_ok=True, parents=True)
        dest = urldest(url, self.arch_path())
        return download_and_check(url, dest, self.module, force)

    def rm(self, url, rm_arch=True, rm_data=True, extract_key='data'):
        "Delete downloaded archive and extracted data for `url`"
        arch = urldest(url, self.arch_path())
        if rm_arch: arch.delete()
        if rm_data: self.data_path(extract_key, arch).delete()

    def update(self, url):
        "Store the hash and size in `download_checks.py`"
        update_checks(urldest(url, self.arch_path()), url, self.module)

    def extract(self, url, extract_key='data', force=False):
        "Extract archive already downloaded from `url`, overwriting existing if `force`"
        arch = urldest(url, self.arch_path())
        if not arch.exists(): raise Exception(f'{arch} does not exist')
        dest = self.data_path(extract_key)
        dest.mkdir(exist_ok=True, parents=True)
        return untar_dir(arch, dest, rename=True, overwrite=force)

    def get(self, url, extract_key='data', force=False):
        "Download and extract `url`, overwriting existing if `force`"
        if not force:
            # Extracted data already present: skip both download and extraction
            existing = self.data_path(extract_key, urldest(url, self.arch_path()))
            if existing.exists(): return existing
        self.download(url, force=force)
        return self.extract(url, extract_key=extract_key, force=force)
# A `FastDownload` using the checks shipped with the `fastdownload` module
d = FastDownload(module=fastdownload)
d.module
Path('git/fastdownload/fastdownload/download_checks.py')
The `config.ini` file will be created (if it doesn't exist) in `{base}/config.ini`:
# Where the config file lives, and its contents
d.cfg.config_file
Path('.fastdownload/config.ini')
print(d.cfg.config_file.read_text())
[DEFAULT] data = /home/jhoward/.fastdownload/data archive = /home/jhoward/.fastdownload/archive
show_doc(FastDownload.download)
FastDownload.download
[source]
FastDownload.download
(url
,force
=False
)
Download url
to archive path, unless exists and self.check
fails and not force
If there is no stored hash and size for `url`, or the size and hash match the stored checks, then `download` will only download the URL if the destination file does not exist. The destination path will be returned.
# Remove any stored checks, then download the archive
if d.module.exists(): d.module.unlink()
arch = d.download(url)
arch
Path('.fastdownload/archive/mnist_tiny.tgz')
show_doc(FastDownload.update)
# Record checks for the downloaded archive and inspect the stored file
d.update(url)
eval(d.module.read_text())
{'https://s3.amazonaws.com/fast-ai-sample/mnist_tiny.tgz': (342207, '56143e8f24db90d925d82a5a74141875')}
Calling download
will now just return the existing file, since the checks match:
d.download(url)
Path('.fastdownload/archive/mnist_tiny.tgz')
If the checks file doesn't match the size or hash of the archive, then a new copy of the file will be downloaded.
show_doc(FastDownload.extract)
FastDownload.extract
[source]
FastDownload.extract
(url
,extract_key
='data'
,force
=False
)
Extract archive already downloaded from url
, overwriting existing if force
# Force extraction and list the extracted contents
extr = d.extract(url, force=True)
extr
Path('.fastdownload/data/mnist_tiny')
extr.ls()
(#5) [Path('.fastdownload/data/mnist_tiny/models'),Path('.fastdownload/data/mnist_tiny/train'),Path('.fastdownload/data/mnist_tiny/labels.csv'),Path('.fastdownload/data/mnist_tiny/valid'),Path('.fastdownload/data/mnist_tiny/test')]
Pass extract_key
to use a key other than data
from your config file when selecting an archive extraction location:
# Extract to a location taken from a custom config key
d.cfg['model_path'] = 'models'
d.extract(url, extract_key='model_path')
Path('.fastdownload/models/mnist_tiny')
show_doc(FastDownload.rm)
FastDownload.rm
[source]
FastDownload.rm
(url
,rm_arch
=True
,rm_data
=True
,extract_key
='data'
)
Delete downloaded archive and extracted data for url
# Remove both the archive and the extracted data
d.rm(url)
extr.exists(),arch.exists()
(False, False)
show_doc(FastDownload.get)
FastDownload.get
[source]
FastDownload.get
(url
,extract_key
='data'
,force
=False
)
Download and extract url
, overwriting existing if force
# `get` downloads and extracts in one step
res = d.get(url)
res,extr.exists()
(Path('.fastdownload/data/mnist_tiny'), True)
If the archive doesn't exist, but the extracted data does, then the archive is not downloaded again.
# With the extracted data still present, `get` does not re-download the missing archive
d.rm(url, rm_data=False)
res = d.get(url)
res,extr.exists()
(Path('.fastdownload/data/mnist_tiny'), True)
extract_key
works the same way as in FastDownload.extract
:
# `extract_key` selects the extraction location, as in `FastDownload.extract`
res = d.get(url, extract_key='model_path')
res,res.exists()
(Path('.fastdownload/models/mnist_tiny'), True)
#hide
# Export this notebook's `# export` cells to the library
from nbdev.export import notebook2script
notebook2script()
Converted 00_core.ipynb. Converted index.ipynb.