# default_exp core
# export
from fastprogress.fastprogress import progress_bar
from fastcore.all import *
import hashlib,shutil
from pprint import pformat
#hide
from nbdev.showdoc import show_doc
import tempfile,fastdownload
These helper functions provide the functionality that `FastDownload` relies on. Most users should use `FastDownload` rather than calling these helpers directly.
# Demo setup: a scratch directory and a small sample archive to download
dest = Path('tmp')
url = 'https://s3.amazonaws.com/fast-ai-sample/mnist_tiny.tgz'
#hide
# Remove leftovers from earlier runs; display paths relative to the home directory
shutil.rmtree(dest, ignore_errors=True)
Path.BASE_PATH = Path.home()
#export
def download_url(url, dest=None, timeout=None, show_progress=True):
    "Download `url` to `dest` and show progress"
    bar = progress_bar([])
    def _report(count=1, bsize=1, tsize=None):
        # urllib-style reporthook: (blocks transferred, block size, total size)
        bar.total = tsize
        bar.update(count*bsize)
    hook = _report if show_progress else None
    return urlsave(url, dest, reporthook=hook, timeout=timeout)
# Download the sample archive into `dest` and show the resulting path
dest.mkdir(exist_ok=True)
fpath = download_url(url, dest)
fpath
Path('tmp/mnist_tiny.tgz')
# export
def path_stats(fpath):
    "Return `(size, hash)` for `fpath`, where hash is the md5 hexdigest of its first 1MB"
    size = os.path.getsize(fpath)
    # Just use first 1MB of file for performance
    with open(fpath, "rb") as f: hashed = hashlib.md5(f.read(2**20)).hexdigest()
    return size,hashed
path_stats(fpath)
(342207, '56143e8f24db90d925d82a5a74141875')
#export
def checks_module(module):
    "Location of `download_checks.py`"
    # With no module there is no checks file; `{}` is understood by `read_checks` as "no stored checks"
    return Path(module.__file__).parent/'download_checks.py' if module else {}
The `download_checks.py` file containing sizes and hashes will be located next to `module`:
# Checks-file location for the `fastdownload` package itself
mod = checks_module(fastdownload)
mod
Path('git/fastdownload/fastdownload/download_checks.py')
#export
def read_checks(fmod):
    "Evaluated contents of `download_checks.py`"
    # `fmod` may be `{}` when no module was supplied (see `checks_module`)
    if fmod == {} or not fmod.exists():
        return {}
    contents = fmod.read_text()
    if not contents:
        return {}
    # NOTE: `eval` of a local, pformat-generated file; do not point this at untrusted paths
    return eval(contents)
assert read_checks({}) == {}
#export
def check(fmod, url, fpath):
    "Check whether size and hash of `fpath` matches stored data for `url` or data is missing"
    expected = read_checks(fmod).get(url)
    # Nothing stored for this url -- nothing to verify
    if not expected: return True
    return path_stats(fpath)==expected
# export
def update_checks(fpath, url, fmod):
    "Store the hash and size of `fpath` for `url` in `download_checks.py`"
    stored = read_checks(fmod)
    stored[url] = path_stats(fpath)
    # Persist as a pretty-printed dict literal, readable back via `read_checks`
    fmod.write_text(pformat(stored))
# Start from a clean checks file, record the archive's stats, then read them back
if mod.exists(): mod.unlink()
update_checks(fpath, url, mod)
read_checks(mod)
{'https://s3.amazonaws.com/fast-ai-sample/mnist_tiny.tgz': (342207, '56143e8f24db90d925d82a5a74141875')}
#export
def download_and_check(url, fpath, fmod, force):
    "Download `url` to `fpath`, unless exists and `check` fails and not `force`"
    if not force and fpath.exists():
        # Reuse the existing file when it still matches the stored checks
        if check(fmod, url, fpath): return fpath
        print("Downloading a new version of this dataset...")
    res = download_url(url, fpath)
    # Verify the fresh download as well; a mismatch here means corruption or stale checks
    if not check(fmod, url, fpath): raise Exception("Downloaded file is corrupt or not latest version")
    return res
# export
class FastDownload:
    "Download, validate, and extract archives under a config-driven base path"
    def __init__(self, cfg=None, base='~/.fastdownload', archive=None, data=None, module=None):
        root = Path(base).expanduser().absolute()
        defaults = {'data':(data or 'data'), 'archive':(archive or 'archive')}
        if cfg is None: cfg = Config(root, 'config.ini', create=defaults)
        self.cfg = cfg
        self.module = checks_module(module)
        # Explicit `data`/`archive` args override whatever the config currently holds
        if data is not None: self.cfg['data'] = data
        if archive is not None: self.cfg['archive'] = archive

    def arch_path(self):
        "Path to archives"
        return self.cfg.path('archive')

    def data_path(self, extract_key='data', arch=None):
        "Path to extracted data"
        base = self.cfg.path(extract_key)
        if arch is None: return base
        # e.g. `foo.tar.gz` -> stem `foo.tar` -> directory `foo`
        return base/remove_suffix(arch.stem, '.tar')

    def check(self, url, fpath):
        "Check whether size and hash of `fpath` matches stored data for `url` or data is missing"
        expected = read_checks(self.module).get(url)
        return (not expected) or path_stats(fpath)==expected

    def download(self, url, force=False):
        "Download `url` to archive path, unless exists and `self.check` fails and not `force`"
        self.arch_path().mkdir(exist_ok=True, parents=True)
        dest = urldest(url, self.arch_path())
        return download_and_check(url, dest, self.module, force)

    def rm(self, url, rm_arch=True, rm_data=True, extract_key='data'):
        "Delete downloaded archive and extracted data for `url`"
        arch = urldest(url, self.arch_path())
        if rm_arch: arch.delete()
        if rm_data: self.data_path(extract_key, arch).delete()

    def update(self, url):
        "Store the hash and size in `download_checks.py`"
        update_checks(urldest(url, self.arch_path()), url, self.module)

    def extract(self, url, extract_key='data', force=False):
        "Extract archive already downloaded from `url`, overwriting existing if `force`"
        arch = urldest(url, self.arch_path())
        if not arch.exists(): raise Exception(f'{arch} does not exist')
        dest = self.data_path(extract_key)
        dest.mkdir(exist_ok=True, parents=True)
        return untar_dir(arch, dest, rename=True, overwrite=force)

    def get(self, url, extract_key='data', force=False):
        "Download and extract `url`, overwriting existing if `force`"
        if not force:
            # Extracted data already present: skip both download and extraction
            existing = self.data_path(extract_key, urldest(url, self.arch_path()))
            if existing.exists(): return existing
        self.download(url, force=force)
        return self.extract(url, extract_key=extract_key, force=force)
# A `FastDownload` using the checks shipped with the `fastdownload` module
d = FastDownload(module=fastdownload)
d.module
Path('git/fastdownload/fastdownload/download_checks.py')
The `config.ini` file will be created (if it doesn't exist) in `{base}/config.ini`:
# Where the config file lives, and its contents
d.cfg.config_file
Path('.fastdownload/config.ini')
print(d.cfg.config_file.read_text())
[DEFAULT] data = /home/jhoward/.fastdownload/data archive = /home/jhoward/.fastdownload/archive
show_doc(FastDownload.download)
FastDownload.download
[source]
FastDownload.download
(url
,force
=False
)
Download url
to archive path, unless exists and self.check
fails and not force
If there is no stored hash and size for `url`, or the size and hash match the stored checks, then `download` will only download the URL if the destination file does not exist. The destination path will be returned.
# Remove any stored checks, then download the archive
if d.module.exists(): d.module.unlink()
arch = d.download(url)
arch
Path('.fastdownload/archive/mnist_tiny.tgz')
show_doc(FastDownload.update)
# Record checks for the downloaded archive and inspect the stored file
d.update(url)
eval(d.module.read_text())
{'https://s3.amazonaws.com/fast-ai-sample/mnist_tiny.tgz': (342207, '56143e8f24db90d925d82a5a74141875')}
Calling download
will now just return the existing file, since the checks match:
d.download(url)
Path('.fastdownload/archive/mnist_tiny.tgz')
If the checks file doesn't match the size or hash of the archive, then a new copy of the file will be downloaded.
show_doc(FastDownload.extract)
FastDownload.extract
[source]
FastDownload.extract
(url
,extract_key
='data'
,force
=False
)
Extract archive already downloaded from url
, overwriting existing if force
# Force extraction and list the extracted contents
extr = d.extract(url, force=True)
extr
Path('.fastdownload/data/mnist_tiny')
extr.ls()
(#5) [Path('.fastdownload/data/mnist_tiny/models'),Path('.fastdownload/data/mnist_tiny/train'),Path('.fastdownload/data/mnist_tiny/labels.csv'),Path('.fastdownload/data/mnist_tiny/valid'),Path('.fastdownload/data/mnist_tiny/test')]
Pass extract_key
to use a key other than data
from your config file when selecting an archive extraction location:
# Extract to a location taken from a custom config key
d.cfg['model_path'] = 'models'
d.extract(url, extract_key='model_path')
Path('.fastdownload/models/mnist_tiny')
show_doc(FastDownload.rm)
FastDownload.rm
[source]
FastDownload.rm
(url
,rm_arch
=True
,rm_data
=True
,extract_key
='data'
)
Delete downloaded archive and extracted data for url
# Remove both the archive and the extracted data
d.rm(url)
extr.exists(),arch.exists()
(False, False)
show_doc(FastDownload.get)
FastDownload.get
[source]
FastDownload.get
(url
,extract_key
='data'
,force
=False
)
Download and extract url
, overwriting existing if force
# `get` downloads and extracts in one step
res = d.get(url)
res,extr.exists()
(Path('.fastdownload/data/mnist_tiny'), True)
If the archive doesn't exist, but the extracted data does, then the archive is not downloaded again.
# With the extracted data still present, `get` does not re-download the missing archive
d.rm(url, rm_data=False)
res = d.get(url)
res,extr.exists()
(Path('.fastdownload/data/mnist_tiny'), True)
extract_key
works the same way as in FastDownload.extract
:
# `extract_key` selects the extraction location, as in `FastDownload.extract`
res = d.get(url, extract_key='model_path')
res,res.exists()
(Path('.fastdownload/models/mnist_tiny'), True)
#hide
# Export this notebook's `# export` cells to the library
from nbdev.export import notebook2script
notebook2script()
Converted 00_core.ipynb. Converted index.ipynb.