#|default_exp clean
#|export
import ast,warnings,stat
from astunparse import unparse
from textwrap import indent
from execnb.nbio import *
from fastcore.script import *
from fastcore.basics import *
from fastcore.imports import *
from nbdev.imports import *
from nbdev.config import *
from nbdev.sync import *
from nbdev.process import first_code_ln
#|hide
from fastcore.test import *
To avoid pointless conflicts while working with jupyter notebooks (with different execution counts or cell metadata), it is recommended to clean the notebooks before committing anything (done automatically if you install the git hooks with `nbdev_install_hooks`). The following functions are used to do that.
#|export
@call_parse
def nbdev_trust(
    fname:str=None, # A notebook name or glob to trust
    force_all:bool=False # Also trust notebooks that haven't changed
):
    "Trust notebooks matching `fname`"
    # nbformat is an optional dependency here; catch only the import failure
    # (the original bare `except:` also swallowed KeyboardInterrupt etc.)
    try: from nbformat.sign import NotebookNotary
    except ImportError:
        warnings.warn("Please install jupyter and try again")  # `warnings` imported at module top
        return
    fname = Path(fname if fname else get_config().path('nbs_path'))
    path = fname if fname.is_dir() else fname.parent
    # `.last_checked` records when we last ran, so unchanged notebooks can be skipped
    check_fname = path/".last_checked"
    last_checked = os.path.getmtime(check_fname) if check_fname.exists() else None
    nbs = globtastic(fname, file_glob='*.ipynb', skip_folder_re='^[_.]') if fname.is_dir() else [fname]
    notary = NotebookNotary()  # hoisted: one notary for all notebooks
    for fn in nbs:
        # Skip notebooks not modified since the last run, unless forced
        if last_checked and not force_all:
            if os.path.getmtime(fn) < last_checked: continue
        nb = read_nb(fn)
        if not notary.check_signature(nb): notary.sign(nb)
    check_fname.touch(exist_ok=True)
#|export
_repr_id_re = re.compile('(<.*?)( at 0x[0-9a-fA-F]+)(>)')
def _clean_cell_output_id(lines):
sub = partial(_repr_id_re.sub, r'\1\3')
return sub(lines) if isinstance(lines,str) else [sub(o) for o in lines]
#|hide
# Addresses are stripped from every element of a list of lines, whatever the
# surrounding repr text, and from a plain (possibly multi-line) string too.
test_eq(_clean_cell_output_id(['Lambda(func=<function _add2 at 0x7f8252378820>)',
                               '[<PIL.Image.Image image mode=RGB size=320x240 at 0x7FAC4E2CF610>,\n',
                               '(<a at 0x7f8252378820>, <b at 0x7EFE94247550>, <c at 0x7f8252378820>)']),
        ['Lambda(func=<function _add2>)',
         '[<PIL.Image.Image image mode=RGB size=320x240>,\n',
         '(<a>, <b>, <c>)'])
test_eq(_clean_cell_output_id('foo\n<function _add2 at 0x7f8252378820>\nbar'), 'foo\n<function _add2>\nbar')
#|export
def _clean_cell_output(cell, clean_ids):
    "Remove `cell` output execution count and optionally ids from text reprs"
    for out in cell.get('outputs', []):
        if 'execution_count' in out: out['execution_count'] = None
        data = out.get('data', {})
        # Colab injects this key on every save; drop it to avoid diff noise
        data.pop("application/vnd.google.colaboratory.intrinsic+json", None)
        if clean_ids:
            # Scrub ` at 0x...` addresses from all text/* mime outputs and stream text
            for mime in data:
                if mime.startswith('text'): data[mime] = _clean_cell_output_id(data[mime])
            if 'text' in out: out['text'] = _clean_cell_output_id(out['text'])
        out.get('metadata', {}).pop('tags', None)
#|export
def _clean_cell(cell, clear_all, allowed_metadata_keys, clean_ids):
    "Clean `cell` by removing superfluous metadata or everything except the input if `clear_all`"
    if 'execution_count' in cell: cell['execution_count'] = None
    if 'outputs' in cell:
        if clear_all: cell['outputs'] = []
        else: _clean_cell_output(cell, clean_ids)
    # Normalize an "empty" source so identical cells serialize identically
    if cell['source'] == ['']: cell['source'] = []
    if clear_all: cell['metadata'] = {}
    else: cell['metadata'] = {k:v for k,v in cell['metadata'].items() if k in allowed_metadata_keys}
#|export
def clean_nb(
    nb, # The notebook to clean
    clear_all=False, # Remove all cell metadata and cell outputs
    allowed_metadata_keys:list=None, # Preserve the list of keys in the main notebook metadata
    allowed_cell_metadata_keys:list=None, # Preserve the list of keys in cell level metadata
    clean_ids=True, # Remove ids from plaintext reprs?
):
    "Clean `nb` from superfluous metadata"
    # Default keep-lists, extended with any caller-supplied keys
    keep_meta = {"kernelspec", "jekyll", "jupytext", "doc", *(allowed_metadata_keys or [])}
    keep_cell_meta = {"hide_input", *(allowed_cell_metadata_keys or [])}
    for cell in nb['cells']: _clean_cell(cell, clear_all, keep_cell_meta, clean_ids)
    nb['metadata'] = {k:v for k,v in nb['metadata'].items() if k in keep_meta}
The test notebook has metadata in both the main metadata section and contains cell level metadata in the second cell:
# The fixture notebook carries extra keys at both the notebook and cell level,
# only some of which should survive cleaning
test_nb = read_nb('../tests/metadata.ipynb')
assert {'meta', 'jekyll', 'my_extra_key', 'my_removed_key'} <= test_nb.metadata.keys()
assert {'meta', 'hide_input', 'my_extra_cell_key', 'my_removed_cell_key'} == test_nb.cells[1].metadata.keys()
After cleaning the notebook, all extra metadata is removed; only some keys are allowed by default:
# Default cleaning keeps only the default allow-lists at each level
clean_nb(test_nb)
assert {'jekyll', 'kernelspec'} == test_nb.metadata.keys()
assert {'hide_input'} == test_nb.cells[1].metadata.keys()
We can preserve some additional keys at the notebook or cell levels:
# Extra keys passed via `allowed_metadata_keys`/`allowed_cell_metadata_keys` survive
test_nb = read_nb('../tests/metadata.ipynb')
clean_nb(test_nb, allowed_metadata_keys={'my_extra_key'}, allowed_cell_metadata_keys={'my_extra_cell_key'})
assert {'jekyll', 'kernelspec', 'my_extra_key'} == test_nb.metadata.keys()
assert {'hide_input', 'my_extra_cell_key'} == test_nb.cells[1].metadata.keys()
Passing `clear_all=True` removes everything from the cell metadata:
# `clear_all=True` empties cell metadata entirely (notebook-level keep-list still applies)
test_nb = read_nb('../tests/metadata.ipynb')
clean_nb(test_nb, clear_all=True)
assert {'jekyll', 'kernelspec'} == test_nb.metadata.keys()
test_eq(test_nb.cells[1].metadata, {})
Passing `clean_ids=True` removes ids from plaintext repr outputs, to avoid notebooks whose contents change on each run, since they often lead to git merge conflicts. For example:
<PIL.PngImagePlugin.PngImageFile image mode=L size=28x28 at 0x7FB4F8979690>
becomes:
<PIL.PngImagePlugin.PngImageFile image mode=L size=28x28>
#|export
def _reconfigure(*strms):
for s in strms:
if hasattr(s,'reconfigure'): s.reconfigure(encoding='utf-8')
#|export
def process_write(warn_msg, proc_nb, f_in, f_out=None, disp=False):
    "Read a notebook from `f_in`, apply `proc_nb`, and write it to `f_out` (defaults to `f_in`, or stdout if `disp`); warns with `warn_msg` on failure."
    if not f_out: f_out = sys.stdout if disp else f_in
    # Track whether we opened the stream ourselves so we can close it
    # (the original leaked the handle when given a path)
    opened = isinstance(f_in, (str,Path))
    if opened: f_in = Path(f_in).open()
    try:
        _reconfigure(f_in, f_out)
        nb = loads(f_in.read())
        proc_nb(nb)
        write_nb(nb, f_out)
    except Exception as e:
        # Best-effort: report the failure but keep processing other notebooks
        warn(f'{warn_msg}')
        warn(e)
    finally:
        if opened: f_in.close()
#|export
def _nbdev_clean(nb, path=None, **kwargs):
    "Clean `nb` with the allow-lists and id-cleaning flag from the project's settings.ini."
    cfg = get_config(path=path)
    opts = dict(
        clean_ids=str2bool(cfg.get('clean_ids')),
        allowed_metadata_keys=cfg.get("allowed_metadata_keys").split(),
        allowed_cell_metadata_keys=cfg.get("allowed_cell_metadata_keys").split())
    return clean_nb(nb, **opts, **kwargs)
#|export
@call_parse
def nbdev_clean(
    fname:str=None, # A notebook name or glob to clean
    clear_all:bool=False, # Clean all metadata and outputs
    disp:bool=False, # Print the cleaned outputs
    stdin:bool=False # Read notebook from input stream
):
    "Clean all notebooks in `fname` to avoid merge conflicts"
    _write = partial(process_write, warn_msg='Failed to clean notebook',
                     proc_nb=partial(_nbdev_clean, clear_all=clear_all))
    # Git hooks pass the notebook via stdin and read the result from stdout
    if stdin: return _write(f_in=sys.stdin, f_out=sys.stdout)
    if fname is None: fname = get_config().path('nbs_path')
    for nb_path in globtastic(fname, file_glob='*.ipynb', skip_folder_re='^[_.]'):
        _write(f_in=nb_path, disp=disp)
By default (`fname` left to `None`), all the notebooks in the project's `nbs_path` are cleaned. You can opt in to fully clean the notebook by removing every bit of metadata and the cell outputs by passing `clear_all=True`.
If you want to keep some keys in the main notebook metadata you can set allowed_metadata_keys
in settings.ini
.
Similarly, for cell-level metadata use `allowed_cell_metadata_keys`. For example, to preserve both `k1` and `k2` at both the notebook and cell level, add the following in `settings.ini`:
...
allowed_metadata_keys = k1 k2
allowed_cell_metadata_keys = k1 k2
...
#|export
def clean_jupyter(path, model, **kwargs):
    "Clean Jupyter `model` pre save to `path`"
    # Only nbformat-4 notebook saves are handled; everything else passes through
    if not (model['type']=='notebook' and model['content']['nbformat']==4): return
    get_config.cache_clear() # Allow config changes without restarting Jupyter
    hooks = get_config(path=path).jupyter_hooks
    if hooks in {'user','nbdev','none'}:
        # Legacy tri-state setting: map to the new boolean form with a warning
        warn(("`jupyter_hooks` values in `{'user','nbdev','none'}` are deprecated. Use `True` or `False` instead.\n"
              "See the docs for more: https://nbdev.fast.ai/clean.html#clean_jupyter"), DeprecationWarning)
        hooks = hooks != 'none'
    else: hooks = str2bool(hooks)
    if hooks: _nbdev_clean(model['content'], path=path)
This cleans notebooks on-save to avoid unnecessary merge conflicts. The easiest way to install it for both Jupyter Notebook and Lab is by running nbdev_install_hooks
. It works by implementing a pre_save_hook
from Jupyter's file save hook API.
#|export
_pre_save_hook_src = '''
def nbdev_clean_jupyter(**kwargs):
try: from nbdev.clean import clean_jupyter
except ModuleNotFoundError: return
clean_jupyter(**kwargs)
c.ContentsManager.pre_save_hook = nbdev_clean_jupyter'''.strip()
_pre_save_hook_re = re.compile(r'c\.(File)?ContentsManager\.pre_save_hook')
#|export
def _add_jupyter_hooks(src, path):
    "Return `src` (a Jupyter config file's source) with nbdev's pre-save hook appended, or None if it's already installed or a conflicting hook exists."
    if _pre_save_hook_src in src: return  # already installed
    for node in ast.walk(ast.parse(src)):
        if not isinstance(node, ast.Assign): continue
        if not _pre_save_hook_re.match(unparse(only(node.targets))): continue
        # A user-defined pre-save hook exists: explain on stderr rather than overwrite it
        pre = ' '*2
        old = indent(unparse(node), pre)
        new = indent(_pre_save_hook_src, pre)
        sys.stderr.write(f"Can't install hook to '{path}' since it already contains:\n{old}\n"
                         f"Manually update to the following (without indentation) for this functionality:\n\n{new}\n\n")
        return
    body = src.rstrip()
    if body: body += '\n\n'
    return body + _pre_save_hook_src
#|hide
# Returns None if hook is already installed
res = _add_jupyter_hooks(_pre_save_hook_src, 'config.py')
test_is(res, None)
#|hide
# Returns None and warns (on stderr) if pre_save_hook is already set by the user
res = _add_jupyter_hooks("c.ContentsManager.pre_save_hook = my_hook\n", 'config.py')
test_is(res, None)
Can't install hook to 'config.py' since it already contains: c.ContentsManager.pre_save_hook = my_hook Manually update to the following (without indentation) for this functionality: def nbdev_clean_jupyter(**kwargs): try: from nbdev.clean import clean_jupyter except ModuleNotFoundError: return clean_jupyter(**kwargs) c.ContentsManager.pre_save_hook = nbdev_clean_jupyter
#|hide
# Adds the hook after existing, non-conflicting source
show_src(_add_jupyter_hooks('an_existing_line = True\n', 'config.py'))
an_existing_line = True
def nbdev_clean_jupyter(**kwargs):
try: from nbdev.clean import clean_jupyter
except ModuleNotFoundError: return
clean_jupyter(**kwargs)
c.ContentsManager.pre_save_hook = nbdev_clean_jupyter
#|export
def _git_root():
    "Path of the current repo's top-level directory, or None when not inside a git repo."
    try:
        top = run('git rev-parse --show-toplevel')
    except OSError:
        return None
    return Path(top)
#|hide
import tempfile
#|hide
# `_git_root` finds the repo we're developing in...
test_eq(_git_root().name, 'nbdev')
# ...and returns None from a directory outside any git repository
with tempfile.TemporaryDirectory() as d, working_directory(d): test_is(_git_root(), None)
#|export
@call_parse
def nbdev_install_hooks():
    "Install Jupyter and git hooks to automatically clean, trust, and fix merge conflicts in notebooks"
    # Jupyter: append our pre-save hook to both the classic-notebook and the
    # jupyter-server config files (whichever the user's Jupyter reads)
    cfg_path = Path.home()/'.jupyter'
    cfg_path.mkdir(exist_ok=True)
    cfg_fns = [cfg_path/f'jupyter_{o}_config.py' for o in ('notebook','server')]
    for fn in cfg_fns:
        src = fn.read_text() if fn.exists() else ''
        upd = _add_jupyter_hooks(src, fn)
        # None means the hook is already installed, or a conflicting user hook
        # exists (in which case a message was written to stderr)
        if upd is not None: fn.write_text(upd)
    repo_path = _git_root()
    if repo_path is None:
        sys.stderr.write('Not in a git repository, git hooks cannot be installed.\n')
        return
    # Git: re-trust notebooks after every merge via a post-merge hook script
    hook_path = repo_path/'.git'/'hooks'
    fn = hook_path/'post-merge'
    hook_path.mkdir(parents=True, exist_ok=True)
    fn.write_text("#!/bin/bash\nnbdev_trust")
    os.chmod(fn, os.stat(fn).st_mode | stat.S_IEXEC)  # hooks must be executable
    # Git: register `nbdev_merge` as the merge driver for notebooks via an
    # included .gitconfig, so the repo's own config file stays untouched
    cmd = 'git config --local include.path ../.gitconfig'
    (repo_path/'.gitconfig').write_text(f'''# Generated by nbdev_install_hooks
#
# If you need to disable this instrumentation do:
# git config --local --unset include.path
#
# To restore:
# {cmd}
#
[merge "nbdev-merge"]
name = resolve conflicts with nbdev_fix
driver = nbdev_merge %O %A %B %P
''')
    run(cmd)
    # Git: route .ipynb merges through that driver, appending to any existing
    # .gitattributes without duplicating the entry
    attrs_path = repo_path/'.gitattributes'
    nbdev_attr = '*.ipynb merge=nbdev-merge\n'
    try:
        attrs = attrs_path.read_text()
        if nbdev_attr not in attrs:
            if not attrs.endswith('\n'): attrs+='\n'
            attrs_path.write_text(attrs+nbdev_attr)
    except FileNotFoundError: attrs_path.write_text(nbdev_attr)
    print("Hooks are installed.")
See clean_jupyter
and nbdev_merge
for more about how each hook works.
#|hide
# Name of the currently checked-out git branch.
# NOTE(review): "brunch" is a typo for "branch"; kept since the test below calls it by this name
def _git_brunch_current(): return run('git branch --show-current')
#|hide
# Base notebook fixture: two code cells (an import, then a call with a plain-text output)
meta = {'nbformat': 4,'metadata':{'kernelspec':{'display_name':'Python 3','language': 'python','name': 'python3'}}}
base = dict2nb({'cells':[mk_cell('import random'),
                         mk_cell('random.random()')], **meta})
base.cells[-1].output = create_output('0.3314001088639852\n0.20280244713400464', 'plain')
#|hide
from copy import deepcopy
#|hide
# "Ours" side of the merge: edit the first cell, insert a markdown cell, change outputs
ours = deepcopy(base)
ours.cells[0].source+=',os' # Change first cell
ours.cells.insert(1, mk_cell('Calculate a random number:', cell_type='markdown')) # New cell
ours.cells[-1].output = create_output('0.3379097372590093\n0.7379492349993123', 'plain') # Change outputs
#|hide
# "Theirs" side: conflicting edit to the same first cell, plus its own new cell and outputs
thrs = deepcopy(base)
thrs.cells[0].source+=',sys'# Also change first cell
thrs.cells.insert(0, mk_cell('# Random numbers', cell_type='markdown')) # New cell
thrs.cells[-1].output = create_output('0.6587181429602441\n0.5962200692415515', 'plain') # Change outputs
#|hide
import subprocess
#|hide
def _run(cmd, check=True):
proc = subprocess.run(cmd, shell=True, capture_output=True, text=True)
if check and proc.returncode != 0:
msg = f"Command '{cmd}' returned non-zero exit status {proc.returncode}"
if proc.stdout.strip(): msg+=f'\nstdout: {proc.stdout.strip()}'
if proc.stderr.strip(): msg+=f'\nstderr: {proc.stderr.strip()}'
raise RuntimeError(msg)
return proc
#|hide
# End-to-end test of the git hooks: build a throwaway repo, install the hooks,
# make conflicting edits to one notebook on two branches, then merge.
with tempfile.TemporaryDirectory() as d, working_directory(d):
    _run('git init')
    _run("git config user.email 'nbdev@fast.ai'")
    _run("git config user.name 'nbdev'")
    nbs_path = Path('nbs')
    nbs_path.mkdir()
    Config('.', 'settings.ini', create={'nbs_path':nbs_path,'author':'fastai'})
    _run('nbdev_install_hooks')
    fn = 'random.ipynb'
    p = nbs_path/fn
    write_nb(base, p)
    _run(f"git add . && git commit -m 'add {fn}'")
    default = _git_brunch_current()
    feature = 'add-heading'
    # Commit "theirs" on a feature branch, "ours" on the default branch
    _run(f'git checkout -b {feature}')
    write_nb(thrs, p)
    _run("git commit -am 'heading'")
    _run(f'git checkout {default}')
    write_nb(ours, p)
    _run("git commit -am 'docs'")
    # check=False: the merge is *expected* to fail, since both branches changed
    # the same cell's source, which the driver can't auto-resolve
    proc = _run(f'git merge {feature}', check=False)
    if proc.stderr: raise AssertionError(f'Git hook failed with:\n\n{proc.stderr}')
    assert proc.returncode != 0, proc.stdout.strip() # Should error since we can't autofix cell source change
    nb = read_nb(p)
    s = [o.source for o in nb.cells]
    # Conflict markers end up isolated in their own cells around the two versions
    test_eq(s, ['# Random numbers',
                '`<<<<<<< HEAD`',
                'import random,os',
                'Calculate a random number:',
                '`=======`',
                'import random,sys',
                '`>>>>>>> add-heading`',
                'random.random()'])
    # Non-conflicting parts (the outputs) are taken from our side
    test_eq(nb.cells[-1].output, ours.cells[-1].output)
#|hide
# Export this notebook's `#|export` cells to the library module
from nbdev import nbdev_export
nbdev_export()