#|default_exp clean
#|export
import ast,warnings,stat
from astunparse import unparse
from textwrap import indent
from execnb.nbio import *
from fastcore.script import *
from fastcore.basics import *
from fastcore.imports import *
from nbdev.imports import *
from nbdev.config import *
from nbdev.sync import *
from nbdev.process import first_code_ln
#|hide
from fastcore.test import *
To avoid pointless conflicts while working with jupyter notebooks (with different execution counts or cell metadata), it is recommended to clean the notebooks before committing anything (done automatically if you install the git hooks with `nbdev_install_hooks`). The following functions are used to do that.
#|export
@call_parse
def nbdev_trust(
    fname:str=None, # A notebook name or glob to trust
    force_all:bool=False # Also trust notebooks that haven't changed
):
    "Trust notebooks matching `fname`"
    # nbformat is an optional dependency here; catch only the import failure
    # (the original bare `except:` also swallowed KeyboardInterrupt etc.)
    try: from nbformat.sign import NotebookNotary
    except ImportError:
        warnings.warn("Please install jupyter and try again")  # `warnings` imported at module top
        return
    fname = Path(fname if fname else get_config().path('nbs_path'))
    path = fname if fname.is_dir() else fname.parent
    # `.last_checked` records when we last ran, so unchanged notebooks can be skipped
    check_fname = path/".last_checked"
    last_checked = os.path.getmtime(check_fname) if check_fname.exists() else None
    nbs = globtastic(fname, file_glob='*.ipynb', skip_folder_re='^[_.]') if fname.is_dir() else [fname]
    notary = NotebookNotary()  # hoisted: one notary for all notebooks
    for fn in nbs:
        # Skip notebooks not modified since the last run, unless forced
        if last_checked and not force_all:
            if os.path.getmtime(fn) < last_checked: continue
        nb = read_nb(fn)
        if not notary.check_signature(nb): notary.sign(nb)
    check_fname.touch(exist_ok=True)
#|export
_repr_id_re = re.compile('(<.*?)( at 0x[0-9a-fA-F]+)(>)')
def _clean_cell_output_id(lines):
sub = partial(_repr_id_re.sub, r'\1\3')
return sub(lines) if isinstance(lines,str) else [sub(o) for o in lines]
#|hide
# Addresses are stripped from every element of a list of lines, whatever the
# surrounding repr text, and from a plain (possibly multi-line) string too.
test_eq(_clean_cell_output_id(['Lambda(func=<function _add2 at 0x7f8252378820>)',
                               '[<PIL.Image.Image image mode=RGB size=320x240 at 0x7FAC4E2CF610>,\n',
                               '(<a at 0x7f8252378820>, <b at 0x7EFE94247550>, <c at 0x7f8252378820>)']),
        ['Lambda(func=<function _add2>)',
         '[<PIL.Image.Image image mode=RGB size=320x240>,\n',
         '(<a>, <b>, <c>)'])
test_eq(_clean_cell_output_id('foo\n<function _add2 at 0x7f8252378820>\nbar'), 'foo\n<function _add2>\nbar')
#|export
def _clean_cell_output(cell, clean_ids):
    "Remove `cell` output execution count and optionally ids from text reprs"
    for out in cell.get('outputs', []):
        if 'execution_count' in out: out['execution_count'] = None
        data = out.get('data', {})
        # Colab injects this key on every save; drop it to avoid diff noise
        data.pop("application/vnd.google.colaboratory.intrinsic+json", None)
        if clean_ids:
            # Scrub ` at 0x...` addresses from all text/* mime outputs and stream text
            for mime in data:
                if mime.startswith('text'): data[mime] = _clean_cell_output_id(data[mime])
            if 'text' in out: out['text'] = _clean_cell_output_id(out['text'])
        out.get('metadata', {}).pop('tags', None)
#|export
def _clean_cell(cell, clear_all, allowed_metadata_keys, clean_ids):
    "Clean `cell` by removing superfluous metadata or everything except the input if `clear_all`"
    if 'execution_count' in cell: cell['execution_count'] = None
    if 'outputs' in cell:
        if clear_all: cell['outputs'] = []
        else: _clean_cell_output(cell, clean_ids)
    # Normalize an "empty" source so identical cells serialize identically
    if cell['source'] == ['']: cell['source'] = []
    if clear_all: cell['metadata'] = {}
    else: cell['metadata'] = {k:v for k,v in cell['metadata'].items() if k in allowed_metadata_keys}
#|export
def clean_nb(
    nb, # The notebook to clean
    clear_all=False, # Remove all cell metadata and cell outputs
    allowed_metadata_keys:list=None, # Preserve the list of keys in the main notebook metadata
    allowed_cell_metadata_keys:list=None, # Preserve the list of keys in cell level metadata
    clean_ids=True, # Remove ids from plaintext reprs?
):
    "Clean `nb` from superfluous metadata"
    # Default keep-lists, extended with any caller-supplied keys
    keep_meta = {"kernelspec", "jekyll", "jupytext", "doc", *(allowed_metadata_keys or [])}
    keep_cell_meta = {"hide_input", *(allowed_cell_metadata_keys or [])}
    for cell in nb['cells']: _clean_cell(cell, clear_all, keep_cell_meta, clean_ids)
    nb['metadata'] = {k:v for k,v in nb['metadata'].items() if k in keep_meta}
The test notebook has metadata in both the main metadata section and contains cell level metadata in the second cell:
# The fixture notebook carries extra keys at both the notebook and cell level,
# only some of which should survive cleaning
test_nb = read_nb('../tests/metadata.ipynb')
assert {'meta', 'jekyll', 'my_extra_key', 'my_removed_key'} <= test_nb.metadata.keys()
assert {'meta', 'hide_input', 'my_extra_cell_key', 'my_removed_cell_key'} == test_nb.cells[1].metadata.keys()
After cleaning the notebook, all extra metadata is removed; only some keys are allowed by default:
# Default cleaning keeps only the default allow-lists at each level
clean_nb(test_nb)
assert {'jekyll', 'kernelspec'} == test_nb.metadata.keys()
assert {'hide_input'} == test_nb.cells[1].metadata.keys()
We can preserve some additional keys at the notebook or cell levels:
# Extra keys passed via `allowed_metadata_keys`/`allowed_cell_metadata_keys` survive
test_nb = read_nb('../tests/metadata.ipynb')
clean_nb(test_nb, allowed_metadata_keys={'my_extra_key'}, allowed_cell_metadata_keys={'my_extra_cell_key'})
assert {'jekyll', 'kernelspec', 'my_extra_key'} == test_nb.metadata.keys()
assert {'hide_input', 'my_extra_cell_key'} == test_nb.cells[1].metadata.keys()
Passing `clear_all=True` removes everything from the cell metadata:
# `clear_all=True` empties cell metadata entirely (notebook-level keep-list still applies)
test_nb = read_nb('../tests/metadata.ipynb')
clean_nb(test_nb, clear_all=True)
assert {'jekyll', 'kernelspec'} == test_nb.metadata.keys()
test_eq(test_nb.cells[1].metadata, {})
Passing `clean_ids=True` removes ids from plaintext repr outputs, to avoid notebooks whose contents change on each run, since they often lead to git merge conflicts. For example:
<PIL.PngImagePlugin.PngImageFile image mode=L size=28x28 at 0x7FB4F8979690>
becomes:
<PIL.PngImagePlugin.PngImageFile image mode=L size=28x28>
#|export
def _reconfigure(*strms):
for s in strms:
if hasattr(s,'reconfigure'): s.reconfigure(encoding='utf-8')
#|export
def process_write(warn_msg, proc_nb, f_in, f_out=None, disp=False):
    "Read a notebook from `f_in`, apply `proc_nb`, and write it to `f_out` (defaults to `f_in`, or stdout if `disp`); warns with `warn_msg` on failure."
    if not f_out: f_out = sys.stdout if disp else f_in
    # Track whether we opened the stream ourselves so we can close it
    # (the original leaked the handle when given a path)
    opened = isinstance(f_in, (str,Path))
    if opened: f_in = Path(f_in).open()
    try:
        _reconfigure(f_in, f_out)
        nb = loads(f_in.read())
        proc_nb(nb)
        write_nb(nb, f_out)
    except Exception as e:
        # Best-effort: report the failure but keep processing other notebooks
        warn(f'{warn_msg}')
        warn(e)
    finally:
        if opened: f_in.close()
#|export
def _nbdev_clean(nb, path=None, **kwargs):
    "Clean `nb` with the allow-lists and id-cleaning flag from the project's settings.ini."
    cfg = get_config(path=path)
    opts = dict(
        clean_ids=str2bool(cfg.get('clean_ids')),
        allowed_metadata_keys=cfg.get("allowed_metadata_keys").split(),
        allowed_cell_metadata_keys=cfg.get("allowed_cell_metadata_keys").split())
    return clean_nb(nb, **opts, **kwargs)
#|export
@call_parse
def nbdev_clean(
    fname:str=None, # A notebook name or glob to clean
    clear_all:bool=False, # Clean all metadata and outputs
    disp:bool=False, # Print the cleaned outputs
    stdin:bool=False # Read notebook from input stream
):
    "Clean all notebooks in `fname` to avoid merge conflicts"
    _write = partial(process_write, warn_msg='Failed to clean notebook',
                     proc_nb=partial(_nbdev_clean, clear_all=clear_all))
    # Git hooks pass the notebook via stdin and read the result from stdout
    if stdin: return _write(f_in=sys.stdin, f_out=sys.stdout)
    if fname is None: fname = get_config().path('nbs_path')
    for nb_path in globtastic(fname, file_glob='*.ipynb', skip_folder_re='^[_.]'):
        _write(f_in=nb_path, disp=disp)
By default (`fname` left to `None`), all the notebooks in the project's `nbs_path` are cleaned. You can opt in to fully clean the notebook by removing every bit of metadata and the cell outputs by passing `clear_all=True`.
If you want to keep some keys in the main notebook metadata you can set allowed_metadata_keys
in settings.ini
.
Similarly, for cell-level metadata use `allowed_cell_metadata_keys`. For example, to preserve both `k1` and `k2` at both the notebook and cell level, add the following in `settings.ini`:
...
allowed_metadata_keys = k1 k2
allowed_cell_metadata_keys = k1 k2
...
#|export
def clean_jupyter(path, model, **kwargs):
    "Clean Jupyter `model` pre save to `path`"
    # Only nbformat-4 notebook saves are handled; everything else passes through
    if not (model['type']=='notebook' and model['content']['nbformat']==4): return
    get_config.cache_clear() # Allow config changes without restarting Jupyter
    hooks = get_config(path=path).jupyter_hooks
    if hooks in {'user','nbdev','none'}:
        # Legacy tri-state setting: map to the new boolean form with a warning
        warn(("`jupyter_hooks` values in `{'user','nbdev','none'}` are deprecated. Use `True` or `False` instead.\n"
              "See the docs for more: https://nbdev.fast.ai/clean.html#clean_jupyter"), DeprecationWarning)
        hooks = hooks != 'none'
    else: hooks = str2bool(hooks)
    if hooks: _nbdev_clean(model['content'], path=path)
This cleans notebooks on-save to avoid unnecessary merge conflicts. The easiest way to install it for both Jupyter Notebook and Lab is by running nbdev_install_hooks
. It works by implementing a pre_save_hook
from Jupyter's file save hook API.
#|export
_pre_save_hook_src = '''
def nbdev_clean_jupyter(**kwargs):
try: from nbdev.clean import clean_jupyter
except ModuleNotFoundError: return
clean_jupyter(**kwargs)
c.ContentsManager.pre_save_hook = nbdev_clean_jupyter'''.strip()
_pre_save_hook_re = re.compile(r'c\.(File)?ContentsManager\.pre_save_hook')
#|export
def _add_jupyter_hooks(src, path):
    "Return `src` (a Jupyter config file's source) with nbdev's pre-save hook appended, or None if it's already installed or a conflicting hook exists."
    if _pre_save_hook_src in src: return  # already installed
    for node in ast.walk(ast.parse(src)):
        if not isinstance(node, ast.Assign): continue
        if not _pre_save_hook_re.match(unparse(only(node.targets))): continue
        # A user-defined pre-save hook exists: explain on stderr rather than overwrite it
        pre = ' '*2
        old = indent(unparse(node), pre)
        new = indent(_pre_save_hook_src, pre)
        sys.stderr.write(f"Can't install hook to '{path}' since it already contains:\n{old}\n"
                         f"Manually update to the following (without indentation) for this functionality:\n\n{new}\n\n")
        return
    body = src.rstrip()
    if body: body += '\n\n'
    return body + _pre_save_hook_src
#|hide
# Returns None if hook is already installed
res = _add_jupyter_hooks(_pre_save_hook_src, 'config.py')
test_is(res, None)
#|hide
# Returns None and warns (on stderr) if pre_save_hook is already set by the user
res = _add_jupyter_hooks("c.ContentsManager.pre_save_hook = my_hook\n", 'config.py')
test_is(res, None)
Can't install hook to 'config.py' since it already contains: c.ContentsManager.pre_save_hook = my_hook Manually update to the following (without indentation) for this functionality: def nbdev_clean_jupyter(**kwargs): try: from nbdev.clean import clean_jupyter except ModuleNotFoundError: return clean_jupyter(**kwargs) c.ContentsManager.pre_save_hook = nbdev_clean_jupyter
#|hide
# Adds the hook after existing, non-conflicting source
show_src(_add_jupyter_hooks('an_existing_line = True\n', 'config.py'))
an_existing_line = True
def nbdev_clean_jupyter(**kwargs):
try: from nbdev.clean import clean_jupyter
except ModuleNotFoundError: return
clean_jupyter(**kwargs)
c.ContentsManager.pre_save_hook = nbdev_clean_jupyter
#|export
def _git_root():
    "Path of the current repo's top-level directory, or None when not inside a git repo."
    try:
        top = run('git rev-parse --show-toplevel')
    except OSError:
        return None
    return Path(top)
#|hide
import tempfile
#|hide
# `_git_root` finds the repo we're developing in...
test_eq(_git_root().name, 'nbdev')
# ...and returns None from a directory outside any git repository
with tempfile.TemporaryDirectory() as d, working_directory(d): test_is(_git_root(), None)
#|export
@call_parse
def nbdev_install_hooks():
    "Install Jupyter and git hooks to automatically clean, trust, and fix merge conflicts in notebooks"
    # Jupyter: append our pre-save hook to both the classic-notebook and the
    # jupyter-server config files (whichever the user's Jupyter reads)
    cfg_path = Path.home()/'.jupyter'
    cfg_path.mkdir(exist_ok=True)
    cfg_fns = [cfg_path/f'jupyter_{o}_config.py' for o in ('notebook','server')]
    for fn in cfg_fns:
        src = fn.read_text() if fn.exists() else ''
        upd = _add_jupyter_hooks(src, fn)
        # None means the hook is already installed, or a conflicting user hook
        # exists (in which case a message was written to stderr)
        if upd is not None: fn.write_text(upd)
    repo_path = _git_root()
    if repo_path is None:
        sys.stderr.write('Not in a git repository, git hooks cannot be installed.\n')
        return
    # Git: re-trust notebooks after every merge via a post-merge hook script
    hook_path = repo_path/'.git'/'hooks'
    fn = hook_path/'post-merge'
    hook_path.mkdir(parents=True, exist_ok=True)
    fn.write_text("#!/bin/bash\nnbdev_trust")
    os.chmod(fn, os.stat(fn).st_mode | stat.S_IEXEC)  # hooks must be executable
    # Git: register `nbdev_merge` as the merge driver for notebooks via an
    # included .gitconfig, so the repo's own config file stays untouched
    cmd = 'git config --local include.path ../.gitconfig'
    (repo_path/'.gitconfig').write_text(f'''# Generated by nbdev_install_hooks
#
# If you need to disable this instrumentation do:
# git config --local --unset include.path
#
# To restore:
# {cmd}
#
[merge "nbdev-merge"]
name = resolve conflicts with nbdev_fix
driver = nbdev_merge %O %A %B %P
''')
    run(cmd)
    # Git: route .ipynb merges through that driver, appending to any existing
    # .gitattributes without duplicating the entry
    attrs_path = repo_path/'.gitattributes'
    nbdev_attr = '*.ipynb merge=nbdev-merge\n'
    try:
        attrs = attrs_path.read_text()
        if nbdev_attr not in attrs:
            if not attrs.endswith('\n'): attrs+='\n'
            attrs_path.write_text(attrs+nbdev_attr)
    except FileNotFoundError: attrs_path.write_text(nbdev_attr)
    print("Hooks are installed.")
See clean_jupyter
and nbdev_merge
for more about how each hook works.
#|hide
# Name of the currently checked-out git branch.
# NOTE(review): "brunch" is a typo for "branch"; kept since the test below calls it by this name
def _git_brunch_current(): return run('git branch --show-current')
#|hide
# Base notebook fixture: two code cells (an import, then a call with a plain-text output)
meta = {'nbformat': 4,'metadata':{'kernelspec':{'display_name':'Python 3','language': 'python','name': 'python3'}}}
base = dict2nb({'cells':[mk_cell('import random'),
                         mk_cell('random.random()')], **meta})
base.cells[-1].output = create_output('0.3314001088639852\n0.20280244713400464', 'plain')
#|hide
from copy import deepcopy
#|hide
# "Ours" side of the merge: edit the first cell, insert a markdown cell, change outputs
ours = deepcopy(base)
ours.cells[0].source+=',os' # Change first cell
ours.cells.insert(1, mk_cell('Calculate a random number:', cell_type='markdown')) # New cell
ours.cells[-1].output = create_output('0.3379097372590093\n0.7379492349993123', 'plain') # Change outputs
#|hide
# "Theirs" side: conflicting edit to the same first cell, plus its own new cell and outputs
thrs = deepcopy(base)
thrs.cells[0].source+=',sys'# Also change first cell
thrs.cells.insert(0, mk_cell('# Random numbers', cell_type='markdown')) # New cell
thrs.cells[-1].output = create_output('0.6587181429602441\n0.5962200692415515', 'plain') # Change outputs
#|hide
import subprocess
#|hide
def _run(cmd, check=True):
proc = subprocess.run(cmd, shell=True, capture_output=True, text=True)
if check and proc.returncode != 0:
msg = f"Command '{cmd}' returned non-zero exit status {proc.returncode}"
if proc.stdout.strip(): msg+=f'\nstdout: {proc.stdout.strip()}'
if proc.stderr.strip(): msg+=f'\nstderr: {proc.stderr.strip()}'
raise RuntimeError(msg)
return proc
#|hide
# End-to-end test of the git hooks: build a throwaway repo, install the hooks,
# make conflicting edits to one notebook on two branches, then merge.
with tempfile.TemporaryDirectory() as d, working_directory(d):
    _run('git init')
    _run("git config user.email 'nbdev@fast.ai'")
    _run("git config user.name 'nbdev'")
    nbs_path = Path('nbs')
    nbs_path.mkdir()
    Config('.', 'settings.ini', create={'nbs_path':nbs_path,'author':'fastai'})
    _run('nbdev_install_hooks')
    fn = 'random.ipynb'
    p = nbs_path/fn
    write_nb(base, p)
    _run(f"git add . && git commit -m 'add {fn}'")
    default = _git_brunch_current()
    feature = 'add-heading'
    # Commit "theirs" on a feature branch, "ours" on the default branch
    _run(f'git checkout -b {feature}')
    write_nb(thrs, p)
    _run("git commit -am 'heading'")
    _run(f'git checkout {default}')
    write_nb(ours, p)
    _run("git commit -am 'docs'")
    # check=False: the merge is *expected* to fail, since both branches changed
    # the same cell's source, which the driver can't auto-resolve
    proc = _run(f'git merge {feature}', check=False)
    if proc.stderr: raise AssertionError(f'Git hook failed with:\n\n{proc.stderr}')
    assert proc.returncode != 0, proc.stdout.strip() # Should error since we can't autofix cell source change
    nb = read_nb(p)
    s = [o.source for o in nb.cells]
    # Conflict markers end up isolated in their own cells around the two versions
    test_eq(s, ['# Random numbers',
                '`<<<<<<< HEAD`',
                'import random,os',
                'Calculate a random number:',
                '`=======`',
                'import random,sys',
                '`>>>>>>> add-heading`',
                'random.random()'])
    # Non-conflicting parts (the outputs) are taken from our side
    test_eq(nb.cells[-1].output, ours.cells[-1].output)
#|hide
# Export this notebook's `#|export` cells to the library module
from nbdev import nbdev_export
nbdev_export()