#hide
#default_exp clean
from nbdev.showdoc import show_doc
#export
import io,sys,json,glob,re
from fastcore.script import call_parse,Param
from nbdev.imports import Config
from pathlib import Path
#hide
#For tests only
from nbdev.imports import *
Strip notebooks from superfluous metadata
To avoid pointless conflicts while working with jupyter notebooks (with different execution counts or cell metadata), it is recommended to clean the notebooks before committing anything (this is done automatically if you install the git hooks with `nbdev_install_git_hooks`). The following functions are used to do that.
#export
def rm_execution_count(o):
    "Blank out the `execution_count` of `o` (set it to `None`) when that key is present"
    # Nothing to reset: leave `o` untouched rather than adding the key
    if 'execution_count' not in o: return
    o['execution_count'] = None
#export
# Mime type of the colab-specific payload that is stripped from output data
colab_json = "application/vnd.google.colaboratory.intrinsic+json"
def clean_output_data_vnd(o):
    "Drop the `application/vnd.google.colaboratory.intrinsic+json` entry from the `data` of output `o`"
    if 'data' not in o: return
    payload = o['data']
    if colab_json not in payload: return
    # Rebind `o['data']` to a filtered copy; the original mapping object is not mutated
    o['data'] = {mime: content for mime, content in payload.items() if mime != colab_json}
#export
def clean_cell_output(cell):
    "Clean each output of `cell`: reset its execution count, drop colab-only data and remove `tags`"
    if 'outputs' not in cell: return
    for out in cell['outputs']:
        rm_execution_count(out)
        clean_output_data_vnd(out)
        # `tags` is removed from the output's metadata; when the output has no
        # `metadata` entry the pop is applied to the output dict itself
        holder = out.get('metadata', out)
        holder.pop('tags', None)
#export
cell_metadata_keep = ["hide_input"]
nb_metadata_keep = ["kernelspec", "jekyll", "jupytext", "doc"]
#export
def clean_cell(cell, clear_all=False):
    "Strip superfluous metadata from `cell`, or everything but its input when `clear_all` is set"
    rm_execution_count(cell)
    has_outputs = 'outputs' in cell
    # Outputs are either discarded wholesale or cleaned one by one
    if has_outputs and clear_all: cell['outputs'] = []
    elif has_outputs: clean_cell_output(cell)
    # Normalize a source holding a single empty string to an empty source
    if cell['source'] == ['']: cell['source'] = []
    if clear_all: kept = {}
    else: kept = {key: val for key, val in cell['metadata'].items() if key in cell_metadata_keep}
    cell['metadata'] = kept
# A code cell carrying an execution count, extra metadata and a colab-only output payload
tst = {
    'cell_type': 'code',
    'execution_count': 26,
    'metadata': {'hide_input': True, 'meta': 23},
    'outputs': [{
        'execution_count': 2,
        'data': {
            'application/vnd.google.colaboratory.intrinsic+json': {'type': 'string'},
            'plain/text': ['sample output'],
        },
        'output': 'super',
    }],
    'source': 'awesome_code',
}
# Shallow copy: the nested outputs are shared with `tst`, which is fine here
# because the `clear_all` run below replaces outputs and metadata wholesale
tst1 = dict(tst)

# Default cleaning keeps `hide_input` and the non-colab output data
clean_cell(tst)
test_eq(tst, {
    'cell_type': 'code',
    'execution_count': None,
    'metadata': {'hide_input': True},
    'outputs': [{
        'execution_count': None,
        'data': {'plain/text': ['sample output']},
        'output': 'super',
    }],
    'source': 'awesome_code',
})

# `clear_all` leaves only the input of the cell
clean_cell(tst1, clear_all=True)
test_eq(tst1, {
    'cell_type': 'code',
    'execution_count': None,
    'metadata': {},
    'outputs': [],
    'source': 'awesome_code',
})

# `tags` are removed from both cell and output metadata, and [''] sources are emptied
tst2 = {
    'metadata': {'tags': []},
    'outputs': [{'metadata': {'tags': []}}],
    "source": [""],
}
clean_cell(tst2, clear_all=False)
test_eq(tst2, {
    'metadata': {},
    'outputs': [{'metadata': {}}],
    'source': [],
})
#export
def clean_nb(nb, clear_all=False):
    "Strip superfluous metadata from every cell of `nb` (forwarding `clear_all` to `clean_cell`) and from `nb` itself"
    for cell in nb['cells']:
        clean_cell(cell, clear_all=clear_all)
    # Only the notebook-level metadata keys listed in `nb_metadata_keep` survive
    nb_meta = nb['metadata']
    nb['metadata'] = {key: nb_meta[key] for key in nb_meta if key in nb_metadata_keep}
# Same fixture cell as for `clean_cell`, this time wrapped in a minimal notebook
tst = {
    'cell_type': 'code',
    'execution_count': 26,
    'metadata': {'hide_input': True, 'meta': 23},
    'outputs': [{
        'execution_count': 2,
        'data': {
            'application/vnd.google.colaboratory.intrinsic+json': {'type': 'string'},
            'plain/text': ['sample output'],
        },
        'output': 'super',
    }],
    'source': 'awesome_code',
}
nb = {
    'metadata': {'kernelspec': 'some_spec', 'jekyll': 'some_meta', 'meta': 37},
    'cells': [tst],
}

clean_nb(nb)

# The cell is cleaned exactly as `clean_cell` would clean it...
test_eq(nb['cells'][0], {
    'cell_type': 'code',
    'execution_count': None,
    'metadata': {'hide_input': True},
    'outputs': [{
        'execution_count': None,
        'data': {'plain/text': ['sample output']},
        'output': 'super',
    }],
    'source': 'awesome_code',
})
# ...and the unknown notebook-level `meta` key is dropped
test_eq(nb['metadata'], {'kernelspec': 'some_spec', 'jekyll': 'some_meta'})
#export
def _print_output(nb):
"Print `nb` in stdout for git things"
_output_stream = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
x = json.dumps(nb, sort_keys=True, indent=1, ensure_ascii=False)
_output_stream.write(x)
_output_stream.write("\n")
_output_stream.flush()
#export
BSLASH = '\x5c'
# Regex source for ONE literal backslash. The pattern is assembled from `BSLASH`
# (as the original was), presumably so that this source never spells out the very
# escape sequence that `clean_cr` rewrites.
_ESC_BSLASH = BSLASH*2
# An escaped carriage return (backslash + "r", optionally followed by backslash + "n")
# that is NOT itself preceded by an escaping backslash: the lookbehind anchors the
# match at the start of a backslash run and group 1 consumes complete escaped
# backslash pairs, so the backslash before "r" is the odd one of the run.
_cr_re = re.compile(f'(?<!{_ESC_BSLASH})((?:{_ESC_BSLASH}{_ESC_BSLASH})*){_ESC_BSLASH}r(?:{_ESC_BSLASH}n)?')
def clean_cr(s):
    "Replace escaped CR / CRLF sequences in the text `s` by an escaped LF, leaving escaped backslashes followed by `r` alone"
    # Group 1 (any escaped backslashes preceding the CR) is put back unchanged
    return _cr_re.sub(r'\1\\n', s)
assert clean_cr(fr'a{BSLASH}r\nb{BSLASH}rc\n') == fr'a\nb\nc\n'
# An escaped backslash followed by "r" (e.g. LaTeX commands in JSON text) must survive
assert clean_cr(BSLASH*2 + 'right') == BSLASH*2 + 'right'
#export
@call_parse
def nbdev_clean_nbs(fname:Param("A notebook name or glob to convert", str)=None,
                    clear_all:Param("Clean all metadata and outputs", bool)=False,
                    disp:Param("Print the cleaned outputs", bool)=False,
                    read_input_stream:Param("Read input stram and not nb folder")=False):
    "Clean all notebooks in `fname` to avoid merge conflicts"
    #Git hooks will pass the notebooks in the stdin
    if read_input_stream and sys.stdin:
        input_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
        nb = json.load(input_stream)
        clean_nb(nb, clear_all=clear_all)
        _print_output(nb)
        return
    if fname is None:
        # No name given: clean every notebook under the configured `nbs_path`,
        # falling back to the current directory when the config cannot be used
        try: path = Config().path("nbs_path")
        except Exception: path = Path.cwd()
        files = path.glob('**/*.ipynb')
    else: files = glob.glob(fname)
    for f in files:
        # A glob passed by the user may match files that are not notebooks
        if not str(f).endswith('.ipynb'): continue
        # Read through a context manager so the handle is closed before the file is rewritten
        with io.open(f, 'r', encoding='utf-8') as src: nb_s = src.read()
        nb = json.loads(clean_cr(nb_s))
        clean_nb(nb, clear_all=clear_all)
        if disp: _print_output(nb)
        else:
            x = json.dumps(nb, sort_keys=True, indent=1, ensure_ascii=False)
            # Separate name for the write handle: the loop variable `f` is no longer shadowed
            with io.open(f, 'w', encoding='utf-8') as dst:
                dst.write(x)
                dst.write("\n")
By default (`fname` left to `None`), all the notebooks in `nbs_path` are cleaned (or those under the current working directory if the nbdev config cannot be read). You can opt in to fully clean the notebooks, removing every bit of metadata and the cell outputs, by passing `clear_all=True`. `disp` is only meant for internal use with git hooks and will print the cleaned notebook instead of saving it. The same goes for `read_input_stream`, which reads the notebook from the input stream instead of from the file names.
#hide
from nbdev.export import notebook2script
notebook2script()
Converted 00_export.ipynb. Converted 01_sync.ipynb. Converted 02_showdoc.ipynb. Converted 03_export2html.ipynb. Converted 04_test.ipynb. Converted 05_merge.ipynb. Converted 06_cli.ipynb. Converted 07_clean.ipynb. Converted 99_search.ipynb. Converted example.ipynb. Converted index.ipynb. Converted tutorial.ipynb.