In [ ]:

#default_exp asciidoc

fastdoc.asciidoc¶

API for the fastdoc converter

In [ ]:

#export
from fastdoc.imports import *
from fastcore.script import *
from warnings import warn

In [ ]:

#export
def markdown_cell(md):
    "Build a markdown notebook cell whose source is `md` and whose metadata is empty"
    fields = dict(cell_type='markdown', source=md, metadata={})
    return nbformat.notebooknode.NotebookNode(fields)

In [ ]:

#export
def code_cell(code, metadata=None, outputs=None):
    "Build a code notebook cell with source `code`; `metadata` and `outputs` default to fresh empty containers"
    if metadata is None: metadata = {}
    if outputs is None: outputs = []
    fields = dict(cell_type='code', execution_count=None, source=code,
                  metadata=metadata, outputs=outputs)
    return nbformat.notebooknode.NotebookNode(fields)

Preprocessing¶

Preprocessing on the list of all cells¶

Removing cells with the flag # hide

In [ ]:

#export
# Matches a line holding only a `#hide` or `#clean` flag (optional whitespace around the `#` and the word)
_re_hidden = re.compile(r'^\s*#\s*(hide|clean)\s*$', re.MULTILINE)

In [ ]:

#export
def remove_hidden_cells(cells):
    "Return the cells whose source does not match `_re_hidden` (i.e. drop cells flagged #hide or #clean)"
    kept = []
    for cell in cells:
        # A match object is always truthy, so this skips exactly the flagged cells
        if _re_hidden.search(cell['source']): continue
        kept.append(cell)
    return kept

In [ ]:

cells = [code_cell('# hide'), code_cell('lalala'), markdown_cell('lalala\n# hide')]
test_eq(remove_hidden_cells(cells), [code_cell('lalala')])

Isolate the blocks fenced with triple backticks and annotated with asciidoc that appear in markdown cells: each one is moved to its own code cell without outputs so that it is not interpreted by the converter, and is prefixed with ##clear## so that the post-processing removes the [python] flag.

In [ ]:

#export
def isolate_adoc_blocks(cells):
    "Split markdown cells around their ```asciidoc fenced blocks; each block becomes a `##clear##` code cell"
    out = []
    for cell in cells:
        # Anything that is not a markdown cell containing an asciidoc fence passes through untouched
        if cell['cell_type'] != 'markdown' or re.search(r'```\s*asciidoc', cell['source']) is None:
            out.append(cell)
            continue
        rows = cell['source'].split('\n')
        inside, start = False, 0  # `start` is the first row of the piece currently being collected
        for pos, row in enumerate(rows):
            if not inside and re.search(r'^```\s*asciidoc\s*$', row) is not None:
                # Opening fence: flush the markdown collected so far
                out.append(markdown_cell('\n'.join(rows[start:pos])))
                inside, start = True, pos + 1
            elif inside and re.search(r'^```\s*$', row) is not None:
                # Closing fence: flush the asciidoc body as a code cell tagged for clearing
                out.append(code_cell('##clear##' + '\n'.join(rows[start:pos])))
                inside, start = False, pos + 1
        assert not inside, f"Triple-quote asciidoc block not ended in {cell['source']}"
        # Whatever follows the last fence (possibly empty) is markdown again
        out.append(markdown_cell('\n'.join(rows[start:])))
    return out

In [ ]:

test = """This is some text
```asciidoc
This should be isolated
```
Some other text
```asciidoc
This should also be isolated
```
end"""
test_eq(isolate_adoc_blocks([markdown_cell(test)]), [
    markdown_cell("This is some text"),
    code_cell("##clear##This should be isolated"),
    markdown_cell("Some other text"),
    code_cell("##clear##This should also be isolated"),
    markdown_cell("end")
])

Preprocessing individual code cells¶

Old way of putting [WARNING], [NOTE] or [TIP]

In [ ]:

#export
#TODO: remove when all notebooks have been ported to v2
def replace_old_jekylls(cell):
    "Turn an old-style `jekyll_warn/note/important(...)` code cell into a `##clear##` asciidoc admonition"
    src = cell['source']
    if not src.startswith('jekyll'): return cell
    # First pattern extracts the call suffix (warn/note/important), second the quoted message
    kind = re.match(r"""jekyll_(.*)\(['"].*""", src).group(1)
    message = re.match(r"""jekyll_.*\(['"]+([\s\S]*[^'"])['"]+\)$""", src).group(1)
    labels = {'warn':'WARNING', 'note':'NOTE', 'important':'TIP'}
    cell['metadata'] = {}
    cell['source'] = f'##clear##[{labels[kind]}]\n====\n{message}\n===='
    cell['outputs'] = []
    return cell

In [ ]:

test_eq(replace_old_jekylls(code_cell('jekyll_warn("""Try to convert me!""")')), 
        code_cell('##clear##[WARNING]\n====\nTry to convert me!\n===='))

Hide input of cells with hide_input=True in metadata (extension hide input) or a flag #hide_input. Put ##remove## instead of the code that will be removed during post-processing

In [ ]:

#export
# Matches a line holding only a `#hide_input` flag (optional whitespace around the `#` and the word)
_re_hide_input = re.compile(r'^\s*#\s*hide_input\s*$', re.MULTILINE)

In [ ]:

#export
def hide_input(cell):
    "Replace the source by `##remove##` when the cell has `hide_input` in its metadata or a `#hide_input` flag"
    flagged = cell['metadata'].get('hide_input', False) or _re_hide_input.search(cell["source"]) is not None
    if flagged:
        # The placeholder is stripped from the generated asciidoc by `remove_cells`
        cell['source'] = '##remove##'
    return cell

In [ ]:

test_eq(hide_input(code_cell('some code', metadata={'hide_input': True}, outputs=[1])), 
        code_cell('##remove##', metadata={'hide_input': True}, outputs=[1]))
test_eq(hide_input(code_cell('# hide_input\nsome code', outputs=[1])), 
        code_cell('##remove##', outputs=[1]))

Hide outputs of cells with collapsed=True in their metadata or a flag #hide_output

In [ ]:

#export
# Matches a line holding only a `#hide_output` flag (optional whitespace around the `#` and the word)
_re_hide_output = re.compile(r'^\s*#\s*hide_output\s*$', re.MULTILINE)

In [ ]:

#export
def hide_output(cell):
    "Drop the outputs of cells with `collapsed` in their metadata or a `#hide_output` flag, and strip the flag"
    if cell['metadata'].get('collapsed', False) or _re_hide_output.search(cell["source"]) is not None:
        cell['outputs'] = []
        # `(\n|$)` also strips a flag sitting on the very last line, where no newline follows it
        cell['source'] = re.sub(r'#\s*hide_output\s*(\n|$)', '', cell['source'])
    return cell

In [ ]:

test_eq(hide_output(code_cell('some code', metadata={'collapsed': True}, outputs=[1])), 
        code_cell('some code', metadata={'collapsed': True}))
test_eq(hide_output(code_cell('# hide_output\nsome code', outputs=[1])), 
        code_cell('some code'))

Replace outputs as text_html by text_plain (otherwise they are not kept)

In [ ]:

#export
def extract_html(cell):
    "Move every `text/html` output payload under the `text/plain` key"
    for out in cell['outputs']:
        if 'data' not in out: continue
        bundle = out['data']
        if 'text/html' in bundle:
            # pop = read then delete, so the html payload replaces any existing plain text
            bundle['text/plain'] = bundle.pop('text/html')
    return cell

In [ ]:

test_eq(extract_html(code_cell('some code', outputs=[{'data': {'text/html': 'some_html'}}])),
        code_cell('some code', outputs=[{'data': {'text/plain': 'some_html'}}]))

Deal with errors by putting them in plain text

In [ ]:

#export
def split_max_len(text, l):
    """Wrap `text` on spaces so that lines stay within `l` characters where possible.

    Every word is written with a leading space (so each returned line starts with
    a space), which is the format `deal_error` relies on. A single word longer
    than `l` is kept on a line of its own rather than being split.
    """
    line,lines = "",[]
    for word in text.split(' '):
        if len(line) + len(word) + 1 <= l: line += f' {word}'
        else:
            # Flush the current line (if any) and start the next one with the
            # word that did not fit, instead of discarding that word.
            if line: lines.append(line)
            line = f' {word}'
    if line: lines.append(line)
    return "\n".join(lines)

In [ ]:

#export
def deal_error(cell):
    "Replace each error output by an `execute_result` holding `ename: evalue` as wrapped plain text"
    outputs = cell['outputs']
    for pos, out in enumerate(outputs):
        if out['output_type'] != 'error': continue
        summary = f"{out['ename']}: {out['evalue']}"
        replacement = dict(data={'text/plain': split_max_len(summary, 81) },
                           execution_count=None,
                           metadata={},
                           output_type='execute_result')
        outputs[pos] = nbformat.notebooknode.NotebookNode(replacement)
    return cell

In [ ]:

test_eq(deal_error(code_cell('some code', outputs=[{'output_type': 'error', 'ename': 'Error name', 'evalue': 'This is an error.'}])), 
        code_cell('some code', outputs = [
            {'data': {'text/plain': ' Error name: This is an error.'},
             'execution_count': None,
             'metadata': {},
             'output_type': 'execute_result'}
        ]))

Remove interrupted progress bars from the outputs

In [ ]:

#export
def remove_interrupted_pbars(cell):
    "Drop the outputs whose plain text contains `progress-bar-interrupted`"
    def _is_interrupted(out):
        # Only outputs that carry a text/plain payload can be an interrupted progress bar
        if 'data' not in out or 'text/plain' not in out['data']: return False
        return 'progress-bar-interrupted' in out['data']['text/plain']
    cell['outputs'] = [out for out in cell['outputs'] if not _is_interrupted(out)]
    return cell

In [ ]:

test_eq(remove_interrupted_pbars(
    code_cell("some code", outputs = [{'a': 1}, {'data': {'text/plain': 'progress-bar-interrupted'}}, {'b': 2}])),
        code_cell("some code", outputs = [{'a': 1}, {'b': 2}]))

Get metadata for outputs.

In [ ]:

#export
def get_cell_meta(cell):
    "Copy `#id`, `#caption`, `#alt` and `#width` flag values into the cell metadata and strip the flags from the source"
    for key in ("id", "caption", "alt", "width"):
        found = re.search(r'^\s*#\s*' + key + r'\s(.*)$', cell["source"], re.MULTILINE)
        if found is None: continue
        cell["metadata"][key] = found.group(1)
        # Remove the flag text together with the end of its line
        cell["source"] = re.sub(r'#\s*' + key + r'\s.*?($|\n)', '', cell["source"])
    return cell

In [ ]:

test_eq(get_cell_meta(code_cell("#id 123\n#caption This is a bear\nsome code")), 
        code_cell("some code", metadata = {'id': '123', 'caption': 'This is a bear'}))

Deal with table captions and refs

In [ ]:

#export
def caption_tables(cell):
    """Add the `id` and `caption` from the cell metadata to a table in the first output.

    Only acts when the first output has a `text/plain` payload starting with a
    `<table ...>` tag: the id is inserted as the first attribute and a
    `<caption>` element is added right after the opening tag.
    """
    if not cell.get('outputs'): return cell
    output = cell['outputs'][0]
    if 'data' not in output or 'text/plain' not in output['data']: return cell
    text = output['data']['text/plain']
    if re.search(r'^<\s*table\s+([^>]*>)', text) is None: return cell
    table_id = cell['metadata'].get('id', None)
    caption = cell['metadata'].get('caption', None)
    text_id = '' if table_id is None else f'id="{table_id}" '
    text_caption = '' if caption is None else f'\n  <caption>{caption}</caption>'
    # Use a function as replacement: id/caption are user text and must not be
    # parsed as a regex template (a backslash in a caption would be read as an escape).
    def _rep(m): return '<table ' + text_id + m.group(1) + text_caption
    output['data']['text/plain'] = re.sub(r'^<\s*table\s+([^>]*>)', _rep, text)
    return cell

In [ ]:

cell = code_cell("some code", 
    metadata={'id': '123', 'caption': 'a caption'},
    outputs=[{'data': {'text/plain': '<table border="1">\nTable code'}}])
cell2 = code_cell("some code", 
    metadata={'id': '123', 'caption': 'a caption'},
    outputs=[{'data': {'text/plain': '<table id="123" border="1">\n  <caption>a caption</caption>\nTable code'}}])
test_eq(caption_tables(cell), cell2)

In [ ]:

cell = code_cell("#hide_input\n#id 123\n#caption a caption", 
    metadata={},
    outputs=[{'data': {'text/plain': '<table border="1">\nTable code'}, 'output_type':''}])

Wrap text in outputs

In [ ]:

#export
# Width passed to `textwrap.wrap` by `_wrap_output` when wrapping text outputs
TEXT_MAX_WIDTH = 80

In [ ]:

#export
def _wrap_output(output):
    "Wrap the `text` (or, failing that, the non-table `text/plain`) payload of `output` to `TEXT_MAX_WIDTH`"
    def _wrap(txt):
        # Each original line is wrapped on its own; continuation lines are prefixed with ' > '
        wrapped = ['\n'.join(textwrap.wrap(row, width=TEXT_MAX_WIDTH, subsequent_indent = ' > '))
                   for row in txt.split('\n')]
        return '\n'.join(wrapped)

    if 'text' in output:
        output['text'] = _wrap(output['text'])
        return output
    if 'data' not in output or 'text/plain' not in output['data']: return output
    plain = output['data']['text/plain']
    # Outputs that start with a table tag are left untouched
    if re.search(r'^<\s*table\s*([^>]*>)', plain) is not None: return output
    output['data']['text/plain'] = _wrap(plain)
    return output

In [ ]:

#export
def wrap_text_outputs(cell):
    "Apply `_wrap_output` to every output of `cell`, if it has any"
    if 'outputs' in cell and len(cell['outputs']) > 0:
        cell['outputs'] = list(map(_wrap_output, cell['outputs']))
    return cell

In [ ]:

cell = code_cell("some code", 
    metadata={},
    outputs=[{'data': {'text/plain': 'This is a long output'*5}, 'output_type':''},
             {'text': 'This is a long output'*5}])
wrapped = 'This is a long outputThis is a long outputThis is a long outputThis is a long\n > outputThis is a long output'
test_eq(wrap_text_outputs(cell), code_cell("some code", 
    metadata={},
    outputs=[{'data': {'text/plain': wrapped}, 'output_type':''},
             {'text': wrapped}]))

Test code length

In [ ]:

#export
# Line length above which `check_code_len` emits a warning
CODE_MAX_LEN = 80

In [ ]:

#export
def check_code_len(cell):
    "Warn (once per offending line) when a source line is longer than `CODE_MAX_LEN`; the cell is returned unchanged"
    for row in cell['source'].split('\n'):
        if len(row) <= CODE_MAX_LEN: continue
        warn(f"Found code too long in a cell:\n{cell['source']}")
    return cell

Preprocessing individual markdown cells¶

Replace "` `" by ``

In [ ]:

#export
def deal_quotes(cell):
    "Unwrap inline code written as \"`code`\" and replace single quotes by the `xxsinglequote` token"
    unquoted = re.sub(r'"`([^`]*)`"', r'`\1`', cell['source'])
    # Single quotes are restored from this token by `rep_spec_tok` during post-processing
    cell['source'] = unquoted.replace("'", 'xxsinglequote')
    return cell

In [ ]:

test_eq(deal_quotes(markdown_cell('"`code`"')), markdown_cell('`code`'))
test_eq(deal_quotes(markdown_cell('a"b"c')), markdown_cell('a"b"c'))
test_eq(deal_quotes(markdown_cell("a'b'c")), markdown_cell('axxsinglequotebxxsinglequotec'))

Add one title level to every Markdown cell

In [ ]:

#export
def add_title_level(cell):
    "Prepend one `#` to a cell whose source starts with a header marker"
    src = cell['source']
    if src[:1] == '#':
        cell['source'] = f'#{src}'
    return cell

In [ ]:

test_eq(add_title_level(markdown_cell('# title')), markdown_cell('## title'))

Remove digits from numbered lists and format labeled lists

In [ ]:

#export
def deal_with_lists(cell):
    "Strip the digits of numbered list items and reformat `- label:: text` items, marking line breaks with tokens"
    def _convert(row):
        # Numbered item: keep the indentation and the dot, drop the digits, flag the end of line
        row = re.sub(r'(^\s*)\d*\.(.*)$', r'\1.\2xxnewl', row)
        # Labeled item: `label::` followed by its text on a token-marked new line
        return re.sub(r'(^\s*)-\s(.*::)\s(.*)$', r'\2xxnewls\3xxnewl', row)
    cell['source'] = '\n'.join(_convert(row) for row in cell['source'].split('\n'))
    return cell

In [ ]:

test_eq(deal_with_lists(markdown_cell("  1. Item\n  2. Item")),
        markdown_cell("  . Itemxxnewl\n  . Itemxxnewl"))
test_eq(deal_with_lists(markdown_cell("- lbl1:: item1\n- lbl2:: item2")),
        markdown_cell("lbl1::xxnewlsitem1xxnewl\nlbl2::xxnewlsitem2xxnewl"))

Catch block quotes and put them in asciidoc blocks

In [ ]:

#export
_re_block_notes = re.compile(r"""
# Catches any pattern > Title: content with title in group 1 and content in group 2
^\s*>\s*     # > followed by any number of whitespace
([^:]*)      # Catching group for any character but :
:\s*         # : then any number of whitespace
([^\n]*)     # Catching group for anything but a new line character
(?:\n|$)     # Non-catching group for either a new line or the end of the text
""", re.VERBOSE | re.MULTILINE)

_re_forgot_column = re.compile("^\s*>[^:]*$", re.MULTILINE)

Catch Markdown URLs of the form

[link](https://github.com/fastai)

inside asciidoc blocks. Asciidoc expects URLs to be in the following format:

[BLOCK_NAME]
====
This is a block with some https://github.com/fastai[link]
====

In [ ]:

#export
_re_urls = re.compile("\[(.*?)\]\((.*?)\)")

In [ ]:

#export
def replace_jekylls(cell):
    "Convert `> type: text` block quotes into fenced asciidoc admonition/quote blocks"
    aliases = {'warning':'WARNING', 'note':'NOTE', 'important':'TIP', 'tip': 'TIP', 'stop': 'WARNING',
               'jargon':'JARGON', 'question':'QUESTION', 'a': 'ALEXIS', 'j': 'JEREMY', 's': 'SYLVAIN'}
    people = ('ALEXIS', 'JEREMY', 'SYLVAIN')

    def _to_block(match):
        kind, body = match.groups()
        # Markdown links become asciidoc `url[text]` links
        body = _re_urls.sub(r"\2[\1]", body)
        label = aliases.get(kind.lower(), kind.upper())
        if label in people or label in ('JARGON', 'QUESTION'):
            # Titled blocks: the label (capitalized) is the title, rendered as NOTE or TIP
            heading, admonition = label[0] + label[1:].lower(), 'NOTE'
            if label == 'JARGON':
                # `> jargon: term: definition` -> the term goes in the title
                term, *rest = body.split(': ')
                heading = f'{heading}: {term}'
                body = _re_urls.sub(r"\2[\1]", ': '.join(rest))
            if label in people:
                heading, admonition = f"{heading} says", 'TIP'
            return f'```asciidoc\n.{heading}\n[{admonition}]\n====\n{body}\n====\n```\n'
        if label:
            return f"```asciidoc\n[{label}]\n====\n{body}\n====\n```\n"
        # Empty type (`> : text`) is a plain quote block
        return f"```asciidoc\n____\n{body}\n____\n```\n"

    if _re_forgot_column.search(cell["source"]): warn("Found a non-processed block quote, please fix")
    cell["source"] = _re_block_notes.sub(_to_block, cell["source"])
    return cell

In [ ]:

test_eq(replace_jekylls(markdown_cell("text\n> : This is a block quote")),
    markdown_cell("text\n```asciidoc\n____\nThis is a block quote\n____\n```\n"))
test_eq(replace_jekylls(markdown_cell("text\n> : This is a block quote with a [link](https://github.com/fastai)")),
    markdown_cell("text\n```asciidoc\n____\nThis is a block quote with a https://github.com/fastai[link]\n____\n```\n"))
test_eq(replace_jekylls(markdown_cell("text\n> jargon: term: Some new term")),
    markdown_cell('text\n```asciidoc\n.Jargon: term\n[NOTE]\n====\nSome new term\n====\n```\n'))
test_eq(replace_jekylls(markdown_cell("text\n> jargon: term: Some new term with a [link](https://github.com/fastai)")),
    markdown_cell('text\n```asciidoc\n.Jargon: term\n[NOTE]\n====\nSome new term with a https://github.com/fastai[link]\n====\n```\n'))
test_warns(lambda: replace_jekylls(markdown_cell("text\n> This is a block quote")))

In [ ]:

#export
# Matches a `#sidebar <title>` line, with the title in group 1
_re_sidebar = re.compile(r'^\s*#\s*sidebar\s(.*)$', re.MULTILINE)

In [ ]:

#export
def interpret_sidebar(cell):
    "Turn a cell whose first line is `#sidebar <title>` into a fenced asciidoc sidebar block"
    first_line, *rest = cell["source"].split("\n")
    found = _re_sidebar.search(first_line)
    if found is None: return cell
    title = found.group(1)
    body = "\n".join(rest)
    cell["source"] = f"```asciidoc\n.{title}\n****\n{body}\n****\n```\n"
    return cell

In [ ]:

test = """#sidebar My intervention

This will be changed to a sidebar when converted in Asciidoc.

It can have several lines, contrary to a block quote."""
interpret_sidebar(markdown_cell(test))

Out[ ]:

{'cell_type': 'markdown',
 'source': '```asciidoc\n.My intervention\n****\n\nThis will be changed to a sidebar when converted in Asciidoc.\n\nIt can have several lines, contrary to a block quote.\n****\n```\n',
 'metadata': {}}

In [ ]:

#export
# Matches an `<img ...>` tag placed at the start of a line; the whole tag is in group 1
_re_md_image = re.compile(r"^(<img\ [^>]*>)", re.MULTILINE)

In [ ]:

#export
# Factor applied by `process_images` to the width/height attributes of `<img>` tags
IMAGE_CONV_MULT = 0.6

In [ ]:

#export
def process_images(cell):
    "Replace line-leading `<img>` tags by fenced asciidoc `image::` macros"
    # NOTE(review): HTMLParseAttrs is defined elsewhere; it is used here as a callable
    # that returns a dict of the tag's attributes.
    parse_attrs = HTMLParseAttrs()

    def _to_adoc(match):
        tag = parse_attrs(match.group(1))
        parts = ['"' + tag.get('alt', '') + '"']
        if 'width' in tag:
            parts.append(str(int(IMAGE_CONV_MULT * int(tag['width']))))
            # The height is only emitted when a width is present as well
            if 'height' in tag: parts.append(str((int(IMAGE_CONV_MULT * int(tag['height'])))))
        suffix = f"[{', '.join(parts)}]"
        anchor = f"[[{tag['id']}]]\n" if 'id' in tag else ""
        title = f".{tag['caption']}\n" if 'caption' in tag else ""
        return f"```asciidoc\n{anchor}{title}image::{tag['src']}{suffix}\n```"

    cell["source"] = _re_md_image.sub(_to_adoc, cell["source"])
    return cell

In [ ]:

txt = 'text\n<img alt="Alternative text" width="700" caption="This is an image" src="puppy.jpg" id="123"/>\nother text'
test_eq(process_images(markdown_cell(txt)), 
        markdown_cell('text\n```asciidoc\n[[123]]\n.This is an image\nimage::puppy.jpg["Alternative text", 420]\n```\nother text'))

In [ ]:

#export
# Matches an asciidoc cross reference `<<name>>`, with the name in group 1
_re_reference = re.compile(r'<<([^>]*)>>')

In [ ]:

#export
def wrap_references(cell):
    "Replace `<<ref>>` by `xxrefrefxxeref`; `fix_references` reverses this in post-processing"
    marked = _re_reference.sub(lambda m: f'xxref{m.group(1)}xxeref', cell["source"])
    cell["source"] = marked
    return cell

In [ ]:

test_eq(wrap_references(markdown_cell("There is a reference <<ref>> here.")),
        markdown_cell("There is a reference xxrefrefxxeref here."))

In [ ]:

#export
def extract_attachments(cell, dest):
    """Write the attachments of `cell` to files in `dest` and point the source at them.

    Each attachment is saved as the first free `att_NNNNN.<ext>` in `dest` (a `Path`)
    and every `attachment:<name>` reference in the source is replaced by that file
    path. The `attachments` entry is removed from the cell afterwards.
    """
    if 'attachments' not in cell: return cell
    for name, bundle in cell['attachments'].items():
        # A bundle maps mime type -> base64 payload; the first entry is used
        mime,img = next(iter(bundle.items()))
        ext = mime.split('/')[1]
        for i in range(99999):
            p = dest/(f'att_{i:05d}.{ext}')
            if not p.exists(): break
        p.write_bytes(b64decode(img))
        # Plain string replacement: neither the attachment name nor the path
        # should be interpreted as a regex pattern or replacement template.
        cell['source'] = cell['source'].replace(f'attachment:{name}', str(p))
    del cell['attachments']
    return cell

Catch sidebars: sidebars are delimited by header cells like ### Sidebar: title, then ### End sidebar

In [ ]:

#export
# Header opening a sidebar, e.g. `### Sidebar: My title` (case-insensitive); the title is in group 1
_re_sidebar_title = re.compile(r'#+\s+Sidebar:\s+(.*)$', re.IGNORECASE)
# Header closing a sidebar, e.g. `### End sidebar` (case-insensitive)
_re_end_sidebar = re.compile(r'#+\s+End sidebar', re.IGNORECASE)

In [ ]:

_re_sidebar_title.search('### Sidebar: Tenacity in deep learning').groups()

Out[ ]:

('Tenacity in deep learning',)

In [ ]:

#export
def sidebar_headers(cell):
    "Replace `Sidebar: title` / `End sidebar` headers by fenced asciidoc sidebar delimiters"
    opened = _re_sidebar_title.sub(r'```asciidoc\n.\1\n****\n```', cell['source'])
    cell['source'] = _re_end_sidebar.sub(r'```asciidoc\n****\n```', opened)
    return cell

In [ ]:

test_eq(sidebar_headers(markdown_cell("### Sidebar: My intervention")),
        markdown_cell("```asciidoc\n.My intervention\n****\n```"))
test_eq(sidebar_headers(markdown_cell("### End sidebar")), markdown_cell("```asciidoc\n****\n```"))

All preprocessing together¶

In [ ]:

#export
# Transforms composed, in this order, by `treat_notebook` for code cells
code_cell_tfms = [get_cell_meta, replace_old_jekylls, hide_input, hide_output, extract_html, deal_error,
                  remove_interrupted_pbars, wrap_text_outputs, caption_tables, check_code_len]
# Transforms composed, in this order, by `treat_notebook` for markdown cells (after attachment extraction)
md_cell_tfms = [deal_quotes, wrap_references, interpret_sidebar, sidebar_headers, add_title_level, deal_with_lists,
                process_images, replace_jekylls]

Raw cells just need to have a new line added at the beginning

In [ ]:

#export
def add_new_line(cell):
    "Prefix the cell source with a new line"
    cell['source'] = f"\n{cell['source']}"
    return cell

In [ ]:

#export
def treat_notebook(nb, dest):
    "Run all the preprocessing on the cells of `nb`; attachments of markdown cells are written to `dest`"
    # One composed transform per cell type
    handlers = {
        'code': compose(*code_cell_tfms),
        'markdown': compose(partial(extract_attachments, dest=dest), *md_cell_tfms),
        'raw': add_new_line,
    }
    nb['cells'] = remove_hidden_cells(nb['cells'])
    nb['cells'] = [handlers[cell['cell_type']](cell) for cell in nb['cells']]
    # Done last: the markdown transforms above are what produce most ```asciidoc fences
    nb['cells'] = isolate_adoc_blocks(nb['cells'])
    return nb

Post-processing¶

Replace special tokens by their values

In [ ]:

#export
def rep_spec_tok(adoc, metadata=None):
    "Replace the `xxsinglequote`, `xxnewls` and `xxnewl` tokens in `adoc` by their values (`metadata` is unused)"
    adoc = adoc.replace('xxsinglequote', "'")
    # `xxnewls` must be handled before `xxnewl`, which is a prefix of it
    adoc = adoc.replace('xxnewls', '\n  ')
    # Raw string: `\s` is a regex class here, not a (invalid) string escape
    return re.sub(r'xxnewl\s', '\n', adoc)

nbconvert will flag the code cells with [ipython3], we replace this by [python]

In [ ]:

#export
def ipython2python(adoc, metadata=None):
    "Relabel `[source, ipython3]` blocks as `[source, python]` (`metadata` is unused)"
    old_flag, new_flag = '[source, ipython3]', '[source, python]'
    return adoc.replace(old_flag, new_flag)

In [ ]:

test_eq(ipython2python("[source, ipython3]\n----\nsome code\n----\n"), "[source, python]\n----\nsome code\n----\n")

Remove empty cells or cells flagged for removal (because of hide_input)

In [ ]:

#export
def remove_cells(adoc, metadata=None):
    "Delete python source blocks that are empty or only hold the `##remove##` marker (`metadata` is unused)"
    patterns = (
        r'\n\[source, python\]\n----(\n)*----\n',          # empty block
        r'\n\[source, python\]\n----\n##remove##\n----\n',  # block flagged for removal
    )
    for pat in patterns:
        adoc = re.sub(pat, '', adoc)
    return adoc

In [ ]:

test_eq(remove_cells("lalala\n[source, python]\n----\n\n----\n"), "lalala")
test_eq(remove_cells("lalala\n[source, python]\n----\n##remove##\n----\n"), "lalala")

Clear code cells from the code flag when there is a ##clear## tag.

In [ ]:

#export
# A python source block whose content starts with `##clear##`; the content after the tag is in group 1
_re_clear = re.compile(r'\[source, python\]\n----\n##clear##(.*?)----\n', re.DOTALL)
def clear_cells(adoc, metadata=None):
    "Replace every `##clear##` source block by its bare content (`metadata` is unused)"
    return _re_clear.sub(lambda m: m.group(1), adoc)

In [ ]:

test_eq(clear_cells(
    "lalala\n[source, python]\n----\n##clear##pure adoc\n----\nfoo\nbla\n[source, python]\n----\n##clear##pure adoc again\n----\nbli"),
        "lalala\npure adoc\nfoo\nbla\npure adoc again\nbli")

Format LaTeX equations properly: they arrive either as latexmath:[$equation$] or latexmath:[\[equation\]]

In [ ]:

#export
def format_latex(adoc, metadata=None):
    """Rewrite the LaTeX markers found in `adoc` (`metadata` is unused).

    `latexmath:[$eq$]` becomes the inline form `latexmath:[\\(eq\\)]` and
    `latexmath:[\\[eq\\]]` becomes a `[latexmath]` passthrough block holding
    an `equation` environment.
    """
    #LaTeX equations
    adoc = re.sub(r"latexmath:\[\$([^\$]*)\$\]", r"latexmath:[\\(\1\\)]", adoc)
    # Non-greedy group: two display equations on the same line stay two equations
    return re.sub(r"latexmath:\[\\\[(.*?)\\\]\]", r"\n[latexmath]\n++++\n\\begin{equation}\n\1\n\\end{equation}\n++++\n", adoc)

In [ ]:

test_eq(format_latex(r"latexmath:[$equation$]"), r"latexmath:[\(equation\)]")
test_eq(format_latex(r"latexmath:[\[equation\]]"), 
        "\n[latexmath]\n++++\n\\begin{equation}\nequation\n\\end{equation}\n++++\n")

Format image outputs and make sure they point to the right folder.

In [ ]:

#export
# Matches a `----` delimited block holding only a Markdown image `![svg|png|jpg](name)`; the name is in group 1
_re_image_output = re.compile(r'----\n!\[(?:svg|png|jpg)\]\((.+)\)\n----')

In [ ]:

#export
def format_outputs(adoc, metadata=None):
    "Replace image output blocks by `image::` macros, using `metadata['folder']` and per-image entries of `metadata`"
    folder = '.' if metadata is None else metadata.get('folder', '.')

    def _to_image(match):
        name = match.group(1)
        # Per-image attributes (alt, width, height, id, caption) are looked up by file name
        info = {} if metadata is None or name not in metadata else metadata[name]
        parts = ['"' + info.get('alt', '') + '"']
        if 'width' in info:
            parts.append(str(info['width']))
            # The height is only emitted when a width is present as well
            if 'height' in info: parts.append(str(info['height']))
        suffix = f"[{', '.join(parts)}]"
        anchor = f"[[{info['id']}]]\n" if 'id' in info else ""
        title = f".{info['caption']}\n" if 'caption' in info else ""
        return f"{anchor}{title}image::{str(folder)}/{name}{suffix}"

    return _re_image_output.sub(_to_image, adoc)

In [ ]:

test_eq(format_outputs('----\n![svg](output.svg)\n----', {'folder':'path', 'output.svg': {'alt': 'alt'}}),
        'image::path/output.svg["alt"]')
test_eq(format_outputs('----\n![svg](output.svg)\n----', {'folder':'path', 'output.svg': {'alt': 'alt', 'width': 100}}),
        'image::path/output.svg["alt", 100]')
test_eq(format_outputs('----\n![png](output1.png)\n----'),
        'image::./output1.png[""]')

Deal with quotes

In [ ]:

#export
def fix_quotes(adoc, metadata=None):
    "Turn ``text'' into \"text\" (`metadata` is unused)"
    pattern = re.compile(r"``([^'`]*)''")
    return pattern.sub(lambda m: f'"{m.group(1)}"', adoc)

In [ ]:

test_eq(fix_quotes("``double quotes''"), '"double quotes"')

Put back << >> around refs

In [ ]:

#export
def fix_references(adoc, metadata=None):
    "Put back `<<ref>>` around references marked as `xxrefrefxxeref` (`metadata` is unused)"
    # Non-greedy group: several references on one line are each restored on their own
    return re.sub(r"xxref(.*?)xxeref", r"<<\1>>", adoc)

In [ ]:

test_eq(fix_references("There is a reference xxrefrefxxeref here."), "There is a reference <<ref>> here.")

Format tables

In [ ]:

#export
def format_tables(adoc, metadata=None):
    """Put the HTML tables found in `adoc` between `++++` delimiters (`metadata` is unused).

    The text is split on `----`; each piece is stripped of its `<div>`/`<p>` wrappers
    and the delimiters around the pieces that hold a table are switched to `++++`.
    """
    splits = adoc.split('----')
    # seps[i] is the delimiter written before splits[i]; it has one extra trailing entry so
    # that seps[i+1] always exists
    seps = [''] + ['----' for _ in range(len(splits)-1)] + ['']
    for i,s in enumerate(splits):
        # Drop everything from the first <div> up to the last `<table`, then the leftover tags
        s = re.sub(r'<div>[\s\S]*<table', '<table', s)
        s = re.sub('</div>', '', s)
        s = re.sub('<p>', '', s)
        s = re.sub('</p>', '', s)
        # Non-empty pieces are made to start and end with a new line
        if len(s) > 0 and not s.startswith('\n'): s = '\n' + s
        if len(s) > 0 and not s.endswith('\n'):   s = s + '\n'
        # The piece is just a table: both surrounding delimiters become ++++
        if s.startswith('\n<table'): seps[i],seps[i+1] = '++++','++++'
        elif '<table' in s:
            # The table comes after some other text: close the current ---- block before the
            # table, open a ++++ block for it, and close that block with ++++
            res = re.search('<table', s)
            begin,end = res.span()
            s = s[:begin] + '\n----\n\n++++\n' + s[begin:]
            seps[i+1] = '++++'
        splits[i] = s
    # Reassemble: each piece preceded by its delimiter (zip ignores the extra trailing entry of seps)
    res = ''
    for s,c in zip(seps,splits): res = res + s + c
    # Remove the pairs of consecutive ---- delimiters that follow a blank line
    return res.replace('\n\n--------', '')

Just as a personal preference, replace all blocks of four new lines or more by three new lines (\n\n\n)

In [ ]:

#export
def remove_lines(text, metadata=None):
    "Collapse runs of four or more new lines followed by text into three new lines (`metadata` is unused)"
    pattern = re.compile(r'\n\n\n\n+([^\n])')
    return pattern.sub(lambda m: '\n\n\n' + m.group(1), text)

In [ ]:

test_eq(remove_lines('a\n\n\n\n\n\nb'), 'a\n\n\nb')

All together

In [ ]:

#export
# Text transforms composed, in this order, by `post_process`; each takes `(adoc, metadata=None)`
post_process_tfms = [fix_quotes, rep_spec_tok, ipython2python, remove_cells, clear_cells, format_latex,
                     format_outputs, fix_references, format_tables, remove_lines]

In [ ]:

#export
def post_process(adoc, metadata=None):
    "Apply all of `post_process_tfms` to the exported `adoc`, passing `metadata` along, and strip the result"
    # The patterns in `remove_cells` start with a new line, so make sure a leading block has one
    if not adoc.startswith('\n'): adoc = '\n' + adoc
    # Raw string: `\s` is a regex class here, not a (invalid) string escape
    adoc = re.sub(r'xxnewl\s', '\n', adoc)
    adoc = compose(*post_process_tfms)(adoc, metadata=metadata)
    return adoc.strip()

Exporting¶

In [ ]:

#export
# Module-level exporter used by `convert_nb`, configured to leave the input/output prompts out.
# NOTE(review): ExportConfig and ASCIIDocExporter are defined elsewhere (presumably nbconvert/traitlets) - confirm
c = ExportConfig()
exporter = ASCIIDocExporter(c)
exporter.exclude_input_prompt=True
exporter.exclude_output_prompt=True

In [ ]:

#export
def add_metadata(nb):
    "Stripping removes metadata used in the conversion: add a default python `language_info` when it is missing."
    default_info = {
        'codemirror_mode': {'name': 'ipython', 'version': 3},
        'file_extension': '.py',
        'mimetype': 'text/x-python',
        'name': 'python',
        'nbconvert_exporter': 'python',
        'pygments_lexer': 'ipython3',
        'version': '3.7.1'}
    # An existing `language_info` is left untouched
    nb['metadata'].setdefault('language_info', default_info)
    return nb

In [ ]:

#export
def output_num(n):
    "Return the number found in an output name `output_<number>_...` as an int, or `None` if `n` has no such prefix"
    # `\d+` (not `\d*`): a name like `output__0.png` has no number and must give None, not int('')
    m = re.search(r'^output_(\d+)_', n)
    return None if m is None else int(m.group(1))

In [ ]:

test_eq(output_num('output_31_0.png'), 31)
test_eq(output_num('output_12_0.svg'), 12)

In [ ]:

#export
import PIL

In [ ]:

#export
# Factor applied by `convert_nb` to the width of image outputs
IMAGE_OUT_MULT = 0.8

In [ ]:

#export
import xml.etree.ElementTree as ET

In [ ]:

#export
def get_output_width(name, raw, folder):
    """Return the width of the image output `name`, or `None` when it cannot be determined.

    For `.svg` names the `width` attribute of the root element of `raw` is read and
    its integer part is returned as a string (a bare number or a `pt` value is
    accepted). Other images are opened with PIL from `folder` and the pixel width
    is returned as an int.
    """
    if name.endswith('.svg'):
        width = ET.fromstring(raw).attrib.get('width')
        if width is None: return None
        # Anything that is not `<number>` or `<number>pt` (e.g. `100%`) cannot be
        # converted by the caller, so it is reported as unknown.
        m = re.match(r'\s*(\d+)(?:\.\d*)?\s*(?:pt)?\s*$', width)
        return m.group(1) if m else None
    try: return PIL.Image.open(Path(folder)/name).size[0]
    except Exception: return None

In [ ]:

#export
def convert_nb(fname, dest_path='.', folder=None):
    """Convert a notebook `fname` to an asciidoc file in `dest_path`.

    The image outputs are written to `folder` (default: `<dest_path>/<notebook stem>_files`),
    which is emptied first if it already exists. `folder` is referenced from the generated
    text relative to `dest_path`.
    """
    print(f"Converting {fname}")
    fname,dest_path = Path(fname),Path(dest_path)
    dest_name = fname.with_suffix('.asciidoc').name
    folder = dest_path/f'{fname.stem}_files' if folder is None else Path(folder)
    #folder for images. Clear if exists
    if folder.exists(): shutil.rmtree(folder)
    os.makedirs(folder, exist_ok=True)

    nb = add_metadata(treat_notebook(read_nb(fname), folder))
    export = exporter.from_notebook_node(nb)
    outputs = export[1]['outputs']
    metadata = {'folder': folder.relative_to(dest_path)}
    for n in outputs:
        idx = output_num(n)
        # Each output gets its own copy of its cell's metadata: several outputs of one cell
        # must not share (and rescale) the same width. Outputs whose name carries no cell
        # number start from empty metadata.
        metadata[n] = dict(nb["cells"][idx]['metadata']) if idx is not None else {}
    for n,o in outputs.items():
        with open(folder/n, 'wb') as f: f.write(o)
        w = metadata[n]['width'] if 'width' in metadata[n] else get_output_width(n, o, folder)
        if w is not None: metadata[n]['width'] = str(int(IMAGE_OUT_MULT * int(w)))
    with open(dest_path/dest_name,'w', encoding="utf8") as f:
        f.write(post_process(export[0], metadata))

In [ ]:

dest = Path('test')
convert_nb('test/_test.ipynb', dest)

Converting test/_test.ipynb

In [ ]:

#convert_nb('test/_test.ipynb', Path('test'))

In [ ]:

#export
def _copy_images(path, dest_path):
    os.makedirs(dest_path, exist_ok=True)
    for f in path.iterdir():
        if f.is_file(): shutil.copy(f, dest_path/f.name)
        if f.is_dir(): _copy_images(f, dest_path/f.name)

In [ ]:

#export
def copy_images(path, dest_path):
    "Replace `dest_path/images` by a fresh copy of `path/images`"
    target = dest_path/"images"
    # Start from a clean folder so that stale images do not survive
    if target.exists():
        shutil.rmtree(target)
    _copy_images(path/"images", target)

In [ ]:

dest = Path('..')/'convert_book'
# copy_images(Path('book'), dest)

In [ ]:

#export
def _convert1(fname, dest_path='.'):
    "Convert one notebook, printing the error instead of raising so that the other conversions go on"
    try:
        convert_nb(fname, dest_path=dest_path)
    except Exception as err:
        print(f"Error in notebook {fname}", err, sep='\n')

In [ ]:

#export
@call_parse
def fastdoc_convert_all(
    path:str='book',  # Path to notebooks
    dest_path:str='../convert_book'  # Path to generated asciidoc files
):
    # Converts every notebook of `path` whose name does not start with `_`, then copies the
    # existing .adoc/.asciidoc files and the `images` folder next to the results.
    src,out = Path(path),Path(dest_path)
    out.mkdir(parents=True,exist_ok=True)
    # `copy_images` reads from this folder, so make sure it exists
    (src/'images').mkdir(parents=True,exist_ok=True)
    notebooks = [f for f in src.iterdir() if f.suffix == '.ipynb' and not f.name.startswith('_')]
    parallel(_convert1, notebooks, dest_path=out)
    for f in src.iterdir():
        if f.suffix in ['.adoc', '.asciidoc']: shutil.copy(f, out/f.name)
    copy_images(src, out)

In [ ]:

#convert_all()

Export -¶

In [ ]:

from nbdev.export import *
notebook2script()

Converted 00_asciidoc.ipynb.
Converted 01_clean.ipynb.
Converted index.ipynb.

In [ ]: