Set `fileLocation` in the first code cell below. The second code cell will list the contents of the .dbc file, so you can check that you're working with the correct file. The third code cell will list the .ipynb files that will be generated. Note that if the files already exist, they will be overwritten. The fourth code cell will perform the conversion and overwrite any files, if necessary. The fifth code cell just cleans up the directory that was used for extracting the .dbc file.

Each converted notebook gets a no-op `display` function and a working `displayHTML` function. The conversion also replaces `baseDir = <some path>` with `baseDir = 'data'`.

# Change this fileLocation if necessary
# Subsequent downloads might be called 'Databricks (1).dbc', etc.
fileLocation = 'Databricks.dbc'

# Extract dbc file
import os
import shutil
import zipfile

# Cleanup from prior run. Best effort: the directory usually does not exist
# yet, and a failed removal must not stop the extraction.
try:
    shutil.rmtree('tmp_dbc')
except OSError:
    pass

# Recreate the scratch directory; an OSError here (for example, it is still
# present) is ignored because extractall() below can write into it anyway.
try:
    os.mkdir('tmp_dbc')
except OSError:
    pass

# The .dbc file is opened as a zip archive and unpacked into the scratch
# directory.
with zipfile.ZipFile(fileLocation, 'r') as z:
    z.extractall('tmp_dbc')

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('*** Contents from the .dbc file (usually one file or a directory) ***\n')
print(os.listdir('tmp_dbc'))
# Find files to parse
import fnmatch

# Walk the extraction directory and remember every '*.python' file as a
# (directory, fileName) pair.
filesToParse = []
for root, dirNames, fileNames in os.walk('tmp_dbc'):
    filesToParse.extend(
        (root, fileName)
        for fileName in fnmatch.filter(fileNames, '*.python'))
def getIpynbName(fileToParse):
    """Return the output ``.ipynb`` path for one extracted ``.python`` file.

    The first two components of the normalised directory are dropped. For
    paths produced by walking ``tmp_dbc`` these are ``tmp_dbc`` itself and
    the first directory beneath it (presumably the archive's top-level
    folder). Whatever remains becomes the output directory, or ``'.'`` when
    nothing remains. The file's extension is replaced by ``_export.ipynb``.

    Args:
        fileToParse: ``(path, fileName)`` pair, passed as a single positional
            argument exactly as before. It is unpacked in the body because
            tuple parameters in a ``def`` signature are a syntax error on
            Python 3.

    Returns:
        The output path, relative to the current working directory.
    """
    path, fileName = fileToParse
    pathSplit = os.path.normpath(path).split(os.sep)[2:]
    baseDir = os.path.join(*pathSplit) if pathSplit else '.'
    newFileName = os.path.splitext(fileName)[0] + '_export.ipynb'
    return os.path.join(baseDir, newFileName)
# Preview the notebook paths that the conversion step will write.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("*** Files to be created (relative to your current working directory) ***")
print("(Warning: files will be overwritten!)\n")
for path, fileName in filesToParse:
    print(getIpynbName((path, fileName)))
# Create the IPython Notebooks
# Convert .python files to .ipynb files
import codecs
from IPython import nbformat
from IPython.nbformat.v3.nbpy import PyReader
import json
import re
_header = u'# -*- coding: utf-8 -*-\n# <nbformat>3.0</nbformat>\n'
_markdownCell = u'\n\n# <markdowncell>\n\n'
_codeCell = u'\n\n# <codecell>\n\n'
_firstCell = u"""# Increase compatibility with Databricks
from IPython.display import display as idisplay, HTML
displayHTML = lambda x: idisplay(HTML(x))
def display(*args, **kargs): pass"""
def convertToIpynb(fileToParse):
    """Convert one extracted ``.python`` file into an ``.ipynb`` notebook.

    The input file is JSON with a ``commands`` list whose entries carry a
    ``position`` and the ``command`` text. The commands are written, in sorted
    order, to the intermediate file ``tmp_ipynb.py`` using the cell-marker
    syntax that ``PyReader`` parses. That file is read back, converted with
    ``nbformat.convert(nb, 4.0)`` and written to the path given by
    ``getIpynbName``. An existing output file is overwritten.

    Commands starting with ``%md`` become markdown cells: the ``%md`` marker
    is removed, ``%(`` / ``)%`` become ``$`` and ``%[`` / ``]%`` become
    ``$$``. Every other command becomes a code cell in which any
    ``baseDir = ...`` line is replaced by ``baseDir = 'data'``.

    Args:
        fileToParse: ``(directory, fileName)`` pair locating the input file.
    """
    with codecs.open(os.path.join(*fileToParse), encoding="utf-8") as fp:
        jsonData = json.load(fp)

    # Tuples compare by position first, so sorting restores cell order.
    commandList = sorted((x['position'], x['command'])
                         for x in jsonData['commands'])

    intermediateName = 'tmp_ipynb.py'
    try:
        with codecs.open(intermediateName, 'w', encoding="utf-8") as fp:
            fp.write(_header)
            # Every notebook starts with the compatibility code cell.
            fp.write(_codeCell)
            fp.write(_firstCell)

            for _position, command in commandList:
                if re.match(r'\s*%md', command):
                    command = re.sub(r'^\s*%md', '', command,
                                     flags=re.MULTILINE)
                    command = re.sub(r'(%\(|\)%)', '$', command)
                    command = re.sub(r'(%\[|\]%)', '$$', command)

                    fp.write(_markdownCell)
                    # Markdown text is stored as comment lines in the
                    # intermediate file.
                    asLines = command.split('\n')
                    command = '# ' + '\n# '.join(asLines)
                else:
                    command = re.sub(r'^\s*baseDir\s*=.*$',
                                     'baseDir = \'data\'',
                                     command, flags=re.MULTILINE)
                    fp.write(_codeCell)

                fp.write(command)

        with codecs.open(intermediateName, 'r',
                         encoding="utf-8") as intermediate:
            nb = PyReader().read(intermediate)
    finally:
        # Remove the intermediate file even when writing or parsing failed,
        # so a failed run does not leave it in the working directory.
        if os.path.exists(intermediateName):
            os.remove(intermediateName)

    outputName = getIpynbName(fileToParse)
    baseDirectory = os.path.split(outputName)[0]
    if not os.path.isdir(baseDirectory):
        os.makedirs(baseDirectory)

    with codecs.open(outputName, 'w', encoding="utf-8") as output:
        nbformat.write(nbformat.convert(nb, 4.0), output)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('Created: {0}'.format(outputName))
# Run the conversion for every (directory, fileName) pair found earlier.
for fileToParse in filesToParse:
    convertToIpynb(fileToParse)

# Cleanup
import shutil

# Best effort: an OSError while deleting the extraction directory is ignored.
try:
    shutil.rmtree('tmp_dbc')
except OSError:
    pass