I recently ran into an issue with a large amount of JSON events, where the volume of data was ridiculous simply because of the redundancy in the dataset.
Hearing Raymond Hettinger's words inside my head, "There Must Be a Better Way!", I decided to spend some time shrinking the data.
There were 3 steps to the process:
{'x': ..., 'y': ..., 'z':...}
to tuples. The first two parts are more or less trivial. The JSON is a dict in Python, so doing a diff is a dict comparison:
def dict_comp(A, B):
    """Return the entries of A whose value differs in B (JSON-like dicts).

    For every key of A:

    * key absent from B           -> the value from A is recorded;
    * both values are dicts       -> they are compared recursively and the
                                     nested diff is stored under the same key
                                     (omitted entirely when nothing changed);
    * values differ otherwise     -> the value from B is recorded;
    * values are equal            -> the key is left out.

    Keys that exist only in B are not reported, because only A's keys are
    walked.

    Args:
        A: the reference dict.
        B: the dict compared against A.

    Returns:
        A new dict holding only the differing entries; empty if none differ.
    """
    # A private sentinel tells "key missing from B" apart from an explicit
    # None (JSON null) stored in B.
    missing = object()
    C = {}
    for k, v1 in A.items():
        v2 = B.get(k, missing)
        if v2 is missing:
            C[k] = v1
        elif isinstance(v1, dict) and isinstance(v2, dict):
            # Recurse *before* the plain inequality test; otherwise a changed
            # sub-dict is copied wholesale and the recursion is never reached.
            nested = dict_comp(v1, v2)
            if nested:
                C[k] = nested
        elif v1 != v2:
            C[k] = v2
    return C


A = {'Alice': 1, 'Bob': {'one': 1, 'two': 2}}
B = {'Alice': 0, 'Bob': {'one': 1, 'two': 0}}
dict_comp(A, B)
{'Alice': 0, 'Bob': {'two': 0}}
Here we only see the changes.
The third part is probably a little more novel. Let's start with the requirements:
Here's the whole thing:
import zipfile, io
class InMemoryZip(object):
    """Build a zip archive in memory, or read members of a zip file on disk.

    Writing side: ``append`` adds members to an in-memory buffer, ``read``
    returns the archive bytes and ``write`` saves them to a path.

    Reading side: ``load`` registers an on-disk zip file; iterating the
    object yields its member names and ``obj[name]`` returns a member's
    bytes.  ``load`` only remembers the path, it does not copy the file
    into the in-memory buffer.
    """

    def __init__(self):
        # Create the in-memory file-like object
        self.in_memory_zip = io.BytesIO()
        # Path registered by load(); stays None until load() succeeds.
        self._path = None

    def _open_loaded(self):
        """Open the zip file registered by load() for reading.

        Raises:
            RuntimeError: if no path has been loaded yet.
        """
        # An explicit raise instead of ``assert``: asserts vanish under -O.
        if not isinstance(self._path, pathlib.Path):
            raise RuntimeError("no zip file loaded; call load() first")
        return zipfile.ZipFile(self._path)

    def load(self, path):
        """Register *path* as the zip file to read from.

        Raises:
            TypeError: if *path* is not a ``pathlib.Path``.
            ValueError: if the file name does not end with ``zip``.
        """
        if not isinstance(path, pathlib.Path):
            raise TypeError(f"{path} is not a path object")
        if not path.name.lower().endswith('zip'):
            raise ValueError(f"{path} is not a zip")
        self._path = path

    def __iter__(self):
        """Yield the member names of the loaded zip file."""
        # Collect the names and close the file before yielding, so the handle
        # is released even if the caller abandons the iteration early.
        with self._open_loaded() as zf:
            names = zf.namelist()
        yield from names

    def __getitem__(self, item):
        """Return the bytes of member *item* from the loaded zip file.

        Raises:
            KeyError: if the archive has no member named *item*.
            RuntimeError: if no path has been loaded yet.
        """
        with self._open_loaded() as zf:
            if item not in zf.namelist():
                raise KeyError(f"no such file: {item}")
            return zf.read(item)

    def append(self, filename_in_zip, file_contents):
        """Append a member named *filename_in_zip* holding *file_contents*
        to the in-memory zip and return ``self`` so calls can be chained."""
        # Open the in-memory zip in append mode.  The ``with`` block closes
        # the archive explicitly, which is what writes the central directory;
        # without it that only happens if/when the object is finalized.
        with zipfile.ZipFile(self.in_memory_zip, "a", zipfile.ZIP_DEFLATED,
                             allowZip64=False) as zf:
            # Write the file to the in-memory zip
            zf.writestr(filename_in_zip, file_contents)
            # Mark the files as having been created on Windows so that
            # Unix permissions are not inferred as 0000.  Must happen before
            # the archive is closed for the change to be written out.
            for zfile in zf.filelist:
                zfile.create_system = 0
        return self

    def read(self):
        """Return the bytes of the in-memory zip."""
        self.in_memory_zip.seek(0)
        return self.in_memory_zip.read()

    def write(self, path):
        """Write the in-memory zip to the file at *path*.

        Raises:
            TypeError: if *path* is not a ``pathlib.Path``.
        """
        if not isinstance(path, pathlib.Path):
            raise TypeError(f"{path} is not a path object")
        with path.open('wb') as fo:
            fo.write(self.read())
To test that it works, let's first get the imports out of the way:
import io
import pathlib
import tempfile
Next, let's create some data and store it in memory
# Build the archive in memory: each payload is pushed through a byte stream
# and appended under its member name.
imz = InMemoryZip()
payloads = (
    ('a/first', b"123 123 "),
    ('a/second', b"123 456 "),
)
for member_name, payload in payloads:
    bytestream = io.BytesIO(payload)
    bytestream.seek(0)
    imz.append(member_name, bytestream.read())
# Bare expression so an interactive session still echoes the object.
imz
<__main__.InMemoryZip at 0x28aa67d12b0>
Now let's write it to disk
# Save the archive under the system temp directory, then report the size of
# the file that landed on disk.
tempdir = tempfile.gettempdir()
path = pathlib.Path(tempdir).joinpath("io_test.zip")
imz.write(path)
size_on_disk = len(path.read_bytes())
print(path.name, size_on_disk, "bytes")
io_test.zip 222 bytes
Finally let's load it from disk
# Start from a fresh object, point it at the file on disk and check that both
# members come back with the bytes that were stored.
imz = InMemoryZip()
imz.load(path)

names = list(imz)
assert len(names) == 2

first = imz['a/first']
second = imz['a/second']
assert first == b"123 123 "
assert second == b"123 456 "
And finally clean up the file system.
# Remove the zip file that was written to the temp directory above.
path.unlink()
Simple.