#!/usr/bin/env python
# coding: utf-8

# # Marshal as serialization of TF data
#
# [Marshal](https://docs.python.org/3/library/marshal.html#module-marshal)
# is a data serialization format used in the standard library of Python. It is more primitive,
# but it might be faster.
#
# As a simple test, we take the feature data for `g_word` and `oslots`.
#
# `g_word` is a map from the numbers 1 to ca. 420,000 to Hebrew word occurrences (ASCII strings).
#
# `oslots` is a map from ca. 1 million integers to tuples of integers.
#
# In Text-Fabric we have a representation in plain text and a compressed, pickled representation.
#
# We also run the deserialization in two ways: when the garbage collector is enabled, or when the garbage
# collector is deliberately turned off.
#
# # Outcome
#
# Pickle is faster. Loading gzipped, pickled data is *much* faster than loading gzipped, marshalled data.
#
# The size of the uncompressed marshal serialization is much bigger than the TF text representation.
#
# The size of the gzipped marshal serialization is approximately the same as the gzipped, pickled TF serialization.
#
# # Detailed comparison
#
# what | `g_word` | `oslots`
# --- | --- | ---
# pickle.gz with GC | 0.08 | 0.7
# pickle.gz without GC | 0.09 | 0.38
# marshal.gz with GC | 1.11 | 1.86
# marshal.gz without GC | 1.07 | 1.85
#
# # Conclusion
#
# **We do not see reasons to replace the TF feature data serialization by marshal.**
#
# **We do not fiddle with the garbage collector.**

# In[1]:


# The autoreload magics are only available when running under IPython/Jupyter.
# When this file is executed as a plain script `get_ipython` is undefined, so
# we skip the magics instead of crashing with a NameError.
try:
    _ipython = get_ipython()
except NameError:
    _ipython = None

if _ipython is not None:
    _ipython.run_line_magic('load_ext', 'autoreload')
    _ipython.run_line_magic('autoreload', '2')


# In[2]:


import os
import gzip
import marshal
import pickle
import gc
from shutil import move

from tf.fabric import Fabric
from tf.app import use

GZIP_LEVEL = 2  # same as used in Text-Fabric


# ## Load from the textual data

# In[3]:


# All test material lives below a scratch directory inside the local
# text-fabric clone: the plain-text features in `tf`, the serialized
# (pickle / marshal) versions in `serialized`.
BASE = os.path.expanduser("~/github/annotation/text-fabric")
TEST_BASE = f"{BASE}/_temp/serial"
TEST_DATA_TF = f"{TEST_BASE}/tf"
TEST_DATA_SERIAL = f"{TEST_BASE}/serialized"
FEATURES = ("g_word", "oslots")

# `exist_ok=True` already makes this a no-op when the directory is present,
# so no separate existence check is needed.
os.makedirs(TEST_DATA_SERIAL, exist_ok=True)


# In[4]:


TF = Fabric(locations=TEST_DATA_TF)
api = TF.load(FEATURES)


# During this time, the textual data has been compiled and written to a binary form.
# We move the binary form (gz pickled) to the serial directory.

# In[5]:


# Move the compiled (gzipped, pickled) feature files out of the TF cache
# directory so that they can be loaded directly from the serial directory.
for fName in FEATURES:
    move(f"{TEST_DATA_TF}/.tf/2/{fName}.tfx", f"{TEST_DATA_SERIAL}/{fName}.pickle.gz")


# ## Load gz-pickled

# In[6]:


def load(fName, ext, withGc=True):
    """Load one serialized feature file from TEST_DATA_SERIAL.

    The start and the end of the load are reported through `TF.info`, after
    `TF.indent(reset=True)` has been called.

    Parameters
    ----------
    fName: string
        Name of the feature; the file read is `{fName}.{ext}`.
    ext: string
        Serialization format, either `"pickle.gz"` or `"marshal.gz"`.
    withGc: boolean, optional True
        If False, the garbage collector is switched off while the data is
        being deserialized and restored to its previous state afterwards.

    Returns
    -------
    object
        Whatever was stored in the file.

    Raises
    ------
    ValueError
        If `ext` is not one of the supported formats.
    """

    # NOTE: pickle.load must only be used on trusted files; here the files
    # are produced locally by Text-Fabric itself.
    # NOTE(review): marshal.load presumably reads a Python file object in
    # many small pieces, which may penalise the gzip case; timing
    # marshal.loads(f.read()) would confirm -- TODO verify.
    loaders = {
        "pickle.gz": pickle.load,
        "marshal.gz": marshal.load,
    }
    if ext not in loaders:
        raise ValueError(
            f"unsupported serialization {ext!r}; expected one of {sorted(loaders)}"
        )
    deserialize = loaders[ext]

    TF.indent(reset=True)
    fullName = f"{fName}.{ext}"
    path = f"{TEST_DATA_SERIAL}/{fullName}"
    TF.info(f"start loading {fullName}")

    # Only touch the collector if it is currently enabled, so that a caller
    # that already disabled it does not get it switched on behind its back.
    suspendGc = not withGc and gc.isenabled()
    if suspendGc:
        gc.disable()
    try:
        with gzip.open(path, "rb") as f:
            result = deserialize(f)
        TF.info(f"end loading {fullName}")
    finally:
        # Restore the collector even when deserialization fails.
        if suspendGc:
            gc.enable()
    return result


# In[7]:


data = {}
for fName in FEATURES:
    data[fName] = load(fName, "pickle.gz")

for fName in FEATURES:
    data[fName] = load(fName, "pickle.gz", withGc=False)


# # Make a marshal feature data file

# In[8]:


for fName in FEATURES:
    # gzip.open creates/truncates the target itself; opening the same path a
    # second time with plain open() is unnecessary.
    with gzip.open(
        f"{TEST_DATA_SERIAL}/{fName}.marshal.gz", "wb", compresslevel=GZIP_LEVEL
    ) as f:
        marshal.dump(data[fName], f)


# ## Load gz-marshal

# In[9]:


dataMarshal = {}
for fName in FEATURES:
    dataMarshal[fName] = load(fName, "marshal.gz")

for fName in FEATURES:
    dataMarshal[fName] = load(fName, "marshal.gz", withGc=False)


# # With garbage collector turned off or on?
#
# It seems that oslots loads much faster with the garbage collector temporarily switched off.
#
# Let's try to load the whole BHSA in both ways:

# In[17]:


TF.indent(reset=True)
TF.info("start loading bhsa with gc switched off")
A = use("bhsa", withGc=False)
TF.info("end loading bhsa with gc switched off")


# In[18]:


TF.indent(reset=True)
TF.info("start loading bhsa with gc switched on")
A = use("bhsa", withGc=True)
TF.info("end loading bhsa with gc switched on")


# Does not make much difference. We leave the garbage collector untouched by default,
# i.e. we do not switch it off.

# In[ ]: