#!/usr/bin/env python
# coding: utf-8

# In[12]:

import pandas as pd
import math


# # Robert a.k.a. 00011 a.k.a. 1100010100000010011
# > Compression

# Fun with compressing names, which is either interesting to you as a reader or not :) It does connect to data engineering and data science:
#
# * Column storage with very similar data; check out Roaring Bitmaps.
# * An old idea for comparing music is to compress a song together with a catalogue of music from each genre. The catalogue that compresses the song best then gives the genre.
# * How well data compresses tells you something about the intrinsic complexity/richness of the data. Dataset size is a very basic measure; compression (and compression-like ideas) tells you more.
# * Encoders/decoders kind of work like lossy compression.
#
# This article will be followed up with one on compressing time series.

# ## Read census data
# Read in the 1990 census data of the USA.

# In[13]:

def parse_perc(s: str) -> float:
    return float(s.strip('%')) / 100

def parse_count(s: str) -> int:
    return int(s.replace(',', ''))

df = pd.read_csv("names.csv", sep=";", converters={
    'Frequency (%) M': parse_perc,
    'Frequency (%) F': parse_perc,
    'Count M': parse_count,
    'Count F': parse_count
})


# In[14]:

df.tail()


# ## Processing the data to 2-grams
# The idea is that we construct character 2-grams and list the top 4 letters following each letter. For example, `Jefferey` results in `[je, ef, ff, fe, er, re, ey]`. The letter `e` is the start of three 2-grams: `ef`, `er` and `ey`. By summing the frequency-weighted occurrences of each 2-gram we can find out which characters are most likely to follow a given letter.
# Below we select the top 4:

# In[15]:

# Construct 2-grams
top = 4

def two_grams(s: str):
    return list(zip(s[:-1], s[1:]))

# Construct a flattened list of weighted 2-grams and keep the top 4 per letter
weighted_two_grams = pd.DataFrame(
    df.apply(
        lambda row: [(a, b, row['Frequency (%) M']) for a, b in two_grams(row['Name M'])],
        axis=1
    ).apply(pd.Series).stack().to_list(),
    columns=['first', 'second', 'freq']
).groupby(by=['first', 'second']).sum().sort_values(by='freq', ascending=False).groupby(by='first').head(top).sort_index()

weighted_two_grams.head()


# ## Encoding names
# To simplify, we assume we only have 26 letters. To encode a name we go through it and output a (binary) code for each letter. We distinguish two types of codes:
#
# 1. The letter's position in the alphabet (0-25 for A-Z)
# 2. The letter's position in the top 4 followers of the letter _preceding_ it
#
# Let's define some functions that do this for us!

# In[16]:

def encode_char(c, prev):
    # Prefer the short code: the position of c among the top-4 followers of prev
    if prev is not None and c in weighted_two_grams.loc[(prev,)].index:
        return (0, weighted_two_grams.loc[(prev,)].index.get_loc(c))
    else:
        # Fall back to the letter's position in the alphabet (A=0, ..., Z=25)
        return (1, ord(c) - 65)

def encode(name):
    encoded = []
    prev = None
    for c in name:
        code = encode_char(c, prev)
        prev = c
        encoded.append(code)
    return encoded

def decode_char(code, prev):
    t, nr = code
    if prev is not None and t == 0:
        return weighted_two_grams.loc[(prev,)].iloc[nr].name
    else:
        return chr(nr + 65)

def decode(codes):
    decoded = []
    prev = None
    for code in codes:
        prev = decode_char(code, prev)
        decoded.append(prev)
    return ''.join(decoded)


# As an example, let's see how `ROBERT` is encoded and decoded:

# In[17]:

encode("ROBERT")


# In[18]:

decode(encode("ROBERT"))
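# As a quick sanity check (a minimal sketch; it assumes every entry in `df['Name M']` consists only of the uppercase letters A-Z, as in the census file), the encoding should round-trip every name in the table:

# In[ ]:

# Sketch: the encoding is lossless if decode(encode(name)) gives back every male name unchanged.
assert all(decode(encode(name)) == name for name in df['Name M'])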
# ## How many bits do we need?
#
# We could encode the two types of codes with a `0` followed by 2 bits, or a `1` followed by 5 bits. The 2 bits give the position in the top 4, and the 5 bits give the position in the 26-letter alphabet (2^5 = 32). There might be some room to use even fewer bits with some trickery.
#
# The 2-gram lookup table requires 26*4*5 = 520 bits: sequentially store the top 4 followers for a...z, and if a letter has fewer than four followers, pad with zeros.
# Suppose we encounter the top 1000 names in proportion to their census frequencies; then on average we need 21.92 bits per name:

# In[19]:

def nr_bits(codes):
    # 1 flag bit per code, plus log2(top) = 2 bits for a top-4 position or 5 bits for a letter
    return sum([1 + (math.log2(top) if t == 0 else 5) for t, nr in codes])


# In[20]:

df['enc'] = df['Name M'].map(lambda x: nr_bits(encode(x)))
(df['enc'] * df['Frequency (%) M']).sum()


# Robert would be encoded as:

# In[30]:

def to_bin(encoding):
    # Flag bit, then a zero-padded 2-bit top-4 position (type 0) or 5-bit letter index (type 1)
    return ''.join([str(code) + format(nr, '02b' if code == 0 else '05b') for code, nr in encoding])


# In[31]:

to_bin(encode('ROBERT'))


# ## Can we do better?
# If the top 1000 first names really cover 90%, we can also put those 1000 names in a fixed list. A name is then encoded as follows:
#
# * A single bit: 0 if the name is not in the list, 1 if it is
# * If in the list: 10 bits encoding a number < 1000 (2^10 = 1024)
# * If not in the list: 5 bits per letter
#
# The name list takes 5677 * 5 = 28385 bits to store. On average this takes about 10 bits per name.
#
# The break-even point is at roughly 2300 encoded names from the top 1000: the list costs 28385 - 520 = 27865 more bits to store than the 2-gram table, and saves roughly 22 - 10 = 12 bits per name.

# In[33]:

(28385-520)/12


# ## Can we do _even_ better?
#
# Probably the best idea is to mix the frequency-table (2-gram) approach with a fixed list of only the top 200 names, as these already cover 72% of all names. The frequency table is very small and also works for names outside the top 1000.

# In[284]:

df.sort_values(by='Frequency (%) M', ascending=False).head(200)['Frequency (%) M'].sum()


# This is left as an exercise for the reader.
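# As a rough sketch of what the mixed approach could save (under assumed parameters, not a worked-out scheme: a 1-bit flag followed by an 8-bit index, 2^8 = 256 >= 200, for a top-200 name, and the flag plus the 2-gram code otherwise), we can estimate the average bits per name:

# In[ ]:

# Sketch under the assumed layout: 1 flag bit + 8-bit index for a top-200 name,
# 1 flag bit + the 2-gram encoding (df['enc'] bits) otherwise.
top_200 = df.sort_values(by='Frequency (%) M', ascending=False).head(200)['Name M']
in_top_200 = df['Name M'].isin(top_200)
bits_per_name = (1 + 8) * in_top_200 + (1 + df['enc']) * ~in_top_200
(bits_per_name * df['Frequency (%) M']).sum()


# This estimate ignores the storage cost of the 200-name list and the 520-bit 2-gram table.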