#!/usr/bin/env python
# coding: utf-8

# Entropic Coding and Compression
# ===============================
#
# *Important:* Please read the [installation page](http://gpeyre.github.io/numerical-tours/installation_python/) for details about how to install the toolboxes.

# $\newcommand{\dotp}[2]{\langle #1, #2 \rangle}$
# $\newcommand{\enscond}[2]{\lbrace #1, #2 \rbrace}$
# $\newcommand{\pd}[2]{ \frac{ \partial #1}{\partial #2} }$
# $\newcommand{\umin}[1]{\underset{#1}{\min}\;}$
# $\newcommand{\umax}[1]{\underset{#1}{\max}\;}$
# $\newcommand{\uargmin}[1]{\underset{#1}{argmin}\;}$
# $\newcommand{\norm}[1]{\|#1\|}$
# $\newcommand{\abs}[1]{\left|#1\right|}$
# $\newcommand{\choice}[1]{ \left\{ \begin{array}{l} #1 \end{array} \right. }$
# $\newcommand{\pa}[1]{\left(#1\right)}$
# $\newcommand{\diag}[1]{{diag}\left( #1 \right)}$
# $\newcommand{\qandq}{\quad\text{and}\quad}$
# $\newcommand{\qwhereq}{\quad\text{where}\quad}$
# $\newcommand{\qifq}{ \quad \text{if} \quad }$
# $\newcommand{\qarrq}{ \quad \Longrightarrow \quad }$
# $\newcommand{\ZZ}{\mathbb{Z}}$
# $\newcommand{\CC}{\mathbb{C}}$
# $\newcommand{\RR}{\mathbb{R}}$
# $\newcommand{\EE}{\mathbb{E}}$
# $\newcommand{\Zz}{\mathcal{Z}}$
# $\newcommand{\Ww}{\mathcal{W}}$
# $\newcommand{\Vv}{\mathcal{V}}$
# $\newcommand{\Nn}{\mathcal{N}}$
# $\newcommand{\NN}{\mathcal{N}}$
# $\newcommand{\Hh}{\mathcal{H}}$
# $\newcommand{\Bb}{\mathcal{B}}$
# $\newcommand{\Ee}{\mathcal{E}}$
# $\newcommand{\Cc}{\mathcal{C}}$
# $\newcommand{\Gg}{\mathcal{G}}$
# $\newcommand{\Ss}{\mathcal{S}}$
# $\newcommand{\Pp}{\mathcal{P}}$
# $\newcommand{\Ff}{\mathcal{F}}$
# $\newcommand{\Xx}{\mathcal{X}}$
# $\newcommand{\Mm}{\mathcal{M}}$
# $\newcommand{\Ii}{\mathcal{I}}$
# $\newcommand{\Dd}{\mathcal{D}}$
# $\newcommand{\Ll}{\mathcal{L}}$
# $\newcommand{\Tt}{\mathcal{T}}$
# $\newcommand{\si}{\sigma}$
# $\newcommand{\al}{\alpha}$
# $\newcommand{\la}{\lambda}$
# $\newcommand{\ga}{\gamma}$
# $\newcommand{\Ga}{\Gamma}$
# $\newcommand{\La}{\Lambda}$
# $\newcommand{\Si}{\Sigma}$
# $\newcommand{\be}{\beta}$
# $\newcommand{\de}{\delta}$
# $\newcommand{\De}{\Delta}$
# $\newcommand{\phi}{\varphi}$
# $\newcommand{\th}{\theta}$
# $\newcommand{\om}{\omega}$
# $\newcommand{\Om}{\Omega}$

# This numerical tour studies source coding using entropic coders (Huffman and arithmetic).

# In[ ]:

from __future__ import division

import numpy as np
import scipy as scp
import pylab as pyl
import matplotlib.pyplot as plt

from nt_toolbox.general import *
from nt_toolbox.signal import *

get_ipython().run_line_magic('matplotlib', 'inline')
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

# Source Coding and Entropy
# -------------------------
# Entropic coding converts a vector $x$ of integers into a binary stream $y$.
# It exploits the redundancies in the statistical distribution of the entries
# of $x$ to reduce the size of $y$ as much as possible. The lower bound for the
# average number of bits per symbol of $y$ is the Shannon entropy of the
# distribution:
#
# $$ -\sum_i h(i)\log_2(h(i)), $$
#
# where $h(i)$ is the probability of occurrence of symbol $i$ in $x$.
#
# First we generate a simple binary signal $x$ in which the symbol $0$ appears
# with probability $p$ (the signal is stored with tokens $1$ and $2$, token $1$
# standing for symbol $0$).
#
# Probability of 0.

# In[2]:

p = 0.1

# Size.

# In[3]:

n = 512

# Signal, with tokens in {1, 2}.

# In[4]:

from numpy import random
x = (random.rand(n) > p) + 1
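# Before looking at the data, one can evaluate the Shannon bound in closed form
# for this binary source: no coder can use fewer than
# $-p\log_2(p)-(1-p)\log_2(1-p)$ bits per symbol on average. The cell below is a
# small sketch of our own (the helper binary_entropy is not part of the toolbox).

# In[ ]:

def binary_entropy(p):
    # entropy (in bits per symbol) of a binary source with P(symbol 0) = p,
    # assuming 0 < p < 1
    return -p*np.log2(p) - (1-p)*np.log2(1-p)

print("Shannon bound = %.3f bits/symbol" % binary_entropy(p))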
# One can check the probabilities by computing the empirical histogram.

# In[5]:

h = [np.sum(x == 1), np.sum(x == 2)]
h = h/np.sum(h)
print("Empirical p = %.2f" %h[0])

# We can compute the entropy of the distribution, represented as a vector $h$ of
# probabilities that should sum to 1. We take a max to avoid problems with null
# probabilities.

# In[6]:

e = - np.sum(h*np.log2([max(q, 1e-20) for q in h]))
print("Entropy = %.2f" %e)

# Huffman Coding
# --------------
# A Huffman code $C$ associates to each symbol $i$ in $\{1,...,m\}$ a binary code $C_i$
# whose length is as close as possible to the optimal bound
# $-\log_2\left(h(i)\right)$, where $h(i)$ is the probability of occurrence of
# the symbol $i$.
#
# We select a set of probabilities.

# In[7]:

h = [.1, .15, .4, .15, .2]

# The tree $T$ contains the codes and is generated by an iterative algorithm.
# The initial "tree" is a collection of empty trees, each pointing to a symbol number.

# In[8]:

m = len(h)
T = [0] * m  # create an empty tree

# We iteratively build the Huffman tree by grouping together the two trees that
# have the smallest probabilities. The merged tree has a probability which is
# the sum of the two selected probabilities.
#
# Initial probability.

# In[9]:

# we use the symbols i = 0,1,2,3,4 (as strings) with the associated probabilities h(i)
for i in range(m):
    T[i] = (h[i], str(i))

# Iterative merging of the leading probabilities.

# In[10]:

while len(T) > 1:
    T.sort(key=lambda node: node[0])  # sort according to the probabilities (first values of the tuples)
    t = tuple(T[:2])
    q = T[0][0] + T[1][0]
    T = T[2:] + [(q, t)]

# We trim the computed tree by removing the probabilities.

# In[11]:

def trim(T):
    T0 = T[1]
    if type(T0) == str:
        return T0
    else:
        return (trim(T0[0]), trim(T0[1]))

T = trim(T[0])

# We display $T$ using the ete3 package (install it from the terminal with "pip install ete3").

# In[12]:

from ete3 import Tree, TreeStyle, NodeStyle, AttrFace, faces

t = Tree(str(T) + ";")
ts = TreeStyle()
#ts.rotation = 90
ts.scale = 90
ts.branch_vertical_margin = 50  # vertical space between adjacent branches
ts.show_leaf_name = False

nstyle = NodeStyle()
nstyle["shape"] = "sphere"
nstyle["size"] = 20
nstyle["fgcolor"] = "blue"
for n in t.traverse():
    n.set_style(nstyle)
for node in t.iter_leaves():
    node.add_face(AttrFace("name", fsize=20), column=0)

t.render("%%inline", tree_style=ts)

# Once the tree $T$ is computed, one can compute the code $C_{i}$ associated to
# each symbol $i$. This requires performing a depth-first search in the tree and
# recording the branch choices (0 for left, 1 for right) down to each leaf.

# In[13]:

codes = {}

def huffman_gencode(T, codes, c):
    if type(T) == str:  # test if T is a leaf
        codes[T] = c
    else:
        huffman_gencode(T[0], codes, c + "0")
        huffman_gencode(T[1], codes, c + "1")

huffman_gencode(T, codes, "")

# Display the code.

# In[14]:

for e in codes:
    print("Code of token " + e + ": " + codes[e])

# We draw a vector $x$ according to the distribution $h$.
#
# Size of the signal.

# In[15]:

n = 1024

# Randomization.

# In[16]:

from numpy import random

def rand_discr(p, m=1):
    """
    rand_discr - discrete random generator

    y = rand_discr(p, m)

    y is a random vector of length m drawn from a variable X
    such that p(i) = Prob(X = i).

    Copyright (c) 2004 Gabriel Peyré
    """
    # make sure the probabilities sum to 1
    p = p/np.sum(p)
    k = len(p)
    coin = random.rand(m)
    cumprob = np.append(0, np.cumsum(p))
    sample = np.zeros(m)
    for j in range(k):
        ind = (coin > cumprob[j]) & (coin <= cumprob[j+1])
        sample[ind] = j
    return sample

x = rand_discr(h, n)

# __Exercise 1__
#
# Implement the coding of the vector $x$ to obtain a binary vector $y$, which
# corresponds to replacing each symbol $x(i)$ by the code $C_{x(i)}$.
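# Before loading the provided solution, here is a minimal sketch of one possible
# encoding (variable names are ours; the official solution is loaded in the next
# cell and may differ): each symbol $x(i)$ is replaced by its code $C_{x(i)}$ and
# the codewords are concatenated into a single bit string $y$.

# In[ ]:

# concatenate the codewords of the successive symbols of x
# (x contains the float values 0.,...,4., which index the string keys of codes)
y = "".join(codes[str(int(s))] for s in x)
print("Length of the code: %d bits" % len(y))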
# In[17]:

get_ipython().run_line_magic('run', '-i nt_solutions/coding_2_entropic/exo1')

# In[18]:

## Insert your code here.

# Compare the length of the code with the entropy bound.

# In[19]:

e = - np.sum(h*np.log2([max(q, 1e-20) for q in h]))
print("Entropy bound = %.2f" %(n*e))
print("Huffman code  = %d" %len(y))

# Decoding is more complicated, since it requires iteratively traversing the tree $T$.
#
# Initial empty decoded stream.

# In[20]:

x1 = []

# Perform decoding.

# In[21]:

T0 = T
for e in y:
    if e == '0':
        T0 = T0[0]
    else:
        T0 = T0[1]
    if type(T0) == str:  # a leaf has been reached: output the symbol
        x1 += T0
        T0 = T

# We test if the decoding is correct.

# In[22]:

from numpy import linalg
err = linalg.norm(np.subtract(x, [float(e) for e in x1]))
print("Error (should be zero): %f" %err)

# Huffman Block Coding
# --------------------
# A Huffman coder is inefficient because it can only distribute an integer
# number of bits per symbol. In particular, distributions where one of the
# symbols has a large probability are not well coded using a Huffman code.
# This can be alleviated by replacing the set of $m$ symbols by $m^q$
# symbols obtained by packing the symbols into blocks of $q$ (here we use $m=2$
# for a binary alphabet). This breaks symbols with large probability into many
# symbols with smaller probability, thus approaching the Shannon entropy bound.
#
# Generate a binary vector with a high probability of having 1, so that the
# Huffman code is not very efficient (far from the Shannon bound).
#
# Probability of having 0.

# In[23]:

t = .12

# Probability distribution.

# In[24]:

h = [t, 1-t]

# Generate signal.

# In[25]:

from numpy import random
n = 4096*2
x = (random.rand(n) > t) + 1

# For blocks of length $q=3$, create a new vector by coding each block
# with an integer in $\{0,\dots,m^q-1\}$ (here $m=2$, so there are $2^3=8$ block
# symbols). The new length of the vector is $n_1/q$ where $n_1=\lceil n/q\rceil q$.
#
# Block size.

# In[26]:

q = 3

# Maximum token value.

# In[27]:

m = 2

# New size.

# In[28]:

n1 = (n//q+1)*q

# New vector.

# In[29]:

x1 = np.zeros(n1)
x1[:len(x)] = x
x1[len(x):] = 1
x1 = x1 - 1  # tokens are now 0/1
x2 = []
for i in range(0, n1, q):
    mult = [m**j for j in range(q)]
    x2.append(sum(x1[i:i+q]*mult))

# We generate the probability table $H$ of $x_1$ that gives the probability of
# each new block symbol in $\{0,\dots,m^q-1\}$.

# In[30]:

H = h
for i in range(q-1):
    Hold = H
    H = []
    for j in range(len(h)):
        H = H + [e*h[j] for e in Hold]

# A simpler way to compute this block histogram is to use the Kronecker product.

# In[31]:

H = h
for i in range(1, q):
    H = np.kron(H, h)

# __Exercise 2__
#
# For various values of the block size $q$, perform the Huffman coding and
# compute the length of the code. Compare with the entropy lower bound.

# In[32]:

get_ipython().run_line_magic('run', '-i nt_solutions/coding_2_entropic/exo2')

# In[33]:

## Insert your code here.
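# The following cell is a possible sketch for this exercise (our own variable
# names; the provided solution loaded above may proceed differently). It reuses
# the trim and huffman_gencode functions defined earlier: for each block size q,
# it builds the block distribution by Kronecker powers of h, computes a Huffman
# code over the m^q block symbols, encodes the signal, and compares the number
# of bits per original symbol with the entropy of the source.

# In[ ]:

def build_huffman_tree(probs):
    # iterative merging of the two trees with the smallest probabilities,
    # as in the construction above
    T = [(probs[i], str(i)) for i in range(len(probs))]
    while len(T) > 1:
        T.sort(key=lambda node: node[0])
        s = T[0][0] + T[1][0]
        T = T[2:] + [(s, tuple(T[:2]))]
    return trim(T[0])

ent = -np.sum(h*np.log2(h))  # entropy of the binary source, in bits per symbol
for q in range(1, 8):
    # distribution of the m^q block symbols
    H = h
    for i in range(1, q):
        H = np.kron(H, h)
    # Huffman code for the block symbols
    codes_q = {}
    huffman_gencode(build_huffman_tree(H), codes_q, "")
    # pad the binary signal (tokens 0/1) so that its length is a multiple of q
    n1 = (n//q + 1)*q
    xq = np.zeros(n1)
    xq[:n] = x - 1
    # map each block of q bits to a symbol in {0, ..., m^q - 1}
    mult = [m**j for j in range(q)]
    blocks = [int(np.sum(xq[i:i+q]*mult)) for i in range(0, n1, q)]
    # total number of bits of the block-Huffman code
    nb_bits = np.sum([len(codes_q[str(b)]) for b in blocks])
    print("Block size %d: %.3f bits/symbol (entropy = %.3f)" % (q, nb_bits/n, ent))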