#!/usr/bin/env python
# coding: utf-8
# # Differences between 'word' and 'normalized'
# ## Table of contents
# * 1 - Introduction
# * 2 - Load Text-Fabric app and data
# * 3 - Performing the queries
# # 1 - Introduction
#
# Jupyter notebook to investigate the differences between the features 'word' and 'normalized'.
# # 2 - Load Text-Fabric app and data
# ##### [Back to TOC](#TOC)
# In[1]:
# Load IPython's autoreload extension and switch it to mode 2, using the
# programmatic equivalents of `%load_ext autoreload` and `%autoreload 2`.
_ipython_shell = get_ipython()
_ipython_shell.run_line_magic('load_ext', 'autoreload')
_ipython_shell.run_line_magic('autoreload', '2')
# In[2]:
# Loading the Text-Fabric code
# Note: it is assumed Text-Fabric is installed in your environment.
from tf.fabric import Fabric
from tf.app import use
# In[3]:
# Load the Text-Fabric app and data for the `tonyjurg/Nestle1904GBI` dataset
# (the `:latest` suffix presumably selects the most recent release -- verify).
# NOTE(review): `hoist=globals()` appears to be what makes the feature API
# handle `F`, used further down for feature lookups, available as a global
# name -- confirm against the Text-Fabric documentation.
N1904 = use ("tonyjurg/Nestle1904GBI:latest", hoist=globals())
# # 3 - Performing the queries
# ##### [Back to TOC](#TOC)
# In[4]:
# Search template: the first line names a node of type 'word' as `a`; the
# second line relates `a` to itself and, presumably via Text-Fabric's `.f#g.`
# feature-comparison relation, keeps only nodes whose 'word' feature value
# differs from their 'normalized' feature value -- verify against the
# Text-Fabric search documentation.
Differences = '''
a:word
a .word#normalized. a
'''
# Run the template; the results are iterated further down, one entry per match.
DifferencesList = N1904.search(Differences)
# In[5]:
# Tally how often each distinct "word -> normalized" change occurs among the
# search results and print the tallies as a table, most frequent change first.
from collections import Counter

# Library to format table
from tabulate import tabulate

# Counter supplies the "start at 0, then increment" bookkeeping, so no
# explicit membership test is needed.
ResultDict = Counter()
for Difference in DifferencesList:
    # The first member of each search result is the node whose two feature
    # values are looked up below.
    node = Difference[0]
    Change = F.word.v(node) + " -> " + F.normalized.v(node)
    ResultDict[Change] += 1

# Convert the counts into [change, frequency] rows and sort them by descending
# frequency; the sort is stable, so equally frequent changes keep the order in
# which they were first encountered.
UnsortedTableData = [[key, value] for key, value in ResultDict.items()]
TableData = sorted(UnsortedTableData, key=lambda row: row[1], reverse=True)

# Produce the table
headers = ["word -> normalized", "frequency"]
print(tabulate(TableData, headers=headers, tablefmt='fancy_grid'))
# In[ ]: