#!/usr/bin/env python
# coding: utf-8

# # Differences between 'word' and 'normalized'

# ## Table of contents
# * 1 - Introduction
# * 2 - Load Text-Fabric app and data
# * 3 - Performing the queries

# # 1 - Introduction
#
# Jupyter notebook to investigate the differences between the features
# 'word' and 'normalized'.

# # 2 - Load Text-Fabric app and data
# ##### [Back to TOC](#TOC)

# In[1]:

# IPython-only: reload edited modules automatically before each cell runs.
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')


# In[2]:

# Loading the Text-Fabric code
# Note: it is assumed Text-Fabric is installed in your environment.
from tf.fabric import Fabric
from tf.app import use


# In[3]:

# Load the app and data.
# hoist=globals() is what makes the feature API `F` used below available
# as a global name in this notebook.
N1904 = use("tonyjurg/Nestle1904GBI:latest", hoist=globals())


# # 3 - Performing the queries
# ##### [Back to TOC](#TOC)

# In[4]:

# Search template: name each word node `a`, then require that its 'word'
# feature differs from its own 'normalized' feature.
Differences = '''
a:word
a .word#normalized. a
'''
DifferencesList = N1904.search(Differences)


# In[5]:

from collections import Counter

# Library to format table
from tabulate import tabulate

# Count how often each distinct "word -> normalized" change occurs.
# Every search result is a tuple whose first member is the matched word node.
ResultDict = Counter(
    F.word.v(Difference[0]) + " -> " + F.normalized.v(Difference[0])
    for Difference in DifferencesList
)

# Convert the counts into table rows, most frequent change first.
# most_common() is a stable descending sort, so changes with equal
# frequency stay in order of first appearance.
UnsortedTableData = [[key, value] for key, value in ResultDict.items()]
TableData = [[key, value] for key, value in ResultDict.most_common()]

# Produce the table
headers = ["word -> normalized", "frequency"]
print(tabulate(TableData, headers=headers, tablefmt='fancy_grid'))


# In[ ]: