#!/usr/bin/env python
# coding: utf-8

# # Verify 'BOL' against 'LFT'

# In order for the BOL features to be compatible with the LFT Text-Fabric version, the node numbers for node type 'word' need to match exactly. This script checks this by comparing the feature `normalized` between the two datasets.

# In[71]:


# The following variables should contain the relative path and name of the two files to compare
LFTFile = "../tf/0.5/normalized.tf"
BOLFile = "BOL/normalized.tf"
targetWord = "Βιβλος"   # word to sync both files upon
# How many differences to show
NumberExamples = 10


# In[68]:


import os
import unicodedata

item1 = item2 = ''

def remove_accents(text):
    # Decompose to NFD and drop all combining marks (category 'Mn')
    return ''.join(c for c in unicodedata.normalize('NFD', text) if unicodedata.category(c) != 'Mn')

def compare_files(file1_path, file2_path):
    global targetWord
    global NumberExamples
    global item1
    global item2
    FoundDifferences = 0
    with open(file1_path, 'r', encoding='utf-8') as file1, open(file2_path, 'r', encoding='utf-8') as file2:
        # Skip part of file1 until the target word is found
        lineNumber1 = 0
        for line1 in file1:
            lineNumber1 += 1
            unaccentedWord = remove_accents(line1.strip())
            if targetWord in unaccentedWord:
                print('Starting at line', lineNumber1, 'in file 1 at:', repr(line1))
                break
        # Skip part of file2 until the target word is found
        lineNumber2 = 0
        for line2 in file2:
            lineNumber2 += 1
            unaccentedWord = remove_accents(line2.strip())
            if targetWord in unaccentedWord:
                print('Starting at line', lineNumber2, 'in file 2 at:', repr(line2))
                break
        monad = 0
        # Compare the remaining contents of both files
        for line1, line2 in zip(file1, file2):
            monad += 1
            if remove_accents(line1.strip()) != remove_accents(line2.strip()):
                print('mismatch at monad', monad, ':', repr(line1), 'versus', repr(line2))
                # store the mismatching lines for later inspection
                item1 = line1
                item2 = line2
                # stop after showing NumberExamples differences
                FoundDifferences += 1
                if FoundDifferences >= NumberExamples:
                    break
    print("Finished.")

# main part
# First check whether the files exist, then compare their content
if os.path.exists(LFTFile):
    if os.path.exists(BOLFile):
        print("Comparing file ", LFTFile, " with ", BOLFile, "\n\nResult:\n\n", end="")
        compare_files(LFTFile, BOLFile)
    else:
        print(f"Could not find file {BOLFile}.")
else:
    print(f"Could not find file {LFTFile}.")


# ## Check where this difference is found

# In[59]:


get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')


# In[60]:


# Loading the Text-Fabric code
# Note: it is assumed Text-Fabric is installed in your environment.
from tf.fabric import Fabric
from tf.app import use


# In[63]:


# Load the Nestle1904LFT app and data.
# Note: when two distinct Text-Fabric datasets are loaded, the option hoist=globals() should NOT be used,
# since the hoisted API objects (T, F, L, ...) of the second load would overwrite those of the first.
# Here only one dataset is loaded, so hoisting is safe and makes T available globally.
N1904GBI = use("tonyjurg/Nestle1904LFT", version='0.5', hoist=globals())


# In[67]:


T.sectionFromNode(83369)


# In[65]:


T.sectionTuple(83369)


# In[66]:


T.text(150868)


# ## Dig a little deeper

# In[69]:


for char in item1:
    print(f"Character: '{char}'\tUnicode Code Point: {ord(char)}")


# In[70]:


for char in item2:
    print(f"Character: '{char}'\tUnicode Code Point: {ord(char)}")


# Since the comparison is performed on the unaccented words, the problem appears to be the use of a different Unicode code point for the character θ.

# ## Other invisible differences between the tf files
#
# Differences with regard to special characters were also found between the tf files:
#

# In[ ]:
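

# The cell below is a later addition, not part of the original comparison run: a minimal sketch of one way to make such
# 'invisible' special-character differences explicit. It reuses the mismatching lines `item1` and `item2` stored by
# `compare_files()` above, checks whether they agree after NFKC compatibility normalization (which, for example, folds
# GREEK THETA SYMBOL U+03D1 into GREEK SMALL LETTER THETA U+03B8), and prints the Unicode name of every character
# position where the raw strings differ. The helper name `show_invisible_differences` is an illustrative choice, not an
# existing function in this repository.

# In[ ]:


import unicodedata
from itertools import zip_longest

def show_invisible_differences(a, b):
    # Report whether the two strings agree once compatibility normalization is applied
    sameNFKC = unicodedata.normalize('NFKC', a) == unicodedata.normalize('NFKC', b)
    print('Identical after NFKC normalization:', sameNFKC)
    # Walk both strings in parallel and name every character position that differs
    for position, (charA, charB) in enumerate(zip_longest(a, b, fillvalue='')):
        if charA != charB:
            nameA = unicodedata.name(charA, 'UNNAMED') if charA else 'MISSING'
            nameB = unicodedata.name(charB, 'UNNAMED') if charB else 'MISSING'
            print(f'position {position}: {charA!r} ({nameA})  versus  {charB!r} ({nameB})')

if item1 and item2:
    show_invisible_differences(item1.strip(), item2.strip())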