#!/usr/bin/env python
# coding: utf-8

# # Compare class with morph where morph=ADV (N1904LFT)

# The following script reads the XML attributes class and morph of every w tag
# and compares them. It analyses the words whose morph attribute is 'adv'
# (case-insensitive) while the class attribute is not 'adv'. It first prints a
# number of examples (with verse/word location) and finishes with a table
# showing the frequency of the cases where morph=adv does not match class=adv.

# In[16]:


import os
import xml.etree.ElementTree as ET
from tabulate import tabulate

# Frequency table filled by compare_class_and_morph():
# "lemma=..., morph=..., class=..." -> number of occurrences.
ResultDict = {}


def compare_class_and_morph(file_path):
    """Record every <w> element whose morph is 'adv' but whose class is not.

    Parses the XML file at ``file_path`` and walks all ``w`` elements. For each
    element whose ``morph`` attribute equals 'adv' (compared case-insensitively)
    and whose ``class`` attribute differs from 'adv' (compared case-sensitively),
    the combination of lemma, morph and class is counted in the module-level
    ``ResultDict``. The first ``NumberExamples`` hits (a module-level setting,
    looked up when the function is called) are also printed together with
    their ``ref`` attribute.

    Args:
        file_path: Path of the XML file to analyse.

    Returns:
        None. Results accumulate in ``ResultDict``.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        OSError: If the file cannot be opened.
    """
    ExampleNumber = 0
    tree = ET.parse(file_path)
    root = tree.getroot()

    for w_tag in root.iter('w'):
        # Extract the attributes of interest; get() yields None when absent.
        class_attr = w_tag.get('class')
        morph_attr = w_tag.get('morph')
        lemma_attr = w_tag.get('lemma')
        ref_attr = w_tag.get('ref')

        # A <w> element without a morph attribute cannot be an adverb hit;
        # skipping it also avoids calling .lower() on None.
        if morph_attr is None:
            continue

        # Compare class and morph attributes
        if morph_attr.lower() == 'adv' and class_attr != 'adv':
            ExampleNumber += 1
            Mapping = f"lemma={lemma_attr}, morph={morph_attr}, class={class_attr}"
            # Start at 0 for a combination not seen before, then count this hit.
            ResultDict[Mapping] = ResultDict.get(Mapping, 0) + 1
            # Only the first NumberExamples hits are shown in full.
            if ExampleNumber <= NumberExamples:
                print(f"At ref={ref_attr} found class={class_attr} and morph={morph_attr} for lemma={lemma_attr}")
    return


# Following variable should contain the relative path and name of file to check
InputFile = "xml/20230628/01-matthew.xml"

# How many differences to show prior to the table
NumberExamples = 10

# First check if the file exists, then analyze its content
if os.path.exists(InputFile):
    print(f"Comparing atributes class morph for file {InputFile}\n\nResult:\n\n", end="")
    # The function returns nothing; its findings are collected in ResultDict.
    compare_class_and_morph(InputFile)

    # Convert the dictionary into a list of key-value pairs and sort it
    # according to frequency (most frequent first)
    UnsortedTableData = [[key, value] for key, value in ResultDict.items()]
    TableData = sorted(UnsortedTableData, key=lambda row: row[1], reverse=True)

    # Produce the table
    headers = ["lemma, morph, class", "frequency"]
    print(tabulate(TableData, headers=headers, tablefmt='fancy_grid'))
else:
    print(f"Could not find file {InputFile}.")


# In[ ]: