#!/usr/bin/env python
# coding: utf-8
"""Tabulate clause rules and category transitions in N1904 (GBI) treebank XML.

The script reads per-book XML files from ``InputDir`` and prints:

* the ``Rule`` of every clause-level (``Cat == "CL"``) node in the Pauline
  books,
* parent-category -> child-category transition tables for the Johannine
  letters, with per-parent relative weights,
* a frequency table of clause-level rules over all books.

All file access happens under ``if __name__ == "__main__":`` so the helper
functions can be imported and tested without the data files being present.
"""

# # Calculate V, S, O order (N1904GBI)

# ## Table of contents
# * 1 - Introduction
# * 2 - Create sum of orders
# * 3 - References

# # 1 - Introduction
# ##### [Back to TOC](#TOC)

# Investigating the order of various clausal parts (e.g. V,S,O).
#
#
# Testing dataset: N1904 treebank (GBI)
#

# # 2 - Create sum of orders
# ##### [Back to TOC](#TOC)

# In[5]:

import os
import pickle
import re  # used for regular expressions
import sys
import time
import xml.etree.ElementTree as ET
from collections import Counter
from collections import defaultdict
from os import listdir
from os.path import isfile, join

import pandas as pd


# In[7]:

BaseDir = 'C:\\Users\\tonyj\\my_new_Jupyter_folder\\test_of_xml_etree\\'
InputDir = BaseDir+'inputfiles\\'


# Several sets of books are defined here so that variation between them can
# be determined.

# In[17]:

booklist = ['01-matthew', '02-mark', '03-luke', '04-john', '05-acts',
            '06-romans', '07-1corinthians', '08-2corinthians', '09-galatians',
            '10-ephesians', '11-philippians', '12-colossians',
            '13-1thessalonians', '14-2thessalonians', '15-1timothy',
            '16-2timothy', '17-titus', '18-philemon', '19-hebrews', '20-james',
            '21-1peter', '22-2peter', '23-1john', '24-2john', '25-3john',
            '26-jude', '27-revelation']
paullist = ['06-romans', '07-1corinthians', '08-2corinthians', '09-galatians',
            '10-ephesians', '11-philippians', '12-colossians',
            '13-1thessalonians', '14-2thessalonians', '15-1timothy',
            '16-2timothy', '17-titus', '18-philemon']
peterlist = ['21-1peter', '22-2peter']
lukelist = ['03-luke', '05-acts']
johnlist = ['23-1john', '24-2john', '25-3john']

# Placeholder kept from the original notebook cell (a list holding a literal
# Ellipsis); it is not used anywhere below.
orderlist = [...]


def addParentInfo(parent, element):
    """Record a parent reference on the descendants of *element*.

    Every direct child of *element* gets ``attrib['parent'] = parent``; the
    function then recurses with the child passed as both arguments, so all
    deeper descendants receive their real parent element.  When called as
    ``addParentInfo(None, root)`` the direct children of the root are
    therefore tagged with ``None``.

    NOTE: this stores Element objects (not strings) in ``attrib``, so the
    annotated tree is meant for in-memory traversal only.
    """
    for child in element:
        child.attrib['parent'] = parent
        addParentInfo(child, child)


def getParent(element):
    """Return the parent stored by ``addParentInfo``, or ``None`` if absent."""
    return element.attrib.get('parent')


def iter_tree_nodes(root):
    """Yield the ``Node`` descendants of every ``Tree`` element below *root*.

    Nodes are produced in the same order as the nested
    ``findall('.//Tree')`` / ``findall('.//Node')`` loops would visit them.
    """
    for tree_element in root.findall('.//Tree'):
        yield from tree_element.findall('.//Node')


def print_clause_rules(root):
    """Print ``Cat`` and ``Rule`` of every node whose ``Cat`` is ``"CL"``."""
    for node in iter_tree_nodes(root):
        node_cat = node.get('Cat')
        if node_cat == "CL":
            print(node_cat, node.get('Rule'))


def count_transitions(root):
    """Count (parent category, node category) transitions below *root*.

    The tree is first annotated with ``addParentInfo(None, root)``.  A node
    without child elements is reported with the category ``'Term'``; any
    other node uses its ``Cat`` attribute.  Nodes whose parent has no
    ``Cat`` attribute are skipped when the node itself has a category (in
    the test data this is the case for a node placed directly under a
    ``Tree`` element).

    Returns:
        A ``Counter`` keyed by ``(parent_cat, node_cat)`` tuples, in order of
        first appearance.
    """
    counts = Counter()
    addParentInfo(None, root)
    for node in iter_tree_nodes(root):
        # len(node) is the number of child elements; leaves become 'Term'.
        node_cat = node.get('Cat') if len(node) else 'Term'
        parent_node = getParent(node)
        if parent_node is None:
            continue
        parent_cat = parent_node.get('Cat')
        if parent_cat is None and node_cat is not None:
            # The original cell assigned "Start" here and then immediately
            # `continue`d, so such transitions were never counted; that
            # effective behaviour is preserved.
            continue
        counts[(parent_cat, node_cat)] += 1
    return counts


def group_transitions(transition_frequencies):
    """Group transition counts by their starting ('from') category.

    Returns:
        A dict mapping each 'from' value to a list of
        ``(from_value, to_value, frequency)`` tuples in insertion order.
    """
    grouped = {}
    for (from_value, to_value), frequency in transition_frequencies.items():
        grouped.setdefault(from_value, []).append(
            (from_value, to_value, frequency))
    return grouped


def transition_table_rows(transitions):
    """Return table rows for one starting condition, most frequent first.

    Args:
        transitions: list of ``(from_value, to_value, frequency)`` tuples that
            share the same 'from' value.

    Returns:
        A list of ``(from_value, to_value, frequency, weight)`` tuples where
        ``weight`` is ``frequency`` divided by the sum of all frequencies in
        *transitions*, so the weights of one table sum to 1.
    """
    ordered = sorted(transitions, key=lambda row: row[2], reverse=True)
    total_occurrences = sum(occurrence for _, _, occurrence in ordered)
    return [(from_val, to_val, frequency, frequency / total_occurrences)
            for from_val, to_val, frequency in ordered]


def print_transition_tables(grouped_transitions):
    """Print one tab-separated transition table per starting condition."""
    for from_value, transitions in grouped_transitions.items():
        print(f"Transition table for starting condition: {from_value}")
        print("From\tTo\tOcc.\tWeigth")
        for from_val, to_val, frequency, average_occurrence in \
                transition_table_rows(transitions):
            print(f'{from_val}\t{to_val}\t{frequency}\t{average_occurrence:.4}')
        print('\n')


def count_clause_rules(root):
    """Count the ``Rule`` values of clause-level nodes below *root*.

    Only nodes with ``Cat == "CL"`` are considered, and rules containing
    ``'CL'`` or ``'Cl'`` are left out.  Nodes without a ``Rule`` attribute
    are skipped; previously they raised ``TypeError`` in the substring test.

    Returns:
        A ``Counter`` mapping rule string to number of occurrences.
    """
    counts = Counter()
    for node in iter_tree_nodes(root):
        if node.get('Cat') != "CL":
            continue
        node_rule = node.get('Rule')
        if node_rule is None:
            continue
        if 'CL' in node_rule or 'Cl' in node_rule:
            continue
        counts[node_rule] += 1
    return counts


# The guard is also true inside Jupyter, so the result names below remain
# available as globals there, exactly as in the original notebook cells.
if __name__ == "__main__":

    # In[7]:

    # Parsed as in the original notebook cell; the result is replaced by the
    # loops further down.
    bo = '26-jude'
    InputFile = os.path.join(InputDir, f'{bo}.xml')
    tree = ET.parse(InputFile)
    root = tree.getroot()

    # In[ ]:

    # Print the rule of every clause-level node in the Pauline books.
    for bo in paullist:
        InputFile = os.path.join(InputDir, f'{bo}.xml')
        print (f'Reading file {InputFile}')
        tree = ET.parse(InputFile)
        root = tree.getroot()
        print_clause_rules(root)

    # In[ ]:

    # Averages for each separate transition (i.e. all rules sum up to p=1 per
    # starting condition).

    # Transition frequencies keyed by ('from', 'to')
    transition_frequencies = Counter()

    print('loading books ',end='')
    for bo in johnlist:
        InputFile = os.path.join(InputDir, f'{bo}.xml')
        print ('.',end='')
        tree = ET.parse(InputFile)
        root = tree.getroot()
        transition_frequencies.update(count_transitions(root))

    # Every counted transition contributes exactly one to this total.
    total_transitions = sum(transition_frequencies.values())
    print (f'\nFinished\tNumber of transitions: {total_transitions}\n')

    # Transitions grouped by their 'from' value, then one table per group.
    grouped_transitions = group_transitions(transition_frequencies)
    print_transition_tables(grouped_transitions)

    # In[26]:

    # Frequencies of clause-level rules over all books.
    rule_frequencies = Counter()

    print('Reading the inputfiles ',end='')
    for bo in booklist:
        InputFile = os.path.join(InputDir, f'{bo}.xml')
        print('.',end='')
        tree = ET.parse(InputFile)
        root = tree.getroot()
        rule_frequencies.update(count_clause_rules(root))

    # Print the table of frequencies
    print("\n\nFrequency Table:")
    print("{:<20} {:<10}".format("Node Rule", "Frequency"))
    print("-" * 30)

    # Sort the table by frequency in descending order (ties keep the order in
    # which the rules were first seen).
    sorted_frequencies = sorted(rule_frequencies.items(),
                                key=lambda item: item[1], reverse=True)

    for rule, frequency in sorted_frequencies:
        print("{:<20} {:<10}".format(rule, frequency))


# # 3 - References
# ##### [Back to TOC](#TOC)

# #### Footnotes:
#
# 1 Porter, Stanley E. "Greek Word Order, Still an Unexplored Area in New Testament Studies?" in Stanley E. Porter, *Linguistic Analysis of the Greek New Testament, Studies in Tools, Methods, and Practices* (Grand Rapids: Baker Academic, 2015), 347-363.

# In[ ]: