#!/usr/bin/env python
# coding: utf-8
# # Calculate V, S, O order (N1904GBI)
# ## Table of contents
# * 1 - Introduction
# * 2 - Create sum of orders
# * 3 - References
# # 1 - Introduction
# ##### [Back to TOC](#TOC)
# Investigating the order of various clausal parts (e.g. V,S,O).
#
#
# Testing dataset: N1904 treebank (GBI)
#
# # 2 - Create sum of orders
# ##### [Back to TOC](#TOC)
# In[5]:
import pandas as pd
import sys
import os
import time
import pickle
import re # used for regular expressions
from os import listdir
from os.path import isfile, join
import xml.etree.ElementTree as ET
# In[7]:
# Directory layout: the XML input files live in the 'inputfiles' folder
# directly below the base directory.
BaseDir = 'C:\\Users\\tonyj\\my_new_Jupyter_folder\\test_of_xml_etree\\'
InputDir = BaseDir + 'inputfiles\\'

# Parse the XML file of a single book (Jude) and keep both the parsed
# document and its root element.
bo = '26-jude'
InputFile = os.path.join(InputDir, bo + '.xml')
tree = ET.parse(InputFile)
root = tree.getroot()

# Dictionary to store transition frequencies
transition_frequencies = dict()
# Multiple sets of books are defined here, so that variations between the sets can be determined.
# In[17]:
# All books; each entry is the base name (without '.xml') of one input file.
booklist = [
    '01-matthew',
    '02-mark',
    '03-luke',
    '04-john',
    '05-acts',
    '06-romans',
    '07-1corinthians',
    '08-2corinthians',
    '09-galatians',
    '10-ephesians',
    '11-philippians',
    '12-colossians',
    '13-1thessalonians',
    '14-2thessalonians',
    '15-1timothy',
    '16-2timothy',
    '17-titus',
    '18-philemon',
    '19-hebrews',
    '20-james',
    '21-1peter',
    '22-2peter',
    '23-1john',
    '24-2john',
    '25-3john',
    '26-jude',
    '27-revelation',
]

# Subset: Romans up to and including Philemon.
paullist = [
    '06-romans',
    '07-1corinthians',
    '08-2corinthians',
    '09-galatians',
    '10-ephesians',
    '11-philippians',
    '12-colossians',
    '13-1thessalonians',
    '14-2thessalonians',
    '15-1timothy',
    '16-2timothy',
    '17-titus',
    '18-philemon',
]

# Subset: 1 and 2 Peter.
peterlist = [
    '21-1peter',
    '22-2peter',
]

# Subset: Luke and Acts.
lukelist = [
    '03-luke',
    '05-acts',
]

# Subset: 1, 2 and 3 John.
johnlist = [
    '23-1john',
    '24-2john',
    '25-3john',
]
# In[ ]:
import xml.etree.ElementTree as ET
import re
# Reset the bookkeeping variables (they are not filled by this cell).
# Dictionary to store transition frequencies
transition_frequencies = {}
total_transitions = 0
# Dictionary to store grouped transitions
grouped_transitions = {}

# Compiled once, outside the loop, because it does not depend on the book.
# NOTE(review): this pattern is not used by the code below.
pattern = re.compile(r'-')

# For every book in paullist: print the 'Cat' and 'Rule' attributes of each
# node whose 'Cat' attribute equals "CL".
for bo in paullist:
    InputFile = os.path.join(InputDir, f'{bo}.xml')
    print (f'Reading file {InputFile}')
    # Load the XML file
    tree = ET.parse(InputFile)
    root = tree.getroot()
    # Iterate over 'Tree' elements; a separate loop variable keeps 'tree'
    # bound to the parsed document instead of shadowing it.
    for tree_element in root.findall('.//Tree'):
        # Iterate over all 'Node' descendants of the current 'Tree' element
        for node in tree_element.findall('.//Node'):
            node_cat = node.get('Cat')
            node_rule = node.get('Rule')
            if node_cat == "CL":
                print (node_cat,node_rule)
# In[ ]:
# Averages for each separate transition (i.e. all rules sum up to p=1 per starting condition)
import xml.etree.ElementTree as ET
def addParentInfo(parent, element):
    """Store a reference to the parent on every descendant of *element*.

    The direct children of *element* get ``attrib['parent'] = parent`` (the
    value passed in, which may be ``None``); every deeper descendant gets its
    actual parent element. *element* itself is not annotated.

    The stored value is an object rather than a string, so the annotation is
    intended for in-memory navigation of the tree.

    The tree is walked with an explicit stack instead of recursion, so deeply
    nested documents cannot exceed Python's recursion limit.

    Args:
        parent: Value to store as 'parent' on the direct children of *element*.
        element: Element whose descendants are annotated in place.
    """
    # Each entry pairs the value to store with the element whose children
    # should receive it.
    pending = [(parent, element)]
    while pending:
        owner, current = pending.pop()
        for child in current:
            child.attrib['parent'] = owner
            # The children of 'child' must point back at 'child' itself.
            pending.append((child, child))
def getParent(element):
    """Return the value stored under the 'parent' attribute of *element*.

    Args:
        element: Element to inspect.

    Returns:
        The stored 'parent' value, or ``None`` when *element* has no 'parent'
        attribute.
    """
    return element.attrib.get('parent')
# Dictionary to store transition frequencies, keyed by (parent category, node category)
transition_frequencies = {}
total_transitions = 0
# Dictionary to store transitions grouped by their 'from' value
grouped_transitions = {}

print('loading books ',end='')
for bo in johnlist:
    InputFile = os.path.join(InputDir, f'{bo}.xml')
    print ('.',end='')
    # Load the XML file
    tree = ET.parse(InputFile)
    root = tree.getroot()
    # Add 'parent' attribute to each child element
    addParentInfo(None, root)
    # Iterate over 'Tree' elements; a separate loop variable keeps 'tree'
    # bound to the parsed document instead of shadowing it.
    for tree_element in root.findall('.//Tree'):
        # Iterate over all 'Node' descendants of the current 'Tree' element
        for node in tree_element.findall('.//Node'):
            # A node without child elements is recorded as 'Term';
            # otherwise its 'Cat' attribute is used.
            has_children = len(node) > 0
            node_cat = node.get('Cat') if has_children else 'Term'
            parent_node = getParent(node)
            # Nodes without a recorded parent contribute no transition
            if parent_node is None:
                continue
            parent_cat = parent_node.get('Cat')
            # A parent without a 'Cat' attribute marks the start of a tree;
            # these transitions are left out of the tables.
            if parent_cat is None and node_cat is not None:
                continue
            # Combine parent and current category to form the transition
            transition = (parent_cat, node_cat)
            # Update the frequency count in the dictionary
            total_transitions += 1
            transition_frequencies[transition] = transition_frequencies.get(transition, 0) + 1
print (f'\nFinished\tNumber of transitions: {total_transitions}\n')

# Group transitions by their 'from' value
for (from_value, to_value), frequency in transition_frequencies.items():
    grouped_transitions.setdefault(from_value, []).append((from_value, to_value, frequency))

# Print a separate table for each 'from' value
for from_value, transitions in grouped_transitions.items():
    print(f"Transition table for starting condition: {from_value}")
    print("From\tTo\tOcc.\tWeigth")
    # Sort transitions based on frequency in descending order
    sorted_transitions = sorted(transitions, key=lambda x: x[2], reverse=True)
    # Total occurrences within this table; the weights of one table sum to 1
    total_occurrences = sum(occurrence for _, _, occurrence in sorted_transitions)
    for from_val, to_val, frequency in sorted_transitions:
        # Share of this transition among all transitions with the same 'from' value
        average_occurrence = frequency / total_occurrences
        print(f'{from_val}\t{to_val}\t{frequency}\t{average_occurrence:.4}')
    print('\n')
# In[26]:
import os
import xml.etree.ElementTree as ET
import re
from collections import defaultdict
# Placeholder list; it is not used by the code below.
orderlist = [...]

# Dictionary mapping each counted rule to its number of occurrences
rule_frequencies = defaultdict(int)

print('Reading the inputfiles ',end='')
for bo in booklist:
    InputFile = os.path.join(InputDir, f'{bo}.xml')
    print('.',end='')
    # Load the XML file
    tree = ET.parse(InputFile)
    root = tree.getroot()
    # Iterate over 'Tree' elements
    for tree_element in root.findall('.//Tree'):
        # Iterate over all 'Node' descendants of the current 'Tree' element
        for node in tree_element.findall('.//Node'):
            node_cat = node.get('Cat')
            node_rule = node.get('Rule')
            # Only nodes with Cat == "CL" are counted. A node without a
            # 'Rule' attribute has nothing to tally (and would break the
            # substring tests below), so it is skipped as well.
            if node_cat != "CL" or node_rule is None:
                continue
            # Rules that mention 'CL' or 'Cl' are left out of the table
            if 'CL' in node_rule or 'Cl' in node_rule:
                continue
            # Update the frequency in the dictionary
            rule_frequencies[node_rule] += 1

# Print the table of frequencies
print("\n\nFrequency Table:")
print(f"{'Node Rule':<20} {'Frequency':<10}")
print("-" * 30)
# Sort the table by frequency in descending order
sorted_frequencies = sorted(rule_frequencies.items(), key=lambda x: x[1], reverse=True)
# Print the sorted table
for rule, frequency in sorted_frequencies:
    print(f"{rule:<20} {frequency:<10}")
# # 3 - References
# ##### [Back to TOC](#TOC)
# #### Footnotes:
#
# 1 Porter, Stanley. E. "Greek Word Order, Still an Unexplored Area in New Testament Studies?" in Stanley E. Porter, *Linguistic Analysis of the Greek New Testament, Studies in Tools, Methods, and Practices* (Grand Rapids: Baker Academic, 2015), 347-363.
# In[ ]: