In the context of a corpus, the term "monad" refers to a unit of textual analysis or a discrete linguistic unit. In this Text-Fabric dataset, it denotes the sequence number of an individual word within the entire corpus (the New Testament).
Hence, in the tf file the monad values should increase by exactly one from each word to the next, starting at 1. This script checks that.
import os
# Relative path to the 'monad' tf file whose values will be checked.
# Being relative, it is resolved against the current working directory at run time.
FileToCheck = "..//tf//0.4//monad.tf"
def process_file(file_path):
    """Check that the monad values in a tf file increase by exactly one.

    Lines starting with "@" and blank lines are skipped. Every remaining
    line is parsed as an integer. The first value must be 1 and each
    following value must be one more than the previous one. Scanning stops
    at the first value that breaks the sequence.

    Progress is shown by printing a dot for every value that is a multiple
    of 10000; the verdict is printed once scanning has finished.

    Args:
        file_path: Path of the tf file to check.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a data line is not a plain integer.
    """
    previous_monad = 0
    result = "Filecontent is OK (i.e. sequential)"
    with open(file_path, "r", encoding="utf-8") as file:
        print("Analyzing: ", end='', flush=True)
        for line in file:
            if line.startswith("@"):
                continue  # Skip lines that start with "@"
            value_text = line.strip()
            if not value_text:
                continue  # Skip blank (empty or whitespace-only) lines
            current_monad = int(value_text)
            if current_monad % 10000 == 0:
                # Progress indicator: one dot per 10000th monad value
                print(".", end='', flush=True)
            if current_monad - 1 != previous_monad:
                # Use an f-string: concatenating the int values onto a str
                # raises TypeError instead of reporting the problem.
                result = (
                    f"Found something wrong. Monad {current_monad} "
                    f"after {previous_monad}"
                )
                break
            previous_monad = current_monad
    print("\n", result)
# Script entry: confirm the monad file is present, then validate its content.
if not os.path.exists(FileToCheck):
    # Nothing to analyse; report the missing file and stop.
    print(f"Could not find file {FileToCheck}.")
else:
    print(f"The file {FileToCheck} exist and will be checked")
    process_file(FileToCheck)
The file ..//tf//0.4//monad.tf exist and will be checked Analyzing: ............. Filecontent is OK (i.e. sequential)