#!/usr/bin/env python
# coding: utf-8

# # Check feature monad
# In the context of a corpus, the term "monad" refers to a unit of textual
# analysis or a discrete linguistic unit. Specifically, in this Text-Fabric
# dataset, it is employed to denote the sequence number of an individual word
# within the entire corpus, specifically the New Testament.
#
# Hence, in the tf file the value for monad should nicely increment by one.
# This script checks this.

# In[9]:


import os

# Following variable should contain the relative path to the 'monad' tf file to check
FileToCheck = "..//tf//0.4//monad.tf"


def process_file(file_path: str) -> str:
    """Check that the monad values in a tf file increase by exactly one.

    Lines starting with "@" and blank (whitespace-only) lines are skipped.
    Every remaining line is parsed as an integer and compared with the
    previously accepted value; the first value is expected to be 1. The scan
    stops at the first value that is not exactly one higher than its
    predecessor.

    A progress dot is printed for every monad value that is a multiple of
    10000, and the final verdict is printed once the scan ends.

    Args:
        file_path: Path of the tf file to check.

    Returns:
        The verdict message that was printed: either the "OK" message or a
        description of the first out-of-sequence value.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a data line does not hold a plain integer.
    """
    previous_monad = 0
    result = "Filecontent is OK (i.e. sequential)"

    with open(file_path, "r", encoding="utf-8") as file:
        print("Analyzing: ", end="", flush=True)
        for line in file:
            # Skip lines that start with "@".
            if line.startswith("@"):
                continue
            # Skip empty lines; whitespace-only lines would otherwise make
            # int() fail even though they carry no value.
            if not line.strip():
                continue

            # int() ignores surrounding whitespace, including the newline.
            current_monad = int(line)

            # Print a dot without a new line for every 10000th monad value.
            if current_monad % 10000 == 0:
                print(".", end="", flush=True)

            if current_monad - 1 != previous_monad:
                # Use an f-string: the values are ints and cannot be joined
                # to str with "+".
                result = (
                    f"Found something wrong. Monad {current_monad} "
                    f"after {previous_monad}"
                )
                break
            previous_monad = current_monad

    print("\n", result)
    return result


# Main part
# First check if the file exists, then check its content
if os.path.exists(FileToCheck):
    print(f"The file {FileToCheck} exist and will be checked")
    process_file(FileToCheck)
else:
    print(f"Could not find file {FileToCheck}.")


# In[ ]: