#!/usr/bin/env python
# coding: utf-8

# # [Doc4TF](https://github.com/tonyjurg/Doc4TF)

# #### *automatic creation of feature documentation for existing Text-Fabric datasets*

# ## Table of Contents <a class="anchor" id="TOC"></a>
# * 1 - Introduction
# * 2 - Setting up the environment
# * 3 - Load Text-Fabric data
# * 4 - Creation of the dataset
#    * 4.1 - Setting up some production values
#    * 4.2 - Store data in dictionaries
#       * 4.2.1 - Get node types and their node ranges
#       * 4.2.2 - Determine which node types have specific features
#       * 4.2.3 - Create dictionary with description and value frequency per feature
# * 5 - Create the pages
#    * 5.1 - Create set of feature pages
#    * 5.2 - Create overview page
# * 6 - Licence

# # 1 - Introduction
# ##### [Back to TOC](#TOC)
#
# Ideally, a comprehensive documentation set is created as part of developing a Text-Fabric dataset. In practice, however, this is not always done during the initial phase or after changes to features. This Jupyter Notebook contains Python code to automatically generate (and thus ensure consistency of) a documentation set for any [Text-Fabric](https://github.com/annotation/text-fabric) dataset. It serves as a robust starting point for developing a brand-new documentation set, or as validation for an existing one. One major advantage is that the resulting documentation set is fully hyperlinked, a task that is laborious when done manually.
#
# The main steps in producing the documentation set are:
# * Load a Text-Fabric dataset
# * Execute the code in the subsequent cells. The code will:
#    * construct a few Python dictionaries with relevant data from the TF dataset
#    * create a separate file for each feature
#    * create an overview page of all features per node type

# # 2 - Setting up the environment
# ##### [Back to TOC](#TOC)
# Your environment should (obviously) include the Python package `Text-Fabric`. In the current implementation of the script, the Python package `markdown2` is also required. If it is not installed yet, it can be installed using `pip`. (Note: this dependency may be removed in a future version.)

# In[68]:

get_ipython().system('pip install markdown2')

# # 3 - Load Text-Fabric data
# ##### [Back to TOC](#TOC)
# At this stage the Text-Fabric dataset that will be used to create the documentation set is loaded. See the documentation for function [`use`](https://annotation.github.io/text-fabric/tf/app.html#tf.app.use) for the various options regarding storage locations.

# In[1]:

get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

# In[2]:

# Load the Text-Fabric code
# Note: it is assumed Text-Fabric is installed in your environment
from tf.fabric import Fabric
from tf.app import use

# In[3]:

# Load the N1904 app and data
N1904 = use("tonyjurg/Nestle1904LFT", version="0.6", hoist=globals())

# # 4 - Creation of the dataset

# ## 4.1 - Setting up some production values
# ##### [Back to TOC](#TOC)

# In[4]:

# Set the title for all pages (indicating the dataset the documentation is describing)
pageTitle = "N1904 Greek New Testament Text-Fabric dataset [tonyjurg/Nestle1904LFT - 0.6](https://github.com/tonyjurg/Nestle1904LFT)"
# Location to store the resulting files; for now the same location where the notebook resides (no trailing slash)
resultLocation = ""
# Set verbose to True if you want the intermediate dictionaries printed; setting it to False mutes that output
verbose = True
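# If `resultLocation` points to a directory that does not exist yet, the file writes in section 5 will fail. The following optional cell (a minimal sketch, not part of the original flow) creates the directory up front; it assumes `resultLocation` is either empty (current directory) or a path without a trailing slash, as configured above.

# In[ ]:

import os

# Create the configured output directory if it is missing (skipped when resultLocation is empty)
if resultLocation:
    os.makedirs(resultLocation, exist_ok=True)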
# ## 4.2 - Store data in dictionaries

# ### 4.2.1 - Get node types and their node ranges
# ##### [Back to TOC](#TOC)
# The following will create a dictionary mapping each node type to its node number range.

# In[5]:

# Initialize an empty dictionary
nodeDict = {}
# Iterate over C.levels.data
for item in C.levels.data:
    node, _, start, end = item
    # Create an empty list for this node type
    nodeDict[node] = []
    # Append the tuple (start, end) to the node type's list
    nodeDict[node].append((start, end))
# Print the resulting dictionary depending on setting 'verbose'
if verbose:
    print(nodeDict)
print('finished')

# An alternative approach (with identical result):

# In[6]:

# Initialize an empty dictionary
nodeDict = {}
# Iterate over the node types
for NodeType in F.otype.all:
    nodeDict[NodeType] = []
    start, end = F.otype.sInterval(NodeType)
    # Append the tuple (start, end) to the node type's list
    nodeDict[NodeType].append((start, end))
# Print the resulting dictionary depending on setting 'verbose'
if verbose:
    print(nodeDict)
print('finished')

# ### 4.2.2 - Determine which node types have specific features
# ##### [Back to TOC](#TOC)
# The following will create a dictionary mapping each feature to the set of node types that carry a value for that feature.

# In[7]:

# Initialize an empty dictionary
featureDict = {}
# Iterate over Fall(), the list of all features
for item in Fall():
    # Use a set to store the unique node types for each feature
    featureDict[item] = set()
    for node, content in Fs(item).items():
        featureDict[item].add(F.otype.v(node))
# Print the resulting dictionary depending on setting 'verbose'
if verbose:
    print(featureDict)
print('finished')

# ### 4.2.3 - Create dictionary with description and value frequency per feature
# ##### [Back to TOC](#TOC)
# The following will create a dictionary with, per feature, its description (taken from the metadata), its value type, and up to ten (value, frequency) pairs.

# In[8]:

# Initialize an empty dictionary
featureMetaDict = {}
# Iterate over Fall(), the list of all features
for item in Fall():
    featureMetaDict[item] = []
    featureMetaData = Fs(item).meta
    # Check if the 'description' key exists in the metadata dictionary
    if 'description' in featureMetaData:
        featureDescription = featureMetaData['description']
    else:
        featureDescription = "No feature description"
    # Check if the 'valueType' key exists in the metadata dictionary
    if 'valueType' in featureMetaData:
        if featureMetaData["valueType"] == 'str':
            featureType = "string"
        elif featureMetaData["valueType"] == 'int':
            featureType = "integer"
        else:
            featureType = "unknown"
    else:
        featureType = "not found"
    # Collect up to ten (value, frequency) tuples; 'otype' has no frequency list of interest
    FeatureValueSetList = []
    if item != 'otype':
        FeatureFrequenceLists = Fs(item).freqList()
        FoundItems = 0
        for value, freq in FeatureFrequenceLists:
            FoundItems += 1
            FeatureValueSetList.append((value, freq))
            if FoundItems == 10:
                break
    featureMetaDict[item].append((featureDescription, featureType, FeatureValueSetList))
# Print the resulting dictionary depending on setting 'verbose'
if verbose:
    print(featureMetaDict)
print('finished')
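# Before generating the pages, the collected data can be verified without dumping the full dictionaries. The following optional cell (a minimal sketch, not part of the original flow) prints what was stored for a single, arbitrarily chosen feature:

# In[ ]:

# Pick an arbitrary feature and show the data collected for it in sections 4.2.2 and 4.2.3
exampleFeature = next(iter(featureMetaDict))
print(f"Feature '{exampleFeature}' occurs on node types: {featureDict[exampleFeature]}")
print(f"Stored (description, type, value/frequency) data: {featureMetaDict[exampleFeature][0]}")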
# # 5 - Create the pages

# ## 5.1 - Create set of feature pages
# ##### [Back to TOC](#TOC)

# In[9]:

import markdown2
import os

filesCreated = 0
for feature in featureDict:
    # Prepare the data for this feature's page
    featureName = feature
    nodeList = ''
    for node in featureDict[feature]:
        nodeList += f' `{node}`'
    featureDescription, featureType, valueFreq = featureMetaDict[feature][0]
    # Build the value/frequency table
    featureValues = "Value|Frequency|\n---|---|\n"
    for value, freq in valueFreq:
        if value == '':
            featureValues += f"empty |{freq}|\n"
        else:
            featureValues += f"`{value}` | {freq} |\n"
    # The value list was capped at 10 entries in section 4.2.3
    if len(valueFreq) == 10:
        featureValues += "Note: only the first 10 items are shown"
    # Assemble the Markdown source for the feature description page
    FeaturePageContent = (
        f"{pageTitle}\n"
        f"# Feature: {featureName}\n"
        f"Data type|Available for node types|\n---|---|\n"
        f"`{featureType}` |{nodeList}|\n"
        f"## Description\n{featureDescription}\n"
        f"## Values\n{featureValues}\n"
    )
    # Convert the Markdown source to HTML
    markdown_content = markdown2.markdown(FeaturePageContent, extras=['tables'])
    # Set up the path to the location to store the resulting file
    fileName = os.path.join(resultLocation, f"{feature}.md")
    try:
        # Write the converted content to the file
        with open(fileName, "w", encoding="utf-8") as file:
            file.write(markdown_content)
        filesCreated += 1
        if verbose:
            print(f"Content written to {fileName}")
    except Exception as e:
        print(f"Error writing to file {fileName} ({e}); please create directory '{resultLocation}' first")
        break
if filesCreated != 0:
    print(f'finished (writing {filesCreated} files)')

# ## 5.2 - Create overview page
# ##### [Back to TOC](#TOC)

# In[10]:

overviewPage = f"{pageTitle}\n# Features per node type\n"
# Iterate over the node types
for NodeType in F.otype.all:
    # Collect the features that are available for this node type
    FeaturesWithNodeType = []
    # Check each set in featureDict for the presence of this node type
    for feature, value_set in featureDict.items():
        if NodeType in value_set:
            FeaturesWithNodeType.append(feature)
    NodeItemText = f"## {NodeType}\nFeature|Datatype|Description|Examples\n|---|---|---|---|\n"
    for item in FeaturesWithNodeType:
        featureDescription = featureMetaDict[item][0][0]
        DataType = "`" + featureMetaDict[item][0][1] + "` "
        # Get at most two example values
        FoundItems = 0
        valueExamples = ''
        for value, freq in featureMetaDict[item][0][2]:
            FoundItems += 1
            valueExamples += '`' + str(value) + '` '
            if FoundItems == 2:
                break
        NodeItemText += f'{item}| {DataType} | {featureDescription} | {valueExamples} \n'
    overviewPage += NodeItemText

# Convert the Markdown source of the overview page to HTML
markdown_content = markdown2.markdown(overviewPage, extras=['tables'])
# Set up the path to the location to store the resulting file
fileName = os.path.join(resultLocation, "featurebynodetype.md")
try:
    # Write the converted content to the file
    with open(fileName, "w", encoding="utf-8") as file:
        file.write(markdown_content)
    filesCreated += 1
    if verbose:
        print(f"Content written to {fileName}")
    print('Overview page created successfully')
except Exception as e:
    print(f"Error writing to file {fileName} ({e}); please create directory '{resultLocation}' first")

# # 6 - Licence
# ##### [Back to TOC](#TOC)
# Licensed under [Creative Commons Attribution 4.0 International (CC BY 4.0)](https://github.com/tonyjurg/Doc4TF/blob/main/LICENCE.md)