#!/usr/bin/env python
# coding: utf-8
# # [Doc4TF](https://github.com/tonyjurg/Doc4TF)
# #### *automatic creation of feature documentation for existing Text-Fabric datasets*
# ## Table of contents
# * 1 - Introduction
# * 2 - Setting up the environment
# * 3 - Load Text-Fabric data
# * 4 - Creation of the dataset
# * 4.1 - Setting up some production values
# * 4.2 - Store data in dictionaries
# * 4.2.1 - Get node types and their node ranges
# * 4.2.2 - Determine which node types have specific features
# * 4.2.3 - Create dictionary with description and value frequency per feature
# * 5 - Create the pages
# * 5.1 - Create set of feature pages
# * 5.2 - Create overview page
# * 6 - Licence
# # 1 - Introduction
# ##### [Back to TOC](#TOC)
#
# Ideally, a comprehensive documentation set should be created as part of developing a Text-Fabric dataset. However, in practice, this is not always completed during the initial phase or after changes to features. This Jupyter Notebook contains Python code to automatically generate (and thus ensure consistency) a documentation set for any [Text-Fabric](https://github.com/annotation/text-fabric) dataset. It serves as a robust starting point for the development of a brand new documentation set or as validation for an existing one. One major advantage is that the resulting documentation set is fully hyperlinked, a task that can be laborious if done manually.
#
# The main steps in producing the documentation set are:
# * Load a Text-Fabric database
# * Execute the code present in the subsequent cells. The code will:
# * construct a few Python dictionaries with relevant data from the TF dataset
# * create separate files for each feature
# * create an overview page of all features per node type
# # 2 - Setting up the environment
# ##### [Back to TOC](#TOC)
# Your environment should (obviously) include the Python package `Text-Fabric`. In the current implementation of the script, the Python package `markdown2` is also required. If it is not installed yet, it can be installed using `pip`. (Note: this dependency might be removed in a future version.)
# In[68]:
# Install markdown2 through the %pip magic instead of a `!pip` shell call:
# %pip installs into the environment of the running kernel, whereas a shell
# `pip` may belong to a different Python installation than the one that
# executes this notebook.
get_ipython().run_line_magic('pip', 'install markdown2')
# # 3 - Load Text-Fabric data
# ##### [Back to TOC](#TOC)
# At this stage the Text-Fabric dataset is loaded which will be used to create a documentation set. See documentation for function [`use`](https://annotation.github.io/text-fabric/tf/app.html#tf.app.use) for various options regarding storage locations.
# In[1]:
# Run the two IPython line magics that load the 'autoreload' extension and
# then switch it to mode '2' (order matters: the extension must be loaded
# before its magic can be used).
for magic_name, magic_argument in (('load_ext', 'autoreload'), ('autoreload', '2')):
    get_ipython().run_line_magic(magic_name, magic_argument)
# In[2]:
# Loading the Text-Fabric code
# Note: it is assumed Text-Fabric is installed in your environment
from tf.fabric import Fabric
from tf.app import use
# In[3]:
# Load the N1904 Text-Fabric app together with its data (dataset version 0.6).
# globals() is handed to `hoist`; the cells below rely on names such as
# F, Fs, Fall and C that are not defined anywhere else in this file
# (presumably they are injected by this call - see the `use` documentation).
N1904 = use(
    "tonyjurg/Nestle1904LFT",
    version="0.6",
    hoist=globals(),
)
# # 4 - Creation of the dataset
# ## 4.1 - Setting up some production values
# ##### [Back to TOC](#TOC)
# In[4]:
# Title placed at the top of every generated page; it identifies the dataset
# that the documentation describes.
pageTitle = (
    "N1904 Greek New Testament Text-Fabric dataset "
    "[tonyjurg/Nestle1904LFT - 0.6](https://github.com/tonyjurg/Nestle1904LFT)"
)

# Directory in which the resulting files are stored (no trailing slash).
# The empty string keeps them, for now, in the location of the notebook itself.
resultLocation = ""

# True: print the dictionaries and per-file messages; False: mute that output.
verbose = True
# ## 4.2 - Store data in dictionaries
# ### 4.2.1 - Get node types and their node ranges
# ##### [Back to TOC](#TOC)
# The following will create a dictionary that maps each node type to its node number range, stored as a `(start, end)` tuple.
# In[5]:
# Map every node type to a one-element list holding its (start, end) tuple.
# Each entry of C.levels.data is unpacked as a 4-tuple; its second field is
# not needed here.
nodeDict = {
    node_type: [(first_node, last_node)]
    for node_type, _, first_node, last_node in C.levels.data
}

# Show the resulting dictionary only when 'verbose' is switched on
if verbose:
    print(nodeDict)
print('finished')
# Or alternative (with identical result)
# In[6]:
# Build the node type -> [(start, end)] mapping from the otype feature:
# F.otype.all supplies the node types, F.otype.sInterval() the range of each.
nodeDict = {}
for node_type in F.otype.all:
    first_node, last_node = F.otype.sInterval(node_type)
    # Each node type gets a list containing a single (start, end) tuple
    nodeDict[node_type] = [(first_node, last_node)]

# Show the resulting dictionary only when 'verbose' is switched on
if verbose:
    print(nodeDict)
print('finished')
# ### 4.2.2 - Determine which node types have specific features
# ##### [Back to TOC](#TOC)
# The following will create a feature list with information about the node types that contain values for that specific feature.
# In[7]:
# For every feature returned by Fall(), collect the set of node types that
# carry a value for it: each node listed by the feature's items() is looked
# up in F.otype, and the set removes the duplicates.
featureDict = {
    feature_name: {F.otype.v(node) for node, _ in Fs(feature_name).items()}
    for feature_name in Fall()
}

# Show the resulting dictionary only when 'verbose' is switched on
if verbose:
    print(featureDict)
print('finished')
# ### 4.2.3 - Create dictionary with description and value frequency per feature
# ##### [Back to TOC](#TOC)
# The following will create a dictionary with, per feature, its description and data type (both taken from the metadata) and its most frequent values.
# In[8]:
# Build featureMetaDict: feature name -> one-element list holding the tuple
# (description, data type, [(value, frequency), ...]).
# Description and data type come from the feature's metadata; the value list
# holds at most maxValuesPerFeature entries.

# Local import: islice is only needed by this cell.
from itertools import islice

# Readable names for the 'valueType' entries found in the feature metadata
valueTypeNames = {'str': 'string', 'int': 'integer'}
# Maximum number of (value, frequency) pairs kept per feature
maxValuesPerFeature = 10

featureMetaDict = {}
# Iterate over Fall(), all features
for item in Fall():
    featureMetaData = Fs(item).meta

    # Fall back to a placeholder when the metadata has no 'description' key
    if 'description' in featureMetaData:
        featureDescription = featureMetaData['description']
    else:
        featureDescription = "No feature description"

    # "unknown" = valueType present but neither 'str' nor 'int';
    # "not found" = the metadata has no 'valueType' key at all
    if 'valueType' in featureMetaData:
        featureType = valueTypeNames.get(featureMetaData['valueType'], "unknown")
    else:
        featureType = "not found"

    # The value list is created afresh for every feature, so a feature can
    # never end up with the values of the feature processed before it.
    if item == 'otype':
        # freqList() is not used for otype (NOTE(review): presumably the otype
        # feature does not provide it - confirm). Its values are the node
        # types, so the frequency of each is the size of its node range.
        # Assumes sInterval() returns an inclusive (first, last) node pair.
        typeCounts = []
        for nodeType in F.otype.all:
            start, end = F.otype.sInterval(nodeType)
            typeCounts.append((nodeType, end - start + 1))
        # Highest frequency first (stable sort keeps the order of ties)
        typeCounts.sort(key=lambda pair: pair[1], reverse=True)
        FeatureValueSetList = typeCounts[:maxValuesPerFeature]
    else:
        # Keep only the first maxValuesPerFeature entries of the frequency list
        FeatureValueSetList = [
            (value, freq)
            for value, freq in islice(Fs(item).freqList(), maxValuesPerFeature)
        ]

    featureMetaDict[item] = [(featureDescription, featureType, FeatureValueSetList)]

# Print resulting dictionary depending on setting 'verbose'
if verbose:
    print(featureMetaDict)
print('finished')
# ## 5 - Create the pages
# ## 5.1 - Create set of feature pages
# ##### [Back to TOC](#TOC)
# In[9]:
import markdown2
import os
# Create one page per feature in featureDict. Each page shows the data type,
# the node types the feature is available for, its description and a table
# with the stored (value, frequency) pairs. The page text is converted with
# markdown2 and written to '<feature>.md' inside resultLocation.
filesCreated = 0
for feature in featureDict:
    # featureDict holds a set per feature; sorting gives the node types a
    # stable order, so regenerating the pages does not reshuffle them.
    nodeList = ''.join(f' `{node}`' for node in sorted(featureDict[feature]))
    featureDescription, featureType, valueFreq = featureMetaDict[feature][0]

    # Table with the stored values; an empty-string value is shown as 'empty'
    valueRows = []
    for value, freq in valueFreq:
        if value == '':
            valueRows.append(f"empty |{freq}|\n")
        else:
            valueRows.append(f"`{value}` | {freq} |\n")
    featureValues = (
        "Value|Frequency|\n---|---|\n"
        + ''.join(valueRows)
        + "Note: only the first 10 items are shown"
    )

    # The f-string already contains all data. It must not be passed through
    # str.format() afterwards: a '{' or '}' inside a description or value
    # would then be treated as a replacement field and raise or be mangled.
    FeaturePageContent = (
        f"{pageTitle}\n#Feature: {feature}\n"
        f"Data type|Available for node types|\n---|---|\n"
        f"`{featureType}` |{nodeList}|\n"
        f"## Description\n{featureDescription}\n"
        f"## Values\n{featureValues}\n"
    )

    # Convert the page text with markdown2 (tables enabled)
    markdown_content = markdown2.markdown(FeaturePageContent, extras=['tables'])

    # set up path to location to store the resulting file
    fileName = os.path.join(resultLocation, f"{feature}.md")
    try:
        with open(fileName, "w", encoding="utf-8") as file:
            file.write(markdown_content)
        filesCreated += 1
        if verbose:
            print(f"Markdown content written to {fileName}")
    except Exception as e:
        # Report the actual error too: a missing directory is only one of the
        # possible causes. Stop after the first failure.
        print(f"Error writing to file {fileName} (please create directory '{resultLocation}' first): {e}")
        break

if filesCreated != 0:
    print(f'finished (writing {filesCreated} files)')
# ## 5.2 - Create overview page
# ##### [Back to TOC](#TOC)
# In[10]:
# Assemble the overview page: after the title, one section per node type with
# a table of all features that have values for that node type.
pageSections = [f"{pageTitle}\n#Features per node type\n"]
for NodeType in F.otype.all:
    # Names of the features whose node-type set contains this node type
    featuresOfType = [
        name for name, typeSet in featureDict.items() if NodeType in typeSet
    ]

    sectionLines = [f"##{NodeType}\nFeature|Datatype|Description|Examples\n|---|---|---|---|\n"]
    for name in featuresOfType:
        description, typeLabel, valueList = featureMetaDict[name][0]
        dataType = "`" + typeLabel + "` "
        # At most the first two stored values serve as examples
        examples = ''.join('`' + str(value) + '` ' for value, _ in valueList[:2])
        sectionLines.append(f'{name}| {dataType} | {description} | {examples} \n')
    pageSections.append(''.join(sectionLines))
overviewPage = ''.join(pageSections)

# Convert the page text with markdown2 (tables enabled)
markdown_content = markdown2.markdown(overviewPage, extras=['tables'])

# Path of the overview file inside the result location
fileName = os.path.join(resultLocation, "featurebynodetype.md")
try:
    with open(fileName, "w", encoding="utf-8") as handle:
        handle.write(markdown_content)
        filesCreated += 1
        if verbose:
            print(f"Markdown content written to {fileName}")
        print('Overview page created successfully')
except Exception:
    print(f"Error writing to file {fileName} (please create directory \'{resultLocation}\' first)")
# # 6 - License
# ##### [Back to TOC](#TOC)
# Licenced under [Creative Commons Attribution 4.0 International (CC BY 4.0)](https://github.com/tonyjurg/Doc4TF/blob/main/LICENCE.md)