#!/usr/bin/env python
# coding: utf-8

# ---
#
# To get started: consult [start](start.ipynb)
#
# ---

# # Annotate
#
# Text-Fabric is a tool for computing with read-only datasets.
# How can you manually annotate an existing dataset?
#
# The scenario is: export the portions that must be annotated into a plain text file,
# accompanied by location information.
#
# Use an external tool, e.g. [BRAT](https://brat.nlplab.org), to manually annotate that text.
#
# Read the resulting annotations, combine them with the location information,
# and export the result as a new feature or set of features.
#
# These new features can be published anywhere
# (see the [share](share.ipynb) tutorial),
# and users that want to make use of them can tell Text-Fabric to fetch them from the
# published location alongside the main dataset.
#
# From that point on, the new features act as first-class citizens in the dataset.
#
# Note how this does not involve modifying the existing dataset!

# In[1]:


get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')


# In[2]:


import os

from tf.app import use
from tf.convert.recorder import Recorder
from tf.dataset import Versions


# **NB:** We used version 0.4 of this dataset to export material, annotate the exported
# material, and draw in those annotations as a set of new features.
# In the meanwhile, however, newer versions of the missieven data have appeared,
# in which different encoding decisions have been applied.
#
# Rather than doing the annotation work again, we want to migrate the annotations
# from 0.4 to the current version. We shall show how.
#
# First we show how we made the annotations in 0.4, and to that end we use a previous
# version of the data.
#
# We have to overcome the fact that in those days this repository resided under a different
# organization on GitHub (`Dans-labs`) and had a different name (`clariah-gm`).
# Also, the TF-app for this dataset resided in `annotation/app-missieven`, while
# it is now in `clariah/wp6-missieven/app`.
#
# It is still possible to work with that old version. We ask for the old TF-app
# and override the `org` and `repo` settings of the old app by passing the new values
# in `provenanceSpec=...`.

# In[3]:


A = use(
    "CLARIAH/wp6-missieven:v0.4",
    checkout="clone",
    version="0.4",
    hoist=globals(),
    legacy=True,
    provenanceSpec=dict(org="CLARIAH", repo="wp6-missieven"),
)


# Text-Fabric has support for exporting data together with location information,
# and then importing new data and turning it into new features,
# based on that location information.
#
# See [Recorder](https://annotation.github.io/text-fabric/tf/convert/recorder.html).
#
# We show the workflow by selecting a letter, exporting its original text material as
# plain text, manually annotating it for named entities with [BRAT](https://brat.nlplab.org),
# and then saving the output as a set of new features.

# # Text selection
#
# We choose volume 1 page 6:

# In[4]:


p = A.nodeFromSectionStr("1 6")
for ln in L.d(p, otype="line"):
    A.plain(ln, fmt="text-orig-full")


# Quite a few names. Let's leave out the notes.

# In[5]:


for ln in L.d(p, otype="line"):
    A.plain(ln, fmt="layout-orig")


# # Recording
#
# We'll prepare this portion of text for annotation outside TF.
#
# What needs to happen is that we produce a text file, while remembering the positions
# of the relevant nodes in that text file.
#
# The [Recorder](https://annotation.github.io/text-fabric/tf/convert/recorder.html)
# lets you create a string from nodes,
# where the positions of the nodes in that string are remembered.
# You may add all kinds of material in between the texts of the nodes,
# and it is up to you how you represent the nodes:
# we can add strings to the recorder, and we can tell nodes to start and to stop.
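#
# In outline, the recording pattern looks like this (a minimal sketch with
# made-up node numbers, not tied to this dataset):
#
# ```python
# from tf.convert.recorder import Recorder
#
# rec = Recorder()
#
# rec.start(1001)        # from here on, material belongs to node 1001
# rec.add("some text ")  # arbitrary string material
# rec.end(1001)          # node 1001 stops here
# rec.add("\n")          # material that belongs to no started node
#
# rec.text()             # the accumulated plain text
# rec.positions()        # per character: the nodes active at that position
# ```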
# We add all words in all lines to the recorder, provided the words belong to the
# original material.
#
# We also add the line number at the start of each line.

# In[6]:


# start a recorder
rec = Recorder()

for ln in L.d(p, otype="line"):
    # start a line node
    rec.start(ln)

    # add the line number
    rec.add(f"{F.n.v(ln)}. ")

    for w in L.d(ln, otype="word"):
        trans = F.transo.v(w)

        # if there is nothing in transo, it is not original text
        if not trans:
            continue

        # start a word node
        rec.start(w)

        # add the word and its trailing punctuation
        rec.add(f"{trans}{F.punco.v(w)}")

        # terminate the word node
        rec.end(w)

    # add a newline
    rec.add("\n")

    # terminate the line node
    rec.end(ln)


# As a check, let's print the recorded text:

# In[7]:


print(rec.text())


# and the recorded node positions:

# In[8]:


for i in range(20, 30):
    print(f"pos {i}: {rec.positions()[i]}")


# This means that the character on position 20 in the plain text string is part of
# the text of node 1039 and of node 5054871.
#
# With one statement we write the recorded text and the positions to two files:

# In[9]:


rec.write("exercises/v01-p0006.txt")


# In[10]:


get_ipython().system('head -n 10 exercises/v01-p0006.txt')


# In[11]:


get_ipython().system('head -n 30 exercises/v01-p0006.txt.pos')


# # Annotating
#
# We head over to a local installation of [BRAT](https://brat.nlplab.org)
# and annotate our text: a quick and dirty manual annotation of some entities,
# performed in the BRAT interface, served locally.
#
# We captured the output of this annotation session in the file `v01-p0006.txt.ann`;
# it has the following contents:
#
# ```
# T1	Person 675 679	Nera
# T2	GPE 1181 1189	Ternnate
# #1	AnnotatorNotes T2	Ternate
# T3	Person 1203 1223	Coninck van Spagnien
# T4	GPE 1215 1223	Spagnien
# T5	Organization 1240 1254	Heeren Staeten
# T6	Person 1293 1300	Coninck
# T7	Person 1406 1413	Coninck
# T8	Organization 1457 1471	Heeren Staeten
# T9	GPE 1557 1562	Banda
# T10	GPE 1653 1662	Engelsche
# T11	Person 58 65	orancay
# T12	Person 663 670	arancay
# T13	Person 697 706	sabandaer
# T14	Person 794 802	orancaye
# T15	GPE 965 975	Hollanders
# T16	Person 1010 1019	Verhoeven
# T17	GPE 1154 1161	Ambojna
# #2	AnnotatorNotes T17	Amboina
# T18	GPE 1305 1310;1311 1322	ditto 24. plaetse
# #3	AnnotatorNotes T18	Ternate
# *	Alias T11 T14
# R1	Geographical_part Arg1:T2 Arg2:T18
# ```
#
# Now we want to feed back these annotations as TF features on word nodes.
#
# The Recorder cannot anticipate the formats in which tools like BRAT deliver
# their results. Therefore, it expects the data to be in a straightforward tabular
# format.
#
# In this case we must do a small conversion to bring the annotations
# into good shape: a tab-separated file
# with columns `start end feature1 feature2 ...`
#
# Here we choose to expose the identifier (the `Tn` values) as `feature1`
# and the kind of entity as `feature2`.
#
# In case there is a link between two entities, we want to assign
# the earliest `T`-number to all entities involved.
#
# We also want to preserve the annotator notes.
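#
# To make the target format concrete: for entities `T1` and `T2` above, the
# conversion below should produce rows like these (tab-separated; the
# `entityComment` column stays empty when there is no annotator note):
#
# ```
# start	end	entityId	entityKind	entityComment
# 675	679	T1	Person
# 1181	1189	T2	GPE	Ternate
# ```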
# In[12]:


def brat2tsv(inh, outh):
    outh.write("start\tend\tentityId\tentityKind\tentityComment\n")
    entities = []
    notes = {}
    maps = {}

    for line in inh:
        fields = line.rstrip("\n").split("\t")

        if line.startswith("T"):
            # an entity: identifier, kind plus positions, text
            id1 = fields[0]
            (kind, *positions) = fields[1].split()
            (start, end) = (positions[0], positions[-1])
            entities.append([start, end, id1, kind, ""])
        elif line.startswith("#"):
            # an annotator note, referring to an entity
            id1 = fields[1].split()[1]
            notes[id1] = fields[2]
        elif line.startswith("*"):
            # an alias between two entities
            (kind, id1, id2) = fields[1].split()
            maps[id2] = id1
        elif line.startswith("R"):
            # a relation between two entities
            (id1, id2) = (f[5:] for f in fields[1].split()[1:])
            maps[id2] = id1

    for entity in entities:
        id1 = entity[2]
        if id1 in maps:
            entity[2] = maps[id1]
        if id1 in notes:
            entity[4] = notes[id1]
        line = "\t".join(entity)
        print(line)
        outh.write(f"{line}\n")

    print(maps)


with open("exercises/v01-p0006.txt.ann") as inh:
    with open("exercises/v01-p0006.txt.tsv", "w") as outh:
        brat2tsv(inh, outh)


# Our recorder knows how to transform this file into feature data.

# In[13]:


features = rec.makeFeatures("exercises/v01-p0006.txt.tsv")


# Let's see.

# In[14]:


for (feat, data) in features.items():
    print(feat)
    print("\t", data)


# We can show this prettier:

# In[15]:


for (feat, data) in features.items():
    print(feat)
    for (node, value) in data.items():
        print(f"\t{F.otype.v(node)} {node} => {value}")


# Note that we assign entity features to line nodes as well.
#
# If that is undesired, we should not have instructed the Recorder
# to `rec.start(ln)` above.

# # Saving data
#
# The [documentation](https://annotation.github.io/text-fabric/tf/core/fabric.html#tf.core.fabric.FabricCore.save)
# explains how to save this data into text-fabric data files.
#
# We choose a location to save it: the `exercises` directory next to this notebook.

# In[16]:


GITHUB = os.path.expanduser("~/github")
ORG = A.context.org
REPO = A.context.repo
PATH = "exercises"
VERSION = A.version
print(f"{ORG=} {REPO=} {VERSION=}")


# Note the version: we have built these features against a specific version of the data.
# Later on we pass this version along, so that users of our data will get the shared
# features in exactly the same version as their core data.
#
# We have to specify a bit of metadata for these features:

# In[17]:


metaData = {
    "entityId": dict(
        valueType="str",
        description="identifier of a named entity",
        creator="Dirk Roorda",
    ),
    "entityKind": dict(
        valueType="str",
        description="kind of a named entity",
        creator="Dirk Roorda",
    ),
    "entityComment": dict(
        valueType="str",
        description="comment to a named entity",
        creator="Dirk Roorda",
    ),
}


# Now we can give the save command:

# In[18]:


location = f"{GITHUB}/{ORG}/{REPO}/{PATH}/entities/tf"
TF.save(
    nodeFeatures=features,
    metaData=metaData,
    location=location,
    module=VERSION,
    silent="auto",
)


# # Migrating
#
# We now migrate these annotations to the current version (1.0), which differs from 0.4
# in that the footnote texts have been drawn into the main text.
#
# We use the mapping from 0.4 nodes to 1.0 nodes, which is available as an edge feature
# `omap#0.4-1.0` in the current version of the dataset.
#
# We load both the old and new versions of the dataset.
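#
# The mapping itself is an ordinary TF edge feature; once both versions are
# loaded (as happens in the next cells), it can be inspected directly.
# A sketch, where `wOld` stands for a hypothetical word node of version 0.4:
#
# ```python
# Es = A[vb].api.Es
#
# for (wNew, dissimilarity) in Es("omap#0.4-1.0").f(wOld):
#     # each edge leads to a counterpart node in the new version;
#     # the value, if any, indicates how inexact the match is
#     print(wNew, dissimilarity)
# ```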
# In[19]:


entityModule = "clariah/wp6-missieven/exercises/entities/tf"
va = "0.4"
A = {}


# In[20]:


A[va] = use(
    f"CLARIAH/wp6-missieven:v{va}",
    mod=f"{entityModule}:clone",
    checkout="clone",
    version=va,
    legacy=True,
    provenanceSpec=dict(org="clariah", repo="wp6-missieven"),
)


# In[21]:


Alater = use("CLARIAH/wp6-missieven:clone", checkout="clone")
vb = Alater.version
A[vb] = Alater


# Now we can use the function
# [migrateFeatures](https://annotation.github.io/text-fabric/tf/dataset/nodemaps.html#tf.dataset.nodemaps.Versions.migrateFeatures)
# from TF to migrate our features.
# See also
# [nodeMaps](https://annotation.github.io/text-fabric/tf/dataset/nodemaps.html).

# In[22]:


V = Versions({va: A[va].api, vb: A[vb].api}, va, vb)


# In[23]:


features = ("entityComment", "entityId", "entityKind")
V.migrateFeatures(features, location=location, silent="auto")


# We load the migrated features:

# In[24]:


A[vb] = use(
    "CLARIAH/wp6-missieven:clone",
    version=vb,
    mod=f"{entityModule}:clone",
    checkout="clone",
)


# We compare the features in both versions:

# In[29]:


def showFeature(v, f):
    F = A[v].api.F
    Fs = A[v].api.Fs
    T = A[v].api.T

    for (n, val) in Fs(f).items():
        ntp = F.otype.v(n)
        print(f"{v} {f} ({ntp:<4} {n:>8}) {val:<8} <= {T.text(n)}")


# In[30]:


for f in features:
    showFeature(va, f)
    print("")
    showFeature(vb, f)
    print("")


# # Sharing
#
# In [share](share.ipynb) we show how we can share and reuse these features.

# ---
#
# # Contents
#
# * **[start](start.ipynb)** start computing with this corpus
# * **[search](search.ipynb)** turbo charge your hand-coding with search templates
# * **[compute](compute.ipynb)** sink down a level and compute it yourself
# * **[exportExcel](exportExcel.ipynb)** make tailor-made spreadsheets out of your results
# * **annotate** export text, annotate with BRAT, import annotations
# * **[share](share.ipynb)** draw in other people's data and let them use yours
# * **[entities](entities.ipynb)** use results of third-party NER (named entity recognition)
# * **[volumes](volumes.ipynb)** work with selected volumes only
#
# CC-BY Dirk Roorda