#!/usr/bin/env python
# coding: utf-8

# # Make a skeleton ELAN document from (text, translation) pairs
#
# https://nbviewer.org/gist/xflr6/452ce3b7be31e46e1cad32146af33d15/MakeElanSentences.ipynb

# In[1]:


from __future__ import annotations

import dataclasses
import datetime
import enum
import itertools
import os
import pathlib
from typing import Optional, Sequence

import lxml.builder
from lxml import etree

XSI = 'http://www.w3.org/2001/XMLSchema-instance'

# Clark notation ('{namespace}') prefix used to build namespaced attribute names.
XSI_CLARK = '{%s}' % XSI

ELAN_SCHEMA = 'http://www.mpi.nl/tools/elan/EAFv3.0.xsd'

ELAN_URN = 'urn:nl-mpi-tools-elan-eaf:315e945c-5e96-4492-9605-fbfe3487b506'

ELAN_VERSION = '3.0'

ELAN_SUFFIX = '.eaf'

ENCODING = 'utf-8'

# Element factory: ``E.some_tag(child, attr='value')`` builds ``<some_tag attr="value">``.
E = lxml.builder.ElementMaker(nsmap={'xsi': XSI})


# ## Build document root and header

# In[2]:


def make_document_skeleton(*, author: str = '',
                           media_file: str = '',
                           time_units: str = 'milliseconds',
                           format: str = ELAN_VERSION,
                           version: str = ELAN_VERSION,
                           date: Optional[datetime.datetime] = None) -> etree._ElementTree:
    """Create a new annotation document from the given arguments.

    Builds an ``annotation_document`` root carrying ``author``, ``date``,
    ``format`` and ``version`` attributes plus the schema location, and a
    ``header`` child carrying ``media_file``, ``time_units`` and the URN property.

    If ``date`` is omitted, the current UTC time is used. The date is written
    with microseconds dropped, converted via ``astimezone()`` and formatted
    with ``isoformat()``.
    """
    kwargs = {f'{XSI_CLARK}noNamespaceSchemaLocation': ELAN_SCHEMA}

    if date is None:
        date = datetime.datetime.now(datetime.timezone.utc)

    # ``version`` previously received the value of ``format``, so an explicit
    # ``version=...`` argument was silently ignored.
    root = E.annotation_document(author=author,
                                 date=date.replace(microsecond=0).astimezone().isoformat(),
                                 format=format,
                                 version=version,
                                 **kwargs)

    header = E.header(E.property(ELAN_URN, name='URN'),
                      media_file=media_file,
                      time_units=time_units)
    root.append(header)

    return etree.ElementTree(root)


# In[3]:


make_document_skeleton()


# ## Pretty-print document

# In[4]:


def pprint(doc: etree._ElementTree, *,
           file=None,
           canonical: bool = False,
           pretty_print: bool = True,
           end: str = '\n',
           **kwargs) -> None:
    """Pretty-print the XML serialization of the given document.

    ``doc`` is serialized with ``etree.tostring()`` (the cells below also pass
    bare elements). With ``canonical=True`` it is first passed through
    ``canonicalized()``. ``file`` and ``end`` are forwarded to ``print()``;
    remaining keyword arguments are forwarded to ``etree.tostring()``.
    """
    if canonical:
        doc = canonicalized(doc)
    text = etree.tostring(doc, encoding='unicode', pretty_print=pretty_print, **kwargs)
    print(text, file=file, end=end)


def canonicalized(doc: etree._ElementTree, *,
                  indent: str = ' ' * 4) -> etree._ElementTree:
    """Return a copy of the annotation document in ELAN formatting.

    The copy is indented with ``indent``, every element tag is upper-cased,
    and every attribute is re-inserted in sorted name order with its name
    upper-cased (attributes in the ``xsi`` namespace keep their name).
    """
    # Serialize and re-parse so the caller's tree is left untouched.
    doc = etree.ElementTree(etree.fromstring(etree.tostring(doc)))
    etree.indent(doc, space=indent)

    for elem in doc.iter('*'):
        elem.tag = elem.tag.upper()
        # ``sorted()`` snapshots the items, so mutating ``attrib`` here is safe;
        # delete-then-set moves each attribute into sorted position.
        for k, v in sorted(elem.attrib.items()):
            del elem.attrib[k]
            elem.attrib[k.upper() if not k.startswith(XSI_CLARK) else k] = v

    return doc


# In[5]:


for kwargs in [{}, {'canonical': True}]:
    pprint(make_document_skeleton(), **kwargs, end='')


# ## Serialize document

# In[6]:


def write(path: os.PathLike | str, doc: etree._ElementTree, *,
          canonical: bool = True,
          pretty_print: bool = True,
          xml_declaration: bool = True,
          encoding: str = ENCODING) -> pathlib.Path:
    """(Over)write the given path with the XML serialization of the given document.

    With ``canonical=True`` (the default) the document is passed through
    ``canonicalized()`` first. Returns ``path`` as a ``pathlib.Path``.
    """
    path = pathlib.Path(path)
    if canonical:
        doc = canonicalized(doc)
    doc.write(path,
              xml_declaration=xml_declaration,
              pretty_print=pretty_print,
              encoding=encoding)
    return path


# ## Define linguistic type constraints

# In[7]:


class Constraint(enum.Enum):
    """Possible values for `constraints` in a linguistic type.

    The member name is written as the ``stereotype`` attribute and the member
    value as the ``description`` attribute of a ``constraint`` element.
    """

    Time_Subdivision = "Time subdivision of parent annotation's time interval, no time gaps allowed within this interval"

    # NOTE(review): the spelling 'refering' is runtime output text and is kept
    # verbatim; presumably it mirrors the description ELAN itself writes --
    # confirm before correcting it.
    Symbolic_Subdivision = ('Symbolic subdivision of a parent annotation.'
                            ' Annotations refering to the same parent are ordered')

    Symbolic_Association = '1-1 association with a parent annotation'

    Included_In = "Time alignable annotations within the parent annotation's time interval, gaps are allowed"

    def as_xml(self) -> etree._Element:
        """Return the ``constraint`` element for this member."""
        return E.constraint(stereotype=self.name, description=self.value)


# In[8]:


for const in Constraint:
    pprint(const.as_xml(), end='')


# ## Define linguistic types

# In[9]:


@dataclasses.dataclass
class LinguisticType:
    """Possible targets for `linguistic_type_ref` in a tier."""

    id_: str

    time_alignable: bool = False

    graphic_references: bool = False

    constraints: Optional[Constraint] = None

    @classmethod
    def make_dict(cls, types) -> dict[str, LinguisticType]:
        """Build ``{id: LinguisticType}`` from a ``{id: field kwargs}`` mapping."""
        return {id_: cls(id_=id_, **kwargs) for id_, kwargs in types.items()}

    def as_xml(self) -> etree._Element:
        """Return the ``linguistic_type`` element for this type.

        The ``constraints`` attribute is only emitted when a constraint is set;
        booleans are written as ``'true'``/``'false'``.
        """
        attrs = {'linguistic_type_id': self.id_,
                 'time_alignable': 'true' if self.time_alignable else 'false'}
        if self.constraints:
            attrs['constraints'] = self.constraints.name
        attrs['graphic_references'] = 'true' if self.graphic_references else 'false'
        return E.linguistic_type(**attrs)


# In[10]:


LINGUISTIC_TYPES = LinguisticType.make_dict({
    'default-lt': {'time_alignable': True},
    'translation': {'constraints': Constraint.Symbolic_Association},
    'transcription': {'time_alignable': True},
    'orth': {'time_alignable': True},
    'ref': {'time_alignable': True},
    'tx': {'constraints': Constraint.Included_In, 'time_alignable': True},
    'mb': {'constraints': Constraint.Symbolic_Subdivision},
    'orig': {'constraints': Constraint.Symbolic_Association},
    'ge': {'constraints': Constraint.Symbolic_Association},
    'ps': {'constraints': Constraint.Symbolic_Association},
    'so': {'constraints': Constraint.Symbolic_Association},
    'lxid': {'constraints': Constraint.Symbolic_Association},
    'fte': {'constraints': Constraint.Symbolic_Association},
    'nt': {'constraints': Constraint.Symbolic_Association},
    'imported-sep': {'time_alignable': True},
})


# In[11]:


for lt in LINGUISTIC_TYPES.values():
    pprint(lt.as_xml(), end='')


# ## Define tiers

# In[12]:


@dataclasses.dataclass
class Tier:
    """Container for annotations of one linguistic type.

    Possible target for `parent_ref` of a tier.
    """

    id_: str

    linguistic_type_ref: str

    parent_ref: Optional[str] = None

    default_locale: str = 'en'

    @classmethod
    def make_dict(cls, tiers) -> dict[str, Tier]:
        """Build ``{id: Tier}`` from a ``{id: field kwargs}`` mapping."""
        return {id_: cls(id_=id_, **kwargs) for id_, kwargs in tiers.items()}

    @property
    def linguistic_type(self):
        """The ``LinguisticType`` registered under ``linguistic_type_ref``.

        Raises ``KeyError`` if the reference is not in ``LINGUISTIC_TYPES``.
        """
        return LINGUISTIC_TYPES[self.linguistic_type_ref]

    def as_xml(self) -> etree._Element:
        """Return the (empty) ``tier`` element for this tier.

        The ``parent_ref`` attribute is only emitted when a parent is set.
        """
        # Going through ``linguistic_type`` validates the reference.
        attrs = {'tier_id': self.id_,
                 'linguistic_type_ref': self.linguistic_type.id_}
        if self.parent_ref:
            attrs['parent_ref'] = self.parent_ref
        attrs['default_locale'] = self.default_locale
        return E.tier(**attrs)


# In[13]:


TIERS = Tier.make_dict({
    'ref@A': {'linguistic_type_ref': 'ref'},
    'tx@A': {'linguistic_type_ref': 'tx', 'parent_ref': 'ref@A'},
    'fte@A': {'linguistic_type_ref': 'translation', 'parent_ref': 'tx@A'},
    'mb@A': {'linguistic_type_ref': 'mb', 'parent_ref': 'tx@A'},
    'ge@A': {'linguistic_type_ref': 'ge', 'parent_ref': 'mb@A'},
    'ps@A': {'linguistic_type_ref': 'ps', 'parent_ref': 'mb@A'},
    'lxid@A': {'linguistic_type_ref': 'lxid', 'parent_ref': 'mb@A'},
    'so@A': {'linguistic_type_ref': 'so', 'parent_ref': 'mb@A'},
    'nt@A': {'linguistic_type_ref': 'nt', 'parent_ref': 'ref@A'},
    'orig@A': {'linguistic_type_ref': 'orig', 'parent_ref': 'tx@A'},
    'fta@A': {'linguistic_type_ref': 'translation', 'parent_ref': 'tx@A'},
})


# In[14]:


for tier in TIERS.values():
    pprint(tier.as_xml(), end='')


# ## Build document

# In[15]:


SENTENCE_TIERS = (TIERS['ref@A'].id_,
                  TIERS['tx@A'].id_,
                  TIERS['fte@A'].id_)

TIME_TICK_MILIS = 10_000


# In[16]:


def make_document(basename: str, sentences, *,
                  language_code: str = 'en',
                  country_code: str = 'US',
                  sentence_tiers: Sequence[str] = SENTENCE_TIERS,
                  time_tick: int = TIME_TICK_MILIS,
                  **kwargs) -> etree._ElementTree:
    """Return a new ELAN document from the given (text, translation) pairs.

    ``basename`` prefixes the per-sentence reference labels, ``time_tick`` is
    the length of the time interval allotted to each sentence, and remaining
    keyword arguments are forwarded to ``make_document_skeleton()``.

    Children are appended to the root in this order: header (from the
    skeleton), time order, all tiers, linguistic types, locale, constraints.
    """
    doc = make_document_skeleton(**kwargs)
    root = doc.getroot()

    tiers = {id_: t.as_xml() for id_, t in TIERS.items()}
    assert all(t in tiers for t in sentence_tiers)
    # Tiers that receive annotations, in ``TIERS`` order
    # (not in the order given by ``sentence_tiers``).
    annotation_tiers = [elem for id_, elem in tiers.items() if id_ in sentence_tiers]

    time_order = E.time_order()
    for time_slots, annotations in iterannotations(basename, sentences,
                                                   time_tick=time_tick):
        time_order.extend(time_slots)
        for tier, elem in zip(annotation_tiers, annotations):
            tier.append(elem)

    root.append(time_order)
    root.extend(tiers.values())

    # The annotation id that comes last in document order; the single-target
    # unpacking raises ``ValueError`` if there is no annotation at all.
    last_id, = root.xpath('(tier/annotation/*[self::ref_annotation or self::alignable_annotation]/@annotation_id)[last()]')
    last_id = int(last_id.removeprefix('a'))
    root.find('header').append(E.property(str(last_id), name='lastUsedAnnotationId'))

    root.extend(linguistic_type.as_xml() for linguistic_type in LINGUISTIC_TYPES.values())
    root.append(E.locale(language_code=language_code, country_code=country_code))
    root.extend(const.as_xml() for const in Constraint)

    return doc


def iterannotations(basename: str, sentences, *, time_tick: int):
    """Yield ``(time_slots, annotations)`` for each (text, translation) pair.

    ``time_slots`` is a list of four ``time_slot`` elements (two at the start
    time followed by two at the end time of the sentence). ``annotations`` is
    a list of three ``annotation`` elements: an alignable one holding the
    reference label ``'<basename>.<NNN>'``, an alignable one holding the text,
    and a reference annotation holding the translation that points at the
    text annotation.

    Annotation ids (``a1``, ``a2``, ...) and time slot ids (``t1``, ``t2``,
    ...) are numbered consecutively across all sentences.
    """
    # Materialize once: the input is validated and then enumerated, which
    # would exhaust a one-shot iterator during validation.
    sentences = list(sentences)
    assert set(map(len, sentences)) == {2}

    annotation_ids = (f'a{i}' for i in itertools.count(1))
    time_slot_ids = (f't{i}' for i in itertools.count(1))

    def iterchildren(start_end_value, fte_value):
        """Yield the time-aligned annotations, then the translation annotation."""
        for start, end, value in start_end_value:
            ann = E.alignable_annotation(E.annotation_value(value),
                                         annotation_id=next(annotation_ids),
                                         time_slot_ref1=start.attrib['time_slot_id'],
                                         time_slot_ref2=end.attrib['time_slot_id'])
            yield E.annotation(ann)
        # ``ann`` is the last alignable annotation (the text), which the
        # translation refers to.
        yield E.annotation(E.ref_annotation(E.annotation_value(fte_value),
                                            annotation_id=next(annotation_ids),
                                            annotation_ref=ann.attrib['annotation_id']))

    for sentence_index, (tx_value, fte_value) in enumerate(sentences):
        start_time = time_tick * sentence_index
        end_time = time_tick * (sentence_index + 1)

        align_values = [f'{basename}.{sentence_index + 1:03d}', tx_value]

        # One start slot and one end slot per aligned value; all start slots
        # are numbered before the end slots.
        start = [E.time_slot(time_slot_id=next(time_slot_ids),
                             time_value=str(start_time))
                 for _ in align_values]
        end = [E.time_slot(time_slot_id=next(time_slot_ids),
                           time_value=str(end_time))
               for _ in align_values]

        children = iterchildren(zip(start, end, align_values), fte_value)
        yield start + end, list(children)


# In[17]:


TEST_SENTENCES = [('My Hovercraft is full of eels.', 'Can I please buy some matches?'),
                  ('Please fondle my buttocks.', 'Can you direct me to the station?')]


# In[18]:


pprint(make_document('test_sentences', TEST_SENTENCES))


# ## Write `.eaf` file

# In[19]:


TEST_TARGET = pathlib.Path('test_sentences').with_suffix(ELAN_SUFFIX)

print(write(TEST_TARGET, make_document(TEST_TARGET.stem, TEST_SENTENCES)))


# In[20]:


print(f'{TEST_TARGET.stat().st_size:_d}', 'bytes')