# Source: https://nbviewer.org/gist/xflr6/452ce3b7be31e46e1cad32146af33d15/MakeElanSentences.ipynb
from __future__ import annotations

import dataclasses
import datetime
import enum
import itertools
import os
import pathlib
from typing import Optional
from typing import Sequence

import lxml.builder
from lxml import etree
# XML Schema instance namespace, plus the same URI wrapped in braces
# ('{uri}') for building namespace-qualified attribute names.
XSI = 'http://www.w3.org/2001/XMLSchema-instance'
XSI_CLARK = f'{{{XSI}}}'

# Schema location, URN property value and format version written into
# generated annotation documents.
ELAN_SCHEMA = 'http://www.mpi.nl/tools/elan/EAFv3.0.xsd'
ELAN_URN = 'urn:nl-mpi-tools-elan-eaf:315e945c-5e96-4492-9605-fbfe3487b506'
ELAN_VERSION = '3.0'

# File suffix and default encoding used when writing documents to disk.
ELAN_SUFFIX = '.eaf'
ENCODING = 'utf-8'
# Shared element factory for every XML element built in this file; its
# namespace map binds the `xsi` prefix to the XML Schema instance namespace.
E = lxml.builder.ElementMaker(nsmap={'xsi': XSI})
def make_document_skeleton(*, author: str = '', media_file: str = '',
                           time_units: str = 'milliseconds',
                           format: str = ELAN_VERSION, version: str = ELAN_VERSION,
                           date: Optional[datetime.datetime] = None) -> etree._ElementTree:
    """Create a new annotation document from the given arguments.

    The result contains only the ``annotation_document`` root element and
    its ``header`` (with the ``URN`` property).

    Args:
        author: Value of the root ``author`` attribute.
        media_file: Value of the header ``media_file`` attribute.
        time_units: Value of the header ``time_units`` attribute.
        format: Value of the root ``format`` attribute.
        version: Value of the root ``version`` attribute.
        date: Timestamp for the root ``date`` attribute; defaults to the
            current time.

    Returns:
        A new element tree wrapping the root element.
    """
    if date is None:
        date = datetime.datetime.now(datetime.timezone.utc)
    # Drop sub-second precision and render the timestamp in the local timezone.
    timestamp = date.replace(microsecond=0).astimezone().isoformat()

    # The namespace-qualified attribute name is not a valid Python keyword,
    # so it has to be passed through dict unpacking.
    kwargs = {f'{XSI_CLARK}noNamespaceSchemaLocation': ELAN_SCHEMA}
    root = E.annotation_document(author=author,
                                 date=timestamp,
                                 format=format,
                                 version=version,  # was `format`: the argument was ignored
                                 **kwargs)
    header = E.header(E.property(ELAN_URN, name='URN'),
                      media_file=media_file, time_units=time_units)
    root.append(header)
    return etree.ElementTree(root)
# Demo: build a skeleton with all defaults (the following line appears to be
# the captured repr of the returned tree).
make_document_skeleton()
<lxml.etree._ElementTree at 0x23266633c80>
def pprint(doc: etree._ElementTree, *, file=None,
           canonical: bool = False,
           pretty_print: bool = True,
           end: str = '\n',
           **kwargs) -> None:
    """Print the XML serialization of the given document.

    Args:
        doc: Document (or element) to serialize.
        file: Target stream passed on to ``print()``.
        canonical: Serialize the result of ``canonicalized(doc)`` instead
            of ``doc`` itself.
        pretty_print: Passed on to ``etree.tostring()``.
        end: Terminator passed on to ``print()``.
        **kwargs: Extra keyword arguments for ``etree.tostring()``.
    """
    source = canonicalized(doc) if canonical else doc
    serialized = etree.tostring(source, encoding='unicode',
                                pretty_print=pretty_print, **kwargs)
    print(serialized, end=end, file=file)
def canonicalized(doc: etree._ElementTree, *,
                  indent: str = ' ' * 4) -> etree._ElementTree:
    """Return a re-indented copy of the document with upper-cased names.

    The input is copied by a serialize/parse round trip and left untouched.
    In the copy every element tag is upper-cased, and each element's
    attributes are re-inserted in sorted key order with upper-cased names,
    except for names in the XSI namespace, which keep their spelling.
    """
    clone = etree.ElementTree(etree.fromstring(etree.tostring(doc)))
    etree.indent(clone, space=indent)
    for node in clone.iter('*'):
        node.tag = node.tag.upper()
        attributes = node.attrib
        # Iterate over a sorted snapshot because the mapping is mutated below.
        for name, value in sorted(attributes.items()):
            del attributes[name]
            if name.startswith(XSI_CLARK):
                attributes[name] = value
            else:
                attributes[name.upper()] = value
    return clone
# Demo: print the default skeleton once as built and once canonicalized.
for options in ({}, {'canonical': True}):
    pprint(make_document_skeleton(), end='', **options)
<annotation_document xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" author="" date="2022-09-18T22:40:54+02:00" format="3.0" version="3.0" xsi:noNamespaceSchemaLocation="http://www.mpi.nl/tools/elan/EAFv3.0.xsd"> <header media_file="" time_units="milliseconds"> <property name="URN">urn:nl-mpi-tools-elan-eaf:315e945c-5e96-4492-9605-fbfe3487b506</property> </header> </annotation_document> <ANNOTATION_DOCUMENT xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" AUTHOR="" DATE="2022-09-18T22:40:54+02:00" FORMAT="3.0" VERSION="3.0" xsi:noNamespaceSchemaLocation="http://www.mpi.nl/tools/elan/EAFv3.0.xsd"> <HEADER MEDIA_FILE="" TIME_UNITS="milliseconds"> <PROPERTY NAME="URN">urn:nl-mpi-tools-elan-eaf:315e945c-5e96-4492-9605-fbfe3487b506</PROPERTY> </HEADER> </ANNOTATION_DOCUMENT>
def write(path: os.PathLike | str, doc: etree._ElementTree, *,
          canonical: bool = True,
          pretty_print: bool = True,
          xml_declaration: bool = True,
          encoding: str = ENCODING) -> pathlib.Path:
    """(Over)write the given path with the XML serialization of the document.

    Args:
        path: Target file location.
        doc: Document to serialize.
        canonical: Write the result of ``canonicalized(doc)`` instead of
            ``doc`` itself.
        pretty_print: Passed on to the tree's ``write()`` method.
        xml_declaration: Passed on to the tree's ``write()`` method.
        encoding: Passed on to the tree's ``write()`` method.

    Returns:
        The target location as a ``pathlib.Path``.
    """
    target = pathlib.Path(path)
    tree = canonicalized(doc) if canonical else doc
    tree.write(target,
               xml_declaration=xml_declaration,
               pretty_print=pretty_print,
               encoding=encoding)
    return target
class Constraint(enum.Enum):
    """Possible values for `constraints` in a linguistic type.

    The member name is used as the `stereotype` attribute and the member
    value as the `description` attribute of the `constraint` element.
    """

    Time_Subdivision = ("Time subdivision of parent annotation's time interval, "
                        "no time gaps allowed within this interval")
    Symbolic_Subdivision = ('Symbolic subdivision of a parent annotation. '
                            'Annotations refering to the same parent are ordered')
    Symbolic_Association = '1-1 association with a parent annotation'
    Included_In = ("Time alignable annotations within the parent annotation's time interval, "
                   "gaps are allowed")

    def as_xml(self) -> etree._Element:
        """Return the `constraint` element describing this member."""
        attrs = {'stereotype': self.name, 'description': self.value}
        return E.constraint(**attrs)
# Demo: print the XML element of every constraint.
for constraint in Constraint:
    pprint(constraint.as_xml(), end='')
<constraint xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" stereotype="Time_Subdivision" description="Time subdivision of parent annotation's time interval, no time gaps allowed within this interval"/> <constraint xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" stereotype="Symbolic_Subdivision" description="Symbolic subdivision of a parent annotation. Annotations refering to the same parent are ordered"/> <constraint xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" stereotype="Symbolic_Association" description="1-1 association with a parent annotation"/> <constraint xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" stereotype="Included_In" description="Time alignable annotations within the parent annotation's time interval, gaps are allowed"/>
@dataclasses.dataclass
class LinguisticType:
    """Possible targets for `linguistic_type_ref` in a tier."""

    id_: str
    time_alignable: bool = False
    graphic_references: bool = False
    constraints: Optional[Constraint] = None

    @classmethod
    def make_dict(cls, types) -> dict[str, LinguisticType]:
        """Build instances from an ``{id: field keyword arguments}`` mapping.

        The result is keyed by id and keeps the order of the input mapping.
        """
        result = {}
        for type_id, options in types.items():
            result[type_id] = cls(id_=type_id, **options)
        return result

    def as_xml(self) -> etree._Element:
        """Return the `linguistic_type` element for this type.

        The `constraints` attribute is only emitted when a constraint is set.
        """
        xml_bool = ('false', 'true')  # indexed by truthiness
        attrs = {'linguistic_type_id': self.id_,
                 'time_alignable': xml_bool[bool(self.time_alignable)]}
        if self.constraints:
            attrs['constraints'] = self.constraints.name
        attrs['graphic_references'] = xml_bool[bool(self.graphic_references)]
        return E.linguistic_type(**attrs)
# Registry of the linguistic types known to this file, keyed by type id.
# Tiers look their type up here, and every entry is written into generated
# documents in this order.
LINGUISTIC_TYPES = LinguisticType.make_dict({'default-lt': {'time_alignable': True},
'translation': {'constraints': Constraint.Symbolic_Association},
'transcription': {'time_alignable': True},
'orth': {'time_alignable': True},
'ref': {'time_alignable': True},
'tx': {'constraints': Constraint.Included_In, 'time_alignable': True},
'mb': {'constraints': Constraint.Symbolic_Subdivision},
'orig': {'constraints': Constraint.Symbolic_Association},
'ge': {'constraints': Constraint.Symbolic_Association},
'ps': {'constraints': Constraint.Symbolic_Association},
'so': {'constraints': Constraint.Symbolic_Association},
'lxid': {'constraints': Constraint.Symbolic_Association},
'fte': {'constraints': Constraint.Symbolic_Association},
'nt': {'constraints': Constraint.Symbolic_Association},
'imported-sep': {'time_alignable': True}})
# Demo: print the XML element of every registered linguistic type.
for linguistic_type in LINGUISTIC_TYPES.values():
    pprint(linguistic_type.as_xml(), end='')
<linguistic_type xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" linguistic_type_id="default-lt" time_alignable="true" graphic_references="false"/> <linguistic_type xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" linguistic_type_id="translation" time_alignable="false" constraints="Symbolic_Association" graphic_references="false"/> <linguistic_type xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" linguistic_type_id="transcription" time_alignable="true" graphic_references="false"/> <linguistic_type xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" linguistic_type_id="orth" time_alignable="true" graphic_references="false"/> <linguistic_type xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" linguistic_type_id="ref" time_alignable="true" graphic_references="false"/> <linguistic_type xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" linguistic_type_id="tx" time_alignable="true" constraints="Included_In" graphic_references="false"/> <linguistic_type xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" linguistic_type_id="mb" time_alignable="false" constraints="Symbolic_Subdivision" graphic_references="false"/> <linguistic_type xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" linguistic_type_id="orig" time_alignable="false" constraints="Symbolic_Association" graphic_references="false"/> <linguistic_type xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" linguistic_type_id="ge" time_alignable="false" constraints="Symbolic_Association" graphic_references="false"/> <linguistic_type xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" linguistic_type_id="ps" time_alignable="false" constraints="Symbolic_Association" graphic_references="false"/> <linguistic_type xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" linguistic_type_id="so" time_alignable="false" constraints="Symbolic_Association" graphic_references="false"/> <linguistic_type xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" linguistic_type_id="lxid" 
time_alignable="false" constraints="Symbolic_Association" graphic_references="false"/> <linguistic_type xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" linguistic_type_id="fte" time_alignable="false" constraints="Symbolic_Association" graphic_references="false"/> <linguistic_type xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" linguistic_type_id="nt" time_alignable="false" constraints="Symbolic_Association" graphic_references="false"/> <linguistic_type xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" linguistic_type_id="imported-sep" time_alignable="true" graphic_references="false"/>
@dataclasses.dataclass
class Tier:
    """Container for annotations of one linguistic type.

    Possible target for `parent_ref` of a tier.
    """

    id_: str
    linguistic_type_ref: 'str'
    parent_ref: Optional[str] = None
    default_locale: str = 'en'

    @classmethod
    def make_dict(cls, tiers) -> dict[str, Tier]:
        """Build instances from an ``{id: field keyword arguments}`` mapping.

        The result is keyed by id and keeps the order of the input mapping.
        """
        result = {}
        for tier_id, options in tiers.items():
            result[tier_id] = cls(id_=tier_id, **options)
        return result

    @property
    def linguistic_type(self):
        """The ``LINGUISTIC_TYPES`` entry named by `linguistic_type_ref`."""
        return LINGUISTIC_TYPES[self.linguistic_type_ref]

    def as_xml(self) -> etree._Element:
        """Return the (empty) `tier` element for this tier.

        The `parent_ref` attribute is only emitted when a parent is set.
        """
        # Going through the registry makes an unknown type id fail here.
        pairs = [('tier_id', self.id_),
                 ('linguistic_type_ref', self.linguistic_type.id_)]
        if self.parent_ref:
            pairs.append(('parent_ref', self.parent_ref))
        pairs.append(('default_locale', self.default_locale))
        return E.tier(**dict(pairs))
# Registry of the tiers written into every generated document, keyed by
# tier id, in output order. Each entry names its linguistic type and,
# where applicable, its parent tier.
TIERS = Tier.make_dict({'ref@A': {'linguistic_type_ref': 'ref'},
'tx@A': {'linguistic_type_ref': 'tx', 'parent_ref': 'ref@A'},
'fte@A': {'linguistic_type_ref': 'translation', 'parent_ref': 'tx@A'},
'mb@A': {'linguistic_type_ref': 'mb', 'parent_ref': 'tx@A'},
'ge@A': {'linguistic_type_ref': 'ge', 'parent_ref': 'mb@A'},
'ps@A': {'linguistic_type_ref': 'ps', 'parent_ref': 'mb@A'},
'lxid@A': {'linguistic_type_ref': 'lxid', 'parent_ref': 'mb@A'},
'so@A': {'linguistic_type_ref': 'so', 'parent_ref': 'mb@A'},
'nt@A': {'linguistic_type_ref': 'nt', 'parent_ref': 'ref@A'},
'orig@A': {'linguistic_type_ref': 'orig', 'parent_ref': 'tx@A'},
'fta@A': {'linguistic_type_ref': 'translation', 'parent_ref': 'tx@A'}})
# Demo: print the XML element of every registered tier.
for tier_definition in TIERS.values():
    pprint(tier_definition.as_xml(), end='')
<tier xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" tier_id="ref@A" linguistic_type_ref="ref" default_locale="en"/> <tier xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" tier_id="tx@A" linguistic_type_ref="tx" parent_ref="ref@A" default_locale="en"/> <tier xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" tier_id="fte@A" linguistic_type_ref="translation" parent_ref="tx@A" default_locale="en"/> <tier xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" tier_id="mb@A" linguistic_type_ref="mb" parent_ref="tx@A" default_locale="en"/> <tier xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" tier_id="ge@A" linguistic_type_ref="ge" parent_ref="mb@A" default_locale="en"/> <tier xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" tier_id="ps@A" linguistic_type_ref="ps" parent_ref="mb@A" default_locale="en"/> <tier xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" tier_id="lxid@A" linguistic_type_ref="lxid" parent_ref="mb@A" default_locale="en"/> <tier xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" tier_id="so@A" linguistic_type_ref="so" parent_ref="mb@A" default_locale="en"/> <tier xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" tier_id="nt@A" linguistic_type_ref="nt" parent_ref="ref@A" default_locale="en"/> <tier xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" tier_id="orig@A" linguistic_type_ref="orig" parent_ref="tx@A" default_locale="en"/> <tier xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" tier_id="fta@A" linguistic_type_ref="translation" parent_ref="tx@A" default_locale="en"/>
# Ids of the tiers that receive the generated sentence annotations by default.
SENTENCE_TIERS = tuple(TIERS[tier_id].id_ for tier_id in ('ref@A', 'tx@A', 'fte@A'))

# Default time span allotted to each sentence (presumably milliseconds,
# going by the name and the default `time_units`).
TIME_TICK_MILIS = 10000
def make_document(basename: str, sentences, *,
                  language_code: str = 'en', country_code: str = 'US',
                  sentence_tiers: Sequence[str] = SENTENCE_TIERS,
                  time_tick: int = TIME_TICK_MILIS,
                  **kwargs) -> etree._ElementTree:
    """Return a new ELAN document from the given (text, translation) pairs.

    Args:
        basename: Prefix for the per-sentence reference labels.
        sentences: The (text, translation) pairs to annotate.
        language_code: `language_code` attribute of the `locale` element.
        country_code: `country_code` attribute of the `locale` element.
        sentence_tiers: Ids of the tiers that receive the annotations; all
            of them must be keys of ``TIERS``.
        time_tick: Time span allotted to each sentence.
        **kwargs: Passed on to ``make_document_skeleton()``.

    Returns:
        The document with header, time order, all tiers, linguistic types,
        locale and constraints.

    Raises:
        ValueError: If `sentence_tiers` names a tier that is not in ``TIERS``.
    """
    doc = make_document_skeleton(**kwargs)
    root = doc.getroot()

    tiers = {id_: t.as_xml() for id_, t in TIERS.items()}
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    unknown = [id_ for id_ in sentence_tiers if id_ not in tiers]
    if unknown:
        raise ValueError(f'unknown sentence tiers: {unknown!r}')
    # NOTE: target tiers are taken in TIERS order, not in sentence_tiers order.
    annotation_tiers = [elem for id_, elem in tiers.items() if id_ in sentence_tiers]

    time_order = E.time_order()
    for time_slots, annotations in iterannotations(basename, sentences, time_tick=time_tick):
        time_order.extend(time_slots)
        for tier, elem in zip(annotation_tiers, annotations):
            tier.append(elem)

    root.append(time_order)
    root.extend(tiers.values())

    # Record the highest numeric annotation id; 0 when nothing was annotated
    # (the previous single-item unpacking failed on an empty result).
    annotation_ids = root.xpath('tier/annotation/*[self::ref_annotation or self::alignable_annotation]'
                                '/@annotation_id')
    last_id = max((int(id_.removeprefix('a')) for id_ in annotation_ids), default=0)
    root.find('header').append(E.property(str(last_id), name='lastUsedAnnotationId'))

    root.extend(linguistic_type.as_xml() for linguistic_type in LINGUISTIC_TYPES.values())
    root.append(E.locale(language_code=language_code, country_code=country_code))
    root.extend(const.as_xml() for const in Constraint)
    return doc
def iterannotations(basename: str, sentences, *, time_tick: int):
    """Yield ``(time_slots, annotations)`` for each (text, translation) pair.

    Time slot ids (``t1``, ``t2``, ...) and annotation ids (``a1``, ``a2``,
    ...) are numbered consecutively across all sentences. Sentence ``i``
    (counting from 0) spans ``time_tick * i`` to ``time_tick * (i + 1)``.

    Args:
        basename: Prefix for the reference label, which is
            ``'<basename>.<3-digit sentence number>'``.
        sentences: Iterable of (text, translation) pairs.
        time_tick: Time span allotted to each sentence.

    Yields:
        A pair of lists: four `time_slot` elements (two start slots, then
        two end slots), and three `annotation` elements wrapping, in order,
        the aligned reference label, the aligned text, and a reference
        annotation holding the translation and pointing at the text.

    Raises:
        ValueError: If an item of `sentences` does not have exactly two
            elements.
    """
    # Materialize once so one-shot iterables survive validation; raise
    # explicitly because `assert` is stripped under `python -O`.
    sentences = list(sentences)
    for position, pair in enumerate(sentences):
        if len(pair) != 2:
            raise ValueError(f'sentence {position} is not a (text, translation) pair: {pair!r}')

    annotation_ids = (f'a{i}' for i in itertools.count(1))
    time_slot_ids = (f't{i}' for i in itertools.count(1))

    def make_time_slots(time_value: int, count: int) -> list:
        """Return `count` new time slots that share the same time value."""
        return [E.time_slot(time_slot_id=next(time_slot_ids), time_value=str(time_value))
                for _ in range(count)]

    for sentence_index, (tx_value, fte_value) in enumerate(sentences):
        start_time = time_tick * sentence_index
        end_time = time_tick * (sentence_index + 1)
        align_values = [f'{basename}.{sentence_index + 1:03d}', tx_value]

        # Every aligned annotation gets its own start and end slot.
        start = make_time_slots(start_time, len(align_values))
        end = make_time_slots(end_time, len(align_values))

        annotations = []
        for start_slot, end_slot, value in zip(start, end, align_values):
            aligned = E.alignable_annotation(E.annotation_value(value),
                                             annotation_id=next(annotation_ids),
                                             time_slot_ref1=start_slot.attrib['time_slot_id'],
                                             time_slot_ref2=end_slot.attrib['time_slot_id'])
            annotations.append(E.annotation(aligned))
        # The translation refers to the last aligned annotation (the text).
        translation = E.ref_annotation(E.annotation_value(fte_value),
                                       annotation_id=next(annotation_ids),
                                       annotation_ref=aligned.attrib['annotation_id'])
        annotations.append(E.annotation(translation))

        yield start + end, annotations
# Sample (text, translation) pairs used by the demo calls in this file.
TEST_SENTENCES = [('My Hovercraft is full of eels.', 'Can I please buy some matches?'),
('Please fondle my buttocks.', 'Can you direct me to the station?')]
# Demo: build a document from the sample pairs and print it as built
# (pprint() does not canonicalize by default).
pprint(make_document('test_sentences', TEST_SENTENCES))
<annotation_document xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" author="" date="2022-09-18T22:40:54+02:00" format="3.0" version="3.0" xsi:noNamespaceSchemaLocation="http://www.mpi.nl/tools/elan/EAFv3.0.xsd"> <header media_file="" time_units="milliseconds"> <property name="URN">urn:nl-mpi-tools-elan-eaf:315e945c-5e96-4492-9605-fbfe3487b506</property> <property name="lastUsedAnnotationId">6</property> </header> <time_order> <time_slot time_slot_id="t1" time_value="0"/> <time_slot time_slot_id="t2" time_value="0"/> <time_slot time_slot_id="t3" time_value="10000"/> <time_slot time_slot_id="t4" time_value="10000"/> <time_slot time_slot_id="t5" time_value="10000"/> <time_slot time_slot_id="t6" time_value="10000"/> <time_slot time_slot_id="t7" time_value="20000"/> <time_slot time_slot_id="t8" time_value="20000"/> </time_order> <tier tier_id="ref@A" linguistic_type_ref="ref" default_locale="en"> <annotation> <alignable_annotation annotation_id="a1" time_slot_ref1="t1" time_slot_ref2="t3"> <annotation_value>test_sentences.001</annotation_value> </alignable_annotation> </annotation> <annotation> <alignable_annotation annotation_id="a4" time_slot_ref1="t5" time_slot_ref2="t7"> <annotation_value>test_sentences.002</annotation_value> </alignable_annotation> </annotation> </tier> <tier tier_id="tx@A" linguistic_type_ref="tx" parent_ref="ref@A" default_locale="en"> <annotation> <alignable_annotation annotation_id="a2" time_slot_ref1="t2" time_slot_ref2="t4"> <annotation_value>My Hovercraft is full of eels.</annotation_value> </alignable_annotation> </annotation> <annotation> <alignable_annotation annotation_id="a5" time_slot_ref1="t6" time_slot_ref2="t8"> <annotation_value>Please fondle my buttocks.</annotation_value> </alignable_annotation> </annotation> </tier> <tier tier_id="fte@A" linguistic_type_ref="translation" parent_ref="tx@A" default_locale="en"> <annotation> <ref_annotation annotation_id="a3" annotation_ref="a2"> <annotation_value>Can I please buy some 
matches?</annotation_value> </ref_annotation> </annotation> <annotation> <ref_annotation annotation_id="a6" annotation_ref="a5"> <annotation_value>Can you direct me to the station?</annotation_value> </ref_annotation> </annotation> </tier> <tier tier_id="mb@A" linguistic_type_ref="mb" parent_ref="tx@A" default_locale="en"/> <tier tier_id="ge@A" linguistic_type_ref="ge" parent_ref="mb@A" default_locale="en"/> <tier tier_id="ps@A" linguistic_type_ref="ps" parent_ref="mb@A" default_locale="en"/> <tier tier_id="lxid@A" linguistic_type_ref="lxid" parent_ref="mb@A" default_locale="en"/> <tier tier_id="so@A" linguistic_type_ref="so" parent_ref="mb@A" default_locale="en"/> <tier tier_id="nt@A" linguistic_type_ref="nt" parent_ref="ref@A" default_locale="en"/> <tier tier_id="orig@A" linguistic_type_ref="orig" parent_ref="tx@A" default_locale="en"/> <tier tier_id="fta@A" linguistic_type_ref="translation" parent_ref="tx@A" default_locale="en"/> <linguistic_type linguistic_type_id="default-lt" time_alignable="true" graphic_references="false"/> <linguistic_type linguistic_type_id="translation" time_alignable="false" constraints="Symbolic_Association" graphic_references="false"/> <linguistic_type linguistic_type_id="transcription" time_alignable="true" graphic_references="false"/> <linguistic_type linguistic_type_id="orth" time_alignable="true" graphic_references="false"/> <linguistic_type linguistic_type_id="ref" time_alignable="true" graphic_references="false"/> <linguistic_type linguistic_type_id="tx" time_alignable="true" constraints="Included_In" graphic_references="false"/> <linguistic_type linguistic_type_id="mb" time_alignable="false" constraints="Symbolic_Subdivision" graphic_references="false"/> <linguistic_type linguistic_type_id="orig" time_alignable="false" constraints="Symbolic_Association" graphic_references="false"/> <linguistic_type linguistic_type_id="ge" time_alignable="false" constraints="Symbolic_Association" graphic_references="false"/> <linguistic_type 
linguistic_type_id="ps" time_alignable="false" constraints="Symbolic_Association" graphic_references="false"/> <linguistic_type linguistic_type_id="so" time_alignable="false" constraints="Symbolic_Association" graphic_references="false"/> <linguistic_type linguistic_type_id="lxid" time_alignable="false" constraints="Symbolic_Association" graphic_references="false"/> <linguistic_type linguistic_type_id="fte" time_alignable="false" constraints="Symbolic_Association" graphic_references="false"/> <linguistic_type linguistic_type_id="nt" time_alignable="false" constraints="Symbolic_Association" graphic_references="false"/> <linguistic_type linguistic_type_id="imported-sep" time_alignable="true" graphic_references="false"/> <locale language_code="en" country_code="US"/> <constraint stereotype="Time_Subdivision" description="Time subdivision of parent annotation's time interval, no time gaps allowed within this interval"/> <constraint stereotype="Symbolic_Subdivision" description="Symbolic subdivision of a parent annotation. Annotations refering to the same parent are ordered"/> <constraint stereotype="Symbolic_Association" description="1-1 association with a parent annotation"/> <constraint stereotype="Included_In" description="Time alignable annotations within the parent annotation's time interval, gaps are allowed"/> </annotation_document>
# .eaf file
TEST_TARGET = pathlib.Path('test_sentences').with_suffix(ELAN_SUFFIX)
# Demo: write the sample document to TEST_TARGET (canonicalized, the
# default for write()) and print the path that was written.
print(write(TEST_TARGET, make_document(TEST_TARGET.stem, TEST_SENTENCES)))
test_sentences.eaf
# Demo: report the size of the written file, with `_` as thousands separator.
print(f'{TEST_TARGET.stat().st_size:_d}', 'bytes')
5_954 bytes