#!/usr/bin/env python
# coding: utf-8

# # Make a skeleton ELAN document from (text, translation) pairs
# https://nbviewer.org/gist/xflr6/452ce3b7be31e46e1cad32146af33d15/MakeElanSentences.ipynb

# In[1]:


from __future__ import annotations

import dataclasses
import datetime
import enum
import itertools
import os
import pathlib
from collections.abc import Sequence
from typing import Optional

import lxml.builder
from lxml import etree

# XML Schema instance namespace; bound to the ``xsi`` prefix of generated elements.
XSI = 'http://www.w3.org/2001/XMLSchema-instance'
# The same namespace in Clark notation (``{uri}``), used to prefix qualified attribute names.
XSI_CLARK = '{%s}' % XSI

# Value of the root element's ``xsi:noNamespaceSchemaLocation`` attribute.
ELAN_SCHEMA = 'http://www.mpi.nl/tools/elan/EAFv3.0.xsd'
# Text of the header property named ``URN``.
ELAN_URN = 'urn:nl-mpi-tools-elan-eaf:315e945c-5e96-4492-9605-fbfe3487b506'
# Default for the ``format`` and ``version`` arguments of ``make_document_skeleton()``.
ELAN_VERSION = '3.0'
# File suffix applied to the path of the written test document.
ELAN_SUFFIX = '.eaf'

# Default encoding used by ``write()``.
ENCODING = 'utf-8'

# Element factory: ``E.<tag>(*children, **attributes)`` creates an element named ``<tag>``.
E = lxml.builder.ElementMaker(nsmap={'xsi': XSI})


# ## Build document root and header

# In[2]:


def make_document_skeleton(*, author: str = '', media_file: str = '',
                           time_units: str = 'milliseconds',
                           format: str = ELAN_VERSION, version: str = ELAN_VERSION,
                           date: Optional[datetime.datetime] = None) -> etree._ElementTree:
    """Create a new annotation document from the given arguments.

    The returned tree consists of an ``annotation_document`` root with the
    ``author``, ``date``, ``format``, and ``version`` attributes plus the
    ``xsi:noNamespaceSchemaLocation`` reference to ``ELAN_SCHEMA``, and a
    single ``header`` child that holds the ``URN`` property.

    Args:
        author: Value of the root ``author`` attribute.
        media_file: Value of the header ``media_file`` attribute.
        time_units: Value of the header ``time_units`` attribute.
        format: Value of the root ``format`` attribute.
        version: Value of the root ``version`` attribute.
        date: Timestamp for the root ``date`` attribute; defaults to the
            current time. Microseconds are dropped and the value is
            converted with ``astimezone()`` before ISO formatting.

    Returns:
        A new element tree wrapping the ``annotation_document`` root.
    """
    if date is None:
        date = datetime.datetime.now(datetime.timezone.utc)
    date_attr = date.replace(microsecond=0).astimezone().isoformat()

    # The schema location is a namespace-qualified attribute name, which is
    # not a valid Python keyword, so it is passed via dict unpacking.
    schema_attrs = {f'{XSI_CLARK}noNamespaceSchemaLocation': ELAN_SCHEMA}
    root = E.annotation_document(author=author,
                                 date=date_attr,
                                 format=format,
                                 version=version,  # was ``format``: ``version`` was ignored
                                 **schema_attrs)
    header = E.header(E.property(ELAN_URN, name='URN'),
                      media_file=media_file, time_units=time_units)
    root.append(header)
    return etree.ElementTree(root)


# In[3]:


# Build a skeleton with all defaults (as a bare expression the result is discarded in a script).
make_document_skeleton()


# ## Pretty-print document

# In[4]:


def pprint(doc: etree._ElementTree, *, file=None,
           canonical: bool = False,
           pretty_print: bool = True,
           end: str = '\n',
           **kwargs) -> None:
    """Print the XML serialization of the given document.

    With ``canonical`` the document is passed through ``canonicalized()``
    first. ``file`` and ``end`` are handed to ``print()``; ``pretty_print``
    and any extra keyword arguments go to ``etree.tostring()``.
    """
    target = canonicalized(doc) if canonical else doc
    serialized = etree.tostring(target,
                                pretty_print=pretty_print,
                                encoding='unicode',
                                **kwargs)
    print(serialized, end=end, file=file)

    
def canonicalized(doc: etree._ElementTree, *,
                  indent: str = ' ' * 4) -> etree._ElementTree:
    """Return a re-formatted copy of the annotation document.

    In the copy every element tag is upper-cased, every attribute name is
    upper-cased unless it lives in the ``xsi`` namespace, the attributes of
    each element are re-inserted in sorted name order, and the tree is
    indented with ``indent`` per level.
    """
    # Round-trip through the serialization so the tree passed in is left untouched.
    serialized = etree.tostring(doc)
    result = etree.ElementTree(etree.fromstring(serialized))
    etree.indent(result, space=indent)

    for element in result.iter('*'):
        element.tag = element.tag.upper()
        attributes = element.attrib
        # ``sorted()`` snapshots the items, so the mapping can be modified in the loop.
        for name, value in sorted(attributes.items()):
            if name.startswith(XSI_CLARK):
                new_name = name
            else:
                new_name = name.upper()
            # Delete and re-insert so the attribute ends up in sorted position.
            del attributes[name]
            attributes[new_name] = value
    return result


# In[5]:


# Print a fresh skeleton twice: once as built, once in canonicalized form.
for kwargs in [{}, {'canonical': True}]:
    pprint(make_document_skeleton(), **kwargs, end='')


# ## Serialize document

# In[6]:


def write(path: os.PathLike | str, doc: etree._ElementTree, *,
          canonical: bool = True,
          pretty_print: bool = True,
          xml_declaration: bool = True,
          encoding: str = ENCODING) -> pathlib.Path:
    """Serialize the given document to ``path`` and return it as ``pathlib.Path``.

    With ``canonical`` (the default) the document is passed through
    ``canonicalized()`` before writing. The remaining keyword arguments are
    forwarded to the tree's ``write()`` method.
    """
    target = pathlib.Path(path)
    tree = canonicalized(doc) if canonical else doc
    tree.write(target,
               encoding=encoding,
               pretty_print=pretty_print,
               xml_declaration=xml_declaration)
    return target


# ## Define linguistic type constraints

# In[7]:


class Constraint(enum.Enum):
    """Stereotypes usable as `constraints` of a linguistic type.

    The member name is written as the ``stereotype`` attribute and the member
    value as the ``description`` attribute of the ``constraint`` element.
    """

    Time_Subdivision = "Time subdivision of parent annotation's time interval, no time gaps allowed within this interval"
    # NOTE(review): 'refering' is presumably the spelling ELAN itself uses; the
    # string is emitted at runtime, so confirm before correcting it.
    Symbolic_Subdivision = 'Symbolic subdivision of a parent annotation. Annotations refering to the same parent are ordered'
    Symbolic_Association = '1-1 association with a parent annotation'
    Included_In = "Time alignable annotations within the parent annotation's time interval, gaps are allowed"

    def as_xml(self) -> etree._Element:
        """Return the ``constraint`` element describing this member."""
        attrs = {'stereotype': self.name, 'description': self.value}
        return E.constraint(**attrs)


# In[8]:


# Print the XML element of every constraint.
for const in Constraint:
    pprint(const.as_xml(), end='')


# ## Define linguistic types

# In[9]:


@dataclasses.dataclass
class LinguisticType:
    """Definition of a linguistic type that tiers point to via `linguistic_type_ref`."""

    # Written as the ``linguistic_type_id`` attribute.
    id_: str
    # Written as ``'true'``/``'false'`` in the ``time_alignable`` attribute.
    time_alignable: bool = False
    # Written as ``'true'``/``'false'`` in the ``graphic_references`` attribute.
    graphic_references: bool = False
    # When set, its member name is written as the ``constraints`` attribute.
    constraints: Optional[Constraint] = None

    @classmethod
    def make_dict(cls, types) -> dict[str, LinguisticType]:
        """Build one instance per ``{id: field keyword arguments}`` item, keyed by id."""
        result = {}
        for type_id, fields in types.items():
            result[type_id] = cls(id_=type_id, **fields)
        return result

    @staticmethod
    def _xml_bool(flag) -> str:
        """Return the attribute text for a truth value."""
        return 'true' if flag else 'false'

    def as_xml(self) -> etree._Element:
        """Return the ``linguistic_type`` element for this type."""
        attrs = {'linguistic_type_id': self.id_}
        attrs['time_alignable'] = self._xml_bool(self.time_alignable)
        # ``constraints`` is only written when one is set; it goes before
        # ``graphic_references`` to keep the attribute order.
        if self.constraints:
            attrs['constraints'] = self.constraints.name
        attrs['graphic_references'] = self._xml_bool(self.graphic_references)
        return E.linguistic_type(**attrs)


# In[10]:


# Linguistic types by id; all of them are added to each document, in this order.
LINGUISTIC_TYPES = LinguisticType.make_dict({
    'default-lt': {'time_alignable': True},
    'translation': {'constraints': Constraint.Symbolic_Association},
    'transcription': {'time_alignable': True},
    'orth': {'time_alignable': True},
    'ref': {'time_alignable': True},
    'tx': {'constraints': Constraint.Included_In, 'time_alignable': True},
    'mb': {'constraints': Constraint.Symbolic_Subdivision},
    'orig': {'constraints': Constraint.Symbolic_Association},
    'ge': {'constraints': Constraint.Symbolic_Association},
    'ps': {'constraints': Constraint.Symbolic_Association},
    'so': {'constraints': Constraint.Symbolic_Association},
    'lxid': {'constraints': Constraint.Symbolic_Association},
    'fte': {'constraints': Constraint.Symbolic_Association},
    'nt': {'constraints': Constraint.Symbolic_Association},
    'imported-sep': {'time_alignable': True},
})


# In[11]:


# Print the XML element of every linguistic type.
for lt in LINGUISTIC_TYPES.values():
    pprint(lt.as_xml(), end='')


# ## Define tiers

# In[12]:


@dataclasses.dataclass
class Tier:
    """Definition of a tier, i.e. a container for annotations of one linguistic type.

    A tier can be referred to by the `parent_ref` of another tier.
    """

    # Written as the ``tier_id`` attribute.
    id_: str
    # Key into ``LINGUISTIC_TYPES``.
    linguistic_type_ref: 'str'
    # Written as the ``parent_ref`` attribute when set.
    parent_ref: Optional[str] = None
    # Written as the ``default_locale`` attribute.
    default_locale: str = 'en'

    @classmethod
    def make_dict(cls, tiers) -> dict[str, Tier]:
        """Build one instance per ``{id: field keyword arguments}`` item, keyed by id."""
        result = {}
        for tier_id, fields in tiers.items():
            result[tier_id] = cls(id_=tier_id, **fields)
        return result

    @property
    def linguistic_type(self):
        """The ``LINGUISTIC_TYPES`` entry this tier refers to (``KeyError`` if unknown)."""
        type_id = self.linguistic_type_ref
        return LINGUISTIC_TYPES[type_id]

    def as_xml(self) -> etree._Element:
        """Return the (still empty) ``tier`` element for this tier."""
        # ``parent_ref`` is only written when set; it goes before
        # ``default_locale`` to keep the attribute order.
        parent = {'parent_ref': self.parent_ref} if self.parent_ref else {}
        return E.tier(tier_id=self.id_,
                      linguistic_type_ref=self.linguistic_type.id_,
                      **parent,
                      default_locale=self.default_locale)


# In[13]:


# Tiers by id; all of them are added to each document, in this order.
TIERS = Tier.make_dict({
    'ref@A': {'linguistic_type_ref': 'ref'},
    'tx@A': {'linguistic_type_ref': 'tx', 'parent_ref': 'ref@A'},
    'fte@A': {'linguistic_type_ref': 'translation', 'parent_ref': 'tx@A'},
    'mb@A': {'linguistic_type_ref': 'mb', 'parent_ref': 'tx@A'},
    'ge@A': {'linguistic_type_ref': 'ge', 'parent_ref': 'mb@A'},
    'ps@A': {'linguistic_type_ref': 'ps', 'parent_ref': 'mb@A'},
    'lxid@A': {'linguistic_type_ref': 'lxid', 'parent_ref': 'mb@A'},
    'so@A': {'linguistic_type_ref': 'so', 'parent_ref': 'mb@A'},
    'nt@A': {'linguistic_type_ref': 'nt', 'parent_ref': 'ref@A'},
    'orig@A': {'linguistic_type_ref': 'orig', 'parent_ref': 'tx@A'},
    'fta@A': {'linguistic_type_ref': 'translation', 'parent_ref': 'tx@A'},
})


# In[14]:


# Print the XML element of every tier.
for tier in TIERS.values():
    pprint(tier.as_xml(), end='')


# ## Build document

# In[15]:


# Default ``sentence_tiers`` of ``make_document()``: ids of the tiers that receive the annotations.
SENTENCE_TIERS = (TIERS['ref@A'].id_, TIERS['tx@A'].id_, TIERS['fte@A'].id_)

# Default ``time_tick`` of ``make_document()``: multiplied by the sentence index to get time slot values.
TIME_TICK_MILIS = 10_000


# In[16]:


def make_document(basename: str, sentences, *,
                  language_code: str = 'en', country_code: str = 'US',
                  sentence_tiers: Sequence[str] = SENTENCE_TIERS,
                  time_tick: int = TIME_TICK_MILIS,
                  **kwargs) -> etree._ElementTree:
    """Return a new ELAN document from the given (text, translation) pairs.

    The root receives, in this order: the header (from the skeleton), the
    ``time_order`` element, one element per entry of ``TIERS``, one per entry
    of ``LINGUISTIC_TYPES``, a ``locale`` element, and one element per
    ``Constraint`` member.

    Args:
        basename: Passed to ``iterannotations()``.
        sentences: (text, translation) pairs, passed to ``iterannotations()``.
        language_code: ``language_code`` attribute of the ``locale`` element.
        country_code: ``country_code`` attribute of the ``locale`` element.
        sentence_tiers: Ids of the tiers that receive annotations; each must
            be a key of ``TIERS``. The tiers are selected by membership and
            filled in ``TIERS`` order with the annotations that
            ``iterannotations()`` yields per sentence.
        time_tick: Passed to ``iterannotations()``.
        **kwargs: Passed to ``make_document_skeleton()``.

    Returns:
        The element tree of the new document.
    """
    doc = make_document_skeleton(**kwargs)
    root = doc.getroot()

    tiers = {id_: t.as_xml() for id_, t in TIERS.items()}
    unknown = [t for t in sentence_tiers if t not in tiers]
    assert not unknown, f'unknown sentence tier(s): {unknown!r}'
    annotation_tiers = [elem for id_, elem in tiers.items() if id_ in sentence_tiers]

    time_order = E.time_order()
    for time_slots, annotations in iterannotations(basename, sentences, time_tick=time_tick):
        time_order.extend(time_slots)
        for tier, elem in zip(annotation_tiers, annotations):
            tier.append(elem)
    root.append(time_order)
    root.extend(tiers.values())

    # Use the highest numeric id, so this does not rely on the order in which
    # annotations were serialized. A document without any annotation (e.g.
    # empty ``sentence_tiers``) gets 0 instead of failing to unpack an empty
    # XPath result.
    annotation_ids = root.xpath('tier/annotation/*[self::ref_annotation or self::alignable_annotation]'
                                '/@annotation_id')
    last_id = max((int(id_.removeprefix('a')) for id_ in annotation_ids), default=0)
    root.find('header').append(E.property(str(last_id), name='lastUsedAnnotationId'))

    root.extend(linguistic_type.as_xml() for linguistic_type in LINGUISTIC_TYPES.values())
    root.append(E.locale(language_code=language_code, country_code=country_code))
    root.extend(const.as_xml() for const in Constraint)
    return doc


def iterannotations(basename: str, sentences, *, time_tick: int):
    """Yield ``(time_slots, annotations)`` for each (text, translation) pair.

    Per sentence this yields a list of four ``time_slot`` elements (two start
    slots followed by two end slots) and a list of three ``annotation``
    elements:

    1. an alignable annotation valued ``f'{basename}.{number:03d}'`` with
       ``number`` counting sentences from 1,
    2. an alignable annotation valued with the sentence text,
    3. a reference annotation valued with the translation, referring to the
       annotation of the text.

    Time slot ids (``t1``, ``t2``, ...) and annotation ids (``a1``, ``a2``,
    ...) are numbered consecutively across all sentences. The sentence at
    index ``i`` starts at ``time_tick * i`` and ends at ``time_tick * (i + 1)``.

    Args:
        basename: Prefix of the value of the first annotation per sentence.
        sentences: Iterable of (text, translation) pairs; every item must
            have length 2 (an empty iterable fails this check as well).
        time_tick: Distance between start and end time of a sentence.
    """
    # Materialize first: the check below iterates over all sentences, which
    # would exhaust a one-shot iterable and leave nothing to annotate.
    sentences = list(sentences)
    assert set(map(len, sentences)) == {2}

    annotation_ids = (f'a{i}' for i in itertools.count(1))
    time_slot_ids = (f't{i}' for i in itertools.count(1))

    for sentence_index, (tx_value, fte_value) in enumerate(sentences):
        start_time = time_tick * sentence_index
        end_time = time_tick * (sentence_index + 1)

        align_values = [f'{basename}.{sentence_index + 1:03d}', tx_value]

        # One start and one end slot per alignable annotation.
        start = [E.time_slot(time_slot_id=next(time_slot_ids), time_value=str(start_time))
                 for _ in align_values]
        end = [E.time_slot(time_slot_id=next(time_slot_ids), time_value=str(end_time))
               for _ in align_values]

        annotations = []
        for start_slot, end_slot, value in zip(start, end, align_values):
            ann = E.alignable_annotation(E.annotation_value(value),
                                         annotation_id=next(annotation_ids),
                                         time_slot_ref1=start_slot.attrib['time_slot_id'],
                                         time_slot_ref2=end_slot.attrib['time_slot_id'])
            annotations.append(E.annotation(ann))
        # ``ann`` is now the annotation of the text, the parent of the translation.
        translation = E.ref_annotation(E.annotation_value(fte_value),
                                       annotation_id=next(annotation_ids),
                                       annotation_ref=ann.attrib['annotation_id'])
        annotations.append(E.annotation(translation))

        yield start + end, annotations


# In[17]:


# Sample (text, translation) pairs used for the demo document.
TEST_SENTENCES = [('My Hovercraft is full of eels.', 'Can I please buy some matches?'),
                  ('Please fondle my buttocks.', 'Can you direct me to the station?')]


# In[18]:


# Build the demo document and print its (non-canonicalized) serialization.
pprint(make_document('test_sentences', TEST_SENTENCES))


# ## Write `.eaf` file

# In[19]:


# Target path: ``test_sentences`` with the ELAN suffix.
TEST_TARGET = pathlib.Path('test_sentences').with_suffix(ELAN_SUFFIX)

# Build the document with the target's stem as basename, write it, and print the returned path.
print(write(TEST_TARGET, make_document(TEST_TARGET.stem, TEST_SENTENCES)))


# In[20]:


# Report the size of the written file, with ``_`` as thousands separator.
print(f'{TEST_TARGET.stat().st_size:_d}', 'bytes')