#!/usr/bin/env python
# coding: utf-8

# # Sketch Around Transkribus Python API
#
# [Transkribus](https://transkribus.eu/Transkribus/) is a desktop-application-fronted tool for transcribing handwritten texts and generating machine learning models that can then be used to automatically transcribe such texts.
#
# As well as the desktop app, there's also a [simple python API](https://github.com/Transkribus/TranskribusPyClient) for the Transkribus web service [[docs](http://htmlpreview.github.io/?https://github.com/Transkribus/TranskribusPyClient/blob/master/src/TranskribusPyClient/client.html) and [wiki](https://github.com/Transkribus/TranskribusPyClient/wiki)].
#
# The wiki docs are primarily focused on calling the API from the command line.
#
# Whilst we can use CLI calls from `%bash` cells in a notebook, and in so doing document (or help automate) a workflow, I'm more interested in using the package in a Python scripting context.
#
# So what can we do with it?

# ## Installation
#
# At the moment, the package is not pip installable, so we need to download the repo and then import the required module from a file:

# In[ ]:


# Fetch and unpack the client repository (cell output is captured, not shown)
get_ipython().run_cell_magic('capture', '', '!wget https://github.com/Transkribus/TranskribusPyClient/archive/master.zip\n!unzip master.zip\n')


# In[1]:


# Step into the package source directory so the import resolves, then step back out
get_ipython().run_line_magic('cd', 'TranskribusPyClient-master/src')
from TranskribusPyClient.client import TranskribusClient
get_ipython().run_line_magic('cd', '../..')


# ## The `TranskribusClient`
#
# To get started with the Python Transkribus API client, we need to get an instance of it:

# In[2]:


t = TranskribusClient()


# The service that the client calls is an authenticated one, so we need to supply credentials for it:

# In[5]:


from getpass import getpass

# Prompt interactively so that credentials are never stored in the notebook
user = input('User: ')
pwd = getpass('Password: ')


# Logging in updates the state of the client:

# In[6]:


t.auth_login(user, pwd)


# In[7]:


#Review the methods available from the client
#dir(t)


# Many of the client calls require a collection ID,
# but I can't see how to request available collection IDs directly?
#
# We can, however, find collection ID values from any recent jobs we've run, such as jobs run via the desktop application:

# In[24]:


#We can find recent collection IDs from jobs...
jobs = t.getJobs()
# De-duplicate the collection IDs reported by those jobs that carry one
colIds = list({job['colId'] for job in jobs if 'colId' in job})
colIds_str = ', '.join(map(str, colIds))
print(f'Recent collectionIDs: {colIds_str}')


# We can then get a list of documents associated with a particular collection:

# In[32]:


#How do we get the collectionId?
docs = t.listDocsByCollectionId(colIds[0])

# One markdown section per document: ID heading, then title and page count
md = ''.join(
    f"\n### docId: {doc['docId']}\n{doc['title']}, {doc['nrOfPages']} pages\n"
    for doc in docs
)
print(md)


# ## Downloading Transkribus Data Structure XML
#
# We can download the XML metadata associated with a document's transcripts either as a parsed `lxml.etree` document (`bParse=True [default]`) or as a text string.

# In[71]:


col_id = colIds[0]
#doc_id = [d['docId'] for d in docs if d['title']=='English Handwriting 0.1'][0]
doc_id = 268888


# In[72]:


#Get XML for a doc
_xml_str = t.getDocByIdAsXml(col_id, doc_id, bParse=False)


# If we grab the text string, we can use the convenient `xmltodict` Python package to convert the XML text to a `dict`:

# In[73]:


#!pip3 install xmltodict


# In[74]:


import xmltodict

xml_dict = xmltodict.parse(_xml_str)
xml_dict.keys()


# The document comes in three parts:
#
# - metadata (`md`)
# - page data (`pageList`)
# - collection information (`collection`)

# In[75]:


xml_dict['trpDoc'].keys()


# In[76]:


xml_dict['trpDoc']['md']


# In[82]:


for key, value in xml_dict['trpDoc']['collection'].items():
    print(key, value)


# The `pageList` structure contains transcript information about each page:

# In[77]:


xml_dict['trpDoc']['pageList']['pages']['tsList']['transcripts'][0]


# ## Downloading Transkribus Documents
#
# We can download complete Transkribus documents given a collection and document ID.
#
# The complete document contains the coordinates for segmented text regions as well as the transcript for each region. Image downloads for each page are enabled by default (`bNoImage=False`).
#
# If the specified download folder does not exist, it is created. If it does exist and `bForce=False [default]`, an error is raised; if the argument is true, the directory is deleted and an empty one of the same name is recreated.

# In[10]:


download_dir = 'testDOwn'


# In[87]:


t.download_document(col_id, doc_id, download_dir)


# In[88]:


# List what the download produced
get_ipython().system('ls $download_dir')


# The `trp.json` file looks like it's the Transkribus data structure we could download as the XML file.
#
# *(It would be nice if `getDocByIdAsXml()` were refactored as `getDocById()` with a switch allowing for `xml|json` and the `bParse` flag, when set, returning the `etree` or a Python `dict` correspondingly.)*

# In[81]:


get_ipython().system('cat $download_dir/trp.json')


# The `png` file is an image file for the page:

# In[4]:


from IPython.display import Image

Image('testDOwn/HO-40-2_13_14.png')


# The `pxml` file contains the line segmentation and transcript information.

# ## Parsing the XML and Generating a Markdown Document With Individual Text Lines and Transcript
#
# The aim here is to see if we can take the XML from a document that has already been segmented and transcribed and process it in some way. For example, crop out each line from the image file and place it in a markdown document, along with the transcription of that line.

# We could represent the `pxml` file as an ordered Python dict using `xmltodict`, or we could parse it as an XML document.
# Let's do the latter for now:

# In[25]:


import xml.etree.ElementTree as ET


def transkribus_tree(fn):
    """Parse a Transkribus PAGE XML (`.pxml`) file into a namespace-free tree.

    Element tags are stripped of their XML namespace so that plain paths such
    as ``'Page/TextRegion/TextLine'`` can be used with ``find``/``findall``,
    whichever PAGE schema version the file declares.

    Args:
        fn: Path of the PAGE XML file to read.

    Returns:
        The root ``xml.etree.ElementTree.Element`` of the parsed document.

    Raises:
        OSError: If the file cannot be opened.
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    xsi_ns = '{http://www.w3.org/2001/XMLSchema-instance}'

    # Let the parser read the bytes itself so the encoding declared in the
    # XML prolog is honoured rather than the platform default.
    root = ET.parse(fn).getroot()

    #Clean the XML of namespace cruft
    for elem in root.iter():
        tag = elem.tag
        if isinstance(tag, str) and tag.startswith('{'):
            # '{namespace-uri}Local' -> 'Local'
            elem.tag = tag.partition('}')[2]

    # Drop the xsi:schemaLocation style attributes from the root as well
    for attr in [name for name in root.attrib if name.startswith(xsi_ns)]:
        del root.attrib[attr]

    return root


# In[28]:


# NOTE: the directory name matches `download_dir` ('testDOwn') exactly so that
# this also works on case-sensitive filesystems.
tree = transkribus_tree('testDOwn/HO-40-2_13_14.pxml')


# For each text region, which may contain one or more text lines, each with its own transcript, a complete transcript is also provided:

# In[165]:


get_ipython().system('head -n 200 testDOwn/HO-40-2_13_14.pxml | tail -n 30')


# We can parse this out as follows:

# In[164]:


# Use a dedicated loop name: `t` is the TranskribusClient instance and must
# not be overwritten here.
for region_text in tree.findall('Page/TextRegion/TextEquiv/Unicode'):
    print('...\n',region_text.text)


# We can also pull out the co-ordinates defining lines identified within each text region.
#
# Here's what the XML looks like:

# In[120]:


get_ipython().system('head -n 45 testDOwn/HO-40-2_13_14.pxml')


# So let's get some co-ordinates...

# In[181]:


for line in tree.findall('Page/TextRegion')[1].findall('TextLine')[3:6]:
    te = line.find('TextEquiv/Unicode')
    if te is not None:
        print(te.text, ' ::: ', line.find('Coords').attrib['points'],'\n')


# Using these co-ordinates, can we crop a region of the image corresponding to the text?
#
# How about trying with OpenCV?
# In[9]:


import numpy as np
import cv2

img = cv2.imread("testDOwn/HO-40-2_13_14.png")


# Preview the image:

# In[10]:


from matplotlib import pyplot as plt

# OpenCV loads colour images as BGR whereas matplotlib expects RGB, so swap
# the channel order for display.
plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
plt.title('Sample Page')
plt.show()


# Extract some co-ordinates:

# In[12]:


# The `points` attribute is a space separated list of "x,y" pairs
_pts = tree.findall('Page/TextRegion')[1].findall('TextLine')[4].find('Coords').attrib['points']
pts = np.array([[int(_p.split(',')[0]), int(_p.split(',')[1])] for _p in _pts.split()])
pts[:3]


# And then via a [handy little script](https://stackoverflow.com/a/48301735/454773) I found on Stack Overflow we can crop out the specified region:

# In[ ]:


import matplotlib as mpl


def getCroppedArea(img, pts, dpi=300):
    """Get a cropped area from an image.

    Crops the bounding rectangle of the polygon `pts` out of `img` and paints
    everything inside that rectangle but outside the polygon white.

    Via: https://stackoverflow.com/a/48301735/454773

    Args:
        img: Image array indexed as ``[row, column]``, e.g. as returned by
            ``cv2.imread``.
        pts: Polygon vertices as a NumPy integer array
            (assumed shape ``(N, 2)`` of ``x, y`` pairs -- TODO confirm).
        dpi: Value assigned to matplotlib's global ``figure.dpi`` rcParam.
            NOTE: the setting is left in place after the call and plays no
            part in the crop itself.

    Returns:
        A new array holding the cropped polygon on a white background.
    """
    mpl.rcParams['figure.dpi'] = dpi

    ## (1) Crop the bounding rect
    x, y, w, h = cv2.boundingRect(pts)
    cropped = img[y:y+h, x:x+w].copy()

    ## (2) make mask
    # Shift the polygon so it is expressed relative to the crop's origin
    pts = pts - pts.min(axis=0)
    mask = np.zeros(cropped.shape[:2], np.uint8)
    cv2.drawContours(mask, [pts], -1, (255, 255, 255), -1, cv2.LINE_AA)

    ## (3) do bit-op
    # Keep only the pixels inside the polygon; the rest become 0
    dst = cv2.bitwise_and(cropped, cropped, mask=mask)

    ## (4) add the white background
    # Start all white, then zero the masked (polygon) area so the sum below
    # leaves the polygon pixels untouched.
    bg = np.ones_like(cropped, np.uint8)*255
    cv2.bitwise_not(bg, bg, mask=mask)

    return bg + dst


# Let's see if it works...

# In[13]:


sentence = getCroppedArea(img, pts, 400)

plt.figure(figsize = (15,5))
plt.axis('off')
# Convert BGR (OpenCV) to RGB (matplotlib) so the colours are rendered correctly
plt.imshow(cv2.cvtColor(sentence, cv2.COLOR_BGR2RGB))

#Save the image
plt.savefig("test.png", dpi=400)


# We can also embed the saved image, of course...

# In[59]:


Image("test.png")


# This sets up the possibility of creating a markdown file, for example, that embeds single line images as well as transcripts. Using Jupytext, such a document can be edited in a notebook editor.
# In[27]:


#!pip3 install --upgrade tqdm
import os.path
import hashlib
import pathlib
from tqdm.notebook import tqdm


def markdownFromRegion(region, imgfile, fn='testout.md', outdir='test'):
    """Generate markdown page for Transkribus text region.

    For every ``TextLine`` in `region`, the line polygon is cropped out of
    the page image and saved as a PNG in `outdir`; a markdown file embedding
    each line image followed by its transcript is then written alongside.

    Args:
        region: ``TextRegion`` element (namespace-free PAGE XML) whose
            ``TextLine`` children are processed.
        imgfile: Path to the page image the line co-ordinates refer to.
        fn: Name of the markdown file to write inside `outdir`.
        outdir: Output directory; created (with parents) if missing.

    Raises:
        OSError: If `imgfile` cannot be read as an image.
    """
    #Make sure the path to the outdir exists...
    pathlib.Path(outdir).mkdir(parents=True, exist_ok=True)

    img = cv2.imread(imgfile)
    if img is None:
        # cv2.imread reports failure by returning None rather than raising
        raise OSError(f'Could not read image file: {imgfile!r}')

    md_parts = []
    #Provide a progress bar using tqdm
    for line in tqdm(region.findall('TextLine')):
        # The `points` attribute is a space separated list of "x,y" pairs
        _pts = line.find('Coords').attrib['points']
        pts = np.array([[int(_p.split(',')[0]), int(_p.split(',')[1])] for _p in _pts.split()])
        sentence = getCroppedArea(img, pts, 400)

        #Save the image
        # Name each crop after a hash of its polygon so reruns overwrite in place
        _img_uid = hashlib.md5(str(pts).encode()).hexdigest()
        ifn = f"img_{_img_uid}.png"
        cv2.imwrite(os.path.join(outdir, ifn), sentence)

        # A line may have no transcript element, or an empty one (text is None)
        _te = line.find('TextEquiv/Unicode')
        _txt = (_te.text or '') if _te is not None else ''
        md_parts.append(f'![{_img_uid}]({ifn})\n\n\n{_txt}\n\n\n')

    # Transcripts may contain non-ASCII characters, so fix the encoding
    with open(os.path.join(outdir, fn), 'w', encoding='utf-8') as f:
        f.write(''.join(md_parts))


# Running it seems to be pretty quick...

# In[30]:


# NOTE: the directory name matches `download_dir` ('testDOwn') exactly so that
# this also works on case-sensitive filesystems.
tree = transkribus_tree('testDOwn/HO-40-2_13_14.pxml')
markdownFromRegion(tree.findall('Page/TextRegion')[1], "testDOwn/HO-40-2_13_14.png")


# Here's an example of what the md raw, and rendered, looks like:

# In[32]:


Image("transkribus_testout_md.png")


# ## Working with a 1 or 2 page doc on Transkribus
#
# Let's see if we can create a collection containing a couple of one or two page PDFs, segment the text lines and retrieve the XML and JPG, then segment the JPG into a markdown document.
# In[ ]:


# Create a collection


# In[7]:


#Get one or two pages out of a PDF as a PDF


# In[8]:


# Submit the doc


# In[ ]:


# Segment the Lines


# In[9]:


# Get the XML and JPG Back


# In[ ]:


# Crop each line, saving image fragments


# In[ ]:


# Insert image link to each fragment into md along with transcription text


# ## Submitting Content Back to the Transkribus Server
#
# If we implemented a transcription client via a Jupyter extension, for example, we'd want to be able to submit the transcript back.
#
# The `postPageTranscript()` method allows us to do that. See the [docs](http://htmlpreview.github.io/?https://github.com/Transkribus/TranskribusPyClient/blob/master/src/TranskribusPyClient/client.html) for the XML format that we need to return.

# In[ ]: