Demonstrate basic workflow components for individual rows extracted from Stock Exchange images.
Install lxml for working with the PAGE format XML.
!pip install lxml
import copy
import os
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
s = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[ 502, 503, 504 ])
s.mount('https://', HTTPAdapter(max_retries=retries))
s.mount('http://', HTTPAdapter(max_retries=retries))
# Use JSON rather than XML
s.headers = {'Accept': 'application/json'}
Logging in saves a JSESSIONID cookie in the session, so it is sent automatically with all subsequent requests.
credentials = {
    'user': '[email protected]',
    'pw': 'YOUR_PASSWORD'  # Replace with your Transkribus password
}
# Login
response = s.post('https://transkribus.eu/TrpServer/rest/auth/login', data=credentials)
# Check that JSESSIONID has been set
response.cookies
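To fail fast when the login hasn't worked, you can check for the cookie explicitly. A minimal sketch, assuming 'JSESSIONID' is the cookie name set on a successful login (as shown above):
# Raise an error if no session cookie was returned
assert 'JSESSIONID' in response.cookies, 'Login failed -- check your credentials'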
# Get collections
response = s.get('https://transkribus.eu/TrpServer/rest/collections/list')
response.json()
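Each item in the list describes one collection. A sketch of pulling out ids and names, assuming the colId and colName fields of the Transkribus collection metadata:
# List the id and name of each collection you can access
for coll in response.json():
    print(coll['colId'], coll['colName'])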
new_collection = {
    'collName': 'api-test2'
}
coll_response = s.post('https://transkribus.eu/TrpServer/rest/collections/createCollection', params=new_collection)
coll_response.text
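Assuming the endpoint returns the new collection's id as plain text in the response body, you can convert it for use in later requests:
# The new collection's id, for use in subsequent calls
new_coll_id = int(coll_response.text)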
# Imports for building the PAGE XML
from lxml import etree
from PIL import Image
PAGE_NS = '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}'

def generate_xml_for_image(image_path):
    '''
    Prepares PAGE XML for upload, inserting the image filename and dimensions
    into the page_xml.xml template as required.
    '''
    with open('page_xml.xml', 'rb') as xml_file:
        template = etree.parse(xml_file)
    root = template.getroot()
    # Get the image's dimensions
    img = Image.open(image_path)
    w, h = img.size
    image_file = os.path.basename(image_path)
    # Record the image details on the Page element
    page = root.find(f'{PAGE_NS}Page')
    page.set('imageFilename', image_file)
    page.set('imageWidth', str(w))
    page.set('imageHeight', str(h))
    # Make the text region cover the whole image
    tr = page.find(f'{PAGE_NS}TextRegion')
    tr_coords = tr.find(f'{PAGE_NS}Coords')
    tr_coords.set('points', '0,0 0,{h} {w},{h} {w},0'.format(w=w, h=h))
    # Inset the text line 15px from each edge of the region
    tl = tr.find(f'{PAGE_NS}TextLine')
    tl_coords = tl.find(f'{PAGE_NS}Coords')
    tl_coords.set('points', '15,15 {w},15 {w},{h} 15,{h}'.format(w=w-15, h=h-15))
    # Name the file to match the pageXmlName expected by upload_doc() below
    with open('{}-1.xml'.format(image_file[:-4]), 'wb') as new_xml:
        new_xml.write(etree.tostring(template, pretty_print=True))
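A quick way to test the function is to run it against the sample image used later in this notebook; it should write N193-150_0428-col-2-14-1.xml to the working directory:
generate_xml_for_image('data/columns/rows-test/sample/N193-150_0428-col-2-14.jpg')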
# Basic template for upload data
doc_payload = {
    "md": {
        "title": "Test",
        "author": "Sydney Stock Exchange",
        "genre": "",
        "writer": ""
    },
    "pageList": {"pages": [
        {
            "fileName": "N193-150_0428-col-2-14.jpg",
            "pageXmlName": "N193-150_0428-col-2-14-1.xml",
            "pageNr": 1
        }
    ]}
}
def upload_doc(coll_id, image_path, doc_name='Test'):
    '''
    Uploads an image and its PAGE XML file to Transkribus.
    '''
    # Prepare the XML file
    generate_xml_for_image(image_path)
    image_file = os.path.basename(image_path)
    xml_file = '{}-1.xml'.format(image_file[:-4])
    # Deep copy the payload template so repeated calls don't mutate it
    payload = copy.deepcopy(doc_payload)
    payload['md']['title'] = doc_name
    payload['pageList']['pages'][0]['fileName'] = image_file
    payload['pageList']['pages'][0]['pageXmlName'] = xml_file
    # Post the metadata
    response = s.post('https://transkribus.eu/TrpServer/rest/uploads?collId={}'.format(coll_id), json=payload)
    # Get the upload id from the response to submit with the files
    upload_id = response.json()['uploadId']
    # Upload the image and XML files, closing them when done
    with open(image_path, 'rb') as img, open(xml_file, 'rb') as xml:
        files = {'img': img, 'xml': xml}
        response = s.put('https://transkribus.eu/TrpServer/rest/uploads/{}'.format(upload_id), files=files)
    return upload_id
# Once the upload is processed, the upload id doubles as the document id
doc_id = upload_doc(40099, 'data/columns/rows-test/sample/N193-150_0428-col-2-14.jpg')
params = {
    'id': doc_id,
    'pages': 1
}
# Start HTR recognition -- 40099 is the collection id, 133 the HTR model id
h = s.post('https://transkribus.eu/TrpServer/rest/recognition/{}/{}/htrCITlab'.format(40099, 133), params=params)
# The response body is the id of the recognition job
job_id = h.json()
# Check on the status of the job (a polling loop version is sketched below)
j = s.get('https://transkribus.eu/TrpServer/rest/jobs/{}'.format(job_id))
j.json()['success']
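In practice you'd poll the jobs endpoint until the recognition finishes. A minimal sketch, assuming the job JSON reports its progress in a 'state' field that reaches 'FINISHED', 'FAILED', or 'CANCELED' when the job is done:
import time
while True:
    j = s.get('https://transkribus.eu/TrpServer/rest/jobs/{}'.format(job_id))
    state = j.json()['state']
    print(state)
    # Stop polling once the job has reached a terminal state
    if state in ('FINISHED', 'FAILED', 'CANCELED'):
        break
    time.sleep(10)  # Wait before checking again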
# If the job succeeded, get the results
r = s.get('https://transkribus.eu/TrpServer/rest/collections/{}/{}/fulldoc'.format(40099, doc_id))
# Get the transcript url from the latest version of the first page
page = 0
docinfo = r.json()
xml_url = docinfo['pageList']['pages'][page]['tsList']['transcripts'][0]['url']
x = requests.get(xml_url)
x.text  # The transcription as PAGE XML
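To keep a copy of the results, you can write the response straight to disk (the filename here is arbitrary):
# Save the PAGE XML transcript to a file
with open('transcript.xml', 'w', encoding='utf-8') as f:
    f.write(x.text)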