On the advice of Martin Worthington I visited the Cambridge Cuneify+ page.
On that page there is a link to a Würzburg page on Cuneiform fonts with a download link to Old Babylonian Fonts, containing the Santakku(M) fonts and a sign list in PDF.
I extracted the text from that PDF and sanitized it to one table cell per line with the text editor Vim. The resulting file is the source for this notebook, which tries to restore the original table in a tab-separated format.
The PDF is in the docs directory of this repo.
The sanitized text file is the file sources/writing/Santakku.txt in this repository.
While the text extraction and sanitizing went reasonably well, there are problems with empty cells.
The table is seven columns wide, but there are not seven lines per row in the text file due to missing cells.
Yet we can align by means of the typical values in the cells (unicode code points, characters, small numbers).
Sometimes the values are also missing.
We ignore the values in the Santakku columns and also the value, so we do not suffer much from this problem.
We just extract these columns:
Unicode, i.e. unicode code point
signe, i.e. grapheme
Autotext, i.e. reading

import os
import re
# Location of the sanitized sign list inside the local clone of the repository.
BASE = os.path.expanduser("~/github")
ORG = "Nino-cunei"
REPO = "oldbabylonian"
REPO_DIR = "/".join((BASE, ORG, REPO))
SRC = REPO_DIR + "/sources/writing/Santakku.txt"

# Bounds of a code point range; presumably they bracket the cuneiform
# characters (TODO confirm) - the code visible here does not use them.
CUNEI_START = 0x12000
CUNEI_END = 0x13000

# A candidate "unicode" cell: optional leading whitespace, then one or more
# groups of exactly five hex digits, each optionally followed by spaces
# and/or plus signs, and nothing else on the line.
uniCandRe = re.compile(r"""^\s*([0-9A-Fa-f]{5}[ +]*)+$""")
# NOTE: the code below does not work yet, because it does not correctly
# recognize all unicode code point strings, e.g. "140 12363"
def makeMapping():
    """Build a mapping from unicode code points to (grapheme, reading).

    Reads the sign list at ``SRC``, which holds one table cell per line.
    A line that matches ``uniCandRe`` starts a new table row; the lines that
    follow are collected as the remaining cells of that row, the first of
    them being the grapheme.  When the next row starts (and once more at the
    end of the file) the collected row is validated and stored.

    Processing stops at the first problem, which is printed together with the
    cells collected so far (last cell first).  A summary is printed at the end.

    Known limitation: a code point that is fused with other cell content on
    one line (e.g. "140 12363") is not recognized as the start of a row.

    Returns:
        dict: upper-cased hexadecimal code point string ->
        ``(grapheme, reading)``.  After an error it holds only the rows that
        were stored before the error was detected.
    """
    mapping = {}

    # State of the row that is being collected.  ``finishUni`` reads these
    # names (and the line counter ``i``) from this enclosing scope.
    curUni = None
    curGrapheme = None
    prevLines = []
    i = 0  # 1-based number of the line read last; 0 while nothing is read
    good = True

    def finishUni():
        """Validate the collected row and store it in ``mapping``.

        Returns:
            bool: True if the row was stored, False if a problem was printed.
        """

        def fail(message):
            # Report the problem plus the collected cells, last cell first.
            print(f"ERROR at line {i}: {message}")
            print(list(reversed(prevLines)))
            return False

        if curGrapheme is None:
            return fail(f'missing grapheme for uni "{curUni}"')

        # The last collected cell must be the Borger number.  The range is
        # checked on the cell itself (the original tested the loop index and
        # negated the range, so the range never took effect).
        borger = prevLines[-1]
        if not (borger.isdigit() and 0 < int(borger) < 1000):
            return fail(f'missing Borger number "{borger}"')

        # NOTE(review): the reading is taken from the cell right before the
        # Borger number, the only cell the original loop ever looked at.
        # Confirm that this is the Autotext column when cells are missing.
        if len(prevLines) < 2:
            return fail(f'missing reading for uni "{curUni}"')
        curReading = prevLines[-2]

        # NOTE(review): uniCandRe also accepts "+" between code points, but
        # only whitespace separates them here, so a "+" is reported as a
        # malformed number.
        uniStrs = curUni.split()
        try:
            # Code points are hexadecimal; parsing them as decimal raised an
            # uncaught ValueError on every code point containing A-F.
            unis = {int(uniStr, 16) for uniStr in uniStrs}
        except ValueError:
            return fail(f'malformed unicode number "{curUni}"')
        if len(unis) != len(uniStrs):
            return fail(f'identical unis in "{curUni}"')

        for uniStr in uniStrs:
            uniStr = uniStr.upper()
            if uniStr in mapping:
                return fail(f'duplicate uni {uniStr} in "{curUni}"')
            mapping[uniStr] = (curGrapheme, curReading)
        return True

    # The sign list contains non-ASCII characters; do not depend on the
    # locale default (assumes the file is stored as UTF-8).
    with open(SRC, encoding="utf-8") as fh:
        for i, line in enumerate(fh, start=1):
            line = line.strip()
            if uniCandRe.match(line):
                # A new row starts: first store the row collected so far.
                if curUni and not finishUni():
                    good = False
                    break
                curUni = line
                curGrapheme = None
                prevLines = []
                continue
            if not prevLines:
                # The first cell after the code point is the grapheme.
                curGrapheme = line
            prevLines.append(line)
            if len(prevLines) > 6:
                # A row has seven cells including the code point, so more
                # than six followers means a row start was missed.
                print(f'ERROR at line {i}: out of sync "{line}"')
                print(list(reversed(prevLines)))
                good = False
                break

    # Store the last row, but only if nothing went wrong before: a second
    # attempt on a failed row would repeat the error or mask it with True.
    if good:
        if curUni is None:
            print("ERROR: no unicode code point line found")
            good = False
        else:
            good = finishUni()

    print(f"Seen {i} lines")
    if good:
        print(f"{len(mapping)} unicode characters mapped")
    else:
        print("ERROR detected")
    return mapping
# Build the mapping; makeMapping() prints its own diagnostics and summary.
mapping = makeMapping()
ERROR at line 366: out of sync "141 12100 GI"
['141 12100 GI', '140 12363', 'ZI', 'mud', 'JM', 'UY', 'MUD (ḪU-ḪI)']
ERROR at line 367: missing Borger number "141 12100 GI"
['141 12100 GI', '140 12363', 'ZI', 'mud', 'JM', 'UY', 'MUD (ḪU-ḪI)']
Seen 365 lines
ERROR detected
# Show every mapped sign: the character itself, its code point, grapheme and
# reading.  The keys of ``mapping`` are hexadecimal strings, so they must be
# converted to an integer before chr() can turn them into a character.
for (uni, (grapheme, reading)) in sorted(mapping.items()):
    print(f'"{chr(int(uni, 16))}" = {uni} = "{grapheme}" = "{reading}"')