import re
# Characters that may stand in for a digit inside a number, mapped to the
# digit they represent (presumably transcription/OCR misreadings of "1").
EXTRA_DIGITS = {
    "i": 1,
    "l": 1,
}
EXTRA_DIGIT_STR = "".join(EXTRA_DIGITS)

# Escaped copy for use inside a regex character class, so that a look-alike
# such as "]", "^" or "-" added later cannot corrupt the pattern.
_EXTRA_DIGIT_CLASS = re.escape(EXTRA_DIGIT_STR)

# Single-pass translation table: look-alike character -> digit character.
# Every key is one character, as required by the character classes below.
_EXTRA_DIGIT_TABLE = str.maketrans(
    {extraDigit: str(value) for (extraDigit, value) in EXTRA_DIGITS.items()}
)

# A number-like token: starts and ends with a digit or look-alike, with
# digits, look-alikes and the separators . , / - allowed in between.
# Note that the token is at least two characters long.
NUMBER_SANITY_RE = re.compile(
    fr"""
    \b
    (?:
        [0-9{_EXTRA_DIGIT_CLASS}]          # first character
        [0-9{_EXTRA_DIGIT_CLASS}.,/-]*     # interior, separators allowed
        [0-9{_EXTRA_DIGIT_CLASS}]          # last character
    )
    \b
    """,
    re.S | re.X,
)


def numberRepl(match):
    """Return the matched token with look-alike letters turned into digits.

    Intended as the replacement callback for ``NUMBER_SANITY_RE.sub``.

    Args:
        match: A match object whose group 0 is the number-like token.

    Returns:
        The token with every key of ``EXTRA_DIGITS`` replaced by its digit,
        or the token unchanged when it contains no genuine 0-9 digit.
    """
    number = match.group(0)
    # NUMBER_SANITY_RE also matches ordinary words spelled only with
    # look-alike letters (e.g. "il", "ill"); without at least one real digit
    # there is no evidence the token is a number, so leave it alone.
    if not any("0" <= char <= "9" for char in number):
        return number
    return number.translate(_EXTRA_DIGIT_TABLE)
# Demo: run the number sanitiser over a sample line. The token "l32/243"
# comes out as "132/243"; "27", "lb", "35" and "V2" are left as they are.
text = """
de poederzuijker voor l32/243 Spaans de frazel van 27 lb, reekent 35 V2 percento<lb/>
"""
text = NUMBER_SANITY_RE.sub(numberRepl, text)
print(text)
# Output:
# de poederzuijker voor 132/243 Spaans de frazel van 27 lb, reekent 35 V2 percento<lb/>
# Regex fragment (not compiled on its own): a group of one or two digits,
# optionally followed by further groups of one or two digits, each separated
# from the previous one by whitespace. E.g. "7", "28" or "3 14".
# The line breaks inside the fragment are only layout: it is interpolated
# into MARK_PLAIN_RE, which is compiled with re.X (verbose mode).
MARK_NUM = r"""
(?:
[0-9]{1,2}
(?:
\s+
[0-9]{1,2}
)*
)
"""
# Matches a short numeric "mark" (presumably a footnote/reference number --
# confirm against the callers) in one of four shapes; the whole match is
# also available as group 1:
#   1. <super>N</super>, where N is MARK_NUM (one or more 1-2 digit groups
#      separated by whitespace);
#   2. one or two digits enclosed in the brackets ⌊ and ⌋;
#   3. one or two digits directly preceded by a lowercase a-z or é and
#      followed by a word boundary, e.g. the "28" in "Zakynthos28,";
#   4. as 3, but with a ";" or "." between the letter and the digits.
# In 3 and 4 the preceding characters are checked with a lookbehind, so they
# are not part of the match.
# The pattern is an f-string, hence the doubled braces in {{1,2}}; MARK_NUM
# is inserted as-is. re.X makes the layout whitespace insignificant.
MARK_PLAIN_RE = re.compile(
fr"""
(
(?:
<super>
{MARK_NUM}
</super>
)
|
(?:
⌊
[0-9]{{1,2}}
⌋
)
|
(?:
(?<=[a-zé])
[0-9]{{1,2}}
\b
)
|
(?:
(?<=[a-zé][;.])
[0-9]{{1,2}}
\b
)
)
""",
re.S | re.X,
)
# Demo: a number glued to the end of a lowercase word is found as a mark.
text = """van het eyland Zakynthos28,<lb/>"""
match = MARK_PLAIN_RE.search(text)
match
# Out: <re.Match object; span=(24, 26), match='28'>
match.group(0)
# Out: '28'