from pathlib import Path
import re, sqlite3
from collections import defaultdict
# Roughly 4800 commonly used characters, taken from the "Standard Form of
# Common National Characters" table (常用國字標準字體表).
common = Path('common.txt').read_text('utf-8').strip()
# Roughly 6300 less common characters, taken from the "Standard Form of
# Less Common National Characters" table (次常用國字標準字體表).
lessCommon = Path('lessCommon.txt').read_text('utf-8').strip()
# Custom add-on entries, one per line (e.g. Cantonese-specific characters).
addon = Path('add_on_char.txt').read_text('utf-8').splitlines()
# Ordered master list: common characters, then add-ons, then less common ones.
chars = list(common) + addon + list(lessCommon)
# Rare characters that share an input code with other characters; they are
# removed from the final character list.
duplicated_code = Path('duplicated_code.txt').read_text('utf-8').splitlines()
# Characters in the order given by the Ministry of Education frequency
# ranking (教育部 - 字頻表排序), one per line.
char_rank = Path('char_rank.txt').read_text('utf-8').splitlines()
# Mapping of character -> input code, built from "char,code" CSV rows.
# The encoding is given explicitly (like every other file read above) so the
# CJK content is not decoded with the platform's locale default, and blank
# lines are skipped so a trailing empty line does not break dict().
with open('char_db.csv', encoding='utf-8') as f:
    input_dict = dict(line.strip().split(',') for line in f if line.strip())
# Prioritize characters according to the Ministry of Education frequency
# ranking (教育部 - 字頻表排序): the ranked characters come first, followed by
# the remaining characters of `chars` in their existing order.
known_chars = set(chars)
for char in char_rank:
    # Only report a ranked character that is absent from `chars`; calling
    # chars.remove() on it would raise ValueError and abort the script.
    if char not in known_chars:
        print(char, 'not exsit')
# Drop every ranked character from `chars` in a single pass (set lookup
# instead of one list scan per ranked character). `chars` is updated in
# place, and all occurrences of a ranked character are removed so it cannot
# appear twice in `ranked_chars`.
ranked_lookup = set(char_rank)
chars[:] = [char for char in chars if char not in ranked_lookup]
ranked_chars = char_rank + chars
# Drop the rare characters whose input code collides with another character.
# list.remove() raises ValueError when the entry is absent, which is reported
# instead of being treated as an error.
for rare in duplicated_code:
    try:
        ranked_chars.remove(rare)
    except ValueError:
        print(rare, ' not in ranked_chars')
# Make sure every character that will be written out has a non-empty input
# code in `input_dict`; abort with the offending characters otherwise.
# A real exception type is raised: `raise "<str>"` is itself a TypeError in
# Python 3 and would hide the intended message.
if chars_not_found := [char for char in ranked_chars if not input_dict.get(char)]:
    raise ValueError(f"chars not having input code: {chars_not_found}")
# Header of the final character list: the leading directives, the
# %keyname section (key -> displayed label) and the start of the %chardef
# section with the punctuation entries.
# The literal is a raw string on purpose: it contains lines ending in a
# backslash ("\ \"), and in a normal string a backslash right before the
# newline is a line continuation that would glue that line to the next one.
header = r'''%gen_inp
%ename cangjie
%cname 倉頡輸入法
%selkey 1234567890
%keyname begin
a 日
b 月
c 金
d 木
e 水
f 火
g 土
h 竹
i 戈
j 十
k 大
l 中
m 一
n 弓
o 人
p 心
q 手
r 口
s 尸
t 廿
u 山
v 女
w 田
x 難
y 卜
z 重
[ 「
] 」
; ;
' 、
\ \
, ,
. 。
! !
: :
/ ?
%keyname end
%chardef begin
, ,
! !
: :
. 。
/ ?
// /
.. .
... …
' 、
; ;
\ \
[ 「
[ 『
[ (
[ 〈
[ 《
] 」
] 』
] )
] 〉
] 》
'''
# Closing directive appended after the generated character lines.
footer = '\n%chardef end'
# Build the main body of the final character list from `ranked_chars`:
# one "<lower-cased input code> <char>" line per character, in ranked order.
body = '\n'.join(input_dict[char].lower() + ' ' + char for char in ranked_chars)
# Write the final character list (header, body, footer) as UTF-8.
# write() is used because the payload is a single string; writelines() would
# iterate over it one character at a time.
with open('dime_cangjie.txt', mode='w', encoding='utf-8') as f:
    f.write(header + body + footer)
# Total number of chars in the final character list. Printed explicitly,
# because a bare expression shows nothing when the file runs as a script.
print(len(ranked_chars))
# 11047  (recorded value; presumably the output of a previous run - it was a
# stray literal in the code, kept here as a comment)
# Based on the results printed by the loop below, you may:
#   - list characters in duplicated_code.txt to remove unwanted chars
#   - list characters in char_rank.txt to re-order the output when more than
#     one candidate shares the same input code
#
# Group the final characters by input code and report every code that is
# shared by more than one character.
tmp = defaultdict(list)
for char in ranked_chars:
    code = input_dict[char]
    arr = tmp[code]
    arr.append(char)
    # The group is printed each time it grows beyond one entry, so a code
    # shared by three characters is reported twice (with two, then three).
    if len(arr) > 1:
        print(f'{code} {" ".join(arr)}')
SU 己 已 YOLN 刻 劇 DHE 皮 板 OJ 什 午 OGE 雙 隻 MNP 死 恐 JD 未 宋 WD 果 困 EHSK 激 淚 YRBU 亮 毫 YHMBC 顏 頻 TW 曲 苗 DTMC 橫 棋 OPBUC 貨 貸 GRTR 喜 嘉 EMHF 源 鴻 RRIK 哭 獸 QOMR 拾 捨 SIP 忍 慰 GIKS 勢 劫 MGOK 致 玫 NO 久 欠 A 日 曰 MBUC 貢 頁 KN 九 夷 HND 朵 梨 QHLO 抓 掀 TCNO 歉 欺 EJMC 演 濱 RC 只 叭 PA 旨 旬 HPA 昏 筍 ANAU 晚 冕 HS 戶 乍 MRNO 歌 砍 TWK 奠 茵 NBG 角 墮 OKR 知 佑 DBDB 棘 棗 IPP 態 庇 FBOK 敞 敝 EOMN 汽 渝 NL 引 弔 ABJJ 暈 暉 HOUFK 徽 黴 BT 冊 皿 AYRF 景 晾 YCK 交 奕 HUP 息 憩 HMNL 郵 邸 TMD 某 芋 SRNL 郡 邵 HTMC 簧 箕 SEB 腎 臀 NI 夕 弘 ARF 照 煦 BM 且 肛 HDLN 利 剁 IRP 感 怠 SHOE 履 屐 HI 鬼 么 SHI 刃 戮 THJD 茱 孽 RAU 吧 邑 ENI 汐 泓 VID 樂 槳 ETMC 淇 潢 BHN 肌 冗 FBR 尚 炯 QSMG 握 擢 PI 勾 勺 OLOK 攸 倏 DYTJ 樟 梓 YVVV 巡 邋 IT 戒 弁 YRU 訕 乩 TVID 藥 孳 ESMG 濯 渥 NBKS 勇 觔 GKLMI 螯 螫 VFHAF 鷥 鸞 YMP 此 忐 HKP 懲 忝 YRPA 詢 詣 KJCC 痲 癲 IPP 態 庇 忒 ROMR 哈 啥 RKS 另 叻 EPD 池 柒 LWB 冑 胄 BUOG 瞿 睢 MGTMC 琪 璜 AFMBC 顯 顥 OM 丘 仝 EMCW 酒 洒 TYTR 菩 蒟 RJBF 嗦 嚓 THON 荇 蘅 TT 井 并 IFP 憑 慼 RJI 戰 戢 DWD 棵 梱 KB 有 冇 K 大 乂 RKI 吰 呔 RSJ 咡 咠 RMCW 哂 唒 RBUC 員 唄 ROIR 嗆 唅 RRRD 噪 喿 RHSK 唳 噭 RTWI 噂 囆 GTWI 墫 壿 SHOD 髹 屧 UOIN 岑 岒 UNMU 峗 峞 IP 弋 庀 PWD 悃 惈 PMRW 匐 愊 WOP 囮 慁 YKP 忞 憝 PHSK 悷 憿 EBP 懣 懘 QIHF 搣 摵 QHSK 捩 撽 QYRN 揨 攍 AYK 旻 旼 AUU 岊 昢 DPI 杓 构 DKN 朹 桋 DNIN 杼 栘 DNHD 楙 楺 DYWV 榱 櫰 YBNO 欳 歊 WPP 毗 毘 OMN 仃 气 EMVM 涇 沍 EKI 汰 汯 ETT 汫 洴 ESMG 濯 渥 洭 EKN 氿 洟 EC 汃 淦 EYRN 瀛 渟 EHNI 汎 渢 JCEGG 窪 漥 EHOO 漇 漎 EOLB 滫 潃 EJJJ 澣 濣 EITC 濂 瀇 EOMB 淪 瀹 ELIM 渱 灗 EMBB 濡 灞 KHHSB 猏 猵 MGOIR 琀 瑲 HOPI 彴 瓝 YBMVN 甋 甗 OMRW 偪 畣 KKB 肴 痏 KMSO 瓿 瘃 KKLU 瓾 痷 KRYE 敧 瘕 KHOK 癥 癓 YMDHE 皻 皽 BUJMC 瞚 矉 HBMR 筒 礐 YPMR 砦 礱 HLLN 劓 笰 HBUU 筧 篹 HJII 篿 簙 HMGN 箌 籈 HSMG 筐 籊 HYHS 舴 籩 PFMBC 熲 顈 BUU 朏 胐 BPR 朐 胊 BLMO 朓 脁 BICE 朘 脧 YVB 肓 膂 TIP 懟 芅 TSP 懃 苨 TKN 艽 荑 TNIN 芧 茤 TOG 茌 萑 THSB 菺 萹 TIHR 葴 蒧 TGGI 葑 蓺 THOO 蓏 蓰 THOO 蓏 蓰 蓯 THDH 菞 蕛 TMNV 蒆 薞 TYRE 蔎 蕸 TMGN 菿 薽 THDV 萎 藒 THAF 蔦 蘤 TWLP 薨 藣 TYRV 藹 蘘 TOMB 菕 蘥 TTCG 蘣 蘳 HYPU 箎 虒 IHHI 蠯 螷 LITMC 蜞 蟥 YIHXO 斔 螤 LIJMC 螾 蠙 YHDV 逶 褎 YRYCB 謫 謪 YRYPM 謔 謯 YPYMR 訾 讋 BHUU 膬 貀 RMHOO 蹤 蹝 JJTGI 轙 轛 HRHPM 錕 迣 HRNL 郈 郜 HUNL 郋 郳 MGNL 郅 鄄 TRNL 鄯 鄀 BCNL 郥 鄍 IBNL 郙 鄘 TMNL 邯 鄞 HDNL 邾 鄡 MRNL 郚 酃 HCNL 鄮 酇 HFD 乎 釆 CNL 弚 鈏 EUC 鎏 鍌 CJMO 錠 鎵 TOG 茌 萑 雈 UOGB 巂 雟 TJMN 苧 靪 TJMU 莞 靰 ORMBC 頷 頜 NUMBC 
頠 顄 TBMBC 顢 顜 HRHPM 錕 迣 鬳 OLOF 絛 鯈 NFTWI 鱒 鱴 CHHAF 鵜 鳻 HOHAF 鴔 鴩 YKHAF 鳼 鵁 HRHAF 鵠 鴰 HRHAF 鵠 鴰 鵅 MNHAF 殦 鴷 JVHAF 鴳 鶈 KRHAF 鴐 鵸 RSHAF 鴞 鶚 HUHAF 鶂 鶞 HBHAF 鵳 鶣 JRHAF 鴣 鶷 TTHAF 鵧 鷁 GFHNE 縠 鷇 MBHAF 鴯 鷊 JCHAF 窵 鷏 AVHAF 鶡 鷃 JTCF 騫 鶱 IBHAF 鵏 鷛 SHHAF 鳭 鷚 AVHAF 鶡 鷃 鷐 YKHAF 鳼 鵁 鷟 MJHAF 鳱 鷣 OFHAF 鷦 鷡 HBHAF 鵳 鶣 鷮 YKHAF 鳼 鵁 鷟 鸆 HBHAF 鵳 鶣 鷮 鷽 MBHAF 鴯 鷊 鸍 KKHAF 鷞 鸑 YTHAF 鴗 鸕 YPHAF 鷾 鸗 YPMMF 祡 龒