gukhanmun_stdict/lib.rs
1// Gukhanmun: Bundled Standard Korean Language Dictionary for Gukhanmun.
2// Copyright (C) 2026 Hong Minhee
3//
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, either version 3 of the License, or
7// (at your option) any later version.
8//
9// This program is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12// GNU General Public License for more details.
13//
14// You should have received a copy of the GNU General Public License
15// along with this program. If not, see <https://www.gnu.org/licenses/>.
16
17//! Bundled Standard Korean Language Dictionary for Gukhanmun.
18
19#![forbid(unsafe_code)]
20#![deny(missing_docs)]
21
22use std::sync::OnceLock;
23
24use gukhanmun_core::{DictionaryRecord, HanjaDictionary, Match};
25use gukhanmun_fst::FstDictionary;
26
27/// Extracts canonical TSV rows from Standard Korean Language Dictionary dumps.
28pub mod extract;
29
30static KO_KR_BYTES: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/stdict.gukfst"));
31static KO_KR_FST: OnceLock<FstDictionary> = OnceLock::new();
32
33/// Multi-syllable suffix overrides generated by `gukhanmun-stdict-extract`.
34///
35/// Each row is `hanja\tinitial\tsuffix`: the word-initial reading and the
36/// reading used outside word-initial position. See [`ko_kr`] for how it is
37/// applied. Single-hanja initial sound law is handled by the engine from the
38/// bundled unihan readings and is intentionally absent here.
39static KO_KR_SUFFIX_TSV: &str = include_str!("../data/suffix.tsv");
40static KO_KR_SUFFIXES: OnceLock<Vec<(String, (String, String))>> = OnceLock::new();
41
42fn ko_kr_fst() -> &'static FstDictionary {
43 KO_KR_FST.get_or_init(|| {
44 FstDictionary::from_static_bytes(KO_KR_BYTES)
45 .expect("embedded Standard Korean Language Dictionary FST is valid")
46 })
47}
48
49fn ko_kr_suffixes() -> &'static [(String, (String, String))] {
50 KO_KR_SUFFIXES
51 .get_or_init(|| {
52 let mut rows = KO_KR_SUFFIX_TSV
53 .lines()
54 .skip(1)
55 .filter(|line| !line.is_empty())
56 .map(|line| {
57 let mut fields = line.split('\t');
58 let hanja = fields.next().expect("suffix.tsv row has a hanja key");
59 let initial = fields
60 .next()
61 .expect("suffix.tsv row has an initial reading");
62 let suffix = fields.next().expect("suffix.tsv row has a suffix reading");
63 (hanja.to_owned(), (initial.to_owned(), suffix.to_owned()))
64 })
65 .collect::<Vec<_>>();
66 rows.sort_by(|(left, _), (right, _)| left.cmp(right));
67 rows
68 })
69 .as_slice()
70}
71
72/// The bundled South Korean Standard Korean Language Dictionary.
73///
74/// Wraps the embedded FST and layers on the multi-syllable suffix overrides
75/// (see [`ko_kr`]). Obtain the shared instance through [`ko_kr`].
76pub struct KoKrDictionary {
77 fst: &'static FstDictionary,
78 suffixes: &'static [(String, (String, String))],
79}
80
81impl KoKrDictionary {
82 /// Returns the number of entries recorded in the embedded FST.
83 pub fn entry_count(&self) -> u64 {
84 self.fst.entry_count()
85 }
86
87 /// Returns the exact embedded entry for `hanja`, if present.
88 ///
89 /// This reflects the raw FST reading and does not apply the multi-syllable
90 /// suffix overrides; use [`HanjaDictionary::matches_at`] for the readings the
91 /// engine consumes.
92 pub fn lookup(
93 &self,
94 hanja: &str,
95 ) -> Result<Option<gukhanmun_fst::LookupEntry>, gukhanmun_fst::Error> {
96 self.fst.lookup(hanja)
97 }
98}
99
100impl HanjaDictionary for KoKrDictionary {
101 fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
102 let suffixes = self.suffixes;
103 Box::new(self.fst.matches_at(s).map(move |mut matched| {
104 let key = &s[..matched.byte_len];
105 if let Ok(index) = suffixes.binary_search_by(|(hanja, _)| hanja.as_str().cmp(key)) {
106 let (initial, suffix) = &suffixes[index].1;
107 matched.reading = initial.clone();
108 matched.suffix_reading = Some(suffix.clone());
109 }
110 matched
111 }))
112 }
113
114 fn max_word_chars(&self) -> Option<usize> {
115 self.fst.max_word_chars()
116 }
117
118 fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
119 self.fst.entries()
120 }
121
122 fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
123 self.fst.has_homophone(hanja, reading)
124 }
125}
126
127static KO_KR: OnceLock<KoKrDictionary> = OnceLock::new();
128
129/// Returns the bundled South Korean Standard Korean Language Dictionary.
130///
131/// The dictionary is embedded as FST bytes generated from the canonical TSV
132/// snapshot in this crate's `data` directory and is decoded lazily on first
133/// use. Multi-syllable entries that the source records with a distinct suffix
134/// or bound-noun form (such as `年代`, read `연대` word-initially but `년대`
135/// after a number) carry that form in [`Match::suffix_reading`] so the engine
136/// can pick the position-correct reading. Single-hanja initial sound law (for
137/// example `年` → `년` after a number) is applied by the engine from the bundled
138/// unihan readings and needs no per-entry data here.
139pub fn ko_kr() -> &'static KoKrDictionary {
140 KO_KR.get_or_init(|| KoKrDictionary {
141 fst: ko_kr_fst(),
142 suffixes: ko_kr_suffixes(),
143 })
144}