Skip to main content

gukhanmun_stdict/
lib.rs

// Gukhanmun: Bundled Standard Korean Language Dictionary for Gukhanmun.
// Copyright (C) 2026  Hong Minhee
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program.  If not, see <https://www.gnu.org/licenses/>.
16
17//! Bundled Standard Korean Language Dictionary for Gukhanmun.
18
19#![forbid(unsafe_code)]
20#![deny(missing_docs)]
21
22use std::sync::OnceLock;
23
24use gukhanmun_core::{DictionaryRecord, HanjaDictionary, Match};
25use gukhanmun_fst::FstDictionary;
26
27/// Extracts canonical TSV rows from Standard Korean Language Dictionary dumps.
28pub mod extract;
29
// Raw bytes of `stdict.gukfst`, read from the build's `OUT_DIR` and embedded
// into the binary at compile time.
static KO_KR_BYTES: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/stdict.gukfst"));
// Decoded form of `KO_KR_BYTES`; starts empty and is filled by `ko_kr_fst`
// the first time it is called.
static KO_KR_FST: OnceLock<FstDictionary> = OnceLock::new();
32
/// Multi-syllable suffix overrides generated by `gukhanmun-stdict-extract`.
///
/// Each row is `hanja\tinitial\tsuffix`: the word-initial reading and the
/// reading used outside word-initial position. The first line of the file is
/// skipped when the table is parsed. See [`ko_kr`] for how the table is
/// applied. Single-hanja initial sound law is handled by the engine from the
/// bundled unihan readings and is intentionally absent here.
static KO_KR_SUFFIX_TSV: &str = include_str!("../data/suffix.tsv");
// Parsed rows of `KO_KR_SUFFIX_TSV` as `(hanja, (initial, suffix))`, sorted by
// `hanja` so they can be binary-searched; filled lazily by `ko_kr_suffixes`.
static KO_KR_SUFFIXES: OnceLock<Vec<(String, (String, String))>> = OnceLock::new();
41
42fn ko_kr_fst() -> &'static FstDictionary {
43    KO_KR_FST.get_or_init(|| {
44        FstDictionary::from_static_bytes(KO_KR_BYTES)
45            .expect("embedded Standard Korean Language Dictionary FST is valid")
46    })
47}
48
49fn ko_kr_suffixes() -> &'static [(String, (String, String))] {
50    KO_KR_SUFFIXES
51        .get_or_init(|| {
52            let mut rows = KO_KR_SUFFIX_TSV
53                .lines()
54                .skip(1)
55                .filter(|line| !line.is_empty())
56                .map(|line| {
57                    let mut fields = line.split('\t');
58                    let hanja = fields.next().expect("suffix.tsv row has a hanja key");
59                    let initial = fields
60                        .next()
61                        .expect("suffix.tsv row has an initial reading");
62                    let suffix = fields.next().expect("suffix.tsv row has a suffix reading");
63                    (hanja.to_owned(), (initial.to_owned(), suffix.to_owned()))
64                })
65                .collect::<Vec<_>>();
66            rows.sort_by(|(left, _), (right, _)| left.cmp(right));
67            rows
68        })
69        .as_slice()
70}
71
/// The bundled South Korean Standard Korean Language Dictionary.
///
/// Wraps the embedded FST and layers on the multi-syllable suffix overrides
/// (see [`ko_kr`]). Obtain the shared instance through [`ko_kr`].
pub struct KoKrDictionary {
    // Decoded embedded FST that answers the underlying dictionary queries.
    fst: &'static FstDictionary,
    // `(hanja, (initial, suffix))` override rows, searched by `hanja` with a
    // binary search and therefore expected to be sorted by that key.
    suffixes: &'static [(String, (String, String))],
}
80
81impl KoKrDictionary {
82    /// Returns the number of entries recorded in the embedded FST.
83    pub fn entry_count(&self) -> u64 {
84        self.fst.entry_count()
85    }
86
87    /// Returns the exact embedded entry for `hanja`, if present.
88    ///
89    /// This reflects the raw FST reading and does not apply the multi-syllable
90    /// suffix overrides; use [`HanjaDictionary::matches_at`] for the readings the
91    /// engine consumes.
92    pub fn lookup(
93        &self,
94        hanja: &str,
95    ) -> Result<Option<gukhanmun_fst::LookupEntry>, gukhanmun_fst::Error> {
96        self.fst.lookup(hanja)
97    }
98}
99
100impl HanjaDictionary for KoKrDictionary {
101    fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
102        let suffixes = self.suffixes;
103        Box::new(self.fst.matches_at(s).map(move |mut matched| {
104            let key = &s[..matched.byte_len];
105            if let Ok(index) = suffixes.binary_search_by(|(hanja, _)| hanja.as_str().cmp(key)) {
106                let (initial, suffix) = &suffixes[index].1;
107                matched.reading = initial.clone();
108                matched.suffix_reading = Some(suffix.clone());
109            }
110            matched
111        }))
112    }
113
114    fn max_word_chars(&self) -> Option<usize> {
115        self.fst.max_word_chars()
116    }
117
118    fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
119        self.fst.entries()
120    }
121
122    fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
123        self.fst.has_homophone(hanja, reading)
124    }
125}
126
127static KO_KR: OnceLock<KoKrDictionary> = OnceLock::new();
128
129/// Returns the bundled South Korean Standard Korean Language Dictionary.
130///
131/// The dictionary is embedded as FST bytes generated from the canonical TSV
132/// snapshot in this crate's `data` directory and is decoded lazily on first
133/// use. Multi-syllable entries that the source records with a distinct suffix
134/// or bound-noun form (such as `年代`, read `연대` word-initially but `년대`
135/// after a number) carry that form in [`Match::suffix_reading`] so the engine
136/// can pick the position-correct reading. Single-hanja initial sound law (for
137/// example `年` → `년` after a number) is applied by the engine from the bundled
138/// unihan readings and needs no per-entry data here.
139pub fn ko_kr() -> &'static KoKrDictionary {
140    KO_KR.get_or_init(|| KoKrDictionary {
141        fst: ko_kr_fst(),
142        suffixes: ko_kr_suffixes(),
143    })
144}