Skip to main content

gukhanmun_mkdict/
lib.rs

1// Gukhanmun: Builds Gukhanmun dictionary backend files from canonical TSV input.
2// Copyright (C) 2026  Hong Minhee
3//
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, either version 3 of the License, or
7// (at your option) any later version.
8//
9// This program is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12// GNU General Public License for more details.
13//
14// You should have received a copy of the GNU General Public License
15// along with this program.  If not, see <https://www.gnu.org/licenses/>.
16
17//! Dictionary builder support for `gukhanmun-mkdict`.
18//!
19//! The crate owns parsers for normalized dictionary inputs and writers for the
20//! first on-disk FST and CDB dictionary formats. Runtime lookup is handled by
21//! backend crates.
22
23#![forbid(unsafe_code)]
24#![deny(missing_docs)]
25
26use std::collections::{BTreeMap, BTreeSet};
27use std::env;
28use std::error::Error as StdError;
29use std::fs;
30use std::io::{BufRead, BufReader};
31use std::path::{Path, PathBuf};
32
33use ciborium::ser::into_writer;
34use fst::MapBuilder;
35use gukhanmun_cdb::CdbDictionary;
36use gukhanmun_fst::FstDictionary;
37use serde::{Deserialize, Serialize};
38use time::OffsetDateTime;
39use time::format_description::well_known::Rfc3339;
40
// NOTE(review): presumably the 8-byte magic tag of the FST file format; the
// code that writes it is outside this view — confirm.
const MAGIC: &[u8; 8] = b"GUKHMFST";
// Stored as the `version` metadata value by `build_metadata`.
const FORMAT_VERSION: u32 = 1;
// NOTE(review): presumably the byte length of the fixed FST file header — confirm.
const FIXED_HEADER_LEN: usize = 64;
// Bit flags named after the two `EntryMark` constraints.
// NOTE(review): presumably the encoding used in FST values — confirm.
const MARK_REQUIRE_HANJA: u8 = 0b0000_0001;
const MARK_REQUIRE_HANGUL: u8 = 0b0000_0010;
// NOTE(review): these look like bit positions used to pack a mark and an
// offset into one `u64` value — confirm against the FST writer.
const VALUE_MARK_SHIFT: u64 = 16;
const VALUE_OFFSET_SHIFT: u64 = 24;
// Largest value that fits in 40 bits.
const VALUE_MAX_OFFSET: u64 = (1u64 << 40) - 1;
// Metadata keys that `build_metadata` refuses to accept from user-supplied
// metadata.
const RESERVED_METADATA_KEYS: &[&str] = &[
    "entry_count",
    "version",
    "max_word_chars",
    "max_key_bytes",
    "prefix_count",
];
// Key that `reject_reserved_cdb_keys` forbids as a dictionary entry when the
// output format is CDB.
const CDB_META_KEY: &[u8] = b"__gukhanmun_meta__";
// Bit flags named after the two `EntryMark` constraints.
// NOTE(review): presumably the encoding used in CDB records — confirm.
const CDB_MARK_REQUIRE_HANJA: u8 = 0b0000_0001;
const CDB_MARK_REQUIRE_HANGUL: u8 = 0b0000_0010;
59
/// Error returned while parsing inputs or building dictionary files.
///
/// The enum is `#[non_exhaustive]`, so `match` expressions in other crates
/// need a wildcard arm.
#[derive(Debug, thiserror::Error)]
#[non_exhaustive]
pub enum Error {
    /// The input violated the builder contract.
    ///
    /// Displays as the contained message verbatim.
    #[error("{0}")]
    Message(String),

    /// An underlying operation failed with extra builder context.
    ///
    /// Displays as `{context}: {source}`.
    #[error("{context}: {source}")]
    Source {
        /// Builder context for the failing operation.
        context: String,
        /// Underlying source error.
        #[source]
        source: Box<dyn StdError + Send + Sync + 'static>,
    },

    /// FST backend validation or decoding failed.
    ///
    /// Displays exactly as the wrapped backend error does.
    #[error(transparent)]
    Fst(#[from] gukhanmun_fst::Error),

    /// CDB backend validation or decoding failed.
    ///
    /// Displays exactly as the wrapped backend error does.
    #[error(transparent)]
    Cdb(#[from] gukhanmun_cdb::Error),
}
86
87impl Error {
88    fn message(message: impl Into<String>) -> Self {
89        Self::Message(message.into())
90    }
91
92    fn source(context: impl Into<String>, source: impl StdError + Send + Sync + 'static) -> Self {
93        Self::Source {
94            context: context.into(),
95            source: Box::new(source),
96        }
97    }
98}
99
/// Result type returned by dictionary builder APIs.
///
/// The error type is always this crate's [`Error`].
pub type Result<T> = std::result::Result<T, Error>;
102
/// Extension trait that attaches builder context to a failing `Result` and
/// converts it into this crate's [`Result`].
trait ResultContext<T> {
    /// Attaches an eagerly supplied context to the error, if any.
    fn context(self, context: impl Into<String>) -> Result<T>;

    /// Attaches a context produced by the `context` closure to the error,
    /// if any.
    fn with_context(self, context: impl FnOnce() -> String) -> Result<T>;
}
108
109impl<T, E> ResultContext<T> for std::result::Result<T, E>
110where
111    E: StdError + Send + Sync + 'static,
112{
113    fn context(self, context: impl Into<String>) -> Result<T> {
114        self.map_err(|source| Error::source(context, source))
115    }
116
117    fn with_context(self, context: impl FnOnce() -> String) -> Result<T> {
118        self.map_err(|source| Error::source(context(), source))
119    }
120}
121
/// Extension trait that turns an `Option` into this crate's [`Result`].
trait OptionContext<T> {
    /// Returns the contained value, or an error carrying `context` when the
    /// option is `None`.
    fn context(self, context: impl Into<String>) -> Result<T>;
}
125
126impl<T> OptionContext<T> for Option<T> {
127    fn context(self, context: impl Into<String>) -> Result<T> {
128        self.ok_or_else(|| Error::message(context.into()))
129    }
130}
131
// Returns early from the enclosing function with an `Error::Message` built
// from `format!`-style arguments.
macro_rules! bail {
    ($($arg:tt)*) => {
        return Err(Error::message(format!($($arg)*)))
    };
}
137
// Checks `$condition` and, when it is false, bails out of the enclosing
// function via `bail!` with the `format!`-style message that follows.
macro_rules! ensure {
    ($condition:expr, $($arg:tt)*) => {
        if !$condition {
            bail!($($arg)*);
        }
    };
}
145
/// The maximum accepted UTF-8 key length when the CLI option is omitted.
///
/// [`BuildOptions::default`] uses this value for `max_key_bytes`.
pub const DEFAULT_MAX_KEY_BYTES: usize = 1024;
148
/// The supported output backend format for this implementation step.
///
/// [`build_dictionary`] selects the writer and the optional validation pass
/// based on this value.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum DictionaryFormat {
    /// Build the FST dictionary file format.
    Fst,

    /// Build the CDB-trie dictionary file format.
    Cdb,
}
158
/// Conflict policy used when the same hanja key appears more than once.
///
/// Duplicates are detected across all input files of one build, in the order
/// the files and their rows are read.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum MergePolicy {
    /// Treat duplicate keys as an error.
    Error,

    /// Keep the first entry and ignore later duplicates.
    FirstWins,

    /// Replace earlier entries with the last duplicate.
    LastWins,
}
171
/// Dictionary-provided rendering constraints encoded in built files.
///
/// Both flags default to `false`.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Serialize, Deserialize)]
pub struct EntryMark {
    /// Whether output should keep the original hanja visible.
    pub require_hanja: bool,

    /// Whether output should include a hangul gloss when hanja remains primary.
    pub require_hangul: bool,
}
181
/// One normalized dictionary entry after parsing and merge handling.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct DictionaryEntry {
    // Hanja key; entries are merged and looked up by this string.
    hanja: String,
    // Hangul reading associated with the key.
    reading: String,
    // Rendering constraints attached to the entry.
    mark: EntryMark,
}
189
190impl DictionaryEntry {
191    /// Creates a dictionary entry from a hanja key, hangul reading, and mark.
192    pub fn new(hanja: impl Into<String>, reading: impl Into<String>, mark: EntryMark) -> Self {
193        Self {
194            hanja: hanja.into(),
195            reading: reading.into(),
196            mark,
197        }
198    }
199
200    /// Returns the hanja key.
201    pub fn hanja(&self) -> &str {
202        &self.hanja
203    }
204
205    /// Returns the hangul reading.
206    pub fn reading(&self) -> &str {
207        &self.reading
208    }
209
210    /// Returns dictionary-provided rendering constraints.
211    pub fn mark(&self) -> EntryMark {
212        self.mark
213    }
214
215    /// Replaces the dictionary-provided rendering constraints in place.
216    pub fn set_mark(&mut self, mark: EntryMark) {
217        self.mark = mark;
218    }
219}
220
/// Selector kind of a rules-file row; decides which part of a dictionary
/// entry the rule's `pattern` is compared against.
#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub enum RuleKind {
    /// Selects the single entry whose hanja key is exactly `pattern`.
    Entry,

    /// Selects every entry whose hanja key contains `pattern` as a
    /// substring.
    Contains,

    /// Selects every entry whose hangul reading is exactly `pattern`.
    Reading,
}

impl RuleKind {
    /// All selector kinds, in declaration order.
    const ALL: [Self; 3] = [Self::Entry, Self::Contains, Self::Reading];

    /// Looks up the kind whose textual name equals `value`.
    ///
    /// The comparison is exact and case-sensitive; unknown names yield
    /// `None`.
    fn parse(value: &str) -> Option<Self> {
        Self::ALL
            .iter()
            .copied()
            .find(|kind| kind.as_str() == value)
    }

    /// Returns the textual name used for this kind in rules files and error
    /// messages.
    fn as_str(self) -> &'static str {
        match self {
            Self::Reading => "reading",
            Self::Contains => "contains",
            Self::Entry => "entry",
        }
    }
}
253
/// One row from a rules file: a selector that picks dictionary entries and the
/// mark bits to OR into their [`EntryMark`].
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Rule {
    // Which part of an entry `pattern` is compared against.
    kind: RuleKind,
    // Text the selector compares against.
    pattern: String,
    // Mark bits OR-ed into every matching entry.
    mark: EntryMark,
    // Human-readable justification for the rule.
    reason: String,
    // Where the rule came from (`path:line`, or `<programmatic>` for rules
    // built through `Rule::new`); used in error messages.
    location: String,
}
264
265impl Rule {
266    /// Creates a rule for programmatic callers.
267    ///
268    /// This constructor is unchecked: the pattern, mark bits, and reason are
269    /// stored verbatim.  All semantic validation — non-empty pattern, at least
270    /// one mark bit set, and `contains` patterns that are hanja-only — runs in
271    /// [`apply_rules`], so even programmatically constructed rules surface the
272    /// same errors as rules parsed from a TSV file.
273    pub fn new(
274        kind: RuleKind,
275        pattern: impl Into<String>,
276        mark: EntryMark,
277        reason: impl Into<String>,
278    ) -> Self {
279        Self {
280            kind,
281            pattern: pattern.into(),
282            mark,
283            reason: reason.into(),
284            location: "<programmatic>".to_owned(),
285        }
286    }
287
288    /// Returns the selector kind.
289    pub fn kind(&self) -> RuleKind {
290        self.kind
291    }
292
293    /// Returns the selector pattern.
294    pub fn pattern(&self) -> &str {
295        &self.pattern
296    }
297
298    /// Returns the mark bits the rule contributes.
299    pub fn mark(&self) -> EntryMark {
300        self.mark
301    }
302
303    /// Returns the human-readable reason this rule exists.
304    pub fn reason(&self) -> &str {
305        &self.reason
306    }
307
308    /// Returns the source location used for error reporting.
309    pub fn location(&self) -> &str {
310        &self.location
311    }
312}
313
314/// Parses a rules TSV file.
315///
316/// The expected header is `kind`, `pattern`, `require_hanja`,
317/// `require_hangul`, `reason` in any column order.  Unknown columns are
318/// ignored with a warning printed to stderr, mirroring how dictionary inputs
319/// are parsed.  Duplicate `(kind, pattern)` pairs are rejected.
320pub fn parse_rules_file(path: &Path) -> Result<Vec<Rule>> {
321    let file =
322        fs::File::open(path).with_context(|| format!("failed to open {}", path.display()))?;
323    parse_rules_reader(BufReader::new(file), path)
324}
325
326fn parse_rules_reader(reader: impl BufRead, path: &Path) -> Result<Vec<Rule>> {
327    let mut lines = reader.lines();
328    let header = loop {
329        let Some(line) = lines.next() else {
330            bail!("{} is empty", path.display());
331        };
332        let line = line.with_context(|| format!("failed to read {}", path.display()))?;
333        if !line.is_empty() {
334            break line;
335        }
336    };
337    let columns = parse_rules_header(&header)?;
338    let mut rules = Vec::new();
339    let mut seen = BTreeSet::<(RuleKind, String)>::new();
340
341    for (index, line) in lines.enumerate() {
342        let line_number = index + 2;
343        let line = line.with_context(|| format!("failed to read {}", path.display()))?;
344        if line.is_empty() {
345            continue;
346        }
347        let location = format!("{}:{line_number}", path.display());
348        let rule = parse_rule_row(&line, &columns, &location)?;
349        if !seen.insert((rule.kind, rule.pattern.clone())) {
350            bail!(
351                "{}: duplicate rule for kind `{}` and pattern `{}`",
352                location,
353                rule.kind.as_str(),
354                rule.pattern,
355            );
356        }
357        rules.push(rule);
358    }
359
360    Ok(rules)
361}
362
// Zero-based column positions resolved from a rules TSV header.
#[derive(Clone, Debug)]
struct RulesHeaderColumns {
    // Index of the `kind` column.
    kind: usize,
    // Index of the `pattern` column.
    pattern: usize,
    // Index of the `require_hanja` column.
    require_hanja: usize,
    // Index of the `require_hangul` column.
    require_hangul: usize,
    // Index of the `reason` column.
    reason: usize,
    // Total number of header columns, including ignored ones; every row must
    // have at least this many fields.
    column_count: usize,
}
372
373fn parse_rules_header(header: &str) -> Result<RulesHeaderColumns> {
374    let columns = header.split('\t').collect::<Vec<_>>();
375    let mut seen = BTreeSet::new();
376    let mut kind = None;
377    let mut pattern = None;
378    let mut require_hanja = None;
379    let mut require_hangul = None;
380    let mut reason = None;
381
382    for (index, column) in columns.iter().enumerate() {
383        ensure!(
384            !column.is_empty(),
385            "rules TSV header contains an empty column name"
386        );
387        ensure!(
388            seen.insert(*column),
389            "rules TSV header contains duplicate `{column}` column"
390        );
391        match *column {
392            "kind" => kind = Some(index),
393            "pattern" => pattern = Some(index),
394            "require_hanja" => require_hanja = Some(index),
395            "require_hangul" => require_hangul = Some(index),
396            "reason" => reason = Some(index),
397            extra => tracing::warn!(column = extra, "ignoring unsupported rules TSV column"),
398        }
399    }
400
401    Ok(RulesHeaderColumns {
402        kind: kind.ok_or_else(|| Error::message("rules TSV missing required `kind` column"))?,
403        pattern: pattern
404            .ok_or_else(|| Error::message("rules TSV missing required `pattern` column"))?,
405        require_hanja: require_hanja
406            .ok_or_else(|| Error::message("rules TSV missing required `require_hanja` column"))?,
407        require_hangul: require_hangul
408            .ok_or_else(|| Error::message("rules TSV missing required `require_hangul` column"))?,
409        reason: reason
410            .ok_or_else(|| Error::message("rules TSV missing required `reason` column"))?,
411        column_count: columns.len(),
412    })
413}
414
415fn parse_rule_row(line: &str, columns: &RulesHeaderColumns, location: &str) -> Result<Rule> {
416    let fields = line.split('\t').collect::<Vec<_>>();
417    ensure!(
418        fields.len() >= columns.column_count,
419        "{location}: expected {} TSV fields, got {}",
420        columns.column_count,
421        fields.len()
422    );
423
424    let kind_field = fields[columns.kind];
425    let kind = RuleKind::parse(kind_field).ok_or_else(|| {
426        Error::message(format!(
427            "{location}: unknown rule kind `{kind_field}`; expected `entry`, `contains`, or `reading`"
428        ))
429    })?;
430    let pattern = fields[columns.pattern];
431    ensure!(
432        !pattern.is_empty(),
433        "{location}: `pattern` must not be empty"
434    );
435    let require_hanja = parse_required_bool(fields[columns.require_hanja], location)?;
436    let require_hangul = parse_required_bool(fields[columns.require_hangul], location)?;
437    ensure!(
438        require_hanja || require_hangul,
439        "{location}: rule must set at least one of `require_hanja` or `require_hangul`"
440    );
441    let reason = fields[columns.reason].trim();
442    ensure!(
443        !reason.is_empty(),
444        "{location}: `reason` must not be empty so future maintainers can audit the rule"
445    );
446
447    Ok(Rule {
448        kind,
449        pattern: pattern.to_owned(),
450        mark: EntryMark {
451            require_hanja,
452            require_hangul,
453        },
454        reason: reason.to_owned(),
455        location: location.to_owned(),
456    })
457}
458
459fn parse_required_bool(value: &str, location: &str) -> Result<bool> {
460    match value {
461        "true" | "1" => Ok(true),
462        "false" | "0" | "" => Ok(false),
463        other => bail!("{location}: invalid boolean value `{other}`"),
464    }
465}
466
467/// Applies parsed rules to dictionary entries by OR-merging their mark bits.
468///
469/// When `allow_unmatched` is false, all rules that matched no entry are
470/// collected and reported as a single error so editors can fix them in one
471/// pass.  When true, unmatched rules are silently ignored (useful for partial
472/// dictionaries shared across builds).
473pub fn apply_rules(
474    entries: &mut [DictionaryEntry],
475    rules: &[Rule],
476    allow_unmatched: bool,
477) -> Result<()> {
478    if rules.is_empty() {
479        return Ok(());
480    }
481
482    tracing::info!(
483        rule_count = rules.len(),
484        entry_count = entries.len(),
485        "applying dictionary rules"
486    );
487
488    for rule in rules {
489        ensure!(
490            !rule.pattern.is_empty(),
491            "{}: rule pattern must not be empty",
492            rule.location,
493        );
494        ensure!(
495            rule.mark.require_hanja || rule.mark.require_hangul,
496            "{}: rule must set at least one of `require_hanja` or `require_hangul`",
497            rule.location,
498        );
499        if matches!(rule.kind, RuleKind::Contains) {
500            ensure!(
501                rule.pattern.chars().all(gukhanmun_core::is_hanja),
502                "{}: `contains` rule pattern `{}` must consist only of hanja characters; \
503                 dictionary keys can be mixed-script so a pattern with hangul or other \
504                 scripts would silently match unrelated entries",
505                rule.location,
506                rule.pattern,
507            );
508        }
509    }
510    let mut matched = vec![false; rules.len()];
511
512    for entry in entries.iter_mut() {
513        let hanja = entry.hanja().to_owned();
514        let reading = entry.reading().to_owned();
515        for (i, rule) in rules.iter().enumerate() {
516            let hit = match rule.kind {
517                RuleKind::Entry => hanja == rule.pattern,
518                RuleKind::Contains => hanja.contains(rule.pattern.as_str()),
519                RuleKind::Reading => reading == rule.pattern,
520            };
521            if hit {
522                matched[i] = true;
523                let mut mark = entry.mark();
524                mark.require_hanja |= rule.mark.require_hanja;
525                mark.require_hangul |= rule.mark.require_hangul;
526                entry.set_mark(mark);
527            }
528        }
529    }
530
531    if !allow_unmatched {
532        let mut unmatched = rules
533            .iter()
534            .zip(matched.iter())
535            .filter(|(_, hit)| !**hit)
536            .map(|(rule, _)| {
537                format!(
538                    "{}: rule `{}={}` matched no entries",
539                    rule.location,
540                    rule.kind.as_str(),
541                    rule.pattern,
542                )
543            })
544            .collect::<Vec<_>>();
545        if !unmatched.is_empty() {
546            tracing::error!(
547                unmatched_count = unmatched.len(),
548                "rules matched no entries"
549            );
550            unmatched.sort();
551            bail!(
552                "{} unmatched rule(s):\n  {}",
553                unmatched.len(),
554                unmatched.join("\n  ")
555            );
556        }
557    }
558
559    Ok(())
560}
561
/// Options controlling dictionary file construction.
///
/// [`BuildOptions::default`] yields FST output, the `Error` merge policy, no
/// validation, [`DEFAULT_MAX_KEY_BYTES`], and no metadata or rules.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct BuildOptions {
    /// Output backend format.
    pub format: DictionaryFormat,

    /// Duplicate-key merge policy.
    pub merge: MergePolicy,

    /// Whether to reopen and validate the generated output.
    pub validate: bool,

    /// Maximum accepted UTF-8 byte length for dictionary keys.
    pub max_key_bytes: usize,

    /// User-supplied metadata values embedded in the output file.
    pub metadata: BTreeMap<String, String>,

    /// Paths to rules TSV files whose entries OR-merge marks into the
    /// dictionary entries before serialization.
    pub rules: Vec<PathBuf>,

    /// Allow rules that match no entries to pass instead of erroring.
    pub allow_unmatched_rules: bool,
}
587
588impl Default for BuildOptions {
589    fn default() -> Self {
590        Self {
591            format: DictionaryFormat::Fst,
592            merge: MergePolicy::Error,
593            validate: false,
594            max_key_bytes: DEFAULT_MAX_KEY_BYTES,
595            metadata: BTreeMap::new(),
596            rules: Vec::new(),
597            allow_unmatched_rules: false,
598        }
599    }
600}
601
602/// Builds a dictionary file from normalized TSV, CSV, or JSONL inputs.
603pub fn build_dictionary(
604    input_paths: &[PathBuf],
605    output_path: impl AsRef<Path>,
606    options: &BuildOptions,
607) -> Result<()> {
608    ensure!(
609        !input_paths.is_empty(),
610        "at least one input file is required"
611    );
612    tracing::info!(
613        input_count = input_paths.len(),
614        output = %output_path.as_ref().display(),
615        ?options.format,
616        "building dictionary"
617    );
618    let mut entries = read_and_merge_inputs(input_paths, options)?;
619    if !options.rules.is_empty() {
620        let mut rules = Vec::new();
621        let mut seen = BTreeSet::<(RuleKind, String)>::new();
622        for path in &options.rules {
623            for rule in parse_rules_file(path)? {
624                if !seen.insert((rule.kind, rule.pattern.clone())) {
625                    bail!(
626                        "{}: duplicate rule for kind `{}` and pattern `{}`",
627                        rule.location,
628                        rule.kind.as_str(),
629                        rule.pattern,
630                    );
631                }
632                rules.push(rule);
633            }
634        }
635        apply_rules(&mut entries, &rules, options.allow_unmatched_rules)?;
636    }
637    let metadata = build_metadata(&options.metadata, &entries)?;
638    match options.format {
639        DictionaryFormat::Fst => {
640            let bytes = build_fst_bytes(&entries, &metadata)?;
641            fs::write(output_path.as_ref(), &bytes)
642                .with_context(|| format!("failed to write {}", output_path.as_ref().display()))?;
643
644            if options.validate {
645                let dictionary = FstDictionary::open(output_path.as_ref()).with_context(|| {
646                    format!("failed to validate {}", output_path.as_ref().display())
647                })?;
648                validate_fst_round_trip(&entries, &dictionary)?;
649            }
650        }
651        DictionaryFormat::Cdb => {
652            reject_reserved_cdb_keys(&entries)?;
653            build_cdb_file(&entries, &metadata, output_path.as_ref())?;
654
655            if options.validate {
656                let dictionary = CdbDictionary::open(output_path.as_ref()).with_context(|| {
657                    format!("failed to validate {}", output_path.as_ref().display())
658                })?;
659                validate_cdb_round_trip(&entries, &dictionary)?;
660            }
661        }
662    }
663
664    tracing::info!(entry_count = entries.len(), "dictionary build complete");
665    Ok(())
666}
667
668fn read_and_merge_inputs(
669    input_paths: &[PathBuf],
670    options: &BuildOptions,
671) -> Result<Vec<DictionaryEntry>> {
672    let mut merged = BTreeMap::<String, DictionaryEntry>::new();
673
674    for path in input_paths {
675        let file =
676            fs::File::open(path).with_context(|| format!("failed to open {}", path.display()))?;
677        let entries = parse_input(BufReader::new(file), path, options.max_key_bytes)?;
678        for entry in entries {
679            match (options.merge, merged.contains_key(entry.hanja())) {
680                (MergePolicy::Error, true) => bail!("duplicate entry for `{}`", entry.hanja()),
681                (MergePolicy::FirstWins, true) => {}
682                (MergePolicy::LastWins, true) | (_, false) => {
683                    merged.insert(entry.hanja.clone(), entry);
684                }
685            }
686        }
687    }
688
689    Ok(merged.into_values().collect())
690}
691
692fn reject_reserved_cdb_keys(entries: &[DictionaryEntry]) -> Result<()> {
693    for entry in entries {
694        ensure!(
695            entry.hanja().as_bytes() != CDB_META_KEY,
696            "`{}` is reserved for CDB metadata",
697            entry.hanja()
698        );
699    }
700    Ok(())
701}
702
703fn parse_input(
704    reader: impl BufRead,
705    path: &Path,
706    max_key_bytes: usize,
707) -> Result<Vec<DictionaryEntry>> {
708    let format = match path.extension().and_then(|ext| ext.to_str()) {
709        Some("csv") => "csv",
710        Some("jsonl") => "jsonl",
711        _ => "tsv",
712    };
713    tracing::debug!(path = %path.display(), format, "detected dictionary input format");
714    match format {
715        "csv" => parse_csv(reader, path, max_key_bytes),
716        "jsonl" => parse_jsonl(reader, path, max_key_bytes),
717        _ => parse_tsv(reader, path, max_key_bytes),
718    }
719}
720
721fn parse_tsv(
722    reader: impl BufRead,
723    path: &Path,
724    max_key_bytes: usize,
725) -> Result<Vec<DictionaryEntry>> {
726    let mut lines = reader.lines();
727    let header = loop {
728        let Some(line) = lines.next() else {
729            bail!("{} is empty", path.display());
730        };
731        let line = line.with_context(|| format!("failed to read {}", path.display()))?;
732        if !line.is_empty() {
733            break line;
734        }
735    };
736    let columns = parse_header(&header)?;
737    let mut entries = Vec::new();
738
739    for (index, line) in lines.enumerate() {
740        let line_number = index + 2;
741        let line = line.with_context(|| format!("failed to read {}", path.display()))?;
742        if line.is_empty() {
743            continue;
744        }
745        entries.push(parse_row(
746            &line,
747            &columns,
748            max_key_bytes,
749            &format!("{}:{line_number}", path.display()),
750        )?);
751    }
752
753    Ok(entries)
754}
755
756fn parse_csv(
757    reader: impl BufRead,
758    path: &Path,
759    max_key_bytes: usize,
760) -> Result<Vec<DictionaryEntry>> {
761    let mut reader = csv::Reader::from_reader(reader);
762    let header = reader
763        .headers()
764        .with_context(|| format!("failed to read CSV header from {}", path.display()))?
765        .iter()
766        .collect::<Vec<_>>()
767        .join("\t");
768    let columns = parse_header_with_format(&header, "CSV")?;
769    let mut entries = Vec::new();
770
771    for (index, record) in reader.records().enumerate() {
772        let location = format!("{}:{}", path.display(), index + 2);
773        let record = record.with_context(|| format!("failed to read CSV record at {location}"))?;
774        let fields = record.iter().collect::<Vec<_>>();
775        entries.push(parse_fields(&fields, &columns, max_key_bytes, &location)?);
776    }
777
778    Ok(entries)
779}
780
781fn parse_jsonl(
782    reader: impl BufRead,
783    path: &Path,
784    max_key_bytes: usize,
785) -> Result<Vec<DictionaryEntry>> {
786    let mut entries = Vec::new();
787    for (index, line) in reader.lines().enumerate() {
788        let line_number = index + 1;
789        let line = line.with_context(|| format!("failed to read {}", path.display()))?;
790        if line.trim().is_empty() {
791            continue;
792        }
793        let record: JsonLineEntry = serde_json::from_str(&line).with_context(|| {
794            format!(
795                "failed to parse JSONL record at {}:{line_number}",
796                path.display()
797            )
798        })?;
799        entries.push(normalize_entry(
800            &record.hanja,
801            &record.hangul,
802            EntryMark {
803                require_hanja: record.require_hanja,
804                require_hangul: record.require_hangul,
805            },
806            max_key_bytes,
807            &format!("{}:{line_number}", path.display()),
808        )?);
809    }
810    Ok(entries)
811}
812
// Zero-based column positions resolved from a dictionary input header.
#[derive(Clone, Debug)]
struct HeaderColumns {
    // Index of the required `hanja` column.
    hanja: usize,
    // Index of the required `hangul` column.
    hangul: usize,
    // Index of the optional `require_hanja` column, if the header has one.
    require_hanja: Option<usize>,
    // Index of the optional `require_hangul` column, if the header has one.
    require_hangul: Option<usize>,
    // Total number of header columns, including ignored ones; every row must
    // have at least this many fields.
    column_count: usize,
}
821
// Parses a tab-separated dictionary header, labelling header errors as "TSV".
fn parse_header(header: &str) -> Result<HeaderColumns> {
    parse_header_with_format(header, "TSV")
}
825
826fn parse_header_with_format(header: &str, format_name: &str) -> Result<HeaderColumns> {
827    let columns = header.split('\t').collect::<Vec<_>>();
828    let mut seen = BTreeSet::new();
829    let mut hanja = None;
830    let mut hangul = None;
831    let mut require_hanja = None;
832    let mut require_hangul = None;
833
834    for (index, column) in columns.iter().enumerate() {
835        ensure!(
836            !column.is_empty(),
837            "{format_name} header contains an empty column name"
838        );
839        ensure!(
840            seen.insert(*column),
841            "{format_name} header contains duplicate `{column}` column"
842        );
843        match *column {
844            "hanja" => hanja = Some(index),
845            "hangul" => hangul = Some(index),
846            "require_hanja" => require_hanja = Some(index),
847            "require_hangul" => require_hangul = Some(index),
848            extra => {
849                tracing::warn!(
850                    column = extra,
851                    format = format_name,
852                    "ignoring unsupported input column"
853                );
854            }
855        }
856    }
857
858    Ok(HeaderColumns {
859        hanja: hanja.ok_or_else(|| Error::message("missing required `hanja` column"))?,
860        hangul: hangul.ok_or_else(|| Error::message("missing required `hangul` column"))?,
861        require_hanja,
862        require_hangul,
863        column_count: columns.len(),
864    })
865}
866
867fn parse_row(
868    line: &str,
869    columns: &HeaderColumns,
870    max_key_bytes: usize,
871    location: &str,
872) -> Result<DictionaryEntry> {
873    let fields = line.split('\t').collect::<Vec<_>>();
874    parse_fields(&fields, columns, max_key_bytes, location)
875}
876
877fn parse_fields(
878    fields: &[&str],
879    columns: &HeaderColumns,
880    max_key_bytes: usize,
881    location: &str,
882) -> Result<DictionaryEntry> {
883    ensure!(
884        fields.len() >= columns.column_count,
885        "{location}: expected {} TSV fields, got {}",
886        columns.column_count,
887        fields.len()
888    );
889
890    let hanja = fields[columns.hanja];
891    let hangul = fields[columns.hangul];
892    let require_hanja = parse_optional_bool(fields, columns.require_hanja, location)?;
893    let require_hangul = parse_optional_bool(fields, columns.require_hangul, location)?;
894
895    normalize_entry(
896        hanja,
897        hangul,
898        EntryMark {
899            require_hanja,
900            require_hangul,
901        },
902        max_key_bytes,
903        location,
904    )
905}
906
907fn normalize_entry(
908    hanja: &str,
909    hangul: &str,
910    mark: EntryMark,
911    max_key_bytes: usize,
912    location: &str,
913) -> Result<DictionaryEntry> {
914    ensure!(!hanja.is_empty(), "{location}: `hanja` must not be empty");
915    ensure!(!hangul.is_empty(), "{location}: `hangul` must not be empty");
916    ensure!(
917        hanja.len() <= max_key_bytes,
918        "{location}: key `{hanja}` exceeds --max-key-bytes={max_key_bytes}"
919    );
920
921    Ok(DictionaryEntry::new(hanja, hangul, mark))
922}
923
924fn parse_optional_bool(fields: &[&str], index: Option<usize>, location: &str) -> Result<bool> {
925    let Some(index) = index else {
926        return Ok(false);
927    };
928    let Some(value) = fields.get(index).copied() else {
929        return Ok(false);
930    };
931    if value.is_empty() {
932        return Ok(false);
933    }
934    match value {
935        "true" | "1" => Ok(true),
936        "false" | "0" => Ok(false),
937        _ => bail!("{location}: invalid boolean value `{value}`"),
938    }
939}
940
941fn build_metadata(
942    user_metadata: &BTreeMap<String, String>,
943    entries: &[DictionaryEntry],
944) -> Result<BTreeMap<String, String>> {
945    for key in RESERVED_METADATA_KEYS {
946        ensure!(
947            !user_metadata.contains_key(*key),
948            "`{key}` metadata is reserved"
949        );
950    }
951
952    let mut metadata = BTreeMap::new();
953    metadata.insert(
954        "source".to_owned(),
955        user_metadata.get("source").cloned().unwrap_or_default(),
956    );
957    metadata.insert(
958        "license".to_owned(),
959        user_metadata.get("license").cloned().unwrap_or_default(),
960    );
961    metadata.insert(
962        "build_date".to_owned(),
963        user_metadata
964            .get("build_date")
965            .cloned()
966            .unwrap_or_else(default_build_date),
967    );
968    metadata.insert("entry_count".to_owned(), entries.len().to_string());
969    metadata.insert("version".to_owned(), FORMAT_VERSION.to_string());
970    metadata.insert(
971        "max_word_chars".to_owned(),
972        entries
973            .iter()
974            .map(|entry| entry.hanja().chars().count())
975            .max()
976            .unwrap_or(0)
977            .to_string(),
978    );
979    metadata.insert(
980        "max_key_bytes".to_owned(),
981        entries
982            .iter()
983            .map(|entry| entry.hanja().len())
984            .max()
985            .unwrap_or(0)
986            .to_string(),
987    );
988
989    for (key, value) in user_metadata {
990        metadata.entry(key.clone()).or_insert_with(|| value.clone());
991    }
992
993    Ok(metadata)
994}
995
996fn default_build_date() -> String {
997    let Some(epoch) = env::var("SOURCE_DATE_EPOCH")
998        .ok()
999        .and_then(|epoch| epoch.parse::<i64>().ok())
1000    else {
1001        return "1970-01-01T00:00:00Z".to_owned();
1002    };
1003    OffsetDateTime::from_unix_timestamp(epoch)
1004        .ok()
1005        .and_then(|datetime| datetime.format(&Rfc3339).ok())
1006        .unwrap_or_else(|| "1970-01-01T00:00:00Z".to_owned())
1007}
1008
1009fn build_fst_bytes(
1010    entries: &[DictionaryEntry],
1011    metadata: &BTreeMap<String, String>,
1012) -> Result<Vec<u8>> {
1013    let mut metadata_bytes = Vec::new();
1014    into_writer(metadata, &mut metadata_bytes).context("failed to encode dictionary metadata")?;
1015
1016    let mut readings = Vec::new();
1017    let mut builder = MapBuilder::memory();
1018    for entry in entries {
1019        let reading_len = u16::try_from(entry.reading().len())
1020            .with_context(|| format!("reading for `{}` is too long", entry.hanja()))?;
1021        let reading_offset =
1022            u64::try_from(readings.len()).context("reading table offset too large")?;
1023        ensure!(
1024            reading_offset <= VALUE_MAX_OFFSET,
1025            "reading table exceeds the FST value layout"
1026        );
1027        let value = encode_value(reading_len, entry.mark(), reading_offset);
1028        builder
1029            .insert(entry.hanja().as_bytes(), value)
1030            .with_context(|| format!("failed to insert `{}` into FST", entry.hanja()))?;
1031        readings.extend_from_slice(entry.reading().as_bytes());
1032    }
1033    let fst_bytes = builder.into_inner().context("failed to finish FST map")?;
1034
1035    let metadata_offset = u64::try_from(FIXED_HEADER_LEN).expect("header length fits in u64");
1036    let fst_offset = metadata_offset
1037        .checked_add(u64::try_from(metadata_bytes.len()).context("metadata too large")?)
1038        .context("FST offset overflow")?;
1039    let readings_offset = fst_offset
1040        .checked_add(u64::try_from(fst_bytes.len()).context("FST bytes too large")?)
1041        .context("reading table offset overflow")?;
1042    let header = FixedHeader {
1043        metadata_offset,
1044        metadata_len: u64::try_from(metadata_bytes.len()).context("metadata too large")?,
1045        fst_offset,
1046        fst_len: u64::try_from(fst_bytes.len()).context("FST bytes too large")?,
1047        readings_offset,
1048        readings_len: u64::try_from(readings.len()).context("reading table too large")?,
1049    };
1050
1051    let mut output = Vec::with_capacity(
1052        FIXED_HEADER_LEN + metadata_bytes.len() + fst_bytes.len() + readings.len(),
1053    );
1054    header.write(&mut output);
1055    output.extend(metadata_bytes);
1056    output.extend(fst_bytes);
1057    output.extend(readings);
1058    tracing::info!(
1059        entry_count = entries.len(),
1060        total_bytes = output.len(),
1061        "built FST dictionary bytes"
1062    );
1063    Ok(output)
1064}
1065
1066fn build_cdb_file(
1067    entries: &[DictionaryEntry],
1068    metadata: &BTreeMap<String, String>,
1069    output_path: &Path,
1070) -> Result<()> {
1071    let records = build_cdb_records(entries);
1072    let mut metadata = metadata.clone();
1073    metadata.insert("prefix_count".to_owned(), records.len().to_string());
1074    let mut metadata_bytes = Vec::new();
1075    into_writer(&metadata, &mut metadata_bytes).context("failed to encode dictionary metadata")?;
1076
1077    let output_name = output_path.to_str().ok_or_else(|| {
1078        Error::message(format!(
1079            "CDB output path must be valid UTF-8: {}",
1080            output_path.display()
1081        ))
1082    })?;
1083    let mut writer = cdb::CDBWriter::create(output_name)
1084        .with_context(|| format!("failed to create {}", output_path.display()))?;
1085    writer
1086        .add(CDB_META_KEY, &metadata_bytes)
1087        .context("failed to add CDB metadata record")?;
1088    for (key, record) in records {
1089        let value = encode_cdb_record(record.as_ref())?;
1090        writer
1091            .add(key.as_bytes(), &value)
1092            .with_context(|| format!("failed to add CDB record `{key}`"))?;
1093    }
1094    writer
1095        .finish()
1096        .with_context(|| format!("failed to finish {}", output_path.display()))?;
1097    tracing::info!(
1098        entry_count = entries.len(),
1099        path = %output_path.display(),
1100        "built CDB dictionary file"
1101    );
1102    Ok(())
1103}
1104
1105fn build_cdb_records(entries: &[DictionaryEntry]) -> BTreeMap<String, Option<DictionaryEntry>> {
1106    let mut records = BTreeMap::new();
1107    for entry in entries {
1108        let mut prefix = String::new();
1109        for ch in entry.hanja().chars() {
1110            prefix.push(ch);
1111            records.entry(prefix.clone()).or_insert(None);
1112        }
1113        records.insert(entry.hanja().to_owned(), Some(entry.clone()));
1114    }
1115    records
1116}
1117
1118fn encode_cdb_record(entry: Option<&DictionaryEntry>) -> Result<Vec<u8>> {
1119    let mut output = Vec::new();
1120    match entry {
1121        Some(entry) => {
1122            let reading_len = u16::try_from(entry.reading().len())
1123                .with_context(|| format!("reading for `{}` is too long", entry.hanja()))?;
1124            output.push(1);
1125            output.push(encode_cdb_mark(entry.mark()));
1126            output.extend_from_slice(&reading_len.to_le_bytes());
1127            output.extend_from_slice(entry.reading().as_bytes());
1128        }
1129        None => {
1130            output.push(0);
1131            output.push(0);
1132            output.extend_from_slice(&0u16.to_le_bytes());
1133        }
1134    }
1135    Ok(output)
1136}
1137
1138fn encode_cdb_mark(mark: EntryMark) -> u8 {
1139    let mut encoded = 0;
1140    if mark.require_hanja {
1141        encoded |= CDB_MARK_REQUIRE_HANJA;
1142    }
1143    if mark.require_hangul {
1144        encoded |= CDB_MARK_REQUIRE_HANGUL;
1145    }
1146    encoded
1147}
1148
1149fn validate_fst_round_trip(entries: &[DictionaryEntry], dictionary: &FstDictionary) -> Result<()> {
1150    if dictionary.entry_count() != entries.len() as u64 {
1151        tracing::error!(
1152            actual = dictionary.entry_count(),
1153            expected = entries.len() as u64,
1154            "round-trip validation failed: entry count mismatch"
1155        );
1156        bail!("round-trip validation failed: entry count mismatch");
1157    }
1158    for entry in entries {
1159        let actual = dictionary.lookup(entry.hanja())?.ok_or_else(|| {
1160            Error::message(format!(
1161                "round-trip validation failed: `{}` is missing",
1162                entry.hanja()
1163            ))
1164        })?;
1165        let mark = actual.mark();
1166        ensure!(
1167            actual.reading() == entry.reading()
1168                && mark.require_hanja == entry.mark().require_hanja
1169                && mark.require_hangul == entry.mark().require_hangul,
1170            "round-trip validation failed for `{}`",
1171            entry.hanja()
1172        );
1173    }
1174    Ok(())
1175}
1176
1177fn validate_cdb_round_trip(entries: &[DictionaryEntry], dictionary: &CdbDictionary) -> Result<()> {
1178    if dictionary.entry_count() != entries.len() as u64 {
1179        tracing::error!(
1180            actual = dictionary.entry_count(),
1181            expected = entries.len() as u64,
1182            "round-trip validation failed: entry count mismatch"
1183        );
1184        bail!("round-trip validation failed: entry count mismatch");
1185    }
1186    for entry in entries {
1187        let actual = dictionary.lookup(entry.hanja())?.ok_or_else(|| {
1188            Error::message(format!(
1189                "round-trip validation failed: `{}` is missing",
1190                entry.hanja()
1191            ))
1192        })?;
1193        let mark = actual.mark();
1194        ensure!(
1195            actual.reading() == entry.reading()
1196                && mark.require_hanja == entry.mark().require_hanja
1197                && mark.require_hangul == entry.mark().require_hangul,
1198            "round-trip validation failed for `{}`",
1199            entry.hanja()
1200        );
1201    }
1202    Ok(())
1203}
1204
/// One dictionary record as deserialized through serde.
///
/// NOTE(review): the name suggests one object per line of a JSON Lines input;
/// the code that performs the parsing is outside this part of the file.
#[derive(Debug, Deserialize)]
struct JsonLineEntry {
    /// Dictionary key; required.
    hanja: String,
    /// Reading for the key; required.
    hangul: String,
    /// Optional flag; `false` when absent. Also accepted as `requireHanja`.
    #[serde(default, alias = "requireHanja")]
    require_hanja: bool,
    /// Optional flag; `false` when absent. Also accepted as `requireHangul`.
    #[serde(default, alias = "requireHangul")]
    require_hangul: bool,
}
1214
1215fn encode_value(reading_len: u16, mark: EntryMark, reading_offset: u64) -> u64 {
1216    u64::from(reading_len)
1217        | (u64::from(encode_mark(mark)) << VALUE_MARK_SHIFT)
1218        | (reading_offset << VALUE_OFFSET_SHIFT)
1219}
1220
1221fn encode_mark(mark: EntryMark) -> u8 {
1222    let mut encoded = 0;
1223    if mark.require_hanja {
1224        encoded |= MARK_REQUIRE_HANJA;
1225    }
1226    if mark.require_hangul {
1227        encoded |= MARK_REQUIRE_HANGUL;
1228    }
1229    encoded
1230}
1231
/// Section table stored in the fixed-size header of an FST dictionary image.
///
/// Each section is described by a byte offset and a byte length. As filled in
/// by `build_fst_bytes`, offsets count from the start of the image and the
/// sections follow the header back to back: metadata, FST map, reading table.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
struct FixedHeader {
    /// Offset of the encoded metadata section.
    metadata_offset: u64,
    /// Length of the encoded metadata section.
    metadata_len: u64,
    /// Offset of the FST map bytes.
    fst_offset: u64,
    /// Length of the FST map bytes.
    fst_len: u64,
    /// Offset of the reading table.
    readings_offset: u64,
    /// Length of the reading table.
    readings_len: u64,
}
1241
1242impl FixedHeader {
1243    fn write(self, output: &mut Vec<u8>) {
1244        output.extend_from_slice(MAGIC);
1245        output.extend_from_slice(&FORMAT_VERSION.to_le_bytes());
1246        output.extend_from_slice(&(FIXED_HEADER_LEN as u32).to_le_bytes());
1247        output.extend_from_slice(&self.metadata_offset.to_le_bytes());
1248        output.extend_from_slice(&self.metadata_len.to_le_bytes());
1249        output.extend_from_slice(&self.fst_offset.to_le_bytes());
1250        output.extend_from_slice(&self.fst_len.to_le_bytes());
1251        output.extend_from_slice(&self.readings_offset.to_le_bytes());
1252        output.extend_from_slice(&self.readings_len.to_le_bytes());
1253        debug_assert_eq!(output.len(), FIXED_HEADER_LEN);
1254    }
1255}
1256
1257/// Parses one `KEY=VAL` metadata argument.
1258pub fn parse_metadata_arg(arg: &str) -> Result<(String, String)> {
1259    let (key, value) = arg
1260        .split_once('=')
1261        .ok_or_else(|| Error::message("metadata must use KEY=VAL syntax"))?;
1262    ensure!(!key.is_empty(), "metadata key must not be empty");
1263    Ok((key.to_owned(), value.to_owned()))
1264}
1265
// Unit tests for TSV parsing, metadata validation, rules parsing, and
// `apply_rules`.
#[cfg(test)]
mod tests {
    use tracing_test::traced_test;

    use super::*;

    // `traced_test` captures tracing output so `logs_contain` can inspect it.
    #[traced_test]
    #[test]
    fn unmatched_rules_emits_error_event() {
        let mut entries = vec![DictionaryEntry::new("漢字", "한자", EntryMark::default())];
        let rules = vec![Rule::new(
            RuleKind::Entry,
            "天地",
            EntryMark {
                require_hanja: true,
                require_hangul: false,
            },
            "missing entry",
        )];

        let result = apply_rules(&mut entries, &rules, false);

        assert!(result.is_err());
        assert!(logs_contain("rules matched no entries"));
    }

    // Covers both boolean spellings (`1`, `false`, `true`), an empty flag
    // cell, and an extra `category` column.
    #[test]
    fn parses_headered_tsv_and_optional_flags() {
        let input = "hanja\thangul\trequire_hanja\trequire_hangul\tcategory\n漢字\t한자\t1\tfalse\tnoun\n天地\t천지\t\ttrue\tnoun\n";

        let entries = parse_tsv(input.as_bytes(), Path::new("fixture.tsv"), 1024).unwrap();

        assert_eq!(entries.len(), 2);
        assert_eq!(entries[0].hanja(), "漢字");
        assert_eq!(entries[0].reading(), "한자");
        assert!(entries[0].mark().require_hanja);
        assert!(!entries[0].mark().require_hangul);
        assert!(!entries[1].mark().require_hanja);
        assert!(entries[1].mark().require_hangul);
    }

    #[test]
    fn rejects_invalid_boolean_values() {
        let input = "hanja\thangul\trequire_hanja\n漢字\t한자\tyes\n";

        let error = parse_tsv(input.as_bytes(), Path::new("fixture.tsv"), 1024).unwrap_err();

        assert!(error.to_string().contains("invalid boolean value `yes`"));
    }

    #[test]
    fn rejects_reserved_metadata_keys() {
        let metadata = BTreeMap::from([("entry_count".to_owned(), "1".to_owned())]);

        let error = build_metadata(&metadata, &[]).unwrap_err();

        assert!(error.to_string().contains("reserved"));
    }

    // Parses rules TSV from an in-memory string, using `rules.tsv` as the
    // source path.
    fn parse_rules_str(input: &str) -> Result<Vec<Rule>> {
        parse_rules_reader(input.as_bytes(), Path::new("rules.tsv"))
    }

    #[test]
    fn parses_minimal_rules_tsv() {
        let input = "kind\tpattern\trequire_hanja\trequire_hangul\treason\n\
                     entry\t漢字\ttrue\tfalse\thomophone\n\
                     contains\t驟\ttrue\tfalse\trare hanja\n\
                     reading\t사기\ttrue\tfalse\tcommon homophone\n";

        let rules = parse_rules_str(input).unwrap();

        assert_eq!(rules.len(), 3);
        assert_eq!(rules[0].kind(), RuleKind::Entry);
        assert_eq!(rules[0].pattern(), "漢字");
        assert!(rules[0].mark().require_hanja);
        assert!(!rules[0].mark().require_hangul);
        assert_eq!(rules[0].reason(), "homophone");
        assert_eq!(rules[1].kind(), RuleKind::Contains);
        assert_eq!(rules[2].kind(), RuleKind::Reading);
    }

    #[test]
    fn rejects_unknown_rule_kind() {
        let input = "kind\tpattern\trequire_hanja\trequire_hangul\treason\n\
                     glob\t漢*\ttrue\tfalse\tnope\n";

        let error = parse_rules_str(input).unwrap_err();

        let text = error.to_string();
        assert!(text.contains("unknown rule kind `glob`"), "{text}");
        // Recovery guidance must enumerate the currently accepted kinds.
        assert!(text.contains("`entry`"), "{text}");
        assert!(text.contains("`contains`"), "{text}");
        assert!(text.contains("`reading`"), "{text}");
    }

    #[test]
    fn rejects_contains_rule_with_non_hanja_pattern_from_tsv() {
        // Mixed-script dictionary keys (e.g. `布告하다`) mean a non-hanja
        // `contains` pattern would silently mark unrelated entries; reject at
        // the apply step.
        let input = "kind\tpattern\trequire_hanja\trequire_hangul\treason\n\
                     contains\t하다\ttrue\tfalse\ttypo\n";
        let rules = parse_rules_str(input).unwrap();
        let mut entries = vec![entry("布告하다", "포고하다")];

        let error = apply_rules(&mut entries, &rules, false).unwrap_err();

        assert!(
            error.to_string().contains("must consist only of hanja"),
            "{error}"
        );
    }

    #[test]
    fn rejects_rule_with_empty_reason() {
        let input = "kind\tpattern\trequire_hanja\trequire_hangul\treason\n\
                     entry\t漢字\ttrue\tfalse\t\n";

        let error = parse_rules_str(input).unwrap_err();

        assert!(error.to_string().contains("reason"), "{error}");
    }

    #[test]
    fn rejects_rule_with_no_mark_bits_set() {
        let input = "kind\tpattern\trequire_hanja\trequire_hangul\treason\n\
                     entry\t漢字\tfalse\tfalse\tno-op\n";

        let error = parse_rules_str(input).unwrap_err();

        assert!(
            error
                .to_string()
                .contains("at least one of `require_hanja` or `require_hangul`"),
            "{error}"
        );
    }

    // Two rows with the same kind and pattern count as duplicates even though
    // their flags and reasons differ.
    #[test]
    fn rejects_duplicate_rule_keys() {
        let input = "kind\tpattern\trequire_hanja\trequire_hangul\treason\n\
                     entry\t漢字\ttrue\tfalse\tfirst\n\
                     entry\t漢字\tfalse\ttrue\tsecond\n";

        let error = parse_rules_str(input).unwrap_err();

        assert!(error.to_string().contains("duplicate rule"), "{error}");
    }

    #[test]
    fn allows_overlapping_rules_across_kinds() {
        let input = "kind\tpattern\trequire_hanja\trequire_hangul\treason\n\
                     entry\t漢字\ttrue\tfalse\thomophone entry\n\
                     contains\t漢\ttrue\tfalse\trare character\n";

        let rules = parse_rules_str(input).unwrap();

        assert_eq!(rules.len(), 2);
    }

    // Builds an entry carrying the default mark.
    fn entry(hanja: &str, reading: &str) -> DictionaryEntry {
        DictionaryEntry::new(hanja, reading, EntryMark::default())
    }

    #[test]
    fn apply_rules_or_merges_marks_across_kinds() {
        let mut entries = vec![
            entry("漢字", "한자"),
            entry("天地", "천지"),
            entry("史記", "사기"),
            entry("詐欺", "사기"),
        ];
        let rules = vec![
            Rule::new(
                RuleKind::Entry,
                "漢字",
                EntryMark {
                    require_hanja: true,
                    require_hangul: false,
                },
                "homophone-heavy entry",
            ),
            Rule::new(
                RuleKind::Contains,
                "天",
                EntryMark {
                    require_hanja: true,
                    require_hangul: false,
                },
                "rare hanja",
            ),
            Rule::new(
                RuleKind::Reading,
                "사기",
                EntryMark {
                    require_hanja: true,
                    require_hangul: false,
                },
                "ambiguous reading",
            ),
        ];

        apply_rules(&mut entries, &rules, false).unwrap();

        assert!(
            entries[0].mark().require_hanja,
            "entry rule applied to 漢字"
        );
        assert!(
            entries[1].mark().require_hanja,
            "contains rule applied to 天地"
        );
        assert!(
            entries[2].mark().require_hanja,
            "reading rule applied to 史記"
        );
        assert!(
            entries[3].mark().require_hanja,
            "reading rule applied to 詐欺"
        );
    }

    // Two rules that set different flags on the same entry must both take
    // effect.
    #[test]
    fn apply_rules_or_merges_multiple_rules_on_one_entry() {
        let mut entries = vec![entry("漢字", "한자")];
        let rules = vec![
            Rule::new(
                RuleKind::Entry,
                "漢字",
                EntryMark {
                    require_hanja: true,
                    require_hangul: false,
                },
                "entry-level",
            ),
            Rule::new(
                RuleKind::Reading,
                "한자",
                EntryMark {
                    require_hanja: false,
                    require_hangul: true,
                },
                "reading-level",
            ),
        ];

        apply_rules(&mut entries, &rules, false).unwrap();

        let mark = entries[0].mark();
        assert!(mark.require_hanja);
        assert!(mark.require_hangul);
    }

    #[test]
    fn apply_rules_reports_all_unmatched_rules_in_one_error() {
        let mut entries = vec![entry("漢字", "한자")];
        let rules = vec![
            Rule::new(
                RuleKind::Entry,
                "天地",
                EntryMark {
                    require_hanja: true,
                    require_hangul: false,
                },
                "missing entry",
            ),
            Rule::new(
                RuleKind::Contains,
                "驟",
                EntryMark {
                    require_hanja: true,
                    require_hangul: false,
                },
                "missing contains",
            ),
        ];

        let error = apply_rules(&mut entries, &rules, false).unwrap_err();

        let text = error.to_string();
        assert!(text.contains("entry=天地"), "{text}");
        assert!(text.contains("contains=驟"), "{text}");
        assert!(text.contains("2 unmatched"), "{text}");
    }

    #[test]
    fn apply_rules_accepts_multi_hanja_contains_pattern() {
        // `contains` is a substring matcher, so multi-character hanja patterns
        // mark every entry containing the substring.
        let mut entries = vec![
            entry("國民學校", "국민학교"),
            entry("國民年金", "국민연금"),
            entry("民國", "민국"),
        ];
        let rules = vec![Rule::new(
            RuleKind::Contains,
            "國民",
            EntryMark {
                require_hanja: true,
                require_hangul: false,
            },
            "compound containing 國民",
        )];

        apply_rules(&mut entries, &rules, false).unwrap();

        assert!(entries[0].mark().require_hanja);
        assert!(entries[1].mark().require_hanja);
        assert!(
            !entries[2].mark().require_hanja,
            "民國 does not contain the substring 國民"
        );
    }

    #[test]
    fn apply_rules_rejects_contains_rule_with_non_hanja_character() {
        // Dictionary keys can be mixed-script (e.g. `布告하다`), so a `contains`
        // pattern with hangul would silently mark every `~하다` entry.
        let mut entries = vec![entry("布告하다", "포고하다"), entry("漢字", "한자")];
        let rules = vec![Rule::new(
            RuleKind::Contains,
            "하",
            EntryMark {
                require_hanja: true,
                require_hangul: false,
            },
            "typo: meant a rare hanja",
        )];

        let error = apply_rules(&mut entries, &rules, false).unwrap_err();

        let text = error.to_string();
        assert!(text.contains("must consist only of hanja"), "{text}");
        assert!(
            !entries[0].mark().require_hanja,
            "the typo'd rule must not silently mark 布告하다"
        );
    }

    // Rules built directly with `Rule::new` skip the TSV parser, so
    // `apply_rules` must reject an empty pattern itself.
    #[test]
    fn apply_rules_rejects_programmatic_empty_pattern() {
        let mut entries = vec![entry("漢字", "한자")];
        let rules = vec![Rule::new(
            RuleKind::Entry,
            "",
            EntryMark {
                require_hanja: true,
                require_hangul: false,
            },
            "programmatic mistake",
        )];

        let error = apply_rules(&mut entries, &rules, false).unwrap_err();

        assert!(error.to_string().contains("must not be empty"), "{error}");
    }

    // Same as above, for a rule that sets neither flag.
    #[test]
    fn apply_rules_rejects_programmatic_no_mark_bits() {
        let mut entries = vec![entry("漢字", "한자")];
        let rules = vec![Rule::new(
            RuleKind::Entry,
            "漢字",
            EntryMark::default(),
            "programmatic mistake",
        )];

        let error = apply_rules(&mut entries, &rules, false).unwrap_err();

        assert!(
            error
                .to_string()
                .contains("at least one of `require_hanja` or `require_hangul`"),
            "{error}"
        );
    }

    // Passing `true` as the third argument lets an unmatched rule through
    // without an error and without marking anything.
    #[test]
    fn apply_rules_allows_unmatched_when_configured() {
        let mut entries = vec![entry("漢字", "한자")];
        let rules = vec![Rule::new(
            RuleKind::Entry,
            "天地",
            EntryMark {
                require_hanja: true,
                require_hangul: false,
            },
            "missing entry",
        )];

        apply_rules(&mut entries, &rules, true).unwrap();

        assert!(!entries[0].mark().require_hanja);
    }
}