1#![forbid(unsafe_code)]
24#![deny(missing_docs)]
25
26use std::collections::{BTreeMap, BTreeSet};
27use std::env;
28use std::error::Error as StdError;
29use std::fs;
30use std::io::{BufRead, BufReader};
31use std::path::{Path, PathBuf};
32
33use ciborium::ser::into_writer;
34use fst::MapBuilder;
35use gukhanmun_cdb::CdbDictionary;
36use gukhanmun_fst::FstDictionary;
37use serde::{Deserialize, Serialize};
38use time::OffsetDateTime;
39use time::format_description::well_known::Rfc3339;
40
// Magic bytes written at the very start of the fixed FST file header.
const MAGIC: &[u8; 8] = b"GUKHMFST";
// Format version written into the fixed header and into the `version` metadata key.
const FORMAT_VERSION: u32 = 1;
// Size in bytes of the fixed header; the metadata section starts at this offset.
const FIXED_HEADER_LEN: usize = 64;
// FST mark bit that is set when an entry has `require_hanja`.
const MARK_REQUIRE_HANJA: u8 = 0b0000_0001;
// FST mark bit that is set when an entry has `require_hangul`.
const MARK_REQUIRE_HANGUL: u8 = 0b0000_0010;
// Left shift applied to the mark byte when packing an FST value.
const VALUE_MARK_SHIFT: u64 = 16;
// Left shift applied to the reading-table offset when packing an FST value.
const VALUE_OFFSET_SHIFT: u64 = 24;
// Largest reading-table offset accepted by the builder (40 bits).
const VALUE_MAX_OFFSET: u64 = (1u64 << 40) - 1;
// Metadata keys that callers may not supply; the builder computes these itself.
const RESERVED_METADATA_KEYS: &[&str] = &[
    "entry_count",
    "version",
    "max_word_chars",
    "max_key_bytes",
    "prefix_count",
];
// CDB key under which the encoded metadata map is stored.
const CDB_META_KEY: &[u8] = b"__gukhanmun_meta__";
// CDB mark bit that is set when an entry has `require_hanja`.
const CDB_MARK_REQUIRE_HANJA: u8 = 0b0000_0001;
// CDB mark bit that is set when an entry has `require_hangul`.
const CDB_MARK_REQUIRE_HANGUL: u8 = 0b0000_0010;
59
/// Errors produced while parsing inputs or building a dictionary.
#[derive(Debug, thiserror::Error)]
#[non_exhaustive]
pub enum Error {
    /// A failure described only by its message.
    #[error("{0}")]
    Message(String),

    /// An underlying error wrapped with a description of what was being attempted.
    #[error("{context}: {source}")]
    Source {
        /// Description of the operation that failed.
        context: String,
        /// The underlying error that caused the failure.
        #[source]
        source: Box<dyn StdError + Send + Sync + 'static>,
    },

    /// An error converted from `gukhanmun_fst::Error`.
    #[error(transparent)]
    Fst(#[from] gukhanmun_fst::Error),

    /// An error converted from `gukhanmun_cdb::Error`.
    #[error(transparent)]
    Cdb(#[from] gukhanmun_cdb::Error),
}
86
87impl Error {
88 fn message(message: impl Into<String>) -> Self {
89 Self::Message(message.into())
90 }
91
92 fn source(context: impl Into<String>, source: impl StdError + Send + Sync + 'static) -> Self {
93 Self::Source {
94 context: context.into(),
95 source: Box::new(source),
96 }
97 }
98}
99
/// Result alias whose error type is this crate's [`Error`].
pub type Result<T> = std::result::Result<T, Error>;
102
/// Extension methods that turn a foreign `Result` error into an `Error::Source`
/// carrying a description of the failed operation.
trait ResultContext<T> {
    /// Wraps the error, if any, with an eagerly supplied context string.
    fn context(self, context: impl Into<String>) -> Result<T>;

    /// Wraps the error, if any, with a context string that is only built on failure.
    fn with_context(self, context: impl FnOnce() -> String) -> Result<T>;
}
108
109impl<T, E> ResultContext<T> for std::result::Result<T, E>
110where
111 E: StdError + Send + Sync + 'static,
112{
113 fn context(self, context: impl Into<String>) -> Result<T> {
114 self.map_err(|source| Error::source(context, source))
115 }
116
117 fn with_context(self, context: impl FnOnce() -> String) -> Result<T> {
118 self.map_err(|source| Error::source(context(), source))
119 }
120}
121
/// Extension method that turns a missing `Option` value into an `Error::Message`.
trait OptionContext<T> {
    /// Returns the contained value, or an error whose message is `context`.
    fn context(self, context: impl Into<String>) -> Result<T>;
}
125
126impl<T> OptionContext<T> for Option<T> {
127 fn context(self, context: impl Into<String>) -> Result<T> {
128 self.ok_or_else(|| Error::message(context.into()))
129 }
130}
131
// Returns early from the enclosing function with an `Error::Message` built
// from `format!`-style arguments.
macro_rules! bail {
    ($($arg:tt)*) => {
        return Err(Error::message(format!($($arg)*)))
    };
}
137
// Returns early via `bail!` with the formatted message when `$condition` is false.
macro_rules! ensure {
    ($condition:expr, $($arg:tt)*) => {
        if !$condition {
            bail!($($arg)*);
        }
    };
}
145
/// Default upper bound, in bytes, on the length of a hanja key
/// (used by `BuildOptions::default`).
pub const DEFAULT_MAX_KEY_BYTES: usize = 1024;
148
/// On-disk format that `build_dictionary` should produce.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum DictionaryFormat {
    /// FST-based file: fixed header, metadata, FST map, and reading table.
    Fst,

    /// CDB file containing a metadata record plus one record per key prefix.
    Cdb,
}
158
/// How duplicate hanja keys across (or within) input files are resolved.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum MergePolicy {
    /// Fail the build as soon as a key is seen a second time.
    Error,

    /// Keep the first entry seen for a key and ignore later ones.
    FirstWins,

    /// Replace an earlier entry with the most recently read one.
    LastWins,
}
171
/// Per-entry flags stored alongside a reading; both default to `false`.
///
/// NOTE(review): how consumers interpret the flags is not visible in this
/// file; the builder only stores and round-trips them.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Serialize, Deserialize)]
pub struct EntryMark {
    /// The `require_hanja` flag (encoded as bit 0 of the mark byte).
    pub require_hanja: bool,

    /// The `require_hangul` flag (encoded as bit 1 of the mark byte).
    pub require_hangul: bool,
}
181
182#[derive(Clone, Debug, Eq, PartialEq)]
184pub struct DictionaryEntry {
185 hanja: String,
186 reading: String,
187 mark: EntryMark,
188}
189
190impl DictionaryEntry {
191 pub fn new(hanja: impl Into<String>, reading: impl Into<String>, mark: EntryMark) -> Self {
193 Self {
194 hanja: hanja.into(),
195 reading: reading.into(),
196 mark,
197 }
198 }
199
200 pub fn hanja(&self) -> &str {
202 &self.hanja
203 }
204
205 pub fn reading(&self) -> &str {
207 &self.reading
208 }
209
210 pub fn mark(&self) -> EntryMark {
212 self.mark
213 }
214
215 pub fn set_mark(&mut self, mark: EntryMark) {
217 self.mark = mark;
218 }
219}
220
/// Selects how a rule's pattern is compared against dictionary entries.
#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub enum RuleKind {
    /// The pattern must equal the entry's hanja key exactly.
    Entry,

    /// The pattern must occur as a substring of the entry's hanja key.
    Contains,

    /// The pattern must equal the entry's reading exactly.
    Reading,
}

impl RuleKind {
    /// Parses the lowercase TSV spelling of a kind; any other text yields `None`.
    fn parse(value: &str) -> Option<Self> {
        [Self::Entry, Self::Contains, Self::Reading]
            .into_iter()
            .find(|candidate| candidate.as_str() == value)
    }

    /// Returns the lowercase spelling used in rules files and messages.
    fn as_str(self) -> &'static str {
        match self {
            Self::Reading => "reading",
            Self::Contains => "contains",
            Self::Entry => "entry",
        }
    }
}
253
254#[derive(Clone, Debug, Eq, PartialEq)]
257pub struct Rule {
258 kind: RuleKind,
259 pattern: String,
260 mark: EntryMark,
261 reason: String,
262 location: String,
263}
264
265impl Rule {
266 pub fn new(
274 kind: RuleKind,
275 pattern: impl Into<String>,
276 mark: EntryMark,
277 reason: impl Into<String>,
278 ) -> Self {
279 Self {
280 kind,
281 pattern: pattern.into(),
282 mark,
283 reason: reason.into(),
284 location: "<programmatic>".to_owned(),
285 }
286 }
287
288 pub fn kind(&self) -> RuleKind {
290 self.kind
291 }
292
293 pub fn pattern(&self) -> &str {
295 &self.pattern
296 }
297
298 pub fn mark(&self) -> EntryMark {
300 self.mark
301 }
302
303 pub fn reason(&self) -> &str {
305 &self.reason
306 }
307
308 pub fn location(&self) -> &str {
310 &self.location
311 }
312}
313
314pub fn parse_rules_file(path: &Path) -> Result<Vec<Rule>> {
321 let file =
322 fs::File::open(path).with_context(|| format!("failed to open {}", path.display()))?;
323 parse_rules_reader(BufReader::new(file), path)
324}
325
326fn parse_rules_reader(reader: impl BufRead, path: &Path) -> Result<Vec<Rule>> {
327 let mut lines = reader.lines();
328 let header = loop {
329 let Some(line) = lines.next() else {
330 bail!("{} is empty", path.display());
331 };
332 let line = line.with_context(|| format!("failed to read {}", path.display()))?;
333 if !line.is_empty() {
334 break line;
335 }
336 };
337 let columns = parse_rules_header(&header)?;
338 let mut rules = Vec::new();
339 let mut seen = BTreeSet::<(RuleKind, String)>::new();
340
341 for (index, line) in lines.enumerate() {
342 let line_number = index + 2;
343 let line = line.with_context(|| format!("failed to read {}", path.display()))?;
344 if line.is_empty() {
345 continue;
346 }
347 let location = format!("{}:{line_number}", path.display());
348 let rule = parse_rule_row(&line, &columns, &location)?;
349 if !seen.insert((rule.kind, rule.pattern.clone())) {
350 bail!(
351 "{}: duplicate rule for kind `{}` and pattern `{}`",
352 location,
353 rule.kind.as_str(),
354 rule.pattern,
355 );
356 }
357 rules.push(rule);
358 }
359
360 Ok(rules)
361}
362
// Zero-based column positions of the required fields in a rules TSV header.
#[derive(Clone, Debug)]
struct RulesHeaderColumns {
    // Index of the `kind` column.
    kind: usize,
    // Index of the `pattern` column.
    pattern: usize,
    // Index of the `require_hanja` column.
    require_hanja: usize,
    // Index of the `require_hangul` column.
    require_hangul: usize,
    // Index of the `reason` column.
    reason: usize,
    // Total number of header columns, including unsupported ones.
    column_count: usize,
}
372
373fn parse_rules_header(header: &str) -> Result<RulesHeaderColumns> {
374 let columns = header.split('\t').collect::<Vec<_>>();
375 let mut seen = BTreeSet::new();
376 let mut kind = None;
377 let mut pattern = None;
378 let mut require_hanja = None;
379 let mut require_hangul = None;
380 let mut reason = None;
381
382 for (index, column) in columns.iter().enumerate() {
383 ensure!(
384 !column.is_empty(),
385 "rules TSV header contains an empty column name"
386 );
387 ensure!(
388 seen.insert(*column),
389 "rules TSV header contains duplicate `{column}` column"
390 );
391 match *column {
392 "kind" => kind = Some(index),
393 "pattern" => pattern = Some(index),
394 "require_hanja" => require_hanja = Some(index),
395 "require_hangul" => require_hangul = Some(index),
396 "reason" => reason = Some(index),
397 extra => tracing::warn!(column = extra, "ignoring unsupported rules TSV column"),
398 }
399 }
400
401 Ok(RulesHeaderColumns {
402 kind: kind.ok_or_else(|| Error::message("rules TSV missing required `kind` column"))?,
403 pattern: pattern
404 .ok_or_else(|| Error::message("rules TSV missing required `pattern` column"))?,
405 require_hanja: require_hanja
406 .ok_or_else(|| Error::message("rules TSV missing required `require_hanja` column"))?,
407 require_hangul: require_hangul
408 .ok_or_else(|| Error::message("rules TSV missing required `require_hangul` column"))?,
409 reason: reason
410 .ok_or_else(|| Error::message("rules TSV missing required `reason` column"))?,
411 column_count: columns.len(),
412 })
413}
414
415fn parse_rule_row(line: &str, columns: &RulesHeaderColumns, location: &str) -> Result<Rule> {
416 let fields = line.split('\t').collect::<Vec<_>>();
417 ensure!(
418 fields.len() >= columns.column_count,
419 "{location}: expected {} TSV fields, got {}",
420 columns.column_count,
421 fields.len()
422 );
423
424 let kind_field = fields[columns.kind];
425 let kind = RuleKind::parse(kind_field).ok_or_else(|| {
426 Error::message(format!(
427 "{location}: unknown rule kind `{kind_field}`; expected `entry`, `contains`, or `reading`"
428 ))
429 })?;
430 let pattern = fields[columns.pattern];
431 ensure!(
432 !pattern.is_empty(),
433 "{location}: `pattern` must not be empty"
434 );
435 let require_hanja = parse_required_bool(fields[columns.require_hanja], location)?;
436 let require_hangul = parse_required_bool(fields[columns.require_hangul], location)?;
437 ensure!(
438 require_hanja || require_hangul,
439 "{location}: rule must set at least one of `require_hanja` or `require_hangul`"
440 );
441 let reason = fields[columns.reason].trim();
442 ensure!(
443 !reason.is_empty(),
444 "{location}: `reason` must not be empty so future maintainers can audit the rule"
445 );
446
447 Ok(Rule {
448 kind,
449 pattern: pattern.to_owned(),
450 mark: EntryMark {
451 require_hanja,
452 require_hangul,
453 },
454 reason: reason.to_owned(),
455 location: location.to_owned(),
456 })
457}
458
459fn parse_required_bool(value: &str, location: &str) -> Result<bool> {
460 match value {
461 "true" | "1" => Ok(true),
462 "false" | "0" | "" => Ok(false),
463 other => bail!("{location}: invalid boolean value `{other}`"),
464 }
465}
466
467pub fn apply_rules(
474 entries: &mut [DictionaryEntry],
475 rules: &[Rule],
476 allow_unmatched: bool,
477) -> Result<()> {
478 if rules.is_empty() {
479 return Ok(());
480 }
481
482 tracing::info!(
483 rule_count = rules.len(),
484 entry_count = entries.len(),
485 "applying dictionary rules"
486 );
487
488 for rule in rules {
489 ensure!(
490 !rule.pattern.is_empty(),
491 "{}: rule pattern must not be empty",
492 rule.location,
493 );
494 ensure!(
495 rule.mark.require_hanja || rule.mark.require_hangul,
496 "{}: rule must set at least one of `require_hanja` or `require_hangul`",
497 rule.location,
498 );
499 if matches!(rule.kind, RuleKind::Contains) {
500 ensure!(
501 rule.pattern.chars().all(gukhanmun_core::is_hanja),
502 "{}: `contains` rule pattern `{}` must consist only of hanja characters; \
503 dictionary keys can be mixed-script so a pattern with hangul or other \
504 scripts would silently match unrelated entries",
505 rule.location,
506 rule.pattern,
507 );
508 }
509 }
510 let mut matched = vec![false; rules.len()];
511
512 for entry in entries.iter_mut() {
513 let hanja = entry.hanja().to_owned();
514 let reading = entry.reading().to_owned();
515 for (i, rule) in rules.iter().enumerate() {
516 let hit = match rule.kind {
517 RuleKind::Entry => hanja == rule.pattern,
518 RuleKind::Contains => hanja.contains(rule.pattern.as_str()),
519 RuleKind::Reading => reading == rule.pattern,
520 };
521 if hit {
522 matched[i] = true;
523 let mut mark = entry.mark();
524 mark.require_hanja |= rule.mark.require_hanja;
525 mark.require_hangul |= rule.mark.require_hangul;
526 entry.set_mark(mark);
527 }
528 }
529 }
530
531 if !allow_unmatched {
532 let mut unmatched = rules
533 .iter()
534 .zip(matched.iter())
535 .filter(|(_, hit)| !**hit)
536 .map(|(rule, _)| {
537 format!(
538 "{}: rule `{}={}` matched no entries",
539 rule.location,
540 rule.kind.as_str(),
541 rule.pattern,
542 )
543 })
544 .collect::<Vec<_>>();
545 if !unmatched.is_empty() {
546 tracing::error!(
547 unmatched_count = unmatched.len(),
548 "rules matched no entries"
549 );
550 unmatched.sort();
551 bail!(
552 "{} unmatched rule(s):\n {}",
553 unmatched.len(),
554 unmatched.join("\n ")
555 );
556 }
557 }
558
559 Ok(())
560}
561
/// Options controlling how `build_dictionary` reads, merges, and writes entries.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct BuildOptions {
    /// Output format to write.
    pub format: DictionaryFormat,

    /// How duplicate hanja keys are resolved while merging inputs.
    pub merge: MergePolicy,

    /// When `true`, re-open the written file and check that every entry round-trips.
    pub validate: bool,

    /// Maximum allowed length of a hanja key, in bytes.
    pub max_key_bytes: usize,

    /// User-supplied metadata; keys reserved by the builder are rejected.
    pub metadata: BTreeMap<String, String>,

    /// Rules TSV files whose rules are applied to the merged entries.
    pub rules: Vec<PathBuf>,

    /// When `false`, a rule that matches no entry fails the build.
    pub allow_unmatched_rules: bool,
}
587
588impl Default for BuildOptions {
589 fn default() -> Self {
590 Self {
591 format: DictionaryFormat::Fst,
592 merge: MergePolicy::Error,
593 validate: false,
594 max_key_bytes: DEFAULT_MAX_KEY_BYTES,
595 metadata: BTreeMap::new(),
596 rules: Vec::new(),
597 allow_unmatched_rules: false,
598 }
599 }
600}
601
602pub fn build_dictionary(
604 input_paths: &[PathBuf],
605 output_path: impl AsRef<Path>,
606 options: &BuildOptions,
607) -> Result<()> {
608 ensure!(
609 !input_paths.is_empty(),
610 "at least one input file is required"
611 );
612 tracing::info!(
613 input_count = input_paths.len(),
614 output = %output_path.as_ref().display(),
615 ?options.format,
616 "building dictionary"
617 );
618 let mut entries = read_and_merge_inputs(input_paths, options)?;
619 if !options.rules.is_empty() {
620 let mut rules = Vec::new();
621 let mut seen = BTreeSet::<(RuleKind, String)>::new();
622 for path in &options.rules {
623 for rule in parse_rules_file(path)? {
624 if !seen.insert((rule.kind, rule.pattern.clone())) {
625 bail!(
626 "{}: duplicate rule for kind `{}` and pattern `{}`",
627 rule.location,
628 rule.kind.as_str(),
629 rule.pattern,
630 );
631 }
632 rules.push(rule);
633 }
634 }
635 apply_rules(&mut entries, &rules, options.allow_unmatched_rules)?;
636 }
637 let metadata = build_metadata(&options.metadata, &entries)?;
638 match options.format {
639 DictionaryFormat::Fst => {
640 let bytes = build_fst_bytes(&entries, &metadata)?;
641 fs::write(output_path.as_ref(), &bytes)
642 .with_context(|| format!("failed to write {}", output_path.as_ref().display()))?;
643
644 if options.validate {
645 let dictionary = FstDictionary::open(output_path.as_ref()).with_context(|| {
646 format!("failed to validate {}", output_path.as_ref().display())
647 })?;
648 validate_fst_round_trip(&entries, &dictionary)?;
649 }
650 }
651 DictionaryFormat::Cdb => {
652 reject_reserved_cdb_keys(&entries)?;
653 build_cdb_file(&entries, &metadata, output_path.as_ref())?;
654
655 if options.validate {
656 let dictionary = CdbDictionary::open(output_path.as_ref()).with_context(|| {
657 format!("failed to validate {}", output_path.as_ref().display())
658 })?;
659 validate_cdb_round_trip(&entries, &dictionary)?;
660 }
661 }
662 }
663
664 tracing::info!(entry_count = entries.len(), "dictionary build complete");
665 Ok(())
666}
667
668fn read_and_merge_inputs(
669 input_paths: &[PathBuf],
670 options: &BuildOptions,
671) -> Result<Vec<DictionaryEntry>> {
672 let mut merged = BTreeMap::<String, DictionaryEntry>::new();
673
674 for path in input_paths {
675 let file =
676 fs::File::open(path).with_context(|| format!("failed to open {}", path.display()))?;
677 let entries = parse_input(BufReader::new(file), path, options.max_key_bytes)?;
678 for entry in entries {
679 match (options.merge, merged.contains_key(entry.hanja())) {
680 (MergePolicy::Error, true) => bail!("duplicate entry for `{}`", entry.hanja()),
681 (MergePolicy::FirstWins, true) => {}
682 (MergePolicy::LastWins, true) | (_, false) => {
683 merged.insert(entry.hanja.clone(), entry);
684 }
685 }
686 }
687 }
688
689 Ok(merged.into_values().collect())
690}
691
692fn reject_reserved_cdb_keys(entries: &[DictionaryEntry]) -> Result<()> {
693 for entry in entries {
694 ensure!(
695 entry.hanja().as_bytes() != CDB_META_KEY,
696 "`{}` is reserved for CDB metadata",
697 entry.hanja()
698 );
699 }
700 Ok(())
701}
702
703fn parse_input(
704 reader: impl BufRead,
705 path: &Path,
706 max_key_bytes: usize,
707) -> Result<Vec<DictionaryEntry>> {
708 let format = match path.extension().and_then(|ext| ext.to_str()) {
709 Some("csv") => "csv",
710 Some("jsonl") => "jsonl",
711 _ => "tsv",
712 };
713 tracing::debug!(path = %path.display(), format, "detected dictionary input format");
714 match format {
715 "csv" => parse_csv(reader, path, max_key_bytes),
716 "jsonl" => parse_jsonl(reader, path, max_key_bytes),
717 _ => parse_tsv(reader, path, max_key_bytes),
718 }
719}
720
721fn parse_tsv(
722 reader: impl BufRead,
723 path: &Path,
724 max_key_bytes: usize,
725) -> Result<Vec<DictionaryEntry>> {
726 let mut lines = reader.lines();
727 let header = loop {
728 let Some(line) = lines.next() else {
729 bail!("{} is empty", path.display());
730 };
731 let line = line.with_context(|| format!("failed to read {}", path.display()))?;
732 if !line.is_empty() {
733 break line;
734 }
735 };
736 let columns = parse_header(&header)?;
737 let mut entries = Vec::new();
738
739 for (index, line) in lines.enumerate() {
740 let line_number = index + 2;
741 let line = line.with_context(|| format!("failed to read {}", path.display()))?;
742 if line.is_empty() {
743 continue;
744 }
745 entries.push(parse_row(
746 &line,
747 &columns,
748 max_key_bytes,
749 &format!("{}:{line_number}", path.display()),
750 )?);
751 }
752
753 Ok(entries)
754}
755
756fn parse_csv(
757 reader: impl BufRead,
758 path: &Path,
759 max_key_bytes: usize,
760) -> Result<Vec<DictionaryEntry>> {
761 let mut reader = csv::Reader::from_reader(reader);
762 let header = reader
763 .headers()
764 .with_context(|| format!("failed to read CSV header from {}", path.display()))?
765 .iter()
766 .collect::<Vec<_>>()
767 .join("\t");
768 let columns = parse_header_with_format(&header, "CSV")?;
769 let mut entries = Vec::new();
770
771 for (index, record) in reader.records().enumerate() {
772 let location = format!("{}:{}", path.display(), index + 2);
773 let record = record.with_context(|| format!("failed to read CSV record at {location}"))?;
774 let fields = record.iter().collect::<Vec<_>>();
775 entries.push(parse_fields(&fields, &columns, max_key_bytes, &location)?);
776 }
777
778 Ok(entries)
779}
780
781fn parse_jsonl(
782 reader: impl BufRead,
783 path: &Path,
784 max_key_bytes: usize,
785) -> Result<Vec<DictionaryEntry>> {
786 let mut entries = Vec::new();
787 for (index, line) in reader.lines().enumerate() {
788 let line_number = index + 1;
789 let line = line.with_context(|| format!("failed to read {}", path.display()))?;
790 if line.trim().is_empty() {
791 continue;
792 }
793 let record: JsonLineEntry = serde_json::from_str(&line).with_context(|| {
794 format!(
795 "failed to parse JSONL record at {}:{line_number}",
796 path.display()
797 )
798 })?;
799 entries.push(normalize_entry(
800 &record.hanja,
801 &record.hangul,
802 EntryMark {
803 require_hanja: record.require_hanja,
804 require_hangul: record.require_hangul,
805 },
806 max_key_bytes,
807 &format!("{}:{line_number}", path.display()),
808 )?);
809 }
810 Ok(entries)
811}
812
// Zero-based column positions found in a dictionary input header.
#[derive(Clone, Debug)]
struct HeaderColumns {
    // Index of the required `hanja` column.
    hanja: usize,
    // Index of the required `hangul` column.
    hangul: usize,
    // Index of the optional `require_hanja` column, if present.
    require_hanja: Option<usize>,
    // Index of the optional `require_hangul` column, if present.
    require_hangul: Option<usize>,
    // Total number of header columns, including unsupported ones.
    column_count: usize,
}
821
822fn parse_header(header: &str) -> Result<HeaderColumns> {
823 parse_header_with_format(header, "TSV")
824}
825
826fn parse_header_with_format(header: &str, format_name: &str) -> Result<HeaderColumns> {
827 let columns = header.split('\t').collect::<Vec<_>>();
828 let mut seen = BTreeSet::new();
829 let mut hanja = None;
830 let mut hangul = None;
831 let mut require_hanja = None;
832 let mut require_hangul = None;
833
834 for (index, column) in columns.iter().enumerate() {
835 ensure!(
836 !column.is_empty(),
837 "{format_name} header contains an empty column name"
838 );
839 ensure!(
840 seen.insert(*column),
841 "{format_name} header contains duplicate `{column}` column"
842 );
843 match *column {
844 "hanja" => hanja = Some(index),
845 "hangul" => hangul = Some(index),
846 "require_hanja" => require_hanja = Some(index),
847 "require_hangul" => require_hangul = Some(index),
848 extra => {
849 tracing::warn!(
850 column = extra,
851 format = format_name,
852 "ignoring unsupported input column"
853 );
854 }
855 }
856 }
857
858 Ok(HeaderColumns {
859 hanja: hanja.ok_or_else(|| Error::message("missing required `hanja` column"))?,
860 hangul: hangul.ok_or_else(|| Error::message("missing required `hangul` column"))?,
861 require_hanja,
862 require_hangul,
863 column_count: columns.len(),
864 })
865}
866
867fn parse_row(
868 line: &str,
869 columns: &HeaderColumns,
870 max_key_bytes: usize,
871 location: &str,
872) -> Result<DictionaryEntry> {
873 let fields = line.split('\t').collect::<Vec<_>>();
874 parse_fields(&fields, columns, max_key_bytes, location)
875}
876
877fn parse_fields(
878 fields: &[&str],
879 columns: &HeaderColumns,
880 max_key_bytes: usize,
881 location: &str,
882) -> Result<DictionaryEntry> {
883 ensure!(
884 fields.len() >= columns.column_count,
885 "{location}: expected {} TSV fields, got {}",
886 columns.column_count,
887 fields.len()
888 );
889
890 let hanja = fields[columns.hanja];
891 let hangul = fields[columns.hangul];
892 let require_hanja = parse_optional_bool(fields, columns.require_hanja, location)?;
893 let require_hangul = parse_optional_bool(fields, columns.require_hangul, location)?;
894
895 normalize_entry(
896 hanja,
897 hangul,
898 EntryMark {
899 require_hanja,
900 require_hangul,
901 },
902 max_key_bytes,
903 location,
904 )
905}
906
907fn normalize_entry(
908 hanja: &str,
909 hangul: &str,
910 mark: EntryMark,
911 max_key_bytes: usize,
912 location: &str,
913) -> Result<DictionaryEntry> {
914 ensure!(!hanja.is_empty(), "{location}: `hanja` must not be empty");
915 ensure!(!hangul.is_empty(), "{location}: `hangul` must not be empty");
916 ensure!(
917 hanja.len() <= max_key_bytes,
918 "{location}: key `{hanja}` exceeds --max-key-bytes={max_key_bytes}"
919 );
920
921 Ok(DictionaryEntry::new(hanja, hangul, mark))
922}
923
924fn parse_optional_bool(fields: &[&str], index: Option<usize>, location: &str) -> Result<bool> {
925 let Some(index) = index else {
926 return Ok(false);
927 };
928 let Some(value) = fields.get(index).copied() else {
929 return Ok(false);
930 };
931 if value.is_empty() {
932 return Ok(false);
933 }
934 match value {
935 "true" | "1" => Ok(true),
936 "false" | "0" => Ok(false),
937 _ => bail!("{location}: invalid boolean value `{value}`"),
938 }
939}
940
941fn build_metadata(
942 user_metadata: &BTreeMap<String, String>,
943 entries: &[DictionaryEntry],
944) -> Result<BTreeMap<String, String>> {
945 for key in RESERVED_METADATA_KEYS {
946 ensure!(
947 !user_metadata.contains_key(*key),
948 "`{key}` metadata is reserved"
949 );
950 }
951
952 let mut metadata = BTreeMap::new();
953 metadata.insert(
954 "source".to_owned(),
955 user_metadata.get("source").cloned().unwrap_or_default(),
956 );
957 metadata.insert(
958 "license".to_owned(),
959 user_metadata.get("license").cloned().unwrap_or_default(),
960 );
961 metadata.insert(
962 "build_date".to_owned(),
963 user_metadata
964 .get("build_date")
965 .cloned()
966 .unwrap_or_else(default_build_date),
967 );
968 metadata.insert("entry_count".to_owned(), entries.len().to_string());
969 metadata.insert("version".to_owned(), FORMAT_VERSION.to_string());
970 metadata.insert(
971 "max_word_chars".to_owned(),
972 entries
973 .iter()
974 .map(|entry| entry.hanja().chars().count())
975 .max()
976 .unwrap_or(0)
977 .to_string(),
978 );
979 metadata.insert(
980 "max_key_bytes".to_owned(),
981 entries
982 .iter()
983 .map(|entry| entry.hanja().len())
984 .max()
985 .unwrap_or(0)
986 .to_string(),
987 );
988
989 for (key, value) in user_metadata {
990 metadata.entry(key.clone()).or_insert_with(|| value.clone());
991 }
992
993 Ok(metadata)
994}
995
996fn default_build_date() -> String {
997 let Some(epoch) = env::var("SOURCE_DATE_EPOCH")
998 .ok()
999 .and_then(|epoch| epoch.parse::<i64>().ok())
1000 else {
1001 return "1970-01-01T00:00:00Z".to_owned();
1002 };
1003 OffsetDateTime::from_unix_timestamp(epoch)
1004 .ok()
1005 .and_then(|datetime| datetime.format(&Rfc3339).ok())
1006 .unwrap_or_else(|| "1970-01-01T00:00:00Z".to_owned())
1007}
1008
1009fn build_fst_bytes(
1010 entries: &[DictionaryEntry],
1011 metadata: &BTreeMap<String, String>,
1012) -> Result<Vec<u8>> {
1013 let mut metadata_bytes = Vec::new();
1014 into_writer(metadata, &mut metadata_bytes).context("failed to encode dictionary metadata")?;
1015
1016 let mut readings = Vec::new();
1017 let mut builder = MapBuilder::memory();
1018 for entry in entries {
1019 let reading_len = u16::try_from(entry.reading().len())
1020 .with_context(|| format!("reading for `{}` is too long", entry.hanja()))?;
1021 let reading_offset =
1022 u64::try_from(readings.len()).context("reading table offset too large")?;
1023 ensure!(
1024 reading_offset <= VALUE_MAX_OFFSET,
1025 "reading table exceeds the FST value layout"
1026 );
1027 let value = encode_value(reading_len, entry.mark(), reading_offset);
1028 builder
1029 .insert(entry.hanja().as_bytes(), value)
1030 .with_context(|| format!("failed to insert `{}` into FST", entry.hanja()))?;
1031 readings.extend_from_slice(entry.reading().as_bytes());
1032 }
1033 let fst_bytes = builder.into_inner().context("failed to finish FST map")?;
1034
1035 let metadata_offset = u64::try_from(FIXED_HEADER_LEN).expect("header length fits in u64");
1036 let fst_offset = metadata_offset
1037 .checked_add(u64::try_from(metadata_bytes.len()).context("metadata too large")?)
1038 .context("FST offset overflow")?;
1039 let readings_offset = fst_offset
1040 .checked_add(u64::try_from(fst_bytes.len()).context("FST bytes too large")?)
1041 .context("reading table offset overflow")?;
1042 let header = FixedHeader {
1043 metadata_offset,
1044 metadata_len: u64::try_from(metadata_bytes.len()).context("metadata too large")?,
1045 fst_offset,
1046 fst_len: u64::try_from(fst_bytes.len()).context("FST bytes too large")?,
1047 readings_offset,
1048 readings_len: u64::try_from(readings.len()).context("reading table too large")?,
1049 };
1050
1051 let mut output = Vec::with_capacity(
1052 FIXED_HEADER_LEN + metadata_bytes.len() + fst_bytes.len() + readings.len(),
1053 );
1054 header.write(&mut output);
1055 output.extend(metadata_bytes);
1056 output.extend(fst_bytes);
1057 output.extend(readings);
1058 tracing::info!(
1059 entry_count = entries.len(),
1060 total_bytes = output.len(),
1061 "built FST dictionary bytes"
1062 );
1063 Ok(output)
1064}
1065
1066fn build_cdb_file(
1067 entries: &[DictionaryEntry],
1068 metadata: &BTreeMap<String, String>,
1069 output_path: &Path,
1070) -> Result<()> {
1071 let records = build_cdb_records(entries);
1072 let mut metadata = metadata.clone();
1073 metadata.insert("prefix_count".to_owned(), records.len().to_string());
1074 let mut metadata_bytes = Vec::new();
1075 into_writer(&metadata, &mut metadata_bytes).context("failed to encode dictionary metadata")?;
1076
1077 let output_name = output_path.to_str().ok_or_else(|| {
1078 Error::message(format!(
1079 "CDB output path must be valid UTF-8: {}",
1080 output_path.display()
1081 ))
1082 })?;
1083 let mut writer = cdb::CDBWriter::create(output_name)
1084 .with_context(|| format!("failed to create {}", output_path.display()))?;
1085 writer
1086 .add(CDB_META_KEY, &metadata_bytes)
1087 .context("failed to add CDB metadata record")?;
1088 for (key, record) in records {
1089 let value = encode_cdb_record(record.as_ref())?;
1090 writer
1091 .add(key.as_bytes(), &value)
1092 .with_context(|| format!("failed to add CDB record `{key}`"))?;
1093 }
1094 writer
1095 .finish()
1096 .with_context(|| format!("failed to finish {}", output_path.display()))?;
1097 tracing::info!(
1098 entry_count = entries.len(),
1099 path = %output_path.display(),
1100 "built CDB dictionary file"
1101 );
1102 Ok(())
1103}
1104
1105fn build_cdb_records(entries: &[DictionaryEntry]) -> BTreeMap<String, Option<DictionaryEntry>> {
1106 let mut records = BTreeMap::new();
1107 for entry in entries {
1108 let mut prefix = String::new();
1109 for ch in entry.hanja().chars() {
1110 prefix.push(ch);
1111 records.entry(prefix.clone()).or_insert(None);
1112 }
1113 records.insert(entry.hanja().to_owned(), Some(entry.clone()));
1114 }
1115 records
1116}
1117
1118fn encode_cdb_record(entry: Option<&DictionaryEntry>) -> Result<Vec<u8>> {
1119 let mut output = Vec::new();
1120 match entry {
1121 Some(entry) => {
1122 let reading_len = u16::try_from(entry.reading().len())
1123 .with_context(|| format!("reading for `{}` is too long", entry.hanja()))?;
1124 output.push(1);
1125 output.push(encode_cdb_mark(entry.mark()));
1126 output.extend_from_slice(&reading_len.to_le_bytes());
1127 output.extend_from_slice(entry.reading().as_bytes());
1128 }
1129 None => {
1130 output.push(0);
1131 output.push(0);
1132 output.extend_from_slice(&0u16.to_le_bytes());
1133 }
1134 }
1135 Ok(output)
1136}
1137
1138fn encode_cdb_mark(mark: EntryMark) -> u8 {
1139 let mut encoded = 0;
1140 if mark.require_hanja {
1141 encoded |= CDB_MARK_REQUIRE_HANJA;
1142 }
1143 if mark.require_hangul {
1144 encoded |= CDB_MARK_REQUIRE_HANGUL;
1145 }
1146 encoded
1147}
1148
1149fn validate_fst_round_trip(entries: &[DictionaryEntry], dictionary: &FstDictionary) -> Result<()> {
1150 if dictionary.entry_count() != entries.len() as u64 {
1151 tracing::error!(
1152 actual = dictionary.entry_count(),
1153 expected = entries.len() as u64,
1154 "round-trip validation failed: entry count mismatch"
1155 );
1156 bail!("round-trip validation failed: entry count mismatch");
1157 }
1158 for entry in entries {
1159 let actual = dictionary.lookup(entry.hanja())?.ok_or_else(|| {
1160 Error::message(format!(
1161 "round-trip validation failed: `{}` is missing",
1162 entry.hanja()
1163 ))
1164 })?;
1165 let mark = actual.mark();
1166 ensure!(
1167 actual.reading() == entry.reading()
1168 && mark.require_hanja == entry.mark().require_hanja
1169 && mark.require_hangul == entry.mark().require_hangul,
1170 "round-trip validation failed for `{}`",
1171 entry.hanja()
1172 );
1173 }
1174 Ok(())
1175}
1176
1177fn validate_cdb_round_trip(entries: &[DictionaryEntry], dictionary: &CdbDictionary) -> Result<()> {
1178 if dictionary.entry_count() != entries.len() as u64 {
1179 tracing::error!(
1180 actual = dictionary.entry_count(),
1181 expected = entries.len() as u64,
1182 "round-trip validation failed: entry count mismatch"
1183 );
1184 bail!("round-trip validation failed: entry count mismatch");
1185 }
1186 for entry in entries {
1187 let actual = dictionary.lookup(entry.hanja())?.ok_or_else(|| {
1188 Error::message(format!(
1189 "round-trip validation failed: `{}` is missing",
1190 entry.hanja()
1191 ))
1192 })?;
1193 let mark = actual.mark();
1194 ensure!(
1195 actual.reading() == entry.reading()
1196 && mark.require_hanja == entry.mark().require_hanja
1197 && mark.require_hangul == entry.mark().require_hangul,
1198 "round-trip validation failed for `{}`",
1199 entry.hanja()
1200 );
1201 }
1202 Ok(())
1203}
1204
// One record of JSON Lines dictionary input.
#[derive(Debug, Deserialize)]
struct JsonLineEntry {
    // Required hanja key.
    hanja: String,
    // Required reading.
    hangul: String,
    // Optional flag; defaults to `false`. Also accepted as `requireHanja`.
    #[serde(default, alias = "requireHanja")]
    require_hanja: bool,
    // Optional flag; defaults to `false`. Also accepted as `requireHangul`.
    #[serde(default, alias = "requireHangul")]
    require_hangul: bool,
}
1214
1215fn encode_value(reading_len: u16, mark: EntryMark, reading_offset: u64) -> u64 {
1216 u64::from(reading_len)
1217 | (u64::from(encode_mark(mark)) << VALUE_MARK_SHIFT)
1218 | (reading_offset << VALUE_OFFSET_SHIFT)
1219}
1220
1221fn encode_mark(mark: EntryMark) -> u8 {
1222 let mut encoded = 0;
1223 if mark.require_hanja {
1224 encoded |= MARK_REQUIRE_HANJA;
1225 }
1226 if mark.require_hangul {
1227 encoded |= MARK_REQUIRE_HANGUL;
1228 }
1229 encoded
1230}
1231
// Section table written at the start of an FST dictionary file.
// Offsets and lengths are in bytes.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
struct FixedHeader {
    // Offset of the encoded metadata section.
    metadata_offset: u64,
    // Length of the encoded metadata section.
    metadata_len: u64,
    // Offset of the FST map section.
    fst_offset: u64,
    // Length of the FST map section.
    fst_len: u64,
    // Offset of the reading table.
    readings_offset: u64,
    // Length of the reading table.
    readings_len: u64,
}
1241
1242impl FixedHeader {
1243 fn write(self, output: &mut Vec<u8>) {
1244 output.extend_from_slice(MAGIC);
1245 output.extend_from_slice(&FORMAT_VERSION.to_le_bytes());
1246 output.extend_from_slice(&(FIXED_HEADER_LEN as u32).to_le_bytes());
1247 output.extend_from_slice(&self.metadata_offset.to_le_bytes());
1248 output.extend_from_slice(&self.metadata_len.to_le_bytes());
1249 output.extend_from_slice(&self.fst_offset.to_le_bytes());
1250 output.extend_from_slice(&self.fst_len.to_le_bytes());
1251 output.extend_from_slice(&self.readings_offset.to_le_bytes());
1252 output.extend_from_slice(&self.readings_len.to_le_bytes());
1253 debug_assert_eq!(output.len(), FIXED_HEADER_LEN);
1254 }
1255}
1256
1257pub fn parse_metadata_arg(arg: &str) -> Result<(String, String)> {
1259 let (key, value) = arg
1260 .split_once('=')
1261 .ok_or_else(|| Error::message("metadata must use KEY=VAL syntax"))?;
1262 ensure!(!key.is_empty(), "metadata key must not be empty");
1263 Ok((key.to_owned(), value.to_owned()))
1264}
1265
1266#[cfg(test)]
1267mod tests {
1268 use tracing_test::traced_test;
1269
1270 use super::*;
1271
1272 #[traced_test]
1273 #[test]
1274 fn unmatched_rules_emits_error_event() {
1275 let mut entries = vec![DictionaryEntry::new("漢字", "한자", EntryMark::default())];
1276 let rules = vec![Rule::new(
1277 RuleKind::Entry,
1278 "天地",
1279 EntryMark {
1280 require_hanja: true,
1281 require_hangul: false,
1282 },
1283 "missing entry",
1284 )];
1285
1286 let result = apply_rules(&mut entries, &rules, false);
1287
1288 assert!(result.is_err());
1289 assert!(logs_contain("rules matched no entries"));
1290 }
1291
1292 #[test]
1293 fn parses_headered_tsv_and_optional_flags() {
1294 let input = "hanja\thangul\trequire_hanja\trequire_hangul\tcategory\n漢字\t한자\t1\tfalse\tnoun\n天地\t천지\t\ttrue\tnoun\n";
1295
1296 let entries = parse_tsv(input.as_bytes(), Path::new("fixture.tsv"), 1024).unwrap();
1297
1298 assert_eq!(entries.len(), 2);
1299 assert_eq!(entries[0].hanja(), "漢字");
1300 assert_eq!(entries[0].reading(), "한자");
1301 assert!(entries[0].mark().require_hanja);
1302 assert!(!entries[0].mark().require_hangul);
1303 assert!(!entries[1].mark().require_hanja);
1304 assert!(entries[1].mark().require_hangul);
1305 }
1306
1307 #[test]
1308 fn rejects_invalid_boolean_values() {
1309 let input = "hanja\thangul\trequire_hanja\n漢字\t한자\tyes\n";
1310
1311 let error = parse_tsv(input.as_bytes(), Path::new("fixture.tsv"), 1024).unwrap_err();
1312
1313 assert!(error.to_string().contains("invalid boolean value `yes`"));
1314 }
1315
1316 #[test]
1317 fn rejects_reserved_metadata_keys() {
1318 let metadata = BTreeMap::from([("entry_count".to_owned(), "1".to_owned())]);
1319
1320 let error = build_metadata(&metadata, &[]).unwrap_err();
1321
1322 assert!(error.to_string().contains("reserved"));
1323 }
1324
1325 fn parse_rules_str(input: &str) -> Result<Vec<Rule>> {
1326 parse_rules_reader(input.as_bytes(), Path::new("rules.tsv"))
1327 }
1328
1329 #[test]
1330 fn parses_minimal_rules_tsv() {
1331 let input = "kind\tpattern\trequire_hanja\trequire_hangul\treason\n\
1332 entry\t漢字\ttrue\tfalse\thomophone\n\
1333 contains\t驟\ttrue\tfalse\trare hanja\n\
1334 reading\t사기\ttrue\tfalse\tcommon homophone\n";
1335
1336 let rules = parse_rules_str(input).unwrap();
1337
1338 assert_eq!(rules.len(), 3);
1339 assert_eq!(rules[0].kind(), RuleKind::Entry);
1340 assert_eq!(rules[0].pattern(), "漢字");
1341 assert!(rules[0].mark().require_hanja);
1342 assert!(!rules[0].mark().require_hangul);
1343 assert_eq!(rules[0].reason(), "homophone");
1344 assert_eq!(rules[1].kind(), RuleKind::Contains);
1345 assert_eq!(rules[2].kind(), RuleKind::Reading);
1346 }
1347
1348 #[test]
1349 fn rejects_unknown_rule_kind() {
1350 let input = "kind\tpattern\trequire_hanja\trequire_hangul\treason\n\
1351 glob\t漢*\ttrue\tfalse\tnope\n";
1352
1353 let error = parse_rules_str(input).unwrap_err();
1354
1355 let text = error.to_string();
1356 assert!(text.contains("unknown rule kind `glob`"), "{text}");
1357 assert!(text.contains("`entry`"), "{text}");
1359 assert!(text.contains("`contains`"), "{text}");
1360 assert!(text.contains("`reading`"), "{text}");
1361 }
1362
1363 #[test]
1364 fn rejects_contains_rule_with_non_hanja_pattern_from_tsv() {
1365 let input = "kind\tpattern\trequire_hanja\trequire_hangul\treason\n\
1369 contains\t하다\ttrue\tfalse\ttypo\n";
1370 let rules = parse_rules_str(input).unwrap();
1371 let mut entries = vec![entry("布告하다", "포고하다")];
1372
1373 let error = apply_rules(&mut entries, &rules, false).unwrap_err();
1374
1375 assert!(
1376 error.to_string().contains("must consist only of hanja"),
1377 "{error}"
1378 );
1379 }
1380
1381 #[test]
1382 fn rejects_rule_with_empty_reason() {
1383 let input = "kind\tpattern\trequire_hanja\trequire_hangul\treason\n\
1384 entry\t漢字\ttrue\tfalse\t\n";
1385
1386 let error = parse_rules_str(input).unwrap_err();
1387
1388 assert!(error.to_string().contains("reason"), "{error}");
1389 }
1390
1391 #[test]
1392 fn rejects_rule_with_no_mark_bits_set() {
1393 let input = "kind\tpattern\trequire_hanja\trequire_hangul\treason\n\
1394 entry\t漢字\tfalse\tfalse\tno-op\n";
1395
1396 let error = parse_rules_str(input).unwrap_err();
1397
1398 assert!(
1399 error
1400 .to_string()
1401 .contains("at least one of `require_hanja` or `require_hangul`"),
1402 "{error}"
1403 );
1404 }
1405
1406 #[test]
1407 fn rejects_duplicate_rule_keys() {
1408 let input = "kind\tpattern\trequire_hanja\trequire_hangul\treason\n\
1409 entry\t漢字\ttrue\tfalse\tfirst\n\
1410 entry\t漢字\tfalse\ttrue\tsecond\n";
1411
1412 let error = parse_rules_str(input).unwrap_err();
1413
1414 assert!(error.to_string().contains("duplicate rule"), "{error}");
1415 }
1416
1417 #[test]
1418 fn allows_overlapping_rules_across_kinds() {
1419 let input = "kind\tpattern\trequire_hanja\trequire_hangul\treason\n\
1420 entry\t漢字\ttrue\tfalse\thomophone entry\n\
1421 contains\t漢\ttrue\tfalse\trare character\n";
1422
1423 let rules = parse_rules_str(input).unwrap();
1424
1425 assert_eq!(rules.len(), 2);
1426 }
1427
1428 fn entry(hanja: &str, reading: &str) -> DictionaryEntry {
1429 DictionaryEntry::new(hanja, reading, EntryMark::default())
1430 }
1431
1432 #[test]
1433 fn apply_rules_or_merges_marks_across_kinds() {
1434 let mut entries = vec![
1435 entry("漢字", "한자"),
1436 entry("天地", "천지"),
1437 entry("史記", "사기"),
1438 entry("詐欺", "사기"),
1439 ];
1440 let rules = vec![
1441 Rule::new(
1442 RuleKind::Entry,
1443 "漢字",
1444 EntryMark {
1445 require_hanja: true,
1446 require_hangul: false,
1447 },
1448 "homophone-heavy entry",
1449 ),
1450 Rule::new(
1451 RuleKind::Contains,
1452 "天",
1453 EntryMark {
1454 require_hanja: true,
1455 require_hangul: false,
1456 },
1457 "rare hanja",
1458 ),
1459 Rule::new(
1460 RuleKind::Reading,
1461 "사기",
1462 EntryMark {
1463 require_hanja: true,
1464 require_hangul: false,
1465 },
1466 "ambiguous reading",
1467 ),
1468 ];
1469
1470 apply_rules(&mut entries, &rules, false).unwrap();
1471
1472 assert!(
1473 entries[0].mark().require_hanja,
1474 "entry rule applied to 漢字"
1475 );
1476 assert!(
1477 entries[1].mark().require_hanja,
1478 "contains rule applied to 天地"
1479 );
1480 assert!(
1481 entries[2].mark().require_hanja,
1482 "reading rule applied to 史記"
1483 );
1484 assert!(
1485 entries[3].mark().require_hanja,
1486 "reading rule applied to 詐欺"
1487 );
1488 }
1489
1490 #[test]
1491 fn apply_rules_or_merges_multiple_rules_on_one_entry() {
1492 let mut entries = vec![entry("漢字", "한자")];
1493 let rules = vec![
1494 Rule::new(
1495 RuleKind::Entry,
1496 "漢字",
1497 EntryMark {
1498 require_hanja: true,
1499 require_hangul: false,
1500 },
1501 "entry-level",
1502 ),
1503 Rule::new(
1504 RuleKind::Reading,
1505 "한자",
1506 EntryMark {
1507 require_hanja: false,
1508 require_hangul: true,
1509 },
1510 "reading-level",
1511 ),
1512 ];
1513
1514 apply_rules(&mut entries, &rules, false).unwrap();
1515
1516 let mark = entries[0].mark();
1517 assert!(mark.require_hanja);
1518 assert!(mark.require_hangul);
1519 }
1520
1521 #[test]
1522 fn apply_rules_reports_all_unmatched_rules_in_one_error() {
1523 let mut entries = vec![entry("漢字", "한자")];
1524 let rules = vec![
1525 Rule::new(
1526 RuleKind::Entry,
1527 "天地",
1528 EntryMark {
1529 require_hanja: true,
1530 require_hangul: false,
1531 },
1532 "missing entry",
1533 ),
1534 Rule::new(
1535 RuleKind::Contains,
1536 "驟",
1537 EntryMark {
1538 require_hanja: true,
1539 require_hangul: false,
1540 },
1541 "missing contains",
1542 ),
1543 ];
1544
1545 let error = apply_rules(&mut entries, &rules, false).unwrap_err();
1546
1547 let text = error.to_string();
1548 assert!(text.contains("entry=天地"), "{text}");
1549 assert!(text.contains("contains=驟"), "{text}");
1550 assert!(text.contains("2 unmatched"), "{text}");
1551 }
1552
1553 #[test]
1554 fn apply_rules_accepts_multi_hanja_contains_pattern() {
1555 let mut entries = vec![
1558 entry("國民學校", "국민학교"),
1559 entry("國民年金", "국민연금"),
1560 entry("民國", "민국"),
1561 ];
1562 let rules = vec![Rule::new(
1563 RuleKind::Contains,
1564 "國民",
1565 EntryMark {
1566 require_hanja: true,
1567 require_hangul: false,
1568 },
1569 "compound containing 國民",
1570 )];
1571
1572 apply_rules(&mut entries, &rules, false).unwrap();
1573
1574 assert!(entries[0].mark().require_hanja);
1575 assert!(entries[1].mark().require_hanja);
1576 assert!(
1577 !entries[2].mark().require_hanja,
1578 "民國 does not contain the substring 國民"
1579 );
1580 }
1581
1582 #[test]
1583 fn apply_rules_rejects_contains_rule_with_non_hanja_character() {
1584 let mut entries = vec![entry("布告하다", "포고하다"), entry("漢字", "한자")];
1587 let rules = vec![Rule::new(
1588 RuleKind::Contains,
1589 "하",
1590 EntryMark {
1591 require_hanja: true,
1592 require_hangul: false,
1593 },
1594 "typo: meant a rare hanja",
1595 )];
1596
1597 let error = apply_rules(&mut entries, &rules, false).unwrap_err();
1598
1599 let text = error.to_string();
1600 assert!(text.contains("must consist only of hanja"), "{text}");
1601 assert!(
1602 !entries[0].mark().require_hanja,
1603 "the typo'd rule must not silently mark 布告하다"
1604 );
1605 }
1606
1607 #[test]
1608 fn apply_rules_rejects_programmatic_empty_pattern() {
1609 let mut entries = vec![entry("漢字", "한자")];
1610 let rules = vec![Rule::new(
1611 RuleKind::Entry,
1612 "",
1613 EntryMark {
1614 require_hanja: true,
1615 require_hangul: false,
1616 },
1617 "programmatic mistake",
1618 )];
1619
1620 let error = apply_rules(&mut entries, &rules, false).unwrap_err();
1621
1622 assert!(error.to_string().contains("must not be empty"), "{error}");
1623 }
1624
1625 #[test]
1626 fn apply_rules_rejects_programmatic_no_mark_bits() {
1627 let mut entries = vec![entry("漢字", "한자")];
1628 let rules = vec![Rule::new(
1629 RuleKind::Entry,
1630 "漢字",
1631 EntryMark::default(),
1632 "programmatic mistake",
1633 )];
1634
1635 let error = apply_rules(&mut entries, &rules, false).unwrap_err();
1636
1637 assert!(
1638 error
1639 .to_string()
1640 .contains("at least one of `require_hanja` or `require_hangul`"),
1641 "{error}"
1642 );
1643 }
1644
1645 #[test]
1646 fn apply_rules_allows_unmatched_when_configured() {
1647 let mut entries = vec![entry("漢字", "한자")];
1648 let rules = vec![Rule::new(
1649 RuleKind::Entry,
1650 "天地",
1651 EntryMark {
1652 require_hanja: true,
1653 require_hangul: false,
1654 },
1655 "missing entry",
1656 )];
1657
1658 apply_rules(&mut entries, &rules, true).unwrap();
1659
1660 assert!(!entries[0].mark().require_hanja);
1661 }
1662}