gukhanmun_core/
lib.rs

1// Gukhanmun: Core IR, engine, dictionary traits, and fallback logic for Gukhanmun.
2// Copyright (C) 2026  Hong Minhee
3//
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, either version 3 of the License, or
7// (at your option) any later version.
8//
9// This program is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12// GNU General Public License for more details.
13//
14// You should have received a copy of the GNU General Public License
15// along with this program.  If not, see <https://www.gnu.org/licenses/>.
16
17//! Core types and algorithms for Gukhanmun.
18//!
19//! This crate is the home for the format-neutral intermediate representation,
20//! conversion engine, dictionary traits, lattice segmentation, and fallback
21//! hanja reading logic. Format adapters, command-line I/O, and language
22//! bindings live in separate crates.
23
24#![no_std]
25#![forbid(unsafe_code)]
26#![deny(missing_docs)]
27
28extern crate alloc;
29
30mod fallback;
31mod generated;
32mod segment;
33
34use alloc::boxed::Box;
35use alloc::collections::{BTreeMap, BTreeSet};
36use alloc::string::{String, ToString};
37use alloc::vec::Vec;
38use core::marker::PhantomData;
39
40use fallback::{
41    FallbackPart, FallbackState, apply_initial_sound_law_to_first_syllable,
42    fallback_reading_for_run, phoneticize_fallback_run_with_state, phoneticize_hanja_char,
43    should_apply_yeol_yul,
44};
45use generated::unihan_readings::KHANGUL_READINGS;
46use segment::{Segment, segment_text};
47
48/// Error returned by fallible core pipeline entry points.
49///
50/// The core engine is mostly infallible today because dictionary lookup is a
51/// synchronous trait contract. This type is still the common structured error
52/// surface for reader/engine/writer boundaries and for future engine
53/// invariants that callers may need to inspect.
54#[derive(Debug, thiserror::Error)]
55#[non_exhaustive]
56pub enum Error {
57    /// Loading or preparing a dictionary failed before conversion could run.
58    #[error("dictionary load failed: {0}")]
59    DictionaryLoad(String),
60
61    /// Lattice segmentation failed for a specific source string.
62    #[error("segmentation failed for {hanja:?}: {reason}")]
63    Segmentation {
64        /// The hanja source span that could not be segmented.
65        hanja: String,
66
67        /// Human-readable reason for the segmentation failure.
68        reason: String,
69    },
70
71    /// A dictionary or fallback path produced a reading that is not accepted.
72    #[error("invalid hangul reading {reading:?} for hanja {hanja:?}")]
73    InvalidReading {
74        /// The hanja source string associated with the reading.
75        hanja: String,
76
77        /// The rejected hangul reading.
78        reading: String,
79    },
80
81    /// An internal invariant was violated.
82    #[error("internal invariant violated: {0}")]
83    Internal(&'static str),
84
85    /// A boxed error from an extension point that has no more specific core
86    /// variant yet.
87    #[error(transparent)]
88    Other(#[from] Box<dyn core::error::Error + Send + Sync + 'static>),
89}
90
/// Policy that decides how a stream reacts to recoverable errors.
///
/// [`Recovery::Strict`] is the default: the first recoverable reader error is
/// returned. [`Recovery::Lenient`] logs the error instead and emits the
/// original unrecognized region as a verbatim token, so that the tokens that
/// follow can keep flowing downstream.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum Recovery {
    /// Stop at the first reader, engine, or writer error and return it.
    #[default]
    Strict,

    /// Keep recoverable bad input regions as they are and carry on.
    Lenient,
}
105
106/// A recoverable reader error plus the original source region.
107///
108/// Readers use this value when they can identify a malformed region and know
109/// how to preserve its source bytes or text in lenient mode. Strict mode
110/// returns the stored error directly.
111#[derive(Debug)]
112pub struct RecoverableInputError {
113    original: String,
114    error: Error,
115}
116
117impl RecoverableInputError {
118    /// Creates a recoverable input error from original source and cause.
119    pub fn new(original: String, error: Error) -> Self {
120        Self { original, error }
121    }
122
123    /// Returns the original source region that can be preserved in lenient
124    /// mode.
125    pub fn original(&self) -> &str {
126        &self.original
127    }
128
129    /// Returns the structured error describing why the region was rejected.
130    pub fn error(&self) -> &Error {
131        &self.error
132    }
133
134    /// Consumes the error and returns the original source plus cause.
135    pub fn into_parts(self) -> (String, Error) {
136        (self.original, self.error)
137    }
138}
139
/// Data that a format adapter attaches to a scope of the intermediate
/// representation.
///
/// To the engine this trait is an opaque policy boundary. The concrete type
/// may encode HTML elements, Markdown events, or plain-text scopes; the
/// engine only asks whether text has to be preserved and whether later stages
/// are allowed to insert inline markup.
pub trait ScopeData: Clone + 'static {
    /// Tells whether text inside this scope has to pass through untouched.
    fn is_preserve(&self) -> bool;

    /// Tells whether inline markup may be inserted inside this scope.
    ///
    /// The flag expresses *structural* permission for markup at the current
    /// position; it says nothing about whether the engine converts text here.
    /// A scope can legitimately report `true` from [`Self::is_preserve`] (so
    /// no annotation is produced) and still report `true` here, since
    /// preserving does not by itself limit what a deeper non-preserved child
    /// may emit. Adapters should answer `false` only when an HTML5 text-only
    /// content model (such as `<title>` or `<option>`) or a comparable host
    /// rule really forbids markup at this position.
    ///
    /// Scope-aware renderers consider inline markup allowed only when *every*
    /// open ancestor answers `true`; a nested scope that allows markup cannot
    /// re-enable what an ancestor has forbidden.
    ///
    /// The default implementation returns `true`.
    fn allows_inline_markup(&self) -> bool {
        true
    }

    /// Tells whether this scope resets block-oriented stateful stages.
    ///
    /// The default implementation returns `false`.
    fn is_block_boundary(&self) -> bool {
        false
    }

    /// Tells whether this scope resets section-oriented stateful stages.
    ///
    /// The default implementation returns `false`.
    fn is_section_boundary(&self) -> bool {
        false
    }
}
178
/// One structural scope of the format-neutral token stream.
///
/// A `Scope` holds nothing but adapter-owned data. The engine may clone
/// scopes and keep them on a stack, yet it never looks at the concrete data
/// except through the `ScopeData` methods.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Scope<S> {
    // Adapter-specific payload of the scope.
    data: S,
}
188
189impl<S> Scope<S> {
190    /// Creates a scope from adapter-specific data.
191    pub fn new(data: S) -> Self {
192        Self { data }
193    }
194
195    /// Returns a shared reference to the adapter-specific scope data.
196    pub fn data(&self) -> &S {
197        &self.data
198    }
199
200    /// Consumes the scope and returns its adapter-specific data.
201    pub fn into_data(self) -> S {
202        self.data
203    }
204}
205
206/// A token emitted by a reader before hanja conversion has run.
207///
208/// This type intentionally has no annotation variant: annotations are produced
209/// by the engine and consumed by renderers, so input adapters cannot inject
210/// already-converted positions into the stream.
211#[derive(Clone, Debug, Eq, PartialEq)]
212pub enum InputToken<S> {
213    /// Enters a structural scope.
214    Open(Scope<S>),
215
216    /// Leaves the most recent structural scope.
217    Close,
218
219    /// Text that the engine may convert unless a preserving scope is active.
220    Text(String),
221
222    /// Text that must pass through untouched.
223    Verbatim(String),
224}
225
226/// A token emitted by the engine after hanja conversion.
227///
228/// Most tokens pass through from `InputToken`, but converted dictionary matches
229/// become `Annotated` so middlewares and renderers can choose their final
230/// surface form.
231#[derive(Clone, Debug, Eq, PartialEq)]
232pub enum OutputToken<S> {
233    /// Enters a structural scope.
234    Open(Scope<S>),
235
236    /// Leaves the most recent structural scope.
237    Close,
238
239    /// Text that needs no annotation-aware rendering.
240    Text(String),
241
242    /// Text that must pass through untouched.
243    Verbatim(String),
244
245    /// A converted hanja word plus metadata for later stages.
246    Annotated(Annotation),
247}
248
249/// A token emitted by a renderer after all annotations have been expanded.
250///
251/// Writers consume this stream because it cannot contain unrendered
252/// annotations. That makes the renderer-to-writer boundary explicit in the type
253/// system.
254#[derive(Clone, Debug, Eq, PartialEq)]
255pub enum RenderedToken<S> {
256    /// Enters a structural scope.
257    Open(Scope<S>),
258
259    /// Leaves the most recent structural scope.
260    Close,
261
262    /// Text ready for serialization.
263    Text(String),
264
265    /// Verbatim text ready for serialization.
266    Verbatim(String),
267
268    /// A structural ruby annotation pairing a base text with an `rt` gloss.
269    ///
270    /// Writers serialize this in a format-appropriate way: HTML emits a
271    /// `<ruby>` element, Markdown emits inline HTML, and plain text falls back
272    /// to parenthesized text. Because the variant carries the base and gloss
273    /// as separate strings rather than pre-built markup, each writer is
274    /// responsible for escaping the contents according to its own rules—the
275    /// renderer never injects raw HTML produced by string concatenation.
276    ///
277    /// Renderers only emit this variant when the active scope reports
278    /// [`ScopeData::allows_inline_markup`] as `true`; scopes that disallow
279    /// inline markup receive a plain `Text` fallback instead.
280    Ruby {
281        /// Base text shown as the primary side of the ruby annotation.
282        base: String,
283
284        /// Gloss text shown in the `rt` position.
285        rt: String,
286    },
287}
288
/// Metadata describing one hanja conversion performed by the engine.
///
/// The engine fills in this value whenever it turns source hanja into a
/// hangul reading; [`Annotation::from_dictionary`] records whether a
/// dictionary match was the source. The flags capture known constraints, and
/// middlewares may adjust them before a renderer settles on the concrete
/// output form.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Annotation {
    /// Hanja text exactly as it appeared in the input.
    pub hanja: String,

    /// Hangul reading chosen for the hanja text.
    pub reading: String,

    /// Whether some other hanja form in the active context has the same
    /// reading.
    pub homophone: bool,

    /// Whether the rendered output has to keep the original hanja visible.
    pub require_hanja: bool,

    /// Whether the rendered output has to include a hangul gloss when hanja
    /// stays primary.
    pub require_hangul: bool,

    /// Whether this is the first occurrence within the active context window.
    pub first_in_context: bool,

    /// Whether renderers should reduce this annotation to its primary plain
    /// text form rather than add annotation markup or parentheses.
    pub skip_annotation: bool,

    /// Whether a dictionary match produced this annotation.
    pub from_dictionary: bool,
}
322
/// Rendering constraints that a dictionary attaches to a match.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct MatchMark {
    /// Whether the hanja form of this dictionary entry should always be
    /// shown.
    pub require_hanja: bool,

    /// Whether the hangul reading of this dictionary entry should always be
    /// shown.
    pub require_hangul: bool,
}
332
333/// A complete dictionary entry exposed for batch policy analysis.
334///
335/// Conversion only needs prefix lookup through [`HanjaDictionary::matches_at`],
336/// but middlewares such as homophone marking need to reason about the effective
337/// entry set without repeatedly probing the dictionary. Backends that can
338/// enumerate entries should return these records from
339/// [`HanjaDictionary::entries`].
340#[derive(Clone, Debug, Eq, PartialEq)]
341pub struct DictionaryRecord {
342    /// The hanja spelling stored as a dictionary key.
343    pub hanja: String,
344
345    /// The hangul reading selected for this hanja spelling.
346    pub reading: String,
347
348    /// Dictionary-provided rendering constraints for this entry.
349    pub mark: MatchMark,
350}
351
352/// A dictionary match that starts at the queried cursor position.
353#[derive(Clone, Debug, Eq, PartialEq)]
354pub struct Match {
355    /// The matched prefix length in UTF-8 bytes.
356    pub byte_len: usize,
357
358    /// The hangul reading for the matched hanja prefix.
359    ///
360    /// This is the word-initial reading, which already reflects South Korean
361    /// initial sound law where it applies (for example `年` reads `연`).
362    pub reading: String,
363
364    /// The reading to use when this match is *not* word-initial, when it
365    /// differs from [`Match::reading`] by initial sound law.
366    ///
367    /// Dictionaries set this for multi-syllable entries whose leading morpheme
368    /// keeps its original sound outside word-initial position, as the Standard
369    /// Korean Language Dictionary records through its suffix and bound-noun
370    /// head words (for example `年代` reads `연대` word-initially but `년대`
371    /// after a number). Single-hanja initial sound law is handled by the engine
372    /// from the bundled unihan readings and does not need this field. `None`
373    /// means the reading is position independent.
374    pub suffix_reading: Option<String>,
375
376    /// Dictionary-provided rendering constraints for this match.
377    pub mark: MatchMark,
378}
379
380/// A hanja dictionary queried by the conversion engine.
381///
382/// The key operation returns every entry that starts at the beginning of the
383/// supplied string. This shape supports lattice segmentation because the
384/// engine must consider every candidate path through a hanja run.
385pub trait HanjaDictionary {
386    /// Yields every dictionary match that starts at the beginning of `s`.
387    fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a>;
388
389    /// Returns the greatest dictionary entry length in Unicode scalar values.
390    fn max_word_chars(&self) -> Option<usize> {
391        None
392    }
393
394    /// Enumerates complete dictionary entries when the backend supports it.
395    ///
396    /// The default returns `None`, which keeps custom lookup-only dictionaries
397    /// valid. Homophone-aware middlewares use this as an optional batch path so
398    /// built-in backends can avoid per-token full-dictionary scans.
399    fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
400        None
401    }
402
403    /// Returns whether another hanja spelling has the same hangul reading.
404    fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
405        self.entries().is_some_and(|mut entries| {
406            entries.any(|record| record.hanja != hanja && record.reading == reading)
407        })
408    }
409}
410
411impl<D> HanjaDictionary for &D
412where
413    D: HanjaDictionary + ?Sized,
414{
415    fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
416        (**self).matches_at(s)
417    }
418
419    fn max_word_chars(&self) -> Option<usize> {
420        (**self).max_word_chars()
421    }
422
423    fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
424        (**self).entries()
425    }
426
427    fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
428        (**self).has_homophone(hanja, reading)
429    }
430}
431
432impl<D> HanjaDictionary for Box<D>
433where
434    D: HanjaDictionary + ?Sized,
435{
436    fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
437        (**self).matches_at(s)
438    }
439
440    fn max_word_chars(&self) -> Option<usize> {
441        (**self).max_word_chars()
442    }
443
444    fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
445        (**self).entries()
446    }
447
448    fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
449        (**self).has_homophone(hanja, reading)
450    }
451}
452
/// Unihan fallback readings for single characters, offered as a dictionary.
///
/// The type reads the same generated `kHangul` table as the engine's fallback
/// phoneticizer, but it intentionally returns the canonical readings from
/// before initial sound law is applied. Stateful orthographic rules—the
/// initial sound law, `列`/`律`, and numeral grouping—stay part of the
/// engine's fallback behavior and are not dictionary behavior.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct UnihanCharDict;
462
463impl HanjaDictionary for UnihanCharDict {
464    fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
465        let matched = s.chars().next().and_then(|ch| {
466            khangul_reading(ch).map(|reading| Match {
467                byte_len: ch.len_utf8(),
468                reading: reading.to_string(),
469                suffix_reading: None,
470                mark: MatchMark::default(),
471            })
472        });
473        Box::new(matched.into_iter())
474    }
475
476    fn max_word_chars(&self) -> Option<usize> {
477        Some(1)
478    }
479
480    fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
481        Some(Box::new(KHANGUL_READINGS.iter().map(|(hanja, reading)| {
482            DictionaryRecord {
483                hanja: hanja.to_string(),
484                reading: reading.to_string(),
485                mark: MatchMark::default(),
486            }
487        })))
488    }
489
490    fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
491        let mut chars = hanja.chars();
492        let Some(hanja) = chars.next() else {
493            return false;
494        };
495        if chars.next().is_some() {
496            return false;
497        }
498        KHANGUL_READINGS
499            .iter()
500            .any(|&(other_hanja, other_reading)| other_hanja != hanja && other_reading == reading)
501    }
502}
503
/// A composition of dictionaries that keeps the priority order chosen by the
/// caller.
///
/// The dictionaries are stored from highest to lowest priority. Lookup
/// returns matches of every distinct byte length, so the lattice segmenter
/// can still compare a shorter high-priority entry with a longer low-priority
/// one. When two dictionaries yield a match of the same byte length, only the
/// first one is kept.
///
/// `Default` is implemented by hand rather than derived so that it carries no
/// `D: Default` bound: an empty chain can be built for any `D`, including
/// element types such as boxed trait objects that have no default value.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ChainDictionary<D> {
    // Highest priority first.
    dictionaries: Vec<D>,
}

impl<D> Default for ChainDictionary<D> {
    fn default() -> Self {
        Self {
            dictionaries: Vec::new(),
        }
    }
}
515
516impl<D> ChainDictionary<D> {
517    /// Creates an empty chain.
518    pub fn new() -> Self {
519        Self {
520            dictionaries: Vec::new(),
521        }
522    }
523
524    /// Appends a dictionary with lower priority than the existing entries.
525    pub fn push(&mut self, dictionary: D) {
526        self.dictionaries.push(dictionary);
527    }
528
529    /// Returns the number of dictionaries in the chain.
530    pub fn len(&self) -> usize {
531        self.dictionaries.len()
532    }
533
534    /// Returns whether the chain contains no dictionaries.
535    pub fn is_empty(&self) -> bool {
536        self.dictionaries.is_empty()
537    }
538
539    /// Returns the chained dictionaries in priority order.
540    pub fn dictionaries(&self) -> &[D] {
541        &self.dictionaries
542    }
543
544    /// Consumes the chain and returns its dictionaries in priority order.
545    pub fn into_dictionaries(self) -> Vec<D> {
546        self.dictionaries
547    }
548}
549
550impl<D> FromIterator<D> for ChainDictionary<D> {
551    fn from_iter<T: IntoIterator<Item = D>>(iter: T) -> Self {
552        Self {
553            dictionaries: Vec::from_iter(iter),
554        }
555    }
556}
557
558impl<D> HanjaDictionary for ChainDictionary<D>
559where
560    D: HanjaDictionary,
561{
562    fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
563        let mut seen_lengths = BTreeSet::new();
564        let mut matches = Vec::new();
565
566        for dictionary in &self.dictionaries {
567            for matched in dictionary.matches_at(s) {
568                if seen_lengths.insert(matched.byte_len) {
569                    matches.push(matched);
570                }
571            }
572        }
573
574        matches.sort_by_key(|matched| matched.byte_len);
575        Box::new(matches.into_iter())
576    }
577
578    fn max_word_chars(&self) -> Option<usize> {
579        let mut max = None;
580        for dictionary in &self.dictionaries {
581            let word_chars = dictionary.max_word_chars()?;
582            max = Some(max.map_or(word_chars, |current: usize| current.max(word_chars)));
583        }
584        max
585    }
586
587    fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
588        let mut records = BTreeMap::<String, DictionaryRecord>::new();
589
590        for dictionary in &self.dictionaries {
591            for record in dictionary.entries()? {
592                records.entry(record.hanja.clone()).or_insert(record);
593            }
594        }
595
596        Some(Box::new(records.into_values()))
597    }
598
599    fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
600        if let Some(mut records) = self.entries() {
601            return records.any(|record| record.hanja != hanja && record.reading == reading);
602        }
603
604        self.dictionaries
605            .iter()
606            .any(|dictionary| dictionary.has_homophone(hanja, reading))
607    }
608}
609
610fn khangul_reading(ch: char) -> Option<&'static str> {
611    KHANGUL_READINGS
612        .binary_search_by_key(&ch, |(hanja, _)| *hanja)
613        .ok()
614        .map(|index| KHANGUL_READINGS[index].1)
615}
616
617/// Engine-level options that affect hanja conversion before rendering.
618///
619/// These options apply to fallback text that is not covered by the supplied
620/// dictionary. Dictionary matches are assumed to already contain the desired
621/// reading and are not rewritten by fallback orthography rules.
622#[derive(Clone, Copy, Debug, Eq, PartialEq)]
623pub struct EngineOptions {
624    /// How hanja-containing spans are split into dictionary and fallback
625    /// segments.
626    pub segmentation: SegmentationStrategy,
627
628    /// Whether fallback readings should apply South Korean initial sound law.
629    pub initial_sound_law: bool,
630
631    /// How fallback hanja numerals are rendered.
632    pub numeral_strategy: NumeralStrategy,
633}
634
635impl Default for EngineOptions {
636    fn default() -> Self {
637        Self {
638            segmentation: SegmentationStrategy::Lattice,
639            initial_sound_law: true,
640            numeral_strategy: NumeralStrategy::HangulPhonetic,
641        }
642    }
643}
644
/// How spans that contain hanja are segmented.
///
/// `Lattice` weighs every dictionary path and picks the one with the best
/// coverage. `Eager` greedily takes the longest match at each cursor, which
/// can reduce work for callers who value speed above segmentation accuracy.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
#[non_exhaustive]
pub enum SegmentationStrategy {
    /// Maximize dictionary coverage through dynamic programming.
    #[default]
    Lattice,

    /// Segment left to right, eagerly taking the longest match.
    Eager,
}
661
/// How hanja numerals found in fallback text are rendered.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
#[non_exhaustive]
pub enum NumeralStrategy {
    /// Spell hanja numerals out as their hangul phonetic readings.
    ///
    /// Fallback annotations are emitted under this strategy, so renderers can
    /// still show the original hanja in annotation-oriented render modes.
    HangulPhonetic,

    /// Turn positional, digit-only hanja numerals into Arabic digits.
    ///
    /// Arabic normalization yields plain text, not annotations. As a result,
    /// neither renderers nor user directives can later recover the original
    /// numeral hanja of the normalized span.
    PositionalArabic,

    /// Turn additive hanja numerals that use place markers into Arabic
    /// digits.
    ///
    /// The parser understands small units such as `十`, `百`, and `千` as
    /// well as large units through `澗`. A malformed or overflowing numeral
    /// makes that run fall back to [`NumeralStrategy::HangulPhonetic`].
    AdditiveArabic,

    /// Apply Arabic normalization in common numeric contexts and keep the
    /// hangul phonetic fallback behavior everywhere else.
    ///
    /// Well-formed additive numerals become Arabic. A pure positional digit
    /// run becomes Arabic when it has at least four digits (matching common
    /// year notation) or when a unit hanja (`年月日時分秒號世紀` and so on)
    /// follows it immediately. All other numerals stay hangul annotations.
    Smart,
}
696
697#[derive(Clone, Debug, Eq, PartialEq)]
698struct DictionaryEntry {
699    reading: String,
700    suffix_reading: Option<String>,
701    mark: MatchMark,
702}
703
704/// A small in-memory dictionary backed by an ordered map.
705///
706/// This implementation is intended for tests, user-supplied custom entries,
707/// and early pipeline validation. It returns all prefix matches at a cursor so
708/// the engine can score every candidate path through a hanja run.
709#[derive(Clone, Debug, Default, Eq, PartialEq)]
710pub struct MapDictionary {
711    entries: BTreeMap<String, DictionaryEntry>,
712    max_word_chars: Option<usize>,
713}
714
715impl MapDictionary {
716    /// Creates an empty map dictionary.
717    pub fn new() -> Self {
718        Self::default()
719    }
720
721    /// Inserts an entry with no special rendering constraints.
722    pub fn insert(&mut self, hanja: impl Into<String>, reading: impl Into<String>) {
723        self.insert_marked(hanja, reading, MatchMark::default());
724    }
725
726    /// Inserts an entry with dictionary-provided rendering constraints.
727    pub fn insert_marked(
728        &mut self,
729        hanja: impl Into<String>,
730        reading: impl Into<String>,
731        mark: MatchMark,
732    ) {
733        self.insert_entry(hanja, reading, None, mark);
734    }
735
736    /// Inserts an entry that carries a distinct non-word-initial reading.
737    ///
738    /// `suffix` is the reading used when the match is not word-initial (see
739    /// [`Match::suffix_reading`]); `reading` is the word-initial reading.
740    pub fn insert_with_suffix(
741        &mut self,
742        hanja: impl Into<String>,
743        reading: impl Into<String>,
744        suffix: impl Into<String>,
745    ) {
746        self.insert_entry(hanja, reading, Some(suffix.into()), MatchMark::default());
747    }
748
749    fn insert_entry(
750        &mut self,
751        hanja: impl Into<String>,
752        reading: impl Into<String>,
753        suffix_reading: Option<String>,
754        mark: MatchMark,
755    ) {
756        let hanja = hanja.into();
757        let word_chars = hanja.chars().count();
758        self.max_word_chars = Some(self.max_word_chars.map_or(word_chars, |max| {
759            if word_chars > max { word_chars } else { max }
760        }));
761        self.entries.insert(
762            hanja,
763            DictionaryEntry {
764                reading: reading.into(),
765                suffix_reading,
766                mark,
767            },
768        );
769    }
770
771    /// Returns whether the dictionary has no entries.
772    pub fn is_empty(&self) -> bool {
773        self.entries.is_empty()
774    }
775
776    /// Returns the number of dictionary entries.
777    pub fn len(&self) -> usize {
778        self.entries.len()
779    }
780}
781
782impl HanjaDictionary for MapDictionary {
783    fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
784        Box::new(
785            self.entries
786                .iter()
787                .filter(move |(hanja, _)| s.starts_with(hanja.as_str()))
788                .map(|(hanja, entry)| Match {
789                    byte_len: hanja.len(),
790                    reading: entry.reading.clone(),
791                    suffix_reading: entry.suffix_reading.clone(),
792                    mark: entry.mark,
793                }),
794        )
795    }
796
797    fn max_word_chars(&self) -> Option<usize> {
798        self.max_word_chars
799    }
800
801    fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
802        Some(Box::new(self.entries.iter().map(|(hanja, entry)| {
803            DictionaryRecord {
804                hanja: hanja.clone(),
805                reading: entry.reading.clone(),
806                mark: entry.mark,
807            }
808        })))
809    }
810
811    fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
812        self.entries
813            .iter()
814            .any(|(other_hanja, entry)| other_hanja != hanja && entry.reading == reading)
815    }
816}
817
/// Scope data for the plain-text adapter.
///
/// Plain text knows neither preserved regions nor block boundaries, and
/// inline markup such as `<ruby>` has no meaning in a plain-text stream.
/// Because [`ScopeData::allows_inline_markup`] is reported as `false`,
/// scope-aware renderers fall back to parenthesized text before a
/// [`RenderedToken::Ruby`] can reach the plain-text writer.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct PlainScopeData;
827
828impl ScopeData for PlainScopeData {
829    fn is_preserve(&self) -> bool {
830        false
831    }
832
833    fn allows_inline_markup(&self) -> bool {
834        false
835    }
836}
837
838/// Reads a plain-text string into the core input-token stream.
839///
840/// The adapter wraps the input in a plain scope and emits the entire input as a
841/// single `Text` token.
842pub fn read_plain_text(input: &str) -> Vec<InputToken<PlainScopeData>> {
843    Vec::from([
844        InputToken::Open(Scope::new(PlainScopeData)),
845        InputToken::Text(input.to_string()),
846        InputToken::Close,
847    ])
848}
849
850/// Writes rendered plain-text tokens back to a string.
851///
852/// Structural tokens are ignored because plain text has no serialized scope
853/// markers. `Text` and `Verbatim` tokens are concatenated in stream order.
854/// `Ruby` tokens are not expected because [`PlainScopeData`] disallows inline
855/// markup, but they are defensively serialized as `base(rt)` rather than
856/// dropped silently if one ever reaches the writer.
857pub fn write_plain_text<S>(tokens: impl IntoIterator<Item = RenderedToken<S>>) -> String {
858    let mut output = String::new();
859    for token in tokens {
860        match token {
861            RenderedToken::Open(_) | RenderedToken::Close => {}
862            RenderedToken::Text(text) | RenderedToken::Verbatim(text) => output.push_str(&text),
863            RenderedToken::Ruby { base, rt } => {
864                output.push_str(&parens(&base, &rt));
865            }
866        }
867    }
868    output
869}
870
871/// Processes input tokens with the default hanja conversion engine options.
872///
873/// The engine preserves structural and verbatim tokens, skips text when the
874/// current scope is preserving, and uses lattice segmentation to annotate
875/// dictionary and fallback matches inside text tokens.
876pub fn process_tokens<S, D>(
877    tokens: impl IntoIterator<Item = InputToken<S>>,
878    dictionary: &D,
879) -> Vec<OutputToken<S>>
880where
881    S: ScopeData,
882    D: HanjaDictionary + ?Sized,
883{
884    process_tokens_iter(tokens, dictionary).collect()
885}
886
887/// Processes input tokens through the default engine options and returns an
888/// iterator over the collected output.
889///
890/// This is an iterator-shaped compatibility adapter, not the low-level
891/// streaming surface: it consumes the supplied input before returning. For
892/// true incremental processing, use [`Engine`] directly and call
893/// [`Engine::push_token`] as chunks arrive.
894pub fn process_tokens_iter<S, D>(
895    tokens: impl IntoIterator<Item = InputToken<S>>,
896    dictionary: &D,
897) -> alloc::vec::IntoIter<OutputToken<S>>
898where
899    S: ScopeData,
900    D: HanjaDictionary + ?Sized,
901{
902    process_tokens_with_options(tokens, dictionary, EngineOptions::default()).into_iter()
903}
904
905/// Processes input tokens with explicit hanja conversion engine options.
906///
907/// This is the lower-level entry point for callers that need to disable
908/// fallback initial sound law or choose a non-default numeral strategy.
909pub fn process_tokens_with_options<S, D>(
910    tokens: impl IntoIterator<Item = InputToken<S>>,
911    dictionary: &D,
912    options: EngineOptions,
913) -> Vec<OutputToken<S>>
914where
915    S: ScopeData,
916    D: HanjaDictionary + ?Sized,
917{
918    let mut engine = Engine::collecting(dictionary, options);
919    let mut output = Vec::new();
920
921    for token in tokens {
922        output.extend(engine.push_token(token));
923    }
924
925    output.extend(engine.finish());
926    output
927}
928
929/// Processes input tokens through explicit engine options and returns an
930/// iterator over the collected output.
931///
932/// This convenience adapter preserves the existing collect-into-`Vec` behavior
933/// while exposing an iterator-shaped API for callers that compose pipeline
934/// stages. Use [`Engine`] for chunk-by-chunk output.
935pub fn process_tokens_iter_with_options<S, D>(
936    tokens: impl IntoIterator<Item = InputToken<S>>,
937    dictionary: &D,
938    options: EngineOptions,
939) -> alloc::vec::IntoIter<OutputToken<S>>
940where
941    S: ScopeData,
942    D: HanjaDictionary + ?Sized,
943{
944    process_tokens_with_options(tokens, dictionary, options).into_iter()
945}
946
947/// Resolves a fallible reader token stream into recovered input tokens.
948///
949/// This is the single place where the stream-level [`Recovery`] policy is
950/// applied to a reader's output. Format adapters (such as the HTML scanner)
951/// emit `Ok(InputToken)` for well-formed regions and
952/// `Err(RecoverableInputError)` for malformed regions they can describe and
953/// preserve; this function turns that stream into the plain
954/// [`InputToken`] sequence the rest of the pipeline consumes:
955///
956///  -  In [`Recovery::Strict`] mode the first error stops processing and its
957///     cause is returned, so the caller never sees a partial token stream.
958///  -  In [`Recovery::Lenient`] mode each error is logged at `warn` level once
959///     and replaced by an [`InputToken::Verbatim`] holding the original source
960///     region, so the malformed bytes pass through untouched while surrounding
961///     tokens continue to flow.
962///
963/// It sits one stage before the [`Engine`]: feed its output into
964/// [`process_tokens_with_options`] or a streaming [`Engine`]. The recovery-aware
965/// engine entry points ([`process_fallible_tokens`] and
966/// [`process_fallible_tokens_with_options`]) are thin wrappers that call this
967/// and then run the engine.
968pub fn recover_input_tokens<S>(
969    tokens: impl IntoIterator<Item = Result<InputToken<S>, RecoverableInputError>>,
970    recovery: Recovery,
971) -> Result<Vec<InputToken<S>>, Error>
972where
973    S: ScopeData,
974{
975    let mut recovered = Vec::new();
976    for token in tokens {
977        recovered.push(recover_input_token(token, recovery)?);
978    }
979    Ok(recovered)
980}
981
982/// Resolves one fallible reader item according to a [`Recovery`] policy.
983///
984/// This is the per-token form of [`recover_input_tokens`] for streaming
985/// pipelines. In strict mode an error is returned immediately. In lenient mode
986/// the error is logged once and replaced with an [`InputToken::Verbatim`]
987/// carrying the original malformed region.
988pub fn recover_input_token<S>(
989    token: Result<InputToken<S>, RecoverableInputError>,
990    recovery: Recovery,
991) -> Result<InputToken<S>, Error>
992where
993    S: ScopeData,
994{
995    match token {
996        Ok(token) => Ok(token),
997        Err(error) => match recovery {
998            Recovery::Strict => Err(error.into_parts().1),
999            Recovery::Lenient => {
1000                let (original, error) = error.into_parts();
1001                tracing::warn!(error = %error, "recovering from input reader error");
1002                Ok(InputToken::Verbatim(original))
1003            }
1004        },
1005    }
1006}
1007
1008/// Processes fallible input tokens with default engine options.
1009///
1010/// Reader errors are handled according to `recovery`. In strict mode the first
1011/// error is returned. In lenient mode each recoverable region is logged and
1012/// emitted as `OutputToken::Verbatim`, after which later tokens continue
1013/// through the normal engine path.
1014pub fn process_fallible_tokens<S, D>(
1015    tokens: impl IntoIterator<Item = Result<InputToken<S>, RecoverableInputError>>,
1016    dictionary: &D,
1017    recovery: Recovery,
1018) -> Result<Vec<OutputToken<S>>, Error>
1019where
1020    S: ScopeData,
1021    D: HanjaDictionary + ?Sized,
1022{
1023    process_fallible_tokens_with_options(tokens, dictionary, EngineOptions::default(), recovery)
1024}
1025
1026/// Processes fallible input tokens with explicit engine options.
1027///
1028/// This is the recovery-aware counterpart to
1029/// [`process_tokens_with_options`]. It does not make the dictionary trait
1030/// fallible; it only handles reader errors that carry enough original source
1031/// text for lenient preservation.
1032pub fn process_fallible_tokens_with_options<S, D>(
1033    tokens: impl IntoIterator<Item = Result<InputToken<S>, RecoverableInputError>>,
1034    dictionary: &D,
1035    options: EngineOptions,
1036    recovery: Recovery,
1037) -> Result<Vec<OutputToken<S>>, Error>
1038where
1039    S: ScopeData,
1040    D: HanjaDictionary + ?Sized,
1041{
1042    let recovered = recover_input_tokens(tokens, recovery)?;
1043    Ok(process_tokens_with_options(recovered, dictionary, options))
1044}
1045
/// Stateful hanja conversion engine for chunked token streams.
///
/// `Engine` is the low-level streaming surface. Call [`Engine::push_token`] for
/// each incoming token and then [`Engine::finish`] once the upstream reader is
/// exhausted. When the dictionary reports a maximum word length, text chunks are
/// buffered only at the tail so dictionary matches can cross chunk boundaries
/// without requiring the whole document in memory. A trailing fallback hanja run
/// is also kept buffered until a non-convertible boundary or EOF so render modes
/// that expose annotation spans match one-shot conversion. Dictionaries with an
/// unknown maximum keep hanja-containing text until a non-convertible boundary
/// or EOF so long custom entries remain observable.
pub struct Engine<'a, S, D>
where
    S: ScopeData,
    D: HanjaDictionary + ?Sized,
{
    // Borrowed dictionary; consulted for segmentation and for the maximum
    // word length that bounds the buffered tail.
    dictionary: &'a D,
    // Conversion options handed to segmentation and reading selection.
    options: EngineOptions,
    // Stack of currently open scopes: pushed on `Open`, popped on `Close`.
    // The innermost scope decides whether incoming text is preserved.
    scopes: Vec<Scope<S>>,
    // Text received but not yet converted and emitted.
    pending_text: String,
    // Byte length `pending_text` had when it was last found to consist of a
    // single fallback run that cannot be flushed yet; `None` otherwise.
    pending_unflushable_fallback_run_bytes: Option<usize>,
    // Fallback reading context carried from one flush to the next; reset at
    // block boundaries and around preserved or verbatim content.
    fallback_state: FallbackState,
    // When `false`, `Text` tokens are only buffered and nothing is converted
    // until a structural token, a verbatim token, or an explicit flush.
    incremental_flush: bool,
}
1070
1071impl<'a, S, D> Engine<'a, S, D>
1072where
1073    S: ScopeData,
1074    D: HanjaDictionary + ?Sized,
1075{
1076    /// Creates a streaming engine with default options.
1077    pub fn new(dictionary: &'a D) -> Self {
1078        Self::with_options(dictionary, EngineOptions::default())
1079    }
1080
1081    /// Creates a streaming engine with explicit conversion options.
1082    pub fn with_options(dictionary: &'a D, options: EngineOptions) -> Self {
1083        Self::with_incremental_flush(dictionary, options, true)
1084    }
1085
1086    fn collecting(dictionary: &'a D, options: EngineOptions) -> Self {
1087        Self::with_incremental_flush(dictionary, options, false)
1088    }
1089
1090    fn with_incremental_flush(
1091        dictionary: &'a D,
1092        options: EngineOptions,
1093        incremental_flush: bool,
1094    ) -> Self {
1095        tracing::debug!(
1096            strategy = ?options.segmentation,
1097            "engine created with segmentation strategy"
1098        );
1099        Self {
1100            dictionary,
1101            options,
1102            scopes: Vec::new(),
1103            pending_text: String::new(),
1104            pending_unflushable_fallback_run_bytes: None,
1105            fallback_state: FallbackState::default(),
1106            incremental_flush,
1107        }
1108    }
1109
1110    /// Pushes one input token and returns output tokens that are now safe to
1111    /// emit.
1112    pub fn push_token(&mut self, token: InputToken<S>) -> Vec<OutputToken<S>> {
1113        let mut output = Vec::new();
1114        match token {
1115            InputToken::Open(scope) => {
1116                self.flush_into(&mut output);
1117                if scope.data().is_block_boundary() {
1118                    self.reset_fallback_context();
1119                }
1120                self.scopes.push(scope.clone());
1121                output.push(OutputToken::Open(scope));
1122            }
1123            InputToken::Close => {
1124                self.flush_into(&mut output);
1125                let closes_block_boundary = self
1126                    .scopes
1127                    .pop()
1128                    .is_some_and(|scope| scope.data().is_block_boundary());
1129                output.push(OutputToken::Close);
1130                if closes_block_boundary {
1131                    self.reset_fallback_context();
1132                }
1133            }
1134            InputToken::Text(text) => {
1135                if self
1136                    .scopes
1137                    .last()
1138                    .is_some_and(|scope| scope.data().is_preserve())
1139                {
1140                    self.flush_into(&mut output);
1141                    self.reset_fallback_context();
1142                    output.push(OutputToken::Text(text));
1143                } else {
1144                    let previous_pending_bytes = self.pending_text.len();
1145                    self.pending_text.push_str(&text);
1146                    if self
1147                        .pending_unflushable_fallback_run_bytes
1148                        .is_some_and(|bytes| bytes == previous_pending_bytes)
1149                    {
1150                        self.pending_unflushable_fallback_run_bytes = Some(previous_pending_bytes);
1151                    } else {
1152                        self.pending_unflushable_fallback_run_bytes = None;
1153                    }
1154                    if self.incremental_flush {
1155                        self.flush_safe_into(&mut output);
1156                    }
1157                }
1158            }
1159            InputToken::Verbatim(text) => {
1160                self.flush_into(&mut output);
1161                self.reset_fallback_context();
1162                output.push(OutputToken::Verbatim(text));
1163            }
1164        }
1165        output
1166    }
1167
1168    /// Flushes all pending text without ending the engine.
1169    pub fn flush(&mut self) -> Vec<OutputToken<S>> {
1170        let mut output = Vec::new();
1171        self.flush_into(&mut output);
1172        output
1173    }
1174
1175    /// Finishes the stream and returns every remaining output token.
1176    pub fn finish(mut self) -> Vec<OutputToken<S>> {
1177        self.flush()
1178    }
1179
1180    /// Returns the number of Unicode scalar values currently buffered.
1181    pub fn buffered_chars(&self) -> usize {
1182        self.pending_text.chars().count()
1183    }
1184
1185    fn tail_bound(&self) -> Option<usize> {
1186        self.dictionary.max_word_chars().filter(|bound| *bound > 0)
1187    }
1188
1189    fn flush_safe_into(&mut self, output: &mut Vec<OutputToken<S>>) {
1190        if self.pending_text.is_empty() {
1191            return;
1192        }
1193        if !self.pending_text.chars().any(is_hanja) {
1194            self.flush_non_hanja_safe_into(output);
1195            return;
1196        }
1197
1198        let Some(bound) = self.tail_bound() else {
1199            let Some(flush_end) = safe_unknown_bound_flush_end(&self.pending_text) else {
1200                return;
1201            };
1202            self.flush_prefix_into(flush_end, output);
1203            if !self.pending_text.chars().any(is_hanja) {
1204                self.flush_non_hanja_safe_into(output);
1205            }
1206            return;
1207        };
1208        if let Some(flush_end) = safe_unknown_bound_flush_end(&self.pending_text) {
1209            self.flush_prefix_into(flush_end, output);
1210            if !self.pending_text.chars().any(is_hanja) {
1211                self.flush_non_hanja_safe_into(output);
1212            }
1213            return;
1214        }
1215        let buffered_chars = self.buffered_chars();
1216        if buffered_chars > bound.saturating_mul(10) {
1217            tracing::debug!(
1218                buffered_chars,
1219                dict_max_word_chars = bound,
1220                "streaming tail buffer is unusually large"
1221            );
1222        }
1223        if buffered_chars <= bound {
1224            return;
1225        }
1226
1227        if self.extends_unflushable_fallback_run(bound) {
1228            self.pending_unflushable_fallback_run_bytes = Some(self.pending_text.len());
1229            return;
1230        }
1231
1232        let safe_chars = buffered_chars.saturating_sub(bound).saturating_add(1);
1233        let segments = segment_text(
1234            &self.pending_text,
1235            self.dictionary,
1236            self.options.segmentation,
1237        );
1238        let mut flush_end = 0;
1239        let mut flush_segments = Vec::new();
1240        for segment in &segments {
1241            let (byte_start, byte_end) = segment_bounds(segment);
1242            let start_chars = self.pending_text[..byte_start].chars().count();
1243            let end_chars = self.pending_text[..byte_end].chars().count();
1244            if byte_start > flush_end || (start_chars > safe_chars && flush_end > 0) {
1245                break;
1246            }
1247            if end_chars > safe_chars {
1248                break;
1249            }
1250            flush_end = byte_end;
1251            flush_segments.push(segment.clone());
1252        }
1253
1254        // Fallback runs render as one annotation in non-default render modes.
1255        // Keep a trailing fallback run buffered because the next chunk may
1256        // extend it, even when the dictionary lookahead bound is only one char.
1257        if let Some(fallback_start) = trailing_fallback_run_start(&segments, flush_end) {
1258            flush_end = fallback_start;
1259            while flush_segments
1260                .last()
1261                .is_some_and(|segment| segment_bounds(segment).1 > flush_end)
1262            {
1263                flush_segments.pop();
1264            }
1265        }
1266
1267        if flush_end > 0 {
1268            self.pending_unflushable_fallback_run_bytes = None;
1269            self.flush_segments_prefix_into(flush_end, &flush_segments, output);
1270            if !self.pending_text.chars().any(is_hanja) {
1271                self.flush_non_hanja_safe_into(output);
1272            }
1273        } else if trailing_fallback_run_start(&segments, self.pending_text.len()) == Some(0) {
1274            self.pending_unflushable_fallback_run_bytes = Some(self.pending_text.len());
1275        }
1276    }
1277
1278    fn extends_unflushable_fallback_run(&self, bound: usize) -> bool {
1279        let Some(previous_bytes) = self.pending_unflushable_fallback_run_bytes else {
1280            return false;
1281        };
1282        if previous_bytes == 0
1283            || previous_bytes > self.pending_text.len()
1284            || !self.pending_text.is_char_boundary(previous_bytes)
1285        {
1286            return false;
1287        }
1288
1289        let appended = &self.pending_text[previous_bytes..];
1290        if appended.is_empty() {
1291            return true;
1292        }
1293        if appended.chars().any(|ch| !is_hanja(ch)) {
1294            return false;
1295        }
1296
1297        // The existing prefix was already segmented as one fallback run.  Only
1298        // the old suffix that can participate in a cross-chunk dictionary match
1299        // and the newly appended text need to be inspected here.
1300        let probe_start = suffix_start_for_char_count(
1301            &self.pending_text[..previous_bytes],
1302            bound.saturating_sub(1),
1303        );
1304        let probe = &self.pending_text[probe_start..];
1305        segment_text(probe, self.dictionary, self.options.segmentation)
1306            .iter()
1307            .all(|segment| matches!(segment, Segment::Fallback { .. }))
1308    }
1309
1310    fn flush_non_hanja_safe_into(&mut self, output: &mut Vec<OutputToken<S>>) {
1311        let flush_end = match self.tail_bound() {
1312            Some(bound) => safe_non_hanja_flush_end(&self.pending_text, bound),
1313            None => safe_unknown_bound_flush_end(&self.pending_text),
1314        };
1315        if let Some(flush_end) = flush_end {
1316            self.flush_prefix_into(flush_end, output);
1317        }
1318    }
1319
1320    fn flush_prefix_into(&mut self, flush_end: usize, output: &mut Vec<OutputToken<S>>) {
1321        if flush_end == self.pending_text.len() {
1322            self.flush_into(output);
1323            return;
1324        }
1325        self.pending_unflushable_fallback_run_bytes = None;
1326        let prefix = self.pending_text[..flush_end].to_string();
1327        let segments = segment_text(&prefix, self.dictionary, self.options.segmentation);
1328        self.flush_segments_prefix_into(flush_end, &segments, output);
1329    }
1330
1331    fn flush_segments_prefix_into(
1332        &mut self,
1333        flush_end: usize,
1334        segments: &[Segment],
1335        output: &mut Vec<OutputToken<S>>,
1336    ) {
1337        let prefix = self.pending_text[..flush_end].to_string();
1338        process_segments_with_state(
1339            &prefix,
1340            segments,
1341            self.dictionary,
1342            self.options,
1343            &mut self.fallback_state,
1344            output,
1345        );
1346        self.pending_text.replace_range(..flush_end, "");
1347    }
1348
1349    fn flush_into(&mut self, output: &mut Vec<OutputToken<S>>) {
1350        if self.pending_text.is_empty() {
1351            return;
1352        }
1353        self.pending_unflushable_fallback_run_bytes = None;
1354        let text = core::mem::take(&mut self.pending_text);
1355        process_text_with_state(
1356            &text,
1357            self.dictionary,
1358            self.options,
1359            &mut self.fallback_state,
1360            output,
1361        );
1362    }
1363
1364    fn reset_fallback_context(&mut self) {
1365        self.fallback_state = FallbackState::default();
1366    }
1367}
1368
/// Finds the byte offset up to which `text` can be flushed while keeping the
/// last `bound - 1` characters of its final whitespace-free span buffered.
///
/// Everything through the last whitespace character is always flushable.
/// Returns `None` when `text` is empty or when nothing can be flushed.
fn safe_non_hanja_flush_end(text: &str, bound: usize) -> Option<usize> {
    if text.is_empty() {
        return None;
    }

    let keep_chars = bound.saturating_sub(1);
    // Stripping the trailing non-whitespace run leaves exactly the part that
    // ends with the last whitespace character (or nothing at all).
    let span_start = text
        .trim_end_matches(|ch: char| !ch.is_whitespace())
        .len();
    let tail = &text[span_start..];

    match tail.chars().count().checked_sub(keep_chars) {
        // The tail fits in the kept window: flush only up to the whitespace.
        None | Some(0) => (span_start > 0).then_some(span_start),
        Some(flushable_chars) => {
            let offset = tail
                .char_indices()
                .nth(flushable_chars)
                .map_or(tail.len(), |(index, _)| index);
            let flush_end = span_start + offset;
            (flush_end > 0).then_some(flush_end)
        }
    }
}
1392
/// Returns the byte offset just past the last whitespace character in
/// `text`, or `None` when `text` contains no whitespace.
fn safe_unknown_bound_flush_end(text: &str) -> Option<usize> {
    // What remains after stripping the trailing non-whitespace run ends with
    // the last whitespace character, so its length is the wanted offset.
    let through_last_whitespace = text.trim_end_matches(|ch: char| !ch.is_whitespace());
    if through_last_whitespace.is_empty() {
        None
    } else {
        Some(through_last_whitespace.len())
    }
}
1398
/// Returns the byte offset at which the last `count` characters of `text`
/// begin.
///
/// A `count` of zero yields `text.len()`; a `count` larger than the number
/// of characters yields `0`.
fn suffix_start_for_char_count(text: &str, count: usize) -> usize {
    // Walk at most `count` characters backwards; the last one visited is
    // where the suffix starts. Visiting nothing means the suffix is empty.
    text.char_indices()
        .rev()
        .take(count)
        .last()
        .map_or(text.len(), |(index, _)| index)
}
1409
1410fn trailing_fallback_run_start(segments: &[Segment], split_byte: usize) -> Option<usize> {
1411    if split_byte == 0 {
1412        return None;
1413    }
1414
1415    for (index, segment) in segments.iter().enumerate() {
1416        let (byte_start, byte_end) = segment_bounds(segment);
1417        if byte_end != split_byte {
1418            continue;
1419        }
1420        if !matches!(segment, Segment::Fallback { .. }) {
1421            return None;
1422        }
1423        if let Some(next) = segments.get(index + 1)
1424            && !matches!(next, Segment::Fallback { .. })
1425        {
1426            return None;
1427        }
1428
1429        let mut run_start = byte_start;
1430        for previous in segments[..index].iter().rev() {
1431            let (previous_start, previous_end) = segment_bounds(previous);
1432            if previous_end != run_start || !matches!(previous, Segment::Fallback { .. }) {
1433                break;
1434            }
1435            run_start = previous_start;
1436        }
1437        return (run_start < split_byte).then_some(run_start);
1438    }
1439
1440    None
1441}
1442
1443fn process_text_with_state<S, D>(
1444    text: &str,
1445    dictionary: &D,
1446    options: EngineOptions,
1447    fallback_state: &mut FallbackState,
1448    output: &mut Vec<OutputToken<S>>,
1449) where
1450    D: HanjaDictionary + ?Sized,
1451{
1452    let segments = segment_text(text, dictionary, options.segmentation);
1453    process_segments_with_state(text, &segments, dictionary, options, fallback_state, output);
1454}
1455
1456fn process_segments_with_state<S, D>(
1457    text: &str,
1458    segments: &[Segment],
1459    _dictionary: &D,
1460    options: EngineOptions,
1461    fallback_state: &mut FallbackState,
1462    output: &mut Vec<OutputToken<S>>,
1463) where
1464    D: HanjaDictionary + ?Sized,
1465{
1466    let mut index = 0;
1467
1468    while index < segments.len() {
1469        match &segments[index] {
1470            Segment::Dictionary {
1471                byte_start,
1472                byte_end,
1473                reading,
1474                suffix_reading,
1475                mark,
1476            } => {
1477                let source = &text[*byte_start..*byte_end];
1478                let effective = dictionary_effective_reading(
1479                    source,
1480                    reading,
1481                    suffix_reading.as_deref(),
1482                    options,
1483                    fallback_state.starts_word,
1484                    fallback_state.previous_reading,
1485                );
1486                output.push(OutputToken::Annotated(Annotation {
1487                    hanja: source.to_string(),
1488                    homophone: false,
1489                    reading: effective.clone(),
1490                    require_hanja: mark.require_hanja,
1491                    require_hangul: mark.require_hangul,
1492                    first_in_context: true,
1493                    skip_annotation: false,
1494                    from_dictionary: true,
1495                }));
1496                if should_preserve_dictionary_context(source, &effective, options) {
1497                    update_fallback_state_for_reading(&effective, fallback_state);
1498                } else {
1499                    *fallback_state = FallbackState::default();
1500                }
1501                index += 1;
1502            }
1503            Segment::Fallback {
1504                byte_start,
1505                byte_end,
1506            } => {
1507                let mut fallback_end = *byte_end;
1508                while let Some(Segment::Fallback { byte_end, .. }) = segments.get(index + 1) {
1509                    fallback_end = *byte_end;
1510                    index += 1;
1511                }
1512                process_fallback_text(
1513                    &text[*byte_start..fallback_end],
1514                    options,
1515                    fallback_state,
1516                    output,
1517                );
1518                index += 1;
1519            }
1520            Segment::Text {
1521                byte_start,
1522                byte_end,
1523            } => {
1524                let text_segment = &text[*byte_start..*byte_end];
1525                push_text(output, text_segment);
1526                update_fallback_state_for_text(text_segment, fallback_state);
1527                index += 1;
1528            }
1529        }
1530    }
1531}
1532
1533fn segment_bounds(segment: &Segment) -> (usize, usize) {
1534    match segment {
1535        Segment::Dictionary {
1536            byte_start,
1537            byte_end,
1538            ..
1539        }
1540        | Segment::Fallback {
1541            byte_start,
1542            byte_end,
1543        }
1544        | Segment::Text {
1545            byte_start,
1546            byte_end,
1547        } => (*byte_start, *byte_end),
1548    }
1549}
1550
1551fn process_fallback_text<S>(
1552    text: &str,
1553    options: EngineOptions,
1554    state: &mut FallbackState,
1555    output: &mut Vec<OutputToken<S>>,
1556) {
1557    for part in phoneticize_fallback_run_with_state(text, options, state) {
1558        match part {
1559            FallbackPart::Annotation { hanja, reading } => {
1560                output.push(OutputToken::Annotated(Annotation {
1561                    hanja,
1562                    reading,
1563                    homophone: false,
1564                    require_hanja: false,
1565                    require_hangul: false,
1566                    first_in_context: true,
1567                    skip_annotation: false,
1568                    from_dictionary: false,
1569                }));
1570            }
1571            FallbackPart::ReadingText(text) => push_text(output, &text),
1572            FallbackPart::Text(text) => push_text(output, &text),
1573        }
1574    }
1575}
1576
1577fn update_fallback_state_for_text(text: &str, state: &mut FallbackState) {
1578    if text.is_empty() {
1579        return;
1580    }
1581
1582    if text
1583        .chars()
1584        .last()
1585        .is_some_and(|character| character.is_whitespace())
1586    {
1587        *state = FallbackState::default();
1588        return;
1589    }
1590
1591    let Some(last) = text.chars().rev().find(|ch| !ch.is_whitespace()) else {
1592        return;
1593    };
1594
1595    if last.is_alphanumeric() {
1596        state.starts_word = false;
1597        state.previous_reading = Some(last);
1598    } else {
1599        *state = FallbackState::default();
1600    }
1601}
1602
1603/// Chooses the reading a dictionary match should emit at its position.
1604///
1605/// South Korean initial sound law (頭音法則) makes some morphemes read
1606/// differently word-initially than elsewhere. The bundled dictionary stores the
1607/// word-initial form, so a bare match would render `1998年` as `1998연` instead
1608/// of `1998년`. This applies the position-correct reading:
1609///
1610///  -  When the match carries an explicit [`Match::suffix_reading`] (a
1611///     multi-syllable entry the Standard Korean Language Dictionary records with
1612///     a distinct suffix or bound-noun form, such as `年代`), that suffix
1613///     reading is used outside word-initial position.
1614///  -  Otherwise, for a single hanja whose bundled unihan reading undergoes
1615///     initial sound law, the original (non-word-initial) reading is recovered
1616///     from the unihan table. This covers every such hanja without per-entry
1617///     data. The match's reading must already be one of the two law variants so
1618///     unrelated readings (and non-law hanja) are left untouched. The
1619///     `렬`/`률` → `열`/`율` rule after a vowel or `ㄴ` coda is honored through
1620///     [`should_apply_yeol_yul`], matching fallback behavior.
1621///
1622/// With initial sound law disabled (for example the North Korean preset) the
1623/// original reading is used everywhere.
1624fn dictionary_effective_reading(
1625    source: &str,
1626    reading: &str,
1627    suffix_reading: Option<&str>,
1628    options: EngineOptions,
1629    starts_word: bool,
1630    previous_reading: Option<char>,
1631) -> String {
1632    if let Some(suffix) = suffix_reading {
1633        return if starts_word && options.initial_sound_law {
1634            reading.to_string()
1635        } else {
1636            suffix.to_string()
1637        };
1638    }
1639
1640    let mut chars = source.chars();
1641    if let (Some(ch), None) = (chars.next(), chars.next())
1642        && let Some(base) = phoneticize_hanja_char(ch)
1643    {
1644        let initial = apply_initial_sound_law_to_first_syllable(base);
1645        if initial != base && (reading == base || reading == initial) {
1646            let apply_law = options.initial_sound_law
1647                && (starts_word || should_apply_yeol_yul(previous_reading, base));
1648            return if apply_law { initial } else { base.to_string() };
1649        }
1650    }
1651
1652    reading.to_string()
1653}
1654
1655fn should_preserve_dictionary_context(source: &str, reading: &str, options: EngineOptions) -> bool {
1656    if reading.chars().all(char::is_whitespace) {
1657        return false;
1658    }
1659
1660    if source.chars().all(is_hanja) {
1661        match fallback_reading_for_run(source, options) {
1662            Some(fallback_reading) => {
1663                fallback_reading == reading || has_one_hangul_syllable_per_hanja(source, reading)
1664            }
1665            None => has_one_hangul_syllable_per_hanja(source, reading),
1666        }
1667    } else {
1668        true
1669    }
1670}
1671
1672fn has_one_hangul_syllable_per_hanja(source: &str, reading: &str) -> bool {
1673    let source_len = source.chars().count();
1674    let mut reading_len = 0;
1675
1676    for ch in reading.chars() {
1677        if !is_hangul_syllable(ch) {
1678            return false;
1679        }
1680        reading_len += 1;
1681    }
1682
1683    reading_len == source_len
1684}
1685
/// Reports whether `ch` falls within U+AC00..=U+D7A3, the code point range
/// this crate treats as hangul syllables.
fn is_hangul_syllable(ch: char) -> bool {
    matches!(ch, '\u{ac00}'..='\u{d7a3}')
}
1689
1690fn update_fallback_state_for_reading(reading: &str, state: &mut FallbackState) {
1691    let Some(last) = reading.chars().rev().find(|ch| !ch.is_whitespace()) else {
1692        *state = FallbackState::default();
1693        return;
1694    };
1695
1696    if last.is_alphanumeric() {
1697        state.starts_word = false;
1698        state.previous_reading = Some(last);
1699    } else {
1700        *state = FallbackState::default();
1701    }
1702}
1703
1704fn push_text<S>(output: &mut Vec<OutputToken<S>>, text: &str) {
1705    if text.is_empty() {
1706        return;
1707    }
1708
1709    match output.last_mut() {
1710        Some(OutputToken::Text(existing)) => existing.push_str(text),
1711        _ => output.push(OutputToken::Text(text.to_string())),
1712    }
1713}
1714
/// Reports whether `ch` belongs to one of the code point ranges this crate
/// treats as hanja.
///
/// The check is a pure range test on the scalar value; it does not consult
/// any reading table.
pub fn is_hanja(ch: char) -> bool {
    match u32::from(ch) {
        // NOTE(review): this range runs through U+2FFF, past the Kangxi
        // Radicals block (which ends at U+2FDF) -- confirm that is intended.
        0x2F00..=0x2FFF
        // U+3007 IDEOGRAPHIC NUMBER ZERO.
        | 0x3007
        // CJK Unified Ideographs Extension A.
        | 0x3400..=0x4DBF
        // CJK Unified Ideographs.
        | 0x4E00..=0x9FFF
        // CJK Compatibility Ideographs.
        | 0xF900..=0xFAFF
        // Extensions B, C, D, E, F, and I in the supplementary plane.
        | 0x20000..=0x2A6DF
        | 0x2A700..=0x2B73F
        | 0x2B740..=0x2B81F
        | 0x2B820..=0x2CEAF
        | 0x2CEB0..=0x2EBEF
        | 0x2EBF0..=0x2EE5F
        // CJK Compatibility Ideographs Supplement.
        | 0x2F800..=0x2FA1F
        // Extensions G and H, followed by a further range that is presumably
        // Extension J.
        | 0x30000..=0x3134F
        | 0x31350..=0x323AF
        | 0x323B0..=0x3347F => true,
        _ => false,
    }
}
1736
1737/// The concrete rendering mode for annotated hanja words.
1738#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1739pub enum RenderMode {
1740    /// Emits only hangul unless annotation flags require hanja disambiguation.
1741    HangulOnly,
1742
1743    /// Always emits hangul followed by the original hanja in parentheses.
1744    HangulHanjaParens,
1745
1746    /// Always emits original hanja followed by the hangul reading in
1747    /// parentheses.
1748    HanjaHangulParens,
1749
1750    /// Emits a `<ruby>` element pairing hangul reading and source hanja.
1751    ///
1752    /// The [`RubyBase`] sub-mode chooses which side becomes the base text.
1753    /// When the active scope reports
1754    /// [`ScopeData::allows_inline_markup`] as `false`, the renderer falls back
1755    /// to parenthesized text so that adapters which cannot embed markup still
1756    /// receive a sensible surface form.
1757    Ruby(RubyBase),
1758
1759    /// Emits original hanja, adding a hangul gloss only when requested.
1760    Original,
1761}
1762
/// Chooses the base text of a `<ruby>` element.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum RubyBase {
    /// Hangul is the base and hanja is the gloss:
    /// `<ruby>hangul<rp>(</rp><rt>hanja</rt><rp>)</rp></ruby>`. The `<rp>`
    /// elements supply parenthesized fallback text for browsers that lack
    /// `<ruby>` support.
    OnHangul,

    /// Hanja is the base and hangul is the gloss:
    /// `<ruby>hanja<rp>(</rp><rt>hangul</rt><rp>)</rp></ruby>`. The `<rp>`
    /// elements supply parenthesized fallback text for browsers that lack
    /// `<ruby>` support.
    OnHanja,
}
1776
/// Shape of the hangul gloss emitted in [`RenderMode::Original`].
///
/// In `Original` mode the source hanja stays the primary text, and a hangul
/// gloss is attached only when annotation flags or a user directive ask for
/// one. This option selects what that gloss looks like. Since hanja is always
/// primary in this mode, the ruby form puts hanja in the base and hangul in
/// the `rt`; no sub-mode exists to swap them.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum OriginalGloss {
    /// Parenthesized gloss, `hanja(hangul)`; this is the legacy behavior and
    /// the default.
    #[default]
    Parens,

    /// `<ruby>` element whose base is the hanja and whose `rt` is the hangul
    /// gloss; parentheses are used instead when the active scope disallows
    /// inline markup.
    Ruby,
}
1795
1796/// Rendering options that combine a [`RenderMode`] with per-mode sub-options.
1797///
1798/// Most pipelines configure rendering by mode alone, so `RenderOptions`
1799/// implements `From<RenderMode>` and `Default` to keep existing call sites
1800/// terse. Pipelines that need finer control (such as a ruby gloss in
1801/// [`RenderMode::Original`]) construct a `RenderOptions` value directly.
1802#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1803pub struct RenderOptions {
1804    /// Top-level rendering mode applied to every annotation.
1805    pub mode: RenderMode,
1806
1807    /// Gloss form used by [`RenderMode::Original`]. Ignored by other modes.
1808    pub original_gloss: OriginalGloss,
1809}
1810
1811impl Default for RenderOptions {
1812    fn default() -> Self {
1813        Self {
1814            mode: RenderMode::HangulOnly,
1815            original_gloss: OriginalGloss::Parens,
1816        }
1817    }
1818}
1819
1820impl From<RenderMode> for RenderOptions {
1821    fn from(mode: RenderMode) -> Self {
1822        Self {
1823            mode,
1824            original_gloss: OriginalGloss::default(),
1825        }
1826    }
1827}
1828
/// Where stateful annotation middlewares reset their context.
///
/// `PerBlock` starts a fresh context when a scope reports
/// [`ScopeData::is_block_boundary`]; `PerSection` does so when a later scope
/// reports [`ScopeData::is_section_boundary`].  A plain-text stream contains
/// no block or section scopes, so both windows then act as a single
/// document-wide context.  Exact homophone rendering depends on this: a later
/// plain-text line can turn an earlier annotation ambiguous after it would
/// otherwise already have been written.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ContextWindow {
    /// Turns the middleware off; tokens pass through untouched.
    Off,

    /// Resets state at block boundaries reported by the format adapter.
    PerBlock,

    /// Resets state at section boundaries reported by the format adapter.
    PerSection,

    /// Treats the whole token stream as a single context.
    PerDocument,
}
1852
/// Strategy that decides when an annotation must show its hanja in
/// [`RenderMode::HangulOnly`] because of a homophone collision.
///
/// The strategies disagree on what a collision is:
///
/// `ContextLocal` (the default) flags an annotation only if the same reading
/// occurs with a *different* hanja form inside the same context window.
/// Hangul-only output therefore stays uncluttered, and a Sino-Korean word is
/// glossed only where the surrounding text really makes it ambiguous.
///
/// `DictionaryWide` also flags an annotation whenever any other hanja form in
/// the dictionary shares its reading, whether or not that form appears in the
/// text.  Large reference dictionaries such as the Standard Korean Dictionary
/// list a homophone for nearly every common reading, so this strategy ends up
/// glossing most Sino-Korean words.  It remains available as an opt-in for
/// callers wanting maximal disambiguation; a word that must always be glossed
/// is better marked through [`MatchMark::require_hanja`].
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum HomophoneDetection {
    /// Flag only readings that collide inside the active context window.
    #[default]
    ContextLocal,

    /// Additionally flag readings that any other hanja form in the dictionary
    /// shares.
    DictionaryWide,
}
1881
/// What a user directive does to an annotation its predicate matches.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum DirectiveAction {
    /// The rendered output must keep the original hanja visible.
    RequireHanja,

    /// The rendered output must include a hangul gloss.
    RequireHangul,

    /// The annotation is reduced to the plain primary text of the active
    /// renderer.
    SkipAnnotation,
}
1894
1895/// User rules that adjust annotation presentation policy.
1896///
1897/// Literal helpers cover common hanja-form rules.  Callers that need richer
1898/// matching can add closure predicates over the whole [`Annotation`], which
1899/// keeps the core API independent of CLI-only pattern syntaxes.
1900#[derive(Default)]
1901pub struct UserDirectives<'a> {
1902    rules: Vec<UserDirectiveRule<'a>>,
1903}
1904
1905impl<'a> UserDirectives<'a> {
1906    /// Creates an empty directive set.
1907    pub fn new() -> Self {
1908        Self::default()
1909    }
1910
1911    /// Marks a literal hanja form as requiring visible hanja in output.
1912    pub fn require_hanja(&mut self, hanja: impl Into<String>) {
1913        self.add_literal(hanja, DirectiveAction::RequireHanja);
1914    }
1915
1916    /// Marks a literal hanja form as requiring a visible hangul gloss.
1917    pub fn require_hangul(&mut self, hanja: impl Into<String>) {
1918        self.add_literal(hanja, DirectiveAction::RequireHangul);
1919    }
1920
1921    /// Marks a literal hanja form as not receiving annotation rendering.
1922    pub fn skip_annotation(&mut self, hanja: impl Into<String>) {
1923        self.add_literal(hanja, DirectiveAction::SkipAnnotation);
1924    }
1925
1926    /// Adds a literal hanja-form directive.
1927    pub fn add_literal(&mut self, hanja: impl Into<String>, action: DirectiveAction) {
1928        self.rules.push(UserDirectiveRule {
1929            predicate: UserDirectivePredicate::Literal(hanja.into()),
1930            action,
1931        });
1932    }
1933
1934    /// Adds a predicate directive over the complete annotation metadata.
1935    pub fn add_predicate(
1936        &mut self,
1937        predicate: impl Fn(&Annotation) -> bool + 'a,
1938        action: DirectiveAction,
1939    ) {
1940        self.rules.push(UserDirectiveRule {
1941            predicate: UserDirectivePredicate::Predicate(Box::new(predicate)),
1942            action,
1943        });
1944    }
1945
1946    /// Returns whether no directive rules are configured.
1947    pub fn is_empty(&self) -> bool {
1948        self.rules.is_empty()
1949    }
1950
1951    /// Applies every configured directive to a single output token.
1952    ///
1953    /// Non-[`OutputToken::Annotated`] tokens pass through unchanged. For an
1954    /// annotation, each matching rule sets the corresponding flag in priority
1955    /// of declaration order. This method is the per-token primitive used by
1956    /// streaming pipelines that want to apply directives without buffering.
1957    pub fn apply<S>(&self, token: OutputToken<S>) -> OutputToken<S> {
1958        match token {
1959            OutputToken::Annotated(mut annotation) => {
1960                for rule in &self.rules {
1961                    if !rule.predicate.matches(&annotation) {
1962                        continue;
1963                    }
1964                    match rule.action {
1965                        DirectiveAction::RequireHanja => annotation.require_hanja = true,
1966                        DirectiveAction::RequireHangul => annotation.require_hangul = true,
1967                        DirectiveAction::SkipAnnotation => annotation.skip_annotation = true,
1968                    }
1969                }
1970                OutputToken::Annotated(annotation)
1971            }
1972            token => token,
1973        }
1974    }
1975}
1976
1977struct UserDirectiveRule<'a> {
1978    predicate: UserDirectivePredicate<'a>,
1979    action: DirectiveAction,
1980}
1981
1982enum UserDirectivePredicate<'a> {
1983    Literal(String),
1984    Predicate(Box<dyn Fn(&Annotation) -> bool + 'a>),
1985}
1986
1987impl UserDirectivePredicate<'_> {
1988    fn matches(&self, annotation: &Annotation) -> bool {
1989        match self {
1990            Self::Literal(hanja) => annotation.hanja == *hanja,
1991            Self::Predicate(predicate) => predicate(annotation),
1992        }
1993    }
1994}
1995
1996/// Sets `homophone` on dictionary annotations sharing a reading.
1997///
1998/// Uses [`HomophoneDetection::ContextLocal`], marking only readings that
1999/// collide within the active context window.  Use
2000/// [`mark_homophones_with_detection`] to opt into dictionary-wide marking.
2001pub fn mark_homophones<S, D>(
2002    tokens: impl IntoIterator<Item = OutputToken<S>>,
2003    dictionary: &D,
2004    window: ContextWindow,
2005) -> Vec<OutputToken<S>>
2006where
2007    S: ScopeData,
2008    D: HanjaDictionary + ?Sized,
2009{
2010    mark_homophones_with_detection(tokens, dictionary, window, HomophoneDetection::ContextLocal)
2011}
2012
2013/// Sets `homophone` on dictionary annotations sharing a reading, choosing the
2014/// detection strategy explicitly.
2015///
2016/// With [`HomophoneDetection::ContextLocal`] an annotation is marked only when
2017/// another hanja form with the same reading occurs within the context window,
2018/// so no dictionary index is built.  With
2019/// [`HomophoneDetection::DictionaryWide`] the marker also builds one homophone
2020/// index from the supplied dictionary and falls back to
2021/// [`HanjaDictionary::has_homophone`] for lookup-only dictionaries.  Fallback
2022/// (non-dictionary) annotations are ignored either way because they are
2023/// phonetic fragments rather than known lexical homophones.
2024pub fn mark_homophones_with_detection<S, D>(
2025    tokens: impl IntoIterator<Item = OutputToken<S>>,
2026    dictionary: &D,
2027    window: ContextWindow,
2028    detection: HomophoneDetection,
2029) -> Vec<OutputToken<S>>
2030where
2031    S: ScopeData,
2032    D: HanjaDictionary + ?Sized,
2033{
2034    if window == ContextWindow::Off {
2035        return tokens.into_iter().collect();
2036    }
2037
2038    let index = match detection {
2039        HomophoneDetection::ContextLocal => None,
2040        HomophoneDetection::DictionaryWide => HomophoneIndex::from_dictionary(dictionary),
2041    };
2042    let lookup_fallback = match detection {
2043        HomophoneDetection::ContextLocal => None,
2044        HomophoneDetection::DictionaryWide => index.is_none().then_some(dictionary),
2045    };
2046    ContextMiddleware::new(window, |tokens| {
2047        mark_homophones_in_context(tokens, index.as_ref(), lookup_fallback);
2048    })
2049    .process(tokens)
2050}
2051
2052/// Clears repeat gloss requirements after the first occurrence of each hanja.
2053///
2054/// The first occurrence key is the original hanja form. Later annotations for
2055/// the same form have `first_in_context` set to false and no longer require
2056/// either side to be shown.
2057pub fn filter_first_occurrences<S>(
2058    tokens: impl IntoIterator<Item = OutputToken<S>>,
2059    window: ContextWindow,
2060) -> Vec<OutputToken<S>>
2061where
2062    S: ScopeData,
2063{
2064    ContextMiddleware::new(window, filter_first_occurrences_in_context).process(tokens)
2065}
2066
2067type ContextApply<S> = fn(&mut [OutputToken<S>]);
2068type HomophoneApply<'a, S> = Box<dyn FnMut(&mut [OutputToken<S>]) + 'a>;
2069
2070/// Streaming homophone marker middleware.
2071///
2072/// Context windows that require lookahead buffer only until their configured
2073/// boundary. `PerDocument`, and scoped windows on streams that never emit the
2074/// corresponding boundary, buffer until [`HomophoneMarker::finish`].  For
2075/// example, exact plain-text homophone marking with `PerBlock` is document-wide
2076/// because plain text has no block scopes.
2077pub struct HomophoneMarker<'a, S>
2078where
2079    S: ScopeData,
2080{
2081    inner: ContextMiddleware<S, HomophoneApply<'a, S>>,
2082}
2083
2084impl<'a, S> HomophoneMarker<'a, S>
2085where
2086    S: ScopeData,
2087{
2088    /// Creates a homophone marker for the selected context window using
2089    /// [`HomophoneDetection::ContextLocal`].
2090    ///
2091    /// Use [`HomophoneMarker::with_detection`] to opt into dictionary-wide
2092    /// marking.
2093    pub fn new<D>(dictionary: &'a D, window: ContextWindow) -> Self
2094    where
2095        D: HanjaDictionary + ?Sized,
2096    {
2097        Self::with_detection(dictionary, window, HomophoneDetection::ContextLocal)
2098    }
2099
2100    /// Creates a homophone marker for the selected context window and detection
2101    /// strategy.
2102    ///
2103    /// With [`HomophoneDetection::ContextLocal`] no dictionary index is built;
2104    /// only readings that collide within the context window are marked.  With
2105    /// [`HomophoneDetection::DictionaryWide`] a homophone index (or
2106    /// [`HanjaDictionary::has_homophone`] fallback) is consulted as well.
2107    pub fn with_detection<D>(
2108        dictionary: &'a D,
2109        window: ContextWindow,
2110        detection: HomophoneDetection,
2111    ) -> Self
2112    where
2113        D: HanjaDictionary + ?Sized,
2114    {
2115        let index = match detection {
2116            _ if window == ContextWindow::Off => None,
2117            HomophoneDetection::ContextLocal => None,
2118            HomophoneDetection::DictionaryWide => HomophoneIndex::from_dictionary(dictionary),
2119        };
2120        let lookup_fallback = match detection {
2121            HomophoneDetection::ContextLocal => None,
2122            HomophoneDetection::DictionaryWide => index.is_none().then_some(dictionary),
2123        };
2124        Self {
2125            inner: ContextMiddleware::new(
2126                window,
2127                Box::new(move |tokens| {
2128                    mark_homophones_in_context(tokens, index.as_ref(), lookup_fallback);
2129                }),
2130            ),
2131        }
2132    }
2133
2134    /// Pushes one output token and returns tokens ready for downstream stages.
2135    pub fn push_token(&mut self, token: OutputToken<S>) -> Vec<OutputToken<S>> {
2136        self.inner.push_token(token)
2137    }
2138
2139    /// Finishes the middleware and returns buffered tokens.
2140    pub fn finish(self) -> Vec<OutputToken<S>> {
2141        self.inner.finish()
2142    }
2143}
2144
2145/// Streaming first-occurrence middleware.
2146///
2147/// Repeated annotations inside a context have `first_in_context` cleared and
2148/// presentation requirements removed once the context is flushed.
2149pub struct FirstOccurrenceFilter<S>
2150where
2151    S: ScopeData,
2152{
2153    inner: ContextMiddleware<S, ContextApply<S>>,
2154}
2155
2156impl<S> FirstOccurrenceFilter<S>
2157where
2158    S: ScopeData,
2159{
2160    /// Creates a first-occurrence filter for the selected context window.
2161    pub fn new(window: ContextWindow) -> Self {
2162        Self {
2163            inner: ContextMiddleware::new(window, filter_first_occurrences_in_context::<S>),
2164        }
2165    }
2166
2167    /// Pushes one output token and returns tokens ready for downstream stages.
2168    pub fn push_token(&mut self, token: OutputToken<S>) -> Vec<OutputToken<S>> {
2169        self.inner.push_token(token)
2170    }
2171
2172    /// Finishes the middleware and returns buffered tokens.
2173    pub fn finish(self) -> Vec<OutputToken<S>> {
2174        self.inner.finish()
2175    }
2176}
2177
2178/// Applies literal user directives to annotation policy flags.
2179///
2180/// Rules only set flags; they do not render, remove, or reorder tokens.
2181pub fn apply_user_directives<S>(
2182    tokens: impl IntoIterator<Item = OutputToken<S>>,
2183    directives: &UserDirectives<'_>,
2184) -> Vec<OutputToken<S>> {
2185    apply_user_directives_iter(tokens, directives).collect()
2186}
2187
2188/// Lazily applies literal user directives to an output token stream.
2189///
2190/// Returns an iterator that walks the input tokens without intermediate
2191/// buffering. Use this variant in streaming pipelines that need to chain
2192/// directive application with other lazy stages such as [`render_tokens_iter`].
2193pub fn apply_user_directives_iter<'a, S>(
2194    tokens: impl IntoIterator<Item = OutputToken<S>> + 'a,
2195    directives: &'a UserDirectives<'_>,
2196) -> impl Iterator<Item = OutputToken<S>> + 'a {
2197    tokens.into_iter().map(|token| directives.apply(token))
2198}
2199
2200struct ContextMiddleware<S, F>
2201where
2202    S: ScopeData,
2203    F: FnMut(&mut [OutputToken<S>]),
2204{
2205    window: ContextWindow,
2206    apply: F,
2207    context: Vec<OutputToken<S>>,
2208    scope_boundaries: Vec<bool>,
2209}
2210
2211impl<S, F> ContextMiddleware<S, F>
2212where
2213    S: ScopeData,
2214    F: FnMut(&mut [OutputToken<S>]),
2215{
2216    fn new(window: ContextWindow, apply: F) -> Self {
2217        Self {
2218            window,
2219            apply,
2220            context: Vec::new(),
2221            scope_boundaries: Vec::new(),
2222        }
2223    }
2224
2225    fn process(mut self, tokens: impl IntoIterator<Item = OutputToken<S>>) -> Vec<OutputToken<S>> {
2226        let mut output = Vec::new();
2227        for token in tokens {
2228            output.extend(self.push_token(token));
2229        }
2230        output.extend(self.finish());
2231        output
2232    }
2233
2234    fn push_token(&mut self, token: OutputToken<S>) -> Vec<OutputToken<S>> {
2235        let mut output = Vec::new();
2236        match self.window {
2237            ContextWindow::Off => output.push(token),
2238            ContextWindow::PerDocument => self.context.push(token),
2239            ContextWindow::PerBlock | ContextWindow::PerSection => match &token {
2240                OutputToken::Open(scope) => {
2241                    let is_boundary = match self.window {
2242                        ContextWindow::PerBlock => scope.data().is_block_boundary(),
2243                        ContextWindow::PerSection => scope.data().is_section_boundary(),
2244                        ContextWindow::Off | ContextWindow::PerDocument => false,
2245                    };
2246                    if is_boundary {
2247                        self.flush_context(&mut output);
2248                    }
2249                    self.scope_boundaries.push(is_boundary);
2250                    self.context.push(token);
2251                }
2252                OutputToken::Close => {
2253                    let closes_boundary = self.scope_boundaries.pop().unwrap_or(false);
2254                    self.context.push(token);
2255                    if closes_boundary && self.window == ContextWindow::PerBlock {
2256                        self.flush_context(&mut output);
2257                    }
2258                }
2259                _ => self.context.push(token),
2260            },
2261        }
2262        output
2263    }
2264
2265    fn finish(mut self) -> Vec<OutputToken<S>> {
2266        let mut output = Vec::new();
2267        self.flush_context(&mut output);
2268        output
2269    }
2270
2271    fn flush_context(&mut self, output: &mut Vec<OutputToken<S>>) {
2272        if self.context.is_empty() {
2273            return;
2274        }
2275
2276        (self.apply)(&mut self.context);
2277        output.append(&mut self.context);
2278    }
2279}
2280
/// Lookup table from a reading to every hanja form recorded with it.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
struct HomophoneIndex {
    /// Hanja forms grouped by their reading.
    forms_by_reading: BTreeMap<String, BTreeSet<String>>,
}
2285
2286impl HomophoneIndex {
2287    fn from_dictionary<D>(dictionary: &D) -> Option<Self>
2288    where
2289        D: HanjaDictionary + ?Sized,
2290    {
2291        let mut forms_by_reading = BTreeMap::<String, BTreeSet<String>>::new();
2292        for record in dictionary.entries()? {
2293            forms_by_reading
2294                .entry(record.reading)
2295                .or_default()
2296                .insert(record.hanja);
2297        }
2298        Some(Self { forms_by_reading })
2299    }
2300
2301    fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
2302        self.forms_by_reading
2303            .get(reading)
2304            .is_some_and(|forms| forms.iter().any(|form| form != hanja))
2305    }
2306}
2307
2308fn mark_homophones_in_context<S, D>(
2309    tokens: &mut [OutputToken<S>],
2310    index: Option<&HomophoneIndex>,
2311    lookup_fallback: Option<&D>,
2312) where
2313    D: HanjaDictionary + ?Sized,
2314{
2315    let mut forms_by_reading = BTreeMap::<String, BTreeSet<String>>::new();
2316
2317    for token in tokens.iter() {
2318        if let OutputToken::Annotated(annotation) = token
2319            && annotation.from_dictionary
2320        {
2321            forms_by_reading
2322                .entry(annotation.reading.clone())
2323                .or_default()
2324                .insert(annotation.hanja.clone());
2325        }
2326    }
2327
2328    for token in tokens.iter_mut() {
2329        if let OutputToken::Annotated(annotation) = token {
2330            annotation.homophone = annotation.from_dictionary
2331                && (index.is_some_and(|index| {
2332                    index.has_homophone(&annotation.hanja, &annotation.reading)
2333                }) || lookup_fallback.is_some_and(|dictionary| {
2334                    dictionary.has_homophone(&annotation.hanja, &annotation.reading)
2335                }) || forms_by_reading
2336                    .get(&annotation.reading)
2337                    .is_some_and(|forms| forms.len() > 1));
2338        }
2339    }
2340}
2341
2342fn filter_first_occurrences_in_context<S>(tokens: &mut [OutputToken<S>]) {
2343    let mut seen = BTreeSet::new();
2344
2345    for token in tokens.iter_mut() {
2346        if let OutputToken::Annotated(annotation) = token {
2347            if seen.insert(annotation.hanja.clone()) {
2348                annotation.first_in_context = true;
2349            } else {
2350                annotation.first_in_context = false;
2351                annotation.require_hanja = false;
2352                annotation.require_hangul = false;
2353            }
2354        }
2355    }
2356}
2357
2358/// Renders engine output tokens into annotation-free tokens.
2359///
2360/// Structural and text tokens pass through. Each annotation is expanded into a
2361/// concrete rendered token according to the supplied options, the current
2362/// scope, and the annotation's flags. `options` accepts either a bare
2363/// [`RenderMode`] (via the `From<RenderMode>` impl on [`RenderOptions`]) or a
2364/// full [`RenderOptions`] value.
2365pub fn render_tokens<S, O>(
2366    tokens: impl IntoIterator<Item = OutputToken<S>>,
2367    options: O,
2368) -> Vec<RenderedToken<S>>
2369where
2370    S: ScopeData,
2371    O: Into<RenderOptions>,
2372{
2373    render_tokens_iter(tokens, options).collect()
2374}
2375
2376/// Renders engine output tokens into annotation-free tokens as an iterator.
2377///
2378/// The renderer maintains a small scope stack so that annotation expansion can
2379/// consult the active scope's [`ScopeData::allows_inline_markup`] when
2380/// choosing between an inline-markup form and a parenthesized fallback. Every
2381/// other token maps one-to-one to its rendered counterpart.
2382pub fn render_tokens_iter<S, O>(
2383    tokens: impl IntoIterator<Item = OutputToken<S>>,
2384    options: O,
2385) -> impl Iterator<Item = RenderedToken<S>>
2386where
2387    S: ScopeData,
2388    O: Into<RenderOptions>,
2389{
2390    RendererIter {
2391        upstream: tokens.into_iter(),
2392        renderer: Renderer::new(options),
2393    }
2394}
2395
2396/// Stateful renderer for chunked [`OutputToken`] streams.
2397///
2398/// `Renderer` is the push-based counterpart to [`render_tokens_iter`]. It
2399/// preserves the active scope stack across calls so format writers can consume
2400/// rendered tokens as soon as upstream engine and middleware stages release
2401/// them, without losing inline-markup restrictions from earlier chunks.
2402pub struct Renderer<S>
2403where
2404    S: ScopeData,
2405{
2406    options: RenderOptions,
2407    /// Cached `allows_inline_markup` value for each open scope. Storing the
2408    /// boolean instead of the whole scope keeps the renderer free of an extra
2409    /// `S: Clone` bound at this layer (it already requires it via `ScopeData`)
2410    /// and avoids the cost of cloning adapter-owned data.
2411    markup_stack: Vec<bool>,
2412    /// Number of currently open scopes whose `allows_inline_markup` is
2413    /// `false`. Inline markup is safe at the current cursor only when this
2414    /// counter is zero; otherwise some ancestor forbids markup and a nested
2415    /// allow-markup scope cannot override that restriction.
2416    disallowing_ancestors: usize,
2417    _scope: PhantomData<fn(S)>,
2418}
2419
2420impl<S> Renderer<S>
2421where
2422    S: ScopeData,
2423{
2424    /// Creates a renderer with the supplied rendering options.
2425    pub fn new<O>(options: O) -> Self
2426    where
2427        O: Into<RenderOptions>,
2428    {
2429        Self {
2430            options: options.into(),
2431            markup_stack: Vec::new(),
2432            disallowing_ancestors: 0,
2433            _scope: PhantomData,
2434        }
2435    }
2436
2437    /// Pushes one output token and returns its rendered counterpart.
2438    pub fn push_token(&mut self, token: OutputToken<S>) -> RenderedToken<S> {
2439        match token {
2440            OutputToken::Open(scope) => {
2441                let allows = scope.data().allows_inline_markup();
2442                if !allows {
2443                    self.disallowing_ancestors += 1;
2444                }
2445                self.markup_stack.push(allows);
2446                RenderedToken::Open(scope)
2447            }
2448            OutputToken::Close => {
2449                if let Some(false) = self.markup_stack.pop() {
2450                    // Saturating guard for malformed streams that emit more
2451                    // Close than Open tokens; the renderer should never
2452                    // panic on broken input.
2453                    self.disallowing_ancestors = self.disallowing_ancestors.saturating_sub(1);
2454                }
2455                RenderedToken::Close
2456            }
2457            OutputToken::Text(text) => RenderedToken::Text(text),
2458            OutputToken::Verbatim(text) => RenderedToken::Verbatim(text),
2459            OutputToken::Annotated(annotation) => {
2460                // Inline markup is allowed only when no open ancestor scope
2461                // forbids it. The plain-text reader wraps its input in a
2462                // scope whose `allows_inline_markup` is false, so plain text
2463                // still falls back to parens; HTML and Markdown root
2464                // contexts emit no enclosing scope and therefore start with
2465                // an empty stack, leaving annotations free to use markup.
2466                let allows_inline_markup = self.disallowing_ancestors == 0;
2467                render_annotation(&annotation, &self.options, allows_inline_markup)
2468            }
2469        }
2470    }
2471}
2472
2473struct RendererIter<I, S>
2474where
2475    S: ScopeData,
2476{
2477    upstream: I,
2478    renderer: Renderer<S>,
2479}
2480
2481impl<I, S> Iterator for RendererIter<I, S>
2482where
2483    I: Iterator<Item = OutputToken<S>>,
2484    S: ScopeData,
2485{
2486    type Item = RenderedToken<S>;
2487
2488    fn next(&mut self) -> Option<Self::Item> {
2489        let token = self.upstream.next()?;
2490        Some(self.renderer.push_token(token))
2491    }
2492}
2493
2494fn render_annotation<S>(
2495    annotation: &Annotation,
2496    options: &RenderOptions,
2497    allows_inline_markup: bool,
2498) -> RenderedToken<S> {
2499    if annotation.skip_annotation {
2500        let primary = match options.mode {
2501            RenderMode::HangulOnly | RenderMode::HangulHanjaParens => annotation.reading.clone(),
2502            RenderMode::HanjaHangulParens | RenderMode::Original => annotation.hanja.clone(),
2503            RenderMode::Ruby(RubyBase::OnHangul) => annotation.reading.clone(),
2504            RenderMode::Ruby(RubyBase::OnHanja) => annotation.hanja.clone(),
2505        };
2506        return RenderedToken::Text(primary);
2507    }
2508
2509    match options.mode {
2510        RenderMode::HangulOnly if annotation.require_hanja || annotation.homophone => {
2511            RenderedToken::Text(parens(&annotation.reading, &annotation.hanja))
2512        }
2513        RenderMode::HangulOnly => RenderedToken::Text(annotation.reading.clone()),
2514        RenderMode::HangulHanjaParens => {
2515            RenderedToken::Text(parens(&annotation.reading, &annotation.hanja))
2516        }
2517        RenderMode::HanjaHangulParens => {
2518            RenderedToken::Text(parens(&annotation.hanja, &annotation.reading))
2519        }
2520        RenderMode::Ruby(base) => render_ruby(annotation, base, allows_inline_markup),
2521        RenderMode::Original if annotation.require_hangul => match options.original_gloss {
2522            OriginalGloss::Parens => {
2523                RenderedToken::Text(parens(&annotation.hanja, &annotation.reading))
2524            }
2525            // `Original` keeps hanja as the primary text, so its ruby form
2526            // always uses hanja as the base regardless of any other setting.
2527            OriginalGloss::Ruby => render_ruby(annotation, RubyBase::OnHanja, allows_inline_markup),
2528        },
2529        RenderMode::Original => RenderedToken::Text(annotation.hanja.clone()),
2530    }
2531}
2532
2533fn render_ruby<S>(
2534    annotation: &Annotation,
2535    base: RubyBase,
2536    allows_inline_markup: bool,
2537) -> RenderedToken<S> {
2538    let (base_text, rt_text) = match base {
2539        RubyBase::OnHangul => (&annotation.reading, &annotation.hanja),
2540        RubyBase::OnHanja => (&annotation.hanja, &annotation.reading),
2541    };
2542    if !allows_inline_markup {
2543        return RenderedToken::Text(parens(base_text, rt_text));
2544    }
2545    RenderedToken::Ruby {
2546        base: base_text.clone(),
2547        rt: rt_text.clone(),
2548    }
2549}
2550
/// Formats `primary` immediately followed by `gloss` wrapped in ASCII
/// parentheses, i.e. `primary(gloss)`.
///
/// The arguments are named by role rather than by script because callers
/// pass the reading first in some render modes and the hanja first in
/// others.
fn parens(primary: &str, gloss: &str) -> String {
    // The final length is known up front: both inputs plus the two one-byte
    // ASCII parentheses. Reserving it avoids regrowth while appending.
    let mut output = String::with_capacity(primary.len() + gloss.len() + 2);
    output.push_str(primary);
    output.push('(');
    output.push_str(gloss);
    output.push(')');
    output
}
2559
2560/// Converts plain text through reader, engine, renderer, and writer stages.
2561///
2562/// This is a convenience for the plain-text MVP path. More capable format
2563/// adapters should call the individual stages so they can preserve their own
2564/// structural tokens. The `render` argument accepts either a [`RenderMode`]
2565/// (converted via `From<RenderMode>` for [`RenderOptions`]) or a full
2566/// [`RenderOptions`] value.
2567pub fn convert_plain_text<D, R>(input: &str, dictionary: &D, render: R) -> String
2568where
2569    D: HanjaDictionary + ?Sized,
2570    R: Into<RenderOptions>,
2571{
2572    convert_plain_text_with_options(input, dictionary, render, EngineOptions::default())
2573}
2574
2575/// Converts plain text with explicit hanja conversion engine options.
2576///
2577/// This is the option-aware variant of [`convert_plain_text`].
2578pub fn convert_plain_text_with_options<D, R>(
2579    input: &str,
2580    dictionary: &D,
2581    render: R,
2582    options: EngineOptions,
2583) -> String
2584where
2585    D: HanjaDictionary + ?Sized,
2586    R: Into<RenderOptions>,
2587{
2588    let input_tokens = read_plain_text(input);
2589    let output_tokens = process_tokens_with_options(input_tokens, dictionary, options);
2590    let output_tokens = mark_homophones(output_tokens, dictionary, ContextWindow::PerBlock);
2591    let rendered_tokens = render_tokens(output_tokens, render);
2592    write_plain_text(rendered_tokens)
2593}
gukhanmun_core/lib.rs

gukhanmun_core/
lib.rs