gukhanmun_core/
lib.rs

1// Gukhanmun: Core IR, engine, dictionary traits, and fallback logic for Gukhanmun.
2// Copyright (C) 2026  Hong Minhee
3//
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, either version 3 of the License, or
7// (at your option) any later version.
8//
9// This program is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12// GNU General Public License for more details.
13//
14// You should have received a copy of the GNU General Public License
15// along with this program.  If not, see <https://www.gnu.org/licenses/>.
16
17//! Core types and algorithms for Gukhanmun.
18//!
19//! This crate is the home for the format-neutral intermediate representation,
20//! conversion engine, dictionary traits, lattice segmentation, and fallback
21//! hanja reading logic. Format adapters, command-line I/O, and language
22//! bindings live in separate crates.
23
24#![no_std]
25#![forbid(unsafe_code)]
26#![deny(missing_docs)]
27
28extern crate alloc;
29
30mod fallback;
31mod generated;
32mod segment;
33
34use alloc::boxed::Box;
35use alloc::collections::{BTreeMap, BTreeSet};
36use alloc::string::{String, ToString};
37use alloc::vec::Vec;
38use core::marker::PhantomData;
39
40use fallback::{
41    FallbackPart, FallbackState, fallback_reading_for_run, phoneticize_fallback_run_with_state,
42};
43use generated::unihan_readings::KHANGUL_READINGS;
44use segment::{Segment, segment_text};
45
46/// Error returned by fallible core pipeline entry points.
47///
48/// The core engine is mostly infallible today because dictionary lookup is a
49/// synchronous trait contract. This type is still the common structured error
50/// surface for reader/engine/writer boundaries and for future engine
51/// invariants that callers may need to inspect.
52#[derive(Debug, thiserror::Error)]
53#[non_exhaustive]
54pub enum Error {
55    /// Loading or preparing a dictionary failed before conversion could run.
56    #[error("dictionary load failed: {0}")]
57    DictionaryLoad(String),
58
59    /// Lattice segmentation failed for a specific source string.
60    #[error("segmentation failed for {hanja:?}: {reason}")]
61    Segmentation {
62        /// The hanja source span that could not be segmented.
63        hanja: String,
64
65        /// Human-readable reason for the segmentation failure.
66        reason: String,
67    },
68
69    /// A dictionary or fallback path produced a reading that is not accepted.
70    #[error("invalid hangul reading {reading:?} for hanja {hanja:?}")]
71    InvalidReading {
72        /// The hanja source string associated with the reading.
73        hanja: String,
74
75        /// The rejected hangul reading.
76        reading: String,
77    },
78
79    /// An internal invariant was violated.
80    #[error("internal invariant violated: {0}")]
81    Internal(&'static str),
82
83    /// A boxed error from an extension point that has no more specific core
84    /// variant yet.
85    #[error(transparent)]
86    Other(#[from] Box<dyn core::error::Error + Send + Sync + 'static>),
87}
88
/// Policy deciding how the token stream reacts to recoverable errors.
///
/// `Strict`, the default, returns the first recoverable reader error.
/// `Lenient` logs the error and emits the original unrecognized region as a
/// verbatim token, which lets downstream tokens keep flowing.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub enum Recovery {
    /// Stop processing and return the first reader, engine, or writer error.
    #[default]
    Strict,

    /// Keep recoverable bad input regions as-is and carry on processing.
    Lenient,
}
103
104/// A recoverable reader error plus the original source region.
105///
106/// Readers use this value when they can identify a malformed region and know
107/// how to preserve its source bytes or text in lenient mode. Strict mode
108/// returns the stored error directly.
109#[derive(Debug)]
110pub struct RecoverableInputError {
111    original: String,
112    error: Error,
113}
114
115impl RecoverableInputError {
116    /// Creates a recoverable input error from original source and cause.
117    pub fn new(original: String, error: Error) -> Self {
118        Self { original, error }
119    }
120
121    /// Returns the original source region that can be preserved in lenient
122    /// mode.
123    pub fn original(&self) -> &str {
124        &self.original
125    }
126
127    /// Returns the structured error describing why the region was rejected.
128    pub fn error(&self) -> &Error {
129        &self.error
130    }
131
132    /// Consumes the error and returns the original source plus cause.
133    pub fn into_parts(self) -> (String, Error) {
134        (self.original, self.error)
135    }
136}
137
/// Adapter-owned payload carried by an intermediate-representation scope.
///
/// For the engine this trait is an opaque policy boundary. A format adapter
/// is free to encode HTML elements, Markdown events, or plain-text scopes in
/// its concrete type; the engine only asks whether text has to be preserved
/// and whether later stages are allowed to insert inline markup.
pub trait ScopeData: Clone + 'static {
    /// Reports whether text inside this scope must pass through untouched.
    fn is_preserve(&self) -> bool;

    /// Reports whether inline markup may be inserted inside this scope.
    ///
    /// The flag expresses *structural* permission for markup at the current
    /// position; it says nothing about whether the engine actually converts
    /// text here. A scope may legitimately report `true` from
    /// [`Self::is_preserve`] (so no annotation is produced) and still report
    /// `true` here, because preserve alone does not restrict what a deeper
    /// non-preserved child may emit. Adapters should return `false` only when
    /// an HTML5 text-only content model (such as `<title>` or `<option>`) or
    /// an analogous host rule really forbids markup at this position.
    ///
    /// Scope-aware renderers consider inline markup allowed only when *every*
    /// open ancestor reports `true`; a nested allow-markup scope cannot
    /// re-enable markup that an ancestor has forbidden.
    fn allows_inline_markup(&self) -> bool {
        true
    }

    /// Reports whether this scope resets block-oriented stateful stages.
    fn is_block_boundary(&self) -> bool {
        false
    }

    /// Reports whether this scope resets section-oriented stateful stages.
    fn is_section_boundary(&self) -> bool {
        false
    }
}
176
/// One structural scope of the format-neutral token stream.
///
/// A `Scope` holds nothing but adapter-owned data. The engine may clone and
/// stack scopes, yet it never looks at the concrete data except through the
/// `ScopeData` methods.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Scope<S> {
    data: S,
}

impl<S> Scope<S> {
    /// Wraps adapter-specific data in a scope.
    pub fn new(data: S) -> Self {
        Scope { data }
    }

    /// Borrows the adapter-specific scope data.
    pub fn data(&self) -> &S {
        let Scope { data } = self;
        data
    }

    /// Unwraps the scope, handing back its adapter-specific data.
    pub fn into_data(self) -> S {
        let Scope { data } = self;
        data
    }
}
203
204/// A token emitted by a reader before hanja conversion has run.
205///
206/// This type intentionally has no annotation variant: annotations are produced
207/// by the engine and consumed by renderers, so input adapters cannot inject
208/// already-converted positions into the stream.
209#[derive(Clone, Debug, Eq, PartialEq)]
210pub enum InputToken<S> {
211    /// Enters a structural scope.
212    Open(Scope<S>),
213
214    /// Leaves the most recent structural scope.
215    Close,
216
217    /// Text that the engine may convert unless a preserving scope is active.
218    Text(String),
219
220    /// Text that must pass through untouched.
221    Verbatim(String),
222}
223
224/// A token emitted by the engine after hanja conversion.
225///
226/// Most tokens pass through from `InputToken`, but converted dictionary matches
227/// become `Annotated` so middlewares and renderers can choose their final
228/// surface form.
229#[derive(Clone, Debug, Eq, PartialEq)]
230pub enum OutputToken<S> {
231    /// Enters a structural scope.
232    Open(Scope<S>),
233
234    /// Leaves the most recent structural scope.
235    Close,
236
237    /// Text that needs no annotation-aware rendering.
238    Text(String),
239
240    /// Text that must pass through untouched.
241    Verbatim(String),
242
243    /// A converted hanja word plus metadata for later stages.
244    Annotated(Annotation),
245}
246
247/// A token emitted by a renderer after all annotations have been expanded.
248///
249/// Writers consume this stream because it cannot contain unrendered
250/// annotations. That makes the renderer-to-writer boundary explicit in the type
251/// system.
252#[derive(Clone, Debug, Eq, PartialEq)]
253pub enum RenderedToken<S> {
254    /// Enters a structural scope.
255    Open(Scope<S>),
256
257    /// Leaves the most recent structural scope.
258    Close,
259
260    /// Text ready for serialization.
261    Text(String),
262
263    /// Verbatim text ready for serialization.
264    Verbatim(String),
265
266    /// A structural ruby annotation pairing a base text with an `rt` gloss.
267    ///
268    /// Writers serialize this in a format-appropriate way: HTML emits a
269    /// `<ruby>` element, Markdown emits inline HTML, and plain text falls back
270    /// to parenthesized text. Because the variant carries the base and gloss
271    /// as separate strings rather than pre-built markup, each writer is
272    /// responsible for escaping the contents according to its own rules — the
273    /// renderer never injects raw HTML produced by string concatenation.
274    ///
275    /// Renderers only emit this variant when the active scope reports
276    /// [`ScopeData::allows_inline_markup`] as `true`; scopes that disallow
277    /// inline markup receive a plain `Text` fallback instead.
278    Ruby {
279        /// Base text shown as the primary side of the ruby annotation.
280        base: String,
281
282        /// Gloss text shown in the `rt` position.
283        rt: String,
284    },
285}
286
/// Metadata describing a dictionary-backed hanja conversion.
///
/// The engine populates this value whenever it turns source hanja into a
/// hangul reading. Its flags describe known constraints, and middlewares may
/// adjust them before a renderer settles on the concrete output form.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Annotation {
    /// Original hanja text as it appeared in the input.
    pub hanja: String,

    /// Hangul reading chosen for the hanja text.
    pub reading: String,

    /// Whether a different hanja form in the active context has the same
    /// reading.
    pub homophone: bool,

    /// Whether the rendered output has to keep the original hanja visible.
    pub require_hanja: bool,

    /// Whether the rendered output has to include a hangul gloss when hanja
    /// remains primary.
    pub require_hangul: bool,

    /// Whether this is the first occurrence within the active context window.
    pub first_in_context: bool,

    /// Whether renderers should collapse this annotation to its primary plain
    /// text form rather than adding annotation markup or parentheses.
    pub skip_annotation: bool,

    /// Whether this annotation originated from a dictionary match.
    pub from_dictionary: bool,
}
320
/// Rendering constraints that a dictionary attaches to a match.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct MatchMark {
    /// Whether the hanja form of this dictionary entry must always be shown.
    pub require_hanja: bool,

    /// Whether the hangul reading of this dictionary entry must always be
    /// shown.
    pub require_hangul: bool,
}
330
331/// A complete dictionary entry exposed for batch policy analysis.
332///
333/// Conversion only needs prefix lookup through [`HanjaDictionary::matches_at`],
334/// but middlewares such as homophone marking need to reason about the effective
335/// entry set without repeatedly probing the dictionary. Backends that can
336/// enumerate entries should return these records from
337/// [`HanjaDictionary::entries`].
338#[derive(Clone, Debug, Eq, PartialEq)]
339pub struct DictionaryRecord {
340    /// The hanja spelling stored as a dictionary key.
341    pub hanja: String,
342
343    /// The hangul reading selected for this hanja spelling.
344    pub reading: String,
345
346    /// Dictionary-provided rendering constraints for this entry.
347    pub mark: MatchMark,
348}
349
350/// A dictionary match that starts at the queried cursor position.
351#[derive(Clone, Debug, Eq, PartialEq)]
352pub struct Match {
353    /// The matched prefix length in UTF-8 bytes.
354    pub byte_len: usize,
355
356    /// The hangul reading for the matched hanja prefix.
357    pub reading: String,
358
359    /// Dictionary-provided rendering constraints for this match.
360    pub mark: MatchMark,
361}
362
363/// A hanja dictionary queried by the conversion engine.
364///
365/// The key operation returns every entry that starts at the beginning of the
366/// supplied string. This shape supports lattice segmentation because the
367/// engine must consider every candidate path through a hanja run.
368pub trait HanjaDictionary {
369    /// Yields every dictionary match that starts at the beginning of `s`.
370    fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a>;
371
372    /// Returns the greatest dictionary entry length in Unicode scalar values.
373    fn max_word_chars(&self) -> Option<usize> {
374        None
375    }
376
377    /// Enumerates complete dictionary entries when the backend supports it.
378    ///
379    /// The default returns `None`, which keeps custom lookup-only dictionaries
380    /// valid. Homophone-aware middlewares use this as an optional batch path so
381    /// built-in backends can avoid per-token full-dictionary scans.
382    fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
383        None
384    }
385
386    /// Returns whether another hanja spelling has the same hangul reading.
387    fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
388        self.entries().is_some_and(|mut entries| {
389            entries.any(|record| record.hanja != hanja && record.reading == reading)
390        })
391    }
392}
393
394impl<D> HanjaDictionary for &D
395where
396    D: HanjaDictionary + ?Sized,
397{
398    fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
399        (**self).matches_at(s)
400    }
401
402    fn max_word_chars(&self) -> Option<usize> {
403        (**self).max_word_chars()
404    }
405
406    fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
407        (**self).entries()
408    }
409
410    fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
411        (**self).has_homophone(hanja, reading)
412    }
413}
414
415impl<D> HanjaDictionary for Box<D>
416where
417    D: HanjaDictionary + ?Sized,
418{
419    fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
420        (**self).matches_at(s)
421    }
422
423    fn max_word_chars(&self) -> Option<usize> {
424        (**self).max_word_chars()
425    }
426
427    fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
428        (**self).entries()
429    }
430
431    fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
432        (**self).has_homophone(hanja, reading)
433    }
434}
435
436/// Per-character Unihan fallback readings exposed as a dictionary.
437///
438/// This type reads the same generated `kHangul` table used by the engine's
439/// fallback phoneticizer, but it deliberately returns canonical pre-initial
440/// sound law readings. Stateful orthographic rules such as the initial sound
441/// law, `列`/`律`, and numeral grouping remain engine fallback behavior rather
442/// than dictionary behavior.
443#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
444pub struct UnihanCharDict;
445
446impl HanjaDictionary for UnihanCharDict {
447    fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
448        let matched = s.chars().next().and_then(|ch| {
449            khangul_reading(ch).map(|reading| Match {
450                byte_len: ch.len_utf8(),
451                reading: reading.to_string(),
452                mark: MatchMark::default(),
453            })
454        });
455        Box::new(matched.into_iter())
456    }
457
458    fn max_word_chars(&self) -> Option<usize> {
459        Some(1)
460    }
461
462    fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
463        Some(Box::new(KHANGUL_READINGS.iter().map(|(hanja, reading)| {
464            DictionaryRecord {
465                hanja: hanja.to_string(),
466                reading: reading.to_string(),
467                mark: MatchMark::default(),
468            }
469        })))
470    }
471
472    fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
473        let mut chars = hanja.chars();
474        let Some(hanja) = chars.next() else {
475            return false;
476        };
477        if chars.next().is_some() {
478            return false;
479        }
480        KHANGUL_READINGS
481            .iter()
482            .any(|&(other_hanja, other_reading)| other_hanja != hanja && other_reading == reading)
483    }
484}
485
/// A dictionary composition that preserves caller-supplied priority order.
///
/// Dictionaries are stored from highest to lowest priority. During lookup,
/// matches of different byte lengths are all returned so the lattice segmenter
/// can still compare shorter high-priority entries with longer low-priority
/// entries. When two dictionaries produce a match with the same byte length,
/// only the first one is kept.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ChainDictionary<D> {
    dictionaries: Vec<D>,
}

// `Default` is written by hand instead of derived: the derive would add a
// `D: Default` bound, although an empty chain needs no default dictionary.
impl<D> Default for ChainDictionary<D> {
    fn default() -> Self {
        Self::new()
    }
}

impl<D> ChainDictionary<D> {
    /// Creates an empty chain.
    pub fn new() -> Self {
        Self {
            dictionaries: Vec::new(),
        }
    }

    /// Appends a dictionary with lower priority than the existing entries.
    pub fn push(&mut self, dictionary: D) {
        self.dictionaries.push(dictionary);
    }

    /// Returns the number of dictionaries in the chain.
    pub fn len(&self) -> usize {
        self.dictionaries.len()
    }

    /// Returns whether the chain contains no dictionaries.
    pub fn is_empty(&self) -> bool {
        self.dictionaries.is_empty()
    }

    /// Returns the chained dictionaries in priority order.
    pub fn dictionaries(&self) -> &[D] {
        &self.dictionaries
    }

    /// Consumes the chain and returns its dictionaries in priority order.
    pub fn into_dictionaries(self) -> Vec<D> {
        self.dictionaries
    }
}

// Collecting keeps iteration order, so the first item yielded becomes the
// highest-priority dictionary.
impl<D> FromIterator<D> for ChainDictionary<D> {
    fn from_iter<T: IntoIterator<Item = D>>(iter: T) -> Self {
        Self {
            dictionaries: iter.into_iter().collect(),
        }
    }
}
539
540impl<D> HanjaDictionary for ChainDictionary<D>
541where
542    D: HanjaDictionary,
543{
544    fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
545        let mut seen_lengths = BTreeSet::new();
546        let mut matches = Vec::new();
547
548        for dictionary in &self.dictionaries {
549            for matched in dictionary.matches_at(s) {
550                if seen_lengths.insert(matched.byte_len) {
551                    matches.push(matched);
552                }
553            }
554        }
555
556        matches.sort_by_key(|matched| matched.byte_len);
557        Box::new(matches.into_iter())
558    }
559
560    fn max_word_chars(&self) -> Option<usize> {
561        let mut max = None;
562        for dictionary in &self.dictionaries {
563            let word_chars = dictionary.max_word_chars()?;
564            max = Some(max.map_or(word_chars, |current: usize| current.max(word_chars)));
565        }
566        max
567    }
568
569    fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
570        let mut records = BTreeMap::<String, DictionaryRecord>::new();
571
572        for dictionary in &self.dictionaries {
573            for record in dictionary.entries()? {
574                records.entry(record.hanja.clone()).or_insert(record);
575            }
576        }
577
578        Some(Box::new(records.into_values()))
579    }
580
581    fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
582        if let Some(mut records) = self.entries() {
583            return records.any(|record| record.hanja != hanja && record.reading == reading);
584        }
585
586        self.dictionaries
587            .iter()
588            .any(|dictionary| dictionary.has_homophone(hanja, reading))
589    }
590}
591
592fn khangul_reading(ch: char) -> Option<&'static str> {
593    KHANGUL_READINGS
594        .binary_search_by_key(&ch, |(hanja, _)| *hanja)
595        .ok()
596        .map(|index| KHANGUL_READINGS[index].1)
597}
598
599/// Engine-level options that affect hanja conversion before rendering.
600///
601/// These options apply to fallback text that is not covered by the supplied
602/// dictionary. Dictionary matches are assumed to already contain the desired
603/// reading and are not rewritten by fallback orthography rules.
604#[derive(Clone, Copy, Debug, Eq, PartialEq)]
605pub struct EngineOptions {
606    /// How hanja-containing spans are split into dictionary and fallback
607    /// segments.
608    pub segmentation: SegmentationStrategy,
609
610    /// Whether fallback readings should apply South Korean initial sound law.
611    pub initial_sound_law: bool,
612
613    /// How fallback hanja numerals are rendered.
614    pub numeral_strategy: NumeralStrategy,
615}
616
617impl Default for EngineOptions {
618    fn default() -> Self {
619        Self {
620            segmentation: SegmentationStrategy::Lattice,
621            initial_sound_law: true,
622            numeral_strategy: NumeralStrategy::HangulPhonetic,
623        }
624    }
625}
626
/// How spans containing hanja are segmented.
///
/// `Lattice` looks at every dictionary path and picks the best coverage,
/// whereas `Eager` greedily takes the longest match at each cursor. Callers
/// that value speed over segmentation accuracy can use the eager strategy to
/// reduce work.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum SegmentationStrategy {
    /// Maximize dictionary coverage through dynamic programming.
    #[default]
    Lattice,

    /// Segment left to right, eagerly taking the longest match.
    Eager,
}
643
/// How hanja numerals found in fallback text are rendered.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum NumeralStrategy {
    /// Render hanja numerals as hangul phonetic readings.
    ///
    /// Fallback annotations are emitted under this strategy, so renderers can
    /// still surface the original hanja in annotation-oriented render modes.
    HangulPhonetic,

    /// Turn positional, digit-only hanja numerals into Arabic digits.
    ///
    /// Arabic normalization produces plain text instead of annotations. As a
    /// consequence, renderers and user directives cannot recover the original
    /// numeral hanja of the normalized span later on.
    PositionalArabic,

    /// Turn additive hanja numerals with place markers into Arabic digits.
    ///
    /// The parser covers small units such as `十`, `百`, and `千` as well as
    /// large units through `澗`. For malformed or overflowing numerals the
    /// run falls back to [`NumeralStrategy::HangulPhonetic`].
    AdditiveArabic,

    /// Pick Arabic normalization in common numeric contexts and keep the
    /// hangul phonetic fallback behavior everywhere else.
    ///
    /// Well-formed additive numerals become Arabic. Pure positional digit
    /// runs are normalized if they contain at least four digits (matching
    /// common year notation) or if a unit hanja (`年月日時分秒號世紀` and so
    /// on) follows immediately. All other numerals stay hangul annotations.
    Smart,
}
678
679#[derive(Clone, Debug, Eq, PartialEq)]
680struct DictionaryEntry {
681    reading: String,
682    mark: MatchMark,
683}
684
685/// A small in-memory dictionary backed by an ordered map.
686///
687/// This implementation is intended for tests, user-supplied custom entries,
688/// and early pipeline validation. It returns all prefix matches at a cursor so
689/// the engine can score every candidate path through a hanja run.
690#[derive(Clone, Debug, Default, Eq, PartialEq)]
691pub struct MapDictionary {
692    entries: BTreeMap<String, DictionaryEntry>,
693    max_word_chars: Option<usize>,
694}
695
696impl MapDictionary {
697    /// Creates an empty map dictionary.
698    pub fn new() -> Self {
699        Self::default()
700    }
701
702    /// Inserts an entry with no special rendering constraints.
703    pub fn insert(&mut self, hanja: impl Into<String>, reading: impl Into<String>) {
704        self.insert_marked(hanja, reading, MatchMark::default());
705    }
706
707    /// Inserts an entry with dictionary-provided rendering constraints.
708    pub fn insert_marked(
709        &mut self,
710        hanja: impl Into<String>,
711        reading: impl Into<String>,
712        mark: MatchMark,
713    ) {
714        let hanja = hanja.into();
715        let word_chars = hanja.chars().count();
716        self.max_word_chars = Some(self.max_word_chars.map_or(word_chars, |max| {
717            if word_chars > max { word_chars } else { max }
718        }));
719        self.entries.insert(
720            hanja,
721            DictionaryEntry {
722                reading: reading.into(),
723                mark,
724            },
725        );
726    }
727
728    /// Returns whether the dictionary has no entries.
729    pub fn is_empty(&self) -> bool {
730        self.entries.is_empty()
731    }
732
733    /// Returns the number of dictionary entries.
734    pub fn len(&self) -> usize {
735        self.entries.len()
736    }
737}
738
739impl HanjaDictionary for MapDictionary {
740    fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
741        Box::new(
742            self.entries
743                .iter()
744                .filter(move |(hanja, _)| s.starts_with(hanja.as_str()))
745                .map(|(hanja, entry)| Match {
746                    byte_len: hanja.len(),
747                    reading: entry.reading.clone(),
748                    mark: entry.mark,
749                }),
750        )
751    }
752
753    fn max_word_chars(&self) -> Option<usize> {
754        self.max_word_chars
755    }
756
757    fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
758        Some(Box::new(self.entries.iter().map(|(hanja, entry)| {
759            DictionaryRecord {
760                hanja: hanja.clone(),
761                reading: entry.reading.clone(),
762                mark: entry.mark,
763            }
764        })))
765    }
766
767    fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
768        self.entries
769            .iter()
770            .any(|(other_hanja, entry)| other_hanja != hanja && entry.reading == reading)
771    }
772}
773
774/// Scope data used by the plain-text adapter.
775///
776/// Plain text has no preserved regions or block boundaries, and inline markup
777/// such as `<ruby>` is not meaningful in a plain-text stream. Reporting
778/// [`ScopeData::allows_inline_markup`] as `false` lets scope-aware renderers
779/// fall back to parenthesized text before any [`RenderedToken::Ruby`] reaches
780/// the plain-text writer.
781#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
782pub struct PlainScopeData;
783
784impl ScopeData for PlainScopeData {
785    fn is_preserve(&self) -> bool {
786        false
787    }
788
789    fn allows_inline_markup(&self) -> bool {
790        false
791    }
792}
793
794/// Reads a plain-text string into the core input-token stream.
795///
796/// The adapter wraps the input in a plain scope and emits the entire input as a
797/// single `Text` token.
798pub fn read_plain_text(input: &str) -> Vec<InputToken<PlainScopeData>> {
799    Vec::from([
800        InputToken::Open(Scope::new(PlainScopeData)),
801        InputToken::Text(input.to_string()),
802        InputToken::Close,
803    ])
804}
805
806/// Writes rendered plain-text tokens back to a string.
807///
808/// Structural tokens are ignored because plain text has no serialized scope
809/// markers. `Text` and `Verbatim` tokens are concatenated in stream order.
810/// `Ruby` tokens are not expected because [`PlainScopeData`] disallows inline
811/// markup, but they are defensively serialized as `base(rt)` rather than
812/// dropped silently if one ever reaches the writer.
813pub fn write_plain_text<S>(tokens: impl IntoIterator<Item = RenderedToken<S>>) -> String {
814    let mut output = String::new();
815    for token in tokens {
816        match token {
817            RenderedToken::Open(_) | RenderedToken::Close => {}
818            RenderedToken::Text(text) | RenderedToken::Verbatim(text) => output.push_str(&text),
819            RenderedToken::Ruby { base, rt } => {
820                output.push_str(&parens(&base, &rt));
821            }
822        }
823    }
824    output
825}
826
827/// Processes input tokens with the default hanja conversion engine options.
828///
829/// The engine preserves structural and verbatim tokens, skips text when the
830/// current scope is preserving, and uses lattice segmentation to annotate
831/// dictionary and fallback matches inside text tokens.
832pub fn process_tokens<S, D>(
833    tokens: impl IntoIterator<Item = InputToken<S>>,
834    dictionary: &D,
835) -> Vec<OutputToken<S>>
836where
837    S: ScopeData,
838    D: HanjaDictionary + ?Sized,
839{
840    process_tokens_iter(tokens, dictionary).collect()
841}
842
843/// Processes input tokens through the default engine options and returns an
844/// iterator over the collected output.
845///
846/// This is an iterator-shaped compatibility adapter, not the low-level
847/// streaming surface: it consumes the supplied input before returning. For
848/// true incremental processing, use [`Engine`] directly and call
849/// [`Engine::push_token`] as chunks arrive.
850pub fn process_tokens_iter<S, D>(
851    tokens: impl IntoIterator<Item = InputToken<S>>,
852    dictionary: &D,
853) -> alloc::vec::IntoIter<OutputToken<S>>
854where
855    S: ScopeData,
856    D: HanjaDictionary + ?Sized,
857{
858    process_tokens_with_options(tokens, dictionary, EngineOptions::default()).into_iter()
859}
860
861/// Processes input tokens with explicit hanja conversion engine options.
862///
863/// This is the lower-level entry point for callers that need to disable
864/// fallback initial sound law or choose a non-default numeral strategy.
865pub fn process_tokens_with_options<S, D>(
866    tokens: impl IntoIterator<Item = InputToken<S>>,
867    dictionary: &D,
868    options: EngineOptions,
869) -> Vec<OutputToken<S>>
870where
871    S: ScopeData,
872    D: HanjaDictionary + ?Sized,
873{
874    let mut engine = Engine::collecting(dictionary, options);
875    let mut output = Vec::new();
876
877    for token in tokens {
878        output.extend(engine.push_token(token));
879    }
880
881    output.extend(engine.finish());
882    output
883}
884
885/// Processes input tokens through explicit engine options and returns an
886/// iterator over the collected output.
887///
888/// This convenience adapter preserves the existing collect-into-`Vec` behavior
889/// while exposing an iterator-shaped API for callers that compose pipeline
890/// stages. Use [`Engine`] for chunk-by-chunk output.
891pub fn process_tokens_iter_with_options<S, D>(
892    tokens: impl IntoIterator<Item = InputToken<S>>,
893    dictionary: &D,
894    options: EngineOptions,
895) -> alloc::vec::IntoIter<OutputToken<S>>
896where
897    S: ScopeData,
898    D: HanjaDictionary + ?Sized,
899{
900    process_tokens_with_options(tokens, dictionary, options).into_iter()
901}
902
903/// Resolves a fallible reader token stream into recovered input tokens.
904///
905/// This is the single place where the stream-level [`Recovery`] policy is
906/// applied to a reader's output. Format adapters (such as the HTML scanner)
907/// emit `Ok(InputToken)` for well-formed regions and
908/// `Err(RecoverableInputError)` for malformed regions they can describe and
909/// preserve; this function turns that stream into the plain
910/// [`InputToken`] sequence the rest of the pipeline consumes:
911///
912///  -  In [`Recovery::Strict`] mode the first error stops processing and its
913///     cause is returned, so the caller never sees a partial token stream.
914///  -  In [`Recovery::Lenient`] mode each error is logged at `warn` level once
915///     and replaced by an [`InputToken::Verbatim`] holding the original source
916///     region, so the malformed bytes pass through untouched while surrounding
917///     tokens continue to flow.
918///
919/// It sits one stage before the [`Engine`]: feed its output into
920/// [`process_tokens_with_options`] or a streaming [`Engine`]. The recovery-aware
921/// engine entry points ([`process_fallible_tokens`] and
922/// [`process_fallible_tokens_with_options`]) are thin wrappers that call this
923/// and then run the engine.
924pub fn recover_input_tokens<S>(
925    tokens: impl IntoIterator<Item = Result<InputToken<S>, RecoverableInputError>>,
926    recovery: Recovery,
927) -> Result<Vec<InputToken<S>>, Error>
928where
929    S: ScopeData,
930{
931    let mut recovered = Vec::new();
932    for token in tokens {
933        recovered.push(recover_input_token(token, recovery)?);
934    }
935    Ok(recovered)
936}
937
938/// Resolves one fallible reader item according to a [`Recovery`] policy.
939///
940/// This is the per-token form of [`recover_input_tokens`] for streaming
941/// pipelines. In strict mode an error is returned immediately. In lenient mode
942/// the error is logged once and replaced with an [`InputToken::Verbatim`]
943/// carrying the original malformed region.
944pub fn recover_input_token<S>(
945    token: Result<InputToken<S>, RecoverableInputError>,
946    recovery: Recovery,
947) -> Result<InputToken<S>, Error>
948where
949    S: ScopeData,
950{
951    match token {
952        Ok(token) => Ok(token),
953        Err(error) => match recovery {
954            Recovery::Strict => Err(error.into_parts().1),
955            Recovery::Lenient => {
956                let (original, error) = error.into_parts();
957                tracing::warn!(error = %error, "recovering from input reader error");
958                Ok(InputToken::Verbatim(original))
959            }
960        },
961    }
962}
963
964/// Processes fallible input tokens with default engine options.
965///
966/// Reader errors are handled according to `recovery`. In strict mode the first
967/// error is returned. In lenient mode each recoverable region is logged and
968/// emitted as `OutputToken::Verbatim`, after which later tokens continue
969/// through the normal engine path.
970pub fn process_fallible_tokens<S, D>(
971    tokens: impl IntoIterator<Item = Result<InputToken<S>, RecoverableInputError>>,
972    dictionary: &D,
973    recovery: Recovery,
974) -> Result<Vec<OutputToken<S>>, Error>
975where
976    S: ScopeData,
977    D: HanjaDictionary + ?Sized,
978{
979    process_fallible_tokens_with_options(tokens, dictionary, EngineOptions::default(), recovery)
980}
981
982/// Processes fallible input tokens with explicit engine options.
983///
984/// This is the recovery-aware counterpart to
985/// [`process_tokens_with_options`]. It does not make the dictionary trait
986/// fallible; it only handles reader errors that carry enough original source
987/// text for lenient preservation.
988pub fn process_fallible_tokens_with_options<S, D>(
989    tokens: impl IntoIterator<Item = Result<InputToken<S>, RecoverableInputError>>,
990    dictionary: &D,
991    options: EngineOptions,
992    recovery: Recovery,
993) -> Result<Vec<OutputToken<S>>, Error>
994where
995    S: ScopeData,
996    D: HanjaDictionary + ?Sized,
997{
998    let recovered = recover_input_tokens(tokens, recovery)?;
999    Ok(process_tokens_with_options(recovered, dictionary, options))
1000}
1001
/// Stateful hanja conversion engine for chunked token streams.
///
/// `Engine` is the low-level streaming surface. Call [`Engine::push_token`] for
/// each incoming token and then [`Engine::finish`] once the upstream reader is
/// exhausted. When the dictionary reports a maximum word length, text chunks are
/// buffered only at the tail so dictionary matches can cross chunk boundaries
/// without requiring the whole document in memory. A trailing fallback hanja run
/// is also kept buffered until a non-convertible boundary or EOF so render modes
/// that expose annotation spans match one-shot conversion. Dictionaries with an
/// unknown maximum keep hanja-containing text until a non-convertible boundary
/// or EOF so long custom entries remain observable.
pub struct Engine<'a, S, D>
where
    S: ScopeData,
    D: HanjaDictionary + ?Sized,
{
    /// Dictionary used for segmentation and for the lookahead bound
    /// (`max_word_chars`).
    dictionary: &'a D,
    /// Conversion options passed to segmentation and fallback processing.
    options: EngineOptions,
    /// Stack of currently open scopes; pushed on `Open`, popped on `Close`,
    /// and consulted to decide whether text is preserved.
    scopes: Vec<Scope<S>>,
    /// Text received through `push_token` that has not been emitted yet.
    pending_text: String,
    /// Byte length of `pending_text` at the moment it was last found to be a
    /// fallback run that could not be flushed; `None` once anything is
    /// flushed or unrelated text is appended.
    pending_unflushable_fallback_run_bytes: Option<usize>,
    /// Fallback reading context carried between flushed pieces of text and
    /// reset by `reset_fallback_context`.
    fallback_state: FallbackState,
    /// When `true`, `push_token` tries to emit safe prefixes after every text
    /// token; when `false`, text is only emitted on boundaries and at the end.
    incremental_flush: bool,
}
1026
1027impl<'a, S, D> Engine<'a, S, D>
1028where
1029    S: ScopeData,
1030    D: HanjaDictionary + ?Sized,
1031{
1032    /// Creates a streaming engine with default options.
1033    pub fn new(dictionary: &'a D) -> Self {
1034        Self::with_options(dictionary, EngineOptions::default())
1035    }
1036
1037    /// Creates a streaming engine with explicit conversion options.
1038    pub fn with_options(dictionary: &'a D, options: EngineOptions) -> Self {
1039        Self::with_incremental_flush(dictionary, options, true)
1040    }
1041
1042    fn collecting(dictionary: &'a D, options: EngineOptions) -> Self {
1043        Self::with_incremental_flush(dictionary, options, false)
1044    }
1045
1046    fn with_incremental_flush(
1047        dictionary: &'a D,
1048        options: EngineOptions,
1049        incremental_flush: bool,
1050    ) -> Self {
1051        tracing::debug!(
1052            strategy = ?options.segmentation,
1053            "engine created with segmentation strategy"
1054        );
1055        Self {
1056            dictionary,
1057            options,
1058            scopes: Vec::new(),
1059            pending_text: String::new(),
1060            pending_unflushable_fallback_run_bytes: None,
1061            fallback_state: FallbackState::default(),
1062            incremental_flush,
1063        }
1064    }
1065
1066    /// Pushes one input token and returns output tokens that are now safe to
1067    /// emit.
1068    pub fn push_token(&mut self, token: InputToken<S>) -> Vec<OutputToken<S>> {
1069        let mut output = Vec::new();
1070        match token {
1071            InputToken::Open(scope) => {
1072                self.flush_into(&mut output);
1073                if scope.data().is_block_boundary() {
1074                    self.reset_fallback_context();
1075                }
1076                self.scopes.push(scope.clone());
1077                output.push(OutputToken::Open(scope));
1078            }
1079            InputToken::Close => {
1080                self.flush_into(&mut output);
1081                let closes_block_boundary = self
1082                    .scopes
1083                    .pop()
1084                    .is_some_and(|scope| scope.data().is_block_boundary());
1085                output.push(OutputToken::Close);
1086                if closes_block_boundary {
1087                    self.reset_fallback_context();
1088                }
1089            }
1090            InputToken::Text(text) => {
1091                if self
1092                    .scopes
1093                    .last()
1094                    .is_some_and(|scope| scope.data().is_preserve())
1095                {
1096                    self.flush_into(&mut output);
1097                    self.reset_fallback_context();
1098                    output.push(OutputToken::Text(text));
1099                } else {
1100                    let previous_pending_bytes = self.pending_text.len();
1101                    self.pending_text.push_str(&text);
1102                    if self
1103                        .pending_unflushable_fallback_run_bytes
1104                        .is_some_and(|bytes| bytes == previous_pending_bytes)
1105                    {
1106                        self.pending_unflushable_fallback_run_bytes = Some(previous_pending_bytes);
1107                    } else {
1108                        self.pending_unflushable_fallback_run_bytes = None;
1109                    }
1110                    if self.incremental_flush {
1111                        self.flush_safe_into(&mut output);
1112                    }
1113                }
1114            }
1115            InputToken::Verbatim(text) => {
1116                self.flush_into(&mut output);
1117                self.reset_fallback_context();
1118                output.push(OutputToken::Verbatim(text));
1119            }
1120        }
1121        output
1122    }
1123
1124    /// Flushes all pending text without ending the engine.
1125    pub fn flush(&mut self) -> Vec<OutputToken<S>> {
1126        let mut output = Vec::new();
1127        self.flush_into(&mut output);
1128        output
1129    }
1130
1131    /// Finishes the stream and returns every remaining output token.
1132    pub fn finish(mut self) -> Vec<OutputToken<S>> {
1133        self.flush()
1134    }
1135
1136    /// Returns the number of Unicode scalar values currently buffered.
1137    pub fn buffered_chars(&self) -> usize {
1138        self.pending_text.chars().count()
1139    }
1140
1141    fn tail_bound(&self) -> Option<usize> {
1142        self.dictionary.max_word_chars().filter(|bound| *bound > 0)
1143    }
1144
    /// Emits the longest prefix of the pending buffer that later chunks can
    /// no longer affect, leaving the rest buffered.
    ///
    /// Text without hanja is delegated to `flush_non_hanja_safe_into`. When
    /// the buffer contains whitespace, everything through the last whitespace
    /// character is flushed regardless of the dictionary bound. Otherwise,
    /// with a known bound, whole segments are flushed from the front as long
    /// as they end within the first `buffered - bound + 1` characters, and a
    /// trailing fallback run is held back.
    fn flush_safe_into(&mut self, output: &mut Vec<OutputToken<S>>) {
        if self.pending_text.is_empty() {
            return;
        }
        if !self.pending_text.chars().any(is_hanja) {
            self.flush_non_hanja_safe_into(output);
            return;
        }

        // Unknown bound: only the text up to the last whitespace is released.
        let Some(bound) = self.tail_bound() else {
            let Some(flush_end) = safe_unknown_bound_flush_end(&self.pending_text) else {
                return;
            };
            self.flush_prefix_into(flush_end, output);
            if !self.pending_text.chars().any(is_hanja) {
                self.flush_non_hanja_safe_into(output);
            }
            return;
        };
        // Known bound: the same whitespace cut is tried first.
        if let Some(flush_end) = safe_unknown_bound_flush_end(&self.pending_text) {
            self.flush_prefix_into(flush_end, output);
            if !self.pending_text.chars().any(is_hanja) {
                self.flush_non_hanja_safe_into(output);
            }
            return;
        }
        let buffered_chars = self.buffered_chars();
        // Diagnostic only; the buffer is not truncated here.
        if buffered_chars > bound.saturating_mul(10) {
            tracing::debug!(
                buffered_chars,
                dict_max_word_chars = bound,
                "streaming tail buffer is unusually large"
            );
        }
        // Nothing can be released until the buffer exceeds the bound.
        if buffered_chars <= bound {
            return;
        }

        // Cheap path: the buffer was already one unflushable fallback run and
        // the new text only extends it, so just move the marker forward.
        if self.extends_unflushable_fallback_run(bound) {
            self.pending_unflushable_fallback_run_bytes = Some(self.pending_text.len());
            return;
        }

        // Segments must end within this many leading characters to be
        // flushed; the final `bound - 1` characters always stay buffered.
        let safe_chars = buffered_chars.saturating_sub(bound).saturating_add(1);
        let segments = segment_text(
            &self.pending_text,
            self.dictionary,
            self.options.segmentation,
        );
        let mut flush_end = 0;
        let mut flush_segments = Vec::new();
        for segment in &segments {
            let (byte_start, byte_end) = segment_bounds(segment);
            let start_chars = self.pending_text[..byte_start].chars().count();
            let end_chars = self.pending_text[..byte_end].chars().count();
            // Stop at a gap between segments or once a segment starts past
            // the safe region.
            if byte_start > flush_end || (start_chars > safe_chars && flush_end > 0) {
                break;
            }
            if end_chars > safe_chars {
                break;
            }
            flush_end = byte_end;
            flush_segments.push(segment.clone());
        }

        // Fallback runs render as one annotation in non-default render modes.
        // Keep a trailing fallback run buffered because the next chunk may
        // extend it, even when the dictionary lookahead bound is only one char.
        if let Some(fallback_start) = trailing_fallback_run_start(&segments, flush_end) {
            flush_end = fallback_start;
            while flush_segments
                .last()
                .is_some_and(|segment| segment_bounds(segment).1 > flush_end)
            {
                flush_segments.pop();
            }
        }

        if flush_end > 0 {
            self.pending_unflushable_fallback_run_bytes = None;
            self.flush_segments_prefix_into(flush_end, &flush_segments, output);
            if !self.pending_text.chars().any(is_hanja) {
                self.flush_non_hanja_safe_into(output);
            }
        } else if trailing_fallback_run_start(&segments, self.pending_text.len()) == Some(0) {
            // The whole buffer is one fallback run: remember its length so the
            // next push can take the cheap path above.
            self.pending_unflushable_fallback_run_bytes = Some(self.pending_text.len());
        }
    }
1233
1234    fn extends_unflushable_fallback_run(&self, bound: usize) -> bool {
1235        let Some(previous_bytes) = self.pending_unflushable_fallback_run_bytes else {
1236            return false;
1237        };
1238        if previous_bytes == 0
1239            || previous_bytes > self.pending_text.len()
1240            || !self.pending_text.is_char_boundary(previous_bytes)
1241        {
1242            return false;
1243        }
1244
1245        let appended = &self.pending_text[previous_bytes..];
1246        if appended.is_empty() {
1247            return true;
1248        }
1249        if appended.chars().any(|ch| !is_hanja(ch)) {
1250            return false;
1251        }
1252
1253        // The existing prefix was already segmented as one fallback run.  Only
1254        // the old suffix that can participate in a cross-chunk dictionary match
1255        // and the newly appended text need to be inspected here.
1256        let probe_start = suffix_start_for_char_count(
1257            &self.pending_text[..previous_bytes],
1258            bound.saturating_sub(1),
1259        );
1260        let probe = &self.pending_text[probe_start..];
1261        segment_text(probe, self.dictionary, self.options.segmentation)
1262            .iter()
1263            .all(|segment| matches!(segment, Segment::Fallback { .. }))
1264    }
1265
1266    fn flush_non_hanja_safe_into(&mut self, output: &mut Vec<OutputToken<S>>) {
1267        let flush_end = match self.tail_bound() {
1268            Some(bound) => safe_non_hanja_flush_end(&self.pending_text, bound),
1269            None => safe_unknown_bound_flush_end(&self.pending_text),
1270        };
1271        if let Some(flush_end) = flush_end {
1272            self.flush_prefix_into(flush_end, output);
1273        }
1274    }
1275
1276    fn flush_prefix_into(&mut self, flush_end: usize, output: &mut Vec<OutputToken<S>>) {
1277        if flush_end == self.pending_text.len() {
1278            self.flush_into(output);
1279            return;
1280        }
1281        self.pending_unflushable_fallback_run_bytes = None;
1282        let prefix = self.pending_text[..flush_end].to_string();
1283        let segments = segment_text(&prefix, self.dictionary, self.options.segmentation);
1284        self.flush_segments_prefix_into(flush_end, &segments, output);
1285    }
1286
1287    fn flush_segments_prefix_into(
1288        &mut self,
1289        flush_end: usize,
1290        segments: &[Segment],
1291        output: &mut Vec<OutputToken<S>>,
1292    ) {
1293        let prefix = self.pending_text[..flush_end].to_string();
1294        process_segments_with_state(
1295            &prefix,
1296            segments,
1297            self.dictionary,
1298            self.options,
1299            &mut self.fallback_state,
1300            output,
1301        );
1302        self.pending_text.replace_range(..flush_end, "");
1303    }
1304
1305    fn flush_into(&mut self, output: &mut Vec<OutputToken<S>>) {
1306        if self.pending_text.is_empty() {
1307            return;
1308        }
1309        self.pending_unflushable_fallback_run_bytes = None;
1310        let text = core::mem::take(&mut self.pending_text);
1311        process_text_with_state(
1312            &text,
1313            self.dictionary,
1314            self.options,
1315            &mut self.fallback_state,
1316            output,
1317        );
1318    }
1319
1320    fn reset_fallback_context(&mut self) {
1321        self.fallback_state = FallbackState::default();
1322    }
1323}
1324
/// Picks the byte offset up to which `text` can be flushed while keeping at
/// most `bound - 1` characters of the final whitespace-free span buffered.
///
/// Returns `None` for empty text, or when the final span is short enough to
/// stay buffered entirely and nothing precedes it.
fn safe_non_hanja_flush_end(text: &str, bound: usize) -> Option<usize> {
    if text.is_empty() {
        return None;
    }

    // Byte offset just past the last whitespace character, or 0 without one.
    let tail_start = match text.char_indices().rev().find(|&(_, ch)| ch.is_whitespace()) {
        Some((offset, space)) => offset + space.len_utf8(),
        None => 0,
    };
    let tail = &text[tail_start..];
    let tail_chars = tail.chars().count();
    let reserved_chars = bound.saturating_sub(1);

    if tail_chars <= reserved_chars {
        // The whole tail stays buffered; only what precedes it is released.
        return if tail_start > 0 { Some(tail_start) } else { None };
    }

    let released_chars = tail_chars - reserved_chars;
    let cut = match tail.char_indices().nth(released_chars) {
        Some((offset, _)) => tail_start + offset,
        None => text.len(),
    };
    if cut > 0 { Some(cut) } else { None }
}
1348
/// Returns the byte offset just past the last whitespace character of
/// `text`, or `None` when `text` contains no whitespace.
fn safe_unknown_bound_flush_end(text: &str) -> Option<usize> {
    let (offset, space) = text
        .char_indices()
        .rev()
        .find(|&(_, ch)| ch.is_whitespace())?;
    Some(offset + space.len_utf8())
}
1354
/// Returns the byte offset at which the last `count` characters of `text`
/// begin.
///
/// A `count` of zero yields `text.len()`; a `count` larger than the number of
/// characters yields `0`.
fn suffix_start_for_char_count(text: &str, count: usize) -> usize {
    match count.checked_sub(1) {
        None => text.len(),
        Some(skipped) => match text.char_indices().rev().nth(skipped) {
            Some((offset, _)) => offset,
            None => 0,
        },
    }
}
1365
1366fn trailing_fallback_run_start(segments: &[Segment], split_byte: usize) -> Option<usize> {
1367    if split_byte == 0 {
1368        return None;
1369    }
1370
1371    for (index, segment) in segments.iter().enumerate() {
1372        let (byte_start, byte_end) = segment_bounds(segment);
1373        if byte_end != split_byte {
1374            continue;
1375        }
1376        if !matches!(segment, Segment::Fallback { .. }) {
1377            return None;
1378        }
1379        if let Some(next) = segments.get(index + 1)
1380            && !matches!(next, Segment::Fallback { .. })
1381        {
1382            return None;
1383        }
1384
1385        let mut run_start = byte_start;
1386        for previous in segments[..index].iter().rev() {
1387            let (previous_start, previous_end) = segment_bounds(previous);
1388            if previous_end != run_start || !matches!(previous, Segment::Fallback { .. }) {
1389                break;
1390            }
1391            run_start = previous_start;
1392        }
1393        return (run_start < split_byte).then_some(run_start);
1394    }
1395
1396    None
1397}
1398
1399fn process_text_with_state<S, D>(
1400    text: &str,
1401    dictionary: &D,
1402    options: EngineOptions,
1403    fallback_state: &mut FallbackState,
1404    output: &mut Vec<OutputToken<S>>,
1405) where
1406    D: HanjaDictionary + ?Sized,
1407{
1408    let segments = segment_text(text, dictionary, options.segmentation);
1409    process_segments_with_state(text, &segments, dictionary, options, fallback_state, output);
1410}
1411
1412fn process_segments_with_state<S, D>(
1413    text: &str,
1414    segments: &[Segment],
1415    _dictionary: &D,
1416    options: EngineOptions,
1417    fallback_state: &mut FallbackState,
1418    output: &mut Vec<OutputToken<S>>,
1419) where
1420    D: HanjaDictionary + ?Sized,
1421{
1422    let mut index = 0;
1423
1424    while index < segments.len() {
1425        match &segments[index] {
1426            Segment::Dictionary {
1427                byte_start,
1428                byte_end,
1429                reading,
1430                mark,
1431            } => {
1432                let source = &text[*byte_start..*byte_end];
1433                output.push(OutputToken::Annotated(Annotation {
1434                    hanja: source.to_string(),
1435                    homophone: false,
1436                    reading: reading.clone(),
1437                    require_hanja: mark.require_hanja,
1438                    require_hangul: mark.require_hangul,
1439                    first_in_context: true,
1440                    skip_annotation: false,
1441                    from_dictionary: true,
1442                }));
1443                if should_preserve_dictionary_context(source, reading, options) {
1444                    update_fallback_state_for_reading(reading, fallback_state);
1445                } else {
1446                    *fallback_state = FallbackState::default();
1447                }
1448                index += 1;
1449            }
1450            Segment::Fallback {
1451                byte_start,
1452                byte_end,
1453            } => {
1454                let mut fallback_end = *byte_end;
1455                while let Some(Segment::Fallback { byte_end, .. }) = segments.get(index + 1) {
1456                    fallback_end = *byte_end;
1457                    index += 1;
1458                }
1459                process_fallback_text(
1460                    &text[*byte_start..fallback_end],
1461                    options,
1462                    fallback_state,
1463                    output,
1464                );
1465                index += 1;
1466            }
1467            Segment::Text {
1468                byte_start,
1469                byte_end,
1470            } => {
1471                let text_segment = &text[*byte_start..*byte_end];
1472                push_text(output, text_segment);
1473                update_fallback_state_for_text(text_segment, fallback_state);
1474                index += 1;
1475            }
1476        }
1477    }
1478}
1479
1480fn segment_bounds(segment: &Segment) -> (usize, usize) {
1481    match segment {
1482        Segment::Dictionary {
1483            byte_start,
1484            byte_end,
1485            ..
1486        }
1487        | Segment::Fallback {
1488            byte_start,
1489            byte_end,
1490        }
1491        | Segment::Text {
1492            byte_start,
1493            byte_end,
1494        } => (*byte_start, *byte_end),
1495    }
1496}
1497
1498fn process_fallback_text<S>(
1499    text: &str,
1500    options: EngineOptions,
1501    state: &mut FallbackState,
1502    output: &mut Vec<OutputToken<S>>,
1503) {
1504    for part in phoneticize_fallback_run_with_state(text, options, state) {
1505        match part {
1506            FallbackPart::Annotation { hanja, reading } => {
1507                output.push(OutputToken::Annotated(Annotation {
1508                    hanja,
1509                    reading,
1510                    homophone: false,
1511                    require_hanja: false,
1512                    require_hangul: false,
1513                    first_in_context: true,
1514                    skip_annotation: false,
1515                    from_dictionary: false,
1516                }));
1517            }
1518            FallbackPart::ReadingText(text) => push_text(output, &text),
1519            FallbackPart::Text(text) => push_text(output, &text),
1520        }
1521    }
1522}
1523
1524fn update_fallback_state_for_text(text: &str, state: &mut FallbackState) {
1525    if text.is_empty() {
1526        return;
1527    }
1528
1529    if text
1530        .chars()
1531        .last()
1532        .is_some_and(|character| character.is_whitespace())
1533    {
1534        *state = FallbackState::default();
1535        return;
1536    }
1537
1538    let Some(last) = text.chars().rev().find(|ch| !ch.is_whitespace()) else {
1539        return;
1540    };
1541
1542    if last.is_alphanumeric() {
1543        state.starts_word = false;
1544        state.previous_reading = Some(last);
1545    } else {
1546        *state = FallbackState::default();
1547    }
1548}
1549
1550fn should_preserve_dictionary_context(source: &str, reading: &str, options: EngineOptions) -> bool {
1551    if reading.chars().all(char::is_whitespace) {
1552        return false;
1553    }
1554
1555    if source.chars().all(is_hanja) {
1556        match fallback_reading_for_run(source, options) {
1557            Some(fallback_reading) => {
1558                fallback_reading == reading || has_one_hangul_syllable_per_hanja(source, reading)
1559            }
1560            None => has_one_hangul_syllable_per_hanja(source, reading),
1561        }
1562    } else {
1563        true
1564    }
1565}
1566
1567fn has_one_hangul_syllable_per_hanja(source: &str, reading: &str) -> bool {
1568    let source_len = source.chars().count();
1569    let mut reading_len = 0;
1570
1571    for ch in reading.chars() {
1572        if !is_hangul_syllable(ch) {
1573            return false;
1574        }
1575        reading_len += 1;
1576    }
1577
1578    reading_len == source_len
1579}
1580
/// Returns whether `ch` lies in the range U+AC00 through U+D7A3.
fn is_hangul_syllable(ch: char) -> bool {
    matches!(ch, '\u{ac00}'..='\u{d7a3}')
}
1584
1585fn update_fallback_state_for_reading(reading: &str, state: &mut FallbackState) {
1586    let Some(last) = reading.chars().rev().find(|ch| !ch.is_whitespace()) else {
1587        *state = FallbackState::default();
1588        return;
1589    };
1590
1591    if last.is_alphanumeric() {
1592        state.starts_word = false;
1593        state.previous_reading = Some(last);
1594    } else {
1595        *state = FallbackState::default();
1596    }
1597}
1598
1599fn push_text<S>(output: &mut Vec<OutputToken<S>>, text: &str) {
1600    if text.is_empty() {
1601        return;
1602    }
1603
1604    match output.last_mut() {
1605        Some(OutputToken::Text(existing)) => existing.push_str(text),
1606        _ => output.push(OutputToken::Text(text.to_string())),
1607    }
1608}
1609
/// Returns whether `ch` is in a known CJK ideograph range.
///
/// The accepted ranges are listed in the table inside the function.
pub fn is_hanja(ch: char) -> bool {
    // Inclusive code point ranges treated as hanja.
    // NOTE(review): the first range also spans U+2FF0..=U+2FFF, which Unicode
    // assigns to Ideographic Description Characters rather than ideographs —
    // confirm that this is intended.
    const HANJA_RANGES: [(char, char); 15] = [
        ('\u{2F00}', '\u{2FFF}'),
        ('\u{3007}', '\u{3007}'),
        ('\u{3400}', '\u{4DBF}'),
        ('\u{4E00}', '\u{9FFF}'),
        ('\u{F900}', '\u{FAFF}'),
        ('\u{20000}', '\u{2A6DF}'),
        ('\u{2A700}', '\u{2B73F}'),
        ('\u{2B740}', '\u{2B81F}'),
        ('\u{2B820}', '\u{2CEAF}'),
        ('\u{2CEB0}', '\u{2EBEF}'),
        ('\u{2EBF0}', '\u{2EE5F}'),
        ('\u{2F800}', '\u{2FA1F}'),
        ('\u{30000}', '\u{3134F}'),
        ('\u{31350}', '\u{323AF}'),
        ('\u{323B0}', '\u{3347F}'),
    ];

    HANJA_RANGES
        .iter()
        .any(|&(first, last)| first <= ch && ch <= last)
}
1631
/// The concrete rendering mode for annotated hanja words.
///
/// A mode can be converted into a [`RenderOptions`] value with `From`, which
/// fills the per-mode sub-options with their defaults.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum RenderMode {
    /// Emits only hangul unless annotation flags require hanja disambiguation.
    HangulOnly,

    /// Always emits hangul followed by the original hanja in parentheses.
    HangulHanjaParens,

    /// Always emits original hanja followed by the hangul reading in
    /// parentheses.
    HanjaHangulParens,

    /// Emits a `<ruby>` element pairing hangul reading and source hanja.
    ///
    /// The [`RubyBase`] sub-mode chooses which side becomes the base text.
    /// When the active scope reports
    /// [`ScopeData::allows_inline_markup`] as `false`, the renderer falls back
    /// to parenthesized text so that adapters which cannot embed markup still
    /// receive a sensible surface form.
    Ruby(RubyBase),

    /// Emits original hanja, adding a hangul gloss only when requested.
    ///
    /// The form of that gloss is selected by [`OriginalGloss`].
    Original,
}
1657
/// Selects which side of a `<ruby>` element is the base text.
///
/// Used as the payload of [`RenderMode::Ruby`].
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum RubyBase {
    /// `<ruby>hangul<rt>hanja</rt></ruby>`; hangul is the base, hanja is the gloss.
    OnHangul,

    /// `<ruby>hanja<rt>hangul</rt></ruby>`; hanja is the base, hangul is the gloss.
    OnHanja,
}
1667
/// Form for the gloss attached to annotations in [`RenderMode::Original`].
///
/// `Original` keeps the source hanja as primary text and only attaches a
/// hangul gloss when the annotation flags or a user directive demand one.
/// This option controls how that gloss appears. Because `Original` always
/// treats hanja as primary, the ruby form uses hanja as the base and hangul
/// as the `rt` gloss; there is no sub-mode to flip the sides.
///
/// The default is [`OriginalGloss::Parens`].
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum OriginalGloss {
    /// `hanja(hangul)`; matches the legacy behavior.
    #[default]
    Parens,

    /// A `<ruby>` element with hanja as the base and hangul as the `rt`
    /// gloss, falling back to parens when the active scope disallows inline
    /// markup.
    Ruby,
}
1686
/// Rendering options that combine a [`RenderMode`] with per-mode sub-options.
///
/// Most pipelines configure rendering by mode alone, so `RenderOptions`
/// implements `From<RenderMode>` and `Default` to keep existing call sites
/// terse. Pipelines that need finer control (such as a ruby gloss in
/// [`RenderMode::Original`]) construct a `RenderOptions` value directly.
///
/// The `Default` value uses [`RenderMode::HangulOnly`] with
/// [`OriginalGloss::Parens`].
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct RenderOptions {
    /// Top-level rendering mode applied to every annotation.
    pub mode: RenderMode,

    /// Gloss form used by [`RenderMode::Original`]. Ignored by other modes.
    pub original_gloss: OriginalGloss,
}
1701
1702impl Default for RenderOptions {
1703    fn default() -> Self {
1704        Self {
1705            mode: RenderMode::HangulOnly,
1706            original_gloss: OriginalGloss::Parens,
1707        }
1708    }
1709}
1710
1711impl From<RenderMode> for RenderOptions {
1712    fn from(mode: RenderMode) -> Self {
1713        Self {
1714            mode,
1715            original_gloss: OriginalGloss::default(),
1716        }
1717    }
1718}
1719
/// Where stateful annotation middlewares start a fresh context.
///
/// With `PerBlock`, state is reset at scopes for which
/// [`ScopeData::is_block_boundary`] is true. With `PerSection`, state is
/// reset when a later scope reports [`ScopeData::is_section_boundary`].
/// A plain-text stream carries neither block nor section scopes, so both of
/// those windows act as a single document-wide context there.  Exact
/// homophone rendering depends on this: a later plain-text line may turn an
/// earlier annotation ambiguous after that annotation would otherwise have
/// been written out already.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ContextWindow {
    /// Turn the middleware off; tokens pass through untouched.
    Off,

    /// Start a new context at each block boundary reported by the format
    /// adapter.
    PerBlock,

    /// Start a new context at each section boundary reported by the format
    /// adapter.
    PerSection,

    /// Treat the whole token stream as a single context.
    PerDocument,
}
1743
/// What happens to an annotation once a user directive predicate matches it.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum DirectiveAction {
    /// The original hanja must remain visible in the rendered output.
    RequireHanja,

    /// The rendered output must carry a hangul gloss.
    RequireHangul,

    /// The annotation is reduced to the plain primary text of whichever
    /// renderer is active.
    SkipAnnotation,
}
1756
1757/// User rules that adjust annotation presentation policy.
1758///
1759/// Literal helpers cover common hanja-form rules.  Callers that need richer
1760/// matching can add closure predicates over the whole [`Annotation`], which
1761/// keeps the core API independent of CLI-only pattern syntaxes.
1762#[derive(Default)]
1763pub struct UserDirectives<'a> {
1764    rules: Vec<UserDirectiveRule<'a>>,
1765}
1766
1767impl<'a> UserDirectives<'a> {
1768    /// Creates an empty directive set.
1769    pub fn new() -> Self {
1770        Self::default()
1771    }
1772
1773    /// Marks a literal hanja form as requiring visible hanja in output.
1774    pub fn require_hanja(&mut self, hanja: impl Into<String>) {
1775        self.add_literal(hanja, DirectiveAction::RequireHanja);
1776    }
1777
1778    /// Marks a literal hanja form as requiring a visible hangul gloss.
1779    pub fn require_hangul(&mut self, hanja: impl Into<String>) {
1780        self.add_literal(hanja, DirectiveAction::RequireHangul);
1781    }
1782
1783    /// Marks a literal hanja form as not receiving annotation rendering.
1784    pub fn skip_annotation(&mut self, hanja: impl Into<String>) {
1785        self.add_literal(hanja, DirectiveAction::SkipAnnotation);
1786    }
1787
1788    /// Adds a literal hanja-form directive.
1789    pub fn add_literal(&mut self, hanja: impl Into<String>, action: DirectiveAction) {
1790        self.rules.push(UserDirectiveRule {
1791            predicate: UserDirectivePredicate::Literal(hanja.into()),
1792            action,
1793        });
1794    }
1795
1796    /// Adds a predicate directive over the complete annotation metadata.
1797    pub fn add_predicate(
1798        &mut self,
1799        predicate: impl Fn(&Annotation) -> bool + 'a,
1800        action: DirectiveAction,
1801    ) {
1802        self.rules.push(UserDirectiveRule {
1803            predicate: UserDirectivePredicate::Predicate(Box::new(predicate)),
1804            action,
1805        });
1806    }
1807
1808    /// Returns whether no directive rules are configured.
1809    pub fn is_empty(&self) -> bool {
1810        self.rules.is_empty()
1811    }
1812
1813    /// Applies every configured directive to a single output token.
1814    ///
1815    /// Non-[`OutputToken::Annotated`] tokens pass through unchanged. For an
1816    /// annotation, each matching rule sets the corresponding flag in priority
1817    /// of declaration order. This method is the per-token primitive used by
1818    /// streaming pipelines that want to apply directives without buffering.
1819    pub fn apply<S>(&self, token: OutputToken<S>) -> OutputToken<S> {
1820        match token {
1821            OutputToken::Annotated(mut annotation) => {
1822                for rule in &self.rules {
1823                    if !rule.predicate.matches(&annotation) {
1824                        continue;
1825                    }
1826                    match rule.action {
1827                        DirectiveAction::RequireHanja => annotation.require_hanja = true,
1828                        DirectiveAction::RequireHangul => annotation.require_hangul = true,
1829                        DirectiveAction::SkipAnnotation => annotation.skip_annotation = true,
1830                    }
1831                }
1832                OutputToken::Annotated(annotation)
1833            }
1834            token => token,
1835        }
1836    }
1837}
1838
1839struct UserDirectiveRule<'a> {
1840    predicate: UserDirectivePredicate<'a>,
1841    action: DirectiveAction,
1842}
1843
1844enum UserDirectivePredicate<'a> {
1845    Literal(String),
1846    Predicate(Box<dyn Fn(&Annotation) -> bool + 'a>),
1847}
1848
1849impl UserDirectivePredicate<'_> {
1850    fn matches(&self, annotation: &Annotation) -> bool {
1851        match self {
1852            Self::Literal(hanja) => annotation.hanja == *hanja,
1853            Self::Predicate(predicate) => predicate(annotation),
1854        }
1855    }
1856}
1857
1858/// Sets `homophone` on dictionary annotations sharing a reading.
1859///
1860/// The marker builds one optional homophone index from the supplied dictionary
1861/// and falls back to [`HanjaDictionary::has_homophone`] for lookup-only
1862/// dictionaries. It also preserves the context-local heuristic. Fallback
1863/// annotations are ignored because they are phonetic fragments rather than
1864/// known lexical homophones.
1865pub fn mark_homophones<S, D>(
1866    tokens: impl IntoIterator<Item = OutputToken<S>>,
1867    dictionary: &D,
1868    window: ContextWindow,
1869) -> Vec<OutputToken<S>>
1870where
1871    S: ScopeData,
1872    D: HanjaDictionary + ?Sized,
1873{
1874    if window == ContextWindow::Off {
1875        return tokens.into_iter().collect();
1876    }
1877
1878    let index = HomophoneIndex::from_dictionary(dictionary);
1879    let lookup_fallback = index.is_none().then_some(dictionary);
1880    ContextMiddleware::new(window, |tokens| {
1881        mark_homophones_in_context(tokens, index.as_ref(), lookup_fallback);
1882    })
1883    .process(tokens)
1884}
1885
1886/// Clears repeat gloss requirements after the first occurrence of each hanja.
1887///
1888/// The first occurrence key is the original hanja form. Later annotations for
1889/// the same form have `first_in_context` set to false and no longer require
1890/// either side to be shown.
1891pub fn filter_first_occurrences<S>(
1892    tokens: impl IntoIterator<Item = OutputToken<S>>,
1893    window: ContextWindow,
1894) -> Vec<OutputToken<S>>
1895where
1896    S: ScopeData,
1897{
1898    ContextMiddleware::new(window, filter_first_occurrences_in_context).process(tokens)
1899}
1900
1901type ContextApply<S> = fn(&mut [OutputToken<S>]);
1902type HomophoneApply<'a, S> = Box<dyn FnMut(&mut [OutputToken<S>]) + 'a>;
1903
1904/// Streaming homophone marker middleware.
1905///
1906/// Context windows that require lookahead buffer only until their configured
1907/// boundary. `PerDocument`, and scoped windows on streams that never emit the
1908/// corresponding boundary, buffer until [`HomophoneMarker::finish`].  For
1909/// example, exact plain-text homophone marking with `PerBlock` is document-wide
1910/// because plain text has no block scopes.
1911pub struct HomophoneMarker<'a, S>
1912where
1913    S: ScopeData,
1914{
1915    inner: ContextMiddleware<S, HomophoneApply<'a, S>>,
1916}
1917
1918impl<'a, S> HomophoneMarker<'a, S>
1919where
1920    S: ScopeData,
1921{
1922    /// Creates a homophone marker for the selected context window.
1923    pub fn new<D>(dictionary: &'a D, window: ContextWindow) -> Self
1924    where
1925        D: HanjaDictionary + ?Sized,
1926    {
1927        let index = if window == ContextWindow::Off {
1928            None
1929        } else {
1930            HomophoneIndex::from_dictionary(dictionary)
1931        };
1932        let lookup_fallback = index.is_none().then_some(dictionary);
1933        Self {
1934            inner: ContextMiddleware::new(
1935                window,
1936                Box::new(move |tokens| {
1937                    mark_homophones_in_context(tokens, index.as_ref(), lookup_fallback);
1938                }),
1939            ),
1940        }
1941    }
1942
1943    /// Pushes one output token and returns tokens ready for downstream stages.
1944    pub fn push_token(&mut self, token: OutputToken<S>) -> Vec<OutputToken<S>> {
1945        self.inner.push_token(token)
1946    }
1947
1948    /// Finishes the middleware and returns buffered tokens.
1949    pub fn finish(self) -> Vec<OutputToken<S>> {
1950        self.inner.finish()
1951    }
1952}
1953
1954/// Streaming first-occurrence middleware.
1955///
1956/// Repeated annotations inside a context have `first_in_context` cleared and
1957/// presentation requirements removed once the context is flushed.
1958pub struct FirstOccurrenceFilter<S>
1959where
1960    S: ScopeData,
1961{
1962    inner: ContextMiddleware<S, ContextApply<S>>,
1963}
1964
1965impl<S> FirstOccurrenceFilter<S>
1966where
1967    S: ScopeData,
1968{
1969    /// Creates a first-occurrence filter for the selected context window.
1970    pub fn new(window: ContextWindow) -> Self {
1971        Self {
1972            inner: ContextMiddleware::new(window, filter_first_occurrences_in_context::<S>),
1973        }
1974    }
1975
1976    /// Pushes one output token and returns tokens ready for downstream stages.
1977    pub fn push_token(&mut self, token: OutputToken<S>) -> Vec<OutputToken<S>> {
1978        self.inner.push_token(token)
1979    }
1980
1981    /// Finishes the middleware and returns buffered tokens.
1982    pub fn finish(self) -> Vec<OutputToken<S>> {
1983        self.inner.finish()
1984    }
1985}
1986
1987/// Applies literal user directives to annotation policy flags.
1988///
1989/// Rules only set flags; they do not render, remove, or reorder tokens.
1990pub fn apply_user_directives<S>(
1991    tokens: impl IntoIterator<Item = OutputToken<S>>,
1992    directives: &UserDirectives<'_>,
1993) -> Vec<OutputToken<S>> {
1994    apply_user_directives_iter(tokens, directives).collect()
1995}
1996
1997/// Lazily applies literal user directives to an output token stream.
1998///
1999/// Returns an iterator that walks the input tokens without intermediate
2000/// buffering. Use this variant in streaming pipelines that need to chain
2001/// directive application with other lazy stages such as [`render_tokens_iter`].
2002pub fn apply_user_directives_iter<'a, S>(
2003    tokens: impl IntoIterator<Item = OutputToken<S>> + 'a,
2004    directives: &'a UserDirectives<'_>,
2005) -> impl Iterator<Item = OutputToken<S>> + 'a {
2006    tokens.into_iter().map(|token| directives.apply(token))
2007}
2008
2009struct ContextMiddleware<S, F>
2010where
2011    S: ScopeData,
2012    F: FnMut(&mut [OutputToken<S>]),
2013{
2014    window: ContextWindow,
2015    apply: F,
2016    context: Vec<OutputToken<S>>,
2017    scope_boundaries: Vec<bool>,
2018}
2019
2020impl<S, F> ContextMiddleware<S, F>
2021where
2022    S: ScopeData,
2023    F: FnMut(&mut [OutputToken<S>]),
2024{
2025    fn new(window: ContextWindow, apply: F) -> Self {
2026        Self {
2027            window,
2028            apply,
2029            context: Vec::new(),
2030            scope_boundaries: Vec::new(),
2031        }
2032    }
2033
2034    fn process(mut self, tokens: impl IntoIterator<Item = OutputToken<S>>) -> Vec<OutputToken<S>> {
2035        let mut output = Vec::new();
2036        for token in tokens {
2037            output.extend(self.push_token(token));
2038        }
2039        output.extend(self.finish());
2040        output
2041    }
2042
2043    fn push_token(&mut self, token: OutputToken<S>) -> Vec<OutputToken<S>> {
2044        let mut output = Vec::new();
2045        match self.window {
2046            ContextWindow::Off => output.push(token),
2047            ContextWindow::PerDocument => self.context.push(token),
2048            ContextWindow::PerBlock | ContextWindow::PerSection => match &token {
2049                OutputToken::Open(scope) => {
2050                    let is_boundary = match self.window {
2051                        ContextWindow::PerBlock => scope.data().is_block_boundary(),
2052                        ContextWindow::PerSection => scope.data().is_section_boundary(),
2053                        ContextWindow::Off | ContextWindow::PerDocument => false,
2054                    };
2055                    if is_boundary {
2056                        self.flush_context(&mut output);
2057                    }
2058                    self.scope_boundaries.push(is_boundary);
2059                    self.context.push(token);
2060                }
2061                OutputToken::Close => {
2062                    let closes_boundary = self.scope_boundaries.pop().unwrap_or(false);
2063                    self.context.push(token);
2064                    if closes_boundary && self.window == ContextWindow::PerBlock {
2065                        self.flush_context(&mut output);
2066                    }
2067                }
2068                _ => self.context.push(token),
2069            },
2070        }
2071        output
2072    }
2073
2074    fn finish(mut self) -> Vec<OutputToken<S>> {
2075        let mut output = Vec::new();
2076        self.flush_context(&mut output);
2077        output
2078    }
2079
2080    fn flush_context(&mut self, output: &mut Vec<OutputToken<S>>) {
2081        if self.context.is_empty() {
2082            return;
2083        }
2084
2085        (self.apply)(&mut self.context);
2086        output.append(&mut self.context);
2087    }
2088}
2089
2090#[derive(Clone, Debug, Default, Eq, PartialEq)]
2091struct HomophoneIndex {
2092    forms_by_reading: BTreeMap<String, BTreeSet<String>>,
2093}
2094
2095impl HomophoneIndex {
2096    fn from_dictionary<D>(dictionary: &D) -> Option<Self>
2097    where
2098        D: HanjaDictionary + ?Sized,
2099    {
2100        let mut forms_by_reading = BTreeMap::<String, BTreeSet<String>>::new();
2101        for record in dictionary.entries()? {
2102            forms_by_reading
2103                .entry(record.reading)
2104                .or_default()
2105                .insert(record.hanja);
2106        }
2107        Some(Self { forms_by_reading })
2108    }
2109
2110    fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
2111        self.forms_by_reading
2112            .get(reading)
2113            .is_some_and(|forms| forms.iter().any(|form| form != hanja))
2114    }
2115}
2116
2117fn mark_homophones_in_context<S, D>(
2118    tokens: &mut [OutputToken<S>],
2119    index: Option<&HomophoneIndex>,
2120    lookup_fallback: Option<&D>,
2121) where
2122    D: HanjaDictionary + ?Sized,
2123{
2124    let mut forms_by_reading = BTreeMap::<String, BTreeSet<String>>::new();
2125
2126    for token in tokens.iter() {
2127        if let OutputToken::Annotated(annotation) = token
2128            && annotation.from_dictionary
2129        {
2130            forms_by_reading
2131                .entry(annotation.reading.clone())
2132                .or_default()
2133                .insert(annotation.hanja.clone());
2134        }
2135    }
2136
2137    for token in tokens.iter_mut() {
2138        if let OutputToken::Annotated(annotation) = token {
2139            annotation.homophone = annotation.from_dictionary
2140                && (index.is_some_and(|index| {
2141                    index.has_homophone(&annotation.hanja, &annotation.reading)
2142                }) || lookup_fallback.is_some_and(|dictionary| {
2143                    dictionary.has_homophone(&annotation.hanja, &annotation.reading)
2144                }) || forms_by_reading
2145                    .get(&annotation.reading)
2146                    .is_some_and(|forms| forms.len() > 1));
2147        }
2148    }
2149}
2150
2151fn filter_first_occurrences_in_context<S>(tokens: &mut [OutputToken<S>]) {
2152    let mut seen = BTreeSet::new();
2153
2154    for token in tokens.iter_mut() {
2155        if let OutputToken::Annotated(annotation) = token {
2156            if seen.insert(annotation.hanja.clone()) {
2157                annotation.first_in_context = true;
2158            } else {
2159                annotation.first_in_context = false;
2160                annotation.require_hanja = false;
2161                annotation.require_hangul = false;
2162            }
2163        }
2164    }
2165}
2166
2167/// Renders engine output tokens into annotation-free tokens.
2168///
2169/// Structural and text tokens pass through. Each annotation is expanded into a
2170/// concrete rendered token according to the supplied options, the current
2171/// scope, and the annotation's flags. `options` accepts either a bare
2172/// [`RenderMode`] (via the `From<RenderMode>` impl on [`RenderOptions`]) or a
2173/// full [`RenderOptions`] value.
2174pub fn render_tokens<S, O>(
2175    tokens: impl IntoIterator<Item = OutputToken<S>>,
2176    options: O,
2177) -> Vec<RenderedToken<S>>
2178where
2179    S: ScopeData,
2180    O: Into<RenderOptions>,
2181{
2182    render_tokens_iter(tokens, options).collect()
2183}
2184
2185/// Renders engine output tokens into annotation-free tokens as an iterator.
2186///
2187/// The renderer maintains a small scope stack so that annotation expansion can
2188/// consult the active scope's [`ScopeData::allows_inline_markup`] when
2189/// choosing between an inline-markup form and a parenthesized fallback. Every
2190/// other token maps one-to-one to its rendered counterpart.
2191pub fn render_tokens_iter<S, O>(
2192    tokens: impl IntoIterator<Item = OutputToken<S>>,
2193    options: O,
2194) -> impl Iterator<Item = RenderedToken<S>>
2195where
2196    S: ScopeData,
2197    O: Into<RenderOptions>,
2198{
2199    RendererIter {
2200        upstream: tokens.into_iter(),
2201        renderer: Renderer::new(options),
2202    }
2203}
2204
2205/// Stateful renderer for chunked [`OutputToken`] streams.
2206///
2207/// `Renderer` is the push-based counterpart to [`render_tokens_iter`]. It
2208/// preserves the active scope stack across calls so format writers can consume
2209/// rendered tokens as soon as upstream engine and middleware stages release
2210/// them, without losing inline-markup restrictions from earlier chunks.
2211pub struct Renderer<S>
2212where
2213    S: ScopeData,
2214{
2215    options: RenderOptions,
2216    /// Cached `allows_inline_markup` value for each open scope. Storing the
2217    /// boolean instead of the whole scope keeps the renderer free of an extra
2218    /// `S: Clone` bound at this layer (it already requires it via `ScopeData`)
2219    /// and avoids the cost of cloning adapter-owned data.
2220    markup_stack: Vec<bool>,
2221    /// Number of currently open scopes whose `allows_inline_markup` is
2222    /// `false`. Inline markup is safe at the current cursor only when this
2223    /// counter is zero; otherwise some ancestor forbids markup and a nested
2224    /// allow-markup scope cannot override that restriction.
2225    disallowing_ancestors: usize,
2226    _scope: PhantomData<fn(S)>,
2227}
2228
2229impl<S> Renderer<S>
2230where
2231    S: ScopeData,
2232{
2233    /// Creates a renderer with the supplied rendering options.
2234    pub fn new<O>(options: O) -> Self
2235    where
2236        O: Into<RenderOptions>,
2237    {
2238        Self {
2239            options: options.into(),
2240            markup_stack: Vec::new(),
2241            disallowing_ancestors: 0,
2242            _scope: PhantomData,
2243        }
2244    }
2245
2246    /// Pushes one output token and returns its rendered counterpart.
2247    pub fn push_token(&mut self, token: OutputToken<S>) -> RenderedToken<S> {
2248        match token {
2249            OutputToken::Open(scope) => {
2250                let allows = scope.data().allows_inline_markup();
2251                if !allows {
2252                    self.disallowing_ancestors += 1;
2253                }
2254                self.markup_stack.push(allows);
2255                RenderedToken::Open(scope)
2256            }
2257            OutputToken::Close => {
2258                if let Some(false) = self.markup_stack.pop() {
2259                    // Saturating guard for malformed streams that emit more
2260                    // Close than Open tokens; the renderer should never
2261                    // panic on broken input.
2262                    self.disallowing_ancestors = self.disallowing_ancestors.saturating_sub(1);
2263                }
2264                RenderedToken::Close
2265            }
2266            OutputToken::Text(text) => RenderedToken::Text(text),
2267            OutputToken::Verbatim(text) => RenderedToken::Verbatim(text),
2268            OutputToken::Annotated(annotation) => {
2269                // Inline markup is allowed only when no open ancestor scope
2270                // forbids it. The plain-text reader wraps its input in a
2271                // scope whose `allows_inline_markup` is false, so plain text
2272                // still falls back to parens; HTML and Markdown root
2273                // contexts emit no enclosing scope and therefore start with
2274                // an empty stack, leaving annotations free to use markup.
2275                let allows_inline_markup = self.disallowing_ancestors == 0;
2276                render_annotation(&annotation, &self.options, allows_inline_markup)
2277            }
2278        }
2279    }
2280}
2281
2282struct RendererIter<I, S>
2283where
2284    S: ScopeData,
2285{
2286    upstream: I,
2287    renderer: Renderer<S>,
2288}
2289
2290impl<I, S> Iterator for RendererIter<I, S>
2291where
2292    I: Iterator<Item = OutputToken<S>>,
2293    S: ScopeData,
2294{
2295    type Item = RenderedToken<S>;
2296
2297    fn next(&mut self) -> Option<Self::Item> {
2298        let token = self.upstream.next()?;
2299        Some(self.renderer.push_token(token))
2300    }
2301}
2302
2303fn render_annotation<S>(
2304    annotation: &Annotation,
2305    options: &RenderOptions,
2306    allows_inline_markup: bool,
2307) -> RenderedToken<S> {
2308    if annotation.skip_annotation {
2309        let primary = match options.mode {
2310            RenderMode::HangulOnly | RenderMode::HangulHanjaParens => annotation.reading.clone(),
2311            RenderMode::HanjaHangulParens | RenderMode::Original => annotation.hanja.clone(),
2312            RenderMode::Ruby(RubyBase::OnHangul) => annotation.reading.clone(),
2313            RenderMode::Ruby(RubyBase::OnHanja) => annotation.hanja.clone(),
2314        };
2315        return RenderedToken::Text(primary);
2316    }
2317
2318    match options.mode {
2319        RenderMode::HangulOnly if annotation.require_hanja || annotation.homophone => {
2320            RenderedToken::Text(parens(&annotation.reading, &annotation.hanja))
2321        }
2322        RenderMode::HangulOnly => RenderedToken::Text(annotation.reading.clone()),
2323        RenderMode::HangulHanjaParens => {
2324            RenderedToken::Text(parens(&annotation.reading, &annotation.hanja))
2325        }
2326        RenderMode::HanjaHangulParens => {
2327            RenderedToken::Text(parens(&annotation.hanja, &annotation.reading))
2328        }
2329        RenderMode::Ruby(base) => render_ruby(annotation, base, allows_inline_markup),
2330        RenderMode::Original if annotation.require_hangul => match options.original_gloss {
2331            OriginalGloss::Parens => {
2332                RenderedToken::Text(parens(&annotation.hanja, &annotation.reading))
2333            }
2334            // `Original` keeps hanja as the primary text, so its ruby form
2335            // always uses hanja as the base regardless of any other setting.
2336            OriginalGloss::Ruby => render_ruby(annotation, RubyBase::OnHanja, allows_inline_markup),
2337        },
2338        RenderMode::Original => RenderedToken::Text(annotation.hanja.clone()),
2339    }
2340}
2341
2342fn render_ruby<S>(
2343    annotation: &Annotation,
2344    base: RubyBase,
2345    allows_inline_markup: bool,
2346) -> RenderedToken<S> {
2347    let (base_text, rt_text) = match base {
2348        RubyBase::OnHangul => (&annotation.reading, &annotation.hanja),
2349        RubyBase::OnHanja => (&annotation.hanja, &annotation.reading),
2350    };
2351    if !allows_inline_markup {
2352        return RenderedToken::Text(parens(base_text, rt_text));
2353    }
2354    RenderedToken::Ruby {
2355        base: base_text.clone(),
2356        rt: rt_text.clone(),
2357    }
2358}
2359
// Formats `primary(gloss)`: the primary text followed by the gloss in ASCII
// parentheses.
//
// Callers pass hangul or hanja on either side depending on the render mode,
// so the parameters are named by role rather than by script. The buffer is
// sized up front (both texts plus the two one-byte parentheses), which
// avoids reallocation while pushing.
fn parens(primary: &str, gloss: &str) -> String {
    let mut output = String::with_capacity(primary.len() + gloss.len() + 2);
    output.push_str(primary);
    output.push('(');
    output.push_str(gloss);
    output.push(')');
    output
}
2368
2369/// Converts plain text through reader, engine, renderer, and writer stages.
2370///
2371/// This is a convenience for the plain-text MVP path. More capable format
2372/// adapters should call the individual stages so they can preserve their own
2373/// structural tokens. The `render` argument accepts either a [`RenderMode`]
2374/// (converted via `From<RenderMode>` for [`RenderOptions`]) or a full
2375/// [`RenderOptions`] value.
2376pub fn convert_plain_text<D, R>(input: &str, dictionary: &D, render: R) -> String
2377where
2378    D: HanjaDictionary + ?Sized,
2379    R: Into<RenderOptions>,
2380{
2381    convert_plain_text_with_options(input, dictionary, render, EngineOptions::default())
2382}
2383
2384/// Converts plain text with explicit hanja conversion engine options.
2385///
2386/// This is the option-aware variant of [`convert_plain_text`].
2387pub fn convert_plain_text_with_options<D, R>(
2388    input: &str,
2389    dictionary: &D,
2390    render: R,
2391    options: EngineOptions,
2392) -> String
2393where
2394    D: HanjaDictionary + ?Sized,
2395    R: Into<RenderOptions>,
2396{
2397    let input_tokens = read_plain_text(input);
2398    let output_tokens = process_tokens_with_options(input_tokens, dictionary, options);
2399    let output_tokens = mark_homophones(output_tokens, dictionary, ContextWindow::PerBlock);
2400    let rendered_tokens = render_tokens(output_tokens, render);
2401    write_plain_text(rendered_tokens)
2402}
gukhanmun_core/lib.rs

gukhanmun_core/
lib.rs