Skip to main content

gukhanmun_core/
lib.rs

1// Gukhanmun: Core IR, engine, dictionary traits, and fallback logic for Gukhanmun.
2// Copyright (C) 2026  Hong Minhee
3//
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, either version 3 of the License, or
7// (at your option) any later version.
8//
9// This program is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12// GNU General Public License for more details.
13//
14// You should have received a copy of the GNU General Public License
15// along with this program.  If not, see <https://www.gnu.org/licenses/>.
16
17//! Core types and algorithms for Gukhanmun.
18//!
19//! This crate is the home for the format-neutral intermediate representation,
20//! conversion engine, dictionary traits, lattice segmentation, and fallback
21//! hanja reading logic. Format adapters, command-line I/O, and language
22//! bindings live in separate crates.
23
24#![no_std]
25#![forbid(unsafe_code)]
26#![deny(missing_docs)]
27
28extern crate alloc;
29
30mod fallback;
31mod generated;
32mod segment;
33
34use alloc::boxed::Box;
35use alloc::collections::{BTreeMap, BTreeSet};
36use alloc::string::{String, ToString};
37use alloc::vec::Vec;
38use core::marker::PhantomData;
39
40use fallback::{
41    FallbackPart, FallbackState, apply_initial_sound_law_to_first_syllable,
42    fallback_reading_for_run, is_hanja_numeral, khangul_all_readings,
43    phoneticize_fallback_run_with_state, phoneticize_hanja_char,
44    reading_matches_with_initial_sound_law, should_apply_yeol_yul,
45};
46use generated::unihan_readings::KHANGUL_READINGS;
47use segment::{Segment, segment_text};
48
/// Error returned by fallible core pipeline entry points.
///
/// The core engine is mostly infallible today because dictionary lookup is a
/// synchronous trait contract. This type is still the common structured error
/// surface for reader/engine/writer boundaries and for future engine
/// invariants that callers may need to inspect.
///
/// The enum is `#[non_exhaustive]`: code outside this crate must keep a
/// wildcard arm when matching on it, which lets new variants be added without
/// a breaking change.
#[derive(Debug, thiserror::Error)]
#[non_exhaustive]
pub enum Error {
    /// Loading or preparing a dictionary failed before conversion could run.
    ///
    /// The payload is a free-form description inserted into the message.
    #[error("dictionary load failed: {0}")]
    DictionaryLoad(String),

    /// Lattice segmentation failed for a specific source string.
    #[error("segmentation failed for {hanja:?}: {reason}")]
    Segmentation {
        /// The hanja source span that could not be segmented.
        hanja: String,

        /// Human-readable reason for the segmentation failure.
        reason: String,
    },

    /// A dictionary or fallback path produced a reading that is not accepted.
    #[error("invalid hangul reading {reading:?} for hanja {hanja:?}")]
    InvalidReading {
        /// The hanja source string associated with the reading.
        hanja: String,

        /// The rejected hangul reading.
        reading: String,
    },

    /// An internal invariant was violated.
    ///
    /// The payload is a `&'static str`, so the description is a compile-time
    /// literal rather than runtime-formatted text.
    #[error("internal invariant violated: {0}")]
    Internal(&'static str),

    /// A boxed error from an extension point that has no more specific core
    /// variant yet.
    ///
    /// `#[from]` generates the `From<Box<dyn Error + Send + Sync>>`
    /// conversion, so `?` can lift a boxed error into this variant, and
    /// `#[error(transparent)]` forwards `Display` and `source()` to the
    /// wrapped error instead of adding a message of its own.
    #[error(transparent)]
    Other(#[from] Box<dyn core::error::Error + Send + Sync + 'static>),
}
91
/// Stream-level error recovery policy.
///
/// `Strict` is the default and returns the first recoverable reader error.
/// `Lenient` logs the error and emits the original unrecognized region as a
/// verbatim token so downstream tokens can continue flowing.
///
/// The policy is a payload-free `Copy` value, so it can be passed by value and
/// compared with `==`.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum Recovery {
    /// Return the first reader, engine, or writer error and stop processing.
    ///
    /// Marked `#[default]`, so [`Recovery::default`] yields this variant.
    #[default]
    Strict,

    /// Preserve recoverable bad input regions and continue processing.
    Lenient,
}
106
107/// A recoverable reader error plus the original source region.
108///
109/// Readers use this value when they can identify a malformed region and know
110/// how to preserve its source bytes or text in lenient mode. Strict mode
111/// returns the stored error directly.
112#[derive(Debug)]
113pub struct RecoverableInputError {
114    original: String,
115    error: Error,
116}
117
118impl RecoverableInputError {
119    /// Creates a recoverable input error from original source and cause.
120    pub fn new(original: String, error: Error) -> Self {
121        Self { original, error }
122    }
123
124    /// Returns the original source region that can be preserved in lenient
125    /// mode.
126    pub fn original(&self) -> &str {
127        &self.original
128    }
129
130    /// Returns the structured error describing why the region was rejected.
131    pub fn error(&self) -> &Error {
132        &self.error
133    }
134
135    /// Consumes the error and returns the original source plus cause.
136    pub fn into_parts(self) -> (String, Error) {
137        (self.original, self.error)
138    }
139}
140
/// Adapter-owned data attached to an intermediate-representation scope.
///
/// The engine treats this trait as an opaque policy boundary. Format adapters
/// can encode HTML elements, Markdown events, or plain-text scopes in the
/// concrete type, while the engine only asks whether text should be preserved
/// and whether later stages may insert inline markup.
///
/// Only [`Self::is_preserve`] has to be implemented. The other methods have
/// defaults: inline markup is allowed, and a scope is neither a block nor a
/// section boundary.
pub trait ScopeData: Clone + 'static {
    /// Returns whether text inside this scope must pass through untouched.
    fn is_preserve(&self) -> bool;

    /// Returns whether inline markup may be inserted inside this scope.
    ///
    /// This flag is about *structural* permission for markup at the current
    /// position, not about whether the engine actually converts text here.
    /// A scope may legitimately set [`Self::is_preserve`] to `true` (so no
    /// annotation is produced) while still reporting `true` for this method,
    /// because preserve does not by itself restrict what a deeper non-preserved
    /// child may emit. Adapters should return `false` only when an HTML5
    /// text-only content model (such as `<title>` or `<option>`) or an
    /// analogous host rule actually forbids markup at this position.
    ///
    /// Scope-aware renderers treat inline markup as allowed only when *every*
    /// open ancestor reports `true`; a nested allow-markup scope cannot
    /// re-enable markup that an ancestor has forbidden.
    ///
    /// The default implementation returns `true`.
    fn allows_inline_markup(&self) -> bool {
        true
    }

    /// Returns whether this scope resets block-oriented stateful stages.
    ///
    /// The default implementation returns `false`.
    fn is_block_boundary(&self) -> bool {
        false
    }

    /// Returns whether this scope resets section-oriented stateful stages.
    ///
    /// The default implementation returns `false`.
    fn is_section_boundary(&self) -> bool {
        false
    }
}
179
/// One structural scope of the format-neutral token stream.
///
/// A `Scope` is a thin wrapper around whatever data the format adapter wants
/// to attach. The engine is free to clone scopes and keep them on a stack,
/// but it only ever looks at the payload through the `ScopeData` methods.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Scope<S> {
    data: S,
}

impl<S> Scope<S> {
    /// Wraps adapter-specific data in a scope.
    pub fn new(data: S) -> Self {
        Scope { data }
    }

    /// Borrows the adapter-specific payload.
    pub fn data(&self) -> &S {
        let Scope { data } = self;
        data
    }

    /// Unwraps the scope, giving the adapter-specific payload back.
    pub fn into_data(self) -> S {
        let Scope { data } = self;
        data
    }
}
206
/// A token emitted by a reader before hanja conversion has run.
///
/// This type intentionally has no annotation variant: annotations are produced
/// by the engine and consumed by renderers, so input adapters cannot inject
/// already-converted positions into the stream.
///
/// The type parameter `S` is the adapter's scope payload; it only appears
/// inside [`InputToken::Open`].
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum InputToken<S> {
    /// Enters a structural scope.
    Open(Scope<S>),

    /// Leaves the most recent structural scope.
    ///
    /// Carries no payload; it pairs with the latest unclosed `Open`.
    Close,

    /// Text that the engine may convert unless a preserving scope is active.
    Text(String),

    /// Text that must pass through untouched.
    Verbatim(String),
}
226
/// A token emitted by the engine after hanja conversion.
///
/// Most tokens pass through from `InputToken`, but converted dictionary matches
/// become `Annotated` so middlewares and renderers can choose their final
/// surface form.
///
/// The variant set is that of `InputToken` plus [`OutputToken::Annotated`].
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum OutputToken<S> {
    /// Enters a structural scope.
    Open(Scope<S>),

    /// Leaves the most recent structural scope.
    Close,

    /// Text that needs no annotation-aware rendering.
    Text(String),

    /// Text that must pass through untouched.
    Verbatim(String),

    /// A converted hanja word plus metadata for later stages.
    ///
    /// The payload does not depend on the scope type `S`.
    Annotated(Annotation),
}
249
/// A token emitted by a renderer after all annotations have been expanded.
///
/// Writers consume this stream because it cannot contain unrendered
/// annotations. That makes the renderer-to-writer boundary explicit in the type
/// system.
///
/// Relative to `OutputToken`, the `Annotated` variant is gone and
/// [`RenderedToken::Ruby`] is the only annotation-derived shape left.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum RenderedToken<S> {
    /// Enters a structural scope.
    Open(Scope<S>),

    /// Leaves the most recent structural scope.
    Close,

    /// Text ready for serialization.
    Text(String),

    /// Verbatim text ready for serialization.
    Verbatim(String),

    /// A structural ruby annotation pairing a base text with an `rt` gloss.
    ///
    /// Writers serialize this in a format-appropriate way: HTML emits a
    /// `<ruby>` element, Markdown emits inline HTML, and plain text falls back
    /// to parenthesized text. Because the variant carries the base and gloss
    /// as separate strings rather than pre-built markup, each writer is
    /// responsible for escaping the contents according to its own rules—the
    /// renderer never injects raw HTML produced by string concatenation.
    ///
    /// Renderers only emit this variant when the active scope reports
    /// [`ScopeData::allows_inline_markup`] as `true`; scopes that disallow
    /// inline markup receive a plain `Text` fallback instead.
    Ruby {
        /// Base text shown as the primary side of the ruby annotation.
        base: String,

        /// Gloss text shown in the `rt` position.
        rt: String,
    },
}
289
/// Metadata for a dictionary-backed hanja conversion.
///
/// The engine fills this value when it turns source hanja into a hangul
/// reading. The flags describe known constraints; middlewares may adjust them
/// before a renderer chooses the concrete output form.
///
/// This struct is `#[non_exhaustive]`, so additional flags can be added without
/// a breaking change. Construct it from [`Annotation::default`] and set the
/// fields you need; the public fields stay readable and writable.
///
/// `Default` is derived, so a default annotation has empty `hanja` and
/// `reading` strings and every flag set to `false`.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
#[non_exhaustive]
pub struct Annotation {
    /// The original hanja text from the input.
    pub hanja: String,

    /// The hangul reading selected for the hanja text.
    pub reading: String,

    /// Whether another hanja form in the active context shares this reading.
    pub homophone: bool,

    /// Whether rendered output must keep the original hanja visible.
    pub require_hanja: bool,

    /// Whether rendered output must include a hangul gloss when hanja remains
    /// primary.
    pub require_hangul: bool,

    /// Whether this is the first occurrence in the active context window.
    pub first_in_context: bool,

    /// Whether renderers should collapse this annotation to its primary plain
    /// text form instead of adding annotation markup or parentheses.
    pub skip_annotation: bool,

    /// Whether this annotation came from a dictionary match.
    pub from_dictionary: bool,

    /// Whether the presentation requirements
    /// ([`require_hanja`](Self::require_hanja) /
    /// [`require_hangul`](Self::require_hangul)) were requested by an explicit
    /// parenthetical gloss in the source, rather than by the dictionary.
    ///
    /// [`RedundantParenCollapser`] sets this when it collapses an author-written
    /// gloss. [`FirstOccurrenceFilter`] preserves the requirements on such
    /// annotations instead of clearing them on repeats, so a word the author
    /// glossed every time stays fully annotated every time.
    pub from_source_gloss: bool,
}
339
/// Dictionary-provided rendering constraints for a match.
///
/// Both flags are `false` in the derived [`MatchMark::default`], which is the
/// value the unmarked insertion paths in this file use.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct MatchMark {
    /// Whether this dictionary entry should always show its hanja form.
    pub require_hanja: bool,

    /// Whether this dictionary entry should always show its hangul reading.
    pub require_hangul: bool,
}
349
/// A complete dictionary entry exposed for batch policy analysis.
///
/// Conversion only needs prefix lookup through [`HanjaDictionary::matches_at`],
/// but middlewares such as homophone marking need to reason about the effective
/// entry set without repeatedly probing the dictionary. Backends that can
/// enumerate entries should return these records from
/// [`HanjaDictionary::entries`].
///
/// Unlike [`Match`], a record stores the full hanja key instead of a byte
/// length, and it has no field for a position-dependent suffix reading.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct DictionaryRecord {
    /// The hanja spelling stored as a dictionary key.
    pub hanja: String,

    /// The hangul reading selected for this hanja spelling.
    pub reading: String,

    /// Dictionary-provided rendering constraints for this entry.
    pub mark: MatchMark,
}
368
/// A dictionary match that starts at the queried cursor position.
///
/// The matched hanja text itself is not stored; only its length is.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Match {
    /// The matched prefix length in UTF-8 bytes.
    ///
    /// Note the unit: this is bytes, whereas
    /// [`HanjaDictionary::max_word_chars`] counts Unicode scalar values.
    pub byte_len: usize,

    /// The hangul reading for the matched hanja prefix.
    ///
    /// This is the word-initial reading, which already reflects South Korean
    /// initial sound law where it applies (for example `年` reads `연`).
    pub reading: String,

    /// The reading to use when this match is *not* word-initial, when it
    /// differs from [`Match::reading`] by initial sound law.
    ///
    /// Dictionaries set this for multi-syllable entries whose leading morpheme
    /// keeps its original sound outside word-initial position, as the Standard
    /// Korean Language Dictionary records through its suffix and bound-noun
    /// head words (for example `年代` reads `연대` word-initially but `년대`
    /// after a number). Single-hanja initial sound law is handled by the engine
    /// from the bundled unihan readings and does not need this field. `None`
    /// means the reading is position independent.
    pub suffix_reading: Option<String>,

    /// Dictionary-provided rendering constraints for this match.
    pub mark: MatchMark,
}
396
397/// A hanja dictionary queried by the conversion engine.
398///
399/// The key operation returns every entry that starts at the beginning of the
400/// supplied string. This shape supports lattice segmentation because the
401/// engine must consider every candidate path through a hanja run.
402pub trait HanjaDictionary {
403    /// Yields every dictionary match that starts at the beginning of `s`.
404    fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a>;
405
406    /// Returns the greatest dictionary entry length in Unicode scalar values.
407    fn max_word_chars(&self) -> Option<usize> {
408        None
409    }
410
411    /// Enumerates complete dictionary entries when the backend supports it.
412    ///
413    /// The default returns `None`, which keeps custom lookup-only dictionaries
414    /// valid. Homophone-aware middlewares use this as an optional batch path so
415    /// built-in backends can avoid per-token full-dictionary scans.
416    fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
417        None
418    }
419
420    /// Returns whether another hanja spelling has the same hangul reading.
421    fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
422        self.entries().is_some_and(|mut entries| {
423            entries.any(|record| record.hanja != hanja && record.reading == reading)
424        })
425    }
426}
427
428impl<D> HanjaDictionary for &D
429where
430    D: HanjaDictionary + ?Sized,
431{
432    fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
433        (**self).matches_at(s)
434    }
435
436    fn max_word_chars(&self) -> Option<usize> {
437        (**self).max_word_chars()
438    }
439
440    fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
441        (**self).entries()
442    }
443
444    fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
445        (**self).has_homophone(hanja, reading)
446    }
447}
448
449impl<D> HanjaDictionary for Box<D>
450where
451    D: HanjaDictionary + ?Sized,
452{
453    fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
454        (**self).matches_at(s)
455    }
456
457    fn max_word_chars(&self) -> Option<usize> {
458        (**self).max_word_chars()
459    }
460
461    fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
462        (**self).entries()
463    }
464
465    fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
466        (**self).has_homophone(hanja, reading)
467    }
468}
469
470/// Per-character Unihan fallback readings exposed as a dictionary.
471///
472/// This type reads the same generated `kHangul` table used by the engine's
473/// fallback phoneticizer, but it deliberately returns canonical pre-initial
474/// sound law readings. Stateful orthographic rules such as the initial sound
475/// law, `列`/`律`, and numeral grouping remain engine fallback behavior rather
476/// than dictionary behavior.
477#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
478pub struct UnihanCharDict;
479
480impl HanjaDictionary for UnihanCharDict {
481    fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
482        let matched = s.chars().next().and_then(|ch| {
483            khangul_reading(ch).map(|reading| Match {
484                byte_len: ch.len_utf8(),
485                reading: reading.to_string(),
486                suffix_reading: None,
487                mark: MatchMark::default(),
488            })
489        });
490        Box::new(matched.into_iter())
491    }
492
493    fn max_word_chars(&self) -> Option<usize> {
494        Some(1)
495    }
496
497    fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
498        Some(Box::new(KHANGUL_READINGS.iter().map(|(hanja, reading)| {
499            DictionaryRecord {
500                hanja: hanja.to_string(),
501                reading: reading.to_string(),
502                mark: MatchMark::default(),
503            }
504        })))
505    }
506
507    fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
508        let mut chars = hanja.chars();
509        let Some(hanja) = chars.next() else {
510            return false;
511        };
512        if chars.next().is_some() {
513            return false;
514        }
515        KHANGUL_READINGS
516            .iter()
517            .any(|&(other_hanja, other_reading)| other_hanja != hanja && other_reading == reading)
518    }
519}
520
/// Several dictionaries layered in a caller-chosen priority order.
///
/// The first dictionary has the highest priority and the last the lowest.
/// A lookup reports matches of every distinct byte length, so the lattice
/// segmenter can still weigh a short high-priority entry against a longer
/// low-priority one. Where two dictionaries offer a match of the same byte
/// length, the one from the higher-priority dictionary wins and the other is
/// dropped.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct ChainDictionary<D> {
    dictionaries: Vec<D>,
}

impl<D> ChainDictionary<D> {
    /// Starts a chain with no dictionaries in it.
    pub fn new() -> Self {
        ChainDictionary {
            dictionaries: Vec::new(),
        }
    }

    /// Adds `dictionary` at the end, below everything already in the chain.
    pub fn push(&mut self, dictionary: D) {
        self.dictionaries.push(dictionary)
    }

    /// Counts the dictionaries in the chain.
    pub fn len(&self) -> usize {
        self.dictionaries().len()
    }

    /// Tells whether the chain holds no dictionary at all.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Borrows the dictionaries, highest priority first.
    pub fn dictionaries(&self) -> &[D] {
        self.dictionaries.as_slice()
    }

    /// Dismantles the chain into its dictionaries, highest priority first.
    pub fn into_dictionaries(self) -> Vec<D> {
        let ChainDictionary { dictionaries } = self;
        dictionaries
    }
}
566
567impl<D> FromIterator<D> for ChainDictionary<D> {
568    fn from_iter<T: IntoIterator<Item = D>>(iter: T) -> Self {
569        Self {
570            dictionaries: Vec::from_iter(iter),
571        }
572    }
573}
574
575impl<D> HanjaDictionary for ChainDictionary<D>
576where
577    D: HanjaDictionary,
578{
579    fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
580        let mut seen_lengths = BTreeSet::new();
581        let mut matches = Vec::new();
582
583        for dictionary in &self.dictionaries {
584            for matched in dictionary.matches_at(s) {
585                if seen_lengths.insert(matched.byte_len) {
586                    matches.push(matched);
587                }
588            }
589        }
590
591        matches.sort_by_key(|matched| matched.byte_len);
592        Box::new(matches.into_iter())
593    }
594
595    fn max_word_chars(&self) -> Option<usize> {
596        let mut max = None;
597        for dictionary in &self.dictionaries {
598            let word_chars = dictionary.max_word_chars()?;
599            max = Some(max.map_or(word_chars, |current: usize| current.max(word_chars)));
600        }
601        max
602    }
603
604    fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
605        let mut records = BTreeMap::<String, DictionaryRecord>::new();
606
607        for dictionary in &self.dictionaries {
608            for record in dictionary.entries()? {
609                records.entry(record.hanja.clone()).or_insert(record);
610            }
611        }
612
613        Some(Box::new(records.into_values()))
614    }
615
616    fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
617        if let Some(mut records) = self.entries() {
618            return records.any(|record| record.hanja != hanja && record.reading == reading);
619        }
620
621        self.dictionaries
622            .iter()
623            .any(|dictionary| dictionary.has_homophone(hanja, reading))
624    }
625}
626
627fn khangul_reading(ch: char) -> Option<&'static str> {
628    KHANGUL_READINGS
629        .binary_search_by_key(&ch, |(hanja, _)| *hanja)
630        .ok()
631        .map(|index| KHANGUL_READINGS[index].1)
632}
633
634/// Engine-level options that affect hanja conversion before rendering.
635///
636/// These options apply to fallback text that is not covered by the supplied
637/// dictionary. Dictionary matches are assumed to already contain the desired
638/// reading and are not rewritten by fallback orthography rules.
639#[derive(Clone, Copy, Debug, Eq, PartialEq)]
640pub struct EngineOptions {
641    /// How hanja-containing spans are split into dictionary and fallback
642    /// segments.
643    pub segmentation: SegmentationStrategy,
644
645    /// Whether fallback readings should apply South Korean initial sound law.
646    pub initial_sound_law: bool,
647
648    /// How fallback hanja numerals are rendered.
649    pub numeral_strategy: NumeralStrategy,
650}
651
652impl Default for EngineOptions {
653    fn default() -> Self {
654        Self {
655            segmentation: SegmentationStrategy::Lattice,
656            initial_sound_law: true,
657            numeral_strategy: NumeralStrategy::HangulPhonetic,
658        }
659    }
660}
661
/// Strategy used to segment hanja-containing spans.
///
/// `Lattice` considers every dictionary path and chooses the best coverage,
/// while `Eager` greedily takes the longest match at each cursor. The eager
/// strategy can reduce work for callers that prefer speed over segmentation
/// accuracy.
///
/// The enum is `#[non_exhaustive]`, so code outside this crate must keep a
/// wildcard arm when matching on it.
#[non_exhaustive]
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum SegmentationStrategy {
    /// Use dynamic programming to maximize dictionary coverage.
    ///
    /// Marked `#[default]`, so [`SegmentationStrategy::default`] yields this
    /// variant.
    #[default]
    Lattice,

    /// Use left-to-right eager longest-match segmentation.
    Eager,
}
678
/// Strategy for rendering hanja numerals.
///
/// The hangul phonetic strategy is fallback-only, so dictionary matches keep
/// their lexicalized readings. Arabic strategies also participate in
/// segmentation as plain-text numeral edges, allowing numeric normalization to
/// take precedence over dictionary calendar entries such as `六月`.
///
/// No `Default` is derived for this enum; `EngineOptions::default` picks
/// [`NumeralStrategy::HangulPhonetic`]. The enum is `#[non_exhaustive]`, so
/// code outside this crate must keep a wildcard arm when matching on it.
#[non_exhaustive]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum NumeralStrategy {
    /// Render hanja numerals as their hangul phonetic readings.
    ///
    /// This strategy emits fallback annotations so renderers can still expose
    /// the original hanja in annotation-oriented render modes.
    HangulPhonetic,

    /// Normalize positional digit-only hanja numerals to Arabic digits.
    ///
    /// Arabic normalization emits plain text rather than annotations. Renderers
    /// and user directives therefore cannot later recover the original numeral
    /// hanja for the normalized span.
    PositionalArabic,

    /// Normalize additive hanja numerals with place markers to Arabic digits.
    ///
    /// This parser handles small units such as `十`, `百`, and `千` and large
    /// units through `澗`. Malformed or overflowing numerals fall back to
    /// [`NumeralStrategy::HangulPhonetic`] for that run.
    AdditiveArabic,

    /// Choose Arabic normalization for common numeric contexts and otherwise
    /// keep hangul phonetic fallback behavior.
    ///
    /// Additive numerals are normalized to Arabic when they begin with a digit.
    /// Runs that begin with a small place marker such as `十`, `百`, or `千`
    /// are normalized only when the next character is not an ambiguous
    /// non-unit hanja word character. Pure positional digit runs are normalized
    /// when they contain at least four digits (matching common year notation)
    /// or when a unit hanja (`年月日時分秒號世紀` and so on) immediately follows.
    /// Standalone large place markers such as `萬` or `京`, and other
    /// ambiguous numerals, remain hangul annotations.
    Smart,
}
721
/// Value stored for one hanja key inside [`MapDictionary`].
///
/// The hanja spelling itself is the map key, so it is not repeated here.
#[derive(Clone, Debug, Eq, PartialEq)]
struct DictionaryEntry {
    /// Word-initial reading; surfaced as [`Match::reading`] and
    /// [`DictionaryRecord::reading`].
    reading: String,
    /// Optional non-word-initial reading; surfaced as
    /// [`Match::suffix_reading`] only, since records have no such field.
    suffix_reading: Option<String>,
    /// Rendering constraints copied onto every match and record.
    mark: MatchMark,
}
728
729/// A small in-memory dictionary backed by an ordered map.
730///
731/// This implementation is intended for tests, user-supplied custom entries,
732/// and early pipeline validation. It returns all prefix matches at a cursor so
733/// the engine can score every candidate path through a hanja run.
734#[derive(Clone, Debug, Default, Eq, PartialEq)]
735pub struct MapDictionary {
736    entries: BTreeMap<String, DictionaryEntry>,
737    max_word_chars: Option<usize>,
738}
739
740impl MapDictionary {
741    /// Creates an empty map dictionary.
742    pub fn new() -> Self {
743        Self::default()
744    }
745
746    /// Inserts an entry with no special rendering constraints.
747    pub fn insert(&mut self, hanja: impl Into<String>, reading: impl Into<String>) {
748        self.insert_marked(hanja, reading, MatchMark::default());
749    }
750
751    /// Inserts an entry with dictionary-provided rendering constraints.
752    pub fn insert_marked(
753        &mut self,
754        hanja: impl Into<String>,
755        reading: impl Into<String>,
756        mark: MatchMark,
757    ) {
758        self.insert_entry(hanja, reading, None, mark);
759    }
760
761    /// Inserts an entry that carries a distinct non-word-initial reading.
762    ///
763    /// `suffix` is the reading used when the match is not word-initial (see
764    /// [`Match::suffix_reading`]); `reading` is the word-initial reading.
765    pub fn insert_with_suffix(
766        &mut self,
767        hanja: impl Into<String>,
768        reading: impl Into<String>,
769        suffix: impl Into<String>,
770    ) {
771        self.insert_entry(hanja, reading, Some(suffix.into()), MatchMark::default());
772    }
773
774    fn insert_entry(
775        &mut self,
776        hanja: impl Into<String>,
777        reading: impl Into<String>,
778        suffix_reading: Option<String>,
779        mark: MatchMark,
780    ) {
781        let hanja = hanja.into();
782        let word_chars = hanja.chars().count();
783        self.max_word_chars = Some(self.max_word_chars.map_or(word_chars, |max| {
784            if word_chars > max { word_chars } else { max }
785        }));
786        self.entries.insert(
787            hanja,
788            DictionaryEntry {
789                reading: reading.into(),
790                suffix_reading,
791                mark,
792            },
793        );
794    }
795
796    /// Returns whether the dictionary has no entries.
797    pub fn is_empty(&self) -> bool {
798        self.entries.is_empty()
799    }
800
801    /// Returns the number of dictionary entries.
802    pub fn len(&self) -> usize {
803        self.entries.len()
804    }
805}
806
807impl HanjaDictionary for MapDictionary {
808    fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
809        Box::new(
810            self.entries
811                .iter()
812                .filter(move |(hanja, _)| s.starts_with(hanja.as_str()))
813                .map(|(hanja, entry)| Match {
814                    byte_len: hanja.len(),
815                    reading: entry.reading.clone(),
816                    suffix_reading: entry.suffix_reading.clone(),
817                    mark: entry.mark,
818                }),
819        )
820    }
821
822    fn max_word_chars(&self) -> Option<usize> {
823        self.max_word_chars
824    }
825
826    fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
827        Some(Box::new(self.entries.iter().map(|(hanja, entry)| {
828            DictionaryRecord {
829                hanja: hanja.clone(),
830                reading: entry.reading.clone(),
831                mark: entry.mark,
832            }
833        })))
834    }
835
836    fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
837        self.entries
838            .iter()
839            .any(|(other_hanja, entry)| other_hanja != hanja && entry.reading == reading)
840    }
841}
842
/// Scope data used by the plain-text adapter.
///
/// Plain text has no preserved regions or block boundaries, and inline markup
/// such as `<ruby>` is not meaningful in a plain-text stream. Reporting
/// [`ScopeData::allows_inline_markup`] as `false` lets scope-aware renderers
/// fall back to parenthesized text before any [`RenderedToken::Ruby`] reaches
/// the plain-text writer.
///
/// This is a unit type: it carries no per-scope state.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct PlainScopeData;
852
impl ScopeData for PlainScopeData {
    /// Always `false`: a plain scope never marks its text as preserved.
    fn is_preserve(&self) -> bool {
        false
    }

    /// Always `false`: inline markup is not allowed inside a plain scope.
    fn allows_inline_markup(&self) -> bool {
        false
    }
}
862
863/// Reads a plain-text string into the core input-token stream.
864///
865/// The adapter wraps the input in a plain scope and emits the entire input as a
866/// single `Text` token.
867pub fn read_plain_text(input: &str) -> Vec<InputToken<PlainScopeData>> {
868    Vec::from([
869        InputToken::Open(Scope::new(PlainScopeData)),
870        InputToken::Text(input.to_string()),
871        InputToken::Close,
872    ])
873}
874
875/// Writes rendered plain-text tokens back to a string.
876///
877/// Structural tokens are ignored because plain text has no serialized scope
878/// markers. `Text` and `Verbatim` tokens are concatenated in stream order.
879/// `Ruby` tokens are not expected because [`PlainScopeData`] disallows inline
880/// markup, but they are defensively serialized as `base(rt)` rather than
881/// dropped silently if one ever reaches the writer.
882pub fn write_plain_text<S>(tokens: impl IntoIterator<Item = RenderedToken<S>>) -> String {
883    let mut output = String::new();
884    for token in tokens {
885        match token {
886            RenderedToken::Open(_) | RenderedToken::Close => {}
887            RenderedToken::Text(text) | RenderedToken::Verbatim(text) => output.push_str(&text),
888            RenderedToken::Ruby { base, rt } => {
889                output.push_str(&parens(&base, &rt));
890            }
891        }
892    }
893    output
894}
895
896/// Processes input tokens with the default hanja conversion engine options.
897///
898/// The engine preserves structural and verbatim tokens, skips text when the
899/// current scope is preserving, and uses lattice segmentation to annotate
900/// dictionary and fallback matches inside text tokens.
901pub fn process_tokens<S, D>(
902    tokens: impl IntoIterator<Item = InputToken<S>>,
903    dictionary: &D,
904) -> Vec<OutputToken<S>>
905where
906    S: ScopeData,
907    D: HanjaDictionary + ?Sized,
908{
909    process_tokens_iter(tokens, dictionary).collect()
910}
911
912/// Processes input tokens through the default engine options and returns an
913/// iterator over the collected output.
914///
915/// This is an iterator-shaped compatibility adapter, not the low-level
916/// streaming surface: it consumes the supplied input before returning. For
917/// true incremental processing, use [`Engine`] directly and call
918/// [`Engine::push_token`] as chunks arrive.
919pub fn process_tokens_iter<S, D>(
920    tokens: impl IntoIterator<Item = InputToken<S>>,
921    dictionary: &D,
922) -> alloc::vec::IntoIter<OutputToken<S>>
923where
924    S: ScopeData,
925    D: HanjaDictionary + ?Sized,
926{
927    process_tokens_with_options(tokens, dictionary, EngineOptions::default()).into_iter()
928}
929
930/// Processes input tokens with explicit hanja conversion engine options.
931///
932/// This is the lower-level entry point for callers that need to disable
933/// fallback initial sound law or choose a non-default numeral strategy.
934pub fn process_tokens_with_options<S, D>(
935    tokens: impl IntoIterator<Item = InputToken<S>>,
936    dictionary: &D,
937    options: EngineOptions,
938) -> Vec<OutputToken<S>>
939where
940    S: ScopeData,
941    D: HanjaDictionary + ?Sized,
942{
943    let mut engine = Engine::collecting(dictionary, options);
944    let mut output = Vec::new();
945
946    for token in tokens {
947        output.extend(engine.push_token(token));
948    }
949
950    output.extend(engine.finish());
951    output
952}
953
954/// Processes input tokens through explicit engine options and returns an
955/// iterator over the collected output.
956///
957/// This convenience adapter preserves the existing collect-into-`Vec` behavior
958/// while exposing an iterator-shaped API for callers that compose pipeline
959/// stages. Use [`Engine`] for chunk-by-chunk output.
960pub fn process_tokens_iter_with_options<S, D>(
961    tokens: impl IntoIterator<Item = InputToken<S>>,
962    dictionary: &D,
963    options: EngineOptions,
964) -> alloc::vec::IntoIter<OutputToken<S>>
965where
966    S: ScopeData,
967    D: HanjaDictionary + ?Sized,
968{
969    process_tokens_with_options(tokens, dictionary, options).into_iter()
970}
971
972/// Resolves a fallible reader token stream into recovered input tokens.
973///
974/// This is the single place where the stream-level [`Recovery`] policy is
975/// applied to a reader's output. Format adapters (such as the HTML scanner)
976/// emit `Ok(InputToken)` for well-formed regions and
977/// `Err(RecoverableInputError)` for malformed regions they can describe and
978/// preserve; this function turns that stream into the plain
979/// [`InputToken`] sequence the rest of the pipeline consumes:
980///
981///  -  In [`Recovery::Strict`] mode the first error stops processing and its
982///     cause is returned, so the caller never sees a partial token stream.
983///  -  In [`Recovery::Lenient`] mode each error is logged at `warn` level once
984///     and replaced by an [`InputToken::Verbatim`] holding the original source
985///     region, so the malformed bytes pass through untouched while surrounding
986///     tokens continue to flow.
987///
988/// It sits one stage before the [`Engine`]: feed its output into
989/// [`process_tokens_with_options`] or a streaming [`Engine`]. The recovery-aware
990/// engine entry points ([`process_fallible_tokens`] and
991/// [`process_fallible_tokens_with_options`]) are thin wrappers that call this
992/// and then run the engine.
993pub fn recover_input_tokens<S>(
994    tokens: impl IntoIterator<Item = Result<InputToken<S>, RecoverableInputError>>,
995    recovery: Recovery,
996) -> Result<Vec<InputToken<S>>, Error>
997where
998    S: ScopeData,
999{
1000    let mut recovered = Vec::new();
1001    for token in tokens {
1002        recovered.push(recover_input_token(token, recovery)?);
1003    }
1004    Ok(recovered)
1005}
1006
1007/// Resolves one fallible reader item according to a [`Recovery`] policy.
1008///
1009/// This is the per-token form of [`recover_input_tokens`] for streaming
1010/// pipelines. In strict mode an error is returned immediately. In lenient mode
1011/// the error is logged once and replaced with an [`InputToken::Verbatim`]
1012/// carrying the original malformed region.
1013pub fn recover_input_token<S>(
1014    token: Result<InputToken<S>, RecoverableInputError>,
1015    recovery: Recovery,
1016) -> Result<InputToken<S>, Error>
1017where
1018    S: ScopeData,
1019{
1020    match token {
1021        Ok(token) => Ok(token),
1022        Err(error) => match recovery {
1023            Recovery::Strict => Err(error.into_parts().1),
1024            Recovery::Lenient => {
1025                let (original, error) = error.into_parts();
1026                tracing::warn!(error = %error, "recovering from input reader error");
1027                Ok(InputToken::Verbatim(original))
1028            }
1029        },
1030    }
1031}
1032
1033/// Processes fallible input tokens with default engine options.
1034///
1035/// Reader errors are handled according to `recovery`. In strict mode the first
1036/// error is returned. In lenient mode each recoverable region is logged and
1037/// emitted as `OutputToken::Verbatim`, after which later tokens continue
1038/// through the normal engine path.
1039pub fn process_fallible_tokens<S, D>(
1040    tokens: impl IntoIterator<Item = Result<InputToken<S>, RecoverableInputError>>,
1041    dictionary: &D,
1042    recovery: Recovery,
1043) -> Result<Vec<OutputToken<S>>, Error>
1044where
1045    S: ScopeData,
1046    D: HanjaDictionary + ?Sized,
1047{
1048    process_fallible_tokens_with_options(tokens, dictionary, EngineOptions::default(), recovery)
1049}
1050
1051/// Processes fallible input tokens with explicit engine options.
1052///
1053/// This is the recovery-aware counterpart to
1054/// [`process_tokens_with_options`]. It does not make the dictionary trait
1055/// fallible; it only handles reader errors that carry enough original source
1056/// text for lenient preservation.
1057pub fn process_fallible_tokens_with_options<S, D>(
1058    tokens: impl IntoIterator<Item = Result<InputToken<S>, RecoverableInputError>>,
1059    dictionary: &D,
1060    options: EngineOptions,
1061    recovery: Recovery,
1062) -> Result<Vec<OutputToken<S>>, Error>
1063where
1064    S: ScopeData,
1065    D: HanjaDictionary + ?Sized,
1066{
1067    let recovered = recover_input_tokens(tokens, recovery)?;
1068    Ok(process_tokens_with_options(recovered, dictionary, options))
1069}
1070
/// Stateful hanja conversion engine for chunked token streams.
///
/// `Engine` is the low-level streaming surface. Call [`Engine::push_token`] for
/// each incoming token and then [`Engine::finish`] once the upstream reader is
/// exhausted. When the dictionary reports a maximum word length, text chunks are
/// buffered only at the tail so dictionary matches can cross chunk boundaries
/// without requiring the whole document in memory. A trailing fallback hanja run
/// is also kept buffered until a non-convertible boundary or EOF so render modes
/// that expose annotation spans match one-shot conversion. Dictionaries with an
/// unknown maximum keep hanja-containing text until a non-convertible boundary
/// or EOF so long custom entries remain observable.
pub struct Engine<'a, S, D>
where
    S: ScopeData,
    D: HanjaDictionary + ?Sized,
{
    /// Dictionary used for segmentation and for the maximum word length that
    /// bounds the buffered tail.
    dictionary: &'a D,
    /// Conversion options passed to every segmentation and rendering call.
    options: EngineOptions,
    /// Stack of currently open scopes: pushed on `Open`, popped on `Close`.
    /// The top entry decides whether incoming text is preserved.
    scopes: Vec<Scope<S>>,
    /// Text received from `Text` tokens that has not been converted yet.
    pending_text: String,
    /// Byte length `pending_text` had when it was last found to be a single
    /// fallback run that cannot be flushed yet; `None` when no such
    /// observation is current.
    pending_unflushable_fallback_run_bytes: Option<usize>,
    /// Fallback reading state carried across flushes and reset at block
    /// boundaries, preserved text, and verbatim tokens.
    fallback_state: FallbackState,
    /// Whether `push_token` tries to emit safe output after each text chunk.
    /// Collecting engines set this to `false` and emit only on full flushes.
    incremental_flush: bool,
}
1095
1096impl<'a, S, D> Engine<'a, S, D>
1097where
1098    S: ScopeData,
1099    D: HanjaDictionary + ?Sized,
1100{
1101    /// Creates a streaming engine with default options.
1102    pub fn new(dictionary: &'a D) -> Self {
1103        Self::with_options(dictionary, EngineOptions::default())
1104    }
1105
    /// Creates a streaming engine with explicit conversion options.
    ///
    /// Incremental flushing is enabled, so [`Engine::push_token`] may return
    /// output before the stream ends.
    pub fn with_options(dictionary: &'a D, options: EngineOptions) -> Self {
        Self::with_incremental_flush(dictionary, options, true)
    }
1110
    /// Creates an engine with incremental flushing disabled.
    ///
    /// Text tokens are only accumulated; pending text is converted when a
    /// non-text token arrives, when text arrives inside a preserving scope,
    /// or when the engine is flushed or finished.
    fn collecting(dictionary: &'a D, options: EngineOptions) -> Self {
        Self::with_incremental_flush(dictionary, options, false)
    }
1114
    /// Shared constructor behind the public and collecting constructors.
    ///
    /// Logs the configured segmentation strategy at debug level, then builds
    /// an engine with no open scopes, no pending text, and default fallback
    /// state. `incremental_flush` selects whether `push_token` attempts to
    /// emit output after every text chunk.
    fn with_incremental_flush(
        dictionary: &'a D,
        options: EngineOptions,
        incremental_flush: bool,
    ) -> Self {
        tracing::debug!(
            strategy = ?options.segmentation,
            "engine created with segmentation strategy"
        );
        Self {
            dictionary,
            options,
            scopes: Vec::new(),
            pending_text: String::new(),
            pending_unflushable_fallback_run_bytes: None,
            fallback_state: FallbackState::default(),
            incremental_flush,
        }
    }
1134
1135    /// Pushes one input token and returns output tokens that are now safe to
1136    /// emit.
1137    pub fn push_token(&mut self, token: InputToken<S>) -> Vec<OutputToken<S>> {
1138        let mut output = Vec::new();
1139        match token {
1140            InputToken::Open(scope) => {
1141                self.flush_into(&mut output);
1142                if scope.data().is_block_boundary() {
1143                    self.reset_fallback_context();
1144                }
1145                self.scopes.push(scope.clone());
1146                output.push(OutputToken::Open(scope));
1147            }
1148            InputToken::Close => {
1149                self.flush_into(&mut output);
1150                let closes_block_boundary = self
1151                    .scopes
1152                    .pop()
1153                    .is_some_and(|scope| scope.data().is_block_boundary());
1154                output.push(OutputToken::Close);
1155                if closes_block_boundary {
1156                    self.reset_fallback_context();
1157                }
1158            }
1159            InputToken::Text(text) => {
1160                if self
1161                    .scopes
1162                    .last()
1163                    .is_some_and(|scope| scope.data().is_preserve())
1164                {
1165                    self.flush_into(&mut output);
1166                    self.reset_fallback_context();
1167                    output.push(OutputToken::Text(text));
1168                } else {
1169                    let previous_pending_bytes = self.pending_text.len();
1170                    self.pending_text.push_str(&text);
1171                    if self
1172                        .pending_unflushable_fallback_run_bytes
1173                        .is_some_and(|bytes| bytes == previous_pending_bytes)
1174                    {
1175                        self.pending_unflushable_fallback_run_bytes = Some(previous_pending_bytes);
1176                    } else {
1177                        self.pending_unflushable_fallback_run_bytes = None;
1178                    }
1179                    if self.incremental_flush {
1180                        self.flush_safe_into(&mut output);
1181                    }
1182                }
1183            }
1184            InputToken::Verbatim(text) => {
1185                self.flush_into(&mut output);
1186                self.reset_fallback_context();
1187                output.push(OutputToken::Verbatim(text));
1188            }
1189        }
1190        output
1191    }
1192
    /// Flushes all pending text without ending the engine.
    ///
    /// Returns the converted output for everything that was buffered; the
    /// result is empty when nothing was pending. The engine stays usable
    /// afterwards.
    pub fn flush(&mut self) -> Vec<OutputToken<S>> {
        let mut output = Vec::new();
        self.flush_into(&mut output);
        output
    }
1199
    /// Finishes the stream and returns every remaining output token.
    ///
    /// Consumes the engine and performs one final [`Engine::flush`].
    pub fn finish(mut self) -> Vec<OutputToken<S>> {
        self.flush()
    }
1204
    /// Returns the number of Unicode scalar values currently buffered.
    ///
    /// The pending text is walked on every call, so the cost grows with the
    /// size of the buffer.
    pub fn buffered_chars(&self) -> usize {
        self.pending_text.chars().count()
    }
1209
1210    fn tail_bound(&self) -> Option<usize> {
1211        self.dictionary.max_word_chars().filter(|bound| *bound > 0)
1212    }
1213
1214    fn flush_safe_into(&mut self, output: &mut Vec<OutputToken<S>>) {
1215        if self.pending_text.is_empty() {
1216            return;
1217        }
1218        if !self.pending_text.chars().any(is_hanja) {
1219            self.flush_non_hanja_safe_into(output);
1220            return;
1221        }
1222
1223        let Some(bound) = self.tail_bound() else {
1224            let Some(flush_end) = safe_unknown_bound_flush_end(&self.pending_text) else {
1225                return;
1226            };
1227            self.flush_prefix_into(flush_end, output);
1228            if !self.pending_text.chars().any(is_hanja) {
1229                self.flush_non_hanja_safe_into(output);
1230            }
1231            return;
1232        };
1233        if let Some(flush_end) = safe_unknown_bound_flush_end(&self.pending_text) {
1234            self.flush_prefix_into(flush_end, output);
1235            if !self.pending_text.chars().any(is_hanja) {
1236                self.flush_non_hanja_safe_into(output);
1237            }
1238            return;
1239        }
1240        let buffered_chars = self.buffered_chars();
1241        if buffered_chars > bound.saturating_mul(10) {
1242            tracing::debug!(
1243                buffered_chars,
1244                dict_max_word_chars = bound,
1245                "streaming tail buffer is unusually large"
1246            );
1247        }
1248        if buffered_chars <= bound {
1249            return;
1250        }
1251
1252        if self.extends_unflushable_fallback_run(bound) {
1253            self.pending_unflushable_fallback_run_bytes = Some(self.pending_text.len());
1254            return;
1255        }
1256
1257        let safe_chars = buffered_chars.saturating_sub(bound).saturating_add(1);
1258        let segments = segment_text(&self.pending_text, self.dictionary, self.options);
1259        let mut flush_end = 0;
1260        let mut flush_segments = Vec::new();
1261        for segment in &segments {
1262            let (byte_start, byte_end) = segment_bounds(segment);
1263            let start_chars = self.pending_text[..byte_start].chars().count();
1264            let end_chars = self.pending_text[..byte_end].chars().count();
1265            if byte_start > flush_end || (start_chars > safe_chars && flush_end > 0) {
1266                break;
1267            }
1268            if end_chars > safe_chars {
1269                break;
1270            }
1271            flush_end = byte_end;
1272            flush_segments.push(segment.clone());
1273        }
1274
1275        // Fallback runs render as one annotation in non-default render modes.
1276        // Keep a trailing fallback run buffered because the next chunk may
1277        // extend it, even when the dictionary lookahead bound is only one char.
1278        if let Some(fallback_start) = trailing_fallback_run_start(&segments, flush_end) {
1279            flush_end = fallback_start;
1280            while flush_segments
1281                .last()
1282                .is_some_and(|segment| segment_bounds(segment).1 > flush_end)
1283            {
1284                flush_segments.pop();
1285            }
1286        }
1287
1288        if flush_end > 0 {
1289            self.pending_unflushable_fallback_run_bytes = None;
1290            self.flush_segments_prefix_into(flush_end, &flush_segments, output);
1291            if !self.pending_text.chars().any(is_hanja) {
1292                self.flush_non_hanja_safe_into(output);
1293            }
1294        } else if trailing_fallback_run_start(&segments, self.pending_text.len()) == Some(0) {
1295            self.pending_unflushable_fallback_run_bytes = Some(self.pending_text.len());
1296        }
1297    }
1298
1299    fn extends_unflushable_fallback_run(&self, bound: usize) -> bool {
1300        let Some(previous_bytes) = self.pending_unflushable_fallback_run_bytes else {
1301            return false;
1302        };
1303        if previous_bytes == 0
1304            || previous_bytes > self.pending_text.len()
1305            || !self.pending_text.is_char_boundary(previous_bytes)
1306        {
1307            return false;
1308        }
1309
1310        let appended = &self.pending_text[previous_bytes..];
1311        if appended.is_empty() {
1312            return true;
1313        }
1314        if appended.chars().any(|ch| !is_hanja(ch)) {
1315            return false;
1316        }
1317
1318        // The existing prefix was already segmented as one fallback run.  Only
1319        // the old suffix that can participate in a cross-chunk dictionary match
1320        // and the newly appended text need to be inspected here.
1321        let probe_start = suffix_start_for_char_count(
1322            &self.pending_text[..previous_bytes],
1323            bound.saturating_sub(1),
1324        );
1325        let probe = &self.pending_text[probe_start..];
1326        segment_text(probe, self.dictionary, self.options)
1327            .iter()
1328            .all(|segment| {
1329                matches!(
1330                    segment,
1331                    Segment::Fallback { .. } | Segment::TrivialDictionary { .. }
1332                )
1333            })
1334    }
1335
1336    fn flush_non_hanja_safe_into(&mut self, output: &mut Vec<OutputToken<S>>) {
1337        let flush_end = match self.tail_bound() {
1338            Some(bound) => safe_non_hanja_flush_end(&self.pending_text, bound),
1339            None => safe_unknown_bound_flush_end(&self.pending_text),
1340        };
1341        if let Some(flush_end) = flush_end {
1342            self.flush_prefix_into(flush_end, output);
1343        }
1344    }
1345
1346    fn flush_prefix_into(&mut self, flush_end: usize, output: &mut Vec<OutputToken<S>>) {
1347        if flush_end == self.pending_text.len() {
1348            self.flush_into(output);
1349            return;
1350        }
1351        self.pending_unflushable_fallback_run_bytes = None;
1352        let prefix = self.pending_text[..flush_end].to_string();
1353        let segments = segment_text(&prefix, self.dictionary, self.options);
1354        self.flush_segments_prefix_into(flush_end, &segments, output);
1355    }
1356
1357    fn flush_segments_prefix_into(
1358        &mut self,
1359        flush_end: usize,
1360        segments: &[Segment],
1361        output: &mut Vec<OutputToken<S>>,
1362    ) {
1363        let prefix = self.pending_text[..flush_end].to_string();
1364        process_segments_with_state(
1365            &prefix,
1366            segments,
1367            self.dictionary,
1368            self.options,
1369            &mut self.fallback_state,
1370            output,
1371        );
1372        self.pending_text.replace_range(..flush_end, "");
1373    }
1374
    /// Converts all pending text and appends the result to `output`.
    ///
    /// Does nothing when the buffer is empty. Otherwise the unflushable-run
    /// marker is cleared, the buffer is emptied, and its former content is
    /// segmented and rendered with the current fallback state.
    fn flush_into(&mut self, output: &mut Vec<OutputToken<S>>) {
        if self.pending_text.is_empty() {
            return;
        }
        self.pending_unflushable_fallback_run_bytes = None;
        // Move the text out so the buffer is left empty for later chunks.
        let text = core::mem::take(&mut self.pending_text);
        process_text_with_state(
            &text,
            self.dictionary,
            self.options,
            &mut self.fallback_state,
            output,
        );
    }
1389
    /// Replaces the fallback state with its default value.
    ///
    /// Called at block boundaries and around preserved or verbatim content so
    /// that earlier text no longer influences later fallback readings.
    fn reset_fallback_context(&mut self) {
        self.fallback_state = FallbackState::default();
    }
1393}
1394
/// Finds the byte offset up to which hanja-free `text` can be flushed while
/// keeping at most `bound - 1` trailing characters buffered.
///
/// Only the part after the last whitespace character is ever held back. If
/// that part is no longer than `bound - 1` characters, the split is right
/// after the whitespace (`None` when there is no whitespace). Otherwise the
/// split leaves exactly `bound - 1` characters pending. Returns `None` for
/// empty text.
fn safe_non_hanja_flush_end(text: &str, bound: usize) -> Option<usize> {
    if text.is_empty() {
        return None;
    }

    let word_start = match text.char_indices().rev().find(|(_, ch)| ch.is_whitespace()) {
        Some((offset, ch)) => offset + ch.len_utf8(),
        None => 0,
    };
    let reserved = bound.saturating_sub(1);
    let tail = &text[word_start..];
    let tail_len = tail.chars().count();

    if tail_len <= reserved {
        return if word_start > 0 { Some(word_start) } else { None };
    }

    // Skip the characters that may be flushed; what follows stays pending.
    let cut = match tail.char_indices().nth(tail_len - reserved) {
        Some((offset, _)) => word_start + offset,
        None => text.len(),
    };
    if cut > 0 { Some(cut) } else { None }
}
1418
/// Returns the byte offset just past the last whitespace character in
/// `text`, or `None` when `text` contains no whitespace.
fn safe_unknown_bound_flush_end(text: &str) -> Option<usize> {
    let whitespace_at = text.rfind(char::is_whitespace)?;
    let whitespace = text[whitespace_at..].chars().next()?;
    Some(whitespace_at + whitespace.len_utf8())
}
1424
/// Returns the byte offset at which the last `count` characters of `text`
/// begin.
///
/// A `count` of zero yields `text.len()`; a `count` of at least the number
/// of characters in `text` yields `0`.
fn suffix_start_for_char_count(text: &str, count: usize) -> usize {
    // Walking backwards, the last of the first `count` items is the start of
    // the requested suffix. No item at all means the suffix is empty.
    text.char_indices()
        .rev()
        .take(count)
        .last()
        .map_or(text.len(), |(offset, _)| offset)
}
1435
1436fn trailing_fallback_run_start(segments: &[Segment], split_byte: usize) -> Option<usize> {
1437    if split_byte == 0 {
1438        return None;
1439    }
1440
1441    for (index, segment) in segments.iter().enumerate() {
1442        let (byte_start, byte_end) = segment_bounds(segment);
1443        if byte_end != split_byte {
1444            continue;
1445        }
1446        if !matches!(
1447            segment,
1448            Segment::Fallback { .. } | Segment::TrivialDictionary { .. }
1449        ) {
1450            return None;
1451        }
1452        if let Some(next) = segments.get(index + 1)
1453            && !matches!(
1454                next,
1455                Segment::Fallback { .. } | Segment::TrivialDictionary { .. }
1456            )
1457        {
1458            return None;
1459        }
1460
1461        let mut run_start = byte_start;
1462        for previous in segments[..index].iter().rev() {
1463            let (previous_start, previous_end) = segment_bounds(previous);
1464            if previous_end != run_start
1465                || !matches!(
1466                    previous,
1467                    Segment::Fallback { .. } | Segment::TrivialDictionary { .. }
1468                )
1469            {
1470                break;
1471            }
1472            run_start = previous_start;
1473        }
1474        return (run_start < split_byte).then_some(run_start);
1475    }
1476
1477    None
1478}
1479
1480fn process_text_with_state<S, D>(
1481    text: &str,
1482    dictionary: &D,
1483    options: EngineOptions,
1484    fallback_state: &mut FallbackState,
1485    output: &mut Vec<OutputToken<S>>,
1486) where
1487    D: HanjaDictionary + ?Sized,
1488{
1489    let segments = segment_text(text, dictionary, options);
1490    process_segments_with_state(text, &segments, dictionary, options, fallback_state, output);
1491}
1492
1493fn process_trivial_fallback_run<S>(
1494    run_segments: &[Segment],
1495    text: &str,
1496    options: EngineOptions,
1497    state: &mut FallbackState,
1498    output: &mut Vec<OutputToken<S>>,
1499) {
1500    let run_start = segment_bounds(&run_segments[0]).0;
1501    let run_end = segment_bounds(&run_segments[run_segments.len() - 1]).1;
1502    let capacity = run_end.saturating_sub(run_start);
1503    let mut hanja = String::with_capacity(capacity);
1504    let mut reading = String::with_capacity(capacity);
1505    let mut has_dictionary = false;
1506    let mut last_trivial_source: Option<char> = None;
1507    let mut last_trivial_reading: Option<String> = None;
1508
1509    let mut seg_index = 0;
1510    while seg_index < run_segments.len() {
1511        match &run_segments[seg_index] {
1512            Segment::TrivialDictionary {
1513                byte_start,
1514                byte_end,
1515                reading: dict_reading,
1516                suffix_reading,
1517                ..
1518            } => {
1519                let source = &text[*byte_start..*byte_end];
1520                let effective = dictionary_effective_reading(
1521                    source,
1522                    dict_reading,
1523                    suffix_reading.as_deref(),
1524                    options,
1525                    state.starts_word,
1526                    state.previous_reading,
1527                );
1528                if !hanja.is_empty()
1529                    && last_trivial_reading.as_deref() == Some(&effective)
1530                    && last_trivial_source != source.chars().next()
1531                {
1532                    output.push(OutputToken::Annotated(Annotation {
1533                        hanja: core::mem::take(&mut hanja),
1534                        reading: core::mem::take(&mut reading),
1535                        homophone: false,
1536                        require_hanja: false,
1537                        require_hangul: false,
1538                        first_in_context: true,
1539                        skip_annotation: false,
1540                        from_dictionary: has_dictionary,
1541                        from_source_gloss: false,
1542                    }));
1543                }
1544                hanja.push_str(source);
1545                reading.push_str(&effective);
1546                update_fallback_state_for_reading(&effective, state);
1547                has_dictionary = true;
1548                last_trivial_source = source.chars().next();
1549                last_trivial_reading = Some(effective);
1550                seg_index += 1;
1551            }
1552            Segment::Fallback { byte_start: _, .. } => {
1553                last_trivial_source = None;
1554                last_trivial_reading = None;
1555                let fb_start = seg_index;
1556                while seg_index < run_segments.len()
1557                    && matches!(&run_segments[seg_index], Segment::Fallback { .. })
1558                {
1559                    seg_index += 1;
1560                }
1561                let fb_text = &text[segment_bounds(&run_segments[fb_start]).0
1562                    ..segment_bounds(&run_segments[seg_index - 1]).1];
1563                for part in phoneticize_fallback_run_with_state(fb_text, options, state) {
1564                    match part {
1565                        FallbackPart::Annotation {
1566                            hanja: part_hanja,
1567                            reading: part_reading,
1568                        } => {
1569                            if part_hanja.chars().any(is_hanja_numeral) {
1570                                if !hanja.is_empty() {
1571                                    output.push(OutputToken::Annotated(Annotation {
1572                                        hanja: core::mem::take(&mut hanja),
1573                                        reading: core::mem::take(&mut reading),
1574                                        homophone: false,
1575                                        require_hanja: false,
1576                                        require_hangul: false,
1577                                        first_in_context: true,
1578                                        skip_annotation: false,
1579                                        from_dictionary: has_dictionary,
1580                                        from_source_gloss: false,
1581                                    }));
1582                                    has_dictionary = false;
1583                                }
1584                                output.push(OutputToken::Annotated(Annotation {
1585                                    hanja: part_hanja,
1586                                    reading: part_reading,
1587                                    homophone: false,
1588                                    require_hanja: false,
1589                                    require_hangul: false,
1590                                    first_in_context: true,
1591                                    skip_annotation: false,
1592                                    from_dictionary: false,
1593                                    from_source_gloss: false,
1594                                }));
1595                            } else {
1596                                hanja.push_str(&part_hanja);
1597                                reading.push_str(&part_reading);
1598                            }
1599                        }
1600                        FallbackPart::ReadingText(t) | FallbackPart::Text(t) => {
1601                            if !hanja.is_empty() {
1602                                output.push(OutputToken::Annotated(Annotation {
1603                                    hanja: core::mem::take(&mut hanja),
1604                                    reading: core::mem::take(&mut reading),
1605                                    homophone: false,
1606                                    require_hanja: false,
1607                                    require_hangul: false,
1608                                    first_in_context: true,
1609                                    skip_annotation: false,
1610                                    from_dictionary: has_dictionary,
1611                                    from_source_gloss: false,
1612                                }));
1613                                has_dictionary = false;
1614                            }
1615                            push_text(output, &t);
1616                        }
1617                    }
1618                }
1619            }
1620            _ => unreachable!("run must contain only TrivialDictionary | Fallback"),
1621        }
1622    }
1623
1624    if !hanja.is_empty() {
1625        output.push(OutputToken::Annotated(Annotation {
1626            hanja,
1627            reading,
1628            homophone: false,
1629            require_hanja: false,
1630            require_hangul: false,
1631            first_in_context: true,
1632            skip_annotation: false,
1633            from_dictionary: has_dictionary,
1634            from_source_gloss: false,
1635        }));
1636    }
1637}
1638
1639fn process_segments_with_state<S, D>(
1640    text: &str,
1641    segments: &[Segment],
1642    _dictionary: &D,
1643    options: EngineOptions,
1644    fallback_state: &mut FallbackState,
1645    output: &mut Vec<OutputToken<S>>,
1646) where
1647    D: HanjaDictionary + ?Sized,
1648{
1649    let mut index = 0;
1650
1651    while index < segments.len() {
1652        match &segments[index] {
1653            Segment::Dictionary {
1654                byte_start,
1655                byte_end,
1656                reading,
1657                suffix_reading,
1658                mark,
1659            } => {
1660                let source = &text[*byte_start..*byte_end];
1661                let effective = dictionary_effective_reading(
1662                    source,
1663                    reading,
1664                    suffix_reading.as_deref(),
1665                    options,
1666                    fallback_state.starts_word,
1667                    fallback_state.previous_reading,
1668                );
1669                output.push(OutputToken::Annotated(Annotation {
1670                    hanja: source.to_string(),
1671                    homophone: false,
1672                    reading: effective.clone(),
1673                    require_hanja: mark.require_hanja,
1674                    require_hangul: mark.require_hangul,
1675                    first_in_context: true,
1676                    skip_annotation: false,
1677                    from_dictionary: true,
1678                    from_source_gloss: false,
1679                }));
1680                if should_preserve_dictionary_context(source, &effective, options) {
1681                    update_fallback_state_for_reading(&effective, fallback_state);
1682                } else {
1683                    *fallback_state = FallbackState::default();
1684                }
1685                index += 1;
1686            }
1687            Segment::TrivialDictionary {
1688                byte_start,
1689                byte_end,
1690                ..
1691            }
1692            | Segment::Fallback {
1693                byte_start,
1694                byte_end,
1695            } => {
1696                let run_start = index;
1697                let mut merged_end = *byte_end;
1698                while let Some(
1699                    Segment::TrivialDictionary {
1700                        byte_end: next_end, ..
1701                    }
1702                    | Segment::Fallback {
1703                        byte_end: next_end, ..
1704                    },
1705                ) = segments.get(index + 1)
1706                {
1707                    merged_end = *next_end;
1708                    index += 1;
1709                }
1710                let has_dictionary = segments[run_start..=index]
1711                    .iter()
1712                    .any(|s| matches!(s, Segment::TrivialDictionary { .. }));
1713                if has_dictionary {
1714                    process_trivial_fallback_run(
1715                        &segments[run_start..=index],
1716                        text,
1717                        options,
1718                        fallback_state,
1719                        output,
1720                    );
1721                } else {
1722                    process_fallback_text(
1723                        &text[*byte_start..merged_end],
1724                        options,
1725                        fallback_state,
1726                        output,
1727                    );
1728                }
1729                index += 1;
1730            }
1731            Segment::NumeralText { text, .. } => {
1732                push_text(output, text);
1733                update_fallback_state_for_text(text, fallback_state);
1734                index += 1;
1735            }
1736            Segment::Text {
1737                byte_start,
1738                byte_end,
1739            } => {
1740                let text_segment = &text[*byte_start..*byte_end];
1741                push_text(output, text_segment);
1742                update_fallback_state_for_text(text_segment, fallback_state);
1743                index += 1;
1744            }
1745        }
1746    }
1747}
1748
1749fn segment_bounds(segment: &Segment) -> (usize, usize) {
1750    match segment {
1751        Segment::Dictionary {
1752            byte_start,
1753            byte_end,
1754            ..
1755        }
1756        | Segment::TrivialDictionary {
1757            byte_start,
1758            byte_end,
1759            ..
1760        }
1761        | Segment::Fallback {
1762            byte_start,
1763            byte_end,
1764        }
1765        | Segment::NumeralText {
1766            byte_start,
1767            byte_end,
1768            ..
1769        }
1770        | Segment::Text {
1771            byte_start,
1772            byte_end,
1773        } => (*byte_start, *byte_end),
1774    }
1775}
1776
1777fn process_fallback_text<S>(
1778    text: &str,
1779    options: EngineOptions,
1780    state: &mut FallbackState,
1781    output: &mut Vec<OutputToken<S>>,
1782) {
1783    for part in phoneticize_fallback_run_with_state(text, options, state) {
1784        match part {
1785            FallbackPart::Annotation { hanja, reading } => {
1786                output.push(OutputToken::Annotated(Annotation {
1787                    hanja,
1788                    reading,
1789                    homophone: false,
1790                    require_hanja: false,
1791                    require_hangul: false,
1792                    first_in_context: true,
1793                    skip_annotation: false,
1794                    from_dictionary: false,
1795                    from_source_gloss: false,
1796                }));
1797            }
1798            FallbackPart::ReadingText(text) => push_text(output, &text),
1799            FallbackPart::Text(text) => push_text(output, &text),
1800        }
1801    }
1802}
1803
1804fn update_fallback_state_for_text(text: &str, state: &mut FallbackState) {
1805    if text.is_empty() {
1806        return;
1807    }
1808
1809    if text
1810        .chars()
1811        .last()
1812        .is_some_and(|character| character.is_whitespace())
1813    {
1814        *state = FallbackState::default();
1815        return;
1816    }
1817
1818    let Some(last) = text.chars().rev().find(|ch| !ch.is_whitespace()) else {
1819        return;
1820    };
1821
1822    if last.is_alphanumeric() {
1823        state.starts_word = false;
1824        state.previous_reading = Some(last);
1825    } else {
1826        *state = FallbackState::default();
1827    }
1828}
1829
1830/// Chooses the reading a dictionary match should emit at its position.
1831///
1832/// South Korean initial sound law (頭音法則) makes some morphemes read
1833/// differently word-initially than elsewhere. The bundled dictionary stores the
1834/// word-initial form, so a bare match would render `1998年` as `1998연` instead
1835/// of `1998년`. This applies the position-correct reading:
1836///
1837///  -  When the match carries an explicit [`Match::suffix_reading`] (a
1838///     multi-syllable entry the Standard Korean Language Dictionary records with
1839///     a distinct suffix or bound-noun form, such as `年代`), that suffix
1840///     reading is used outside word-initial position.
1841///  -  Otherwise, for a single hanja whose bundled unihan reading undergoes
1842///     initial sound law, the original (non-word-initial) reading is recovered
1843///     from the unihan table. This covers every such hanja without per-entry
1844///     data. The match's reading must already be one of the two law variants so
1845///     unrelated readings (and non-law hanja) are left untouched. The
1846///     `렬`/`률` → `열`/`율` rule after a vowel or `ㄴ` coda is honored through
1847///     [`should_apply_yeol_yul`], matching fallback behavior.
1848///
1849/// With initial sound law disabled (for example the North Korean preset) the
1850/// original reading is used everywhere.
1851fn dictionary_effective_reading(
1852    source: &str,
1853    reading: &str,
1854    suffix_reading: Option<&str>,
1855    options: EngineOptions,
1856    starts_word: bool,
1857    previous_reading: Option<char>,
1858) -> String {
1859    if let Some(suffix) = suffix_reading {
1860        return if starts_word && options.initial_sound_law {
1861            reading.to_string()
1862        } else {
1863            suffix.to_string()
1864        };
1865    }
1866
1867    let mut chars = source.chars();
1868    if let (Some(ch), None) = (chars.next(), chars.next())
1869        && let Some(base) = phoneticize_hanja_char(ch)
1870    {
1871        let initial = apply_initial_sound_law_to_first_syllable(base);
1872        if initial != base && (reading == base || reading == initial) {
1873            let apply_law = options.initial_sound_law
1874                && (starts_word || should_apply_yeol_yul(previous_reading, base));
1875            return if apply_law { initial } else { base.to_string() };
1876        }
1877    }
1878
1879    reading.to_string()
1880}
1881
1882fn should_preserve_dictionary_context(source: &str, reading: &str, options: EngineOptions) -> bool {
1883    if reading.chars().all(char::is_whitespace) {
1884        return false;
1885    }
1886
1887    if source.chars().all(is_hanja) {
1888        match fallback_reading_for_run(source, options) {
1889            Some(fallback_reading) => {
1890                fallback_reading == reading || has_one_hangul_syllable_per_hanja(source, reading)
1891            }
1892            None => has_one_hangul_syllable_per_hanja(source, reading),
1893        }
1894    } else {
1895        true
1896    }
1897}
1898
1899fn has_one_hangul_syllable_per_hanja(source: &str, reading: &str) -> bool {
1900    let source_len = source.chars().count();
1901    let mut reading_len = 0;
1902
1903    for ch in reading.chars() {
1904        if !is_hangul_syllable(ch) {
1905            return false;
1906        }
1907        reading_len += 1;
1908    }
1909
1910    reading_len == source_len
1911}
1912
/// Returns whether `ch` lies in the precomposed Hangul Syllables range
/// (`U+AC00..=U+D7A3`).
fn is_hangul_syllable(ch: char) -> bool {
    matches!(ch, '\u{AC00}'..='\u{D7A3}')
}
1916
1917fn update_fallback_state_for_reading(reading: &str, state: &mut FallbackState) {
1918    let Some(last) = reading.chars().rev().find(|ch| !ch.is_whitespace()) else {
1919        *state = FallbackState::default();
1920        return;
1921    };
1922
1923    if last.is_alphanumeric() {
1924        state.starts_word = false;
1925        state.previous_reading = Some(last);
1926    } else {
1927        *state = FallbackState::default();
1928    }
1929}
1930
1931fn push_text<S>(output: &mut Vec<OutputToken<S>>, text: &str) {
1932    if text.is_empty() {
1933        return;
1934    }
1935
1936    match output.last_mut() {
1937        Some(OutputToken::Text(existing)) => existing.push_str(text),
1938        _ => output.push(OutputToken::Text(text.to_string())),
1939    }
1940}
1941
/// Returns whether `ch` is in a known CJK ideograph range.
///
/// The accepted ranges are the Kangxi Radicals block, the ideographic number
/// zero (`U+3007`), the CJK Unified Ideographs block and its extensions, and
/// the two CJK Compatibility Ideographs blocks.  The Ideographic Description
/// Characters block (`U+2FF0..=U+2FFF`), which directly follows the Kangxi
/// Radicals, is deliberately excluded: those code points are layout operators
/// such as `⿰`, not ideographs.
pub fn is_hanja(ch: char) -> bool {
    matches!(
        ch,
        // Kangxi Radicals; the block ends at U+2FDF.
        '\u{2F00}'..='\u{2FDF}'
            | '\u{3007}'
            | '\u{3400}'..='\u{4DBF}'
            | '\u{4E00}'..='\u{9FFF}'
            | '\u{F900}'..='\u{FAFF}'
            | '\u{20000}'..='\u{2A6DF}'
            | '\u{2A700}'..='\u{2B73F}'
            | '\u{2B740}'..='\u{2B81F}'
            | '\u{2B820}'..='\u{2CEAF}'
            | '\u{2CEB0}'..='\u{2EBEF}'
            | '\u{2EBF0}'..='\u{2EE5F}'
            | '\u{2F800}'..='\u{2FA1F}'
            | '\u{30000}'..='\u{3134F}'
            | '\u{31350}'..='\u{323AF}'
            | '\u{323B0}'..='\u{3347F}'
    )
}
1963
/// The concrete rendering mode for annotated hanja words.
///
/// A bare mode converts into [`RenderOptions`] through `From`, which fills in
/// the default [`OriginalGloss`].
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum RenderMode {
    /// Emits only hangul unless annotation flags require hanja disambiguation.
    ///
    /// This is the mode selected by `RenderOptions::default()`.
    HangulOnly,

    /// Always emits hangul followed by the original hanja in parentheses.
    HangulHanjaParens,

    /// Always emits original hanja followed by the hangul reading in
    /// parentheses.
    HanjaHangulParens,

    /// Emits a `<ruby>` element pairing hangul reading and source hanja.
    ///
    /// The [`RubyBase`] sub-mode chooses which side becomes the base text.
    /// When the active scope reports
    /// [`ScopeData::allows_inline_markup`] as `false`, the renderer falls back
    /// to parenthesized text so that adapters which cannot embed markup still
    /// receive a sensible surface form.
    Ruby(RubyBase),

    /// Emits original hanja, adding a hangul gloss only when requested.
    ///
    /// The form of that gloss is chosen by [`RenderOptions::original_gloss`].
    Original,
}
1989
/// Selects which side of a `<ruby>` element is the base text.
///
/// Carried as the payload of [`RenderMode::Ruby`].
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum RubyBase {
    /// `<ruby>hangul<rp>(</rp><rt>hanja</rt><rp>)</rp></ruby>`; hangul is the
    /// base, hanja is the gloss. The `<rp>` elements provide parenthesized
    /// fallback text for browsers without `<ruby>` support.
    OnHangul,

    /// `<ruby>hanja<rp>(</rp><rt>hangul</rt><rp>)</rp></ruby>`; hanja is the
    /// base, hangul is the gloss. The `<rp>` elements provide parenthesized
    /// fallback text for browsers without `<ruby>` support.
    OnHanja,
}
2003
/// Form for the gloss attached to annotations in [`RenderMode::Original`].
///
/// `Original` keeps the source hanja as primary text and only attaches a
/// hangul gloss when the annotation flags or a user directive demand one.
/// This option controls how that gloss appears. Because `Original` always
/// treats hanja as primary, the ruby form uses hanja as the base and hangul
/// as the `rt` gloss; there is no sub-mode to flip the sides.
///
/// The `Default` value is [`OriginalGloss::Parens`].
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum OriginalGloss {
    /// `hanja(hangul)`; matches the legacy behavior and is the default.
    #[default]
    Parens,

    /// A `<ruby>` element with hanja as the base and hangul as the `rt`
    /// gloss, falling back to parens when the active scope disallows inline
    /// markup.
    Ruby,
}
2022
/// Rendering options that combine a [`RenderMode`] with per-mode sub-options.
///
/// Most pipelines configure rendering by mode alone, so `RenderOptions`
/// implements `From<RenderMode>` and `Default` to keep existing call sites
/// terse. Pipelines that need finer control (such as a ruby gloss in
/// [`RenderMode::Original`]) construct a `RenderOptions` value directly.
///
/// The `Default` value pairs [`RenderMode::HangulOnly`] with
/// [`OriginalGloss::Parens`]; converting from a [`RenderMode`] keeps the given
/// mode and uses the default gloss.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct RenderOptions {
    /// Top-level rendering mode applied to every annotation.
    pub mode: RenderMode,

    /// Gloss form used by [`RenderMode::Original`]. Ignored by other modes.
    pub original_gloss: OriginalGloss,
}
2037
2038impl Default for RenderOptions {
2039    fn default() -> Self {
2040        Self {
2041            mode: RenderMode::HangulOnly,
2042            original_gloss: OriginalGloss::Parens,
2043        }
2044    }
2045}
2046
2047impl From<RenderMode> for RenderOptions {
2048    fn from(mode: RenderMode) -> Self {
2049        Self {
2050            mode,
2051            original_gloss: OriginalGloss::default(),
2052        }
2053    }
2054}
2055
/// The context boundary used by stateful annotation middlewares.
///
/// `PerBlock` resets when a scope reports [`ScopeData::is_block_boundary`].
/// `PerSection` resets when a later scope reports
/// [`ScopeData::is_section_boundary`].  Plain-text streams have no block or
/// section scopes, so those windows behave like one document context.  This is
/// required for exact homophone rendering because a later plain-text line can
/// make an earlier annotation ambiguous after it would otherwise have been
/// written.
///
/// The window is chosen by the caller of [`mark_homophones`],
/// [`filter_first_occurrences`], [`HomophoneMarker`], and
/// [`FirstOccurrenceFilter`].
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ContextWindow {
    /// Disable the middleware and leave tokens unchanged.
    Off,

    /// Reset state at format-adapter block boundaries.
    PerBlock,

    /// Reset state at format-adapter section boundaries.
    PerSection,

    /// Use the entire token stream as one context.
    PerDocument,
}
2079
/// How homophone disambiguation decides that an annotation needs its hanja
/// shown in [`RenderMode::HangulOnly`].
///
/// The two strategies differ in what counts as a homophone collision:
///
/// `ContextLocal` (the default) marks an annotation only when another reading
/// with a *different* hanja form actually appears within the same context
/// window.  This keeps hangul-only output clean: a Sino-Korean word is glossed
/// only when the surrounding text genuinely makes it ambiguous.
///
/// `DictionaryWide` additionally marks an annotation whenever its reading is
/// shared by any other hanja form anywhere in the dictionary, regardless of
/// whether those alternatives occur in the text.  With a large reference
/// dictionary such as the Standard Korean Dictionary almost every common
/// reading has some homophone, so this strategy glosses most Sino-Korean
/// words.  It is preserved as an opt-in for callers that want maximal
/// disambiguation; words that should always be glossed regardless of context
/// are better expressed through [`MatchMark::require_hanja`].
///
/// The strategy is passed explicitly to [`mark_homophones_with_detection`]
/// and [`HomophoneMarker::with_detection`]; [`mark_homophones`] and
/// [`HomophoneMarker::new`] always use `ContextLocal`.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum HomophoneDetection {
    /// Mark only readings that collide within the active context window.
    #[default]
    ContextLocal,

    /// Also mark readings shared by other hanja forms anywhere in the
    /// dictionary.
    DictionaryWide,
}
2108
/// Action applied when a user directive predicate matches an annotation.
///
/// Each action corresponds to one boolean flag on [`Annotation`] that
/// [`UserDirectives::apply`] sets to `true`; no action ever clears a flag.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum DirectiveAction {
    /// Require rendered output to keep the original hanja visible.
    ///
    /// Sets [`Annotation::require_hanja`].
    RequireHanja,

    /// Require rendered output to include a hangul gloss.
    ///
    /// Sets [`Annotation::require_hangul`].
    RequireHangul,

    /// Collapse the annotation to plain primary text for the active renderer.
    ///
    /// Sets [`Annotation::skip_annotation`].
    SkipAnnotation,
}
2121
/// User rules that adjust annotation presentation policy.
///
/// Literal helpers cover common hanja-form rules.  Callers that need richer
/// matching can add closure predicates over the whole [`Annotation`], which
/// keeps the core API independent of CLI-only pattern syntaxes.
///
/// The lifetime `'a` bounds the closures added through
/// [`UserDirectives::add_predicate`].
#[derive(Default)]
pub struct UserDirectives<'a> {
    // Rules in the order they were added; `apply` evaluates them in this order.
    rules: Vec<UserDirectiveRule<'a>>,
}
2131
2132impl<'a> UserDirectives<'a> {
2133    /// Creates an empty directive set.
2134    pub fn new() -> Self {
2135        Self::default()
2136    }
2137
2138    /// Marks a literal hanja form as requiring visible hanja in output.
2139    pub fn require_hanja(&mut self, hanja: impl Into<String>) {
2140        self.add_literal(hanja, DirectiveAction::RequireHanja);
2141    }
2142
2143    /// Marks a literal hanja form as requiring a visible hangul gloss.
2144    pub fn require_hangul(&mut self, hanja: impl Into<String>) {
2145        self.add_literal(hanja, DirectiveAction::RequireHangul);
2146    }
2147
2148    /// Marks a literal hanja form as not receiving annotation rendering.
2149    pub fn skip_annotation(&mut self, hanja: impl Into<String>) {
2150        self.add_literal(hanja, DirectiveAction::SkipAnnotation);
2151    }
2152
2153    /// Adds a literal hanja-form directive.
2154    pub fn add_literal(&mut self, hanja: impl Into<String>, action: DirectiveAction) {
2155        self.rules.push(UserDirectiveRule {
2156            predicate: UserDirectivePredicate::Literal(hanja.into()),
2157            action,
2158        });
2159    }
2160
2161    /// Adds a predicate directive over the complete annotation metadata.
2162    pub fn add_predicate(
2163        &mut self,
2164        predicate: impl Fn(&Annotation) -> bool + 'a,
2165        action: DirectiveAction,
2166    ) {
2167        self.rules.push(UserDirectiveRule {
2168            predicate: UserDirectivePredicate::Predicate(Box::new(predicate)),
2169            action,
2170        });
2171    }
2172
2173    /// Returns whether no directive rules are configured.
2174    pub fn is_empty(&self) -> bool {
2175        self.rules.is_empty()
2176    }
2177
2178    /// Applies every configured directive to a single output token.
2179    ///
2180    /// Non-[`OutputToken::Annotated`] tokens pass through unchanged. For an
2181    /// annotation, each matching rule sets the corresponding flag in priority
2182    /// of declaration order. This method is the per-token primitive used by
2183    /// streaming pipelines that want to apply directives without buffering.
2184    pub fn apply<S>(&self, token: OutputToken<S>) -> OutputToken<S> {
2185        match token {
2186            OutputToken::Annotated(mut annotation) => {
2187                for rule in &self.rules {
2188                    if !rule.predicate.matches(&annotation) {
2189                        continue;
2190                    }
2191                    match rule.action {
2192                        DirectiveAction::RequireHanja => annotation.require_hanja = true,
2193                        DirectiveAction::RequireHangul => annotation.require_hangul = true,
2194                        DirectiveAction::SkipAnnotation => annotation.skip_annotation = true,
2195                    }
2196                }
2197                OutputToken::Annotated(annotation)
2198            }
2199            token => token,
2200        }
2201    }
2202}
2203
/// One user directive: a predicate paired with the action to apply when the
/// predicate matches an annotation.
struct UserDirectiveRule<'a> {
    // Decides whether the rule applies to a given annotation.
    predicate: UserDirectivePredicate<'a>,
    // Flag to set on annotations the predicate accepts.
    action: DirectiveAction,
}
2208
/// Matching strategy of a [`UserDirectiveRule`].
enum UserDirectivePredicate<'a> {
    /// Matches annotations whose hanja form equals the stored string.
    Literal(String),
    /// Matches annotations accepted by a caller-supplied closure.
    Predicate(Box<dyn Fn(&Annotation) -> bool + 'a>),
}
2213
2214impl UserDirectivePredicate<'_> {
2215    fn matches(&self, annotation: &Annotation) -> bool {
2216        match self {
2217            Self::Literal(hanja) => annotation.hanja == *hanja,
2218            Self::Predicate(predicate) => predicate(annotation),
2219        }
2220    }
2221}
2222
/// Sets `homophone` on dictionary annotations sharing a reading.
///
/// Uses [`HomophoneDetection::ContextLocal`], marking only readings that
/// collide within the active context window.  Use
/// [`mark_homophones_with_detection`] to opt into dictionary-wide marking.
///
/// This is a thin wrapper: `tokens`, `dictionary`, and `window` are forwarded
/// unchanged to [`mark_homophones_with_detection`], and its resulting token
/// vector is returned as is.
pub fn mark_homophones<S, D>(
    tokens: impl IntoIterator<Item = OutputToken<S>>,
    dictionary: &D,
    window: ContextWindow,
) -> Vec<OutputToken<S>>
where
    S: ScopeData,
    D: HanjaDictionary + ?Sized,
{
    mark_homophones_with_detection(tokens, dictionary, window, HomophoneDetection::ContextLocal)
}
2239
2240/// Sets `homophone` on dictionary annotations sharing a reading, choosing the
2241/// detection strategy explicitly.
2242///
2243/// With [`HomophoneDetection::ContextLocal`] an annotation is marked only when
2244/// another hanja form with the same reading occurs within the context window,
2245/// so no dictionary index is built.  With
2246/// [`HomophoneDetection::DictionaryWide`] the marker also builds one homophone
2247/// index from the supplied dictionary and falls back to
2248/// [`HanjaDictionary::has_homophone`] for lookup-only dictionaries.  Fallback
2249/// (non-dictionary) annotations are ignored either way because they are
2250/// phonetic fragments rather than known lexical homophones.
2251pub fn mark_homophones_with_detection<S, D>(
2252    tokens: impl IntoIterator<Item = OutputToken<S>>,
2253    dictionary: &D,
2254    window: ContextWindow,
2255    detection: HomophoneDetection,
2256) -> Vec<OutputToken<S>>
2257where
2258    S: ScopeData,
2259    D: HanjaDictionary + ?Sized,
2260{
2261    if window == ContextWindow::Off {
2262        return tokens.into_iter().collect();
2263    }
2264
2265    let index = match detection {
2266        HomophoneDetection::ContextLocal => None,
2267        HomophoneDetection::DictionaryWide => HomophoneIndex::from_dictionary(dictionary),
2268    };
2269    let lookup_fallback = match detection {
2270        HomophoneDetection::ContextLocal => None,
2271        HomophoneDetection::DictionaryWide => index.is_none().then_some(dictionary),
2272    };
2273    ContextMiddleware::new(window, |tokens| {
2274        mark_homophones_in_context(tokens, index.as_ref(), lookup_fallback);
2275    })
2276    .process(tokens)
2277}
2278
/// Clears repeat gloss requirements after the first occurrence of each hanja.
///
/// The first occurrence key is the original hanja form. Later annotations for
/// the same form have `first_in_context` set to false and no longer require
/// either side to be shown.
///
/// `window` is the [`ContextWindow`] handed to the underlying context
/// middleware, which applies `filter_first_occurrences_in_context` and
/// returns the processed tokens.
pub fn filter_first_occurrences<S>(
    tokens: impl IntoIterator<Item = OutputToken<S>>,
    window: ContextWindow,
) -> Vec<OutputToken<S>>
where
    S: ScopeData,
{
    ContextMiddleware::new(window, filter_first_occurrences_in_context).process(tokens)
}
2293
/// Plain function pointer applied to a slice of output tokens; the apply type
/// used by [`FirstOccurrenceFilter`].
type ContextApply<S> = fn(&mut [OutputToken<S>]);
/// Boxed closure applied to a slice of output tokens; the apply type used by
/// [`HomophoneMarker`], whose closure captures the homophone index and the
/// dictionary lookup fallback.
type HomophoneApply<'a, S> = Box<dyn FnMut(&mut [OutputToken<S>]) + 'a>;
2296
/// Streaming homophone marker middleware.
///
/// Context windows that require lookahead buffer only until their configured
/// boundary. `PerDocument`, and scoped windows on streams that never emit the
/// corresponding boundary, buffer until [`HomophoneMarker::finish`].  For
/// example, exact plain-text homophone marking with `PerBlock` is document-wide
/// because plain text has no block scopes.
///
/// The lifetime `'a` is that of the dictionary reference passed to
/// [`HomophoneMarker::new`] or [`HomophoneMarker::with_detection`].
pub struct HomophoneMarker<'a, S>
where
    S: ScopeData,
{
    // Context middleware driving the boxed homophone-marking closure.
    inner: ContextMiddleware<S, HomophoneApply<'a, S>>,
}
2310
2311impl<'a, S> HomophoneMarker<'a, S>
2312where
2313    S: ScopeData,
2314{
2315    /// Creates a homophone marker for the selected context window using
2316    /// [`HomophoneDetection::ContextLocal`].
2317    ///
2318    /// Use [`HomophoneMarker::with_detection`] to opt into dictionary-wide
2319    /// marking.
2320    pub fn new<D>(dictionary: &'a D, window: ContextWindow) -> Self
2321    where
2322        D: HanjaDictionary + ?Sized,
2323    {
2324        Self::with_detection(dictionary, window, HomophoneDetection::ContextLocal)
2325    }
2326
2327    /// Creates a homophone marker for the selected context window and detection
2328    /// strategy.
2329    ///
2330    /// With [`HomophoneDetection::ContextLocal`] no dictionary index is built;
2331    /// only readings that collide within the context window are marked.  With
2332    /// [`HomophoneDetection::DictionaryWide`] a homophone index (or
2333    /// [`HanjaDictionary::has_homophone`] fallback) is consulted as well.
2334    pub fn with_detection<D>(
2335        dictionary: &'a D,
2336        window: ContextWindow,
2337        detection: HomophoneDetection,
2338    ) -> Self
2339    where
2340        D: HanjaDictionary + ?Sized,
2341    {
2342        let index = match detection {
2343            _ if window == ContextWindow::Off => None,
2344            HomophoneDetection::ContextLocal => None,
2345            HomophoneDetection::DictionaryWide => HomophoneIndex::from_dictionary(dictionary),
2346        };
2347        let lookup_fallback = match detection {
2348            HomophoneDetection::ContextLocal => None,
2349            HomophoneDetection::DictionaryWide => index.is_none().then_some(dictionary),
2350        };
2351        Self {
2352            inner: ContextMiddleware::new(
2353                window,
2354                Box::new(move |tokens| {
2355                    mark_homophones_in_context(tokens, index.as_ref(), lookup_fallback);
2356                }),
2357            ),
2358        }
2359    }
2360
2361    /// Pushes one output token and returns tokens ready for downstream stages.
2362    pub fn push_token(&mut self, token: OutputToken<S>) -> Vec<OutputToken<S>> {
2363        self.inner.push_token(token)
2364    }
2365
2366    /// Finishes the middleware and returns buffered tokens.
2367    pub fn finish(self) -> Vec<OutputToken<S>> {
2368        self.inner.finish()
2369    }
2370}
2371
/// Streaming first-occurrence middleware.
///
/// Repeated annotations inside a context have `first_in_context` cleared and
/// presentation requirements removed once the context is flushed.
///
/// This is the streaming counterpart of [`filter_first_occurrences`]; both
/// apply `filter_first_occurrences_in_context` through a context middleware.
pub struct FirstOccurrenceFilter<S>
where
    S: ScopeData,
{
    // Context middleware driving `filter_first_occurrences_in_context`.
    inner: ContextMiddleware<S, ContextApply<S>>,
}
2382
2383impl<S> FirstOccurrenceFilter<S>
2384where
2385    S: ScopeData,
2386{
2387    /// Creates a first-occurrence filter for the selected context window.
2388    pub fn new(window: ContextWindow) -> Self {
2389        Self {
2390            inner: ContextMiddleware::new(window, filter_first_occurrences_in_context::<S>),
2391        }
2392    }
2393
2394    /// Pushes one output token and returns tokens ready for downstream stages.
2395    pub fn push_token(&mut self, token: OutputToken<S>) -> Vec<OutputToken<S>> {
2396        self.inner.push_token(token)
2397    }
2398
2399    /// Finishes the middleware and returns buffered tokens.
2400    pub fn finish(self) -> Vec<OutputToken<S>> {
2401        self.inner.finish()
2402    }
2403}
2404
/// Streaming middleware that collapses an explicit parenthetical reading
/// annotation into the converted hanja word it duplicates.
///
/// Mixed-script input sometimes spells a word together with a parenthetical
/// gloss, either hanja-first (`庫間(곳간)`) or hangul-first (`곳간(庫間)`).  Left
/// alone, the converter would render the hanja *and* keep the parenthetical,
/// producing a redundant `곳간(곳간)`.  An author who wrote such a gloss meant
/// "annotate this word fully", so this middleware detects the two patterns,
/// removes the now-redundant parenthetical text, and sets both
/// [`Annotation::require_hanja`] and [`Annotation::require_hangul`] on the
/// surviving annotation.  Setting both flags reproduces the author's intent in
/// every render mode: [`RenderMode::HangulOnly`] honours `require_hanja`
/// (`곳간(庫間)`) while [`RenderMode::Original`] honours `require_hangul`
/// (`庫間(곳간)`).  The surviving annotation is additionally marked
/// `from_source_gloss`.
///
/// A parenthetical may also *pin an alternative reading*.  `數字` is normally
/// read `숫자`, but in the sense "a few characters" it reads `수자`; writing
/// `數字(수자)` fixes the reading for that occurrence.  Such a reading
/// annotation is told apart from a definition gloss like
/// `庫間(물건을 간직하여 두는 곳)` with a two-tier test against the candidate
/// hangul `R`:
///
/// 1. **Exact match** — `R` equals the annotation's reading.  Collapse and keep
///    the reading.
/// 2. **Valid alternative reading** — `R` has exactly one hangul syllable per
///    hanja character and every syllable is a recorded Unihan reading of its
///    character (or the initial-sound-law variant of one).  Collapse and
///    override the reading with `R`.
///
/// Anything else (definition glosses, foreign transliterations such as
/// `蔣介石(장제스)`, or a syllable-count mismatch) is left untouched.
///
/// The middleware runs immediately after the engine, before
/// [`HomophoneMarker`] and [`FirstOccurrenceFilter`], so later stages observe
/// the corrected reading and flags.  It coalesces adjacent
/// [`OutputToken::Text`] tokens (the streaming engine flushes non-hanja text at
/// safe points, so `(곳간)` can arrive split as `(곳간` then `)`) and buffers
/// only a bounded amount: a held annotation, the trailing matchable suffix of
/// the preceding text, and the following parenthetical until it can be
/// classified.  This keeps the streaming result identical to a one-shot
/// conversion while staying responsive on long hanja-free runs.
/// [`OutputToken::Open`], [`OutputToken::Close`], and [`OutputToken::Verbatim`]
/// flush the buffer and pass through, so a match never crosses a scope
/// boundary.  When `enabled` is `false` the middleware is an exact
/// pass-through.
///
/// # Limitation
///
/// The collapser runs after the engine and never re-derives readings, so a
/// hanja-first gloss immediately followed (with no space) by an initial-sound-law
/// (頭音法則) character keeps the reading the engine chose with the parenthetical
/// acting as a word boundary.  For example `學(학)率` collapses to `학(學)율`
/// rather than `학률`: the engine read `率` as word-initial `율` because `)`
/// separated it from `學`, and removing the gloss cannot recover the
/// non-word-initial `률`.  This is narrow in practice; an intended compound is
/// normally written `學率(학률)`.  Insert a space (`學(학) 率`) or gloss the whole
/// compound to control the reading.
pub struct RedundantParenCollapser<S>
where
    S: ScopeData,
{
    /// Master switch.  When `false`, `push_token` hands each token straight
    /// back and `finish` returns nothing, so none of the buffers below are
    /// ever touched.
    enabled: bool,
    /// Coalesced trailing text held while no annotation is pending: a bounded
    /// suffix (`[hangul]*` optionally ending in `(`) that could still become a
    /// hangul-first match's preceding text once the next annotation arrives.
    /// Everything before that suffix is emitted eagerly so streaming stays
    /// responsive even for long hanja-free runs.
    held_tail: String,
    /// A held annotation whose following text is still being accumulated.
    pending_annotation: Option<Annotation>,
    /// The text immediately preceding [`Self::pending_annotation`].
    preceding: String,
    /// Coalesced text following [`Self::pending_annotation`], accumulated until
    /// the parenthetical can be classified.
    following: String,
    /// Ties the collapser to the scope type `S` without storing a value of it.
    _scope: PhantomData<fn(S)>,
}
2482
2483impl<S> RedundantParenCollapser<S>
2484where
2485    S: ScopeData,
2486{
2487    /// Creates a collapser.  When `enabled` is `false` every token passes
2488    /// through unchanged.
2489    pub fn new(enabled: bool) -> Self {
2490        Self {
2491            enabled,
2492            held_tail: String::new(),
2493            pending_annotation: None,
2494            preceding: String::new(),
2495            following: String::new(),
2496            _scope: PhantomData,
2497        }
2498    }
2499
2500    /// Pushes one output token and returns tokens ready for downstream stages.
2501    pub fn push_token(&mut self, token: OutputToken<S>) -> Vec<OutputToken<S>> {
2502        if !self.enabled {
2503            return Vec::from([token]);
2504        }
2505        let mut output = Vec::new();
2506        match token {
2507            OutputToken::Annotated(annotation) => {
2508                // End any in-progress following text run by forcing a decision,
2509                // then the held tail becomes this annotation's preceding text.
2510                self.finalize_pending(&mut output);
2511                self.preceding = core::mem::take(&mut self.held_tail);
2512                self.pending_annotation = Some(annotation);
2513            }
2514            OutputToken::Text(text) => {
2515                if self.pending_annotation.is_some() {
2516                    self.following.push_str(&text);
2517                    self.resolve_following(&mut output);
2518                } else {
2519                    self.held_tail.push_str(&text);
2520                    self.emit_held_prefix(&mut output);
2521                }
2522            }
2523            boundary => {
2524                // Open / Close / Verbatim: a match may not cross this boundary,
2525                // so finalize everything before passing the boundary through.
2526                self.finalize_pending(&mut output);
2527                if !self.held_tail.is_empty() {
2528                    output.push(OutputToken::Text(core::mem::take(&mut self.held_tail)));
2529                }
2530                output.push(boundary);
2531            }
2532        }
2533        output
2534    }
2535
2536    /// Flushes buffered tokens and returns them.
2537    pub fn finish(mut self) -> Vec<OutputToken<S>> {
2538        if !self.enabled {
2539            return Vec::new();
2540        }
2541        let mut output = Vec::new();
2542        self.finalize_pending(&mut output);
2543        if !self.held_tail.is_empty() {
2544            output.push(OutputToken::Text(core::mem::take(&mut self.held_tail)));
2545        }
2546        output
2547    }
2548
2549    /// Emits the part of [`Self::held_tail`] that can no longer participate in a
2550    /// hangul-first match, keeping the bounded matchable suffix.
2551    fn emit_held_prefix(&mut self, output: &mut Vec<OutputToken<S>>) {
2552        let split = hangul_first_tail_start(&self.held_tail);
2553        if split > 0 {
2554            // Keep the (possibly long) prefix in the existing buffer and split
2555            // off only the bounded suffix, avoiding a large copy and shift.
2556            let suffix = self.held_tail.split_off(split);
2557            let prefix = core::mem::replace(&mut self.held_tail, suffix);
2558            output.push(OutputToken::Text(prefix));
2559        }
2560    }
2561
2562    /// Forces a pending annotation to resolve as if no further following text
2563    /// will arrive (called at a boundary, a new annotation, or EOF).
2564    fn finalize_pending(&mut self, output: &mut Vec<OutputToken<S>>) {
2565        if self.pending_annotation.is_some() {
2566            self.decide_following(true, output);
2567        }
2568    }
2569
2570    /// Resolves a pending annotation against the accumulated following text,
2571    /// buffering more text when the parenthetical is still incomplete.
2572    fn resolve_following(&mut self, output: &mut Vec<OutputToken<S>>) {
2573        self.decide_following(false, output);
2574    }
2575
2576    /// Classifies the pending annotation against `preceding` / `following`.
2577    ///
2578    /// With `flush` set, an otherwise-undecidable case is treated as a
2579    /// non-match instead of requesting more text.
2580    fn decide_following(&mut self, flush: bool, output: &mut Vec<OutputToken<S>>) {
2581        let annotation = self
2582            .pending_annotation
2583            .as_ref()
2584            .expect("decide_following called with a pending annotation");
2585        match classify_following(&self.preceding, annotation, &self.following, flush) {
2586            FollowingMatch::NeedMore => return,
2587            FollowingMatch::NoMatch => {
2588                if !self.preceding.is_empty() {
2589                    output.push(OutputToken::Text(core::mem::take(&mut self.preceding)));
2590                }
2591                output.push(OutputToken::Annotated(
2592                    self.pending_annotation.take().expect("pending annotation"),
2593                ));
2594                // The following text run continues as ordinary trailing text.
2595                self.held_tail = core::mem::take(&mut self.following);
2596            }
2597            FollowingMatch::HanjaFirst {
2598                collapsed,
2599                leftover,
2600            } => {
2601                // The preceding text is unrelated; emit it verbatim.
2602                if !self.preceding.is_empty() {
2603                    output.push(OutputToken::Text(core::mem::take(&mut self.preceding)));
2604                }
2605                output.push(OutputToken::Annotated(collapsed));
2606                self.pending_annotation = None;
2607                self.held_tail = leftover;
2608                self.following.clear();
2609            }
2610            FollowingMatch::HangulFirst {
2611                remaining_preceding,
2612                collapsed,
2613                leftover,
2614            } => {
2615                if !remaining_preceding.is_empty() {
2616                    output.push(OutputToken::Text(remaining_preceding));
2617                }
2618                output.push(OutputToken::Annotated(collapsed));
2619                self.pending_annotation = None;
2620                self.preceding.clear();
2621                self.held_tail = leftover;
2622                self.following.clear();
2623            }
2624        }
2625        self.emit_held_prefix(output);
2626    }
2627}
2628
2629/// Upper bound on how many trailing hangul syllables are held as a hangul-first
2630/// reading candidate.  A Sino-Korean reading written before `(` is at most a
2631/// handful of syllables; this generous cap keeps `held_tail` bounded even for a
2632/// pathological space-free hangul run (the only cost of exceeding it is that an
2633/// implausibly long reading is not collapsed).
2634const MAX_PRECEDING_READING_CHARS: usize = 64;
2635
2636/// Byte index where the matchable suffix of a held text run begins: up to
2637/// [`MAX_PRECEDING_READING_CHARS`] trailing hangul syllables plus an optional
2638/// final `(`.  Everything before this index can be emitted because it can no
2639/// longer be the preceding text of a hangul-first match.
2640fn hangul_first_tail_start(text: &str) -> usize {
2641    let mut start = text.len();
2642    let mut chars = text.char_indices().rev().peekable();
2643    if let Some(&(index, '(')) = chars.peek() {
2644        start = index;
2645        chars.next();
2646    }
2647    let mut held = 0;
2648    while held < MAX_PRECEDING_READING_CHARS {
2649        match chars.peek() {
2650            Some(&(index, ch)) if is_hangul_syllable(ch) => {
2651                start = index;
2652                held += 1;
2653                chars.next();
2654            }
2655            _ => break,
2656        }
2657    }
2658    start
2659}
2660
2661/// Buffered counterpart to [`RedundantParenCollapser`] for non-streaming
2662/// callers, mirroring [`mark_homophones_with_detection`] and
2663/// [`filter_first_occurrences`].
2664pub fn collapse_redundant_parens<S>(
2665    tokens: impl IntoIterator<Item = OutputToken<S>>,
2666    enabled: bool,
2667) -> Vec<OutputToken<S>>
2668where
2669    S: ScopeData,
2670{
2671    if !enabled {
2672        return tokens.into_iter().collect();
2673    }
2674    let mut collapser = RedundantParenCollapser::new(true);
2675    let mut output = Vec::new();
2676    for token in tokens {
2677        output.extend(collapser.push_token(token));
2678    }
2679    output.extend(collapser.finish());
2680    output
2681}
2682
/// Classification of a parenthetical hangul string against an annotation.
///
/// Produced by `classify_reading` and consumed by `collapse_annotation`.  A
/// parenthetical that is not a reading at all has no variant here; callers
/// represent that case with `None`.
enum ReadingMatch {
    /// The parenthetical equals the annotation's reading; keep the reading.
    Keep,
    /// The parenthetical is a valid alternative reading; override with it.
    /// The payload is the replacement reading text.
    Override(String),
}
2690
2691/// Classifies the parenthetical hangul `candidate` against an annotation's
2692/// `hanja`/`reading`, returning `None` when it is neither the reading nor a
2693/// valid alternative reading (so the tokens are left untouched).
2694fn classify_reading(hanja: &str, reading: &str, candidate: &str) -> Option<ReadingMatch> {
2695    if candidate == reading {
2696        Some(ReadingMatch::Keep)
2697    } else if is_valid_alternative_reading(hanja, candidate) {
2698        Some(ReadingMatch::Override(candidate.to_string()))
2699    } else {
2700        None
2701    }
2702}
2703
2704/// Returns whether `candidate` is a valid Sino-Korean reading of `hanja`: one
2705/// hangul syllable per hanja character, each a recorded Unihan reading of its
2706/// character or the initial-sound-law variant of one.
2707fn is_valid_alternative_reading(hanja: &str, candidate: &str) -> bool {
2708    let mut hanja_chars = hanja.chars();
2709    let mut candidate_chars = candidate.chars();
2710    let mut matched_any = false;
2711    loop {
2712        match (hanja_chars.next(), candidate_chars.next()) {
2713            (Some(hanja_char), Some(syllable)) => {
2714                if !is_valid_char_reading(hanja_char, syllable) {
2715                    return false;
2716                }
2717                matched_any = true;
2718            }
2719            (None, None) => return matched_any,
2720            // Differing lengths: not a one-syllable-per-character reading.
2721            _ => return false,
2722        }
2723    }
2724}
2725
2726/// Returns whether `syllable` is a valid reading of the source character
2727/// `source`: a recorded Unihan reading (or its initial-sound-law 頭音法則
2728/// variant) when `source` is a hanja character, or the same syllable verbatim
2729/// when `source` is itself hangul (as in a mixed-script entry such as `色깔論`).
2730fn is_valid_char_reading(source: char, syllable: char) -> bool {
2731    if !is_hangul_syllable(syllable) {
2732        return false;
2733    }
2734    let readings = khangul_all_readings(source);
2735    if readings.is_empty() {
2736        // No recorded Sino-Korean reading: the source is the hangul portion of
2737        // a mixed-script entry (or otherwise non-hanja), so it must appear
2738        // verbatim in the candidate reading.
2739        return source == syllable;
2740    }
2741    readings.iter().any(|reading| {
2742        reading_is_syllable(reading, syllable)
2743            || reading_matches_with_initial_sound_law(reading, syllable)
2744    })
2745}
2746
/// Returns `true` when `reading` consists of exactly the one character
/// `syllable` and nothing else.
fn reading_is_syllable(reading: &str, syllable: char) -> bool {
    // A string equals a character's UTF-8 encoding only if it is that single
    // character.
    let mut utf8 = [0u8; 4];
    let encoded: &str = syllable.encode_utf8(&mut utf8);
    reading == encoded
}
2752
2753/// Builds the collapsed annotation: both presentation flags set, with the
2754/// reading overridden when the parenthetical pinned an alternative one.
2755fn collapse_annotation(mut annotation: Annotation, reading_match: ReadingMatch) -> Annotation {
2756    if let ReadingMatch::Override(reading) = reading_match {
2757        annotation.reading = reading;
2758    }
2759    annotation.require_hanja = true;
2760    annotation.require_hangul = true;
2761    annotation.from_source_gloss = true;
2762    annotation
2763}
2764
/// Outcome of classifying a pending annotation against the text that follows
/// it (and, for the hangul-first pattern, the text that precedes it).
///
/// Returned by `classify_following`; all payloads are owned so the caller can
/// update its buffers freely once the verdict is known.
enum FollowingMatch {
    /// The following text is an incomplete parenthetical; buffer more.
    NeedMore,
    /// Neither pattern applies; emit the tokens unchanged.
    NoMatch,
    /// Hanja-first `Annotated` + `(R)`: collapse, keeping the text after `)`.
    HanjaFirst {
        /// The annotation with both presentation flags set (and the reading
        /// overridden when the parenthetical pinned an alternative one).
        collapsed: Annotation,
        /// Following text that comes after the closing `)`.
        leftover: String,
    },
    /// Hangul-first `R(` + `Annotated` + `)`: collapse, keeping the preceding
    /// text before `R(` and the following text after `)`.
    HangulFirst {
        /// Preceding text with the trailing `R(` stripped.
        remaining_preceding: String,
        /// The annotation with both presentation flags set (and the reading
        /// overridden when `R` pinned an alternative one).
        collapsed: Annotation,
        /// Following text that comes after the closing `)`.
        leftover: String,
    },
}
2785
2786/// Classifies a pending annotation against the accumulated `preceding` and
2787/// `following` text.
2788///
2789/// `following` is coalesced across adjacent text tokens; the hanja-first arm
2790/// buffers (returns [`FollowingMatch::NeedMore`]) until it sees the closing `)`
2791/// or can rule a match out, which keeps the buffer bounded by the longest
2792/// possible reading.  With `flush` set (a boundary or EOF ended the run) an
2793/// otherwise-undecidable parenthetical is treated as a non-match.
2794fn classify_following(
2795    preceding: &str,
2796    annotation: &Annotation,
2797    following: &str,
2798    flush: bool,
2799) -> FollowingMatch {
2800    let Some(first) = following.chars().next() else {
2801        return if flush {
2802            FollowingMatch::NoMatch
2803        } else {
2804            FollowingMatch::NeedMore
2805        };
2806    };
2807    match first {
2808        ')' => match match_hangul_first(preceding, annotation, following) {
2809            Some((remaining_preceding, collapsed)) => FollowingMatch::HangulFirst {
2810                remaining_preceding,
2811                collapsed,
2812                leftover: following[')'.len_utf8()..].to_string(),
2813            },
2814            None => FollowingMatch::NoMatch,
2815        },
2816        '(' => {
2817            let content = &following['('.len_utf8()..];
2818            match content.find(')') {
2819                Some(close) => {
2820                    let candidate = &content[..close];
2821                    match classify_reading(&annotation.hanja, &annotation.reading, candidate) {
2822                        Some(reading_match) => FollowingMatch::HanjaFirst {
2823                            collapsed: collapse_annotation(annotation.clone(), reading_match),
2824                            leftover: content[close + ')'.len_utf8()..].to_string(),
2825                        },
2826                        None => FollowingMatch::NoMatch,
2827                    }
2828                }
2829                None => {
2830                    // A reading is at most max(reading, hanja) syllables long, so
2831                    // once the unclosed content exceeds that it cannot match.
2832                    let max_reading = annotation
2833                        .reading
2834                        .chars()
2835                        .count()
2836                        .max(annotation.hanja.chars().count());
2837                    if flush || content.chars().count() > max_reading {
2838                        FollowingMatch::NoMatch
2839                    } else {
2840                        FollowingMatch::NeedMore
2841                    }
2842                }
2843            }
2844        }
2845        _ => FollowingMatch::NoMatch,
2846    }
2847}
2848
2849/// Matches the hangul-first pattern preceding `Text("…R(")` + `Annotated` +
2850/// following `Text(")…")`.  On success returns the preceding text remaining
2851/// after stripping `R(` and the collapsed annotation.
2852fn match_hangul_first(
2853    preceding: &str,
2854    annotation: &Annotation,
2855    following: &str,
2856) -> Option<(String, Annotation)> {
2857    if !following.starts_with(')') {
2858        return None;
2859    }
2860    let before = preceding.strip_suffix('(')?;
2861
2862    // Tier 1: the text just before `(` ends with the annotation's reading.
2863    if !annotation.reading.is_empty()
2864        && let Some(remaining) = before.strip_suffix(&annotation.reading)
2865    {
2866        let collapsed = collapse_annotation(annotation.clone(), ReadingMatch::Keep);
2867        return Some((remaining.to_string(), collapsed));
2868    }
2869
2870    // Tier 2: the trailing hanja-character count of hangul syllables form a
2871    // valid alternative reading.  Slice `before` directly at the byte boundary
2872    // of those trailing syllables rather than collecting it into a `Vec<char>`.
2873    let syllable_count = annotation.hanja.chars().count();
2874    if syllable_count == 0 {
2875        return None;
2876    }
2877    let (split, _) = before.char_indices().rev().nth(syllable_count - 1)?;
2878    let candidate = &before[split..];
2879    let reading_match = classify_reading(&annotation.hanja, &annotation.reading, candidate)?;
2880    Some((
2881        before[..split].to_string(),
2882        collapse_annotation(annotation.clone(), reading_match),
2883    ))
2884}
2885
2886/// Applies literal user directives to annotation policy flags.
2887///
2888/// Rules only set flags; they do not render, remove, or reorder tokens.
2889pub fn apply_user_directives<S>(
2890    tokens: impl IntoIterator<Item = OutputToken<S>>,
2891    directives: &UserDirectives<'_>,
2892) -> Vec<OutputToken<S>> {
2893    apply_user_directives_iter(tokens, directives).collect()
2894}
2895
2896/// Lazily applies literal user directives to an output token stream.
2897///
2898/// Returns an iterator that walks the input tokens without intermediate
2899/// buffering. Use this variant in streaming pipelines that need to chain
2900/// directive application with other lazy stages such as [`render_tokens_iter`].
2901pub fn apply_user_directives_iter<'a, S>(
2902    tokens: impl IntoIterator<Item = OutputToken<S>> + 'a,
2903    directives: &'a UserDirectives<'_>,
2904) -> impl Iterator<Item = OutputToken<S>> + 'a {
2905    tokens.into_iter().map(|token| directives.apply(token))
2906}
2907
/// Buffers output tokens per context window and runs a callback over each
/// completed window before releasing it downstream.
struct ContextMiddleware<S, F>
where
    S: ScopeData,
    F: FnMut(&mut [OutputToken<S>]),
{
    /// Selects how much is buffered before a flush: nothing (`Off`), the whole
    /// stream (`PerDocument`), or up to a block / section boundary scope.
    window: ContextWindow,
    /// Callback invoked on the buffered tokens, in place, right before they
    /// are released.
    apply: F,
    /// Tokens buffered for the current window.
    context: Vec<OutputToken<S>>,
    /// One entry per currently open scope (block / section windows only):
    /// whether that scope's `Open` token counted as a window boundary.
    scope_boundaries: Vec<bool>,
}
2918
2919impl<S, F> ContextMiddleware<S, F>
2920where
2921    S: ScopeData,
2922    F: FnMut(&mut [OutputToken<S>]),
2923{
2924    fn new(window: ContextWindow, apply: F) -> Self {
2925        Self {
2926            window,
2927            apply,
2928            context: Vec::new(),
2929            scope_boundaries: Vec::new(),
2930        }
2931    }
2932
2933    fn process(mut self, tokens: impl IntoIterator<Item = OutputToken<S>>) -> Vec<OutputToken<S>> {
2934        let mut output = Vec::new();
2935        for token in tokens {
2936            output.extend(self.push_token(token));
2937        }
2938        output.extend(self.finish());
2939        output
2940    }
2941
2942    fn push_token(&mut self, token: OutputToken<S>) -> Vec<OutputToken<S>> {
2943        let mut output = Vec::new();
2944        match self.window {
2945            ContextWindow::Off => output.push(token),
2946            ContextWindow::PerDocument => self.context.push(token),
2947            ContextWindow::PerBlock | ContextWindow::PerSection => match &token {
2948                OutputToken::Open(scope) => {
2949                    let is_boundary = match self.window {
2950                        ContextWindow::PerBlock => scope.data().is_block_boundary(),
2951                        ContextWindow::PerSection => scope.data().is_section_boundary(),
2952                        ContextWindow::Off | ContextWindow::PerDocument => false,
2953                    };
2954                    if is_boundary {
2955                        self.flush_context(&mut output);
2956                    }
2957                    self.scope_boundaries.push(is_boundary);
2958                    self.context.push(token);
2959                }
2960                OutputToken::Close => {
2961                    let closes_boundary = self.scope_boundaries.pop().unwrap_or(false);
2962                    self.context.push(token);
2963                    if closes_boundary && self.window == ContextWindow::PerBlock {
2964                        self.flush_context(&mut output);
2965                    }
2966                }
2967                _ => self.context.push(token),
2968            },
2969        }
2970        output
2971    }
2972
2973    fn finish(mut self) -> Vec<OutputToken<S>> {
2974        let mut output = Vec::new();
2975        self.flush_context(&mut output);
2976        output
2977    }
2978
2979    fn flush_context(&mut self, output: &mut Vec<OutputToken<S>>) {
2980        if self.context.is_empty() {
2981            return;
2982        }
2983
2984        (self.apply)(&mut self.context);
2985        output.append(&mut self.context);
2986    }
2987}
2988
/// Lookup table from a reading to every hanja form recorded with that reading.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
struct HomophoneIndex {
    /// Hanja forms grouped by reading; a form has a homophone when its
    /// reading's set contains any other form.
    forms_by_reading: BTreeMap<String, BTreeSet<String>>,
}
2993
2994impl HomophoneIndex {
2995    fn from_dictionary<D>(dictionary: &D) -> Option<Self>
2996    where
2997        D: HanjaDictionary + ?Sized,
2998    {
2999        let mut forms_by_reading = BTreeMap::<String, BTreeSet<String>>::new();
3000        for record in dictionary.entries()? {
3001            forms_by_reading
3002                .entry(record.reading)
3003                .or_default()
3004                .insert(record.hanja);
3005        }
3006        Some(Self { forms_by_reading })
3007    }
3008
3009    fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
3010        self.forms_by_reading
3011            .get(reading)
3012            .is_some_and(|forms| forms.iter().any(|form| form != hanja))
3013    }
3014}
3015
3016fn mark_homophones_in_context<S, D>(
3017    tokens: &mut [OutputToken<S>],
3018    index: Option<&HomophoneIndex>,
3019    lookup_fallback: Option<&D>,
3020) where
3021    D: HanjaDictionary + ?Sized,
3022{
3023    let mut forms_by_reading = BTreeMap::<String, BTreeSet<String>>::new();
3024
3025    for token in tokens.iter() {
3026        if let OutputToken::Annotated(annotation) = token
3027            && annotation.from_dictionary
3028        {
3029            forms_by_reading
3030                .entry(annotation.reading.clone())
3031                .or_default()
3032                .insert(annotation.hanja.clone());
3033        }
3034    }
3035
3036    for token in tokens.iter_mut() {
3037        if let OutputToken::Annotated(annotation) = token {
3038            annotation.homophone = annotation.from_dictionary
3039                && (index.is_some_and(|index| {
3040                    index.has_homophone(&annotation.hanja, &annotation.reading)
3041                }) || lookup_fallback.is_some_and(|dictionary| {
3042                    dictionary.has_homophone(&annotation.hanja, &annotation.reading)
3043                }) || forms_by_reading
3044                    .get(&annotation.reading)
3045                    .is_some_and(|forms| forms.len() > 1));
3046        }
3047    }
3048}
3049
3050fn filter_first_occurrences_in_context<S>(tokens: &mut [OutputToken<S>]) {
3051    let mut seen = BTreeSet::new();
3052
3053    for token in tokens.iter_mut() {
3054        if let OutputToken::Annotated(annotation) = token {
3055            if seen.insert(annotation.hanja.clone()) {
3056                annotation.first_in_context = true;
3057            } else {
3058                annotation.first_in_context = false;
3059                // An explicit parenthetical gloss is the author asking for the
3060                // annotation at every occurrence, so its requirements survive
3061                // first-occurrence clearing; dictionary requirements do not.
3062                if !annotation.from_source_gloss {
3063                    annotation.require_hanja = false;
3064                    annotation.require_hangul = false;
3065                }
3066            }
3067        }
3068    }
3069}
3070
3071/// Renders engine output tokens into annotation-free tokens.
3072///
3073/// Structural and text tokens pass through. Each annotation is expanded into a
3074/// concrete rendered token according to the supplied options, the current
3075/// scope, and the annotation's flags. `options` accepts either a bare
3076/// [`RenderMode`] (via the `From<RenderMode>` impl on [`RenderOptions`]) or a
3077/// full [`RenderOptions`] value.
3078pub fn render_tokens<S, O>(
3079    tokens: impl IntoIterator<Item = OutputToken<S>>,
3080    options: O,
3081) -> Vec<RenderedToken<S>>
3082where
3083    S: ScopeData,
3084    O: Into<RenderOptions>,
3085{
3086    render_tokens_iter(tokens, options).collect()
3087}
3088
3089/// Renders engine output tokens into annotation-free tokens as an iterator.
3090///
3091/// The renderer maintains a small scope stack so that annotation expansion can
3092/// consult the active scope's [`ScopeData::allows_inline_markup`] when
3093/// choosing between an inline-markup form and a parenthesized fallback. Every
3094/// other token maps one-to-one to its rendered counterpart.
3095pub fn render_tokens_iter<S, O>(
3096    tokens: impl IntoIterator<Item = OutputToken<S>>,
3097    options: O,
3098) -> impl Iterator<Item = RenderedToken<S>>
3099where
3100    S: ScopeData,
3101    O: Into<RenderOptions>,
3102{
3103    RendererIter {
3104        upstream: tokens.into_iter(),
3105        renderer: Renderer::new(options),
3106    }
3107}
3108
/// Stateful renderer for chunked [`OutputToken`] streams.
///
/// `Renderer` is the push-based counterpart to [`render_tokens_iter`]. It
/// preserves the active scope stack across calls so format writers can consume
/// rendered tokens as soon as upstream engine and middleware stages release
/// them, without losing inline-markup restrictions from earlier chunks.
pub struct Renderer<S>
where
    S: ScopeData,
{
    /// Rendering options, converted once in [`Renderer::new`] and consulted
    /// for every annotation.
    options: RenderOptions,
    /// Cached `allows_inline_markup` value for each open scope. Storing the
    /// boolean instead of the whole scope keeps the renderer free of an extra
    /// `S: Clone` bound at this layer (it already requires it via `ScopeData`)
    /// and avoids the cost of cloning adapter-owned data.
    markup_stack: Vec<bool>,
    /// Number of currently open scopes whose `allows_inline_markup` is
    /// `false`. Inline markup is safe at the current cursor only when this
    /// counter is zero; otherwise some ancestor forbids markup and a nested
    /// allow-markup scope cannot override that restriction.
    disallowing_ancestors: usize,
    /// Ties the renderer to the scope type `S` without storing a value of it.
    _scope: PhantomData<fn(S)>,
}
3132
3133impl<S> Renderer<S>
3134where
3135    S: ScopeData,
3136{
3137    /// Creates a renderer with the supplied rendering options.
3138    pub fn new<O>(options: O) -> Self
3139    where
3140        O: Into<RenderOptions>,
3141    {
3142        Self {
3143            options: options.into(),
3144            markup_stack: Vec::new(),
3145            disallowing_ancestors: 0,
3146            _scope: PhantomData,
3147        }
3148    }
3149
3150    /// Pushes one output token and returns its rendered counterpart.
3151    pub fn push_token(&mut self, token: OutputToken<S>) -> RenderedToken<S> {
3152        match token {
3153            OutputToken::Open(scope) => {
3154                let allows = scope.data().allows_inline_markup();
3155                if !allows {
3156                    self.disallowing_ancestors += 1;
3157                }
3158                self.markup_stack.push(allows);
3159                RenderedToken::Open(scope)
3160            }
3161            OutputToken::Close => {
3162                if let Some(false) = self.markup_stack.pop() {
3163                    // Saturating guard for malformed streams that emit more
3164                    // Close than Open tokens; the renderer should never
3165                    // panic on broken input.
3166                    self.disallowing_ancestors = self.disallowing_ancestors.saturating_sub(1);
3167                }
3168                RenderedToken::Close
3169            }
3170            OutputToken::Text(text) => RenderedToken::Text(text),
3171            OutputToken::Verbatim(text) => RenderedToken::Verbatim(text),
3172            OutputToken::Annotated(annotation) => {
3173                // Inline markup is allowed only when no open ancestor scope
3174                // forbids it. The plain-text reader wraps its input in a
3175                // scope whose `allows_inline_markup` is false, so plain text
3176                // still falls back to parens; HTML and Markdown root
3177                // contexts emit no enclosing scope and therefore start with
3178                // an empty stack, leaving annotations free to use markup.
3179                let allows_inline_markup = self.disallowing_ancestors == 0;
3180                render_annotation(&annotation, &self.options, allows_inline_markup)
3181            }
3182        }
3183    }
3184}
3185
3186struct RendererIter<I, S>
3187where
3188    S: ScopeData,
3189{
3190    upstream: I,
3191    renderer: Renderer<S>,
3192}
3193
3194impl<I, S> Iterator for RendererIter<I, S>
3195where
3196    I: Iterator<Item = OutputToken<S>>,
3197    S: ScopeData,
3198{
3199    type Item = RenderedToken<S>;
3200
3201    fn next(&mut self) -> Option<Self::Item> {
3202        let token = self.upstream.next()?;
3203        Some(self.renderer.push_token(token))
3204    }
3205}
3206
3207fn render_annotation<S>(
3208    annotation: &Annotation,
3209    options: &RenderOptions,
3210    allows_inline_markup: bool,
3211) -> RenderedToken<S> {
3212    if annotation.skip_annotation {
3213        let primary = match options.mode {
3214            RenderMode::HangulOnly | RenderMode::HangulHanjaParens => annotation.reading.clone(),
3215            RenderMode::HanjaHangulParens | RenderMode::Original => annotation.hanja.clone(),
3216            RenderMode::Ruby(RubyBase::OnHangul) => annotation.reading.clone(),
3217            RenderMode::Ruby(RubyBase::OnHanja) => annotation.hanja.clone(),
3218        };
3219        return RenderedToken::Text(primary);
3220    }
3221
3222    match options.mode {
3223        RenderMode::HangulOnly if annotation.require_hanja || annotation.homophone => {
3224            RenderedToken::Text(parens(&annotation.reading, &annotation.hanja))
3225        }
3226        RenderMode::HangulOnly => RenderedToken::Text(annotation.reading.clone()),
3227        RenderMode::HangulHanjaParens => {
3228            RenderedToken::Text(parens(&annotation.reading, &annotation.hanja))
3229        }
3230        RenderMode::HanjaHangulParens => {
3231            RenderedToken::Text(parens(&annotation.hanja, &annotation.reading))
3232        }
3233        RenderMode::Ruby(base) => render_ruby(annotation, base, allows_inline_markup),
3234        RenderMode::Original if annotation.require_hangul => match options.original_gloss {
3235            OriginalGloss::Parens => {
3236                RenderedToken::Text(parens(&annotation.hanja, &annotation.reading))
3237            }
3238            // `Original` keeps hanja as the primary text, so its ruby form
3239            // always uses hanja as the base regardless of any other setting.
3240            OriginalGloss::Ruby => render_ruby(annotation, RubyBase::OnHanja, allows_inline_markup),
3241        },
3242        RenderMode::Original => RenderedToken::Text(annotation.hanja.clone()),
3243    }
3244}
3245
3246fn render_ruby<S>(
3247    annotation: &Annotation,
3248    base: RubyBase,
3249    allows_inline_markup: bool,
3250) -> RenderedToken<S> {
3251    let (base_text, rt_text) = match base {
3252        RubyBase::OnHangul => (&annotation.reading, &annotation.hanja),
3253        RubyBase::OnHanja => (&annotation.hanja, &annotation.reading),
3254    };
3255    if !allows_inline_markup {
3256        return RenderedToken::Text(parens(base_text, rt_text));
3257    }
3258    RenderedToken::Ruby {
3259        base: base_text.clone(),
3260        rt: rt_text.clone(),
3261    }
3262}
3263
/// Formats `base` followed by `gloss` wrapped in ASCII parentheses, e.g.
/// `base(gloss)`.
///
/// The arguments are deliberately role-neutral: callers pass either the
/// reading or the hanja as `base`, depending on the render mode.
fn parens(base: &str, gloss: &str) -> String {
    // The final length is known up front: both strings plus the two
    // one-byte parentheses, so a single allocation suffices.
    let mut output = String::with_capacity(base.len() + gloss.len() + 2);
    output.push_str(base);
    output.push('(');
    output.push_str(gloss);
    output.push(')');
    output
}
3272
3273/// Converts plain text through reader, engine, renderer, and writer stages.
3274///
3275/// This is a convenience for the plain-text MVP path. More capable format
3276/// adapters should call the individual stages so they can preserve their own
3277/// structural tokens. The `render` argument accepts either a [`RenderMode`]
3278/// (converted via `From<RenderMode>` for [`RenderOptions`]) or a full
3279/// [`RenderOptions`] value.
3280///
3281/// Like the high-level umbrella default, this collapses redundant parenthetical
3282/// reading annotations ([`RedundantParenCollapser`]); callers that need finer
3283/// control (including disabling that step) should drive the individual stages
3284/// instead.
3285pub fn convert_plain_text<D, R>(input: &str, dictionary: &D, render: R) -> String
3286where
3287    D: HanjaDictionary + ?Sized,
3288    R: Into<RenderOptions>,
3289{
3290    convert_plain_text_with_options(input, dictionary, render, EngineOptions::default())
3291}
3292
3293/// Converts plain text with explicit hanja conversion engine options.
3294///
3295/// This is the option-aware variant of [`convert_plain_text`].
3296pub fn convert_plain_text_with_options<D, R>(
3297    input: &str,
3298    dictionary: &D,
3299    render: R,
3300    options: EngineOptions,
3301) -> String
3302where
3303    D: HanjaDictionary + ?Sized,
3304    R: Into<RenderOptions>,
3305{
3306    let input_tokens = read_plain_text(input);
3307    let output_tokens = process_tokens_with_options(input_tokens, dictionary, options);
3308    let output_tokens = collapse_redundant_parens(output_tokens, true);
3309    let output_tokens = mark_homophones(output_tokens, dictionary, ContextWindow::PerBlock);
3310    let rendered_tokens = render_tokens(output_tokens, render);
3311    write_plain_text(rendered_tokens)
3312}