1#![no_std]
25#![forbid(unsafe_code)]
26#![deny(missing_docs)]
27
28extern crate alloc;
29
30mod fallback;
31mod generated;
32mod segment;
33
34use alloc::boxed::Box;
35use alloc::collections::{BTreeMap, BTreeSet};
36use alloc::string::{String, ToString};
37use alloc::vec::Vec;
38use core::marker::PhantomData;
39
40use fallback::{
41 FallbackPart, FallbackState, fallback_reading_for_run, phoneticize_fallback_run_with_state,
42};
43use generated::unihan_readings::KHANGUL_READINGS;
44use segment::{Segment, segment_text};
45
/// Errors reported by this crate.
///
/// The enum is `#[non_exhaustive]`, so downstream `match` expressions need a
/// wildcard arm.
#[derive(Debug, thiserror::Error)]
#[non_exhaustive]
pub enum Error {
    /// A dictionary could not be loaded; the payload is the detail text
    /// appended to the rendered message.
    #[error("dictionary load failed: {0}")]
    DictionaryLoad(String),

    /// Segmentation of a piece of hanja text failed.
    #[error("segmentation failed for {hanja:?}: {reason}")]
    Segmentation {
        /// The hanja text quoted in the error message.
        hanja: String,

        /// Free-form explanation appended to the error message.
        reason: String,
    },

    /// A hangul reading was rejected as invalid for the given hanja.
    #[error("invalid hangul reading {reading:?} for hanja {hanja:?}")]
    InvalidReading {
        /// The hanja text the reading was supplied for.
        hanja: String,

        /// The rejected reading.
        reading: String,
    },

    /// An internal invariant was violated; the payload is a static
    /// description of that invariant.
    #[error("internal invariant violated: {0}")]
    Internal(&'static str),

    /// Any other boxed error. The `transparent` attribute forwards display
    /// and source to the wrapped error, and `#[from]` generates the `From`
    /// conversion so `?` can be used on boxed errors.
    #[error(transparent)]
    Other(#[from] Box<dyn core::error::Error + Send + Sync + 'static>),
}
88
/// Policy applied when an input reader hands over a
/// [`RecoverableInputError`] instead of a token.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum Recovery {
    /// Propagate the reader's error to the caller (the default).
    #[default]
    Strict,

    /// Swallow the error and pass the offending source text through as an
    /// [`InputToken::Verbatim`] token.
    Lenient,
}
103
104#[derive(Debug)]
110pub struct RecoverableInputError {
111 original: String,
112 error: Error,
113}
114
115impl RecoverableInputError {
116 pub fn new(original: String, error: Error) -> Self {
118 Self { original, error }
119 }
120
121 pub fn original(&self) -> &str {
124 &self.original
125 }
126
127 pub fn error(&self) -> &Error {
129 &self.error
130 }
131
132 pub fn into_parts(self) -> (String, Error) {
134 (self.original, self.error)
135 }
136}
137
/// Caller-defined metadata attached to every [`Scope`].
///
/// The engine queries these predicates to decide how text inside a scope is
/// treated.
pub trait ScopeData: Clone + 'static {
    /// Returns `true` when text directly inside this scope must be emitted
    /// unchanged. The engine forwards such text as-is and resets its
    /// fallback context.
    fn is_preserve(&self) -> bool;

    /// Whether inline markup may be produced inside this scope.
    /// Defaults to `true`.
    ///
    /// NOTE(review): the consumer of this flag is outside this section of
    /// the file; confirm the exact effect there.
    fn allows_inline_markup(&self) -> bool {
        true
    }

    /// Whether this scope is a block boundary. The engine resets its
    /// fallback context when such a scope opens and again when it closes.
    /// Defaults to `false`.
    fn is_block_boundary(&self) -> bool {
        false
    }

    /// Whether this scope is a section boundary. Defaults to `false`.
    ///
    /// NOTE(review): not consulted in this section of the file; confirm the
    /// consumer.
    fn is_section_boundary(&self) -> bool {
        false
    }
}
176
/// A scope marker that wraps caller-defined data `S`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Scope<S> {
    data: S,
}

impl<S> Scope<S> {
    /// Wraps `data` in a scope.
    pub fn new(data: S) -> Self {
        Scope { data }
    }

    /// Borrows the wrapped data.
    pub fn data(&self) -> &S {
        let Scope { data } = self;
        data
    }

    /// Unwraps the scope, returning the data by value.
    pub fn into_data(self) -> S {
        let Scope { data } = self;
        data
    }
}
203
/// One token of the input stream fed to the engine.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum InputToken<S> {
    /// Opens a scope; it stays the innermost scope until the matching
    /// [`InputToken::Close`].
    Open(Scope<S>),

    /// Closes the innermost open scope.
    Close,

    /// Text to convert, unless the innermost scope reports
    /// `is_preserve()`, in which case it is forwarded unchanged.
    Text(String),

    /// Text that is always forwarded unchanged.
    Verbatim(String),
}
223
/// One token of the engine's output stream.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum OutputToken<S> {
    /// A scope opening, forwarded from the input.
    Open(Scope<S>),

    /// A scope closing, forwarded from the input.
    Close,

    /// Plain text that carries no annotation.
    Text(String),

    /// Verbatim text forwarded unchanged from the input.
    Verbatim(String),

    /// A hanja span together with the reading chosen for it.
    Annotated(Annotation),
}
246
/// One token of a rendered stream, as consumed by writers such as
/// [`write_plain_text`].
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum RenderedToken<S> {
    /// A scope opening.
    Open(Scope<S>),

    /// A scope closing.
    Close,

    /// Plain text.
    Text(String),

    /// Verbatim text.
    Verbatim(String),

    /// A base text paired with a gloss.
    ///
    /// NOTE(review): the field names suggest HTML `<ruby>`/`<rt>` semantics;
    /// the producer is outside this section — confirm.
    Ruby {
        /// The base text.
        base: String,

        /// The gloss attached to the base text.
        rt: String,
    },
}
286
/// A hanja span and the hangul reading selected for it, plus flags that
/// later stages use to decide how to render it.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Annotation {
    /// The source hanja text of the span.
    pub hanja: String,

    /// The reading assigned to the span.
    pub reading: String,

    /// Homophone flag. The engine code in this section always emits
    /// `false`.
    ///
    /// NOTE(review): presumably set by a later pass — confirm.
    pub homophone: bool,

    /// Copied from [`MatchMark::require_hanja`] for dictionary matches;
    /// `false` for fallback readings.
    pub require_hanja: bool,

    /// Copied from [`MatchMark::require_hangul`] for dictionary matches;
    /// `false` for fallback readings.
    pub require_hangul: bool,

    /// First-occurrence flag. The engine code in this section always emits
    /// `true`.
    ///
    /// NOTE(review): presumably refined by a later context pass — confirm.
    pub first_in_context: bool,

    /// Skip flag. The engine code in this section always emits `false`.
    ///
    /// NOTE(review): presumably set by a later pass — confirm.
    pub skip_annotation: bool,

    /// `true` when the reading came from a dictionary match, `false` when it
    /// came from the fallback phoneticizer.
    pub from_dictionary: bool,
}
320
/// Per-entry flags that a dictionary attaches to a match. They are copied
/// verbatim into the resulting [`Annotation`].
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct MatchMark {
    /// Carried into [`Annotation::require_hanja`].
    ///
    /// NOTE(review): presumably forces the hanja to be shown when rendered —
    /// confirm against the renderer.
    pub require_hanja: bool,

    /// Carried into [`Annotation::require_hangul`].
    ///
    /// NOTE(review): presumably forces the hangul reading to be shown when
    /// rendered — confirm against the renderer.
    pub require_hangul: bool,
}
330
/// One enumerated dictionary entry, as yielded by
/// [`HanjaDictionary::entries`].
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct DictionaryRecord {
    /// The hanja headword.
    pub hanja: String,

    /// The reading stored for the headword.
    pub reading: String,

    /// The flags stored with the entry.
    pub mark: MatchMark,
}
349
/// A dictionary hit anchored at the start of the probed string.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Match {
    /// Number of bytes of the probed string covered by the match.
    pub byte_len: usize,

    /// The reading for the matched text.
    pub reading: String,

    /// The flags stored with the matched entry.
    pub mark: MatchMark,
}
362
363pub trait HanjaDictionary {
369 fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a>;
371
372 fn max_word_chars(&self) -> Option<usize> {
374 None
375 }
376
377 fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
383 None
384 }
385
386 fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
388 self.entries().is_some_and(|mut entries| {
389 entries.any(|record| record.hanja != hanja && record.reading == reading)
390 })
391 }
392}
393
394impl<D> HanjaDictionary for &D
395where
396 D: HanjaDictionary + ?Sized,
397{
398 fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
399 (**self).matches_at(s)
400 }
401
402 fn max_word_chars(&self) -> Option<usize> {
403 (**self).max_word_chars()
404 }
405
406 fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
407 (**self).entries()
408 }
409
410 fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
411 (**self).has_homophone(hanja, reading)
412 }
413}
414
415impl<D> HanjaDictionary for Box<D>
416where
417 D: HanjaDictionary + ?Sized,
418{
419 fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
420 (**self).matches_at(s)
421 }
422
423 fn max_word_chars(&self) -> Option<usize> {
424 (**self).max_word_chars()
425 }
426
427 fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
428 (**self).entries()
429 }
430
431 fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
432 (**self).has_homophone(hanja, reading)
433 }
434}
435
436#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
444pub struct UnihanCharDict;
445
446impl HanjaDictionary for UnihanCharDict {
447 fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
448 let matched = s.chars().next().and_then(|ch| {
449 khangul_reading(ch).map(|reading| Match {
450 byte_len: ch.len_utf8(),
451 reading: reading.to_string(),
452 mark: MatchMark::default(),
453 })
454 });
455 Box::new(matched.into_iter())
456 }
457
458 fn max_word_chars(&self) -> Option<usize> {
459 Some(1)
460 }
461
462 fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
463 Some(Box::new(KHANGUL_READINGS.iter().map(|(hanja, reading)| {
464 DictionaryRecord {
465 hanja: hanja.to_string(),
466 reading: reading.to_string(),
467 mark: MatchMark::default(),
468 }
469 })))
470 }
471
472 fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
473 let mut chars = hanja.chars();
474 let Some(hanja) = chars.next() else {
475 return false;
476 };
477 if chars.next().is_some() {
478 return false;
479 }
480 KHANGUL_READINGS
481 .iter()
482 .any(|&(other_hanja, other_reading)| other_hanja != hanja && other_reading == reading)
483 }
484}
485
/// An ordered list of dictionaries that are consulted together; earlier
/// dictionaries take precedence over later ones.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct ChainDictionary<D> {
    dictionaries: Vec<D>,
}

impl<D> ChainDictionary<D> {
    /// Creates an empty chain.
    pub fn new() -> Self {
        ChainDictionary {
            dictionaries: Vec::new(),
        }
    }

    /// Appends `dictionary` with the lowest precedence so far.
    pub fn push(&mut self, dictionary: D) {
        self.dictionaries.push(dictionary);
    }

    /// Number of dictionaries in the chain.
    pub fn len(&self) -> usize {
        self.dictionaries().len()
    }

    /// `true` when the chain holds no dictionaries.
    pub fn is_empty(&self) -> bool {
        self.dictionaries().is_empty()
    }

    /// The chained dictionaries, in precedence order.
    pub fn dictionaries(&self) -> &[D] {
        self.dictionaries.as_slice()
    }

    /// Consumes the chain and returns its dictionaries, in precedence order.
    pub fn into_dictionaries(self) -> Vec<D> {
        let ChainDictionary { dictionaries } = self;
        dictionaries
    }
}

/// Collects dictionaries into a chain, keeping the iteration order as the
/// precedence order.
impl<D> FromIterator<D> for ChainDictionary<D> {
    fn from_iter<T: IntoIterator<Item = D>>(iter: T) -> Self {
        ChainDictionary {
            dictionaries: iter.into_iter().collect(),
        }
    }
}
539
540impl<D> HanjaDictionary for ChainDictionary<D>
541where
542 D: HanjaDictionary,
543{
544 fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
545 let mut seen_lengths = BTreeSet::new();
546 let mut matches = Vec::new();
547
548 for dictionary in &self.dictionaries {
549 for matched in dictionary.matches_at(s) {
550 if seen_lengths.insert(matched.byte_len) {
551 matches.push(matched);
552 }
553 }
554 }
555
556 matches.sort_by_key(|matched| matched.byte_len);
557 Box::new(matches.into_iter())
558 }
559
560 fn max_word_chars(&self) -> Option<usize> {
561 let mut max = None;
562 for dictionary in &self.dictionaries {
563 let word_chars = dictionary.max_word_chars()?;
564 max = Some(max.map_or(word_chars, |current: usize| current.max(word_chars)));
565 }
566 max
567 }
568
569 fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
570 let mut records = BTreeMap::<String, DictionaryRecord>::new();
571
572 for dictionary in &self.dictionaries {
573 for record in dictionary.entries()? {
574 records.entry(record.hanja.clone()).or_insert(record);
575 }
576 }
577
578 Some(Box::new(records.into_values()))
579 }
580
581 fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
582 if let Some(mut records) = self.entries() {
583 return records.any(|record| record.hanja != hanja && record.reading == reading);
584 }
585
586 self.dictionaries
587 .iter()
588 .any(|dictionary| dictionary.has_homophone(hanja, reading))
589 }
590}
591
592fn khangul_reading(ch: char) -> Option<&'static str> {
593 KHANGUL_READINGS
594 .binary_search_by_key(&ch, |(hanja, _)| *hanja)
595 .ok()
596 .map(|index| KHANGUL_READINGS[index].1)
597}
598
/// Tunable behaviour of the conversion engine.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct EngineOptions {
    /// Strategy passed to the segmenter for every buffered run of text.
    pub segmentation: SegmentationStrategy,

    /// NOTE(review): presumably toggles the Korean initial sound law when
    /// readings are produced; the consumer is outside this section —
    /// confirm.
    pub initial_sound_law: bool,

    /// How numerals are handled; the consumer is outside this section.
    pub numeral_strategy: NumeralStrategy,
}

impl Default for EngineOptions {
    /// Lattice segmentation, initial sound law enabled, hangul-phonetic
    /// numerals.
    fn default() -> Self {
        Self {
            segmentation: SegmentationStrategy::Lattice,
            initial_sound_law: true,
            numeral_strategy: NumeralStrategy::HangulPhonetic,
        }
    }
}
626
/// Selects the algorithm the segmenter uses to split text into dictionary
/// words. The algorithms themselves live in the `segment` module.
#[non_exhaustive]
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum SegmentationStrategy {
    /// The default strategy.
    ///
    /// NOTE(review): presumably a lattice (best-path) search — confirm in
    /// the `segment` module.
    #[default]
    Lattice,

    /// NOTE(review): presumably greedy matching — confirm in the `segment`
    /// module.
    Eager,
}
643
/// Selects how numerals are handled. The strategy is consumed outside this
/// section of the file, so the notes below are inferred from the variant
/// names only and must be confirmed against the implementation.
#[non_exhaustive]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum NumeralStrategy {
    /// The strategy used by [`EngineOptions::default`].
    ///
    /// NOTE(review): presumably reads numerals out as hangul — confirm.
    HangulPhonetic,

    /// NOTE(review): presumably converts digit-by-digit (positional)
    /// numerals to Arabic digits — confirm.
    PositionalArabic,

    /// NOTE(review): presumably converts additive numerals to Arabic digits
    /// — confirm.
    AdditiveArabic,

    /// NOTE(review): presumably chooses among the other strategies per
    /// numeral — confirm.
    Smart,
}
678
679#[derive(Clone, Debug, Eq, PartialEq)]
680struct DictionaryEntry {
681 reading: String,
682 mark: MatchMark,
683}
684
685#[derive(Clone, Debug, Default, Eq, PartialEq)]
691pub struct MapDictionary {
692 entries: BTreeMap<String, DictionaryEntry>,
693 max_word_chars: Option<usize>,
694}
695
696impl MapDictionary {
697 pub fn new() -> Self {
699 Self::default()
700 }
701
702 pub fn insert(&mut self, hanja: impl Into<String>, reading: impl Into<String>) {
704 self.insert_marked(hanja, reading, MatchMark::default());
705 }
706
707 pub fn insert_marked(
709 &mut self,
710 hanja: impl Into<String>,
711 reading: impl Into<String>,
712 mark: MatchMark,
713 ) {
714 let hanja = hanja.into();
715 let word_chars = hanja.chars().count();
716 self.max_word_chars = Some(self.max_word_chars.map_or(word_chars, |max| {
717 if word_chars > max { word_chars } else { max }
718 }));
719 self.entries.insert(
720 hanja,
721 DictionaryEntry {
722 reading: reading.into(),
723 mark,
724 },
725 );
726 }
727
728 pub fn is_empty(&self) -> bool {
730 self.entries.is_empty()
731 }
732
733 pub fn len(&self) -> usize {
735 self.entries.len()
736 }
737}
738
739impl HanjaDictionary for MapDictionary {
740 fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
741 Box::new(
742 self.entries
743 .iter()
744 .filter(move |(hanja, _)| s.starts_with(hanja.as_str()))
745 .map(|(hanja, entry)| Match {
746 byte_len: hanja.len(),
747 reading: entry.reading.clone(),
748 mark: entry.mark,
749 }),
750 )
751 }
752
753 fn max_word_chars(&self) -> Option<usize> {
754 self.max_word_chars
755 }
756
757 fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
758 Some(Box::new(self.entries.iter().map(|(hanja, entry)| {
759 DictionaryRecord {
760 hanja: hanja.clone(),
761 reading: entry.reading.clone(),
762 mark: entry.mark,
763 }
764 })))
765 }
766
767 fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
768 self.entries
769 .iter()
770 .any(|(other_hanja, entry)| other_hanja != hanja && entry.reading == reading)
771 }
772}
773
/// Scope data for plain-text input: never preserved, and inline markup is
/// not allowed.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct PlainScopeData;

impl ScopeData for PlainScopeData {
    /// Plain text is always eligible for conversion.
    fn is_preserve(&self) -> bool {
        false
    }

    /// Plain text cannot carry inline markup.
    fn allows_inline_markup(&self) -> bool {
        false
    }
}
793
794pub fn read_plain_text(input: &str) -> Vec<InputToken<PlainScopeData>> {
799 Vec::from([
800 InputToken::Open(Scope::new(PlainScopeData)),
801 InputToken::Text(input.to_string()),
802 InputToken::Close,
803 ])
804}
805
806pub fn write_plain_text<S>(tokens: impl IntoIterator<Item = RenderedToken<S>>) -> String {
814 let mut output = String::new();
815 for token in tokens {
816 match token {
817 RenderedToken::Open(_) | RenderedToken::Close => {}
818 RenderedToken::Text(text) | RenderedToken::Verbatim(text) => output.push_str(&text),
819 RenderedToken::Ruby { base, rt } => {
820 output.push_str(&parens(&base, &rt));
821 }
822 }
823 }
824 output
825}
826
827pub fn process_tokens<S, D>(
833 tokens: impl IntoIterator<Item = InputToken<S>>,
834 dictionary: &D,
835) -> Vec<OutputToken<S>>
836where
837 S: ScopeData,
838 D: HanjaDictionary + ?Sized,
839{
840 process_tokens_iter(tokens, dictionary).collect()
841}
842
843pub fn process_tokens_iter<S, D>(
851 tokens: impl IntoIterator<Item = InputToken<S>>,
852 dictionary: &D,
853) -> alloc::vec::IntoIter<OutputToken<S>>
854where
855 S: ScopeData,
856 D: HanjaDictionary + ?Sized,
857{
858 process_tokens_with_options(tokens, dictionary, EngineOptions::default()).into_iter()
859}
860
861pub fn process_tokens_with_options<S, D>(
866 tokens: impl IntoIterator<Item = InputToken<S>>,
867 dictionary: &D,
868 options: EngineOptions,
869) -> Vec<OutputToken<S>>
870where
871 S: ScopeData,
872 D: HanjaDictionary + ?Sized,
873{
874 let mut engine = Engine::collecting(dictionary, options);
875 let mut output = Vec::new();
876
877 for token in tokens {
878 output.extend(engine.push_token(token));
879 }
880
881 output.extend(engine.finish());
882 output
883}
884
885pub fn process_tokens_iter_with_options<S, D>(
892 tokens: impl IntoIterator<Item = InputToken<S>>,
893 dictionary: &D,
894 options: EngineOptions,
895) -> alloc::vec::IntoIter<OutputToken<S>>
896where
897 S: ScopeData,
898 D: HanjaDictionary + ?Sized,
899{
900 process_tokens_with_options(tokens, dictionary, options).into_iter()
901}
902
903pub fn recover_input_tokens<S>(
925 tokens: impl IntoIterator<Item = Result<InputToken<S>, RecoverableInputError>>,
926 recovery: Recovery,
927) -> Result<Vec<InputToken<S>>, Error>
928where
929 S: ScopeData,
930{
931 let mut recovered = Vec::new();
932 for token in tokens {
933 recovered.push(recover_input_token(token, recovery)?);
934 }
935 Ok(recovered)
936}
937
938pub fn recover_input_token<S>(
945 token: Result<InputToken<S>, RecoverableInputError>,
946 recovery: Recovery,
947) -> Result<InputToken<S>, Error>
948where
949 S: ScopeData,
950{
951 match token {
952 Ok(token) => Ok(token),
953 Err(error) => match recovery {
954 Recovery::Strict => Err(error.into_parts().1),
955 Recovery::Lenient => {
956 let (original, error) = error.into_parts();
957 tracing::warn!(error = %error, "recovering from input reader error");
958 Ok(InputToken::Verbatim(original))
959 }
960 },
961 }
962}
963
964pub fn process_fallible_tokens<S, D>(
971 tokens: impl IntoIterator<Item = Result<InputToken<S>, RecoverableInputError>>,
972 dictionary: &D,
973 recovery: Recovery,
974) -> Result<Vec<OutputToken<S>>, Error>
975where
976 S: ScopeData,
977 D: HanjaDictionary + ?Sized,
978{
979 process_fallible_tokens_with_options(tokens, dictionary, EngineOptions::default(), recovery)
980}
981
982pub fn process_fallible_tokens_with_options<S, D>(
989 tokens: impl IntoIterator<Item = Result<InputToken<S>, RecoverableInputError>>,
990 dictionary: &D,
991 options: EngineOptions,
992 recovery: Recovery,
993) -> Result<Vec<OutputToken<S>>, Error>
994where
995 S: ScopeData,
996 D: HanjaDictionary + ?Sized,
997{
998 let recovered = recover_input_tokens(tokens, recovery)?;
999 Ok(process_tokens_with_options(recovered, dictionary, options))
1000}
1001
/// Streaming converter: accepts [`InputToken`]s one at a time and emits
/// [`OutputToken`]s, buffering text until it can be converted.
pub struct Engine<'a, S, D>
where
    S: ScopeData,
    D: HanjaDictionary + ?Sized,
{
    /// Dictionary consulted for segmentation.
    dictionary: &'a D,
    /// Options passed to segmentation and fallback processing.
    options: EngineOptions,
    /// Stack of currently open scopes, innermost last.
    scopes: Vec<Scope<S>>,
    /// Text received but not yet converted.
    pending_text: String,
    /// Byte length that `pending_text` had when it was last found to consist
    /// of a single fallback run that could not be flushed; `None` otherwise.
    pending_unflushable_fallback_run_bytes: Option<usize>,
    /// Fallback context carried from one flushed piece of text to the next.
    fallback_state: FallbackState,
    /// When `true`, a safe prefix of the buffer is flushed after every text
    /// token; when `false`, text is flushed only when forced.
    incremental_flush: bool,
}
1026
1027impl<'a, S, D> Engine<'a, S, D>
1028where
1029 S: ScopeData,
1030 D: HanjaDictionary + ?Sized,
1031{
1032 pub fn new(dictionary: &'a D) -> Self {
1034 Self::with_options(dictionary, EngineOptions::default())
1035 }
1036
1037 pub fn with_options(dictionary: &'a D, options: EngineOptions) -> Self {
1039 Self::with_incremental_flush(dictionary, options, true)
1040 }
1041
1042 fn collecting(dictionary: &'a D, options: EngineOptions) -> Self {
1043 Self::with_incremental_flush(dictionary, options, false)
1044 }
1045
1046 fn with_incremental_flush(
1047 dictionary: &'a D,
1048 options: EngineOptions,
1049 incremental_flush: bool,
1050 ) -> Self {
1051 tracing::debug!(
1052 strategy = ?options.segmentation,
1053 "engine created with segmentation strategy"
1054 );
1055 Self {
1056 dictionary,
1057 options,
1058 scopes: Vec::new(),
1059 pending_text: String::new(),
1060 pending_unflushable_fallback_run_bytes: None,
1061 fallback_state: FallbackState::default(),
1062 incremental_flush,
1063 }
1064 }
1065
1066 pub fn push_token(&mut self, token: InputToken<S>) -> Vec<OutputToken<S>> {
1069 let mut output = Vec::new();
1070 match token {
1071 InputToken::Open(scope) => {
1072 self.flush_into(&mut output);
1073 if scope.data().is_block_boundary() {
1074 self.reset_fallback_context();
1075 }
1076 self.scopes.push(scope.clone());
1077 output.push(OutputToken::Open(scope));
1078 }
1079 InputToken::Close => {
1080 self.flush_into(&mut output);
1081 let closes_block_boundary = self
1082 .scopes
1083 .pop()
1084 .is_some_and(|scope| scope.data().is_block_boundary());
1085 output.push(OutputToken::Close);
1086 if closes_block_boundary {
1087 self.reset_fallback_context();
1088 }
1089 }
1090 InputToken::Text(text) => {
1091 if self
1092 .scopes
1093 .last()
1094 .is_some_and(|scope| scope.data().is_preserve())
1095 {
1096 self.flush_into(&mut output);
1097 self.reset_fallback_context();
1098 output.push(OutputToken::Text(text));
1099 } else {
1100 let previous_pending_bytes = self.pending_text.len();
1101 self.pending_text.push_str(&text);
1102 if self
1103 .pending_unflushable_fallback_run_bytes
1104 .is_some_and(|bytes| bytes == previous_pending_bytes)
1105 {
1106 self.pending_unflushable_fallback_run_bytes = Some(previous_pending_bytes);
1107 } else {
1108 self.pending_unflushable_fallback_run_bytes = None;
1109 }
1110 if self.incremental_flush {
1111 self.flush_safe_into(&mut output);
1112 }
1113 }
1114 }
1115 InputToken::Verbatim(text) => {
1116 self.flush_into(&mut output);
1117 self.reset_fallback_context();
1118 output.push(OutputToken::Verbatim(text));
1119 }
1120 }
1121 output
1122 }
1123
1124 pub fn flush(&mut self) -> Vec<OutputToken<S>> {
1126 let mut output = Vec::new();
1127 self.flush_into(&mut output);
1128 output
1129 }
1130
1131 pub fn finish(mut self) -> Vec<OutputToken<S>> {
1133 self.flush()
1134 }
1135
1136 pub fn buffered_chars(&self) -> usize {
1138 self.pending_text.chars().count()
1139 }
1140
1141 fn tail_bound(&self) -> Option<usize> {
1142 self.dictionary.max_word_chars().filter(|bound| *bound > 0)
1143 }
1144
1145 fn flush_safe_into(&mut self, output: &mut Vec<OutputToken<S>>) {
1146 if self.pending_text.is_empty() {
1147 return;
1148 }
1149 if !self.pending_text.chars().any(is_hanja) {
1150 self.flush_non_hanja_safe_into(output);
1151 return;
1152 }
1153
1154 let Some(bound) = self.tail_bound() else {
1155 let Some(flush_end) = safe_unknown_bound_flush_end(&self.pending_text) else {
1156 return;
1157 };
1158 self.flush_prefix_into(flush_end, output);
1159 if !self.pending_text.chars().any(is_hanja) {
1160 self.flush_non_hanja_safe_into(output);
1161 }
1162 return;
1163 };
1164 if let Some(flush_end) = safe_unknown_bound_flush_end(&self.pending_text) {
1165 self.flush_prefix_into(flush_end, output);
1166 if !self.pending_text.chars().any(is_hanja) {
1167 self.flush_non_hanja_safe_into(output);
1168 }
1169 return;
1170 }
1171 let buffered_chars = self.buffered_chars();
1172 if buffered_chars > bound.saturating_mul(10) {
1173 tracing::debug!(
1174 buffered_chars,
1175 dict_max_word_chars = bound,
1176 "streaming tail buffer is unusually large"
1177 );
1178 }
1179 if buffered_chars <= bound {
1180 return;
1181 }
1182
1183 if self.extends_unflushable_fallback_run(bound) {
1184 self.pending_unflushable_fallback_run_bytes = Some(self.pending_text.len());
1185 return;
1186 }
1187
1188 let safe_chars = buffered_chars.saturating_sub(bound).saturating_add(1);
1189 let segments = segment_text(
1190 &self.pending_text,
1191 self.dictionary,
1192 self.options.segmentation,
1193 );
1194 let mut flush_end = 0;
1195 let mut flush_segments = Vec::new();
1196 for segment in &segments {
1197 let (byte_start, byte_end) = segment_bounds(segment);
1198 let start_chars = self.pending_text[..byte_start].chars().count();
1199 let end_chars = self.pending_text[..byte_end].chars().count();
1200 if byte_start > flush_end || (start_chars > safe_chars && flush_end > 0) {
1201 break;
1202 }
1203 if end_chars > safe_chars {
1204 break;
1205 }
1206 flush_end = byte_end;
1207 flush_segments.push(segment.clone());
1208 }
1209
1210 if let Some(fallback_start) = trailing_fallback_run_start(&segments, flush_end) {
1214 flush_end = fallback_start;
1215 while flush_segments
1216 .last()
1217 .is_some_and(|segment| segment_bounds(segment).1 > flush_end)
1218 {
1219 flush_segments.pop();
1220 }
1221 }
1222
1223 if flush_end > 0 {
1224 self.pending_unflushable_fallback_run_bytes = None;
1225 self.flush_segments_prefix_into(flush_end, &flush_segments, output);
1226 if !self.pending_text.chars().any(is_hanja) {
1227 self.flush_non_hanja_safe_into(output);
1228 }
1229 } else if trailing_fallback_run_start(&segments, self.pending_text.len()) == Some(0) {
1230 self.pending_unflushable_fallback_run_bytes = Some(self.pending_text.len());
1231 }
1232 }
1233
1234 fn extends_unflushable_fallback_run(&self, bound: usize) -> bool {
1235 let Some(previous_bytes) = self.pending_unflushable_fallback_run_bytes else {
1236 return false;
1237 };
1238 if previous_bytes == 0
1239 || previous_bytes > self.pending_text.len()
1240 || !self.pending_text.is_char_boundary(previous_bytes)
1241 {
1242 return false;
1243 }
1244
1245 let appended = &self.pending_text[previous_bytes..];
1246 if appended.is_empty() {
1247 return true;
1248 }
1249 if appended.chars().any(|ch| !is_hanja(ch)) {
1250 return false;
1251 }
1252
1253 let probe_start = suffix_start_for_char_count(
1257 &self.pending_text[..previous_bytes],
1258 bound.saturating_sub(1),
1259 );
1260 let probe = &self.pending_text[probe_start..];
1261 segment_text(probe, self.dictionary, self.options.segmentation)
1262 .iter()
1263 .all(|segment| matches!(segment, Segment::Fallback { .. }))
1264 }
1265
1266 fn flush_non_hanja_safe_into(&mut self, output: &mut Vec<OutputToken<S>>) {
1267 let flush_end = match self.tail_bound() {
1268 Some(bound) => safe_non_hanja_flush_end(&self.pending_text, bound),
1269 None => safe_unknown_bound_flush_end(&self.pending_text),
1270 };
1271 if let Some(flush_end) = flush_end {
1272 self.flush_prefix_into(flush_end, output);
1273 }
1274 }
1275
1276 fn flush_prefix_into(&mut self, flush_end: usize, output: &mut Vec<OutputToken<S>>) {
1277 if flush_end == self.pending_text.len() {
1278 self.flush_into(output);
1279 return;
1280 }
1281 self.pending_unflushable_fallback_run_bytes = None;
1282 let prefix = self.pending_text[..flush_end].to_string();
1283 let segments = segment_text(&prefix, self.dictionary, self.options.segmentation);
1284 self.flush_segments_prefix_into(flush_end, &segments, output);
1285 }
1286
1287 fn flush_segments_prefix_into(
1288 &mut self,
1289 flush_end: usize,
1290 segments: &[Segment],
1291 output: &mut Vec<OutputToken<S>>,
1292 ) {
1293 let prefix = self.pending_text[..flush_end].to_string();
1294 process_segments_with_state(
1295 &prefix,
1296 segments,
1297 self.dictionary,
1298 self.options,
1299 &mut self.fallback_state,
1300 output,
1301 );
1302 self.pending_text.replace_range(..flush_end, "");
1303 }
1304
1305 fn flush_into(&mut self, output: &mut Vec<OutputToken<S>>) {
1306 if self.pending_text.is_empty() {
1307 return;
1308 }
1309 self.pending_unflushable_fallback_run_bytes = None;
1310 let text = core::mem::take(&mut self.pending_text);
1311 process_text_with_state(
1312 &text,
1313 self.dictionary,
1314 self.options,
1315 &mut self.fallback_state,
1316 output,
1317 );
1318 }
1319
1320 fn reset_fallback_context(&mut self) {
1321 self.fallback_state = FallbackState::default();
1322 }
1323}
1324
/// Computes how many leading bytes of hanja-free `text` can be flushed while
/// keeping at most `bound - 1` characters of the last whitespace-delimited
/// word buffered.
///
/// Returns `None` when nothing can be flushed.
fn safe_non_hanja_flush_end(text: &str, bound: usize) -> Option<usize> {
    if text.is_empty() {
        return None;
    }

    let tail_budget = bound.saturating_sub(1);
    // Byte offset just past the last whitespace character (0 if none).
    let word_start = match text.char_indices().rfind(|(_, ch)| ch.is_whitespace()) {
        Some((at, whitespace)) => at + whitespace.len_utf8(),
        None => 0,
    };
    let word = &text[word_start..];
    let word_chars = word.chars().count();

    if word_chars <= tail_budget {
        // The whole trailing word stays buffered; flush what precedes it.
        return if word_start > 0 {
            Some(word_start)
        } else {
            None
        };
    }

    let surplus = word_chars - tail_budget;
    let cut = match word.char_indices().nth(surplus) {
        Some((offset, _)) => word_start + offset,
        None => text.len(),
    };
    if cut > 0 { Some(cut) } else { None }
}
1348
/// Returns the byte offset just past the last whitespace character of
/// `text`, or `None` when `text` contains no whitespace.
fn safe_unknown_bound_flush_end(text: &str) -> Option<usize> {
    let (at, whitespace) = text
        .char_indices()
        .rev()
        .find(|(_, ch)| ch.is_whitespace())?;
    Some(at + whitespace.len_utf8())
}
1354
/// Returns the byte offset at which the last `count` characters of `text`
/// begin. A `count` of zero gives `text.len()`; a `count` larger than the
/// number of characters gives `0`.
fn suffix_start_for_char_count(text: &str, count: usize) -> usize {
    match count.checked_sub(1) {
        None => text.len(),
        Some(skip_from_end) => match text.char_indices().nth_back(skip_from_end) {
            Some((offset, _)) => offset,
            None => 0,
        },
    }
}
1365
1366fn trailing_fallback_run_start(segments: &[Segment], split_byte: usize) -> Option<usize> {
1367 if split_byte == 0 {
1368 return None;
1369 }
1370
1371 for (index, segment) in segments.iter().enumerate() {
1372 let (byte_start, byte_end) = segment_bounds(segment);
1373 if byte_end != split_byte {
1374 continue;
1375 }
1376 if !matches!(segment, Segment::Fallback { .. }) {
1377 return None;
1378 }
1379 if let Some(next) = segments.get(index + 1)
1380 && !matches!(next, Segment::Fallback { .. })
1381 {
1382 return None;
1383 }
1384
1385 let mut run_start = byte_start;
1386 for previous in segments[..index].iter().rev() {
1387 let (previous_start, previous_end) = segment_bounds(previous);
1388 if previous_end != run_start || !matches!(previous, Segment::Fallback { .. }) {
1389 break;
1390 }
1391 run_start = previous_start;
1392 }
1393 return (run_start < split_byte).then_some(run_start);
1394 }
1395
1396 None
1397}
1398
1399fn process_text_with_state<S, D>(
1400 text: &str,
1401 dictionary: &D,
1402 options: EngineOptions,
1403 fallback_state: &mut FallbackState,
1404 output: &mut Vec<OutputToken<S>>,
1405) where
1406 D: HanjaDictionary + ?Sized,
1407{
1408 let segments = segment_text(text, dictionary, options.segmentation);
1409 process_segments_with_state(text, &segments, dictionary, options, fallback_state, output);
1410}
1411
1412fn process_segments_with_state<S, D>(
1413 text: &str,
1414 segments: &[Segment],
1415 _dictionary: &D,
1416 options: EngineOptions,
1417 fallback_state: &mut FallbackState,
1418 output: &mut Vec<OutputToken<S>>,
1419) where
1420 D: HanjaDictionary + ?Sized,
1421{
1422 let mut index = 0;
1423
1424 while index < segments.len() {
1425 match &segments[index] {
1426 Segment::Dictionary {
1427 byte_start,
1428 byte_end,
1429 reading,
1430 mark,
1431 } => {
1432 let source = &text[*byte_start..*byte_end];
1433 output.push(OutputToken::Annotated(Annotation {
1434 hanja: source.to_string(),
1435 homophone: false,
1436 reading: reading.clone(),
1437 require_hanja: mark.require_hanja,
1438 require_hangul: mark.require_hangul,
1439 first_in_context: true,
1440 skip_annotation: false,
1441 from_dictionary: true,
1442 }));
1443 if should_preserve_dictionary_context(source, reading, options) {
1444 update_fallback_state_for_reading(reading, fallback_state);
1445 } else {
1446 *fallback_state = FallbackState::default();
1447 }
1448 index += 1;
1449 }
1450 Segment::Fallback {
1451 byte_start,
1452 byte_end,
1453 } => {
1454 let mut fallback_end = *byte_end;
1455 while let Some(Segment::Fallback { byte_end, .. }) = segments.get(index + 1) {
1456 fallback_end = *byte_end;
1457 index += 1;
1458 }
1459 process_fallback_text(
1460 &text[*byte_start..fallback_end],
1461 options,
1462 fallback_state,
1463 output,
1464 );
1465 index += 1;
1466 }
1467 Segment::Text {
1468 byte_start,
1469 byte_end,
1470 } => {
1471 let text_segment = &text[*byte_start..*byte_end];
1472 push_text(output, text_segment);
1473 update_fallback_state_for_text(text_segment, fallback_state);
1474 index += 1;
1475 }
1476 }
1477 }
1478}
1479
1480fn segment_bounds(segment: &Segment) -> (usize, usize) {
1481 match segment {
1482 Segment::Dictionary {
1483 byte_start,
1484 byte_end,
1485 ..
1486 }
1487 | Segment::Fallback {
1488 byte_start,
1489 byte_end,
1490 }
1491 | Segment::Text {
1492 byte_start,
1493 byte_end,
1494 } => (*byte_start, *byte_end),
1495 }
1496}
1497
1498fn process_fallback_text<S>(
1499 text: &str,
1500 options: EngineOptions,
1501 state: &mut FallbackState,
1502 output: &mut Vec<OutputToken<S>>,
1503) {
1504 for part in phoneticize_fallback_run_with_state(text, options, state) {
1505 match part {
1506 FallbackPart::Annotation { hanja, reading } => {
1507 output.push(OutputToken::Annotated(Annotation {
1508 hanja,
1509 reading,
1510 homophone: false,
1511 require_hanja: false,
1512 require_hangul: false,
1513 first_in_context: true,
1514 skip_annotation: false,
1515 from_dictionary: false,
1516 }));
1517 }
1518 FallbackPart::ReadingText(text) => push_text(output, &text),
1519 FallbackPart::Text(text) => push_text(output, &text),
1520 }
1521 }
1522}
1523
1524fn update_fallback_state_for_text(text: &str, state: &mut FallbackState) {
1525 if text.is_empty() {
1526 return;
1527 }
1528
1529 if text
1530 .chars()
1531 .last()
1532 .is_some_and(|character| character.is_whitespace())
1533 {
1534 *state = FallbackState::default();
1535 return;
1536 }
1537
1538 let Some(last) = text.chars().rev().find(|ch| !ch.is_whitespace()) else {
1539 return;
1540 };
1541
1542 if last.is_alphanumeric() {
1543 state.starts_word = false;
1544 state.previous_reading = Some(last);
1545 } else {
1546 *state = FallbackState::default();
1547 }
1548}
1549
1550fn should_preserve_dictionary_context(source: &str, reading: &str, options: EngineOptions) -> bool {
1551 if reading.chars().all(char::is_whitespace) {
1552 return false;
1553 }
1554
1555 if source.chars().all(is_hanja) {
1556 match fallback_reading_for_run(source, options) {
1557 Some(fallback_reading) => {
1558 fallback_reading == reading || has_one_hangul_syllable_per_hanja(source, reading)
1559 }
1560 None => has_one_hangul_syllable_per_hanja(source, reading),
1561 }
1562 } else {
1563 true
1564 }
1565}
1566
1567fn has_one_hangul_syllable_per_hanja(source: &str, reading: &str) -> bool {
1568 let source_len = source.chars().count();
1569 let mut reading_len = 0;
1570
1571 for ch in reading.chars() {
1572 if !is_hangul_syllable(ch) {
1573 return false;
1574 }
1575 reading_len += 1;
1576 }
1577
1578 reading_len == source_len
1579}
1580
/// Returns `true` for characters in U+AC00..=U+D7A3, the precomposed hangul
/// syllables.
fn is_hangul_syllable(ch: char) -> bool {
    matches!(ch, '\u{ac00}'..='\u{d7a3}')
}
1584
1585fn update_fallback_state_for_reading(reading: &str, state: &mut FallbackState) {
1586 let Some(last) = reading.chars().rev().find(|ch| !ch.is_whitespace()) else {
1587 *state = FallbackState::default();
1588 return;
1589 };
1590
1591 if last.is_alphanumeric() {
1592 state.starts_word = false;
1593 state.previous_reading = Some(last);
1594 } else {
1595 *state = FallbackState::default();
1596 }
1597}
1598
1599fn push_text<S>(output: &mut Vec<OutputToken<S>>, text: &str) {
1600 if text.is_empty() {
1601 return;
1602 }
1603
1604 match output.last_mut() {
1605 Some(OutputToken::Text(existing)) => existing.push_str(text),
1606 _ => output.push(OutputToken::Text(text.to_string())),
1607 }
1608}
1609
/// Returns `true` when `ch` lies in one of the code point ranges this crate
/// treats as hanja.
pub fn is_hanja(ch: char) -> bool {
    // Inclusive `(first, last)` code point ranges, in ascending order.
    const HANJA_RANGES: [(char, char); 15] = [
        ('\u{2F00}', '\u{2FFF}'),   // Kangxi Radicals through the end of the 2Fxx row
        ('\u{3007}', '\u{3007}'),   // ideographic number zero
        ('\u{3400}', '\u{4DBF}'),   // CJK Unified Ideographs Extension A
        ('\u{4E00}', '\u{9FFF}'),   // CJK Unified Ideographs
        ('\u{F900}', '\u{FAFF}'),   // CJK Compatibility Ideographs
        ('\u{20000}', '\u{2A6DF}'), // Extension B
        ('\u{2A700}', '\u{2B73F}'), // Extension C
        ('\u{2B740}', '\u{2B81F}'), // Extension D
        ('\u{2B820}', '\u{2CEAF}'), // Extension E
        ('\u{2CEB0}', '\u{2EBEF}'), // Extension F
        ('\u{2EBF0}', '\u{2EE5F}'), // Extension I
        ('\u{2F800}', '\u{2FA1F}'), // CJK Compatibility Ideographs Supplement
        ('\u{30000}', '\u{3134F}'), // Extension G
        ('\u{31350}', '\u{323AF}'), // Extension H
        ('\u{323B0}', '\u{3347F}'), // Extension J
    ];

    HANJA_RANGES
        .iter()
        .any(|&(first, last)| first <= ch && ch <= last)
}
1631
/// Selects how each annotated hanja/reading pair is turned into rendered output.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum RenderMode {
    /// Emit the hangul reading alone; the hanja is appended in parentheses
    /// only when the annotation requires hanja or is flagged as a homophone.
    HangulOnly,

    /// Emit `reading(hanja)` for every annotation.
    HangulHanjaParens,

    /// Emit `hanja(reading)` for every annotation.
    HanjaHangulParens,

    /// Emit a ruby token whose base text is chosen by the [`RubyBase`].
    ///
    /// Inside scopes that disallow inline markup the renderer falls back to
    /// parenthesized plain text.
    Ruby(RubyBase),

    /// Emit the original hanja; the reading is attached only when the
    /// annotation requires hangul, in the form selected by [`OriginalGloss`].
    Original,
}
1657
/// Chooses which form becomes the ruby base text; the other form becomes the
/// ruby annotation text.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum RubyBase {
    /// The hangul reading is the base and the hanja is the ruby text.
    OnHangul,

    /// The hanja is the base and the hangul reading is the ruby text.
    OnHanja,
}
1667
/// How [`RenderMode::Original`] attaches the reading to annotations that
/// require hangul.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum OriginalGloss {
    /// Emit `hanja(reading)` as plain text. This is the default.
    #[default]
    Parens,

    /// Emit a ruby token with the hanja as base and the reading as ruby text,
    /// falling back to parenthesized text where inline markup is disallowed.
    Ruby,
}
1686
/// Options controlling how annotations are rendered.
///
/// A bare [`RenderMode`] converts into this type with the default
/// [`OriginalGloss`].
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct RenderOptions {
    /// The overall rendering mode.
    pub mode: RenderMode,

    /// Gloss style used by [`RenderMode::Original`] for annotations that
    /// require hangul.
    pub original_gloss: OriginalGloss,
}
1701
1702impl Default for RenderOptions {
1703 fn default() -> Self {
1704 Self {
1705 mode: RenderMode::HangulOnly,
1706 original_gloss: OriginalGloss::Parens,
1707 }
1708 }
1709}
1710
1711impl From<RenderMode> for RenderOptions {
1712 fn from(mode: RenderMode) -> Self {
1713 Self {
1714 mode,
1715 original_gloss: OriginalGloss::default(),
1716 }
1717 }
1718}
1719
/// The span of tokens that context-sensitive passes (homophone marking,
/// first-occurrence filtering) look at together.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ContextWindow {
    /// No context: tokens pass through the context passes unchanged.
    Off,

    /// A new context starts at every scope whose data reports
    /// `is_block_boundary()` and ends when that scope closes.
    PerBlock,

    /// A new context starts at every scope whose data reports
    /// `is_section_boundary()`.
    PerSection,

    /// The whole token stream forms a single context.
    PerDocument,
}
1743
/// The flag a user directive sets on every annotation it matches.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum DirectiveAction {
    /// Set the annotation's `require_hanja` flag.
    RequireHanja,

    /// Set the annotation's `require_hangul` flag.
    RequireHangul,

    /// Set the annotation's `skip_annotation` flag.
    SkipAnnotation,
}
1756
/// An ordered collection of user rules that set flags on matching annotations.
///
/// The lifetime `'a` bounds any predicate closures stored in the collection.
#[derive(Default)]
pub struct UserDirectives<'a> {
    // Rules in insertion order; `apply` evaluates them in this order.
    rules: Vec<UserDirectiveRule<'a>>,
}
1766
1767impl<'a> UserDirectives<'a> {
1768 pub fn new() -> Self {
1770 Self::default()
1771 }
1772
1773 pub fn require_hanja(&mut self, hanja: impl Into<String>) {
1775 self.add_literal(hanja, DirectiveAction::RequireHanja);
1776 }
1777
1778 pub fn require_hangul(&mut self, hanja: impl Into<String>) {
1780 self.add_literal(hanja, DirectiveAction::RequireHangul);
1781 }
1782
1783 pub fn skip_annotation(&mut self, hanja: impl Into<String>) {
1785 self.add_literal(hanja, DirectiveAction::SkipAnnotation);
1786 }
1787
1788 pub fn add_literal(&mut self, hanja: impl Into<String>, action: DirectiveAction) {
1790 self.rules.push(UserDirectiveRule {
1791 predicate: UserDirectivePredicate::Literal(hanja.into()),
1792 action,
1793 });
1794 }
1795
1796 pub fn add_predicate(
1798 &mut self,
1799 predicate: impl Fn(&Annotation) -> bool + 'a,
1800 action: DirectiveAction,
1801 ) {
1802 self.rules.push(UserDirectiveRule {
1803 predicate: UserDirectivePredicate::Predicate(Box::new(predicate)),
1804 action,
1805 });
1806 }
1807
1808 pub fn is_empty(&self) -> bool {
1810 self.rules.is_empty()
1811 }
1812
1813 pub fn apply<S>(&self, token: OutputToken<S>) -> OutputToken<S> {
1820 match token {
1821 OutputToken::Annotated(mut annotation) => {
1822 for rule in &self.rules {
1823 if !rule.predicate.matches(&annotation) {
1824 continue;
1825 }
1826 match rule.action {
1827 DirectiveAction::RequireHanja => annotation.require_hanja = true,
1828 DirectiveAction::RequireHangul => annotation.require_hangul = true,
1829 DirectiveAction::SkipAnnotation => annotation.skip_annotation = true,
1830 }
1831 }
1832 OutputToken::Annotated(annotation)
1833 }
1834 token => token,
1835 }
1836 }
1837}
1838
/// A single user rule: the condition an annotation must satisfy and the flag
/// to set when it does.
struct UserDirectiveRule<'a> {
    // Decides whether the rule applies to a given annotation.
    predicate: UserDirectivePredicate<'a>,
    // Flag to set on annotations the predicate accepts.
    action: DirectiveAction,
}
1843
/// The matching condition of a user rule.
enum UserDirectivePredicate<'a> {
    /// Matches annotations whose hanja equals the stored string exactly.
    Literal(String),
    /// Matches annotations for which the stored closure returns `true`.
    Predicate(Box<dyn Fn(&Annotation) -> bool + 'a>),
}
1848
1849impl UserDirectivePredicate<'_> {
1850 fn matches(&self, annotation: &Annotation) -> bool {
1851 match self {
1852 Self::Literal(hanja) => annotation.hanja == *hanja,
1853 Self::Predicate(predicate) => predicate(annotation),
1854 }
1855 }
1856}
1857
1858pub fn mark_homophones<S, D>(
1866 tokens: impl IntoIterator<Item = OutputToken<S>>,
1867 dictionary: &D,
1868 window: ContextWindow,
1869) -> Vec<OutputToken<S>>
1870where
1871 S: ScopeData,
1872 D: HanjaDictionary + ?Sized,
1873{
1874 if window == ContextWindow::Off {
1875 return tokens.into_iter().collect();
1876 }
1877
1878 let index = HomophoneIndex::from_dictionary(dictionary);
1879 let lookup_fallback = index.is_none().then_some(dictionary);
1880 ContextMiddleware::new(window, |tokens| {
1881 mark_homophones_in_context(tokens, index.as_ref(), lookup_fallback);
1882 })
1883 .process(tokens)
1884}
1885
1886pub fn filter_first_occurrences<S>(
1892 tokens: impl IntoIterator<Item = OutputToken<S>>,
1893 window: ContextWindow,
1894) -> Vec<OutputToken<S>>
1895where
1896 S: ScopeData,
1897{
1898 ContextMiddleware::new(window, filter_first_occurrences_in_context).process(tokens)
1899}
1900
/// Plain function pointer used as the per-context callback of
/// [`FirstOccurrenceFilter`].
type ContextApply<S> = fn(&mut [OutputToken<S>]);
/// Boxed per-context callback of [`HomophoneMarker`]; it may borrow data that
/// lives for `'a`.
type HomophoneApply<'a, S> = Box<dyn FnMut(&mut [OutputToken<S>]) + 'a>;
1903
/// Incremental counterpart of [`mark_homophones`].
///
/// Tokens are fed one at a time with `push_token`; each call returns the
/// tokens whose context window is complete, and `finish` releases whatever is
/// still buffered. The marker borrows its dictionary for `'a`.
pub struct HomophoneMarker<'a, S>
where
    S: ScopeData,
{
    // Buffers tokens per context window and runs the homophone pass on flush.
    inner: ContextMiddleware<S, HomophoneApply<'a, S>>,
}
1917
1918impl<'a, S> HomophoneMarker<'a, S>
1919where
1920 S: ScopeData,
1921{
1922 pub fn new<D>(dictionary: &'a D, window: ContextWindow) -> Self
1924 where
1925 D: HanjaDictionary + ?Sized,
1926 {
1927 let index = if window == ContextWindow::Off {
1928 None
1929 } else {
1930 HomophoneIndex::from_dictionary(dictionary)
1931 };
1932 let lookup_fallback = index.is_none().then_some(dictionary);
1933 Self {
1934 inner: ContextMiddleware::new(
1935 window,
1936 Box::new(move |tokens| {
1937 mark_homophones_in_context(tokens, index.as_ref(), lookup_fallback);
1938 }),
1939 ),
1940 }
1941 }
1942
1943 pub fn push_token(&mut self, token: OutputToken<S>) -> Vec<OutputToken<S>> {
1945 self.inner.push_token(token)
1946 }
1947
1948 pub fn finish(self) -> Vec<OutputToken<S>> {
1950 self.inner.finish()
1951 }
1952}
1953
/// Incremental counterpart of [`filter_first_occurrences`].
///
/// Tokens are fed one at a time with `push_token`; each call returns the
/// tokens whose context window is complete, and `finish` releases whatever is
/// still buffered.
pub struct FirstOccurrenceFilter<S>
where
    S: ScopeData,
{
    // Buffers tokens per context window and runs the first-occurrence pass on flush.
    inner: ContextMiddleware<S, ContextApply<S>>,
}
1964
1965impl<S> FirstOccurrenceFilter<S>
1966where
1967 S: ScopeData,
1968{
1969 pub fn new(window: ContextWindow) -> Self {
1971 Self {
1972 inner: ContextMiddleware::new(window, filter_first_occurrences_in_context::<S>),
1973 }
1974 }
1975
1976 pub fn push_token(&mut self, token: OutputToken<S>) -> Vec<OutputToken<S>> {
1978 self.inner.push_token(token)
1979 }
1980
1981 pub fn finish(self) -> Vec<OutputToken<S>> {
1983 self.inner.finish()
1984 }
1985}
1986
1987pub fn apply_user_directives<S>(
1991 tokens: impl IntoIterator<Item = OutputToken<S>>,
1992 directives: &UserDirectives<'_>,
1993) -> Vec<OutputToken<S>> {
1994 apply_user_directives_iter(tokens, directives).collect()
1995}
1996
1997pub fn apply_user_directives_iter<'a, S>(
2003 tokens: impl IntoIterator<Item = OutputToken<S>> + 'a,
2004 directives: &'a UserDirectives<'_>,
2005) -> impl Iterator<Item = OutputToken<S>> + 'a {
2006 tokens.into_iter().map(|token| directives.apply(token))
2007}
2008
/// Buffers tokens into context windows and runs a callback over each complete
/// window before releasing its tokens.
struct ContextMiddleware<S, F>
where
    S: ScopeData,
    F: FnMut(&mut [OutputToken<S>]),
{
    // Which scopes delimit a context window.
    window: ContextWindow,
    // Callback run over the buffered window right before it is released.
    apply: F,
    // Tokens buffered for the window currently being collected.
    context: Vec<OutputToken<S>>,
    // One entry per open scope (block/section windows only): whether that
    // scope opened a window boundary.
    scope_boundaries: Vec<bool>,
}
2019
2020impl<S, F> ContextMiddleware<S, F>
2021where
2022 S: ScopeData,
2023 F: FnMut(&mut [OutputToken<S>]),
2024{
2025 fn new(window: ContextWindow, apply: F) -> Self {
2026 Self {
2027 window,
2028 apply,
2029 context: Vec::new(),
2030 scope_boundaries: Vec::new(),
2031 }
2032 }
2033
2034 fn process(mut self, tokens: impl IntoIterator<Item = OutputToken<S>>) -> Vec<OutputToken<S>> {
2035 let mut output = Vec::new();
2036 for token in tokens {
2037 output.extend(self.push_token(token));
2038 }
2039 output.extend(self.finish());
2040 output
2041 }
2042
2043 fn push_token(&mut self, token: OutputToken<S>) -> Vec<OutputToken<S>> {
2044 let mut output = Vec::new();
2045 match self.window {
2046 ContextWindow::Off => output.push(token),
2047 ContextWindow::PerDocument => self.context.push(token),
2048 ContextWindow::PerBlock | ContextWindow::PerSection => match &token {
2049 OutputToken::Open(scope) => {
2050 let is_boundary = match self.window {
2051 ContextWindow::PerBlock => scope.data().is_block_boundary(),
2052 ContextWindow::PerSection => scope.data().is_section_boundary(),
2053 ContextWindow::Off | ContextWindow::PerDocument => false,
2054 };
2055 if is_boundary {
2056 self.flush_context(&mut output);
2057 }
2058 self.scope_boundaries.push(is_boundary);
2059 self.context.push(token);
2060 }
2061 OutputToken::Close => {
2062 let closes_boundary = self.scope_boundaries.pop().unwrap_or(false);
2063 self.context.push(token);
2064 if closes_boundary && self.window == ContextWindow::PerBlock {
2065 self.flush_context(&mut output);
2066 }
2067 }
2068 _ => self.context.push(token),
2069 },
2070 }
2071 output
2072 }
2073
2074 fn finish(mut self) -> Vec<OutputToken<S>> {
2075 let mut output = Vec::new();
2076 self.flush_context(&mut output);
2077 output
2078 }
2079
2080 fn flush_context(&mut self, output: &mut Vec<OutputToken<S>>) {
2081 if self.context.is_empty() {
2082 return;
2083 }
2084
2085 (self.apply)(&mut self.context);
2086 output.append(&mut self.context);
2087 }
2088}
2089
/// Lookup table from a reading to every hanja form that shares it.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
struct HomophoneIndex {
    // Key: reading. Value: the distinct hanja forms recorded for that reading.
    forms_by_reading: BTreeMap<String, BTreeSet<String>>,
}
2094
2095impl HomophoneIndex {
2096 fn from_dictionary<D>(dictionary: &D) -> Option<Self>
2097 where
2098 D: HanjaDictionary + ?Sized,
2099 {
2100 let mut forms_by_reading = BTreeMap::<String, BTreeSet<String>>::new();
2101 for record in dictionary.entries()? {
2102 forms_by_reading
2103 .entry(record.reading)
2104 .or_default()
2105 .insert(record.hanja);
2106 }
2107 Some(Self { forms_by_reading })
2108 }
2109
2110 fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
2111 self.forms_by_reading
2112 .get(reading)
2113 .is_some_and(|forms| forms.iter().any(|form| form != hanja))
2114 }
2115}
2116
2117fn mark_homophones_in_context<S, D>(
2118 tokens: &mut [OutputToken<S>],
2119 index: Option<&HomophoneIndex>,
2120 lookup_fallback: Option<&D>,
2121) where
2122 D: HanjaDictionary + ?Sized,
2123{
2124 let mut forms_by_reading = BTreeMap::<String, BTreeSet<String>>::new();
2125
2126 for token in tokens.iter() {
2127 if let OutputToken::Annotated(annotation) = token
2128 && annotation.from_dictionary
2129 {
2130 forms_by_reading
2131 .entry(annotation.reading.clone())
2132 .or_default()
2133 .insert(annotation.hanja.clone());
2134 }
2135 }
2136
2137 for token in tokens.iter_mut() {
2138 if let OutputToken::Annotated(annotation) = token {
2139 annotation.homophone = annotation.from_dictionary
2140 && (index.is_some_and(|index| {
2141 index.has_homophone(&annotation.hanja, &annotation.reading)
2142 }) || lookup_fallback.is_some_and(|dictionary| {
2143 dictionary.has_homophone(&annotation.hanja, &annotation.reading)
2144 }) || forms_by_reading
2145 .get(&annotation.reading)
2146 .is_some_and(|forms| forms.len() > 1));
2147 }
2148 }
2149}
2150
2151fn filter_first_occurrences_in_context<S>(tokens: &mut [OutputToken<S>]) {
2152 let mut seen = BTreeSet::new();
2153
2154 for token in tokens.iter_mut() {
2155 if let OutputToken::Annotated(annotation) = token {
2156 if seen.insert(annotation.hanja.clone()) {
2157 annotation.first_in_context = true;
2158 } else {
2159 annotation.first_in_context = false;
2160 annotation.require_hanja = false;
2161 annotation.require_hangul = false;
2162 }
2163 }
2164 }
2165}
2166
2167pub fn render_tokens<S, O>(
2175 tokens: impl IntoIterator<Item = OutputToken<S>>,
2176 options: O,
2177) -> Vec<RenderedToken<S>>
2178where
2179 S: ScopeData,
2180 O: Into<RenderOptions>,
2181{
2182 render_tokens_iter(tokens, options).collect()
2183}
2184
2185pub fn render_tokens_iter<S, O>(
2192 tokens: impl IntoIterator<Item = OutputToken<S>>,
2193 options: O,
2194) -> impl Iterator<Item = RenderedToken<S>>
2195where
2196 S: ScopeData,
2197 O: Into<RenderOptions>,
2198{
2199 RendererIter {
2200 upstream: tokens.into_iter(),
2201 renderer: Renderer::new(options),
2202 }
2203}
2204
/// Stateful token renderer.
///
/// Tokens are rendered one at a time with `push_token`. The renderer tracks
/// the open scopes so that ruby output is only produced where every enclosing
/// scope allows inline markup.
pub struct Renderer<S>
where
    S: ScopeData,
{
    // Rendering options fixed at construction.
    options: RenderOptions,
    // One entry per currently open scope: whether that scope allows inline
    // markup.
    markup_stack: Vec<bool>,
    // Number of currently open scopes that disallow inline markup; inline
    // markup is permitted only while this is zero.
    disallowing_ancestors: usize,
    // Ties the renderer to the scope type `S` without storing a value of it.
    _scope: PhantomData<fn(S)>,
}
2228
2229impl<S> Renderer<S>
2230where
2231 S: ScopeData,
2232{
2233 pub fn new<O>(options: O) -> Self
2235 where
2236 O: Into<RenderOptions>,
2237 {
2238 Self {
2239 options: options.into(),
2240 markup_stack: Vec::new(),
2241 disallowing_ancestors: 0,
2242 _scope: PhantomData,
2243 }
2244 }
2245
2246 pub fn push_token(&mut self, token: OutputToken<S>) -> RenderedToken<S> {
2248 match token {
2249 OutputToken::Open(scope) => {
2250 let allows = scope.data().allows_inline_markup();
2251 if !allows {
2252 self.disallowing_ancestors += 1;
2253 }
2254 self.markup_stack.push(allows);
2255 RenderedToken::Open(scope)
2256 }
2257 OutputToken::Close => {
2258 if let Some(false) = self.markup_stack.pop() {
2259 self.disallowing_ancestors = self.disallowing_ancestors.saturating_sub(1);
2263 }
2264 RenderedToken::Close
2265 }
2266 OutputToken::Text(text) => RenderedToken::Text(text),
2267 OutputToken::Verbatim(text) => RenderedToken::Verbatim(text),
2268 OutputToken::Annotated(annotation) => {
2269 let allows_inline_markup = self.disallowing_ancestors == 0;
2276 render_annotation(&annotation, &self.options, allows_inline_markup)
2277 }
2278 }
2279 }
2280}
2281
/// Iterator adaptor that renders each upstream token with a [`Renderer`].
struct RendererIter<I, S>
where
    S: ScopeData,
{
    // Source of tokens still to be rendered.
    upstream: I,
    // Renderer carrying the scope state across tokens.
    renderer: Renderer<S>,
}
2289
2290impl<I, S> Iterator for RendererIter<I, S>
2291where
2292 I: Iterator<Item = OutputToken<S>>,
2293 S: ScopeData,
2294{
2295 type Item = RenderedToken<S>;
2296
2297 fn next(&mut self) -> Option<Self::Item> {
2298 let token = self.upstream.next()?;
2299 Some(self.renderer.push_token(token))
2300 }
2301}
2302
2303fn render_annotation<S>(
2304 annotation: &Annotation,
2305 options: &RenderOptions,
2306 allows_inline_markup: bool,
2307) -> RenderedToken<S> {
2308 if annotation.skip_annotation {
2309 let primary = match options.mode {
2310 RenderMode::HangulOnly | RenderMode::HangulHanjaParens => annotation.reading.clone(),
2311 RenderMode::HanjaHangulParens | RenderMode::Original => annotation.hanja.clone(),
2312 RenderMode::Ruby(RubyBase::OnHangul) => annotation.reading.clone(),
2313 RenderMode::Ruby(RubyBase::OnHanja) => annotation.hanja.clone(),
2314 };
2315 return RenderedToken::Text(primary);
2316 }
2317
2318 match options.mode {
2319 RenderMode::HangulOnly if annotation.require_hanja || annotation.homophone => {
2320 RenderedToken::Text(parens(&annotation.reading, &annotation.hanja))
2321 }
2322 RenderMode::HangulOnly => RenderedToken::Text(annotation.reading.clone()),
2323 RenderMode::HangulHanjaParens => {
2324 RenderedToken::Text(parens(&annotation.reading, &annotation.hanja))
2325 }
2326 RenderMode::HanjaHangulParens => {
2327 RenderedToken::Text(parens(&annotation.hanja, &annotation.reading))
2328 }
2329 RenderMode::Ruby(base) => render_ruby(annotation, base, allows_inline_markup),
2330 RenderMode::Original if annotation.require_hangul => match options.original_gloss {
2331 OriginalGloss::Parens => {
2332 RenderedToken::Text(parens(&annotation.hanja, &annotation.reading))
2333 }
2334 OriginalGloss::Ruby => render_ruby(annotation, RubyBase::OnHanja, allows_inline_markup),
2337 },
2338 RenderMode::Original => RenderedToken::Text(annotation.hanja.clone()),
2339 }
2340}
2341
2342fn render_ruby<S>(
2343 annotation: &Annotation,
2344 base: RubyBase,
2345 allows_inline_markup: bool,
2346) -> RenderedToken<S> {
2347 let (base_text, rt_text) = match base {
2348 RubyBase::OnHangul => (&annotation.reading, &annotation.hanja),
2349 RubyBase::OnHanja => (&annotation.hanja, &annotation.reading),
2350 };
2351 if !allows_inline_markup {
2352 return RenderedToken::Text(parens(base_text, rt_text));
2353 }
2354 RenderedToken::Ruby {
2355 base: base_text.clone(),
2356 rt: rt_text.clone(),
2357 }
2358}
2359
/// Formats `primary(gloss)`: the first argument followed by the second in
/// ASCII parentheses.
///
/// Callers pass the pair in either order (reading first or hanja first).
fn parens(primary: &str, gloss: &str) -> String {
    let mut text = String::from(primary);
    text.push('(');
    text.push_str(gloss);
    text.push(')');
    text
}
2368
2369pub fn convert_plain_text<D, R>(input: &str, dictionary: &D, render: R) -> String
2377where
2378 D: HanjaDictionary + ?Sized,
2379 R: Into<RenderOptions>,
2380{
2381 convert_plain_text_with_options(input, dictionary, render, EngineOptions::default())
2382}
2383
2384pub fn convert_plain_text_with_options<D, R>(
2388 input: &str,
2389 dictionary: &D,
2390 render: R,
2391 options: EngineOptions,
2392) -> String
2393where
2394 D: HanjaDictionary + ?Sized,
2395 R: Into<RenderOptions>,
2396{
2397 let input_tokens = read_plain_text(input);
2398 let output_tokens = process_tokens_with_options(input_tokens, dictionary, options);
2399 let output_tokens = mark_homophones(output_tokens, dictionary, ContextWindow::PerBlock);
2400 let rendered_tokens = render_tokens(output_tokens, render);
2401 write_plain_text(rendered_tokens)
2402}