1#![no_std]
25#![forbid(unsafe_code)]
26#![deny(missing_docs)]
27
28extern crate alloc;
29
30mod fallback;
31mod generated;
32mod segment;
33
34use alloc::boxed::Box;
35use alloc::collections::{BTreeMap, BTreeSet};
36use alloc::string::{String, ToString};
37use alloc::vec::Vec;
38use core::marker::PhantomData;
39
40use fallback::{
41 FallbackPart, FallbackState, apply_initial_sound_law_to_first_syllable,
42 fallback_reading_for_run, phoneticize_fallback_run_with_state, phoneticize_hanja_char,
43 should_apply_yeol_yul,
44};
45use generated::unihan_readings::KHANGUL_READINGS;
46use segment::{Segment, segment_text};
47
/// Errors reported by this crate.
///
/// The enum is `#[non_exhaustive]`, so downstream `match`es need a wildcard arm.
#[derive(Debug, thiserror::Error)]
#[non_exhaustive]
pub enum Error {
    /// A dictionary could not be loaded; the payload is the failure message.
    #[error("dictionary load failed: {0}")]
    DictionaryLoad(String),

    /// Segmentation failed for a piece of hanja text.
    #[error("segmentation failed for {hanja:?}: {reason}")]
    Segmentation {
        /// The hanja text that was being segmented.
        hanja: String,

        /// Description of why segmentation failed.
        reason: String,
    },

    /// A hangul reading was rejected for the given hanja.
    #[error("invalid hangul reading {reading:?} for hanja {hanja:?}")]
    InvalidReading {
        /// The hanja the reading was supplied for.
        hanja: String,

        /// The rejected reading.
        reading: String,
    },

    /// An internal invariant was violated; the payload names the invariant.
    #[error("internal invariant violated: {0}")]
    Internal(&'static str),

    /// Any other boxed error, displayed transparently and convertible via `From`.
    #[error(transparent)]
    Other(#[from] Box<dyn core::error::Error + Send + Sync + 'static>),
}
90
/// Policy applied by [`recover_input_token`] when an input reader reports a
/// [`RecoverableInputError`].
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum Recovery {
    /// Return the reader error to the caller. This is the default.
    #[default]
    Strict,

    /// Log a warning and pass the original text through as a verbatim token.
    Lenient,
}
105
/// A reader failure that still carries the original input text, so that
/// lenient recovery can pass that text through unchanged.
#[derive(Debug)]
pub struct RecoverableInputError {
    // Original input text that could not be read into a token.
    original: String,
    // The failure reported for `original`.
    error: Error,
}
116
117impl RecoverableInputError {
118 pub fn new(original: String, error: Error) -> Self {
120 Self { original, error }
121 }
122
123 pub fn original(&self) -> &str {
126 &self.original
127 }
128
129 pub fn error(&self) -> &Error {
131 &self.error
132 }
133
134 pub fn into_parts(self) -> (String, Error) {
136 (self.original, self.error)
137 }
138}
139
/// Format-specific data attached to a [`Scope`].
///
/// The engine only inspects scopes through the predicates below.
pub trait ScopeData: Clone + 'static {
    /// Returns `true` when text directly inside this scope must be passed
    /// through untouched; `Engine::push_token` emits such text without
    /// segmenting it.
    fn is_preserve(&self) -> bool;

    /// Returns whether inline markup may be emitted inside this scope.
    /// Defaults to `true`.
    ///
    /// NOTE(review): no consumer of this flag is visible in this part of the
    /// file; confirm the exact effect against the renderer.
    fn allows_inline_markup(&self) -> bool {
        true
    }

    /// Returns `true` when this scope is a block boundary. The engine resets
    /// its fallback context when such a scope is opened and when it is closed.
    /// Defaults to `false`.
    fn is_block_boundary(&self) -> bool {
        false
    }

    /// Returns `true` when this scope is a section boundary. Defaults to
    /// `false`.
    ///
    /// NOTE(review): no consumer of this flag is visible in this part of the
    /// file; confirm where it is used.
    fn is_section_boundary(&self) -> bool {
        false
    }
}
178
/// A structural scope carried by `Open` tokens, wrapping format-specific data.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Scope<S> {
    // Format-specific payload describing the scope.
    data: S,
}
188
189impl<S> Scope<S> {
190 pub fn new(data: S) -> Self {
192 Self { data }
193 }
194
195 pub fn data(&self) -> &S {
197 &self.data
198 }
199
200 pub fn into_data(self) -> S {
202 self.data
203 }
204}
205
/// A token fed into the engine by an input reader.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum InputToken<S> {
    /// Opens a scope; it stays open until the matching [`InputToken::Close`].
    Open(Scope<S>),

    /// Closes the most recently opened scope.
    Close,

    /// Text to process. Inside a preserve scope it is passed through as-is.
    Text(String),

    /// Text that is never processed; it is emitted unchanged.
    Verbatim(String),
}
225
/// A token produced by the engine.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum OutputToken<S> {
    /// A scope was opened.
    Open(Scope<S>),

    /// The most recently opened scope was closed.
    Close,

    /// Plain text.
    Text(String),

    /// Verbatim text forwarded unchanged from the input.
    Verbatim(String),

    /// A hanja span together with its reading and annotation flags.
    Annotated(Annotation),
}
248
/// A token ready to be written out by an output writer.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum RenderedToken<S> {
    /// A scope was opened.
    Open(Scope<S>),

    /// The most recently opened scope was closed.
    Close,

    /// Plain text.
    Text(String),

    /// Verbatim text.
    Verbatim(String),

    /// A base text with an accompanying annotation text.
    ///
    /// `write_plain_text` renders this by passing both fields to `parens`.
    Ruby {
        /// The base text.
        base: String,

        /// The annotation text shown for `base`.
        rt: String,
    },
}
288
/// A hanja span and the hangul reading chosen for it, plus rendering flags.
///
/// NOTE(review): most flags are consumed outside the part of the file visible
/// here; the field notes describe only what the producing code sets.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Annotation {
    /// The source hanja text.
    pub hanja: String,

    /// The hangul reading selected for `hanja`.
    pub reading: String,

    /// Homophone flag. The segment processing in this file always emits
    /// `false`; presumably a later pass fills it in (confirm).
    pub homophone: bool,

    /// Copied from [`MatchMark::require_hanja`] for dictionary matches;
    /// `false` for fallback annotations.
    pub require_hanja: bool,

    /// Copied from [`MatchMark::require_hangul`] for dictionary matches;
    /// `false` for fallback annotations.
    pub require_hangul: bool,

    /// First-occurrence flag. The segment processing in this file always
    /// emits `true`; presumably a later pass refines it (confirm).
    pub first_in_context: bool,

    /// Skip flag. The segment processing in this file always emits `false`.
    pub skip_annotation: bool,

    /// `true` when the reading came from a dictionary match, `false` when it
    /// came from the per-character fallback.
    pub from_dictionary: bool,
}
322
/// Per-entry marks carried from a dictionary match into the resulting
/// [`Annotation`]. Both marks default to `false`.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct MatchMark {
    /// Copied into [`Annotation::require_hanja`].
    pub require_hanja: bool,

    /// Copied into [`Annotation::require_hangul`].
    pub require_hangul: bool,
}
332
/// One dictionary entry as exposed by [`HanjaDictionary::entries`].
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct DictionaryRecord {
    /// The hanja headword.
    pub hanja: String,

    /// The reading stored for `hanja`.
    pub reading: String,

    /// Marks attached to the entry.
    pub mark: MatchMark,
}
351
/// A dictionary hit returned by [`HanjaDictionary::matches_at`].
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Match {
    /// Length in bytes of the matched prefix of the queried string.
    pub byte_len: usize,

    /// The reading stored for the matched word.
    pub reading: String,

    /// Optional alternative reading. When present, it is used instead of
    /// `reading` unless the match starts a word and the initial sound law is
    /// enabled.
    pub suffix_reading: Option<String>,

    /// Marks attached to the matched entry.
    pub mark: MatchMark,
}
379
380pub trait HanjaDictionary {
386 fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a>;
388
389 fn max_word_chars(&self) -> Option<usize> {
391 None
392 }
393
394 fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
400 None
401 }
402
403 fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
405 self.entries().is_some_and(|mut entries| {
406 entries.any(|record| record.hanja != hanja && record.reading == reading)
407 })
408 }
409}
410
411impl<D> HanjaDictionary for &D
412where
413 D: HanjaDictionary + ?Sized,
414{
415 fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
416 (**self).matches_at(s)
417 }
418
419 fn max_word_chars(&self) -> Option<usize> {
420 (**self).max_word_chars()
421 }
422
423 fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
424 (**self).entries()
425 }
426
427 fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
428 (**self).has_homophone(hanja, reading)
429 }
430}
431
432impl<D> HanjaDictionary for Box<D>
433where
434 D: HanjaDictionary + ?Sized,
435{
436 fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
437 (**self).matches_at(s)
438 }
439
440 fn max_word_chars(&self) -> Option<usize> {
441 (**self).max_word_chars()
442 }
443
444 fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
445 (**self).entries()
446 }
447
448 fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
449 (**self).has_homophone(hanja, reading)
450 }
451}
452
/// Single-character dictionary backed by the generated `KHANGUL_READINGS`
/// table.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct UnihanCharDict;
462
463impl HanjaDictionary for UnihanCharDict {
464 fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
465 let matched = s.chars().next().and_then(|ch| {
466 khangul_reading(ch).map(|reading| Match {
467 byte_len: ch.len_utf8(),
468 reading: reading.to_string(),
469 suffix_reading: None,
470 mark: MatchMark::default(),
471 })
472 });
473 Box::new(matched.into_iter())
474 }
475
476 fn max_word_chars(&self) -> Option<usize> {
477 Some(1)
478 }
479
480 fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
481 Some(Box::new(KHANGUL_READINGS.iter().map(|(hanja, reading)| {
482 DictionaryRecord {
483 hanja: hanja.to_string(),
484 reading: reading.to_string(),
485 mark: MatchMark::default(),
486 }
487 })))
488 }
489
490 fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
491 let mut chars = hanja.chars();
492 let Some(hanja) = chars.next() else {
493 return false;
494 };
495 if chars.next().is_some() {
496 return false;
497 }
498 KHANGUL_READINGS
499 .iter()
500 .any(|&(other_hanja, other_reading)| other_hanja != hanja && other_reading == reading)
501 }
502}
503
/// An ordered list of dictionaries that is consulted as one dictionary.
///
/// The `HanjaDictionary` implementation for this type consults the
/// dictionaries in insertion order.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ChainDictionary<D> {
    // Dictionaries in priority order (first pushed = first consulted).
    dictionaries: Vec<D>,
}

// Written by hand instead of derived: the derive would add a `D: Default`
// bound, although an empty chain never needs to construct a `D`.
impl<D> Default for ChainDictionary<D> {
    fn default() -> Self {
        Self::new()
    }
}

impl<D> ChainDictionary<D> {
    /// Creates an empty chain.
    pub fn new() -> Self {
        Self {
            dictionaries: Vec::new(),
        }
    }

    /// Appends `dictionary` to the end of the chain.
    pub fn push(&mut self, dictionary: D) {
        self.dictionaries.push(dictionary);
    }

    /// Returns the number of dictionaries in the chain.
    pub fn len(&self) -> usize {
        self.dictionaries.len()
    }

    /// Returns `true` when the chain holds no dictionaries.
    pub fn is_empty(&self) -> bool {
        self.dictionaries.is_empty()
    }

    /// Borrows the dictionaries in chain order.
    pub fn dictionaries(&self) -> &[D] {
        &self.dictionaries
    }

    /// Consumes the chain and returns its dictionaries in chain order.
    pub fn into_dictionaries(self) -> Vec<D> {
        self.dictionaries
    }
}
549
550impl<D> FromIterator<D> for ChainDictionary<D> {
551 fn from_iter<T: IntoIterator<Item = D>>(iter: T) -> Self {
552 Self {
553 dictionaries: Vec::from_iter(iter),
554 }
555 }
556}
557
558impl<D> HanjaDictionary for ChainDictionary<D>
559where
560 D: HanjaDictionary,
561{
562 fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
563 let mut seen_lengths = BTreeSet::new();
564 let mut matches = Vec::new();
565
566 for dictionary in &self.dictionaries {
567 for matched in dictionary.matches_at(s) {
568 if seen_lengths.insert(matched.byte_len) {
569 matches.push(matched);
570 }
571 }
572 }
573
574 matches.sort_by_key(|matched| matched.byte_len);
575 Box::new(matches.into_iter())
576 }
577
578 fn max_word_chars(&self) -> Option<usize> {
579 let mut max = None;
580 for dictionary in &self.dictionaries {
581 let word_chars = dictionary.max_word_chars()?;
582 max = Some(max.map_or(word_chars, |current: usize| current.max(word_chars)));
583 }
584 max
585 }
586
587 fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
588 let mut records = BTreeMap::<String, DictionaryRecord>::new();
589
590 for dictionary in &self.dictionaries {
591 for record in dictionary.entries()? {
592 records.entry(record.hanja.clone()).or_insert(record);
593 }
594 }
595
596 Some(Box::new(records.into_values()))
597 }
598
599 fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
600 if let Some(mut records) = self.entries() {
601 return records.any(|record| record.hanja != hanja && record.reading == reading);
602 }
603
604 self.dictionaries
605 .iter()
606 .any(|dictionary| dictionary.has_homophone(hanja, reading))
607 }
608}
609
610fn khangul_reading(ch: char) -> Option<&'static str> {
611 KHANGUL_READINGS
612 .binary_search_by_key(&ch, |(hanja, _)| *hanja)
613 .ok()
614 .map(|index| KHANGUL_READINGS[index].1)
615}
616
/// Options controlling how the engine segments text and picks readings.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct EngineOptions {
    /// Strategy passed to the segmenter.
    pub segmentation: SegmentationStrategy,

    /// Whether the initial sound law may be applied when choosing a reading.
    pub initial_sound_law: bool,

    /// How numerals are rendered.
    ///
    /// NOTE(review): interpreted by the fallback module, which is not visible
    /// here.
    pub numeral_strategy: NumeralStrategy,
}
634
635impl Default for EngineOptions {
636 fn default() -> Self {
637 Self {
638 segmentation: SegmentationStrategy::Lattice,
639 initial_sound_law: true,
640 numeral_strategy: NumeralStrategy::HangulPhonetic,
641 }
642 }
643}
644
/// Strategy used to split text into dictionary and fallback segments.
///
/// NOTE(review): the strategies are implemented in the `segment` module, which
/// is not visible here; confirm the variant descriptions against it.
#[non_exhaustive]
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum SegmentationStrategy {
    /// The default strategy.
    #[default]
    Lattice,

    /// The alternative strategy; presumably greedy matching (confirm).
    Eager,
}
661
/// Strategy for rendering hanja numerals.
///
/// NOTE(review): the variants are interpreted outside the part of the file
/// visible here; the variant descriptions are inferred from the names and
/// must be confirmed against the fallback module.
#[non_exhaustive]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum NumeralStrategy {
    /// Presumably reads numerals phonetically in hangul. This is the value
    /// used by [`EngineOptions::default`].
    HangulPhonetic,

    /// Presumably converts numerals digit by digit to Arabic numerals.
    PositionalArabic,

    /// Presumably converts numerals to Arabic numerals by summing place
    /// values.
    AdditiveArabic,

    /// Presumably picks a rendering based on context.
    Smart,
}
696
/// Value stored per headword inside [`MapDictionary`].
#[derive(Clone, Debug, Eq, PartialEq)]
struct DictionaryEntry {
    // Reading reported as `Match::reading`.
    reading: String,
    // Optional alternative reading reported as `Match::suffix_reading`.
    suffix_reading: Option<String>,
    // Marks reported with every match of this entry.
    mark: MatchMark,
}
703
/// In-memory dictionary keyed by hanja headword.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct MapDictionary {
    // Headword -> entry, ordered by headword.
    entries: BTreeMap<String, DictionaryEntry>,
    // Character count of the longest headword inserted so far; `None` until
    // the first insertion.
    max_word_chars: Option<usize>,
}
714
715impl MapDictionary {
716 pub fn new() -> Self {
718 Self::default()
719 }
720
721 pub fn insert(&mut self, hanja: impl Into<String>, reading: impl Into<String>) {
723 self.insert_marked(hanja, reading, MatchMark::default());
724 }
725
726 pub fn insert_marked(
728 &mut self,
729 hanja: impl Into<String>,
730 reading: impl Into<String>,
731 mark: MatchMark,
732 ) {
733 self.insert_entry(hanja, reading, None, mark);
734 }
735
736 pub fn insert_with_suffix(
741 &mut self,
742 hanja: impl Into<String>,
743 reading: impl Into<String>,
744 suffix: impl Into<String>,
745 ) {
746 self.insert_entry(hanja, reading, Some(suffix.into()), MatchMark::default());
747 }
748
749 fn insert_entry(
750 &mut self,
751 hanja: impl Into<String>,
752 reading: impl Into<String>,
753 suffix_reading: Option<String>,
754 mark: MatchMark,
755 ) {
756 let hanja = hanja.into();
757 let word_chars = hanja.chars().count();
758 self.max_word_chars = Some(self.max_word_chars.map_or(word_chars, |max| {
759 if word_chars > max { word_chars } else { max }
760 }));
761 self.entries.insert(
762 hanja,
763 DictionaryEntry {
764 reading: reading.into(),
765 suffix_reading,
766 mark,
767 },
768 );
769 }
770
771 pub fn is_empty(&self) -> bool {
773 self.entries.is_empty()
774 }
775
776 pub fn len(&self) -> usize {
778 self.entries.len()
779 }
780}
781
782impl HanjaDictionary for MapDictionary {
783 fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
784 Box::new(
785 self.entries
786 .iter()
787 .filter(move |(hanja, _)| s.starts_with(hanja.as_str()))
788 .map(|(hanja, entry)| Match {
789 byte_len: hanja.len(),
790 reading: entry.reading.clone(),
791 suffix_reading: entry.suffix_reading.clone(),
792 mark: entry.mark,
793 }),
794 )
795 }
796
797 fn max_word_chars(&self) -> Option<usize> {
798 self.max_word_chars
799 }
800
801 fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
802 Some(Box::new(self.entries.iter().map(|(hanja, entry)| {
803 DictionaryRecord {
804 hanja: hanja.clone(),
805 reading: entry.reading.clone(),
806 mark: entry.mark,
807 }
808 })))
809 }
810
811 fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
812 self.entries
813 .iter()
814 .any(|(other_hanja, entry)| other_hanja != hanja && entry.reading == reading)
815 }
816}
817
/// Scope data for plain text input, as produced by [`read_plain_text`].
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct PlainScopeData;
827
impl ScopeData for PlainScopeData {
    /// Plain text is always processed, never preserved.
    fn is_preserve(&self) -> bool {
        false
    }

    /// Plain text cannot carry inline markup.
    fn allows_inline_markup(&self) -> bool {
        false
    }
}
837
838pub fn read_plain_text(input: &str) -> Vec<InputToken<PlainScopeData>> {
843 Vec::from([
844 InputToken::Open(Scope::new(PlainScopeData)),
845 InputToken::Text(input.to_string()),
846 InputToken::Close,
847 ])
848}
849
850pub fn write_plain_text<S>(tokens: impl IntoIterator<Item = RenderedToken<S>>) -> String {
858 let mut output = String::new();
859 for token in tokens {
860 match token {
861 RenderedToken::Open(_) | RenderedToken::Close => {}
862 RenderedToken::Text(text) | RenderedToken::Verbatim(text) => output.push_str(&text),
863 RenderedToken::Ruby { base, rt } => {
864 output.push_str(&parens(&base, &rt));
865 }
866 }
867 }
868 output
869}
870
871pub fn process_tokens<S, D>(
877 tokens: impl IntoIterator<Item = InputToken<S>>,
878 dictionary: &D,
879) -> Vec<OutputToken<S>>
880where
881 S: ScopeData,
882 D: HanjaDictionary + ?Sized,
883{
884 process_tokens_iter(tokens, dictionary).collect()
885}
886
887pub fn process_tokens_iter<S, D>(
895 tokens: impl IntoIterator<Item = InputToken<S>>,
896 dictionary: &D,
897) -> alloc::vec::IntoIter<OutputToken<S>>
898where
899 S: ScopeData,
900 D: HanjaDictionary + ?Sized,
901{
902 process_tokens_with_options(tokens, dictionary, EngineOptions::default()).into_iter()
903}
904
905pub fn process_tokens_with_options<S, D>(
910 tokens: impl IntoIterator<Item = InputToken<S>>,
911 dictionary: &D,
912 options: EngineOptions,
913) -> Vec<OutputToken<S>>
914where
915 S: ScopeData,
916 D: HanjaDictionary + ?Sized,
917{
918 let mut engine = Engine::collecting(dictionary, options);
919 let mut output = Vec::new();
920
921 for token in tokens {
922 output.extend(engine.push_token(token));
923 }
924
925 output.extend(engine.finish());
926 output
927}
928
929pub fn process_tokens_iter_with_options<S, D>(
936 tokens: impl IntoIterator<Item = InputToken<S>>,
937 dictionary: &D,
938 options: EngineOptions,
939) -> alloc::vec::IntoIter<OutputToken<S>>
940where
941 S: ScopeData,
942 D: HanjaDictionary + ?Sized,
943{
944 process_tokens_with_options(tokens, dictionary, options).into_iter()
945}
946
947pub fn recover_input_tokens<S>(
969 tokens: impl IntoIterator<Item = Result<InputToken<S>, RecoverableInputError>>,
970 recovery: Recovery,
971) -> Result<Vec<InputToken<S>>, Error>
972where
973 S: ScopeData,
974{
975 let mut recovered = Vec::new();
976 for token in tokens {
977 recovered.push(recover_input_token(token, recovery)?);
978 }
979 Ok(recovered)
980}
981
982pub fn recover_input_token<S>(
989 token: Result<InputToken<S>, RecoverableInputError>,
990 recovery: Recovery,
991) -> Result<InputToken<S>, Error>
992where
993 S: ScopeData,
994{
995 match token {
996 Ok(token) => Ok(token),
997 Err(error) => match recovery {
998 Recovery::Strict => Err(error.into_parts().1),
999 Recovery::Lenient => {
1000 let (original, error) = error.into_parts();
1001 tracing::warn!(error = %error, "recovering from input reader error");
1002 Ok(InputToken::Verbatim(original))
1003 }
1004 },
1005 }
1006}
1007
1008pub fn process_fallible_tokens<S, D>(
1015 tokens: impl IntoIterator<Item = Result<InputToken<S>, RecoverableInputError>>,
1016 dictionary: &D,
1017 recovery: Recovery,
1018) -> Result<Vec<OutputToken<S>>, Error>
1019where
1020 S: ScopeData,
1021 D: HanjaDictionary + ?Sized,
1022{
1023 process_fallible_tokens_with_options(tokens, dictionary, EngineOptions::default(), recovery)
1024}
1025
1026pub fn process_fallible_tokens_with_options<S, D>(
1033 tokens: impl IntoIterator<Item = Result<InputToken<S>, RecoverableInputError>>,
1034 dictionary: &D,
1035 options: EngineOptions,
1036 recovery: Recovery,
1037) -> Result<Vec<OutputToken<S>>, Error>
1038where
1039 S: ScopeData,
1040 D: HanjaDictionary + ?Sized,
1041{
1042 let recovered = recover_input_tokens(tokens, recovery)?;
1043 Ok(process_tokens_with_options(recovered, dictionary, options))
1044}
1045
/// Streaming engine that turns [`InputToken`]s into [`OutputToken`]s.
///
/// Text is buffered so that dictionary words spanning several `Text` tokens
/// can still be matched; structural tokens force a flush.
pub struct Engine<'a, S, D>
where
    S: ScopeData,
    D: HanjaDictionary + ?Sized,
{
    // Dictionary consulted for segmentation.
    dictionary: &'a D,
    // Segmentation and reading options.
    options: EngineOptions,
    // Currently open scopes, innermost last.
    scopes: Vec<Scope<S>>,
    // Text received but not yet emitted.
    pending_text: String,
    // Byte length of `pending_text` at the moment it was last found to be one
    // fallback run that cannot be flushed yet; `None` when no such run is
    // being tracked.
    pending_unflushable_fallback_run_bytes: Option<usize>,
    // Reading context carried across segments.
    fallback_state: FallbackState,
    // When `true`, `push_token` emits safe prefixes as text arrives; when
    // `false`, text is held until a structural token or an explicit flush.
    incremental_flush: bool,
}
1070
1071impl<'a, S, D> Engine<'a, S, D>
1072where
1073 S: ScopeData,
1074 D: HanjaDictionary + ?Sized,
1075{
1076 pub fn new(dictionary: &'a D) -> Self {
1078 Self::with_options(dictionary, EngineOptions::default())
1079 }
1080
1081 pub fn with_options(dictionary: &'a D, options: EngineOptions) -> Self {
1083 Self::with_incremental_flush(dictionary, options, true)
1084 }
1085
1086 fn collecting(dictionary: &'a D, options: EngineOptions) -> Self {
1087 Self::with_incremental_flush(dictionary, options, false)
1088 }
1089
1090 fn with_incremental_flush(
1091 dictionary: &'a D,
1092 options: EngineOptions,
1093 incremental_flush: bool,
1094 ) -> Self {
1095 tracing::debug!(
1096 strategy = ?options.segmentation,
1097 "engine created with segmentation strategy"
1098 );
1099 Self {
1100 dictionary,
1101 options,
1102 scopes: Vec::new(),
1103 pending_text: String::new(),
1104 pending_unflushable_fallback_run_bytes: None,
1105 fallback_state: FallbackState::default(),
1106 incremental_flush,
1107 }
1108 }
1109
1110 pub fn push_token(&mut self, token: InputToken<S>) -> Vec<OutputToken<S>> {
1113 let mut output = Vec::new();
1114 match token {
1115 InputToken::Open(scope) => {
1116 self.flush_into(&mut output);
1117 if scope.data().is_block_boundary() {
1118 self.reset_fallback_context();
1119 }
1120 self.scopes.push(scope.clone());
1121 output.push(OutputToken::Open(scope));
1122 }
1123 InputToken::Close => {
1124 self.flush_into(&mut output);
1125 let closes_block_boundary = self
1126 .scopes
1127 .pop()
1128 .is_some_and(|scope| scope.data().is_block_boundary());
1129 output.push(OutputToken::Close);
1130 if closes_block_boundary {
1131 self.reset_fallback_context();
1132 }
1133 }
1134 InputToken::Text(text) => {
1135 if self
1136 .scopes
1137 .last()
1138 .is_some_and(|scope| scope.data().is_preserve())
1139 {
1140 self.flush_into(&mut output);
1141 self.reset_fallback_context();
1142 output.push(OutputToken::Text(text));
1143 } else {
1144 let previous_pending_bytes = self.pending_text.len();
1145 self.pending_text.push_str(&text);
1146 if self
1147 .pending_unflushable_fallback_run_bytes
1148 .is_some_and(|bytes| bytes == previous_pending_bytes)
1149 {
1150 self.pending_unflushable_fallback_run_bytes = Some(previous_pending_bytes);
1151 } else {
1152 self.pending_unflushable_fallback_run_bytes = None;
1153 }
1154 if self.incremental_flush {
1155 self.flush_safe_into(&mut output);
1156 }
1157 }
1158 }
1159 InputToken::Verbatim(text) => {
1160 self.flush_into(&mut output);
1161 self.reset_fallback_context();
1162 output.push(OutputToken::Verbatim(text));
1163 }
1164 }
1165 output
1166 }
1167
1168 pub fn flush(&mut self) -> Vec<OutputToken<S>> {
1170 let mut output = Vec::new();
1171 self.flush_into(&mut output);
1172 output
1173 }
1174
1175 pub fn finish(mut self) -> Vec<OutputToken<S>> {
1177 self.flush()
1178 }
1179
1180 pub fn buffered_chars(&self) -> usize {
1182 self.pending_text.chars().count()
1183 }
1184
1185 fn tail_bound(&self) -> Option<usize> {
1186 self.dictionary.max_word_chars().filter(|bound| *bound > 0)
1187 }
1188
1189 fn flush_safe_into(&mut self, output: &mut Vec<OutputToken<S>>) {
1190 if self.pending_text.is_empty() {
1191 return;
1192 }
1193 if !self.pending_text.chars().any(is_hanja) {
1194 self.flush_non_hanja_safe_into(output);
1195 return;
1196 }
1197
1198 let Some(bound) = self.tail_bound() else {
1199 let Some(flush_end) = safe_unknown_bound_flush_end(&self.pending_text) else {
1200 return;
1201 };
1202 self.flush_prefix_into(flush_end, output);
1203 if !self.pending_text.chars().any(is_hanja) {
1204 self.flush_non_hanja_safe_into(output);
1205 }
1206 return;
1207 };
1208 if let Some(flush_end) = safe_unknown_bound_flush_end(&self.pending_text) {
1209 self.flush_prefix_into(flush_end, output);
1210 if !self.pending_text.chars().any(is_hanja) {
1211 self.flush_non_hanja_safe_into(output);
1212 }
1213 return;
1214 }
1215 let buffered_chars = self.buffered_chars();
1216 if buffered_chars > bound.saturating_mul(10) {
1217 tracing::debug!(
1218 buffered_chars,
1219 dict_max_word_chars = bound,
1220 "streaming tail buffer is unusually large"
1221 );
1222 }
1223 if buffered_chars <= bound {
1224 return;
1225 }
1226
1227 if self.extends_unflushable_fallback_run(bound) {
1228 self.pending_unflushable_fallback_run_bytes = Some(self.pending_text.len());
1229 return;
1230 }
1231
1232 let safe_chars = buffered_chars.saturating_sub(bound).saturating_add(1);
1233 let segments = segment_text(
1234 &self.pending_text,
1235 self.dictionary,
1236 self.options.segmentation,
1237 );
1238 let mut flush_end = 0;
1239 let mut flush_segments = Vec::new();
1240 for segment in &segments {
1241 let (byte_start, byte_end) = segment_bounds(segment);
1242 let start_chars = self.pending_text[..byte_start].chars().count();
1243 let end_chars = self.pending_text[..byte_end].chars().count();
1244 if byte_start > flush_end || (start_chars > safe_chars && flush_end > 0) {
1245 break;
1246 }
1247 if end_chars > safe_chars {
1248 break;
1249 }
1250 flush_end = byte_end;
1251 flush_segments.push(segment.clone());
1252 }
1253
1254 if let Some(fallback_start) = trailing_fallback_run_start(&segments, flush_end) {
1258 flush_end = fallback_start;
1259 while flush_segments
1260 .last()
1261 .is_some_and(|segment| segment_bounds(segment).1 > flush_end)
1262 {
1263 flush_segments.pop();
1264 }
1265 }
1266
1267 if flush_end > 0 {
1268 self.pending_unflushable_fallback_run_bytes = None;
1269 self.flush_segments_prefix_into(flush_end, &flush_segments, output);
1270 if !self.pending_text.chars().any(is_hanja) {
1271 self.flush_non_hanja_safe_into(output);
1272 }
1273 } else if trailing_fallback_run_start(&segments, self.pending_text.len()) == Some(0) {
1274 self.pending_unflushable_fallback_run_bytes = Some(self.pending_text.len());
1275 }
1276 }
1277
1278 fn extends_unflushable_fallback_run(&self, bound: usize) -> bool {
1279 let Some(previous_bytes) = self.pending_unflushable_fallback_run_bytes else {
1280 return false;
1281 };
1282 if previous_bytes == 0
1283 || previous_bytes > self.pending_text.len()
1284 || !self.pending_text.is_char_boundary(previous_bytes)
1285 {
1286 return false;
1287 }
1288
1289 let appended = &self.pending_text[previous_bytes..];
1290 if appended.is_empty() {
1291 return true;
1292 }
1293 if appended.chars().any(|ch| !is_hanja(ch)) {
1294 return false;
1295 }
1296
1297 let probe_start = suffix_start_for_char_count(
1301 &self.pending_text[..previous_bytes],
1302 bound.saturating_sub(1),
1303 );
1304 let probe = &self.pending_text[probe_start..];
1305 segment_text(probe, self.dictionary, self.options.segmentation)
1306 .iter()
1307 .all(|segment| matches!(segment, Segment::Fallback { .. }))
1308 }
1309
1310 fn flush_non_hanja_safe_into(&mut self, output: &mut Vec<OutputToken<S>>) {
1311 let flush_end = match self.tail_bound() {
1312 Some(bound) => safe_non_hanja_flush_end(&self.pending_text, bound),
1313 None => safe_unknown_bound_flush_end(&self.pending_text),
1314 };
1315 if let Some(flush_end) = flush_end {
1316 self.flush_prefix_into(flush_end, output);
1317 }
1318 }
1319
1320 fn flush_prefix_into(&mut self, flush_end: usize, output: &mut Vec<OutputToken<S>>) {
1321 if flush_end == self.pending_text.len() {
1322 self.flush_into(output);
1323 return;
1324 }
1325 self.pending_unflushable_fallback_run_bytes = None;
1326 let prefix = self.pending_text[..flush_end].to_string();
1327 let segments = segment_text(&prefix, self.dictionary, self.options.segmentation);
1328 self.flush_segments_prefix_into(flush_end, &segments, output);
1329 }
1330
1331 fn flush_segments_prefix_into(
1332 &mut self,
1333 flush_end: usize,
1334 segments: &[Segment],
1335 output: &mut Vec<OutputToken<S>>,
1336 ) {
1337 let prefix = self.pending_text[..flush_end].to_string();
1338 process_segments_with_state(
1339 &prefix,
1340 segments,
1341 self.dictionary,
1342 self.options,
1343 &mut self.fallback_state,
1344 output,
1345 );
1346 self.pending_text.replace_range(..flush_end, "");
1347 }
1348
1349 fn flush_into(&mut self, output: &mut Vec<OutputToken<S>>) {
1350 if self.pending_text.is_empty() {
1351 return;
1352 }
1353 self.pending_unflushable_fallback_run_bytes = None;
1354 let text = core::mem::take(&mut self.pending_text);
1355 process_text_with_state(
1356 &text,
1357 self.dictionary,
1358 self.options,
1359 &mut self.fallback_state,
1360 output,
1361 );
1362 }
1363
1364 fn reset_fallback_context(&mut self) {
1365 self.fallback_state = FallbackState::default();
1366 }
1367}
1368
/// Computes how many leading bytes of hanja-free `text` can be flushed while
/// keeping at most `bound - 1` characters of the last whitespace-delimited
/// span buffered.
///
/// Returns `None` when nothing can be flushed.
fn safe_non_hanja_flush_end(text: &str, bound: usize) -> Option<usize> {
    if text.is_empty() {
        return None;
    }

    let keep_chars = bound.saturating_sub(1);
    // Byte offset just past the last whitespace character (0 if none).
    let span_start = match text
        .char_indices()
        .rev()
        .find(|&(_, ch)| ch.is_whitespace())
    {
        Some((index, ch)) => index + ch.len_utf8(),
        None => 0,
    };
    let tail = &text[span_start..];
    let tail_chars = tail.chars().count();

    // The whole trailing span fits in the kept tail: flush only what
    // precedes it, if anything.
    if tail_chars <= keep_chars {
        return if span_start > 0 { Some(span_start) } else { None };
    }

    let flush_chars = tail_chars - keep_chars;
    let flush_end = match tail.char_indices().nth(flush_chars) {
        Some((offset, _)) => span_start + offset,
        None => text.len(),
    };
    if flush_end > 0 { Some(flush_end) } else { None }
}
1392
/// Returns the byte offset just past the last whitespace character of
/// `text`, or `None` when `text` contains no whitespace.
fn safe_unknown_bound_flush_end(text: &str) -> Option<usize> {
    let (index, ch) = text
        .char_indices()
        .rev()
        .find(|&(_, ch)| ch.is_whitespace())?;
    Some(index + ch.len_utf8())
}
1398
/// Returns the byte offset at which the last `count` characters of `text`
/// begin: `text.len()` for a count of zero, `0` when `text` has fewer than
/// `count` characters.
fn suffix_start_for_char_count(text: &str, count: usize) -> usize {
    match count.checked_sub(1) {
        None => text.len(),
        Some(skip) => match text.char_indices().rev().nth(skip) {
            Some((start, _)) => start,
            None => 0,
        },
    }
}
1409
1410fn trailing_fallback_run_start(segments: &[Segment], split_byte: usize) -> Option<usize> {
1411 if split_byte == 0 {
1412 return None;
1413 }
1414
1415 for (index, segment) in segments.iter().enumerate() {
1416 let (byte_start, byte_end) = segment_bounds(segment);
1417 if byte_end != split_byte {
1418 continue;
1419 }
1420 if !matches!(segment, Segment::Fallback { .. }) {
1421 return None;
1422 }
1423 if let Some(next) = segments.get(index + 1)
1424 && !matches!(next, Segment::Fallback { .. })
1425 {
1426 return None;
1427 }
1428
1429 let mut run_start = byte_start;
1430 for previous in segments[..index].iter().rev() {
1431 let (previous_start, previous_end) = segment_bounds(previous);
1432 if previous_end != run_start || !matches!(previous, Segment::Fallback { .. }) {
1433 break;
1434 }
1435 run_start = previous_start;
1436 }
1437 return (run_start < split_byte).then_some(run_start);
1438 }
1439
1440 None
1441}
1442
1443fn process_text_with_state<S, D>(
1444 text: &str,
1445 dictionary: &D,
1446 options: EngineOptions,
1447 fallback_state: &mut FallbackState,
1448 output: &mut Vec<OutputToken<S>>,
1449) where
1450 D: HanjaDictionary + ?Sized,
1451{
1452 let segments = segment_text(text, dictionary, options.segmentation);
1453 process_segments_with_state(text, &segments, dictionary, options, fallback_state, output);
1454}
1455
1456fn process_segments_with_state<S, D>(
1457 text: &str,
1458 segments: &[Segment],
1459 _dictionary: &D,
1460 options: EngineOptions,
1461 fallback_state: &mut FallbackState,
1462 output: &mut Vec<OutputToken<S>>,
1463) where
1464 D: HanjaDictionary + ?Sized,
1465{
1466 let mut index = 0;
1467
1468 while index < segments.len() {
1469 match &segments[index] {
1470 Segment::Dictionary {
1471 byte_start,
1472 byte_end,
1473 reading,
1474 suffix_reading,
1475 mark,
1476 } => {
1477 let source = &text[*byte_start..*byte_end];
1478 let effective = dictionary_effective_reading(
1479 source,
1480 reading,
1481 suffix_reading.as_deref(),
1482 options,
1483 fallback_state.starts_word,
1484 fallback_state.previous_reading,
1485 );
1486 output.push(OutputToken::Annotated(Annotation {
1487 hanja: source.to_string(),
1488 homophone: false,
1489 reading: effective.clone(),
1490 require_hanja: mark.require_hanja,
1491 require_hangul: mark.require_hangul,
1492 first_in_context: true,
1493 skip_annotation: false,
1494 from_dictionary: true,
1495 }));
1496 if should_preserve_dictionary_context(source, &effective, options) {
1497 update_fallback_state_for_reading(&effective, fallback_state);
1498 } else {
1499 *fallback_state = FallbackState::default();
1500 }
1501 index += 1;
1502 }
1503 Segment::Fallback {
1504 byte_start,
1505 byte_end,
1506 } => {
1507 let mut fallback_end = *byte_end;
1508 while let Some(Segment::Fallback { byte_end, .. }) = segments.get(index + 1) {
1509 fallback_end = *byte_end;
1510 index += 1;
1511 }
1512 process_fallback_text(
1513 &text[*byte_start..fallback_end],
1514 options,
1515 fallback_state,
1516 output,
1517 );
1518 index += 1;
1519 }
1520 Segment::Text {
1521 byte_start,
1522 byte_end,
1523 } => {
1524 let text_segment = &text[*byte_start..*byte_end];
1525 push_text(output, text_segment);
1526 update_fallback_state_for_text(text_segment, fallback_state);
1527 index += 1;
1528 }
1529 }
1530 }
1531}
1532
1533fn segment_bounds(segment: &Segment) -> (usize, usize) {
1534 match segment {
1535 Segment::Dictionary {
1536 byte_start,
1537 byte_end,
1538 ..
1539 }
1540 | Segment::Fallback {
1541 byte_start,
1542 byte_end,
1543 }
1544 | Segment::Text {
1545 byte_start,
1546 byte_end,
1547 } => (*byte_start, *byte_end),
1548 }
1549}
1550
1551fn process_fallback_text<S>(
1552 text: &str,
1553 options: EngineOptions,
1554 state: &mut FallbackState,
1555 output: &mut Vec<OutputToken<S>>,
1556) {
1557 for part in phoneticize_fallback_run_with_state(text, options, state) {
1558 match part {
1559 FallbackPart::Annotation { hanja, reading } => {
1560 output.push(OutputToken::Annotated(Annotation {
1561 hanja,
1562 reading,
1563 homophone: false,
1564 require_hanja: false,
1565 require_hangul: false,
1566 first_in_context: true,
1567 skip_annotation: false,
1568 from_dictionary: false,
1569 }));
1570 }
1571 FallbackPart::ReadingText(text) => push_text(output, &text),
1572 FallbackPart::Text(text) => push_text(output, &text),
1573 }
1574 }
1575}
1576
1577fn update_fallback_state_for_text(text: &str, state: &mut FallbackState) {
1578 if text.is_empty() {
1579 return;
1580 }
1581
1582 if text
1583 .chars()
1584 .last()
1585 .is_some_and(|character| character.is_whitespace())
1586 {
1587 *state = FallbackState::default();
1588 return;
1589 }
1590
1591 let Some(last) = text.chars().rev().find(|ch| !ch.is_whitespace()) else {
1592 return;
1593 };
1594
1595 if last.is_alphanumeric() {
1596 state.starts_word = false;
1597 state.previous_reading = Some(last);
1598 } else {
1599 *state = FallbackState::default();
1600 }
1601}
1602
1603fn dictionary_effective_reading(
1625 source: &str,
1626 reading: &str,
1627 suffix_reading: Option<&str>,
1628 options: EngineOptions,
1629 starts_word: bool,
1630 previous_reading: Option<char>,
1631) -> String {
1632 if let Some(suffix) = suffix_reading {
1633 return if starts_word && options.initial_sound_law {
1634 reading.to_string()
1635 } else {
1636 suffix.to_string()
1637 };
1638 }
1639
1640 let mut chars = source.chars();
1641 if let (Some(ch), None) = (chars.next(), chars.next())
1642 && let Some(base) = phoneticize_hanja_char(ch)
1643 {
1644 let initial = apply_initial_sound_law_to_first_syllable(base);
1645 if initial != base && (reading == base || reading == initial) {
1646 let apply_law = options.initial_sound_law
1647 && (starts_word || should_apply_yeol_yul(previous_reading, base));
1648 return if apply_law { initial } else { base.to_string() };
1649 }
1650 }
1651
1652 reading.to_string()
1653}
1654
1655fn should_preserve_dictionary_context(source: &str, reading: &str, options: EngineOptions) -> bool {
1656 if reading.chars().all(char::is_whitespace) {
1657 return false;
1658 }
1659
1660 if source.chars().all(is_hanja) {
1661 match fallback_reading_for_run(source, options) {
1662 Some(fallback_reading) => {
1663 fallback_reading == reading || has_one_hangul_syllable_per_hanja(source, reading)
1664 }
1665 None => has_one_hangul_syllable_per_hanja(source, reading),
1666 }
1667 } else {
1668 true
1669 }
1670}
1671
1672fn has_one_hangul_syllable_per_hanja(source: &str, reading: &str) -> bool {
1673 let source_len = source.chars().count();
1674 let mut reading_len = 0;
1675
1676 for ch in reading.chars() {
1677 if !is_hangul_syllable(ch) {
1678 return false;
1679 }
1680 reading_len += 1;
1681 }
1682
1683 reading_len == source_len
1684}
1685
/// Returns `true` when `ch` lies in U+AC00..=U+D7A3, the range of
/// precomposed Hangul syllables.
fn is_hangul_syllable(ch: char) -> bool {
    matches!(ch, '\u{AC00}'..='\u{D7A3}')
}
1689
1690fn update_fallback_state_for_reading(reading: &str, state: &mut FallbackState) {
1691 let Some(last) = reading.chars().rev().find(|ch| !ch.is_whitespace()) else {
1692 *state = FallbackState::default();
1693 return;
1694 };
1695
1696 if last.is_alphanumeric() {
1697 state.starts_word = false;
1698 state.previous_reading = Some(last);
1699 } else {
1700 *state = FallbackState::default();
1701 }
1702}
1703
1704fn push_text<S>(output: &mut Vec<OutputToken<S>>, text: &str) {
1705 if text.is_empty() {
1706 return;
1707 }
1708
1709 match output.last_mut() {
1710 Some(OutputToken::Text(existing)) => existing.push_str(text),
1711 _ => output.push(OutputToken::Text(text.to_string())),
1712 }
1713}
1714
/// Returns `true` when `ch` falls in one of the code-point ranges this crate
/// treats as hanja.
///
/// The accepted ranges are U+2F00..=U+2FFF, U+3007, U+3400..=U+4DBF,
/// U+4E00..=U+9FFF, U+F900..=U+FAFF, U+20000..=U+2A6DF, U+2A700..=U+2EE5F,
/// U+2F800..=U+2FA1F and U+30000..=U+3347F.
pub fn is_hanja(ch: char) -> bool {
    // Inclusive `(first, last)` bounds, kept in code-point order.
    // NOTE(review): U+2F00..=U+2FFF appears to reach past the Kangxi Radicals
    // block into the Ideographic Description Characters; confirm that is
    // intended.
    const HANJA_RANGES: &[(char, char)] = &[
        ('\u{2F00}', '\u{2FFF}'),
        ('\u{3007}', '\u{3007}'),
        ('\u{3400}', '\u{4DBF}'),
        ('\u{4E00}', '\u{9FFF}'),
        ('\u{F900}', '\u{FAFF}'),
        ('\u{20000}', '\u{2A6DF}'),
        ('\u{2A700}', '\u{2B73F}'),
        ('\u{2B740}', '\u{2B81F}'),
        ('\u{2B820}', '\u{2CEAF}'),
        ('\u{2CEB0}', '\u{2EBEF}'),
        ('\u{2EBF0}', '\u{2EE5F}'),
        ('\u{2F800}', '\u{2FA1F}'),
        ('\u{30000}', '\u{3134F}'),
        ('\u{31350}', '\u{323AF}'),
        ('\u{323B0}', '\u{3347F}'),
    ];

    HANJA_RANGES
        .iter()
        .any(|&(first, last)| (first..=last).contains(&ch))
}
1736
1737#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1739pub enum RenderMode {
1740 HangulOnly,
1742
1743 HangulHanjaParens,
1745
1746 HanjaHangulParens,
1749
1750 Ruby(RubyBase),
1758
1759 Original,
1761}
1762
/// Chooses which side of an annotation is the ruby base text.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum RubyBase {
    /// The Hangul reading is the base and the hanja is the ruby text.
    OnHangul,

    /// The hanja is the base and the Hangul reading is the ruby text.
    OnHanja,
}
1776
/// Form of the reading gloss added in [`RenderMode::Original`] to annotations
/// flagged `require_hangul`.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum OriginalGloss {
    /// Emit `hanja(reading)` as plain text. This is the default.
    #[default]
    Parens,

    /// Emit a ruby token with the hanja as base and the reading as ruby text.
    Ruby,
}
1795
1796#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1803pub struct RenderOptions {
1804 pub mode: RenderMode,
1806
1807 pub original_gloss: OriginalGloss,
1809}
1810
1811impl Default for RenderOptions {
1812 fn default() -> Self {
1813 Self {
1814 mode: RenderMode::HangulOnly,
1815 original_gloss: OriginalGloss::Parens,
1816 }
1817 }
1818}
1819
1820impl From<RenderMode> for RenderOptions {
1821 fn from(mode: RenderMode) -> Self {
1822 Self {
1823 mode,
1824 original_gloss: OriginalGloss::default(),
1825 }
1826 }
1827}
1828
/// Span of tokens that context-sensitive passes (homophone marking,
/// first-occurrence filtering) treat as one unit.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ContextWindow {
    /// No context is collected; tokens pass through without the pass running.
    Off,

    /// A new context starts when a scope reporting `is_block_boundary()`
    /// opens, and the context is processed again when that scope closes.
    PerBlock,

    /// A new context starts when a scope reporting `is_section_boundary()`
    /// opens; closing a scope does not end the context.
    PerSection,

    /// The whole token stream forms a single context.
    PerDocument,
}
1852
/// Where homophone marking looks for other hanja forms sharing a reading.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum HomophoneDetection {
    /// Compare only against dictionary annotations found in the same context
    /// window. This is the default.
    #[default]
    ContextLocal,

    /// Additionally consult the dictionary itself: through an index built
    /// from its entries when it can enumerate them, otherwise through
    /// per-annotation `has_homophone` lookups.
    DictionaryWide,
}
1881
/// Flag that a user directive sets on every annotation it matches.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum DirectiveAction {
    /// Set `require_hanja` on the annotation.
    RequireHanja,

    /// Set `require_hangul` on the annotation.
    RequireHangul,

    /// Set `skip_annotation` on the annotation.
    SkipAnnotation,
}
1894
1895#[derive(Default)]
1901pub struct UserDirectives<'a> {
1902 rules: Vec<UserDirectiveRule<'a>>,
1903}
1904
1905impl<'a> UserDirectives<'a> {
1906 pub fn new() -> Self {
1908 Self::default()
1909 }
1910
1911 pub fn require_hanja(&mut self, hanja: impl Into<String>) {
1913 self.add_literal(hanja, DirectiveAction::RequireHanja);
1914 }
1915
1916 pub fn require_hangul(&mut self, hanja: impl Into<String>) {
1918 self.add_literal(hanja, DirectiveAction::RequireHangul);
1919 }
1920
1921 pub fn skip_annotation(&mut self, hanja: impl Into<String>) {
1923 self.add_literal(hanja, DirectiveAction::SkipAnnotation);
1924 }
1925
1926 pub fn add_literal(&mut self, hanja: impl Into<String>, action: DirectiveAction) {
1928 self.rules.push(UserDirectiveRule {
1929 predicate: UserDirectivePredicate::Literal(hanja.into()),
1930 action,
1931 });
1932 }
1933
1934 pub fn add_predicate(
1936 &mut self,
1937 predicate: impl Fn(&Annotation) -> bool + 'a,
1938 action: DirectiveAction,
1939 ) {
1940 self.rules.push(UserDirectiveRule {
1941 predicate: UserDirectivePredicate::Predicate(Box::new(predicate)),
1942 action,
1943 });
1944 }
1945
1946 pub fn is_empty(&self) -> bool {
1948 self.rules.is_empty()
1949 }
1950
1951 pub fn apply<S>(&self, token: OutputToken<S>) -> OutputToken<S> {
1958 match token {
1959 OutputToken::Annotated(mut annotation) => {
1960 for rule in &self.rules {
1961 if !rule.predicate.matches(&annotation) {
1962 continue;
1963 }
1964 match rule.action {
1965 DirectiveAction::RequireHanja => annotation.require_hanja = true,
1966 DirectiveAction::RequireHangul => annotation.require_hangul = true,
1967 DirectiveAction::SkipAnnotation => annotation.skip_annotation = true,
1968 }
1969 }
1970 OutputToken::Annotated(annotation)
1971 }
1972 token => token,
1973 }
1974 }
1975}
1976
1977struct UserDirectiveRule<'a> {
1978 predicate: UserDirectivePredicate<'a>,
1979 action: DirectiveAction,
1980}
1981
1982enum UserDirectivePredicate<'a> {
1983 Literal(String),
1984 Predicate(Box<dyn Fn(&Annotation) -> bool + 'a>),
1985}
1986
1987impl UserDirectivePredicate<'_> {
1988 fn matches(&self, annotation: &Annotation) -> bool {
1989 match self {
1990 Self::Literal(hanja) => annotation.hanja == *hanja,
1991 Self::Predicate(predicate) => predicate(annotation),
1992 }
1993 }
1994}
1995
1996pub fn mark_homophones<S, D>(
2002 tokens: impl IntoIterator<Item = OutputToken<S>>,
2003 dictionary: &D,
2004 window: ContextWindow,
2005) -> Vec<OutputToken<S>>
2006where
2007 S: ScopeData,
2008 D: HanjaDictionary + ?Sized,
2009{
2010 mark_homophones_with_detection(tokens, dictionary, window, HomophoneDetection::ContextLocal)
2011}
2012
2013pub fn mark_homophones_with_detection<S, D>(
2025 tokens: impl IntoIterator<Item = OutputToken<S>>,
2026 dictionary: &D,
2027 window: ContextWindow,
2028 detection: HomophoneDetection,
2029) -> Vec<OutputToken<S>>
2030where
2031 S: ScopeData,
2032 D: HanjaDictionary + ?Sized,
2033{
2034 if window == ContextWindow::Off {
2035 return tokens.into_iter().collect();
2036 }
2037
2038 let index = match detection {
2039 HomophoneDetection::ContextLocal => None,
2040 HomophoneDetection::DictionaryWide => HomophoneIndex::from_dictionary(dictionary),
2041 };
2042 let lookup_fallback = match detection {
2043 HomophoneDetection::ContextLocal => None,
2044 HomophoneDetection::DictionaryWide => index.is_none().then_some(dictionary),
2045 };
2046 ContextMiddleware::new(window, |tokens| {
2047 mark_homophones_in_context(tokens, index.as_ref(), lookup_fallback);
2048 })
2049 .process(tokens)
2050}
2051
2052pub fn filter_first_occurrences<S>(
2058 tokens: impl IntoIterator<Item = OutputToken<S>>,
2059 window: ContextWindow,
2060) -> Vec<OutputToken<S>>
2061where
2062 S: ScopeData,
2063{
2064 ContextMiddleware::new(window, filter_first_occurrences_in_context).process(tokens)
2065}
2066
2067type ContextApply<S> = fn(&mut [OutputToken<S>]);
2068type HomophoneApply<'a, S> = Box<dyn FnMut(&mut [OutputToken<S>]) + 'a>;
2069
2070pub struct HomophoneMarker<'a, S>
2078where
2079 S: ScopeData,
2080{
2081 inner: ContextMiddleware<S, HomophoneApply<'a, S>>,
2082}
2083
2084impl<'a, S> HomophoneMarker<'a, S>
2085where
2086 S: ScopeData,
2087{
2088 pub fn new<D>(dictionary: &'a D, window: ContextWindow) -> Self
2094 where
2095 D: HanjaDictionary + ?Sized,
2096 {
2097 Self::with_detection(dictionary, window, HomophoneDetection::ContextLocal)
2098 }
2099
2100 pub fn with_detection<D>(
2108 dictionary: &'a D,
2109 window: ContextWindow,
2110 detection: HomophoneDetection,
2111 ) -> Self
2112 where
2113 D: HanjaDictionary + ?Sized,
2114 {
2115 let index = match detection {
2116 _ if window == ContextWindow::Off => None,
2117 HomophoneDetection::ContextLocal => None,
2118 HomophoneDetection::DictionaryWide => HomophoneIndex::from_dictionary(dictionary),
2119 };
2120 let lookup_fallback = match detection {
2121 HomophoneDetection::ContextLocal => None,
2122 HomophoneDetection::DictionaryWide => index.is_none().then_some(dictionary),
2123 };
2124 Self {
2125 inner: ContextMiddleware::new(
2126 window,
2127 Box::new(move |tokens| {
2128 mark_homophones_in_context(tokens, index.as_ref(), lookup_fallback);
2129 }),
2130 ),
2131 }
2132 }
2133
2134 pub fn push_token(&mut self, token: OutputToken<S>) -> Vec<OutputToken<S>> {
2136 self.inner.push_token(token)
2137 }
2138
2139 pub fn finish(self) -> Vec<OutputToken<S>> {
2141 self.inner.finish()
2142 }
2143}
2144
2145pub struct FirstOccurrenceFilter<S>
2150where
2151 S: ScopeData,
2152{
2153 inner: ContextMiddleware<S, ContextApply<S>>,
2154}
2155
2156impl<S> FirstOccurrenceFilter<S>
2157where
2158 S: ScopeData,
2159{
2160 pub fn new(window: ContextWindow) -> Self {
2162 Self {
2163 inner: ContextMiddleware::new(window, filter_first_occurrences_in_context::<S>),
2164 }
2165 }
2166
2167 pub fn push_token(&mut self, token: OutputToken<S>) -> Vec<OutputToken<S>> {
2169 self.inner.push_token(token)
2170 }
2171
2172 pub fn finish(self) -> Vec<OutputToken<S>> {
2174 self.inner.finish()
2175 }
2176}
2177
2178pub fn apply_user_directives<S>(
2182 tokens: impl IntoIterator<Item = OutputToken<S>>,
2183 directives: &UserDirectives<'_>,
2184) -> Vec<OutputToken<S>> {
2185 apply_user_directives_iter(tokens, directives).collect()
2186}
2187
2188pub fn apply_user_directives_iter<'a, S>(
2194 tokens: impl IntoIterator<Item = OutputToken<S>> + 'a,
2195 directives: &'a UserDirectives<'_>,
2196) -> impl Iterator<Item = OutputToken<S>> + 'a {
2197 tokens.into_iter().map(|token| directives.apply(token))
2198}
2199
2200struct ContextMiddleware<S, F>
2201where
2202 S: ScopeData,
2203 F: FnMut(&mut [OutputToken<S>]),
2204{
2205 window: ContextWindow,
2206 apply: F,
2207 context: Vec<OutputToken<S>>,
2208 scope_boundaries: Vec<bool>,
2209}
2210
2211impl<S, F> ContextMiddleware<S, F>
2212where
2213 S: ScopeData,
2214 F: FnMut(&mut [OutputToken<S>]),
2215{
2216 fn new(window: ContextWindow, apply: F) -> Self {
2217 Self {
2218 window,
2219 apply,
2220 context: Vec::new(),
2221 scope_boundaries: Vec::new(),
2222 }
2223 }
2224
2225 fn process(mut self, tokens: impl IntoIterator<Item = OutputToken<S>>) -> Vec<OutputToken<S>> {
2226 let mut output = Vec::new();
2227 for token in tokens {
2228 output.extend(self.push_token(token));
2229 }
2230 output.extend(self.finish());
2231 output
2232 }
2233
2234 fn push_token(&mut self, token: OutputToken<S>) -> Vec<OutputToken<S>> {
2235 let mut output = Vec::new();
2236 match self.window {
2237 ContextWindow::Off => output.push(token),
2238 ContextWindow::PerDocument => self.context.push(token),
2239 ContextWindow::PerBlock | ContextWindow::PerSection => match &token {
2240 OutputToken::Open(scope) => {
2241 let is_boundary = match self.window {
2242 ContextWindow::PerBlock => scope.data().is_block_boundary(),
2243 ContextWindow::PerSection => scope.data().is_section_boundary(),
2244 ContextWindow::Off | ContextWindow::PerDocument => false,
2245 };
2246 if is_boundary {
2247 self.flush_context(&mut output);
2248 }
2249 self.scope_boundaries.push(is_boundary);
2250 self.context.push(token);
2251 }
2252 OutputToken::Close => {
2253 let closes_boundary = self.scope_boundaries.pop().unwrap_or(false);
2254 self.context.push(token);
2255 if closes_boundary && self.window == ContextWindow::PerBlock {
2256 self.flush_context(&mut output);
2257 }
2258 }
2259 _ => self.context.push(token),
2260 },
2261 }
2262 output
2263 }
2264
2265 fn finish(mut self) -> Vec<OutputToken<S>> {
2266 let mut output = Vec::new();
2267 self.flush_context(&mut output);
2268 output
2269 }
2270
2271 fn flush_context(&mut self, output: &mut Vec<OutputToken<S>>) {
2272 if self.context.is_empty() {
2273 return;
2274 }
2275
2276 (self.apply)(&mut self.context);
2277 output.append(&mut self.context);
2278 }
2279}
2280
2281#[derive(Clone, Debug, Default, Eq, PartialEq)]
2282struct HomophoneIndex {
2283 forms_by_reading: BTreeMap<String, BTreeSet<String>>,
2284}
2285
2286impl HomophoneIndex {
2287 fn from_dictionary<D>(dictionary: &D) -> Option<Self>
2288 where
2289 D: HanjaDictionary + ?Sized,
2290 {
2291 let mut forms_by_reading = BTreeMap::<String, BTreeSet<String>>::new();
2292 for record in dictionary.entries()? {
2293 forms_by_reading
2294 .entry(record.reading)
2295 .or_default()
2296 .insert(record.hanja);
2297 }
2298 Some(Self { forms_by_reading })
2299 }
2300
2301 fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
2302 self.forms_by_reading
2303 .get(reading)
2304 .is_some_and(|forms| forms.iter().any(|form| form != hanja))
2305 }
2306}
2307
2308fn mark_homophones_in_context<S, D>(
2309 tokens: &mut [OutputToken<S>],
2310 index: Option<&HomophoneIndex>,
2311 lookup_fallback: Option<&D>,
2312) where
2313 D: HanjaDictionary + ?Sized,
2314{
2315 let mut forms_by_reading = BTreeMap::<String, BTreeSet<String>>::new();
2316
2317 for token in tokens.iter() {
2318 if let OutputToken::Annotated(annotation) = token
2319 && annotation.from_dictionary
2320 {
2321 forms_by_reading
2322 .entry(annotation.reading.clone())
2323 .or_default()
2324 .insert(annotation.hanja.clone());
2325 }
2326 }
2327
2328 for token in tokens.iter_mut() {
2329 if let OutputToken::Annotated(annotation) = token {
2330 annotation.homophone = annotation.from_dictionary
2331 && (index.is_some_and(|index| {
2332 index.has_homophone(&annotation.hanja, &annotation.reading)
2333 }) || lookup_fallback.is_some_and(|dictionary| {
2334 dictionary.has_homophone(&annotation.hanja, &annotation.reading)
2335 }) || forms_by_reading
2336 .get(&annotation.reading)
2337 .is_some_and(|forms| forms.len() > 1));
2338 }
2339 }
2340}
2341
2342fn filter_first_occurrences_in_context<S>(tokens: &mut [OutputToken<S>]) {
2343 let mut seen = BTreeSet::new();
2344
2345 for token in tokens.iter_mut() {
2346 if let OutputToken::Annotated(annotation) = token {
2347 if seen.insert(annotation.hanja.clone()) {
2348 annotation.first_in_context = true;
2349 } else {
2350 annotation.first_in_context = false;
2351 annotation.require_hanja = false;
2352 annotation.require_hangul = false;
2353 }
2354 }
2355 }
2356}
2357
2358pub fn render_tokens<S, O>(
2366 tokens: impl IntoIterator<Item = OutputToken<S>>,
2367 options: O,
2368) -> Vec<RenderedToken<S>>
2369where
2370 S: ScopeData,
2371 O: Into<RenderOptions>,
2372{
2373 render_tokens_iter(tokens, options).collect()
2374}
2375
2376pub fn render_tokens_iter<S, O>(
2383 tokens: impl IntoIterator<Item = OutputToken<S>>,
2384 options: O,
2385) -> impl Iterator<Item = RenderedToken<S>>
2386where
2387 S: ScopeData,
2388 O: Into<RenderOptions>,
2389{
2390 RendererIter {
2391 upstream: tokens.into_iter(),
2392 renderer: Renderer::new(options),
2393 }
2394}
2395
2396pub struct Renderer<S>
2403where
2404 S: ScopeData,
2405{
2406 options: RenderOptions,
2407 markup_stack: Vec<bool>,
2412 disallowing_ancestors: usize,
2417 _scope: PhantomData<fn(S)>,
2418}
2419
2420impl<S> Renderer<S>
2421where
2422 S: ScopeData,
2423{
2424 pub fn new<O>(options: O) -> Self
2426 where
2427 O: Into<RenderOptions>,
2428 {
2429 Self {
2430 options: options.into(),
2431 markup_stack: Vec::new(),
2432 disallowing_ancestors: 0,
2433 _scope: PhantomData,
2434 }
2435 }
2436
2437 pub fn push_token(&mut self, token: OutputToken<S>) -> RenderedToken<S> {
2439 match token {
2440 OutputToken::Open(scope) => {
2441 let allows = scope.data().allows_inline_markup();
2442 if !allows {
2443 self.disallowing_ancestors += 1;
2444 }
2445 self.markup_stack.push(allows);
2446 RenderedToken::Open(scope)
2447 }
2448 OutputToken::Close => {
2449 if let Some(false) = self.markup_stack.pop() {
2450 self.disallowing_ancestors = self.disallowing_ancestors.saturating_sub(1);
2454 }
2455 RenderedToken::Close
2456 }
2457 OutputToken::Text(text) => RenderedToken::Text(text),
2458 OutputToken::Verbatim(text) => RenderedToken::Verbatim(text),
2459 OutputToken::Annotated(annotation) => {
2460 let allows_inline_markup = self.disallowing_ancestors == 0;
2467 render_annotation(&annotation, &self.options, allows_inline_markup)
2468 }
2469 }
2470 }
2471}
2472
2473struct RendererIter<I, S>
2474where
2475 S: ScopeData,
2476{
2477 upstream: I,
2478 renderer: Renderer<S>,
2479}
2480
2481impl<I, S> Iterator for RendererIter<I, S>
2482where
2483 I: Iterator<Item = OutputToken<S>>,
2484 S: ScopeData,
2485{
2486 type Item = RenderedToken<S>;
2487
2488 fn next(&mut self) -> Option<Self::Item> {
2489 let token = self.upstream.next()?;
2490 Some(self.renderer.push_token(token))
2491 }
2492}
2493
2494fn render_annotation<S>(
2495 annotation: &Annotation,
2496 options: &RenderOptions,
2497 allows_inline_markup: bool,
2498) -> RenderedToken<S> {
2499 if annotation.skip_annotation {
2500 let primary = match options.mode {
2501 RenderMode::HangulOnly | RenderMode::HangulHanjaParens => annotation.reading.clone(),
2502 RenderMode::HanjaHangulParens | RenderMode::Original => annotation.hanja.clone(),
2503 RenderMode::Ruby(RubyBase::OnHangul) => annotation.reading.clone(),
2504 RenderMode::Ruby(RubyBase::OnHanja) => annotation.hanja.clone(),
2505 };
2506 return RenderedToken::Text(primary);
2507 }
2508
2509 match options.mode {
2510 RenderMode::HangulOnly if annotation.require_hanja || annotation.homophone => {
2511 RenderedToken::Text(parens(&annotation.reading, &annotation.hanja))
2512 }
2513 RenderMode::HangulOnly => RenderedToken::Text(annotation.reading.clone()),
2514 RenderMode::HangulHanjaParens => {
2515 RenderedToken::Text(parens(&annotation.reading, &annotation.hanja))
2516 }
2517 RenderMode::HanjaHangulParens => {
2518 RenderedToken::Text(parens(&annotation.hanja, &annotation.reading))
2519 }
2520 RenderMode::Ruby(base) => render_ruby(annotation, base, allows_inline_markup),
2521 RenderMode::Original if annotation.require_hangul => match options.original_gloss {
2522 OriginalGloss::Parens => {
2523 RenderedToken::Text(parens(&annotation.hanja, &annotation.reading))
2524 }
2525 OriginalGloss::Ruby => render_ruby(annotation, RubyBase::OnHanja, allows_inline_markup),
2528 },
2529 RenderMode::Original => RenderedToken::Text(annotation.hanja.clone()),
2530 }
2531}
2532
2533fn render_ruby<S>(
2534 annotation: &Annotation,
2535 base: RubyBase,
2536 allows_inline_markup: bool,
2537) -> RenderedToken<S> {
2538 let (base_text, rt_text) = match base {
2539 RubyBase::OnHangul => (&annotation.reading, &annotation.hanja),
2540 RubyBase::OnHanja => (&annotation.hanja, &annotation.reading),
2541 };
2542 if !allows_inline_markup {
2543 return RenderedToken::Text(parens(base_text, rt_text));
2544 }
2545 RenderedToken::Ruby {
2546 base: base_text.clone(),
2547 rt: rt_text.clone(),
2548 }
2549}
2550
/// Formats `primary(gloss)`.
///
/// The arguments are role-neutral on purpose: callers pass either
/// (reading, hanja) or (hanja, reading) depending on the render mode, so
/// naming them after one of the two would be wrong for half the call sites.
fn parens(primary: &str, gloss: &str) -> String {
    // The final length is known up front: both parts plus the two ASCII
    // parentheses, so the buffer is allocated once.
    let mut output = String::with_capacity(primary.len() + gloss.len() + 2);
    output.push_str(primary);
    output.push('(');
    output.push_str(gloss);
    output.push(')');
    output
}
2559
2560pub fn convert_plain_text<D, R>(input: &str, dictionary: &D, render: R) -> String
2568where
2569 D: HanjaDictionary + ?Sized,
2570 R: Into<RenderOptions>,
2571{
2572 convert_plain_text_with_options(input, dictionary, render, EngineOptions::default())
2573}
2574
2575pub fn convert_plain_text_with_options<D, R>(
2579 input: &str,
2580 dictionary: &D,
2581 render: R,
2582 options: EngineOptions,
2583) -> String
2584where
2585 D: HanjaDictionary + ?Sized,
2586 R: Into<RenderOptions>,
2587{
2588 let input_tokens = read_plain_text(input);
2589 let output_tokens = process_tokens_with_options(input_tokens, dictionary, options);
2590 let output_tokens = mark_homophones(output_tokens, dictionary, ContextWindow::PerBlock);
2591 let rendered_tokens = render_tokens(output_tokens, render);
2592 write_plain_text(rendered_tokens)
2593}