1#![no_std]
25#![forbid(unsafe_code)]
26#![deny(missing_docs)]
27
28extern crate alloc;
29
30mod fallback;
31mod generated;
32mod segment;
33
34use alloc::boxed::Box;
35use alloc::collections::{BTreeMap, BTreeSet};
36use alloc::string::{String, ToString};
37use alloc::vec::Vec;
38use core::marker::PhantomData;
39
40use fallback::{
41 FallbackPart, FallbackState, apply_initial_sound_law_to_first_syllable,
42 fallback_reading_for_run, is_hanja_numeral, khangul_all_readings,
43 phoneticize_fallback_run_with_state, phoneticize_hanja_char,
44 reading_matches_with_initial_sound_law, should_apply_yeol_yul,
45};
46use generated::unihan_readings::KHANGUL_READINGS;
47use segment::{Segment, segment_text};
48
/// Errors reported by this crate.
///
/// The enum is `#[non_exhaustive]`, so callers outside the crate must keep a
/// wildcard arm when matching on it.
#[derive(Debug, thiserror::Error)]
#[non_exhaustive]
pub enum Error {
    /// A dictionary could not be loaded; the payload is the failure message.
    #[error("dictionary load failed: {0}")]
    DictionaryLoad(String),

    /// Segmentation of a hanja run failed.
    #[error("segmentation failed for {hanja:?}: {reason}")]
    Segmentation {
        /// The hanja text that was being segmented.
        hanja: String,

        /// Description of why segmentation failed.
        reason: String,
    },

    /// A hangul reading supplied for a hanja string was rejected as invalid.
    #[error("invalid hangul reading {reading:?} for hanja {hanja:?}")]
    InvalidReading {
        /// The hanja text the reading was supplied for.
        hanja: String,

        /// The rejected reading.
        reading: String,
    },

    /// An internal invariant did not hold; the payload names the invariant.
    #[error("internal invariant violated: {0}")]
    Internal(&'static str),

    /// Any other boxed error; displayed and sourced transparently.
    #[error(transparent)]
    Other(#[from] Box<dyn core::error::Error + Send + Sync + 'static>),
}
91
/// Policy applied by [`recover_input_token`] when an input reader yields a
/// [`RecoverableInputError`].
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum Recovery {
    /// Propagate the reader error to the caller (the default).
    #[default]
    Strict,

    /// Log a warning and pass the original text through as
    /// [`InputToken::Verbatim`].
    Lenient,
}
106
107#[derive(Debug)]
113pub struct RecoverableInputError {
114 original: String,
115 error: Error,
116}
117
118impl RecoverableInputError {
119 pub fn new(original: String, error: Error) -> Self {
121 Self { original, error }
122 }
123
124 pub fn original(&self) -> &str {
127 &self.original
128 }
129
130 pub fn error(&self) -> &Error {
132 &self.error
133 }
134
135 pub fn into_parts(self) -> (String, Error) {
137 (self.original, self.error)
138 }
139}
140
/// Per-scope metadata the engine consults while processing tokens.
pub trait ScopeData: Clone + 'static {
    /// Returns `true` when text inside this scope must be left untouched.
    ///
    /// [`Engine::push_token`] emits such text unchanged as
    /// [`OutputToken::Text`] when this scope is the innermost open one.
    fn is_preserve(&self) -> bool;

    /// Returns whether inline markup may be emitted inside this scope.
    ///
    /// Defaults to `true`. NOTE(review): nothing in the visible part of this
    /// file reads this flag; presumably renderers do — confirm.
    fn allows_inline_markup(&self) -> bool {
        true
    }

    /// Returns `true` when this scope is a block boundary.
    ///
    /// Opening or closing such a scope resets the engine's fallback context.
    /// Defaults to `false`.
    fn is_block_boundary(&self) -> bool {
        false
    }

    /// Returns `true` when this scope is a section boundary.
    ///
    /// Defaults to `false`. NOTE(review): not consulted in the visible part of
    /// this file — confirm where it is used.
    fn is_section_boundary(&self) -> bool {
        false
    }
}
179
/// A scope opened by [`InputToken::Open`], wrapping caller-defined data `S`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Scope<S> {
    /// Caller-defined payload describing the scope.
    data: S,
}

impl<S> Scope<S> {
    /// Wraps `data` in a scope.
    pub fn new(data: S) -> Self {
        Scope { data }
    }

    /// Borrows the scope's payload.
    pub fn data(&self) -> &S {
        let Scope { data } = self;
        data
    }

    /// Consumes the scope and returns its payload.
    pub fn into_data(self) -> S {
        let Scope { data } = self;
        data
    }
}
206
/// A token fed into the engine by an input reader.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum InputToken<S> {
    /// Opens a scope; the engine pushes it onto its scope stack.
    Open(Scope<S>),

    /// Closes the innermost open scope.
    Close,

    /// Text to be processed (unless the innermost scope is a preserve scope).
    Text(String),

    /// Text that is passed through unchanged as [`OutputToken::Verbatim`].
    Verbatim(String),
}
226
/// A token produced by the engine.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum OutputToken<S> {
    /// A scope was opened (mirrors [`InputToken::Open`]).
    Open(Scope<S>),

    /// The innermost scope was closed (mirrors [`InputToken::Close`]).
    Close,

    /// Plain text emitted without an annotation.
    Text(String),

    /// Text passed through unchanged from [`InputToken::Verbatim`].
    Verbatim(String),

    /// A hanja span together with its reading and annotation flags.
    Annotated(Annotation),
}
249
/// A token after rendering decisions have been made, ready for a writer such
/// as [`write_plain_text`].
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum RenderedToken<S> {
    /// A scope was opened.
    Open(Scope<S>),

    /// The innermost scope was closed.
    Close,

    /// Plain text.
    Text(String),

    /// Text to be written unchanged.
    Verbatim(String),

    /// A base text paired with an annotation text.
    ///
    /// [`write_plain_text`] renders this pair through the `parens` helper.
    Ruby {
        /// The base text being annotated.
        base: String,

        /// The annotation text attached to `base`.
        rt: String,
    },
}
289
/// A hanja span with its reading and the flags later stages use to decide how
/// to render it.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
#[non_exhaustive]
pub struct Annotation {
    /// The hanja source text of the span.
    pub hanja: String,

    /// The reading chosen for `hanja`.
    pub reading: String,

    /// NOTE(review): presumably set when another entry shares this reading
    /// (cf. [`HanjaDictionary::has_homophone`]); the engine code visible here
    /// always initialises it to `false` — confirm where it is filled in.
    pub homophone: bool,

    /// Copied from [`MatchMark::require_hanja`] of the dictionary match.
    pub require_hanja: bool,

    /// Copied from [`MatchMark::require_hangul`] of the dictionary match.
    pub require_hangul: bool,

    /// NOTE(review): the engine code visible here always emits `true`;
    /// presumably a later pass clears it for repeated occurrences — confirm.
    pub first_in_context: bool,

    /// NOTE(review): the engine code visible here always emits `false`;
    /// presumably a later pass sets it to suppress rendering — confirm.
    pub skip_annotation: bool,

    /// `true` when the reading (or part of it) came from a dictionary match
    /// rather than purely from the per-character fallback.
    pub from_dictionary: bool,

    /// NOTE(review): the engine code visible here always emits `false`;
    /// presumably set when the source text itself supplied the gloss — confirm.
    pub from_source_gloss: bool,
}
339
/// Per-entry flags carried from a dictionary match into the resulting
/// [`Annotation`].
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct MatchMark {
    /// Copied into [`Annotation::require_hanja`].
    /// NOTE(review): presumably forces the hanja form to be shown — confirm.
    pub require_hanja: bool,

    /// Copied into [`Annotation::require_hangul`].
    /// NOTE(review): presumably forces the hangul form to be shown — confirm.
    pub require_hangul: bool,
}
349
/// One dictionary entry as enumerated by [`HanjaDictionary::entries`].
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct DictionaryRecord {
    /// The hanja key of the entry.
    pub hanja: String,

    /// The reading stored for `hanja`.
    pub reading: String,

    /// Flags attached to the entry.
    pub mark: MatchMark,
}
368
/// A dictionary hit for a prefix of the text passed to
/// [`HanjaDictionary::matches_at`].
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Match {
    /// Length in bytes of the matched prefix.
    pub byte_len: usize,

    /// The reading stored for the matched prefix.
    pub reading: String,

    /// Optional alternative reading.
    /// NOTE(review): presumably the reading used when the word appears as a
    /// suffix rather than word-initially — confirm against the fallback module.
    pub suffix_reading: Option<String>,

    /// Flags attached to the matched entry.
    pub mark: MatchMark,
}
396
397pub trait HanjaDictionary {
403 fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a>;
405
406 fn max_word_chars(&self) -> Option<usize> {
408 None
409 }
410
411 fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
417 None
418 }
419
420 fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
422 self.entries().is_some_and(|mut entries| {
423 entries.any(|record| record.hanja != hanja && record.reading == reading)
424 })
425 }
426}
427
428impl<D> HanjaDictionary for &D
429where
430 D: HanjaDictionary + ?Sized,
431{
432 fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
433 (**self).matches_at(s)
434 }
435
436 fn max_word_chars(&self) -> Option<usize> {
437 (**self).max_word_chars()
438 }
439
440 fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
441 (**self).entries()
442 }
443
444 fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
445 (**self).has_homophone(hanja, reading)
446 }
447}
448
449impl<D> HanjaDictionary for Box<D>
450where
451 D: HanjaDictionary + ?Sized,
452{
453 fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
454 (**self).matches_at(s)
455 }
456
457 fn max_word_chars(&self) -> Option<usize> {
458 (**self).max_word_chars()
459 }
460
461 fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
462 (**self).entries()
463 }
464
465 fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
466 (**self).has_homophone(hanja, reading)
467 }
468}
469
470#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
478pub struct UnihanCharDict;
479
480impl HanjaDictionary for UnihanCharDict {
481 fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
482 let matched = s.chars().next().and_then(|ch| {
483 khangul_reading(ch).map(|reading| Match {
484 byte_len: ch.len_utf8(),
485 reading: reading.to_string(),
486 suffix_reading: None,
487 mark: MatchMark::default(),
488 })
489 });
490 Box::new(matched.into_iter())
491 }
492
493 fn max_word_chars(&self) -> Option<usize> {
494 Some(1)
495 }
496
497 fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
498 Some(Box::new(KHANGUL_READINGS.iter().map(|(hanja, reading)| {
499 DictionaryRecord {
500 hanja: hanja.to_string(),
501 reading: reading.to_string(),
502 mark: MatchMark::default(),
503 }
504 })))
505 }
506
507 fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
508 let mut chars = hanja.chars();
509 let Some(hanja) = chars.next() else {
510 return false;
511 };
512 if chars.next().is_some() {
513 return false;
514 }
515 KHANGUL_READINGS
516 .iter()
517 .any(|&(other_hanja, other_reading)| other_hanja != hanja && other_reading == reading)
518 }
519}
520
/// An ordered list of dictionaries consulted as one; earlier dictionaries take
/// precedence over later ones.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ChainDictionary<D> {
    /// Member dictionaries in priority order.
    dictionaries: Vec<D>,
}

// Implemented by hand rather than derived: `#[derive(Default)]` would add a
// `D: Default` bound, although an empty chain needs no `D` value at all.
impl<D> Default for ChainDictionary<D> {
    fn default() -> Self {
        Self::new()
    }
}

impl<D> ChainDictionary<D> {
    /// Creates an empty chain.
    pub fn new() -> Self {
        Self {
            dictionaries: Vec::new(),
        }
    }

    /// Appends `dictionary` with the lowest priority so far.
    pub fn push(&mut self, dictionary: D) {
        self.dictionaries.push(dictionary);
    }

    /// Number of member dictionaries.
    pub fn len(&self) -> usize {
        self.dictionaries.len()
    }

    /// Returns `true` when the chain has no members.
    pub fn is_empty(&self) -> bool {
        self.dictionaries.is_empty()
    }

    /// Borrows the members in priority order.
    pub fn dictionaries(&self) -> &[D] {
        &self.dictionaries
    }

    /// Consumes the chain and returns its members in priority order.
    pub fn into_dictionaries(self) -> Vec<D> {
        self.dictionaries
    }
}

impl<D> FromIterator<D> for ChainDictionary<D> {
    /// Builds a chain whose priority order is the iteration order.
    fn from_iter<T: IntoIterator<Item = D>>(iter: T) -> Self {
        Self {
            dictionaries: iter.into_iter().collect(),
        }
    }
}
574
575impl<D> HanjaDictionary for ChainDictionary<D>
576where
577 D: HanjaDictionary,
578{
579 fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
580 let mut seen_lengths = BTreeSet::new();
581 let mut matches = Vec::new();
582
583 for dictionary in &self.dictionaries {
584 for matched in dictionary.matches_at(s) {
585 if seen_lengths.insert(matched.byte_len) {
586 matches.push(matched);
587 }
588 }
589 }
590
591 matches.sort_by_key(|matched| matched.byte_len);
592 Box::new(matches.into_iter())
593 }
594
595 fn max_word_chars(&self) -> Option<usize> {
596 let mut max = None;
597 for dictionary in &self.dictionaries {
598 let word_chars = dictionary.max_word_chars()?;
599 max = Some(max.map_or(word_chars, |current: usize| current.max(word_chars)));
600 }
601 max
602 }
603
604 fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
605 let mut records = BTreeMap::<String, DictionaryRecord>::new();
606
607 for dictionary in &self.dictionaries {
608 for record in dictionary.entries()? {
609 records.entry(record.hanja.clone()).or_insert(record);
610 }
611 }
612
613 Some(Box::new(records.into_values()))
614 }
615
616 fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
617 if let Some(mut records) = self.entries() {
618 return records.any(|record| record.hanja != hanja && record.reading == reading);
619 }
620
621 self.dictionaries
622 .iter()
623 .any(|dictionary| dictionary.has_homophone(hanja, reading))
624 }
625}
626
627fn khangul_reading(ch: char) -> Option<&'static str> {
628 KHANGUL_READINGS
629 .binary_search_by_key(&ch, |(hanja, _)| *hanja)
630 .ok()
631 .map(|index| KHANGUL_READINGS[index].1)
632}
633
/// Tunable behaviour of the engine.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct EngineOptions {
    /// How text is split into dictionary and fallback segments.
    pub segmentation: SegmentationStrategy,

    /// Whether the initial sound law is applied to readings.
    /// NOTE(review): inferred from the field name and the imported fallback
    /// helpers; the consuming code is outside this view — confirm.
    pub initial_sound_law: bool,

    /// How hanja numerals are rendered.
    pub numeral_strategy: NumeralStrategy,
}

impl Default for EngineOptions {
    /// Lattice segmentation, initial sound law enabled, numerals rendered with
    /// [`NumeralStrategy::HangulPhonetic`].
    fn default() -> Self {
        Self {
            segmentation: SegmentationStrategy::Lattice,
            initial_sound_law: true,
            numeral_strategy: NumeralStrategy::HangulPhonetic,
        }
    }
}
661
/// Algorithm used to segment text against the dictionary.
///
/// NOTE(review): the segmenter itself lives in the `segment` module, outside
/// this view; the variant descriptions below are inferred from their names.
#[non_exhaustive]
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum SegmentationStrategy {
    /// The default strategy. Presumably evaluates a lattice of candidate
    /// matches — confirm.
    #[default]
    Lattice,

    /// Presumably takes matches greedily as they are found — confirm.
    Eager,
}
678
/// How hanja numerals are rendered.
///
/// NOTE(review): the numeral handling lives in the `fallback` module, outside
/// this view; the variant descriptions below are inferred from their names.
#[non_exhaustive]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum NumeralStrategy {
    /// Used by [`EngineOptions::default`]. Presumably reads numerals
    /// phonetically in hangul — confirm.
    HangulPhonetic,

    /// Presumably converts digit sequences positionally to Arabic numerals —
    /// confirm.
    PositionalArabic,

    /// Presumably converts additive numeral notation to Arabic numerals —
    /// confirm.
    AdditiveArabic,

    /// Presumably picks a strategy from context — confirm.
    Smart,
}
721
/// Value stored per hanja key inside [`MapDictionary`].
#[derive(Clone, Debug, Eq, PartialEq)]
struct DictionaryEntry {
    /// The reading for the key.
    reading: String,
    /// Optional alternative reading, surfaced as [`Match::suffix_reading`].
    suffix_reading: Option<String>,
    /// Flags attached to the entry.
    mark: MatchMark,
}
728
729#[derive(Clone, Debug, Default, Eq, PartialEq)]
735pub struct MapDictionary {
736 entries: BTreeMap<String, DictionaryEntry>,
737 max_word_chars: Option<usize>,
738}
739
740impl MapDictionary {
741 pub fn new() -> Self {
743 Self::default()
744 }
745
746 pub fn insert(&mut self, hanja: impl Into<String>, reading: impl Into<String>) {
748 self.insert_marked(hanja, reading, MatchMark::default());
749 }
750
751 pub fn insert_marked(
753 &mut self,
754 hanja: impl Into<String>,
755 reading: impl Into<String>,
756 mark: MatchMark,
757 ) {
758 self.insert_entry(hanja, reading, None, mark);
759 }
760
761 pub fn insert_with_suffix(
766 &mut self,
767 hanja: impl Into<String>,
768 reading: impl Into<String>,
769 suffix: impl Into<String>,
770 ) {
771 self.insert_entry(hanja, reading, Some(suffix.into()), MatchMark::default());
772 }
773
774 fn insert_entry(
775 &mut self,
776 hanja: impl Into<String>,
777 reading: impl Into<String>,
778 suffix_reading: Option<String>,
779 mark: MatchMark,
780 ) {
781 let hanja = hanja.into();
782 let word_chars = hanja.chars().count();
783 self.max_word_chars = Some(self.max_word_chars.map_or(word_chars, |max| {
784 if word_chars > max { word_chars } else { max }
785 }));
786 self.entries.insert(
787 hanja,
788 DictionaryEntry {
789 reading: reading.into(),
790 suffix_reading,
791 mark,
792 },
793 );
794 }
795
796 pub fn is_empty(&self) -> bool {
798 self.entries.is_empty()
799 }
800
801 pub fn len(&self) -> usize {
803 self.entries.len()
804 }
805}
806
807impl HanjaDictionary for MapDictionary {
808 fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
809 Box::new(
810 self.entries
811 .iter()
812 .filter(move |(hanja, _)| s.starts_with(hanja.as_str()))
813 .map(|(hanja, entry)| Match {
814 byte_len: hanja.len(),
815 reading: entry.reading.clone(),
816 suffix_reading: entry.suffix_reading.clone(),
817 mark: entry.mark,
818 }),
819 )
820 }
821
822 fn max_word_chars(&self) -> Option<usize> {
823 self.max_word_chars
824 }
825
826 fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
827 Some(Box::new(self.entries.iter().map(|(hanja, entry)| {
828 DictionaryRecord {
829 hanja: hanja.clone(),
830 reading: entry.reading.clone(),
831 mark: entry.mark,
832 }
833 })))
834 }
835
836 fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
837 self.entries
838 .iter()
839 .any(|(other_hanja, entry)| other_hanja != hanja && entry.reading == reading)
840 }
841}
842
/// Scope payload used for plain-text input (see [`read_plain_text`]).
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct PlainScopeData;

impl ScopeData for PlainScopeData {
    /// Plain text is never a preserve scope.
    fn is_preserve(&self) -> bool {
        false
    }

    /// Plain text does not allow inline markup.
    fn allows_inline_markup(&self) -> bool {
        false
    }
}
862
863pub fn read_plain_text(input: &str) -> Vec<InputToken<PlainScopeData>> {
868 Vec::from([
869 InputToken::Open(Scope::new(PlainScopeData)),
870 InputToken::Text(input.to_string()),
871 InputToken::Close,
872 ])
873}
874
875pub fn write_plain_text<S>(tokens: impl IntoIterator<Item = RenderedToken<S>>) -> String {
883 let mut output = String::new();
884 for token in tokens {
885 match token {
886 RenderedToken::Open(_) | RenderedToken::Close => {}
887 RenderedToken::Text(text) | RenderedToken::Verbatim(text) => output.push_str(&text),
888 RenderedToken::Ruby { base, rt } => {
889 output.push_str(&parens(&base, &rt));
890 }
891 }
892 }
893 output
894}
895
896pub fn process_tokens<S, D>(
902 tokens: impl IntoIterator<Item = InputToken<S>>,
903 dictionary: &D,
904) -> Vec<OutputToken<S>>
905where
906 S: ScopeData,
907 D: HanjaDictionary + ?Sized,
908{
909 process_tokens_iter(tokens, dictionary).collect()
910}
911
912pub fn process_tokens_iter<S, D>(
920 tokens: impl IntoIterator<Item = InputToken<S>>,
921 dictionary: &D,
922) -> alloc::vec::IntoIter<OutputToken<S>>
923where
924 S: ScopeData,
925 D: HanjaDictionary + ?Sized,
926{
927 process_tokens_with_options(tokens, dictionary, EngineOptions::default()).into_iter()
928}
929
930pub fn process_tokens_with_options<S, D>(
935 tokens: impl IntoIterator<Item = InputToken<S>>,
936 dictionary: &D,
937 options: EngineOptions,
938) -> Vec<OutputToken<S>>
939where
940 S: ScopeData,
941 D: HanjaDictionary + ?Sized,
942{
943 let mut engine = Engine::collecting(dictionary, options);
944 let mut output = Vec::new();
945
946 for token in tokens {
947 output.extend(engine.push_token(token));
948 }
949
950 output.extend(engine.finish());
951 output
952}
953
954pub fn process_tokens_iter_with_options<S, D>(
961 tokens: impl IntoIterator<Item = InputToken<S>>,
962 dictionary: &D,
963 options: EngineOptions,
964) -> alloc::vec::IntoIter<OutputToken<S>>
965where
966 S: ScopeData,
967 D: HanjaDictionary + ?Sized,
968{
969 process_tokens_with_options(tokens, dictionary, options).into_iter()
970}
971
972pub fn recover_input_tokens<S>(
994 tokens: impl IntoIterator<Item = Result<InputToken<S>, RecoverableInputError>>,
995 recovery: Recovery,
996) -> Result<Vec<InputToken<S>>, Error>
997where
998 S: ScopeData,
999{
1000 let mut recovered = Vec::new();
1001 for token in tokens {
1002 recovered.push(recover_input_token(token, recovery)?);
1003 }
1004 Ok(recovered)
1005}
1006
1007pub fn recover_input_token<S>(
1014 token: Result<InputToken<S>, RecoverableInputError>,
1015 recovery: Recovery,
1016) -> Result<InputToken<S>, Error>
1017where
1018 S: ScopeData,
1019{
1020 match token {
1021 Ok(token) => Ok(token),
1022 Err(error) => match recovery {
1023 Recovery::Strict => Err(error.into_parts().1),
1024 Recovery::Lenient => {
1025 let (original, error) = error.into_parts();
1026 tracing::warn!(error = %error, "recovering from input reader error");
1027 Ok(InputToken::Verbatim(original))
1028 }
1029 },
1030 }
1031}
1032
1033pub fn process_fallible_tokens<S, D>(
1040 tokens: impl IntoIterator<Item = Result<InputToken<S>, RecoverableInputError>>,
1041 dictionary: &D,
1042 recovery: Recovery,
1043) -> Result<Vec<OutputToken<S>>, Error>
1044where
1045 S: ScopeData,
1046 D: HanjaDictionary + ?Sized,
1047{
1048 process_fallible_tokens_with_options(tokens, dictionary, EngineOptions::default(), recovery)
1049}
1050
1051pub fn process_fallible_tokens_with_options<S, D>(
1058 tokens: impl IntoIterator<Item = Result<InputToken<S>, RecoverableInputError>>,
1059 dictionary: &D,
1060 options: EngineOptions,
1061 recovery: Recovery,
1062) -> Result<Vec<OutputToken<S>>, Error>
1063where
1064 S: ScopeData,
1065 D: HanjaDictionary + ?Sized,
1066{
1067 let recovered = recover_input_tokens(tokens, recovery)?;
1068 Ok(process_tokens_with_options(recovered, dictionary, options))
1069}
1070
/// Streaming processor that turns [`InputToken`]s into [`OutputToken`]s using
/// a borrowed dictionary.
pub struct Engine<'a, S, D>
where
    S: ScopeData,
    D: HanjaDictionary + ?Sized,
{
    /// Dictionary consulted for segmentation.
    dictionary: &'a D,
    /// Options fixed at construction time.
    options: EngineOptions,
    /// Stack of currently open scopes, innermost last.
    scopes: Vec<Scope<S>>,
    /// Text received but not yet turned into output.
    pending_text: String,
    /// Byte length `pending_text` had when it was last judged to be one
    /// fallback run that cannot be flushed yet; `None` when no such judgement
    /// is current.
    pending_unflushable_fallback_run_bytes: Option<usize>,
    /// Fallback state carried across flushes; reset at block boundaries,
    /// preserved text, and verbatim tokens.
    fallback_state: FallbackState,
    /// Whether `push_token` tries to flush a safe prefix after each text token.
    incremental_flush: bool,
}
1095
1096impl<'a, S, D> Engine<'a, S, D>
1097where
1098 S: ScopeData,
1099 D: HanjaDictionary + ?Sized,
1100{
1101 pub fn new(dictionary: &'a D) -> Self {
1103 Self::with_options(dictionary, EngineOptions::default())
1104 }
1105
1106 pub fn with_options(dictionary: &'a D, options: EngineOptions) -> Self {
1108 Self::with_incremental_flush(dictionary, options, true)
1109 }
1110
1111 fn collecting(dictionary: &'a D, options: EngineOptions) -> Self {
1112 Self::with_incremental_flush(dictionary, options, false)
1113 }
1114
1115 fn with_incremental_flush(
1116 dictionary: &'a D,
1117 options: EngineOptions,
1118 incremental_flush: bool,
1119 ) -> Self {
1120 tracing::debug!(
1121 strategy = ?options.segmentation,
1122 "engine created with segmentation strategy"
1123 );
1124 Self {
1125 dictionary,
1126 options,
1127 scopes: Vec::new(),
1128 pending_text: String::new(),
1129 pending_unflushable_fallback_run_bytes: None,
1130 fallback_state: FallbackState::default(),
1131 incremental_flush,
1132 }
1133 }
1134
1135 pub fn push_token(&mut self, token: InputToken<S>) -> Vec<OutputToken<S>> {
1138 let mut output = Vec::new();
1139 match token {
1140 InputToken::Open(scope) => {
1141 self.flush_into(&mut output);
1142 if scope.data().is_block_boundary() {
1143 self.reset_fallback_context();
1144 }
1145 self.scopes.push(scope.clone());
1146 output.push(OutputToken::Open(scope));
1147 }
1148 InputToken::Close => {
1149 self.flush_into(&mut output);
1150 let closes_block_boundary = self
1151 .scopes
1152 .pop()
1153 .is_some_and(|scope| scope.data().is_block_boundary());
1154 output.push(OutputToken::Close);
1155 if closes_block_boundary {
1156 self.reset_fallback_context();
1157 }
1158 }
1159 InputToken::Text(text) => {
1160 if self
1161 .scopes
1162 .last()
1163 .is_some_and(|scope| scope.data().is_preserve())
1164 {
1165 self.flush_into(&mut output);
1166 self.reset_fallback_context();
1167 output.push(OutputToken::Text(text));
1168 } else {
1169 let previous_pending_bytes = self.pending_text.len();
1170 self.pending_text.push_str(&text);
1171 if self
1172 .pending_unflushable_fallback_run_bytes
1173 .is_some_and(|bytes| bytes == previous_pending_bytes)
1174 {
1175 self.pending_unflushable_fallback_run_bytes = Some(previous_pending_bytes);
1176 } else {
1177 self.pending_unflushable_fallback_run_bytes = None;
1178 }
1179 if self.incremental_flush {
1180 self.flush_safe_into(&mut output);
1181 }
1182 }
1183 }
1184 InputToken::Verbatim(text) => {
1185 self.flush_into(&mut output);
1186 self.reset_fallback_context();
1187 output.push(OutputToken::Verbatim(text));
1188 }
1189 }
1190 output
1191 }
1192
1193 pub fn flush(&mut self) -> Vec<OutputToken<S>> {
1195 let mut output = Vec::new();
1196 self.flush_into(&mut output);
1197 output
1198 }
1199
1200 pub fn finish(mut self) -> Vec<OutputToken<S>> {
1202 self.flush()
1203 }
1204
1205 pub fn buffered_chars(&self) -> usize {
1207 self.pending_text.chars().count()
1208 }
1209
1210 fn tail_bound(&self) -> Option<usize> {
1211 self.dictionary.max_word_chars().filter(|bound| *bound > 0)
1212 }
1213
    /// Flushes as much of the buffered text as can be processed now, keeping a
    /// tail that later input might still extend into a longer match.
    fn flush_safe_into(&mut self, output: &mut Vec<OutputToken<S>>) {
        if self.pending_text.is_empty() {
            return;
        }
        // A buffer without hanja takes the simpler non-hanja path.
        if !self.pending_text.chars().any(is_hanja) {
            self.flush_non_hanja_safe_into(output);
            return;
        }

        // Without a usable word-length bound, the only cut point is just after
        // the last whitespace character.
        let Some(bound) = self.tail_bound() else {
            let Some(flush_end) = safe_unknown_bound_flush_end(&self.pending_text) else {
                return;
            };
            self.flush_prefix_into(flush_end, output);
            if !self.pending_text.chars().any(is_hanja) {
                self.flush_non_hanja_safe_into(output);
            }
            return;
        };
        // With a bound, whitespace is still tried first as a cut point.
        if let Some(flush_end) = safe_unknown_bound_flush_end(&self.pending_text) {
            self.flush_prefix_into(flush_end, output);
            if !self.pending_text.chars().any(is_hanja) {
                self.flush_non_hanja_safe_into(output);
            }
            return;
        }
        let buffered_chars = self.buffered_chars();
        if buffered_chars > bound.saturating_mul(10) {
            tracing::debug!(
                buffered_chars,
                dict_max_word_chars = bound,
                "streaming tail buffer is unusually large"
            );
        }
        // Nothing is flushed while the buffer is no longer than one word.
        if buffered_chars <= bound {
            return;
        }

        // The buffer was already judged to be one unflushable fallback run and
        // the new text only extends it: update the marker and keep buffering.
        if self.extends_unflushable_fallback_run(bound) {
            self.pending_unflushable_fallback_run_bytes = Some(self.pending_text.len());
            return;
        }

        // Segments are accepted only while they end within the first
        // `safe_chars` characters, leaving `bound - 1` characters buffered.
        let safe_chars = buffered_chars.saturating_sub(bound).saturating_add(1);
        let segments = segment_text(&self.pending_text, self.dictionary, self.options);
        let mut flush_end = 0;
        let mut flush_segments = Vec::new();
        for segment in &segments {
            let (byte_start, byte_end) = segment_bounds(segment);
            let start_chars = self.pending_text[..byte_start].chars().count();
            let end_chars = self.pending_text[..byte_end].chars().count();
            // Stop at a gap after the accepted prefix, or at a segment starting
            // past the safe region once something has been accepted.
            if byte_start > flush_end || (start_chars > safe_chars && flush_end > 0) {
                break;
            }
            if end_chars > safe_chars {
                break;
            }
            flush_end = byte_end;
            flush_segments.push(segment.clone());
        }

        // If the cut lands inside or at the end of a fallback run, pull it
        // back to the start of that run and drop the segments past it.
        if let Some(fallback_start) = trailing_fallback_run_start(&segments, flush_end) {
            flush_end = fallback_start;
            while flush_segments
                .last()
                .is_some_and(|segment| segment_bounds(segment).1 > flush_end)
            {
                flush_segments.pop();
            }
        }

        if flush_end > 0 {
            self.pending_unflushable_fallback_run_bytes = None;
            self.flush_segments_prefix_into(flush_end, &flush_segments, output);
            if !self.pending_text.chars().any(is_hanja) {
                self.flush_non_hanja_safe_into(output);
            }
        } else if trailing_fallback_run_start(&segments, self.pending_text.len()) == Some(0) {
            // The whole buffer is one fallback run: record its length so the
            // next append can be recognised as a plain extension of it.
            self.pending_unflushable_fallback_run_bytes = Some(self.pending_text.len());
        }
    }
1298
1299 fn extends_unflushable_fallback_run(&self, bound: usize) -> bool {
1300 let Some(previous_bytes) = self.pending_unflushable_fallback_run_bytes else {
1301 return false;
1302 };
1303 if previous_bytes == 0
1304 || previous_bytes > self.pending_text.len()
1305 || !self.pending_text.is_char_boundary(previous_bytes)
1306 {
1307 return false;
1308 }
1309
1310 let appended = &self.pending_text[previous_bytes..];
1311 if appended.is_empty() {
1312 return true;
1313 }
1314 if appended.chars().any(|ch| !is_hanja(ch)) {
1315 return false;
1316 }
1317
1318 let probe_start = suffix_start_for_char_count(
1322 &self.pending_text[..previous_bytes],
1323 bound.saturating_sub(1),
1324 );
1325 let probe = &self.pending_text[probe_start..];
1326 segment_text(probe, self.dictionary, self.options)
1327 .iter()
1328 .all(|segment| {
1329 matches!(
1330 segment,
1331 Segment::Fallback { .. } | Segment::TrivialDictionary { .. }
1332 )
1333 })
1334 }
1335
1336 fn flush_non_hanja_safe_into(&mut self, output: &mut Vec<OutputToken<S>>) {
1337 let flush_end = match self.tail_bound() {
1338 Some(bound) => safe_non_hanja_flush_end(&self.pending_text, bound),
1339 None => safe_unknown_bound_flush_end(&self.pending_text),
1340 };
1341 if let Some(flush_end) = flush_end {
1342 self.flush_prefix_into(flush_end, output);
1343 }
1344 }
1345
1346 fn flush_prefix_into(&mut self, flush_end: usize, output: &mut Vec<OutputToken<S>>) {
1347 if flush_end == self.pending_text.len() {
1348 self.flush_into(output);
1349 return;
1350 }
1351 self.pending_unflushable_fallback_run_bytes = None;
1352 let prefix = self.pending_text[..flush_end].to_string();
1353 let segments = segment_text(&prefix, self.dictionary, self.options);
1354 self.flush_segments_prefix_into(flush_end, &segments, output);
1355 }
1356
1357 fn flush_segments_prefix_into(
1358 &mut self,
1359 flush_end: usize,
1360 segments: &[Segment],
1361 output: &mut Vec<OutputToken<S>>,
1362 ) {
1363 let prefix = self.pending_text[..flush_end].to_string();
1364 process_segments_with_state(
1365 &prefix,
1366 segments,
1367 self.dictionary,
1368 self.options,
1369 &mut self.fallback_state,
1370 output,
1371 );
1372 self.pending_text.replace_range(..flush_end, "");
1373 }
1374
1375 fn flush_into(&mut self, output: &mut Vec<OutputToken<S>>) {
1376 if self.pending_text.is_empty() {
1377 return;
1378 }
1379 self.pending_unflushable_fallback_run_bytes = None;
1380 let text = core::mem::take(&mut self.pending_text);
1381 process_text_with_state(
1382 &text,
1383 self.dictionary,
1384 self.options,
1385 &mut self.fallback_state,
1386 output,
1387 );
1388 }
1389
1390 fn reset_fallback_context(&mut self) {
1391 self.fallback_state = FallbackState::default();
1392 }
1393}
1394
/// Computes how many leading bytes of hanja-free `text` may be flushed while
/// keeping the last `bound - 1` characters of the final whitespace-free span.
///
/// Returns `None` when nothing can be flushed. Everything up to and including
/// the last whitespace character is always flushable; within the span after
/// it, only the characters beyond the kept tail are.
fn safe_non_hanja_flush_end(text: &str, bound: usize) -> Option<usize> {
    if text.is_empty() {
        return None;
    }

    // Byte offset just past the last whitespace character, or 0 without one.
    let span_start = match text.char_indices().rev().find(|(_, ch)| ch.is_whitespace()) {
        Some((index, ch)) => index + ch.len_utf8(),
        None => 0,
    };
    let keep_chars = bound.saturating_sub(1);
    let span = &text[span_start..];
    let span_chars = span.chars().count();

    if span_chars <= keep_chars {
        // The whole span is kept; only the part before it can go.
        return if span_start > 0 { Some(span_start) } else { None };
    }

    let flushable_chars = span_chars - keep_chars;
    let flush_end = match span.char_indices().nth(flushable_chars) {
        Some((offset, _)) => span_start + offset,
        // Nothing is kept (`keep_chars == 0`): the cut is the end of the text.
        None => text.len(),
    };
    if flush_end > 0 {
        Some(flush_end)
    } else {
        None
    }
}
1418
/// Returns the byte offset just past the last whitespace character of `text`,
/// or `None` when `text` contains no whitespace.
fn safe_unknown_bound_flush_end(text: &str) -> Option<usize> {
    let whitespace_start = text.rfind(char::is_whitespace)?;
    let whitespace_len = text[whitespace_start..]
        .chars()
        .next()
        .map_or(0, char::len_utf8);
    Some(whitespace_start + whitespace_len)
}
1424
/// Returns the byte offset at which the last `count` characters of `text`
/// begin.
///
/// A `count` of zero yields `text.len()`; a `count` of at least the number of
/// characters in `text` yields `0`.
fn suffix_start_for_char_count(text: &str, count: usize) -> usize {
    // Walk backwards over at most `count` characters, remembering the start of
    // the earliest one reached.
    let mut suffix_start = text.len();
    for (index, _) in text.char_indices().rev().take(count) {
        suffix_start = index;
    }
    suffix_start
}
1435
1436fn trailing_fallback_run_start(segments: &[Segment], split_byte: usize) -> Option<usize> {
1437 if split_byte == 0 {
1438 return None;
1439 }
1440
1441 for (index, segment) in segments.iter().enumerate() {
1442 let (byte_start, byte_end) = segment_bounds(segment);
1443 if byte_end != split_byte {
1444 continue;
1445 }
1446 if !matches!(
1447 segment,
1448 Segment::Fallback { .. } | Segment::TrivialDictionary { .. }
1449 ) {
1450 return None;
1451 }
1452 if let Some(next) = segments.get(index + 1)
1453 && !matches!(
1454 next,
1455 Segment::Fallback { .. } | Segment::TrivialDictionary { .. }
1456 )
1457 {
1458 return None;
1459 }
1460
1461 let mut run_start = byte_start;
1462 for previous in segments[..index].iter().rev() {
1463 let (previous_start, previous_end) = segment_bounds(previous);
1464 if previous_end != run_start
1465 || !matches!(
1466 previous,
1467 Segment::Fallback { .. } | Segment::TrivialDictionary { .. }
1468 )
1469 {
1470 break;
1471 }
1472 run_start = previous_start;
1473 }
1474 return (run_start < split_byte).then_some(run_start);
1475 }
1476
1477 None
1478}
1479
1480fn process_text_with_state<S, D>(
1481 text: &str,
1482 dictionary: &D,
1483 options: EngineOptions,
1484 fallback_state: &mut FallbackState,
1485 output: &mut Vec<OutputToken<S>>,
1486) where
1487 D: HanjaDictionary + ?Sized,
1488{
1489 let segments = segment_text(text, dictionary, options);
1490 process_segments_with_state(text, &segments, dictionary, options, fallback_state, output);
1491}
1492
1493fn process_trivial_fallback_run<S>(
1494 run_segments: &[Segment],
1495 text: &str,
1496 options: EngineOptions,
1497 state: &mut FallbackState,
1498 output: &mut Vec<OutputToken<S>>,
1499) {
1500 let run_start = segment_bounds(&run_segments[0]).0;
1501 let run_end = segment_bounds(&run_segments[run_segments.len() - 1]).1;
1502 let capacity = run_end.saturating_sub(run_start);
1503 let mut hanja = String::with_capacity(capacity);
1504 let mut reading = String::with_capacity(capacity);
1505 let mut has_dictionary = false;
1506 let mut last_trivial_source: Option<char> = None;
1507 let mut last_trivial_reading: Option<String> = None;
1508
1509 let mut seg_index = 0;
1510 while seg_index < run_segments.len() {
1511 match &run_segments[seg_index] {
1512 Segment::TrivialDictionary {
1513 byte_start,
1514 byte_end,
1515 reading: dict_reading,
1516 suffix_reading,
1517 ..
1518 } => {
1519 let source = &text[*byte_start..*byte_end];
1520 let effective = dictionary_effective_reading(
1521 source,
1522 dict_reading,
1523 suffix_reading.as_deref(),
1524 options,
1525 state.starts_word,
1526 state.previous_reading,
1527 );
1528 if !hanja.is_empty()
1529 && last_trivial_reading.as_deref() == Some(&effective)
1530 && last_trivial_source != source.chars().next()
1531 {
1532 output.push(OutputToken::Annotated(Annotation {
1533 hanja: core::mem::take(&mut hanja),
1534 reading: core::mem::take(&mut reading),
1535 homophone: false,
1536 require_hanja: false,
1537 require_hangul: false,
1538 first_in_context: true,
1539 skip_annotation: false,
1540 from_dictionary: has_dictionary,
1541 from_source_gloss: false,
1542 }));
1543 }
1544 hanja.push_str(source);
1545 reading.push_str(&effective);
1546 update_fallback_state_for_reading(&effective, state);
1547 has_dictionary = true;
1548 last_trivial_source = source.chars().next();
1549 last_trivial_reading = Some(effective);
1550 seg_index += 1;
1551 }
1552 Segment::Fallback { byte_start: _, .. } => {
1553 last_trivial_source = None;
1554 last_trivial_reading = None;
1555 let fb_start = seg_index;
1556 while seg_index < run_segments.len()
1557 && matches!(&run_segments[seg_index], Segment::Fallback { .. })
1558 {
1559 seg_index += 1;
1560 }
1561 let fb_text = &text[segment_bounds(&run_segments[fb_start]).0
1562 ..segment_bounds(&run_segments[seg_index - 1]).1];
1563 for part in phoneticize_fallback_run_with_state(fb_text, options, state) {
1564 match part {
1565 FallbackPart::Annotation {
1566 hanja: part_hanja,
1567 reading: part_reading,
1568 } => {
1569 if part_hanja.chars().any(is_hanja_numeral) {
1570 if !hanja.is_empty() {
1571 output.push(OutputToken::Annotated(Annotation {
1572 hanja: core::mem::take(&mut hanja),
1573 reading: core::mem::take(&mut reading),
1574 homophone: false,
1575 require_hanja: false,
1576 require_hangul: false,
1577 first_in_context: true,
1578 skip_annotation: false,
1579 from_dictionary: has_dictionary,
1580 from_source_gloss: false,
1581 }));
1582 has_dictionary = false;
1583 }
1584 output.push(OutputToken::Annotated(Annotation {
1585 hanja: part_hanja,
1586 reading: part_reading,
1587 homophone: false,
1588 require_hanja: false,
1589 require_hangul: false,
1590 first_in_context: true,
1591 skip_annotation: false,
1592 from_dictionary: false,
1593 from_source_gloss: false,
1594 }));
1595 } else {
1596 hanja.push_str(&part_hanja);
1597 reading.push_str(&part_reading);
1598 }
1599 }
1600 FallbackPart::ReadingText(t) | FallbackPart::Text(t) => {
1601 if !hanja.is_empty() {
1602 output.push(OutputToken::Annotated(Annotation {
1603 hanja: core::mem::take(&mut hanja),
1604 reading: core::mem::take(&mut reading),
1605 homophone: false,
1606 require_hanja: false,
1607 require_hangul: false,
1608 first_in_context: true,
1609 skip_annotation: false,
1610 from_dictionary: has_dictionary,
1611 from_source_gloss: false,
1612 }));
1613 has_dictionary = false;
1614 }
1615 push_text(output, &t);
1616 }
1617 }
1618 }
1619 }
1620 _ => unreachable!("run must contain only TrivialDictionary | Fallback"),
1621 }
1622 }
1623
1624 if !hanja.is_empty() {
1625 output.push(OutputToken::Annotated(Annotation {
1626 hanja,
1627 reading,
1628 homophone: false,
1629 require_hanja: false,
1630 require_hangul: false,
1631 first_in_context: true,
1632 skip_annotation: false,
1633 from_dictionary: has_dictionary,
1634 from_source_gloss: false,
1635 }));
1636 }
1637}
1638
1639fn process_segments_with_state<S, D>(
1640 text: &str,
1641 segments: &[Segment],
1642 _dictionary: &D,
1643 options: EngineOptions,
1644 fallback_state: &mut FallbackState,
1645 output: &mut Vec<OutputToken<S>>,
1646) where
1647 D: HanjaDictionary + ?Sized,
1648{
1649 let mut index = 0;
1650
1651 while index < segments.len() {
1652 match &segments[index] {
1653 Segment::Dictionary {
1654 byte_start,
1655 byte_end,
1656 reading,
1657 suffix_reading,
1658 mark,
1659 } => {
1660 let source = &text[*byte_start..*byte_end];
1661 let effective = dictionary_effective_reading(
1662 source,
1663 reading,
1664 suffix_reading.as_deref(),
1665 options,
1666 fallback_state.starts_word,
1667 fallback_state.previous_reading,
1668 );
1669 output.push(OutputToken::Annotated(Annotation {
1670 hanja: source.to_string(),
1671 homophone: false,
1672 reading: effective.clone(),
1673 require_hanja: mark.require_hanja,
1674 require_hangul: mark.require_hangul,
1675 first_in_context: true,
1676 skip_annotation: false,
1677 from_dictionary: true,
1678 from_source_gloss: false,
1679 }));
1680 if should_preserve_dictionary_context(source, &effective, options) {
1681 update_fallback_state_for_reading(&effective, fallback_state);
1682 } else {
1683 *fallback_state = FallbackState::default();
1684 }
1685 index += 1;
1686 }
1687 Segment::TrivialDictionary {
1688 byte_start,
1689 byte_end,
1690 ..
1691 }
1692 | Segment::Fallback {
1693 byte_start,
1694 byte_end,
1695 } => {
1696 let run_start = index;
1697 let mut merged_end = *byte_end;
1698 while let Some(
1699 Segment::TrivialDictionary {
1700 byte_end: next_end, ..
1701 }
1702 | Segment::Fallback {
1703 byte_end: next_end, ..
1704 },
1705 ) = segments.get(index + 1)
1706 {
1707 merged_end = *next_end;
1708 index += 1;
1709 }
1710 let has_dictionary = segments[run_start..=index]
1711 .iter()
1712 .any(|s| matches!(s, Segment::TrivialDictionary { .. }));
1713 if has_dictionary {
1714 process_trivial_fallback_run(
1715 &segments[run_start..=index],
1716 text,
1717 options,
1718 fallback_state,
1719 output,
1720 );
1721 } else {
1722 process_fallback_text(
1723 &text[*byte_start..merged_end],
1724 options,
1725 fallback_state,
1726 output,
1727 );
1728 }
1729 index += 1;
1730 }
1731 Segment::NumeralText { text, .. } => {
1732 push_text(output, text);
1733 update_fallback_state_for_text(text, fallback_state);
1734 index += 1;
1735 }
1736 Segment::Text {
1737 byte_start,
1738 byte_end,
1739 } => {
1740 let text_segment = &text[*byte_start..*byte_end];
1741 push_text(output, text_segment);
1742 update_fallback_state_for_text(text_segment, fallback_state);
1743 index += 1;
1744 }
1745 }
1746 }
1747}
1748
1749fn segment_bounds(segment: &Segment) -> (usize, usize) {
1750 match segment {
1751 Segment::Dictionary {
1752 byte_start,
1753 byte_end,
1754 ..
1755 }
1756 | Segment::TrivialDictionary {
1757 byte_start,
1758 byte_end,
1759 ..
1760 }
1761 | Segment::Fallback {
1762 byte_start,
1763 byte_end,
1764 }
1765 | Segment::NumeralText {
1766 byte_start,
1767 byte_end,
1768 ..
1769 }
1770 | Segment::Text {
1771 byte_start,
1772 byte_end,
1773 } => (*byte_start, *byte_end),
1774 }
1775}
1776
1777fn process_fallback_text<S>(
1778 text: &str,
1779 options: EngineOptions,
1780 state: &mut FallbackState,
1781 output: &mut Vec<OutputToken<S>>,
1782) {
1783 for part in phoneticize_fallback_run_with_state(text, options, state) {
1784 match part {
1785 FallbackPart::Annotation { hanja, reading } => {
1786 output.push(OutputToken::Annotated(Annotation {
1787 hanja,
1788 reading,
1789 homophone: false,
1790 require_hanja: false,
1791 require_hangul: false,
1792 first_in_context: true,
1793 skip_annotation: false,
1794 from_dictionary: false,
1795 from_source_gloss: false,
1796 }));
1797 }
1798 FallbackPart::ReadingText(text) => push_text(output, &text),
1799 FallbackPart::Text(text) => push_text(output, &text),
1800 }
1801 }
1802}
1803
1804fn update_fallback_state_for_text(text: &str, state: &mut FallbackState) {
1805 if text.is_empty() {
1806 return;
1807 }
1808
1809 if text
1810 .chars()
1811 .last()
1812 .is_some_and(|character| character.is_whitespace())
1813 {
1814 *state = FallbackState::default();
1815 return;
1816 }
1817
1818 let Some(last) = text.chars().rev().find(|ch| !ch.is_whitespace()) else {
1819 return;
1820 };
1821
1822 if last.is_alphanumeric() {
1823 state.starts_word = false;
1824 state.previous_reading = Some(last);
1825 } else {
1826 *state = FallbackState::default();
1827 }
1828}
1829
1830fn dictionary_effective_reading(
1852 source: &str,
1853 reading: &str,
1854 suffix_reading: Option<&str>,
1855 options: EngineOptions,
1856 starts_word: bool,
1857 previous_reading: Option<char>,
1858) -> String {
1859 if let Some(suffix) = suffix_reading {
1860 return if starts_word && options.initial_sound_law {
1861 reading.to_string()
1862 } else {
1863 suffix.to_string()
1864 };
1865 }
1866
1867 let mut chars = source.chars();
1868 if let (Some(ch), None) = (chars.next(), chars.next())
1869 && let Some(base) = phoneticize_hanja_char(ch)
1870 {
1871 let initial = apply_initial_sound_law_to_first_syllable(base);
1872 if initial != base && (reading == base || reading == initial) {
1873 let apply_law = options.initial_sound_law
1874 && (starts_word || should_apply_yeol_yul(previous_reading, base));
1875 return if apply_law { initial } else { base.to_string() };
1876 }
1877 }
1878
1879 reading.to_string()
1880}
1881
1882fn should_preserve_dictionary_context(source: &str, reading: &str, options: EngineOptions) -> bool {
1883 if reading.chars().all(char::is_whitespace) {
1884 return false;
1885 }
1886
1887 if source.chars().all(is_hanja) {
1888 match fallback_reading_for_run(source, options) {
1889 Some(fallback_reading) => {
1890 fallback_reading == reading || has_one_hangul_syllable_per_hanja(source, reading)
1891 }
1892 None => has_one_hangul_syllable_per_hanja(source, reading),
1893 }
1894 } else {
1895 true
1896 }
1897}
1898
1899fn has_one_hangul_syllable_per_hanja(source: &str, reading: &str) -> bool {
1900 let source_len = source.chars().count();
1901 let mut reading_len = 0;
1902
1903 for ch in reading.chars() {
1904 if !is_hangul_syllable(ch) {
1905 return false;
1906 }
1907 reading_len += 1;
1908 }
1909
1910 reading_len == source_len
1911}
1912
/// Returns `true` for code points in `U+AC00..=U+D7A3`.
fn is_hangul_syllable(ch: char) -> bool {
    matches!(ch, '\u{ac00}'..='\u{d7a3}')
}
1916
1917fn update_fallback_state_for_reading(reading: &str, state: &mut FallbackState) {
1918 let Some(last) = reading.chars().rev().find(|ch| !ch.is_whitespace()) else {
1919 *state = FallbackState::default();
1920 return;
1921 };
1922
1923 if last.is_alphanumeric() {
1924 state.starts_word = false;
1925 state.previous_reading = Some(last);
1926 } else {
1927 *state = FallbackState::default();
1928 }
1929}
1930
1931fn push_text<S>(output: &mut Vec<OutputToken<S>>, text: &str) {
1932 if text.is_empty() {
1933 return;
1934 }
1935
1936 match output.last_mut() {
1937 Some(OutputToken::Text(existing)) => existing.push_str(text),
1938 _ => output.push(OutputToken::Text(text.to_string())),
1939 }
1940}
1941
/// Returns `true` when `ch` lies in one of the code point ranges this crate treats as hanja.
pub fn is_hanja(ch: char) -> bool {
    match ch {
        // The single code point U+3007 (〇).
        '\u{3007}' => true,
        // Ranges in the Basic Multilingual Plane.
        '\u{2F00}'..='\u{2FFF}'
        | '\u{3400}'..='\u{4DBF}'
        | '\u{4E00}'..='\u{9FFF}'
        | '\u{F900}'..='\u{FAFF}' => true,
        // Ranges in plane 2.
        '\u{20000}'..='\u{2A6DF}'
        | '\u{2A700}'..='\u{2B73F}'
        | '\u{2B740}'..='\u{2B81F}'
        | '\u{2B820}'..='\u{2CEAF}'
        | '\u{2CEB0}'..='\u{2EBEF}'
        | '\u{2EBF0}'..='\u{2EE5F}'
        | '\u{2F800}'..='\u{2FA1F}' => true,
        // Ranges in plane 3.
        '\u{30000}'..='\u{3134F}' | '\u{31350}'..='\u{323AF}' | '\u{323B0}'..='\u{3347F}' => true,
        _ => false,
    }
}
1963
/// Selects how an annotation's hanja and hangul reading are combined when rendered.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum RenderMode {
    /// Renders the hangul reading. Annotations flagged `require_hanja` or `homophone` are
    /// rendered from the reading and the hanja via the `parens` helper instead
    /// (presumably `reading(hanja)` — confirm against `parens`).
    HangulOnly,

    /// Always renders the reading and the hanja via the `parens` helper, reading first.
    HangulHanjaParens,

    /// Always renders the hanja and the reading via the `parens` helper, hanja first.
    HanjaHangulParens,

    /// Ruby rendering; the [`RubyBase`] selects whether the reading or the hanja is the
    /// primary text (the primary text is what is emitted for `skip_annotation` annotations).
    Ruby(RubyBase),

    /// Keeps the hanja as the primary text; it is what is emitted for annotations flagged
    /// `skip_annotation`.
    Original,
}
1989
/// Selects the primary text of a [`RenderMode::Ruby`] rendering.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum RubyBase {
    /// The hangul reading is the primary text.
    OnHangul,

    /// The hanja is the primary text.
    OnHanja,
}
2003
/// Gloss style carried in [`RenderOptions::original_gloss`].
///
/// NOTE(review): presumably controls how the reading is attached in
/// [`RenderMode::Original`] — confirm against the renderer.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum OriginalGloss {
    /// Parenthesised gloss; this is the default variant.
    #[default]
    Parens,

    /// Ruby-style gloss.
    Ruby,
}
2022
/// Options consumed by the renderer. A bare [`RenderMode`] converts into this type with the
/// default [`OriginalGloss`].
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct RenderOptions {
    /// How each annotation is rendered.
    pub mode: RenderMode,

    /// Gloss style; see [`OriginalGloss`].
    pub original_gloss: OriginalGloss,
}
2037
2038impl Default for RenderOptions {
2039 fn default() -> Self {
2040 Self {
2041 mode: RenderMode::HangulOnly,
2042 original_gloss: OriginalGloss::Parens,
2043 }
2044 }
2045}
2046
2047impl From<RenderMode> for RenderOptions {
2048 fn from(mode: RenderMode) -> Self {
2049 Self {
2050 mode,
2051 original_gloss: OriginalGloss::default(),
2052 }
2053 }
2054}
2055
/// Span of tokens that the context-sensitive passes (homophone marking and first-occurrence
/// filtering) treat as one context.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ContextWindow {
    /// No context: tokens pass through untouched.
    Off,

    /// A context ends when a scope reporting `is_block_boundary()` opens, and again when
    /// that scope closes.
    PerBlock,

    /// A context ends when a scope reporting `is_section_boundary()` opens.
    PerSection,

    /// The whole token stream forms a single context.
    PerDocument,
}
2079
/// Selects which forms count when deciding whether an annotation has a homophone.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum HomophoneDetection {
    /// Only dictionary-backed forms occurring in the same context are compared; this is the
    /// default.
    #[default]
    ContextLocal,

    /// The dictionary is consulted as well: through an index built from its entries when
    /// `entries()` yields them, otherwise through per-annotation `has_homophone` lookups.
    DictionaryWide,
}
2108
/// Flag that a user directive sets on every annotation it matches.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum DirectiveAction {
    /// Sets `require_hanja` on the annotation.
    RequireHanja,

    /// Sets `require_hangul` on the annotation.
    RequireHangul,

    /// Sets `skip_annotation` on the annotation.
    SkipAnnotation,
}
2121
/// Ordered collection of user rules, each pairing a predicate over annotations with the
/// [`DirectiveAction`] to apply when it matches.
#[derive(Default)]
pub struct UserDirectives<'a> {
    // Rules in insertion order.
    rules: Vec<UserDirectiveRule<'a>>,
}
2131
2132impl<'a> UserDirectives<'a> {
2133 pub fn new() -> Self {
2135 Self::default()
2136 }
2137
2138 pub fn require_hanja(&mut self, hanja: impl Into<String>) {
2140 self.add_literal(hanja, DirectiveAction::RequireHanja);
2141 }
2142
2143 pub fn require_hangul(&mut self, hanja: impl Into<String>) {
2145 self.add_literal(hanja, DirectiveAction::RequireHangul);
2146 }
2147
2148 pub fn skip_annotation(&mut self, hanja: impl Into<String>) {
2150 self.add_literal(hanja, DirectiveAction::SkipAnnotation);
2151 }
2152
2153 pub fn add_literal(&mut self, hanja: impl Into<String>, action: DirectiveAction) {
2155 self.rules.push(UserDirectiveRule {
2156 predicate: UserDirectivePredicate::Literal(hanja.into()),
2157 action,
2158 });
2159 }
2160
2161 pub fn add_predicate(
2163 &mut self,
2164 predicate: impl Fn(&Annotation) -> bool + 'a,
2165 action: DirectiveAction,
2166 ) {
2167 self.rules.push(UserDirectiveRule {
2168 predicate: UserDirectivePredicate::Predicate(Box::new(predicate)),
2169 action,
2170 });
2171 }
2172
2173 pub fn is_empty(&self) -> bool {
2175 self.rules.is_empty()
2176 }
2177
2178 pub fn apply<S>(&self, token: OutputToken<S>) -> OutputToken<S> {
2185 match token {
2186 OutputToken::Annotated(mut annotation) => {
2187 for rule in &self.rules {
2188 if !rule.predicate.matches(&annotation) {
2189 continue;
2190 }
2191 match rule.action {
2192 DirectiveAction::RequireHanja => annotation.require_hanja = true,
2193 DirectiveAction::RequireHangul => annotation.require_hangul = true,
2194 DirectiveAction::SkipAnnotation => annotation.skip_annotation = true,
2195 }
2196 }
2197 OutputToken::Annotated(annotation)
2198 }
2199 token => token,
2200 }
2201 }
2202}
2203
/// One user rule: the condition to test and the flag to set when it holds.
struct UserDirectiveRule<'a> {
    // Decides whether the rule applies to a given annotation.
    predicate: UserDirectivePredicate<'a>,
    // Flag to set on matching annotations.
    action: DirectiveAction,
}
2208
/// Condition of a user rule.
enum UserDirectivePredicate<'a> {
    /// Matches annotations whose hanja equals this string exactly.
    Literal(String),
    /// Matches annotations for which the boxed callback returns `true`.
    Predicate(Box<dyn Fn(&Annotation) -> bool + 'a>),
}
2213
2214impl UserDirectivePredicate<'_> {
2215 fn matches(&self, annotation: &Annotation) -> bool {
2216 match self {
2217 Self::Literal(hanja) => annotation.hanja == *hanja,
2218 Self::Predicate(predicate) => predicate(annotation),
2219 }
2220 }
2221}
2222
2223pub fn mark_homophones<S, D>(
2229 tokens: impl IntoIterator<Item = OutputToken<S>>,
2230 dictionary: &D,
2231 window: ContextWindow,
2232) -> Vec<OutputToken<S>>
2233where
2234 S: ScopeData,
2235 D: HanjaDictionary + ?Sized,
2236{
2237 mark_homophones_with_detection(tokens, dictionary, window, HomophoneDetection::ContextLocal)
2238}
2239
2240pub fn mark_homophones_with_detection<S, D>(
2252 tokens: impl IntoIterator<Item = OutputToken<S>>,
2253 dictionary: &D,
2254 window: ContextWindow,
2255 detection: HomophoneDetection,
2256) -> Vec<OutputToken<S>>
2257where
2258 S: ScopeData,
2259 D: HanjaDictionary + ?Sized,
2260{
2261 if window == ContextWindow::Off {
2262 return tokens.into_iter().collect();
2263 }
2264
2265 let index = match detection {
2266 HomophoneDetection::ContextLocal => None,
2267 HomophoneDetection::DictionaryWide => HomophoneIndex::from_dictionary(dictionary),
2268 };
2269 let lookup_fallback = match detection {
2270 HomophoneDetection::ContextLocal => None,
2271 HomophoneDetection::DictionaryWide => index.is_none().then_some(dictionary),
2272 };
2273 ContextMiddleware::new(window, |tokens| {
2274 mark_homophones_in_context(tokens, index.as_ref(), lookup_fallback);
2275 })
2276 .process(tokens)
2277}
2278
2279pub fn filter_first_occurrences<S>(
2285 tokens: impl IntoIterator<Item = OutputToken<S>>,
2286 window: ContextWindow,
2287) -> Vec<OutputToken<S>>
2288where
2289 S: ScopeData,
2290{
2291 ContextMiddleware::new(window, filter_first_occurrences_in_context).process(tokens)
2292}
2293
/// Function pointer run over one buffered context; used by [`FirstOccurrenceFilter`].
type ContextApply<S> = fn(&mut [OutputToken<S>]);
/// Boxed closure run over one buffered context; used by [`HomophoneMarker`], whose closure
/// captures the dictionary borrow and the optional index.
type HomophoneApply<'a, S> = Box<dyn FnMut(&mut [OutputToken<S>]) + 'a>;
2296
/// Streaming counterpart of [`mark_homophones_with_detection`]: tokens are pushed one at a
/// time and handed back once the context they belong to is complete.
pub struct HomophoneMarker<'a, S>
where
    S: ScopeData,
{
    // Buffers tokens per context and runs the homophone pass on each flush.
    inner: ContextMiddleware<S, HomophoneApply<'a, S>>,
}
2310
2311impl<'a, S> HomophoneMarker<'a, S>
2312where
2313 S: ScopeData,
2314{
2315 pub fn new<D>(dictionary: &'a D, window: ContextWindow) -> Self
2321 where
2322 D: HanjaDictionary + ?Sized,
2323 {
2324 Self::with_detection(dictionary, window, HomophoneDetection::ContextLocal)
2325 }
2326
2327 pub fn with_detection<D>(
2335 dictionary: &'a D,
2336 window: ContextWindow,
2337 detection: HomophoneDetection,
2338 ) -> Self
2339 where
2340 D: HanjaDictionary + ?Sized,
2341 {
2342 let index = match detection {
2343 _ if window == ContextWindow::Off => None,
2344 HomophoneDetection::ContextLocal => None,
2345 HomophoneDetection::DictionaryWide => HomophoneIndex::from_dictionary(dictionary),
2346 };
2347 let lookup_fallback = match detection {
2348 HomophoneDetection::ContextLocal => None,
2349 HomophoneDetection::DictionaryWide => index.is_none().then_some(dictionary),
2350 };
2351 Self {
2352 inner: ContextMiddleware::new(
2353 window,
2354 Box::new(move |tokens| {
2355 mark_homophones_in_context(tokens, index.as_ref(), lookup_fallback);
2356 }),
2357 ),
2358 }
2359 }
2360
2361 pub fn push_token(&mut self, token: OutputToken<S>) -> Vec<OutputToken<S>> {
2363 self.inner.push_token(token)
2364 }
2365
2366 pub fn finish(self) -> Vec<OutputToken<S>> {
2368 self.inner.finish()
2369 }
2370}
2371
/// Streaming counterpart of [`filter_first_occurrences`]: tokens are pushed one at a time
/// and handed back once the context they belong to is complete.
pub struct FirstOccurrenceFilter<S>
where
    S: ScopeData,
{
    // Buffers tokens per context and runs the first-occurrence pass on each flush.
    inner: ContextMiddleware<S, ContextApply<S>>,
}
2382
2383impl<S> FirstOccurrenceFilter<S>
2384where
2385 S: ScopeData,
2386{
2387 pub fn new(window: ContextWindow) -> Self {
2389 Self {
2390 inner: ContextMiddleware::new(window, filter_first_occurrences_in_context::<S>),
2391 }
2392 }
2393
2394 pub fn push_token(&mut self, token: OutputToken<S>) -> Vec<OutputToken<S>> {
2396 self.inner.push_token(token)
2397 }
2398
2399 pub fn finish(self) -> Vec<OutputToken<S>> {
2401 self.inner.finish()
2402 }
2403}
2404
/// Streaming pass that folds a gloss already written in the source text into the
/// neighbouring annotation.
///
/// Two shapes are recognised: an annotation followed by `(reading)`, and an annotation that
/// sits inside parentheses directly after its reading (`reading(` before, `)` after). A
/// collapsed annotation is flagged `require_hanja`, `require_hangul` and `from_source_gloss`.
pub struct RedundantParenCollapser<S>
where
    S: ScopeData,
{
    // When false, `push_token` hands every token straight back and `finish` returns nothing.
    enabled: bool,
    // Trailing text not yet emitted because it could be the `reading(` prefix of an
    // annotation that has not arrived yet (see `hangul_first_tail_start`).
    held_tail: String,
    // Annotation whose surroundings are still being examined.
    pending_annotation: Option<Annotation>,
    // Held text that directly precedes `pending_annotation`.
    preceding: String,
    // Text received so far after `pending_annotation`.
    following: String,
    // Ties the collapser to one scope type without storing a value of it.
    _scope: PhantomData<fn(S)>,
}
2482
impl<S> RedundantParenCollapser<S>
where
    S: ScopeData,
{
    /// Creates a collapser. With `enabled == false` it is a pass-through.
    pub fn new(enabled: bool) -> Self {
        Self {
            enabled,
            held_tail: String::new(),
            pending_annotation: None,
            preceding: String::new(),
            following: String::new(),
            _scope: PhantomData,
        }
    }

    /// Feeds one token and returns the tokens that became final as a result.
    ///
    /// Text around an annotation may be withheld until enough of it has arrived to decide
    /// whether it forms a redundant gloss; any token other than text or an annotation forces
    /// a decision and flushes everything held.
    pub fn push_token(&mut self, token: OutputToken<S>) -> Vec<OutputToken<S>> {
        if !self.enabled {
            return Vec::from([token]);
        }
        let mut output = Vec::new();
        match token {
            OutputToken::Annotated(annotation) => {
                // Settle the previous annotation first; what remains held afterwards is the
                // text directly in front of the new one.
                self.finalize_pending(&mut output);
                self.preceding = core::mem::take(&mut self.held_tail);
                self.pending_annotation = Some(annotation);
            }
            OutputToken::Text(text) => {
                if self.pending_annotation.is_some() {
                    self.following.push_str(&text);
                    self.resolve_following(&mut output);
                } else {
                    self.held_tail.push_str(&text);
                    self.emit_held_prefix(&mut output);
                }
            }
            boundary => {
                // A gloss is not matched across a non-text token: decide now and flush.
                self.finalize_pending(&mut output);
                if !self.held_tail.is_empty() {
                    output.push(OutputToken::Text(core::mem::take(&mut self.held_tail)));
                }
                output.push(boundary);
            }
        }
        output
    }

    /// Ends the stream: settles any pending annotation and returns all text still held.
    /// A disabled collapser never holds anything, so it returns an empty vector.
    pub fn finish(mut self) -> Vec<OutputToken<S>> {
        if !self.enabled {
            return Vec::new();
        }
        let mut output = Vec::new();
        self.finalize_pending(&mut output);
        if !self.held_tail.is_empty() {
            output.push(OutputToken::Text(core::mem::take(&mut self.held_tail)));
        }
        output
    }

    /// Emits the part of `held_tail` that can no longer belong to a hangul-first gloss,
    /// keeping only the suffix starting at `hangul_first_tail_start`.
    fn emit_held_prefix(&mut self, output: &mut Vec<OutputToken<S>>) {
        let split = hangul_first_tail_start(&self.held_tail);
        if split > 0 {
            let suffix = self.held_tail.split_off(split);
            // After `split_off`, `held_tail` holds the prefix; swap the suffix back in.
            let prefix = core::mem::replace(&mut self.held_tail, suffix);
            output.push(OutputToken::Text(prefix));
        }
    }

    /// Forces a decision for the pending annotation, if there is one.
    fn finalize_pending(&mut self, output: &mut Vec<OutputToken<S>>) {
        if self.pending_annotation.is_some() {
            self.decide_following(true, output);
        }
    }

    /// Tries to decide the pending annotation with the text received so far; may leave it
    /// pending when more text is needed.
    fn resolve_following(&mut self, output: &mut Vec<OutputToken<S>>) {
        self.decide_following(false, output);
    }

    /// Classifies the text around the pending annotation and acts on the verdict.
    ///
    /// With `flush == true` the classifier never asks for more text. Panics if no annotation
    /// is pending; both callers guarantee that one is.
    fn decide_following(&mut self, flush: bool, output: &mut Vec<OutputToken<S>>) {
        let annotation = self
            .pending_annotation
            .as_ref()
            .expect("decide_following called with a pending annotation");
        match classify_following(&self.preceding, annotation, &self.following, flush) {
            FollowingMatch::NeedMore => return,
            FollowingMatch::NoMatch => {
                if !self.preceding.is_empty() {
                    output.push(OutputToken::Text(core::mem::take(&mut self.preceding)));
                }
                output.push(OutputToken::Annotated(
                    self.pending_annotation.take().expect("pending annotation"),
                ));
                // The following text may itself precede the next annotation, so it goes
                // back into the held tail rather than straight to the output.
                self.held_tail = core::mem::take(&mut self.following);
            }
            FollowingMatch::HanjaFirst {
                collapsed,
                leftover,
            } => {
                if !self.preceding.is_empty() {
                    output.push(OutputToken::Text(core::mem::take(&mut self.preceding)));
                }
                output.push(OutputToken::Annotated(collapsed));
                self.pending_annotation = None;
                self.held_tail = leftover;
                self.following.clear();
            }
            FollowingMatch::HangulFirst {
                remaining_preceding,
                collapsed,
                leftover,
            } => {
                // The reading and the opening parenthesis were consumed from `preceding`;
                // only what came before them is emitted.
                if !remaining_preceding.is_empty() {
                    output.push(OutputToken::Text(remaining_preceding));
                }
                output.push(OutputToken::Annotated(collapsed));
                self.pending_annotation = None;
                self.preceding.clear();
                self.held_tail = leftover;
                self.following.clear();
            }
        }
        self.emit_held_prefix(output);
    }
}
2628
/// Upper bound on the number of trailing hangul syllables that `hangul_first_tail_start`
/// keeps held back.
const MAX_PRECEDING_READING_CHARS: usize = 64;
2635
2636fn hangul_first_tail_start(text: &str) -> usize {
2641 let mut start = text.len();
2642 let mut chars = text.char_indices().rev().peekable();
2643 if let Some(&(index, '(')) = chars.peek() {
2644 start = index;
2645 chars.next();
2646 }
2647 let mut held = 0;
2648 while held < MAX_PRECEDING_READING_CHARS {
2649 match chars.peek() {
2650 Some(&(index, ch)) if is_hangul_syllable(ch) => {
2651 start = index;
2652 held += 1;
2653 chars.next();
2654 }
2655 _ => break,
2656 }
2657 }
2658 start
2659}
2660
2661pub fn collapse_redundant_parens<S>(
2665 tokens: impl IntoIterator<Item = OutputToken<S>>,
2666 enabled: bool,
2667) -> Vec<OutputToken<S>>
2668where
2669 S: ScopeData,
2670{
2671 if !enabled {
2672 return tokens.into_iter().collect();
2673 }
2674 let mut collapser = RedundantParenCollapser::new(true);
2675 let mut output = Vec::new();
2676 for token in tokens {
2677 output.extend(collapser.push_token(token));
2678 }
2679 output.extend(collapser.finish());
2680 output
2681}
2682
/// Result of comparing a gloss found in the source text with an annotation's reading.
enum ReadingMatch {
    /// The gloss equals the annotation's reading; keep the reading as it is.
    Keep,
    /// The gloss is a different but valid reading; replace the annotation's reading with it.
    Override(String),
}
2690
2691fn classify_reading(hanja: &str, reading: &str, candidate: &str) -> Option<ReadingMatch> {
2695 if candidate == reading {
2696 Some(ReadingMatch::Keep)
2697 } else if is_valid_alternative_reading(hanja, candidate) {
2698 Some(ReadingMatch::Override(candidate.to_string()))
2699 } else {
2700 None
2701 }
2702}
2703
2704fn is_valid_alternative_reading(hanja: &str, candidate: &str) -> bool {
2708 let mut hanja_chars = hanja.chars();
2709 let mut candidate_chars = candidate.chars();
2710 let mut matched_any = false;
2711 loop {
2712 match (hanja_chars.next(), candidate_chars.next()) {
2713 (Some(hanja_char), Some(syllable)) => {
2714 if !is_valid_char_reading(hanja_char, syllable) {
2715 return false;
2716 }
2717 matched_any = true;
2718 }
2719 (None, None) => return matched_any,
2720 _ => return false,
2722 }
2723 }
2724}
2725
2726fn is_valid_char_reading(source: char, syllable: char) -> bool {
2731 if !is_hangul_syllable(syllable) {
2732 return false;
2733 }
2734 let readings = khangul_all_readings(source);
2735 if readings.is_empty() {
2736 return source == syllable;
2740 }
2741 readings.iter().any(|reading| {
2742 reading_is_syllable(reading, syllable)
2743 || reading_matches_with_initial_sound_law(reading, syllable)
2744 })
2745}
2746
/// Returns `true` when `reading` consists of exactly the one character `syllable`.
fn reading_is_syllable(reading: &str, syllable: char) -> bool {
    let mut chars = reading.chars();
    matches!((chars.next(), chars.next()), (Some(first), None) if first == syllable)
}
2752
2753fn collapse_annotation(mut annotation: Annotation, reading_match: ReadingMatch) -> Annotation {
2756 if let ReadingMatch::Override(reading) = reading_match {
2757 annotation.reading = reading;
2758 }
2759 annotation.require_hanja = true;
2760 annotation.require_hangul = true;
2761 annotation.from_source_gloss = true;
2762 annotation
2763}
2764
/// Verdict of `classify_following` on the text around a pending annotation.
enum FollowingMatch {
    /// Not enough following text has arrived to decide.
    NeedMore,
    /// The surrounding text is not a gloss of the annotation.
    NoMatch,
    /// The annotation was followed by a parenthesised reading.
    HanjaFirst {
        /// Annotation with the gloss folded in.
        collapsed: Annotation,
        /// Following text after the closing parenthesis.
        leftover: String,
    },
    /// The annotation sat inside parentheses directly after its reading.
    HangulFirst {
        /// Preceding text before the consumed reading and opening parenthesis.
        remaining_preceding: String,
        /// Annotation with the gloss folded in.
        collapsed: Annotation,
        /// Following text after the closing parenthesis.
        leftover: String,
    },
}
2785
2786fn classify_following(
2795 preceding: &str,
2796 annotation: &Annotation,
2797 following: &str,
2798 flush: bool,
2799) -> FollowingMatch {
2800 let Some(first) = following.chars().next() else {
2801 return if flush {
2802 FollowingMatch::NoMatch
2803 } else {
2804 FollowingMatch::NeedMore
2805 };
2806 };
2807 match first {
2808 ')' => match match_hangul_first(preceding, annotation, following) {
2809 Some((remaining_preceding, collapsed)) => FollowingMatch::HangulFirst {
2810 remaining_preceding,
2811 collapsed,
2812 leftover: following[')'.len_utf8()..].to_string(),
2813 },
2814 None => FollowingMatch::NoMatch,
2815 },
2816 '(' => {
2817 let content = &following['('.len_utf8()..];
2818 match content.find(')') {
2819 Some(close) => {
2820 let candidate = &content[..close];
2821 match classify_reading(&annotation.hanja, &annotation.reading, candidate) {
2822 Some(reading_match) => FollowingMatch::HanjaFirst {
2823 collapsed: collapse_annotation(annotation.clone(), reading_match),
2824 leftover: content[close + ')'.len_utf8()..].to_string(),
2825 },
2826 None => FollowingMatch::NoMatch,
2827 }
2828 }
2829 None => {
2830 let max_reading = annotation
2833 .reading
2834 .chars()
2835 .count()
2836 .max(annotation.hanja.chars().count());
2837 if flush || content.chars().count() > max_reading {
2838 FollowingMatch::NoMatch
2839 } else {
2840 FollowingMatch::NeedMore
2841 }
2842 }
2843 }
2844 }
2845 _ => FollowingMatch::NoMatch,
2846 }
2847}
2848
2849fn match_hangul_first(
2853 preceding: &str,
2854 annotation: &Annotation,
2855 following: &str,
2856) -> Option<(String, Annotation)> {
2857 if !following.starts_with(')') {
2858 return None;
2859 }
2860 let before = preceding.strip_suffix('(')?;
2861
2862 if !annotation.reading.is_empty()
2864 && let Some(remaining) = before.strip_suffix(&annotation.reading)
2865 {
2866 let collapsed = collapse_annotation(annotation.clone(), ReadingMatch::Keep);
2867 return Some((remaining.to_string(), collapsed));
2868 }
2869
2870 let syllable_count = annotation.hanja.chars().count();
2874 if syllable_count == 0 {
2875 return None;
2876 }
2877 let (split, _) = before.char_indices().rev().nth(syllable_count - 1)?;
2878 let candidate = &before[split..];
2879 let reading_match = classify_reading(&annotation.hanja, &annotation.reading, candidate)?;
2880 Some((
2881 before[..split].to_string(),
2882 collapse_annotation(annotation.clone(), reading_match),
2883 ))
2884}
2885
2886pub fn apply_user_directives<S>(
2890 tokens: impl IntoIterator<Item = OutputToken<S>>,
2891 directives: &UserDirectives<'_>,
2892) -> Vec<OutputToken<S>> {
2893 apply_user_directives_iter(tokens, directives).collect()
2894}
2895
2896pub fn apply_user_directives_iter<'a, S>(
2902 tokens: impl IntoIterator<Item = OutputToken<S>> + 'a,
2903 directives: &'a UserDirectives<'_>,
2904) -> impl Iterator<Item = OutputToken<S>> + 'a {
2905 tokens.into_iter().map(|token| directives.apply(token))
2906}
2907
/// Buffers tokens into contexts delimited by a [`ContextWindow`] and runs a callback over
/// each complete context before releasing its tokens.
struct ContextMiddleware<S, F>
where
    S: ScopeData,
    F: FnMut(&mut [OutputToken<S>]),
{
    // How contexts are delimited.
    window: ContextWindow,
    // Callback invoked on each non-empty context just before it is released.
    apply: F,
    // Tokens of the context currently being collected.
    context: Vec<OutputToken<S>>,
    // One entry per open scope: whether that scope opened a context boundary.
    scope_boundaries: Vec<bool>,
}
2918
2919impl<S, F> ContextMiddleware<S, F>
2920where
2921 S: ScopeData,
2922 F: FnMut(&mut [OutputToken<S>]),
2923{
2924 fn new(window: ContextWindow, apply: F) -> Self {
2925 Self {
2926 window,
2927 apply,
2928 context: Vec::new(),
2929 scope_boundaries: Vec::new(),
2930 }
2931 }
2932
2933 fn process(mut self, tokens: impl IntoIterator<Item = OutputToken<S>>) -> Vec<OutputToken<S>> {
2934 let mut output = Vec::new();
2935 for token in tokens {
2936 output.extend(self.push_token(token));
2937 }
2938 output.extend(self.finish());
2939 output
2940 }
2941
2942 fn push_token(&mut self, token: OutputToken<S>) -> Vec<OutputToken<S>> {
2943 let mut output = Vec::new();
2944 match self.window {
2945 ContextWindow::Off => output.push(token),
2946 ContextWindow::PerDocument => self.context.push(token),
2947 ContextWindow::PerBlock | ContextWindow::PerSection => match &token {
2948 OutputToken::Open(scope) => {
2949 let is_boundary = match self.window {
2950 ContextWindow::PerBlock => scope.data().is_block_boundary(),
2951 ContextWindow::PerSection => scope.data().is_section_boundary(),
2952 ContextWindow::Off | ContextWindow::PerDocument => false,
2953 };
2954 if is_boundary {
2955 self.flush_context(&mut output);
2956 }
2957 self.scope_boundaries.push(is_boundary);
2958 self.context.push(token);
2959 }
2960 OutputToken::Close => {
2961 let closes_boundary = self.scope_boundaries.pop().unwrap_or(false);
2962 self.context.push(token);
2963 if closes_boundary && self.window == ContextWindow::PerBlock {
2964 self.flush_context(&mut output);
2965 }
2966 }
2967 _ => self.context.push(token),
2968 },
2969 }
2970 output
2971 }
2972
2973 fn finish(mut self) -> Vec<OutputToken<S>> {
2974 let mut output = Vec::new();
2975 self.flush_context(&mut output);
2976 output
2977 }
2978
2979 fn flush_context(&mut self, output: &mut Vec<OutputToken<S>>) {
2980 if self.context.is_empty() {
2981 return;
2982 }
2983
2984 (self.apply)(&mut self.context);
2985 output.append(&mut self.context);
2986 }
2987}
2988
/// Dictionary-wide lookup table from a reading to every hanja form that has that reading.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
struct HomophoneIndex {
    // Key: reading; value: all hanja forms recorded with that reading.
    forms_by_reading: BTreeMap<String, BTreeSet<String>>,
}
2993
2994impl HomophoneIndex {
2995 fn from_dictionary<D>(dictionary: &D) -> Option<Self>
2996 where
2997 D: HanjaDictionary + ?Sized,
2998 {
2999 let mut forms_by_reading = BTreeMap::<String, BTreeSet<String>>::new();
3000 for record in dictionary.entries()? {
3001 forms_by_reading
3002 .entry(record.reading)
3003 .or_default()
3004 .insert(record.hanja);
3005 }
3006 Some(Self { forms_by_reading })
3007 }
3008
3009 fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
3010 self.forms_by_reading
3011 .get(reading)
3012 .is_some_and(|forms| forms.iter().any(|form| form != hanja))
3013 }
3014}
3015
3016fn mark_homophones_in_context<S, D>(
3017 tokens: &mut [OutputToken<S>],
3018 index: Option<&HomophoneIndex>,
3019 lookup_fallback: Option<&D>,
3020) where
3021 D: HanjaDictionary + ?Sized,
3022{
3023 let mut forms_by_reading = BTreeMap::<String, BTreeSet<String>>::new();
3024
3025 for token in tokens.iter() {
3026 if let OutputToken::Annotated(annotation) = token
3027 && annotation.from_dictionary
3028 {
3029 forms_by_reading
3030 .entry(annotation.reading.clone())
3031 .or_default()
3032 .insert(annotation.hanja.clone());
3033 }
3034 }
3035
3036 for token in tokens.iter_mut() {
3037 if let OutputToken::Annotated(annotation) = token {
3038 annotation.homophone = annotation.from_dictionary
3039 && (index.is_some_and(|index| {
3040 index.has_homophone(&annotation.hanja, &annotation.reading)
3041 }) || lookup_fallback.is_some_and(|dictionary| {
3042 dictionary.has_homophone(&annotation.hanja, &annotation.reading)
3043 }) || forms_by_reading
3044 .get(&annotation.reading)
3045 .is_some_and(|forms| forms.len() > 1));
3046 }
3047 }
3048}
3049
3050fn filter_first_occurrences_in_context<S>(tokens: &mut [OutputToken<S>]) {
3051 let mut seen = BTreeSet::new();
3052
3053 for token in tokens.iter_mut() {
3054 if let OutputToken::Annotated(annotation) = token {
3055 if seen.insert(annotation.hanja.clone()) {
3056 annotation.first_in_context = true;
3057 } else {
3058 annotation.first_in_context = false;
3059 if !annotation.from_source_gloss {
3063 annotation.require_hanja = false;
3064 annotation.require_hangul = false;
3065 }
3066 }
3067 }
3068 }
3069}
3070
3071pub fn render_tokens<S, O>(
3079 tokens: impl IntoIterator<Item = OutputToken<S>>,
3080 options: O,
3081) -> Vec<RenderedToken<S>>
3082where
3083 S: ScopeData,
3084 O: Into<RenderOptions>,
3085{
3086 render_tokens_iter(tokens, options).collect()
3087}
3088
3089pub fn render_tokens_iter<S, O>(
3096 tokens: impl IntoIterator<Item = OutputToken<S>>,
3097 options: O,
3098) -> impl Iterator<Item = RenderedToken<S>>
3099where
3100 S: ScopeData,
3101 O: Into<RenderOptions>,
3102{
3103 RendererIter {
3104 upstream: tokens.into_iter(),
3105 renderer: Renderer::new(options),
3106 }
3107}
3108
/// Stateful token renderer.
///
/// It tracks the currently open scopes so that annotations can be rendered differently
/// inside a scope that does not allow inline markup.
pub struct Renderer<S>
where
    S: ScopeData,
{
    // Rendering options shared by all annotations.
    options: RenderOptions,
    // One entry per open scope: that scope's `allows_inline_markup()` answer.
    markup_stack: Vec<bool>,
    // Number of open scopes that disallow inline markup; zero means markup is allowed.
    disallowing_ancestors: usize,
    // Ties the renderer to one scope type without storing a value of it.
    _scope: PhantomData<fn(S)>,
}
3132
3133impl<S> Renderer<S>
3134where
3135 S: ScopeData,
3136{
3137 pub fn new<O>(options: O) -> Self
3139 where
3140 O: Into<RenderOptions>,
3141 {
3142 Self {
3143 options: options.into(),
3144 markup_stack: Vec::new(),
3145 disallowing_ancestors: 0,
3146 _scope: PhantomData,
3147 }
3148 }
3149
3150 pub fn push_token(&mut self, token: OutputToken<S>) -> RenderedToken<S> {
3152 match token {
3153 OutputToken::Open(scope) => {
3154 let allows = scope.data().allows_inline_markup();
3155 if !allows {
3156 self.disallowing_ancestors += 1;
3157 }
3158 self.markup_stack.push(allows);
3159 RenderedToken::Open(scope)
3160 }
3161 OutputToken::Close => {
3162 if let Some(false) = self.markup_stack.pop() {
3163 self.disallowing_ancestors = self.disallowing_ancestors.saturating_sub(1);
3167 }
3168 RenderedToken::Close
3169 }
3170 OutputToken::Text(text) => RenderedToken::Text(text),
3171 OutputToken::Verbatim(text) => RenderedToken::Verbatim(text),
3172 OutputToken::Annotated(annotation) => {
3173 let allows_inline_markup = self.disallowing_ancestors == 0;
3180 render_annotation(&annotation, &self.options, allows_inline_markup)
3181 }
3182 }
3183 }
3184}
3185
3186struct RendererIter<I, S>
3187where
3188 S: ScopeData,
3189{
3190 upstream: I,
3191 renderer: Renderer<S>,
3192}
3193
3194impl<I, S> Iterator for RendererIter<I, S>
3195where
3196 I: Iterator<Item = OutputToken<S>>,
3197 S: ScopeData,
3198{
3199 type Item = RenderedToken<S>;
3200
3201 fn next(&mut self) -> Option<Self::Item> {
3202 let token = self.upstream.next()?;
3203 Some(self.renderer.push_token(token))
3204 }
3205}
3206
3207fn render_annotation<S>(
3208 annotation: &Annotation,
3209 options: &RenderOptions,
3210 allows_inline_markup: bool,
3211) -> RenderedToken<S> {
3212 if annotation.skip_annotation {
3213 let primary = match options.mode {
3214 RenderMode::HangulOnly | RenderMode::HangulHanjaParens => annotation.reading.clone(),
3215 RenderMode::HanjaHangulParens | RenderMode::Original => annotation.hanja.clone(),
3216 RenderMode::Ruby(RubyBase::OnHangul) => annotation.reading.clone(),
3217 RenderMode::Ruby(RubyBase::OnHanja) => annotation.hanja.clone(),
3218 };
3219 return RenderedToken::Text(primary);
3220 }
3221
3222 match options.mode {
3223 RenderMode::HangulOnly if annotation.require_hanja || annotation.homophone => {
3224 RenderedToken::Text(parens(&annotation.reading, &annotation.hanja))
3225 }
3226 RenderMode::HangulOnly => RenderedToken::Text(annotation.reading.clone()),
3227 RenderMode::HangulHanjaParens => {
3228 RenderedToken::Text(parens(&annotation.reading, &annotation.hanja))
3229 }
3230 RenderMode::HanjaHangulParens => {
3231 RenderedToken::Text(parens(&annotation.hanja, &annotation.reading))
3232 }
3233 RenderMode::Ruby(base) => render_ruby(annotation, base, allows_inline_markup),
3234 RenderMode::Original if annotation.require_hangul => match options.original_gloss {
3235 OriginalGloss::Parens => {
3236 RenderedToken::Text(parens(&annotation.hanja, &annotation.reading))
3237 }
3238 OriginalGloss::Ruby => render_ruby(annotation, RubyBase::OnHanja, allows_inline_markup),
3241 },
3242 RenderMode::Original => RenderedToken::Text(annotation.hanja.clone()),
3243 }
3244}
3245
3246fn render_ruby<S>(
3247 annotation: &Annotation,
3248 base: RubyBase,
3249 allows_inline_markup: bool,
3250) -> RenderedToken<S> {
3251 let (base_text, rt_text) = match base {
3252 RubyBase::OnHangul => (&annotation.reading, &annotation.hanja),
3253 RubyBase::OnHanja => (&annotation.hanja, &annotation.reading),
3254 };
3255 if !allows_inline_markup {
3256 return RenderedToken::Text(parens(base_text, rt_text));
3257 }
3258 RenderedToken::Ruby {
3259 base: base_text.clone(),
3260 rt: rt_text.clone(),
3261 }
3262}
3263
/// Formats two strings as `first(second)`.
///
/// The parameter names reflect the most common call, but the function is
/// purely positional: whatever is passed first is written as-is and whatever
/// is passed second is wrapped in ASCII parentheses. Callers pass hanja and
/// reading in either order.
///
/// The result is allocated once at its exact final size.
fn parens(reading: &str, hanja: &str) -> String {
    // Two extra bytes for the ASCII '(' and ')'.
    let mut output = String::with_capacity(reading.len() + hanja.len() + 2);
    output.push_str(reading);
    output.push('(');
    output.push_str(hanja);
    output.push(')');
    output
}
3272
3273pub fn convert_plain_text<D, R>(input: &str, dictionary: &D, render: R) -> String
3286where
3287 D: HanjaDictionary + ?Sized,
3288 R: Into<RenderOptions>,
3289{
3290 convert_plain_text_with_options(input, dictionary, render, EngineOptions::default())
3291}
3292
3293pub fn convert_plain_text_with_options<D, R>(
3297 input: &str,
3298 dictionary: &D,
3299 render: R,
3300 options: EngineOptions,
3301) -> String
3302where
3303 D: HanjaDictionary + ?Sized,
3304 R: Into<RenderOptions>,
3305{
3306 let input_tokens = read_plain_text(input);
3307 let output_tokens = process_tokens_with_options(input_tokens, dictionary, options);
3308 let output_tokens = collapse_redundant_parens(output_tokens, true);
3309 let output_tokens = mark_homophones(output_tokens, dictionary, ContextWindow::PerBlock);
3310 let rendered_tokens = render_tokens(output_tokens, render);
3311 write_plain_text(rendered_tokens)
3312}