#![no_std]
#![forbid(unsafe_code)]
#![deny(missing_docs)]
extern crate alloc;
mod fallback;
mod generated;
mod segment;
use alloc::boxed::Box;
use alloc::collections::{BTreeMap, BTreeSet};
use alloc::string::{String, ToString};
use alloc::vec::Vec;
use core::marker::PhantomData;
use fallback::{
FallbackPart, FallbackState, apply_initial_sound_law_to_first_syllable,
fallback_reading_for_run, is_hanja_numeral, khangul_all_readings,
phoneticize_fallback_run_with_state, phoneticize_hanja_char,
reading_matches_with_initial_sound_law, should_apply_yeol_yul,
};
use generated::unihan_readings::KHANGUL_READINGS;
use segment::{Segment, segment_text};
/// Errors reported by this crate.
///
/// The enum is `#[non_exhaustive]`, so matches outside this crate need a
/// wildcard arm.
#[derive(Debug, thiserror::Error)]
#[non_exhaustive]
pub enum Error {
/// A dictionary could not be loaded; the payload is the message appended to
/// the `dictionary load failed:` prefix.
#[error("dictionary load failed: {0}")]
DictionaryLoad(String),
/// Segmentation of a piece of hanja text failed.
#[error("segmentation failed for {hanja:?}: {reason}")]
Segmentation {
/// The hanja text that was being segmented.
hanja: String,
/// Human-readable description of the failure.
reason: String,
},
/// A reading supplied for a hanja string is not acceptable hangul.
#[error("invalid hangul reading {reading:?} for hanja {hanja:?}")]
InvalidReading {
/// The hanja string the reading was supplied for.
hanja: String,
/// The rejected reading.
reading: String,
},
/// An internal invariant did not hold; the payload names the invariant.
#[error("internal invariant violated: {0}")]
Internal(&'static str),
/// Any other boxed error, displayed transparently and convertible via `From`.
#[error(transparent)]
Other(#[from] Box<dyn core::error::Error + Send + Sync + 'static>),
}
/// Policy applied by [`recover_input_token`] when an input reader reports a
/// [`RecoverableInputError`].
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum Recovery {
/// Propagate the reader error to the caller (the default).
#[default]
Strict,
/// Log a warning and pass the offending original text through as
/// [`InputToken::Verbatim`].
Lenient,
}
/// An input-reader failure that still carries the original source text, so
/// that lenient recovery can emit that text unchanged.
#[derive(Debug)]
pub struct RecoverableInputError {
// Source text the reader failed on; emitted verbatim under `Recovery::Lenient`.
original: String,
// The underlying failure; returned to the caller under `Recovery::Strict`.
error: Error,
}
impl RecoverableInputError {
pub fn new(original: String, error: Error) -> Self {
Self { original, error }
}
pub fn original(&self) -> &str {
&self.original
}
pub fn error(&self) -> &Error {
&self.error
}
pub fn into_parts(self) -> (String, Error) {
(self.original, self.error)
}
}
/// Caller-defined payload attached to each [`Scope`], describing how the
/// engine should treat text inside that scope.
pub trait ScopeData: Clone + 'static {
/// Returns `true` when text directly inside this scope must be emitted
/// unchanged; the engine passes such text through without processing it.
fn is_preserve(&self) -> bool;
/// Whether inline markup may be produced inside this scope. Defaults to
/// `true`.
// NOTE(review): not consulted anywhere in this part of the file; presumably
// read by renderers — confirm.
fn allows_inline_markup(&self) -> bool {
true
}
/// Whether opening or closing this scope is a block boundary. The engine
/// resets its fallback reading context at such boundaries. Defaults to
/// `false`.
fn is_block_boundary(&self) -> bool {
false
}
/// Whether this scope is a section boundary. Defaults to `false`.
// NOTE(review): not consulted anywhere in this part of the file — confirm
// the intended consumer.
fn is_section_boundary(&self) -> bool {
false
}
}
/// A structural scope in the token stream, wrapping caller-defined data `S`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Scope<S> {
// Caller payload; the engine reads it through the `ScopeData` trait.
data: S,
}
impl<S> Scope<S> {
pub fn new(data: S) -> Self {
Self { data }
}
pub fn data(&self) -> &S {
&self.data
}
pub fn into_data(self) -> S {
self.data
}
}
/// A token fed into the engine by an input reader.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum InputToken<S> {
/// Opens a scope; it stays open until the matching [`InputToken::Close`].
Open(Scope<S>),
/// Closes the most recently opened scope.
Close,
/// Text to be processed (unless the enclosing scope is a preserve scope).
Text(String),
/// Text that is passed through untouched.
Verbatim(String),
}
/// A token produced by the engine.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum OutputToken<S> {
/// A scope was opened (forwarded from the input).
Open(Scope<S>),
/// The most recently opened scope was closed (forwarded from the input).
Close,
/// Plain text that carries no annotation.
Text(String),
/// Verbatim text forwarded unchanged from the input.
Verbatim(String),
/// A hanja span together with its reading and annotation flags.
Annotated(Annotation),
}
/// A token after rendering, ready to be handed to an output writer.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum RenderedToken<S> {
/// A scope was opened.
Open(Scope<S>),
/// The most recently opened scope was closed.
Close,
/// Plain text.
Text(String),
/// Verbatim text.
Verbatim(String),
/// A base text with an accompanying ruby annotation.
Ruby {
/// The annotated base text.
base: String,
/// The ruby text attached to `base`.
rt: String,
},
}
/// A hanja span paired with its hangul reading, plus flags describing where
/// the reading came from and how it may be rendered.
///
/// The struct is `#[non_exhaustive]`; code outside this crate cannot build it
/// with a struct literal.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
#[non_exhaustive]
pub struct Annotation {
/// The source hanja text.
pub hanja: String,
/// The reading chosen for `hanja`.
pub reading: String,
/// Homophone flag. The engine code in this file always emits `false`.
// NOTE(review): presumably filled in by a later pass using
// `HanjaDictionary::has_homophone` — confirm.
pub homophone: bool,
/// Copied from [`MatchMark::require_hanja`] for dictionary matches.
pub require_hanja: bool,
/// Copied from [`MatchMark::require_hangul`] for dictionary matches.
pub require_hangul: bool,
/// First-occurrence flag. The engine code in this file always emits `true`.
// NOTE(review): presumably refined by a later context pass — confirm.
pub first_in_context: bool,
/// Skip flag. The engine code in this file always emits `false`.
pub skip_annotation: bool,
/// `true` when the reading (or part of a merged run) came from a
/// dictionary match rather than the per-character fallback.
pub from_dictionary: bool,
/// Source-gloss flag. The engine code in this file always emits `false`.
pub from_source_gloss: bool,
}
/// Per-entry rendering hints carried by a dictionary match and copied into
/// the resulting [`Annotation`].
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct MatchMark {
/// Copied into [`Annotation::require_hanja`].
pub require_hanja: bool,
/// Copied into [`Annotation::require_hangul`].
pub require_hangul: bool,
}
/// One dictionary entry as exposed by [`HanjaDictionary::entries`].
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct DictionaryRecord {
/// The hanja key of the entry.
pub hanja: String,
/// The reading stored for `hanja`.
pub reading: String,
/// Rendering hints stored with the entry.
pub mark: MatchMark,
}
/// A dictionary hit at the start of the text passed to
/// [`HanjaDictionary::matches_at`].
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Match {
/// Length in bytes of the matched prefix of the input.
pub byte_len: usize,
/// Reading for the matched text.
pub reading: String,
/// Optional alternative reading. When present, `reading` is used only at
/// the start of a word with the initial sound law enabled; otherwise this
/// value is used instead.
pub suffix_reading: Option<String>,
/// Rendering hints for the match.
pub mark: MatchMark,
}
/// A source of hanja readings that the engine queries while segmenting text.
pub trait HanjaDictionary {
/// Returns the dictionary matches that start at the beginning of `s`.
///
/// Each [`Match::byte_len`] is the length in bytes of the matched prefix.
fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a>;
/// Upper bound, in characters, on the length of any entry, or `None` when
/// unknown (the default). The streaming engine uses it to decide how much
/// trailing text must stay buffered.
fn max_word_chars(&self) -> Option<usize> {
None
}
/// Enumerates all entries, or `None` (the default) when the dictionary
/// cannot be enumerated.
fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
None
}
/// Returns `true` when some entry other than `hanja` has the same
/// `reading`.
///
/// The default implementation scans [`HanjaDictionary::entries`] and
/// returns `false` when the dictionary is not enumerable.
fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
self.entries().is_some_and(|mut entries| {
entries.any(|record| record.hanja != hanja && record.reading == reading)
})
}
}
/// Shared references to a dictionary are themselves dictionaries; every
/// method forwards to the referenced value.
impl<D> HanjaDictionary for &D
where
    D: HanjaDictionary + ?Sized,
{
    fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
        D::matches_at(*self, s)
    }

    fn max_word_chars(&self) -> Option<usize> {
        D::max_word_chars(*self)
    }

    fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
        D::entries(*self)
    }

    fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
        D::has_homophone(*self, hanja, reading)
    }
}
/// Boxed dictionaries (including `Box<dyn HanjaDictionary>`) are themselves
/// dictionaries; every method forwards to the boxed value.
impl<D> HanjaDictionary for Box<D>
where
    D: HanjaDictionary + ?Sized,
{
    fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
        D::matches_at(&**self, s)
    }

    fn max_word_chars(&self) -> Option<usize> {
        D::max_word_chars(&**self)
    }

    fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
        D::entries(&**self)
    }

    fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
        D::has_homophone(&**self, hanja, reading)
    }
}
/// Single-character dictionary backed by the generated `KHANGUL_READINGS`
/// table.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct UnihanCharDict;
impl HanjaDictionary for UnihanCharDict {
    /// Yields at most one match: the table reading of the first character of
    /// `s`, if the table has one.
    fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
        let found = s.chars().next().and_then(|first| {
            let reading = khangul_reading(first)?;
            Some(Match {
                byte_len: first.len_utf8(),
                reading: reading.to_string(),
                suffix_reading: None,
                mark: MatchMark::default(),
            })
        });
        Box::new(found.into_iter())
    }

    /// Every entry is exactly one character long.
    fn max_word_chars(&self) -> Option<usize> {
        Some(1)
    }

    /// Enumerates the whole table in table order.
    fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
        let records = KHANGUL_READINGS.iter().map(|entry| DictionaryRecord {
            hanja: entry.0.to_string(),
            reading: entry.1.to_string(),
            mark: MatchMark::default(),
        });
        Some(Box::new(records))
    }

    /// Returns `true` when `hanja` is a single character and some other table
    /// character has the same `reading`. Empty or multi-character input
    /// yields `false`.
    fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
        let mut chars = hanja.chars();
        let (Some(target), None) = (chars.next(), chars.next()) else {
            return false;
        };
        KHANGUL_READINGS
            .iter()
            .any(|entry| entry.0 != target && entry.1 == reading)
    }
}
/// An ordered list of dictionaries consulted together; when several
/// dictionaries offer the same result, the earliest one in the list wins.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct ChainDictionary<D> {
// Dictionaries in priority order (earlier entries take precedence).
dictionaries: Vec<D>,
}
impl<D> ChainDictionary<D> {
pub fn new() -> Self {
Self {
dictionaries: Vec::new(),
}
}
pub fn push(&mut self, dictionary: D) {
self.dictionaries.push(dictionary);
}
pub fn len(&self) -> usize {
self.dictionaries.len()
}
pub fn is_empty(&self) -> bool {
self.dictionaries.is_empty()
}
pub fn dictionaries(&self) -> &[D] {
&self.dictionaries
}
pub fn into_dictionaries(self) -> Vec<D> {
self.dictionaries
}
}
impl<D> FromIterator<D> for ChainDictionary<D> {
fn from_iter<T: IntoIterator<Item = D>>(iter: T) -> Self {
Self {
dictionaries: Vec::from_iter(iter),
}
}
}
impl<D> HanjaDictionary for ChainDictionary<D>
where
    D: HanjaDictionary,
{
    /// Collects the matches of every dictionary, keeping only the first match
    /// seen for each byte length, and yields them ordered by byte length.
    fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
        let mut seen_lengths = BTreeSet::new();
        let mut merged: Vec<Match> = self
            .dictionaries
            .iter()
            .flat_map(|dictionary| dictionary.matches_at(s))
            // `insert` returns false for a length an earlier match already claimed.
            .filter(|candidate| seen_lengths.insert(candidate.byte_len))
            .collect();
        merged.sort_by_key(|candidate| candidate.byte_len);
        Box::new(merged.into_iter())
    }

    /// Largest bound among the chained dictionaries; `None` when the chain is
    /// empty or any member does not know its bound.
    fn max_word_chars(&self) -> Option<usize> {
        self.dictionaries
            .iter()
            .try_fold(None, |widest: Option<usize>, dictionary| {
                let word_chars = dictionary.max_word_chars()?;
                Some(Some(widest.map_or(word_chars, |current| current.max(word_chars))))
            })
            .flatten()
    }

    /// Merges the entries of all members, keeping the first record seen for
    /// each hanja key; `None` as soon as one member is not enumerable.
    fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
        let mut merged: BTreeMap<String, DictionaryRecord> = BTreeMap::new();
        for dictionary in &self.dictionaries {
            let records = dictionary.entries()?;
            for record in records {
                merged.entry(record.hanja.clone()).or_insert(record);
            }
        }
        Some(Box::new(merged.into_values()))
    }

    /// Checks the merged entry list when the chain is enumerable; otherwise
    /// asks each member in turn.
    fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
        match self.entries() {
            Some(mut records) => {
                records.any(|record| record.hanja != hanja && record.reading == reading)
            }
            None => self
                .dictionaries
                .iter()
                .any(|dictionary| dictionary.has_homophone(hanja, reading)),
        }
    }
}
/// Looks up the table reading of `ch` by binary search over
/// `KHANGUL_READINGS` (which must therefore be sorted by character).
fn khangul_reading(ch: char) -> Option<&'static str> {
    match KHANGUL_READINGS.binary_search_by_key(&ch, |entry| entry.0) {
        Ok(position) => Some(KHANGUL_READINGS[position].1),
        Err(_) => None,
    }
}
/// Tunable behaviour of the conversion engine.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct EngineOptions {
/// How text is split into dictionary words and fallback runs.
pub segmentation: SegmentationStrategy,
/// Whether the initial sound law is applied when choosing readings.
pub initial_sound_law: bool,
/// How hanja numerals are handled.
// NOTE(review): consumed by the segmenter/fallback modules, which are not
// visible here — confirm details there.
pub numeral_strategy: NumeralStrategy,
}
/// Defaults: lattice segmentation, initial sound law enabled, numerals read
/// phonetically in hangul.
impl Default for EngineOptions {
fn default() -> Self {
Self {
segmentation: SegmentationStrategy::Lattice,
initial_sound_law: true,
numeral_strategy: NumeralStrategy::HangulPhonetic,
}
}
}
/// Algorithm used to split text into dictionary matches.
// NOTE(review): the variants are interpreted by the `segment` module, which
// is not visible here; the descriptions below follow the names only.
#[non_exhaustive]
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum SegmentationStrategy {
/// Lattice-based segmentation (the default).
#[default]
Lattice,
/// Eager segmentation.
Eager,
}
/// How hanja numerals are converted.
// NOTE(review): the variants are interpreted outside this part of the file;
// the descriptions below follow the names only — confirm against the
// fallback/segment modules.
#[non_exhaustive]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum NumeralStrategy {
/// Read numerals phonetically in hangul (the `EngineOptions` default).
HangulPhonetic,
/// Convert numerals to Arabic digits positionally.
PositionalArabic,
/// Convert numerals to Arabic digits additively.
AdditiveArabic,
/// Choose a conversion automatically.
Smart,
}
/// Value stored per hanja key inside a [`MapDictionary`].
#[derive(Clone, Debug, Eq, PartialEq)]
struct DictionaryEntry {
// Primary reading of the key.
reading: String,
// Optional alternative reading, surfaced as `Match::suffix_reading`.
suffix_reading: Option<String>,
// Rendering hints copied into every match for this key.
mark: MatchMark,
}
/// An in-memory dictionary keyed by hanja string.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct MapDictionary {
// Entries keyed by hanja, kept in sorted key order.
entries: BTreeMap<String, DictionaryEntry>,
// Character count of the longest key inserted so far; `None` while empty.
max_word_chars: Option<usize>,
}
impl MapDictionary {
    /// Creates an empty dictionary.
    pub fn new() -> Self {
        Self::default()
    }

    /// Inserts `hanja` with `reading`, default marks and no suffix reading,
    /// replacing any existing entry for the same key.
    pub fn insert(&mut self, hanja: impl Into<String>, reading: impl Into<String>) {
        self.insert_entry(hanja, reading, None, MatchMark::default());
    }

    /// Inserts `hanja` with `reading` and the given `mark`, replacing any
    /// existing entry for the same key.
    pub fn insert_marked(
        &mut self,
        hanja: impl Into<String>,
        reading: impl Into<String>,
        mark: MatchMark,
    ) {
        self.insert_entry(hanja, reading, None, mark);
    }

    /// Inserts `hanja` with `reading` plus an alternative `suffix` reading and
    /// default marks, replacing any existing entry for the same key.
    pub fn insert_with_suffix(
        &mut self,
        hanja: impl Into<String>,
        reading: impl Into<String>,
        suffix: impl Into<String>,
    ) {
        let suffix_reading = Some(suffix.into());
        self.insert_entry(hanja, reading, suffix_reading, MatchMark::default());
    }

    /// Shared insertion path: stores the entry and widens `max_word_chars` to
    /// cover the new key.
    fn insert_entry(
        &mut self,
        hanja: impl Into<String>,
        reading: impl Into<String>,
        suffix_reading: Option<String>,
        mark: MatchMark,
    ) {
        let key: String = hanja.into();
        let key_chars = key.chars().count();
        self.max_word_chars = match self.max_word_chars {
            Some(current) => Some(current.max(key_chars)),
            None => Some(key_chars),
        };
        let entry = DictionaryEntry {
            reading: reading.into(),
            suffix_reading,
            mark,
        };
        self.entries.insert(key, entry);
    }

    /// Returns `true` when the dictionary has no entries.
    pub fn is_empty(&self) -> bool {
        self.entries.is_empty()
    }

    /// Number of entries.
    pub fn len(&self) -> usize {
        self.entries.len()
    }
}
impl HanjaDictionary for MapDictionary {
    /// Yields every entry whose key is a prefix of `s`, shortest key first.
    ///
    /// Instead of testing every entry against `s`, this looks up each prefix
    /// of `s` (up to the longest key length) directly in the map. Prefixes of
    /// one string sort by length, so the output order equals the map's key
    /// order restricted to matching keys.
    fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
        if self.entries.is_empty() {
            return Box::new(core::iter::empty());
        }
        // Candidate prefixes hold 0..=max_word_chars characters; without a
        // known bound every prefix of `s` is probed.
        let max_prefixes = self
            .max_word_chars
            .map_or(usize::MAX, |chars| chars.saturating_add(1));
        // Byte offsets at which a prefix of `s` may end: 0 (the empty key)
        // and the position just past each character.
        let prefix_ends = core::iter::once(0)
            .chain(s.char_indices().map(|(start, ch)| start + ch.len_utf8()))
            .take(max_prefixes);
        Box::new(prefix_ends.filter_map(move |end| {
            self.entries.get(&s[..end]).map(|entry| Match {
                byte_len: end,
                reading: entry.reading.clone(),
                suffix_reading: entry.suffix_reading.clone(),
                mark: entry.mark,
            })
        }))
    }

    /// Character count of the longest key inserted so far, or `None` when
    /// nothing has been inserted.
    fn max_word_chars(&self) -> Option<usize> {
        self.max_word_chars
    }

    /// Enumerates all entries in key order.
    fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
        Some(Box::new(self.entries.iter().map(|(hanja, entry)| {
            DictionaryRecord {
                hanja: hanja.clone(),
                reading: entry.reading.clone(),
                mark: entry.mark,
            }
        })))
    }

    /// Returns `true` when an entry with a different key has the same
    /// `reading`.
    fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
        self.entries
            .iter()
            .any(|(other_hanja, entry)| other_hanja != hanja && entry.reading == reading)
    }
}
/// Scope payload used for plain-text input: a single scope that carries no
/// data of its own.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct PlainScopeData;
/// Plain text is always processed (never preserved) and does not allow
/// inline markup; the boundary methods keep their trait defaults.
impl ScopeData for PlainScopeData {
fn is_preserve(&self) -> bool {
false
}
fn allows_inline_markup(&self) -> bool {
false
}
}
/// Wraps `input` in a single plain scope: `Open`, one `Text` token holding a
/// copy of the whole input, then `Close`.
pub fn read_plain_text(input: &str) -> Vec<InputToken<PlainScopeData>> {
    let mut tokens = Vec::with_capacity(3);
    tokens.push(InputToken::Open(Scope::new(PlainScopeData)));
    tokens.push(InputToken::Text(String::from(input)));
    tokens.push(InputToken::Close);
    tokens
}
/// Concatenates rendered tokens into a plain string.
///
/// Scope markers contribute nothing, text and verbatim tokens are appended
/// unchanged, and ruby pairs are formatted through `parens(base, rt)`.
pub fn write_plain_text<S>(tokens: impl IntoIterator<Item = RenderedToken<S>>) -> String {
    tokens
        .into_iter()
        .fold(String::new(), |mut rendered, token| {
            match token {
                RenderedToken::Open(_) | RenderedToken::Close => {}
                RenderedToken::Text(text) | RenderedToken::Verbatim(text) => {
                    rendered.push_str(&text);
                }
                RenderedToken::Ruby { base, rt } => rendered.push_str(&parens(&base, &rt)),
            }
            rendered
        })
}
pub fn process_tokens<S, D>(
tokens: impl IntoIterator<Item = InputToken<S>>,
dictionary: &D,
) -> Vec<OutputToken<S>>
where
S: ScopeData,
D: HanjaDictionary + ?Sized,
{
process_tokens_iter(tokens, dictionary).collect()
}
pub fn process_tokens_iter<S, D>(
tokens: impl IntoIterator<Item = InputToken<S>>,
dictionary: &D,
) -> alloc::vec::IntoIter<OutputToken<S>>
where
S: ScopeData,
D: HanjaDictionary + ?Sized,
{
process_tokens_with_options(tokens, dictionary, EngineOptions::default()).into_iter()
}
/// Converts `tokens` with explicit `options`.
///
/// Runs a collecting engine (no incremental flushing): buffered text is only
/// processed when a non-text token arrives or the input ends.
pub fn process_tokens_with_options<S, D>(
    tokens: impl IntoIterator<Item = InputToken<S>>,
    dictionary: &D,
    options: EngineOptions,
) -> Vec<OutputToken<S>>
where
    S: ScopeData,
    D: HanjaDictionary + ?Sized,
{
    let mut engine = Engine::collecting(dictionary, options);
    let mut produced: Vec<OutputToken<S>> = tokens
        .into_iter()
        .flat_map(|token| engine.push_token(token))
        .collect();
    // Whatever text is still buffered is processed once the input is exhausted.
    produced.extend(engine.finish());
    produced
}
/// Like [`process_tokens_with_options`], but hands the (already computed)
/// output back as an owning iterator.
pub fn process_tokens_iter_with_options<S, D>(
    tokens: impl IntoIterator<Item = InputToken<S>>,
    dictionary: &D,
    options: EngineOptions,
) -> alloc::vec::IntoIter<OutputToken<S>>
where
    S: ScopeData,
    D: HanjaDictionary + ?Sized,
{
    let produced = process_tokens_with_options(tokens, dictionary, options);
    produced.into_iter()
}
/// Applies [`recover_input_token`] to every item of `tokens`.
///
/// # Errors
///
/// Stops at, and returns, the first error that the `recovery` policy does not
/// absorb.
pub fn recover_input_tokens<S>(
    tokens: impl IntoIterator<Item = Result<InputToken<S>, RecoverableInputError>>,
    recovery: Recovery,
) -> Result<Vec<InputToken<S>>, Error>
where
    S: ScopeData,
{
    // Collecting into `Result` short-circuits on the first `Err`.
    tokens
        .into_iter()
        .map(|token| recover_input_token(token, recovery))
        .collect()
}
pub fn recover_input_token<S>(
token: Result<InputToken<S>, RecoverableInputError>,
recovery: Recovery,
) -> Result<InputToken<S>, Error>
where
S: ScopeData,
{
match token {
Ok(token) => Ok(token),
Err(error) => match recovery {
Recovery::Strict => Err(error.into_parts().1),
Recovery::Lenient => {
let (original, error) = error.into_parts();
tracing::warn!(error = %error, "recovering from input reader error");
Ok(InputToken::Verbatim(original))
}
},
}
}
/// Recovers a fallible token stream and converts it with the default
/// [`EngineOptions`].
///
/// # Errors
///
/// Returns the first reader error that the `recovery` policy does not absorb.
pub fn process_fallible_tokens<S, D>(
    tokens: impl IntoIterator<Item = Result<InputToken<S>, RecoverableInputError>>,
    dictionary: &D,
    recovery: Recovery,
) -> Result<Vec<OutputToken<S>>, Error>
where
    S: ScopeData,
    D: HanjaDictionary + ?Sized,
{
    let options = EngineOptions::default();
    process_fallible_tokens_with_options(tokens, dictionary, options, recovery)
}
/// Recovers a fallible token stream and converts it with explicit `options`.
///
/// The whole input is recovered first; conversion only starts once every
/// item has been accepted.
///
/// # Errors
///
/// Returns the first reader error that the `recovery` policy does not absorb.
pub fn process_fallible_tokens_with_options<S, D>(
    tokens: impl IntoIterator<Item = Result<InputToken<S>, RecoverableInputError>>,
    dictionary: &D,
    options: EngineOptions,
    recovery: Recovery,
) -> Result<Vec<OutputToken<S>>, Error>
where
    S: ScopeData,
    D: HanjaDictionary + ?Sized,
{
    recover_input_tokens(tokens, recovery)
        .map(|recovered| process_tokens_with_options(recovered, dictionary, options))
}
/// Streaming converter: accepts [`InputToken`]s one at a time and produces
/// [`OutputToken`]s, buffering text between pushes as needed.
pub struct Engine<'a, S, D>
where
S: ScopeData,
D: HanjaDictionary + ?Sized,
{
// Dictionary consulted for segmentation and readings.
dictionary: &'a D,
// Options passed to segmentation and reading selection.
options: EngineOptions,
// Stack of currently open scopes, innermost last.
scopes: Vec<Scope<S>>,
// Text received but not yet emitted.
pending_text: String,
// Length in bytes of `pending_text` at the moment it was last found to be
// one unflushable fallback run; `None` when no such observation is current.
pending_unflushable_fallback_run_bytes: Option<usize>,
// Reading context (word-start flag and previous reading character) carried
// across flushes.
fallback_state: FallbackState,
// When true, `push_token` emits safe prefixes of the buffered text eagerly.
incremental_flush: bool,
}
impl<'a, S, D> Engine<'a, S, D>
where
S: ScopeData,
D: HanjaDictionary + ?Sized,
{
/// Creates a streaming engine with the default [`EngineOptions`].
pub fn new(dictionary: &'a D) -> Self {
Self::with_options(dictionary, EngineOptions::default())
}
/// Creates a streaming engine with explicit `options`; incremental
/// flushing is enabled.
pub fn with_options(dictionary: &'a D, options: EngineOptions) -> Self {
Self::with_incremental_flush(dictionary, options, true)
}
// Creates an engine with incremental flushing disabled: text is only
// processed when a non-text token arrives or on `flush`/`finish`.
fn collecting(dictionary: &'a D, options: EngineOptions) -> Self {
Self::with_incremental_flush(dictionary, options, false)
}
// Shared constructor; logs the segmentation strategy and starts with empty
// buffers and a default fallback context.
fn with_incremental_flush(
dictionary: &'a D,
options: EngineOptions,
incremental_flush: bool,
) -> Self {
tracing::debug!(
strategy = ?options.segmentation,
"engine created with segmentation strategy"
);
Self {
dictionary,
options,
scopes: Vec::new(),
pending_text: String::new(),
pending_unflushable_fallback_run_bytes: None,
fallback_state: FallbackState::default(),
incremental_flush,
}
}
/// Feeds one token into the engine and returns the output it makes
/// available.
///
/// * `Open` / `Close` flush all buffered text, update the scope stack and
///   are forwarded; the fallback context is reset when the scope is a block
///   boundary.
/// * `Text` inside a preserve scope is forwarded unchanged after a flush;
///   other text is appended to the buffer and, in incremental mode, the
///   safe prefix of the buffer is emitted.
/// * `Verbatim` flushes, resets the fallback context and is forwarded.
pub fn push_token(&mut self, token: InputToken<S>) -> Vec<OutputToken<S>> {
let mut output = Vec::new();
match token {
InputToken::Open(scope) => {
self.flush_into(&mut output);
if scope.data().is_block_boundary() {
self.reset_fallback_context();
}
self.scopes.push(scope.clone());
output.push(OutputToken::Open(scope));
}
InputToken::Close => {
self.flush_into(&mut output);
// A `Close` with no open scope pops nothing and is still forwarded.
let closes_block_boundary = self
.scopes
.pop()
.is_some_and(|scope| scope.data().is_block_boundary());
output.push(OutputToken::Close);
if closes_block_boundary {
self.reset_fallback_context();
}
}
InputToken::Text(text) => {
if self
.scopes
.last()
.is_some_and(|scope| scope.data().is_preserve())
{
self.flush_into(&mut output);
self.reset_fallback_context();
output.push(OutputToken::Text(text));
} else {
let previous_pending_bytes = self.pending_text.len();
self.pending_text.push_str(&text);
// The unflushable-run marker stays valid only if it described the
// whole buffer as it was before this text was appended.
if self
.pending_unflushable_fallback_run_bytes
.is_some_and(|bytes| bytes == previous_pending_bytes)
{
self.pending_unflushable_fallback_run_bytes = Some(previous_pending_bytes);
} else {
self.pending_unflushable_fallback_run_bytes = None;
}
if self.incremental_flush {
self.flush_safe_into(&mut output);
}
}
}
InputToken::Verbatim(text) => {
self.flush_into(&mut output);
self.reset_fallback_context();
output.push(OutputToken::Verbatim(text));
}
}
output
}
/// Processes and returns everything currently buffered.
pub fn flush(&mut self) -> Vec<OutputToken<S>> {
let mut output = Vec::new();
self.flush_into(&mut output);
output
}
/// Consumes the engine, returning the output for any remaining buffered
/// text.
pub fn finish(mut self) -> Vec<OutputToken<S>> {
self.flush()
}
/// Number of characters currently buffered.
pub fn buffered_chars(&self) -> usize {
self.pending_text.chars().count()
}
// Dictionary word-length bound; a reported bound of zero is treated as unknown.
fn tail_bound(&self) -> Option<usize> {
self.dictionary.max_word_chars().filter(|bound| *bound > 0)
}
// Emits the longest prefix of the buffer that later input is not expected to
// change, keeping the rest buffered.
fn flush_safe_into(&mut self, output: &mut Vec<OutputToken<S>>) {
if self.pending_text.is_empty() {
return;
}
// No hanja buffered: use the cheaper non-hanja rule.
if !self.pending_text.chars().any(is_hanja) {
self.flush_non_hanja_safe_into(output);
return;
}
// Unknown word-length bound: only text up to the last whitespace is safe.
let Some(bound) = self.tail_bound() else {
let Some(flush_end) = safe_unknown_bound_flush_end(&self.pending_text) else {
return;
};
self.flush_prefix_into(flush_end, output);
if !self.pending_text.chars().any(is_hanja) {
self.flush_non_hanja_safe_into(output);
}
return;
};
// Known bound: still prefer cutting after the last whitespace when there is one.
if let Some(flush_end) = safe_unknown_bound_flush_end(&self.pending_text) {
self.flush_prefix_into(flush_end, output);
if !self.pending_text.chars().any(is_hanja) {
self.flush_non_hanja_safe_into(output);
}
return;
}
let buffered_chars = self.buffered_chars();
if buffered_chars > bound.saturating_mul(10) {
tracing::debug!(
buffered_chars,
dict_max_word_chars = bound,
"streaming tail buffer is unusually large"
);
}
// With at most `bound` characters buffered, nothing is emitted yet.
if buffered_chars <= bound {
return;
}
// The buffer is still one growing fallback run: remember its new length
// and wait for more input instead of re-segmenting everything.
if self.extends_unflushable_fallback_run(bound) {
self.pending_unflushable_fallback_run_bytes = Some(self.pending_text.len());
return;
}
// Segments may be emitted only if they end within the first `safe_chars`
// characters, so the last `bound - 1` characters always stay buffered.
let safe_chars = buffered_chars.saturating_sub(bound).saturating_add(1);
let segments = segment_text(&self.pending_text, self.dictionary, self.options);
let mut flush_end = 0;
let mut flush_segments = Vec::new();
for segment in &segments {
let (byte_start, byte_end) = segment_bounds(segment);
let start_chars = self.pending_text[..byte_start].chars().count();
let end_chars = self.pending_text[..byte_end].chars().count();
// Stop at a gap between segments, or once a segment starts beyond the safe
// region after something has already been accepted.
if byte_start > flush_end || (start_chars > safe_chars && flush_end > 0) {
break;
}
if end_chars > safe_chars {
break;
}
flush_end = byte_end;
flush_segments.push(segment.clone());
}
// Do not split a fallback run: if the cut would fall at the end of a run
// that continues, back the cut up to the start of that run.
if let Some(fallback_start) = trailing_fallback_run_start(&segments, flush_end) {
flush_end = fallback_start;
while flush_segments
.last()
.is_some_and(|segment| segment_bounds(segment).1 > flush_end)
{
flush_segments.pop();
}
}
if flush_end > 0 {
self.pending_unflushable_fallback_run_bytes = None;
self.flush_segments_prefix_into(flush_end, &flush_segments, output);
if !self.pending_text.chars().any(is_hanja) {
self.flush_non_hanja_safe_into(output);
}
} else if trailing_fallback_run_start(&segments, self.pending_text.len()) == Some(0) {
// The whole buffer is a single fallback run; record that so the next
// push can take the cheap path above.
self.pending_unflushable_fallback_run_bytes = Some(self.pending_text.len());
}
}
// Returns true when the buffer was previously recorded as one unflushable
// fallback run and the text appended since then keeps it that way.
fn extends_unflushable_fallback_run(&self, bound: usize) -> bool {
let Some(previous_bytes) = self.pending_unflushable_fallback_run_bytes else {
return false;
};
// Reject a marker that no longer describes a valid position in the buffer.
if previous_bytes == 0
|| previous_bytes > self.pending_text.len()
|| !self.pending_text.is_char_boundary(previous_bytes)
{
return false;
}
let appended = &self.pending_text[previous_bytes..];
if appended.is_empty() {
return true;
}
if appended.chars().any(|ch| !is_hanja(ch)) {
return false;
}
// Re-segment only the last `bound - 1` characters of the old run plus the
// appended text: that is the only region where a new word could start
// inside the old run and reach into the new text.
let probe_start = suffix_start_for_char_count(
&self.pending_text[..previous_bytes],
bound.saturating_sub(1),
);
let probe = &self.pending_text[probe_start..];
segment_text(probe, self.dictionary, self.options)
.iter()
.all(|segment| {
matches!(
segment,
Segment::Fallback { .. } | Segment::TrivialDictionary { .. }
)
})
}
// Flushes the safe prefix of a buffer; called when the buffer contains no hanja.
fn flush_non_hanja_safe_into(&mut self, output: &mut Vec<OutputToken<S>>) {
let flush_end = match self.tail_bound() {
Some(bound) => safe_non_hanja_flush_end(&self.pending_text, bound),
None => safe_unknown_bound_flush_end(&self.pending_text),
};
if let Some(flush_end) = flush_end {
self.flush_prefix_into(flush_end, output);
}
}
// Segments and emits `pending_text[..flush_end]`, leaving the rest buffered.
fn flush_prefix_into(&mut self, flush_end: usize, output: &mut Vec<OutputToken<S>>) {
if flush_end == self.pending_text.len() {
self.flush_into(output);
return;
}
self.pending_unflushable_fallback_run_bytes = None;
let prefix = self.pending_text[..flush_end].to_string();
let segments = segment_text(&prefix, self.dictionary, self.options);
self.flush_segments_prefix_into(flush_end, &segments, output);
}
// Emits `pending_text[..flush_end]` using already computed `segments` (whose
// byte offsets must refer to that prefix) and removes it from the buffer.
fn flush_segments_prefix_into(
&mut self,
flush_end: usize,
segments: &[Segment],
output: &mut Vec<OutputToken<S>>,
) {
let prefix = self.pending_text[..flush_end].to_string();
process_segments_with_state(
&prefix,
segments,
self.dictionary,
self.options,
&mut self.fallback_state,
output,
);
self.pending_text.replace_range(..flush_end, "");
}
// Segments and emits the entire buffer, leaving it empty.
fn flush_into(&mut self, output: &mut Vec<OutputToken<S>>) {
if self.pending_text.is_empty() {
return;
}
self.pending_unflushable_fallback_run_bytes = None;
let text = core::mem::take(&mut self.pending_text);
process_text_with_state(
&text,
self.dictionary,
self.options,
&mut self.fallback_state,
output,
);
}
// Forgets the reading context carried over from earlier text.
fn reset_fallback_context(&mut self) {
self.fallback_state = FallbackState::default();
}
}
/// Computes how many leading bytes of `text` can be emitted while keeping at
/// most `bound - 1` trailing characters of the last whitespace-free span
/// buffered.
///
/// Returns `None` when nothing can be emitted.
fn safe_non_hanja_flush_end(text: &str, bound: usize) -> Option<usize> {
    if text.is_empty() {
        return None;
    }
    let keep_chars = bound.saturating_sub(1);
    // Byte offset just past the last whitespace character (0 if there is none).
    let span_start = match text.char_indices().rev().find(|(_, ch)| ch.is_whitespace()) {
        Some((index, ch)) => index + ch.len_utf8(),
        None => 0,
    };
    let tail = &text[span_start..];
    let tail_chars = tail.chars().count();
    if tail_chars <= keep_chars {
        // The whole tail stays buffered; only the part before it can go.
        return if span_start > 0 { Some(span_start) } else { None };
    }
    let flushable_chars = tail_chars - keep_chars;
    let flush_end = match tail.char_indices().nth(flushable_chars) {
        Some((offset, _)) => span_start + offset,
        None => text.len(),
    };
    if flush_end > 0 {
        Some(flush_end)
    } else {
        None
    }
}
/// Byte offset just past the last whitespace character of `text`, or `None`
/// when `text` contains no whitespace.
fn safe_unknown_bound_flush_end(text: &str) -> Option<usize> {
    let (index, ch) = text
        .char_indices()
        .rev()
        .find(|(_, ch)| ch.is_whitespace())?;
    Some(index + ch.len_utf8())
}
/// Byte offset at which the last `count` characters of `text` begin.
///
/// Returns `text.len()` for `count == 0` and `0` when `text` has fewer than
/// `count` characters.
fn suffix_start_for_char_count(text: &str, count: usize) -> usize {
    // Walk back over at most `count` characters; the last one visited is the
    // first character of the suffix. Visiting none leaves the empty suffix.
    text.char_indices()
        .rev()
        .take(count)
        .last()
        .map_or(text.len(), |(index, _)| index)
}
/// If cutting at `split_byte` would split a run of fallback-like segments
/// (`Fallback` or `TrivialDictionary`), returns the byte offset where that
/// run starts.
///
/// Returns `None` when `split_byte` is zero, when no segment ends exactly at
/// `split_byte`, when the segment ending there is not fallback-like, or when
/// the segment right after it is not fallback-like.
fn trailing_fallback_run_start(segments: &[Segment], split_byte: usize) -> Option<usize> {
    let is_fallback_like = |segment: &Segment| {
        matches!(
            segment,
            Segment::Fallback { .. } | Segment::TrivialDictionary { .. }
        )
    };
    if split_byte == 0 {
        return None;
    }
    // Only the first segment ending at the split point is considered.
    let index = segments
        .iter()
        .position(|segment| segment_bounds(segment).1 == split_byte)?;
    if !is_fallback_like(&segments[index]) {
        return None;
    }
    if segments
        .get(index + 1)
        .is_some_and(|next| !is_fallback_like(next))
    {
        return None;
    }
    // Extend the run backwards over contiguous fallback-like segments.
    let mut run_start = segment_bounds(&segments[index]).0;
    for previous in segments[..index].iter().rev() {
        let (previous_start, previous_end) = segment_bounds(previous);
        if previous_end != run_start || !is_fallback_like(previous) {
            break;
        }
        run_start = previous_start;
    }
    (run_start < split_byte).then_some(run_start)
}
/// Segments `text` and appends the resulting tokens to `output`, threading
/// the reading context through `fallback_state`.
fn process_text_with_state<S, D>(
    text: &str,
    dictionary: &D,
    options: EngineOptions,
    fallback_state: &mut FallbackState,
    output: &mut Vec<OutputToken<S>>,
) where
    D: HanjaDictionary + ?Sized,
{
    process_segments_with_state(
        text,
        &segment_text(text, dictionary, options),
        dictionary,
        options,
        fallback_state,
        output,
    );
}
fn process_trivial_fallback_run<S>(
run_segments: &[Segment],
text: &str,
options: EngineOptions,
state: &mut FallbackState,
output: &mut Vec<OutputToken<S>>,
) {
let run_start = segment_bounds(&run_segments[0]).0;
let run_end = segment_bounds(&run_segments[run_segments.len() - 1]).1;
let capacity = run_end.saturating_sub(run_start);
let mut hanja = String::with_capacity(capacity);
let mut reading = String::with_capacity(capacity);
let mut has_dictionary = false;
let mut last_trivial_source: Option<char> = None;
let mut last_trivial_reading: Option<String> = None;
let mut seg_index = 0;
while seg_index < run_segments.len() {
match &run_segments[seg_index] {
Segment::TrivialDictionary {
byte_start,
byte_end,
reading: dict_reading,
suffix_reading,
..
} => {
let source = &text[*byte_start..*byte_end];
let effective = dictionary_effective_reading(
source,
dict_reading,
suffix_reading.as_deref(),
options,
state.starts_word,
state.previous_reading,
);
if !hanja.is_empty()
&& last_trivial_reading.as_deref() == Some(&effective)
&& last_trivial_source != source.chars().next()
{
output.push(OutputToken::Annotated(Annotation {
hanja: core::mem::take(&mut hanja),
reading: core::mem::take(&mut reading),
homophone: false,
require_hanja: false,
require_hangul: false,
first_in_context: true,
skip_annotation: false,
from_dictionary: has_dictionary,
from_source_gloss: false,
}));
}
hanja.push_str(source);
reading.push_str(&effective);
update_fallback_state_for_reading(&effective, state);
has_dictionary = true;
last_trivial_source = source.chars().next();
last_trivial_reading = Some(effective);
seg_index += 1;
}
Segment::Fallback { byte_start: _, .. } => {
last_trivial_source = None;
last_trivial_reading = None;
let fb_start = seg_index;
while seg_index < run_segments.len()
&& matches!(&run_segments[seg_index], Segment::Fallback { .. })
{
seg_index += 1;
}
let fb_text = &text[segment_bounds(&run_segments[fb_start]).0
..segment_bounds(&run_segments[seg_index - 1]).1];
for part in phoneticize_fallback_run_with_state(fb_text, options, state) {
match part {
FallbackPart::Annotation {
hanja: part_hanja,
reading: part_reading,
} => {
if part_hanja.chars().any(is_hanja_numeral) {
if !hanja.is_empty() {
output.push(OutputToken::Annotated(Annotation {
hanja: core::mem::take(&mut hanja),
reading: core::mem::take(&mut reading),
homophone: false,
require_hanja: false,
require_hangul: false,
first_in_context: true,
skip_annotation: false,
from_dictionary: has_dictionary,
from_source_gloss: false,
}));
has_dictionary = false;
}
output.push(OutputToken::Annotated(Annotation {
hanja: part_hanja,
reading: part_reading,
homophone: false,
require_hanja: false,
require_hangul: false,
first_in_context: true,
skip_annotation: false,
from_dictionary: false,
from_source_gloss: false,
}));
} else {
hanja.push_str(&part_hanja);
reading.push_str(&part_reading);
}
}
FallbackPart::ReadingText(t) | FallbackPart::Text(t) => {
if !hanja.is_empty() {
output.push(OutputToken::Annotated(Annotation {
hanja: core::mem::take(&mut hanja),
reading: core::mem::take(&mut reading),
homophone: false,
require_hanja: false,
require_hangul: false,
first_in_context: true,
skip_annotation: false,
from_dictionary: has_dictionary,
from_source_gloss: false,
}));
has_dictionary = false;
}
push_text(output, &t);
}
}
}
}
_ => unreachable!("run must contain only TrivialDictionary | Fallback"),
}
}
if !hanja.is_empty() {
output.push(OutputToken::Annotated(Annotation {
hanja,
reading,
homophone: false,
require_hanja: false,
require_hangul: false,
first_in_context: true,
skip_annotation: false,
from_dictionary: has_dictionary,
from_source_gloss: false,
}));
}
}
fn process_segments_with_state<S, D>(
text: &str,
segments: &[Segment],
_dictionary: &D,
options: EngineOptions,
fallback_state: &mut FallbackState,
output: &mut Vec<OutputToken<S>>,
) where
D: HanjaDictionary + ?Sized,
{
let mut index = 0;
while index < segments.len() {
match &segments[index] {
Segment::Dictionary {
byte_start,
byte_end,
reading,
suffix_reading,
mark,
} => {
let source = &text[*byte_start..*byte_end];
let effective = dictionary_effective_reading(
source,
reading,
suffix_reading.as_deref(),
options,
fallback_state.starts_word,
fallback_state.previous_reading,
);
output.push(OutputToken::Annotated(Annotation {
hanja: source.to_string(),
homophone: false,
reading: effective.clone(),
require_hanja: mark.require_hanja,
require_hangul: mark.require_hangul,
first_in_context: true,
skip_annotation: false,
from_dictionary: true,
from_source_gloss: false,
}));
if should_preserve_dictionary_context(source, &effective, options) {
update_fallback_state_for_reading(&effective, fallback_state);
} else {
*fallback_state = FallbackState::default();
}
index += 1;
}
Segment::TrivialDictionary {
byte_start,
byte_end,
..
}
| Segment::Fallback {
byte_start,
byte_end,
} => {
let run_start = index;
let mut merged_end = *byte_end;
while let Some(
Segment::TrivialDictionary {
byte_end: next_end, ..
}
| Segment::Fallback {
byte_end: next_end, ..
},
) = segments.get(index + 1)
{
merged_end = *next_end;
index += 1;
}
let has_dictionary = segments[run_start..=index]
.iter()
.any(|s| matches!(s, Segment::TrivialDictionary { .. }));
if has_dictionary {
process_trivial_fallback_run(
&segments[run_start..=index],
text,
options,
fallback_state,
output,
);
} else {
process_fallback_text(
&text[*byte_start..merged_end],
options,
fallback_state,
output,
);
}
index += 1;
}
Segment::NumeralText { text, .. } => {
push_text(output, text);
update_fallback_state_for_text(text, fallback_state);
index += 1;
}
Segment::Text {
byte_start,
byte_end,
} => {
let text_segment = &text[*byte_start..*byte_end];
push_text(output, text_segment);
update_fallback_state_for_text(text_segment, fallback_state);
index += 1;
}
}
}
}
/// Returns the `(byte_start, byte_end)` range of any segment kind.
fn segment_bounds(segment: &Segment) -> (usize, usize) {
    // Every variant carries the same two fields, so one irrefutable
    // or-pattern covers them all.
    let (Segment::Dictionary {
        byte_start,
        byte_end,
        ..
    }
    | Segment::TrivialDictionary {
        byte_start,
        byte_end,
        ..
    }
    | Segment::Fallback {
        byte_start,
        byte_end,
        ..
    }
    | Segment::NumeralText {
        byte_start,
        byte_end,
        ..
    }
    | Segment::Text {
        byte_start,
        byte_end,
        ..
    }) = segment;
    (*byte_start, *byte_end)
}
/// Phoneticizes a pure fallback run and appends the result to `output`:
/// annotation parts become annotations not marked as dictionary-derived,
/// text parts are appended as plain text.
fn process_fallback_text<S>(
    text: &str,
    options: EngineOptions,
    state: &mut FallbackState,
    output: &mut Vec<OutputToken<S>>,
) {
    for part in phoneticize_fallback_run_with_state(text, options, state) {
        let (hanja, reading) = match part {
            FallbackPart::Annotation { hanja, reading } => (hanja, reading),
            FallbackPart::ReadingText(plain) | FallbackPart::Text(plain) => {
                push_text(output, &plain);
                continue;
            }
        };
        output.push(OutputToken::Annotated(Annotation {
            hanja,
            reading,
            homophone: false,
            require_hanja: false,
            require_hangul: false,
            first_in_context: true,
            skip_annotation: false,
            from_dictionary: false,
            from_source_gloss: false,
        }));
    }
}
/// Updates the reading context after plain `text` has been emitted.
///
/// Empty text leaves the state untouched. Text ending in an alphanumeric
/// character marks the next reading as word-internal and records that
/// character; any other ending (whitespace, punctuation, ...) resets the
/// state.
fn update_fallback_state_for_text(text: &str, state: &mut FallbackState) {
    let Some(tail) = text.chars().next_back() else {
        return;
    };
    if !tail.is_whitespace() && tail.is_alphanumeric() {
        state.starts_word = false;
        state.previous_reading = Some(tail);
    } else {
        *state = FallbackState::default();
    }
}
/// Picks the reading to emit for a dictionary match, given the word context.
///
/// * With a `suffix_reading`: `reading` is used at the start of a word when
///   the initial sound law is enabled, the suffix reading otherwise.
/// * For a single character whose base reading changes under the initial
///   sound law, and whose dictionary reading is either of those two forms:
///   the law-applied form is used when the law is enabled and the character
///   starts a word or `should_apply_yeol_yul` accepts the previous reading;
///   otherwise the base form is used.
/// * In every other case `reading` is returned unchanged.
fn dictionary_effective_reading(
    source: &str,
    reading: &str,
    suffix_reading: Option<&str>,
    options: EngineOptions,
    starts_word: bool,
    previous_reading: Option<char>,
) -> String {
    if let Some(suffix) = suffix_reading {
        let chosen = if starts_word && options.initial_sound_law {
            reading
        } else {
            suffix
        };
        return chosen.to_string();
    }
    let mut chars = source.chars();
    let single_char = match (chars.next(), chars.next()) {
        (Some(ch), None) => Some(ch),
        _ => None,
    };
    if let Some(base) = single_char.and_then(|ch| phoneticize_hanja_char(ch)) {
        let initial = apply_initial_sound_law_to_first_syllable(base);
        if initial != base && (reading == base || reading == initial) {
            let apply_law = options.initial_sound_law
                && (starts_word || should_apply_yeol_yul(previous_reading, base));
            return if apply_law { initial } else { base.to_string() };
        }
    }
    reading.to_string()
}
/// Decides whether a dictionary match should keep contributing to the fallback context.
///
/// A reading made only of whitespace (including an empty one) never does. A source that
/// is not purely hanja always does. A purely-hanja source does when its reading equals
/// the fallback reading of the same run, or when the reading has exactly one hangul
/// syllable per source character.
fn should_preserve_dictionary_context(source: &str, reading: &str, options: EngineOptions) -> bool {
    if reading.chars().all(char::is_whitespace) {
        return false;
    }
    if !source.chars().all(is_hanja) {
        return true;
    }
    fallback_reading_for_run(source, options).is_some_and(|fallback| fallback == reading)
        || has_one_hangul_syllable_per_hanja(source, reading)
}
fn has_one_hangul_syllable_per_hanja(source: &str, reading: &str) -> bool {
let source_len = source.chars().count();
let mut reading_len = 0;
for ch in reading.chars() {
if !is_hangul_syllable(ch) {
return false;
}
reading_len += 1;
}
reading_len == source_len
}
/// Returns `true` for characters in `U+AC00..=U+D7A3`, the precomposed Hangul Syllables.
fn is_hangul_syllable(ch: char) -> bool {
    matches!(ch, '\u{ac00}'..='\u{d7a3}')
}
/// Updates the fallback `state` after an annotation with `reading` has been emitted.
///
/// The last non-whitespace character of `reading` decides the outcome: an alphanumeric
/// character marks the next run as word-internal and is recorded as the previous
/// reading; anything else, including a reading with no such character, resets `state`.
fn update_fallback_state_for_reading(reading: &str, state: &mut FallbackState) {
    match reading.chars().rev().find(|ch| !ch.is_whitespace()) {
        Some(last) if last.is_alphanumeric() => {
            state.starts_word = false;
            state.previous_reading = Some(last);
        }
        _ => *state = FallbackState::default(),
    }
}
/// Appends `text` to `output`, extending a trailing text token instead of adding a new
/// one when possible. Empty text is ignored.
fn push_text<S>(output: &mut Vec<OutputToken<S>>, text: &str) {
    if text.is_empty() {
        return;
    }
    if let Some(OutputToken::Text(existing)) = output.last_mut() {
        existing.push_str(text);
    } else {
        output.push(OutputToken::Text(text.to_string()));
    }
}
/// Returns `true` when `ch` falls in one of the code point ranges this crate treats as
/// hanja.
///
/// NOTE(review): the first range spans all of `U+2F00..=U+2FFF`, which also takes in the
/// Ideographic Description Characters at `U+2FF0..=U+2FFF`, not only the Kangxi
/// radicals. Confirm whether that is intended.
pub fn is_hanja(ch: char) -> bool {
    matches!(
        ch,
        // Kangxi radicals (and the rest of the U+2Fxx row).
        '\u{2F00}'..='\u{2FFF}'
            // Ideographic number zero.
            | '\u{3007}'
            // CJK Unified Ideographs Extension A.
            | '\u{3400}'..='\u{4DBF}'
            // CJK Unified Ideographs.
            | '\u{4E00}'..='\u{9FFF}'
            // CJK Compatibility Ideographs.
            | '\u{F900}'..='\u{FAFF}'
            // Extension B.
            | '\u{20000}'..='\u{2A6DF}'
            // Contiguous run from U+2A700 through U+2EE5F (Extension C onwards).
            | '\u{2A700}'..='\u{2EE5F}'
            // CJK Compatibility Ideographs Supplement.
            | '\u{2F800}'..='\u{2FA1F}'
            // Contiguous run from U+30000 through U+3347F (Extension G onwards).
            | '\u{30000}'..='\u{3347F}'
    )
}
/// Selects how an annotated hanja word is written out by the renderer.
///
/// The descriptions below apply to annotations that do not have `skip_annotation` set;
/// a skipped annotation is always rendered as a single plain text.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum RenderMode {
/// Emit the hangul reading; the hanja is appended in parentheses only when the
/// annotation has `require_hanja` or `homophone` set.
HangulOnly,
/// Emit the hangul reading followed by the hanja in parentheses.
HangulHanjaParens,
/// Emit the hanja followed by the hangul reading in parentheses.
HanjaHangulParens,
/// Emit a ruby pair; the [`RubyBase`] picks which side becomes the base text.
Ruby(RubyBase),
/// Emit the hanja as written; when the annotation has `require_hangul` set, the reading
/// is added as a gloss in the form chosen by [`RenderOptions::original_gloss`].
Original,
}
/// Chooses which text is the ruby base and which is the ruby annotation.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum RubyBase {
/// The hangul reading is the base and the hanja is the ruby text.
OnHangul,
/// The hanja is the base and the hangul reading is the ruby text.
OnHanja,
}
/// Form of the reading gloss added in [`RenderMode::Original`] when an annotation has
/// `require_hangul` set.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum OriginalGloss {
/// Write the reading in parentheses after the hanja (the default).
#[default]
Parens,
/// Write the reading as ruby text over the hanja.
Ruby,
}
/// Options controlling how annotations are rendered.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct RenderOptions {
/// Overall rendering mode.
pub mode: RenderMode,
/// Gloss form used by [`RenderMode::Original`] for annotations that require hangul.
pub original_gloss: OriginalGloss,
}
impl Default for RenderOptions {
    /// Defaults to [`RenderMode::HangulOnly`] with parenthesized original glosses.
    fn default() -> Self {
        let mode = RenderMode::HangulOnly;
        let original_gloss = OriginalGloss::Parens;
        Self {
            mode,
            original_gloss,
        }
    }
}
impl From<RenderMode> for RenderOptions {
    /// Wraps a bare mode, using the default [`OriginalGloss`].
    fn from(mode: RenderMode) -> Self {
        let original_gloss = OriginalGloss::default();
        Self {
            mode,
            original_gloss,
        }
    }
}
/// Extent of the token window over which context-sensitive passes operate.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ContextWindow {
/// No context: tokens pass through without being buffered or processed.
Off,
/// A context is flushed whenever a scope reporting `is_block_boundary` opens or closes.
PerBlock,
/// A context is flushed whenever a scope reporting `is_section_boundary` opens.
PerSection,
/// The whole token stream forms a single context, processed when the stream ends.
PerDocument,
}
/// Source of evidence used to decide whether an annotation has a homophone.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum HomophoneDetection {
/// Only compare against other dictionary annotations in the same context (the default).
#[default]
ContextLocal,
/// Additionally consult the dictionary, through an index of its entries when it can
/// enumerate them and through per-annotation lookups otherwise.
DictionaryWide,
}
/// Effect a user directive has on a matching annotation.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum DirectiveAction {
/// Set the annotation's `require_hanja` flag.
RequireHanja,
/// Set the annotation's `require_hangul` flag.
RequireHangul,
/// Set the annotation's `skip_annotation` flag.
SkipAnnotation,
}
/// Ordered collection of user-supplied rules that set flags on matching annotations.
#[derive(Default)]
pub struct UserDirectives<'a> {
// Rules are evaluated in insertion order by `apply`.
rules: Vec<UserDirectiveRule<'a>>,
}
impl<'a> UserDirectives<'a> {
    /// Creates an empty directive set.
    pub fn new() -> Self {
        Self { rules: Vec::new() }
    }

    /// Adds a rule that sets `require_hanja` on annotations whose hanja equals `hanja`.
    pub fn require_hanja(&mut self, hanja: impl Into<String>) {
        self.add_literal(hanja, DirectiveAction::RequireHanja);
    }

    /// Adds a rule that sets `require_hangul` on annotations whose hanja equals `hanja`.
    pub fn require_hangul(&mut self, hanja: impl Into<String>) {
        self.add_literal(hanja, DirectiveAction::RequireHangul);
    }

    /// Adds a rule that sets `skip_annotation` on annotations whose hanja equals `hanja`.
    pub fn skip_annotation(&mut self, hanja: impl Into<String>) {
        self.add_literal(hanja, DirectiveAction::SkipAnnotation);
    }

    /// Adds a rule applying `action` to annotations whose hanja equals `hanja` exactly.
    pub fn add_literal(&mut self, hanja: impl Into<String>, action: DirectiveAction) {
        let predicate = UserDirectivePredicate::Literal(hanja.into());
        self.rules.push(UserDirectiveRule { predicate, action });
    }

    /// Adds a rule applying `action` to annotations accepted by `predicate`.
    pub fn add_predicate(
        &mut self,
        predicate: impl Fn(&Annotation) -> bool + 'a,
        action: DirectiveAction,
    ) {
        let predicate = UserDirectivePredicate::Predicate(Box::new(predicate));
        self.rules.push(UserDirectiveRule { predicate, action });
    }

    /// Returns `true` when no rule has been added.
    pub fn is_empty(&self) -> bool {
        self.rules.is_empty()
    }

    /// Applies every matching rule to `token`, in insertion order.
    ///
    /// Only annotated tokens are affected; any other token is returned unchanged. Each
    /// rule is matched against the annotation as already modified by earlier rules.
    pub fn apply<S>(&self, token: OutputToken<S>) -> OutputToken<S> {
        match token {
            OutputToken::Annotated(mut annotation) => {
                for rule in &self.rules {
                    if rule.predicate.matches(&annotation) {
                        let flag = match rule.action {
                            DirectiveAction::RequireHanja => &mut annotation.require_hanja,
                            DirectiveAction::RequireHangul => &mut annotation.require_hangul,
                            DirectiveAction::SkipAnnotation => &mut annotation.skip_annotation,
                        };
                        *flag = true;
                    }
                }
                OutputToken::Annotated(annotation)
            }
            other => other,
        }
    }
}
/// One user directive: which annotations it matches and what it does to them.
struct UserDirectiveRule<'a> {
// Test deciding whether the rule applies to an annotation.
predicate: UserDirectivePredicate<'a>,
// Flag to set on annotations that match.
action: DirectiveAction,
}
/// Matching strategy of a [`UserDirectiveRule`].
enum UserDirectivePredicate<'a> {
/// Matches annotations whose hanja equals this string exactly.
Literal(String),
/// Matches annotations for which the closure returns `true`.
Predicate(Box<dyn Fn(&Annotation) -> bool + 'a>),
}
impl UserDirectivePredicate<'_> {
    /// Returns `true` when this predicate accepts `annotation`.
    fn matches(&self, annotation: &Annotation) -> bool {
        match self {
            Self::Predicate(accepts) => accepts(annotation),
            Self::Literal(expected) => annotation.hanja == *expected,
        }
    }
}
/// Marks homophones in `tokens` using [`HomophoneDetection::ContextLocal`].
///
/// Shorthand for [`mark_homophones_with_detection`] with context-local detection.
pub fn mark_homophones<S, D>(
    tokens: impl IntoIterator<Item = OutputToken<S>>,
    dictionary: &D,
    window: ContextWindow,
) -> Vec<OutputToken<S>>
where
    S: ScopeData,
    D: HanjaDictionary + ?Sized,
{
    let detection = HomophoneDetection::ContextLocal;
    mark_homophones_with_detection(tokens, dictionary, window, detection)
}
/// Sets the `homophone` flag on annotations in `tokens`, one context window at a time.
///
/// With [`ContextWindow::Off`] the tokens are returned untouched. With
/// [`HomophoneDetection::DictionaryWide`] an index is built from the dictionary's
/// entries; if the dictionary cannot provide them, each annotation is checked with the
/// dictionary's own `has_homophone` instead.
pub fn mark_homophones_with_detection<S, D>(
    tokens: impl IntoIterator<Item = OutputToken<S>>,
    dictionary: &D,
    window: ContextWindow,
    detection: HomophoneDetection,
) -> Vec<OutputToken<S>>
where
    S: ScopeData,
    D: HanjaDictionary + ?Sized,
{
    if window == ContextWindow::Off {
        return tokens.into_iter().collect();
    }
    let (index, lookup_fallback) = match detection {
        HomophoneDetection::ContextLocal => (None, None),
        HomophoneDetection::DictionaryWide => {
            let index = HomophoneIndex::from_dictionary(dictionary);
            // Fall back to per-annotation lookups only when no index could be built.
            let lookup_fallback = index.is_none().then_some(dictionary);
            (index, lookup_fallback)
        }
    };
    let middleware = ContextMiddleware::new(window, |context: &mut [OutputToken<S>]| {
        mark_homophones_in_context(context, index.as_ref(), lookup_fallback);
    });
    middleware.process(tokens)
}
/// Updates `first_in_context` on every annotation in `tokens`, one context window at a
/// time, and clears the `require_*` flags of repeated annotations that did not come from
/// a source gloss.
pub fn filter_first_occurrences<S>(
    tokens: impl IntoIterator<Item = OutputToken<S>>,
    window: ContextWindow,
) -> Vec<OutputToken<S>>
where
    S: ScopeData,
{
    let middleware = ContextMiddleware::new(window, filter_first_occurrences_in_context::<S>);
    middleware.process(tokens)
}
/// Function pointer run over each buffered context by [`FirstOccurrenceFilter`].
type ContextApply<S> = fn(&mut [OutputToken<S>]);
/// Boxed closure run over each buffered context by [`HomophoneMarker`].
type HomophoneApply<'a, S> = Box<dyn FnMut(&mut [OutputToken<S>]) + 'a>;
/// Streaming (push-based) counterpart of [`mark_homophones_with_detection`].
pub struct HomophoneMarker<'a, S>
where
S: ScopeData,
{
// Buffers tokens per context window and runs the homophone pass on each flush.
inner: ContextMiddleware<S, HomophoneApply<'a, S>>,
}
impl<'a, S> HomophoneMarker<'a, S>
where
S: ScopeData,
{
pub fn new<D>(dictionary: &'a D, window: ContextWindow) -> Self
where
D: HanjaDictionary + ?Sized,
{
Self::with_detection(dictionary, window, HomophoneDetection::ContextLocal)
}
pub fn with_detection<D>(
dictionary: &'a D,
window: ContextWindow,
detection: HomophoneDetection,
) -> Self
where
D: HanjaDictionary + ?Sized,
{
let index = match detection {
_ if window == ContextWindow::Off => None,
HomophoneDetection::ContextLocal => None,
HomophoneDetection::DictionaryWide => HomophoneIndex::from_dictionary(dictionary),
};
let lookup_fallback = match detection {
HomophoneDetection::ContextLocal => None,
HomophoneDetection::DictionaryWide => index.is_none().then_some(dictionary),
};
Self {
inner: ContextMiddleware::new(
window,
Box::new(move |tokens| {
mark_homophones_in_context(tokens, index.as_ref(), lookup_fallback);
}),
),
}
}
pub fn push_token(&mut self, token: OutputToken<S>) -> Vec<OutputToken<S>> {
self.inner.push_token(token)
}
pub fn finish(self) -> Vec<OutputToken<S>> {
self.inner.finish()
}
}
/// Streaming (push-based) counterpart of [`filter_first_occurrences`].
pub struct FirstOccurrenceFilter<S>
where
S: ScopeData,
{
// Buffers tokens per context window and runs the first-occurrence pass on each flush.
inner: ContextMiddleware<S, ContextApply<S>>,
}
impl<S> FirstOccurrenceFilter<S>
where
S: ScopeData,
{
pub fn new(window: ContextWindow) -> Self {
Self {
inner: ContextMiddleware::new(window, filter_first_occurrences_in_context::<S>),
}
}
pub fn push_token(&mut self, token: OutputToken<S>) -> Vec<OutputToken<S>> {
self.inner.push_token(token)
}
pub fn finish(self) -> Vec<OutputToken<S>> {
self.inner.finish()
}
}
/// Streaming pass that folds a parenthesized gloss written next to a hanja word in the
/// source (for example `漢字(한자)` or `한자(漢字)`) into the annotation itself.
pub struct RedundantParenCollapser<S>
where
S: ScopeData,
{
// When false, tokens pass through unchanged.
enabled: bool,
// Trailing text withheld from output because it may belong to a hangul-first gloss.
held_tail: String,
// Annotation waiting for the text after it to be classified.
pending_annotation: Option<Annotation>,
// Withheld text that directly precedes the pending annotation.
preceding: String,
// Text accumulated after the pending annotation.
following: String,
_scope: PhantomData<fn(S)>,
}
impl<S> RedundantParenCollapser<S>
where
S: ScopeData,
{
/// Creates a collapser; when `enabled` is `false` it forwards tokens untouched.
pub fn new(enabled: bool) -> Self {
Self {
enabled,
held_tail: String::new(),
pending_annotation: None,
preceding: String::new(),
following: String::new(),
_scope: PhantomData,
}
}
/// Feeds one token in and returns the tokens that became final as a result.
///
/// * An annotation first resolves any annotation already pending, then becomes the
///   pending one, with the currently held text tail as its preceding text.
/// * Text is appended after the pending annotation (and classification is retried), or,
///   when nothing is pending, to the held tail, of which only the prefix that cannot be
///   part of a gloss is emitted.
/// * Any other token resolves the pending annotation, flushes held text, and is emitted.
pub fn push_token(&mut self, token: OutputToken<S>) -> Vec<OutputToken<S>> {
if !self.enabled {
return Vec::from([token]);
}
let mut output = Vec::new();
match token {
OutputToken::Annotated(annotation) => {
self.finalize_pending(&mut output);
// Whatever text is still held directly precedes the new annotation.
self.preceding = core::mem::take(&mut self.held_tail);
self.pending_annotation = Some(annotation);
}
OutputToken::Text(text) => {
if self.pending_annotation.is_some() {
self.following.push_str(&text);
self.resolve_following(&mut output);
} else {
self.held_tail.push_str(&text);
self.emit_held_prefix(&mut output);
}
}
boundary => {
self.finalize_pending(&mut output);
if !self.held_tail.is_empty() {
output.push(OutputToken::Text(core::mem::take(&mut self.held_tail)));
}
output.push(boundary);
}
}
output
}
/// Resolves the pending annotation, if any, and returns all remaining buffered tokens.
///
/// Returns an empty vector when the collapser is disabled, because nothing is ever
/// buffered in that mode.
pub fn finish(mut self) -> Vec<OutputToken<S>> {
if !self.enabled {
return Vec::new();
}
let mut output = Vec::new();
self.finalize_pending(&mut output);
if !self.held_tail.is_empty() {
output.push(OutputToken::Text(core::mem::take(&mut self.held_tail)));
}
output
}
/// Emits the part of `held_tail` before the position computed by
/// `hangul_first_tail_start`, keeping the remainder held.
fn emit_held_prefix(&mut self, output: &mut Vec<OutputToken<S>>) {
let split = hangul_first_tail_start(&self.held_tail);
if split > 0 {
let suffix = self.held_tail.split_off(split);
// After `split_off`, `held_tail` holds the prefix; swap the suffix back in.
let prefix = core::mem::replace(&mut self.held_tail, suffix);
output.push(OutputToken::Text(prefix));
}
}
/// Forces a decision on the pending annotation, if there is one.
fn finalize_pending(&mut self, output: &mut Vec<OutputToken<S>>) {
if self.pending_annotation.is_some() {
self.decide_following(true, output);
}
}
/// Retries classification of the pending annotation without forcing a decision.
fn resolve_following(&mut self, output: &mut Vec<OutputToken<S>>) {
self.decide_following(false, output);
}
/// Classifies the text around the pending annotation and acts on the result.
///
/// Callers must only invoke this while an annotation is pending. When the classifier
/// asks for more input, nothing is emitted and all state is left as it was.
fn decide_following(&mut self, flush: bool, output: &mut Vec<OutputToken<S>>) {
let annotation = self
.pending_annotation
.as_ref()
.expect("decide_following called with a pending annotation");
match classify_following(&self.preceding, annotation, &self.following, flush) {
FollowingMatch::NeedMore => return,
FollowingMatch::NoMatch => {
if !self.preceding.is_empty() {
output.push(OutputToken::Text(core::mem::take(&mut self.preceding)));
}
output.push(OutputToken::Annotated(
self.pending_annotation.take().expect("pending annotation"),
));
// The text after the annotation becomes the new held tail.
self.held_tail = core::mem::take(&mut self.following);
}
FollowingMatch::HanjaFirst {
collapsed,
leftover,
} => {
if !self.preceding.is_empty() {
output.push(OutputToken::Text(core::mem::take(&mut self.preceding)));
}
output.push(OutputToken::Annotated(collapsed));
self.pending_annotation = None;
self.held_tail = leftover;
self.following.clear();
}
FollowingMatch::HangulFirst {
remaining_preceding,
collapsed,
leftover,
} => {
// The reading and the opening parenthesis were consumed from the preceding text.
if !remaining_preceding.is_empty() {
output.push(OutputToken::Text(remaining_preceding));
}
output.push(OutputToken::Annotated(collapsed));
self.pending_annotation = None;
self.preceding.clear();
self.held_tail = leftover;
self.following.clear();
}
}
self.emit_held_prefix(output);
}
}
/// Upper bound on the number of trailing hangul syllables withheld from output by
/// `hangul_first_tail_start`.
const MAX_PRECEDING_READING_CHARS: usize = 64;
/// Returns the byte offset at which the tail of `text` that might start a hangul-first
/// gloss begins.
///
/// The tail is an optional final `(` preceded by at most
/// `MAX_PRECEDING_READING_CHARS` hangul syllables. When `text` ends with neither, the
/// result is `text.len()`.
fn hangul_first_tail_start(text: &str) -> usize {
    let body = text.strip_suffix('(').unwrap_or(text);
    let held_bytes: usize = body
        .chars()
        .rev()
        .take(MAX_PRECEDING_READING_CHARS)
        .take_while(|&ch| is_hangul_syllable(ch))
        .map(char::len_utf8)
        .sum();
    body.len() - held_bytes
}
/// Runs [`RedundantParenCollapser`] over a whole token stream.
///
/// When `enabled` is `false` the tokens are collected and returned unchanged.
pub fn collapse_redundant_parens<S>(
    tokens: impl IntoIterator<Item = OutputToken<S>>,
    enabled: bool,
) -> Vec<OutputToken<S>>
where
    S: ScopeData,
{
    if !enabled {
        return tokens.into_iter().collect();
    }
    let mut collapser = RedundantParenCollapser::new(true);
    let mut output: Vec<OutputToken<S>> = tokens
        .into_iter()
        .flat_map(|token| collapser.push_token(token))
        .collect();
    output.extend(collapser.finish());
    output
}
/// Outcome of comparing a gloss found in the source with an annotation's reading.
enum ReadingMatch {
/// The gloss equals the annotation's reading; keep it.
Keep,
/// The gloss is a different but valid reading; use it in place of the annotation's.
Override(String),
}
/// Compares a gloss `candidate` with the `reading` of an annotation for `hanja`.
///
/// Returns `Keep` on an exact match, `Override` when the candidate is another valid
/// reading of `hanja`, and `None` otherwise.
fn classify_reading(hanja: &str, reading: &str, candidate: &str) -> Option<ReadingMatch> {
    if candidate == reading {
        return Some(ReadingMatch::Keep);
    }
    is_valid_alternative_reading(hanja, candidate)
        .then(|| ReadingMatch::Override(candidate.to_string()))
}
/// Returns `true` when `candidate` has exactly one valid syllable for each character of
/// a non-empty `hanja`.
fn is_valid_alternative_reading(hanja: &str, candidate: &str) -> bool {
    if hanja.is_empty() || hanja.chars().count() != candidate.chars().count() {
        return false;
    }
    hanja
        .chars()
        .zip(candidate.chars())
        .all(|(hanja_char, syllable)| is_valid_char_reading(hanja_char, syllable))
}
/// Returns `true` when the hangul `syllable` is an acceptable reading of `source`.
///
/// A character with no known readings is only accepted when it is itself that syllable.
/// Otherwise the syllable must equal one of the known readings or match one through
/// `reading_matches_with_initial_sound_law`.
fn is_valid_char_reading(source: char, syllable: char) -> bool {
    if !is_hangul_syllable(syllable) {
        return false;
    }
    let readings = khangul_all_readings(source);
    if !readings.is_empty() {
        return readings.iter().any(|reading| {
            reading_is_syllable(reading, syllable)
                || reading_matches_with_initial_sound_law(reading, syllable)
        });
    }
    source == syllable
}
/// Returns `true` when `reading` is exactly the single character `syllable`.
fn reading_is_syllable(reading: &str, syllable: char) -> bool {
    let mut chars = reading.chars();
    matches!(
        (chars.next(), chars.next()),
        (Some(first), None) if first == syllable
    )
}
fn collapse_annotation(mut annotation: Annotation, reading_match: ReadingMatch) -> Annotation {
if let ReadingMatch::Override(reading) = reading_match {
annotation.reading = reading;
}
annotation.require_hanja = true;
annotation.require_hangul = true;
annotation.from_source_gloss = true;
annotation
}
/// Result of classifying the text around a pending annotation.
enum FollowingMatch {
/// Not enough text has arrived to decide yet.
NeedMore,
/// The surrounding text is not a gloss of the annotation.
NoMatch,
/// The annotation is followed by a parenthesized reading, as in `漢字(한자)`.
HanjaFirst {
/// Annotation with the gloss folded in.
collapsed: Annotation,
/// Text remaining after the closing parenthesis.
leftover: String,
},
/// The annotation is itself the parenthesized part, as in `한자(漢字)`.
HangulFirst {
/// Preceding text left over once the reading and `(` are removed.
remaining_preceding: String,
/// Annotation with the gloss folded in.
collapsed: Annotation,
/// Text remaining after the closing parenthesis.
leftover: String,
},
}
/// Classifies the text around `annotation` as a hanja-first gloss, a hangul-first gloss,
/// no gloss, or not yet decidable.
///
/// `flush` means no more text will arrive, so an undecided state resolves to `NoMatch`.
/// An unclosed `(` is also given up on once its content is longer than both the
/// annotation's reading and its hanja, counted in characters.
fn classify_following(
    preceding: &str,
    annotation: &Annotation,
    following: &str,
    flush: bool,
) -> FollowingMatch {
    if following.is_empty() {
        return if flush {
            FollowingMatch::NoMatch
        } else {
            FollowingMatch::NeedMore
        };
    }

    // `reading(hanja)`: the annotation sits between the parentheses.
    if let Some(leftover) = following.strip_prefix(')') {
        return match match_hangul_first(preceding, annotation, following) {
            Some((remaining_preceding, collapsed)) => FollowingMatch::HangulFirst {
                remaining_preceding,
                collapsed,
                leftover: leftover.to_string(),
            },
            None => FollowingMatch::NoMatch,
        };
    }

    // `hanja(reading)`: anything not starting with `(` cannot be a gloss.
    let Some(content) = following.strip_prefix('(') else {
        return FollowingMatch::NoMatch;
    };
    match content.split_once(')') {
        Some((candidate, leftover)) => {
            match classify_reading(&annotation.hanja, &annotation.reading, candidate) {
                Some(reading_match) => FollowingMatch::HanjaFirst {
                    collapsed: collapse_annotation(annotation.clone(), reading_match),
                    leftover: leftover.to_string(),
                },
                None => FollowingMatch::NoMatch,
            }
        }
        None => {
            let reading_len = annotation.reading.chars().count();
            let hanja_len = annotation.hanja.chars().count();
            let max_reading = reading_len.max(hanja_len);
            if flush || content.chars().count() > max_reading {
                FollowingMatch::NoMatch
            } else {
                FollowingMatch::NeedMore
            }
        }
    }
}
/// Tries to recognize `reading(` at the end of `preceding` with `)` at the start of
/// `following`, around `annotation`.
///
/// On success returns the preceding text with the reading and `(` removed, plus the
/// collapsed annotation. The annotation's own reading is tried first; failing that, the
/// last N characters before `(` are tried as an alternative reading, where N is the
/// number of characters in the hanja.
fn match_hangul_first(
    preceding: &str,
    annotation: &Annotation,
    following: &str,
) -> Option<(String, Annotation)> {
    if !following.starts_with(')') {
        return None;
    }
    let before = preceding.strip_suffix('(')?;

    if !annotation.reading.is_empty()
        && let Some(remaining) = before.strip_suffix(&annotation.reading)
    {
        let collapsed = collapse_annotation(annotation.clone(), ReadingMatch::Keep);
        return Some((remaining.to_string(), collapsed));
    }

    // An empty hanja has no syllables to match against.
    let skip = annotation.hanja.chars().count().checked_sub(1)?;
    let (split, _) = before.char_indices().rev().nth(skip)?;
    let (remaining, candidate) = before.split_at(split);
    let reading_match = classify_reading(&annotation.hanja, &annotation.reading, candidate)?;
    let collapsed = collapse_annotation(annotation.clone(), reading_match);
    Some((remaining.to_string(), collapsed))
}
/// Applies `directives` to every token and collects the results.
pub fn apply_user_directives<S>(
    tokens: impl IntoIterator<Item = OutputToken<S>>,
    directives: &UserDirectives<'_>,
) -> Vec<OutputToken<S>> {
    let applied = apply_user_directives_iter(tokens, directives);
    applied.collect()
}
/// Lazily applies `directives` to each token as the returned iterator is consumed.
pub fn apply_user_directives_iter<'a, S>(
    tokens: impl IntoIterator<Item = OutputToken<S>> + 'a,
    directives: &'a UserDirectives<'_>,
) -> impl Iterator<Item = OutputToken<S>> + 'a {
    let upstream = tokens.into_iter();
    upstream.map(move |token| directives.apply(token))
}
/// Buffers tokens into context windows and runs `apply` over each window before
/// releasing its tokens.
struct ContextMiddleware<S, F>
where
S: ScopeData,
F: FnMut(&mut [OutputToken<S>]),
{
// Window kind deciding when the buffered context is flushed.
window: ContextWindow,
// Pass run over the buffered context on each flush.
apply: F,
// Tokens of the context currently being accumulated.
context: Vec<OutputToken<S>>,
// One entry per open scope: whether opening it counted as a window boundary.
scope_boundaries: Vec<bool>,
}
impl<S, F> ContextMiddleware<S, F>
where
    S: ScopeData,
    F: FnMut(&mut [OutputToken<S>]),
{
    /// Creates a middleware with an empty buffer.
    fn new(window: ContextWindow, apply: F) -> Self {
        Self {
            window,
            apply,
            context: Vec::new(),
            scope_boundaries: Vec::new(),
        }
    }

    /// Runs the middleware over a complete token stream.
    fn process(mut self, tokens: impl IntoIterator<Item = OutputToken<S>>) -> Vec<OutputToken<S>> {
        let mut output: Vec<OutputToken<S>> = Vec::new();
        for token in tokens {
            output.append(&mut self.push_token(token));
        }
        output.append(&mut self.finish());
        output
    }

    /// Feeds one token in and returns the tokens released by it.
    ///
    /// * `Off` releases the token immediately, unprocessed.
    /// * `PerDocument` only buffers.
    /// * `PerBlock` / `PerSection` flush the buffer before a scope that is a boundary for
    ///   the window kind opens; `PerBlock` also flushes right after such a scope closes.
    fn push_token(&mut self, token: OutputToken<S>) -> Vec<OutputToken<S>> {
        let mut output = Vec::new();
        let per_block = match self.window {
            ContextWindow::Off => {
                output.push(token);
                return output;
            }
            ContextWindow::PerDocument => {
                self.context.push(token);
                return output;
            }
            ContextWindow::PerBlock => true,
            ContextWindow::PerSection => false,
        };
        match &token {
            OutputToken::Open(scope) => {
                let data = scope.data();
                let is_boundary = if per_block {
                    data.is_block_boundary()
                } else {
                    data.is_section_boundary()
                };
                if is_boundary {
                    self.flush_context(&mut output);
                }
                self.scope_boundaries.push(is_boundary);
                self.context.push(token);
            }
            OutputToken::Close => {
                // An unmatched close is treated as closing a non-boundary scope.
                let closes_boundary = self.scope_boundaries.pop().unwrap_or(false);
                self.context.push(token);
                if closes_boundary && per_block {
                    self.flush_context(&mut output);
                }
            }
            _ => self.context.push(token),
        }
        output
    }

    /// Processes and returns whatever is still buffered.
    fn finish(mut self) -> Vec<OutputToken<S>> {
        let mut output = Vec::new();
        self.flush_context(&mut output);
        output
    }

    /// Runs `apply` over a non-empty buffer and moves its tokens into `output`.
    fn flush_context(&mut self, output: &mut Vec<OutputToken<S>>) {
        if !self.context.is_empty() {
            (self.apply)(&mut self.context);
            output.append(&mut self.context);
        }
    }
}
/// Dictionary-wide lookup table from a reading to every hanja form that has it.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
struct HomophoneIndex {
// Key: reading; value: the distinct hanja forms recorded with that reading.
forms_by_reading: BTreeMap<String, BTreeSet<String>>,
}
impl HomophoneIndex {
    /// Builds the index from the dictionary's entries, or returns `None` when the
    /// dictionary's `entries` call yields nothing to iterate.
    fn from_dictionary<D>(dictionary: &D) -> Option<Self>
    where
        D: HanjaDictionary + ?Sized,
    {
        let entries = dictionary.entries()?;
        let mut forms_by_reading: BTreeMap<String, BTreeSet<String>> = BTreeMap::new();
        for record in entries {
            let forms = forms_by_reading.entry(record.reading).or_default();
            forms.insert(record.hanja);
        }
        Some(Self { forms_by_reading })
    }

    /// Returns `true` when some form other than `hanja` is recorded under `reading`.
    fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
        match self.forms_by_reading.get(reading) {
            Some(forms) => forms.iter().any(|form| form != hanja),
            None => false,
        }
    }
}
/// Sets the `homophone` flag of every annotation in one context.
///
/// Only annotations that came from the dictionary can be homophones. Such an annotation
/// is flagged when, in this order, the `index` knows another form with the same reading,
/// the `lookup_fallback` dictionary reports a homophone, or the context itself contains
/// more than one distinct dictionary form with that reading. All other annotations have
/// the flag cleared.
fn mark_homophones_in_context<S, D>(
    tokens: &mut [OutputToken<S>],
    index: Option<&HomophoneIndex>,
    lookup_fallback: Option<&D>,
) where
    D: HanjaDictionary + ?Sized,
{
    // First pass: collect the distinct dictionary forms seen for each reading.
    let mut forms_by_reading: BTreeMap<String, BTreeSet<String>> = BTreeMap::new();
    let dictionary_annotations = tokens.iter().filter_map(|token| match token {
        OutputToken::Annotated(annotation) if annotation.from_dictionary => Some(annotation),
        _ => None,
    });
    for annotation in dictionary_annotations {
        forms_by_reading
            .entry(annotation.reading.clone())
            .or_default()
            .insert(annotation.hanja.clone());
    }

    // Second pass: decide the flag for every annotation.
    for token in tokens.iter_mut() {
        let OutputToken::Annotated(annotation) = token else {
            continue;
        };
        let is_homophone = annotation.from_dictionary
            && (index
                .is_some_and(|index| index.has_homophone(&annotation.hanja, &annotation.reading))
                || lookup_fallback.is_some_and(|dictionary| {
                    dictionary.has_homophone(&annotation.hanja, &annotation.reading)
                })
                || forms_by_reading
                    .get(&annotation.reading)
                    .is_some_and(|forms| forms.len() > 1));
        annotation.homophone = is_homophone;
    }
}
/// Records, for each annotation in one context, whether its hanja appears there for the
/// first time.
///
/// Repeated annotations also lose their `require_hanja` and `require_hangul` flags,
/// unless they come from a gloss in the source.
fn filter_first_occurrences_in_context<S>(tokens: &mut [OutputToken<S>]) {
    let mut seen = BTreeSet::new();
    for token in tokens.iter_mut() {
        let OutputToken::Annotated(annotation) = token else {
            continue;
        };
        let is_first = seen.insert(annotation.hanja.clone());
        annotation.first_in_context = is_first;
        if !is_first && !annotation.from_source_gloss {
            annotation.require_hanja = false;
            annotation.require_hangul = false;
        }
    }
}
/// Renders every token with the given options and collects the results.
pub fn render_tokens<S, O>(
    tokens: impl IntoIterator<Item = OutputToken<S>>,
    options: O,
) -> Vec<RenderedToken<S>>
where
    S: ScopeData,
    O: Into<RenderOptions>,
{
    let rendered = render_tokens_iter(tokens, options);
    rendered.collect()
}
/// Lazily renders tokens: each output token is produced when the iterator is advanced.
pub fn render_tokens_iter<S, O>(
    tokens: impl IntoIterator<Item = OutputToken<S>>,
    options: O,
) -> impl Iterator<Item = RenderedToken<S>>
where
    S: ScopeData,
    O: Into<RenderOptions>,
{
    let upstream = tokens.into_iter();
    let renderer = Renderer::new(options);
    RendererIter { upstream, renderer }
}
/// Streaming renderer that turns output tokens into rendered tokens one at a time,
/// tracking whether the enclosing scopes allow inline markup.
pub struct Renderer<S>
where
S: ScopeData,
{
// Rendering options fixed at construction.
options: RenderOptions,
// One entry per open scope: whether that scope allows inline markup.
markup_stack: Vec<bool>,
// Number of currently open scopes that do not allow inline markup.
disallowing_ancestors: usize,
_scope: PhantomData<fn(S)>,
}
impl<S> Renderer<S>
where
    S: ScopeData,
{
    /// Creates a renderer with no open scopes.
    pub fn new<O>(options: O) -> Self
    where
        O: Into<RenderOptions>,
    {
        let options = options.into();
        Self {
            options,
            markup_stack: Vec::new(),
            disallowing_ancestors: 0,
            _scope: PhantomData,
        }
    }

    /// Renders a single token.
    ///
    /// Scope tokens update the inline-markup bookkeeping and pass through; text and
    /// verbatim tokens pass through; annotations are rendered per the options, with
    /// inline markup allowed only when no open scope disallows it.
    pub fn push_token(&mut self, token: OutputToken<S>) -> RenderedToken<S> {
        match token {
            OutputToken::Text(text) => RenderedToken::Text(text),
            OutputToken::Verbatim(text) => RenderedToken::Verbatim(text),
            OutputToken::Annotated(annotation) => {
                let allows_inline_markup = self.disallowing_ancestors == 0;
                render_annotation(&annotation, &self.options, allows_inline_markup)
            }
            OutputToken::Open(scope) => {
                let allows = scope.data().allows_inline_markup();
                self.markup_stack.push(allows);
                if !allows {
                    self.disallowing_ancestors += 1;
                }
                RenderedToken::Open(scope)
            }
            OutputToken::Close => {
                let closed_disallowing = self.markup_stack.pop() == Some(false);
                if closed_disallowing {
                    self.disallowing_ancestors = self.disallowing_ancestors.saturating_sub(1);
                }
                RenderedToken::Close
            }
        }
    }
}
/// Iterator adapter that feeds each upstream token through a [`Renderer`].
struct RendererIter<I, S>
where
S: ScopeData,
{
// Source of output tokens.
upstream: I,
// Renderer carrying the scope state between tokens.
renderer: Renderer<S>,
}
impl<I, S> Iterator for RendererIter<I, S>
where
    I: Iterator<Item = OutputToken<S>>,
    S: ScopeData,
{
    type Item = RenderedToken<S>;

    /// Pulls the next upstream token and renders it; ends when the upstream ends.
    fn next(&mut self) -> Option<Self::Item> {
        match self.upstream.next() {
            Some(token) => Some(self.renderer.push_token(token)),
            None => None,
        }
    }
}
/// Renders one annotation according to `options`.
///
/// An annotation with `skip_annotation` set is rendered as a single plain text: the
/// reading for the hangul-led modes, the hanja for the hanja-led ones. Otherwise the
/// mode decides between plain text, a parenthesized pair, and ruby; ruby falls back to
/// parentheses when `allows_inline_markup` is `false`.
fn render_annotation<S>(
    annotation: &Annotation,
    options: &RenderOptions,
    allows_inline_markup: bool,
) -> RenderedToken<S> {
    let reading = &annotation.reading;
    let hanja = &annotation.hanja;

    if annotation.skip_annotation {
        let primary = match options.mode {
            RenderMode::HangulOnly
            | RenderMode::HangulHanjaParens
            | RenderMode::Ruby(RubyBase::OnHangul) => reading,
            RenderMode::HanjaHangulParens
            | RenderMode::Original
            | RenderMode::Ruby(RubyBase::OnHanja) => hanja,
        };
        return RenderedToken::Text(primary.clone());
    }

    match options.mode {
        RenderMode::HangulOnly => {
            if annotation.require_hanja || annotation.homophone {
                RenderedToken::Text(parens(reading, hanja))
            } else {
                RenderedToken::Text(reading.clone())
            }
        }
        RenderMode::HangulHanjaParens => RenderedToken::Text(parens(reading, hanja)),
        RenderMode::HanjaHangulParens => RenderedToken::Text(parens(hanja, reading)),
        RenderMode::Ruby(base) => render_ruby(annotation, base, allows_inline_markup),
        RenderMode::Original => {
            if !annotation.require_hangul {
                return RenderedToken::Text(hanja.clone());
            }
            match options.original_gloss {
                OriginalGloss::Parens => RenderedToken::Text(parens(hanja, reading)),
                OriginalGloss::Ruby => {
                    render_ruby(annotation, RubyBase::OnHanja, allows_inline_markup)
                }
            }
        }
    }
}
/// Renders `annotation` as a ruby pair with the given base side, or as `base(rt)` plain
/// text when inline markup is not allowed.
fn render_ruby<S>(
    annotation: &Annotation,
    base: RubyBase,
    allows_inline_markup: bool,
) -> RenderedToken<S> {
    let (base_text, rt_text) = match base {
        RubyBase::OnHanja => (&annotation.hanja, &annotation.reading),
        RubyBase::OnHangul => (&annotation.reading, &annotation.hanja),
    };
    if allows_inline_markup {
        RenderedToken::Ruby {
            base: base_text.clone(),
            rt: rt_text.clone(),
        }
    } else {
        RenderedToken::Text(parens(base_text, rt_text))
    }
}
/// Formats the first argument followed by the second in ASCII parentheses.
///
/// The parameter names reflect only one call order; callers also pass the hanja first
/// and the reading second to get `hanja(reading)`.
fn parens(reading: &str, hanja: &str) -> String {
    let mut output = String::with_capacity(reading.len() + hanja.len() + 2);
    output += reading;
    output.push('(');
    output += hanja;
    output.push(')');
    output
}
/// Converts plain text with default engine options.
///
/// Shorthand for [`convert_plain_text_with_options`] with `EngineOptions::default()`.
pub fn convert_plain_text<D, R>(input: &str, dictionary: &D, render: R) -> String
where
    D: HanjaDictionary + ?Sized,
    R: Into<RenderOptions>,
{
    let options = EngineOptions::default();
    convert_plain_text_with_options(input, dictionary, render, options)
}
/// Converts plain text by running the whole pipeline in order: read the input into
/// tokens, process them against the dictionary, collapse glosses already present in the
/// source, mark homophones per block, render, and write the result back as plain text.
pub fn convert_plain_text_with_options<D, R>(
    input: &str,
    dictionary: &D,
    render: R,
    options: EngineOptions,
) -> String
where
    D: HanjaDictionary + ?Sized,
    R: Into<RenderOptions>,
{
    let source_tokens = read_plain_text(input);
    let processed = process_tokens_with_options(source_tokens, dictionary, options);
    let collapsed = collapse_redundant_parens(processed, true);
    let marked = mark_homophones(collapsed, dictionary, ContextWindow::PerBlock);
    let rendered = render_tokens(marked, render);
    write_plain_text(rendered)
}