use gukhanmun_core::{
Annotation, ChainDictionary, ContextWindow, DirectiveAction, Engine, FirstOccurrenceFilter,
HanjaDictionary, HomophoneDetection, HomophoneMarker, InputToken, NumeralStrategy, OutputToken,
PlainScopeData, Recovery, RenderOptions, RenderedToken, ScopeData, SegmentationStrategy,
UserDirectives, apply_user_directives, apply_user_directives_iter, filter_first_occurrences,
mark_homophones_with_detection, process_tokens_iter_with_options, read_plain_text,
recover_input_tokens, render_tokens_iter, write_plain_text,
};
#[cfg(not(feature = "stdict"))]
use crate::error::Error;
use crate::error::Result;
use crate::options::{ConversionOptions, Preset};
#[cfg(feature = "html")]
use gukhanmun_html::{
HtmlElementInfo, HtmlReaderOptions, HtmlScopeData, try_read_html_fragment_iter_with_options,
write_html_fragment,
};
#[cfg(feature = "markdown")]
use gukhanmun_markdown::{MarkdownScopeData, MarkdownVariant, read_markdown_iter, write_markdown};
/// Lazy iterator adapter that drives an [`Engine`] from a stream of input tokens.
///
/// Input tokens are pulled from `upstream` one at a time and pushed into the
/// engine; whatever each push returns is buffered and yielded before more
/// input is read. When `upstream` runs dry the engine is consumed with
/// `finish` and its remaining output is yielded.
struct EngineIter<'a, S, D, I>
where
S: ScopeData,
D: HanjaDictionary + ?Sized + 'a,
I: Iterator<Item = InputToken<S>>,
{
/// Source of input tokens that have not been fed to the engine yet.
upstream: I,
/// The engine; becomes `None` once it has been consumed by `finish`.
engine: Option<Engine<'a, S, D>>,
/// Output of the most recent push/finish call that has not been yielded yet.
buffer: std::vec::IntoIter<OutputToken<S>>,
}
impl<'a, S, D, I> Iterator for EngineIter<'a, S, D, I>
where
    S: ScopeData,
    D: HanjaDictionary + ?Sized + 'a,
    I: Iterator<Item = InputToken<S>>,
{
    type Item = OutputToken<S>;

    /// Yields the next output token.
    ///
    /// Buffered output is drained first. Once the buffer is empty, one more
    /// input token is pushed into the engine; when the upstream is exhausted
    /// the engine is consumed via `finish`. After the output of `finish` has
    /// been drained, every further call returns `None`.
    fn next(&mut self) -> Option<Self::Item> {
        loop {
            // Hand out whatever the previous step produced before doing more work.
            if let Some(ready) = self.buffer.next() {
                return Some(ready);
            }

            // No engine left means `finish` already ran and its output is drained.
            let engine = self.engine.as_mut()?;

            // Refill the buffer; an empty refill simply sends us around the loop again.
            self.buffer = match self.upstream.next() {
                Some(input) => engine.push_token(input).into_iter(),
                None => self
                    .engine
                    .take()
                    .expect("engine present")
                    .finish()
                    .into_iter(),
            };
        }
    }
}
/// Owned, type-erased dictionary valid for `'a`; the element type stored by
/// [`Builder`] and chained inside [`Converter`].
type BoxedDictionary<'a> = Box<dyn HanjaDictionary + 'a>;
/// Collects the configuration needed to construct a [`Converter`].
///
/// Holds conversion options, the dictionaries to chain, user directives and
/// (with the `html` feature) HTML reader options; [`Builder::build`] turns
/// them into a [`Converter`].
pub struct Builder<'a> {
/// Engine, rendering, homophone, first-occurrence and recovery settings.
options: ConversionOptions,
/// Whether `build` should add the bundled `gukhanmun_stdict` dictionary.
bundled_stdict: bool,
/// Caller-supplied dictionaries, in the order they were pushed.
dictionaries: Vec<BoxedDictionary<'a>>,
/// User directives handed over to the converter.
directives: UserDirectives<'a>,
/// Options passed to the HTML fragment reader.
#[cfg(feature = "html")]
html_reader_options: HtmlReaderOptions<'a>,
}
impl Default for Builder<'_> {
    /// Creates a builder seeded from the default [`Preset`].
    fn default() -> Self {
        let preset = Preset::default();
        Self::with_preset(preset)
    }
}
impl<'a> Builder<'a> {
    /// Creates a builder with the default configuration (same as [`Default::default`]).
    pub fn new() -> Self {
        Default::default()
    }

    /// Creates a builder whose options and bundled-dictionary flag come from
    /// `preset`; it starts with no dictionaries and empty user directives.
    pub fn with_preset(preset: Preset) -> Self {
        let options = preset.options();
        let bundled_stdict = preset.includes_bundled_stdict();
        Self {
            options,
            bundled_stdict,
            dictionaries: Vec::new(),
            directives: UserDirectives::new(),
            #[cfg(feature = "html")]
            html_reader_options: HtmlReaderOptions::new(),
        }
    }

    /// Replaces the rendering options with `rendering` (converted via `Into`).
    pub fn rendering(mut self, rendering: impl Into<RenderOptions>) -> Self {
        let rendering: RenderOptions = rendering.into();
        self.options.rendering = rendering;
        self
    }

    /// Sets the engine's segmentation strategy.
    pub fn segmentation(mut self, strategy: SegmentationStrategy) -> Self {
        let engine = &mut self.options.engine;
        engine.segmentation = strategy;
        self
    }

    /// Sets the engine's numeral strategy.
    pub fn numerals(mut self, strategy: NumeralStrategy) -> Self {
        let engine = &mut self.options.engine;
        engine.numeral_strategy = strategy;
        self
    }

    /// Sets the engine's `initial_sound_law` flag.
    pub fn initial_sound_law(mut self, enabled: bool) -> Self {
        let engine = &mut self.options.engine;
        engine.initial_sound_law = enabled;
        self
    }

    /// Sets the context window used by the homophone-marking stage.
    pub fn homophone_window(mut self, window: ContextWindow) -> Self {
        self.options.homophone_window = window;
        self
    }

    /// Sets the detection mode used by the homophone-marking stage.
    pub fn homophone_detection(mut self, detection: HomophoneDetection) -> Self {
        self.options.homophone_detection = detection;
        self
    }

    /// Sets the context window used by the first-occurrence filter.
    pub fn first_occurrence_window(mut self, window: ContextWindow) -> Self {
        self.options.first_occurrence_window = window;
        self
    }

    /// Sets the recovery mode applied when reading input tokens.
    pub fn recovery(mut self, recovery: Recovery) -> Self {
        self.options.recovery = recovery;
        self
    }

    /// Opts out of the bundled dictionary.
    pub fn no_bundled_stdict(self) -> Self {
        Self {
            bundled_stdict: false,
            ..self
        }
    }

    /// Requests the bundled dictionary; [`Builder::build`] fails if the
    /// `stdict` feature is not enabled.
    pub fn bundled_stdict(self) -> Self {
        Self {
            bundled_stdict: true,
            ..self
        }
    }

    /// Boxes `dictionary` and appends it to the list of dictionaries.
    pub fn push_dictionary<D>(self, dictionary: D) -> Self
    where
        D: HanjaDictionary + 'a,
    {
        let boxed: BoxedDictionary<'a> = Box::new(dictionary);
        self.push_boxed_dictionary(boxed)
    }

    /// Appends an already boxed dictionary to the list of dictionaries.
    pub fn push_boxed_dictionary(mut self, dictionary: BoxedDictionary<'a>) -> Self {
        self.dictionaries.push(dictionary);
        self
    }

    /// Adds a literal user directive: `action` for the given `hanja` string.
    pub fn directive(mut self, hanja: impl Into<String>, action: DirectiveAction) -> Self {
        self.directives.add_literal(hanja, action);
        self
    }

    /// Adds a predicate-based user directive: `action` for annotations
    /// accepted by `predicate`.
    pub fn directive_predicate(
        mut self,
        predicate: impl Fn(&Annotation) -> bool + 'a,
        action: DirectiveAction,
    ) -> Self {
        self.directives.add_predicate(predicate, action);
        self
    }

    /// Replaces the whole directive set, discarding directives added so far.
    pub fn directives(self, directives: UserDirectives<'a>) -> Self {
        Self { directives, ..self }
    }

    /// Installs `predicate` on the HTML reader options via `preserve_when`.
    ///
    /// NOTE(review): the options are rebuilt from `HtmlReaderOptions::new()`,
    /// so anything configured on them by an earlier call is discarded —
    /// confirm that this is intended.
    #[cfg(feature = "html")]
    pub fn html_preserve_when<F>(self, predicate: F) -> Self
    where
        F: Fn(&HtmlElementInfo<'_>) -> bool + 'a,
    {
        Self {
            html_reader_options: HtmlReaderOptions::new().preserve_when(predicate),
            ..self
        }
    }

    /// Consumes the builder and produces a [`Converter`].
    ///
    /// With the `stdict` feature enabled and the bundled dictionary
    /// requested, `gukhanmun_stdict::ko_kr()` is appended after the
    /// caller-supplied dictionaries before they are chained.
    ///
    /// # Errors
    ///
    /// Returns `Error::Config` when the bundled dictionary is requested but
    /// the crate was compiled without the `stdict` feature.
    pub fn build(self) -> Result<Converter<'a>> {
        let Self {
            options,
            bundled_stdict,
            dictionaries,
            directives,
            #[cfg(feature = "html")]
            html_reader_options,
        } = self;

        #[cfg(feature = "stdict")]
        let dictionaries = if bundled_stdict {
            let mut with_bundled = dictionaries;
            with_bundled.push(Box::new(gukhanmun_stdict::ko_kr()));
            with_bundled
        } else {
            dictionaries
        };

        // Kept inside a block: attributes cannot be attached directly to an
        // `if` expression.
        #[cfg(not(feature = "stdict"))]
        {
            if bundled_stdict {
                return Err(Error::Config(
                    "bundled Standard Korean Language Dictionary requested but the `stdict` \
                     feature is disabled"
                        .into(),
                ));
            }
        }

        Ok(Converter {
            options,
            dictionary: ChainDictionary::from_iter(dictionaries),
            directives,
            #[cfg(feature = "html")]
            html_reader_options,
        })
    }
}
/// A configured conversion pipeline, produced by [`Builder::build`].
///
/// Runs input tokens through the engine, homophone marking, first-occurrence
/// filtering, user directives and rendering, either buffered (the
/// `*_to_string` methods) or as a lazy iterator (the `*_iter` methods).
pub struct Converter<'a> {
/// Engine, rendering, homophone, first-occurrence and recovery settings.
options: ConversionOptions,
/// All configured dictionaries combined into one chain.
dictionary: ChainDictionary<BoxedDictionary<'a>>,
/// User directives applied after first-occurrence filtering.
directives: UserDirectives<'a>,
/// Options passed to the HTML fragment reader.
#[cfg(feature = "html")]
html_reader_options: HtmlReaderOptions<'a>,
}
impl<'a> Converter<'a> {
    /// Returns a copy of the conversion options in effect.
    pub fn options(&self) -> ConversionOptions {
        self.options
    }

    /// Returns the chained dictionary used for lookups.
    pub fn dictionary(&self) -> &ChainDictionary<BoxedDictionary<'a>> {
        &self.dictionary
    }

    /// Returns the user directives applied by this converter.
    pub fn directives(&self) -> &UserDirectives<'a> {
        &self.directives
    }

    /// Returns the options used when reading HTML fragments.
    #[cfg(feature = "html")]
    pub fn html_reader_options(&self) -> &HtmlReaderOptions<'a> {
        &self.html_reader_options
    }

    /// Converts plain text and returns the result as a `String`.
    ///
    /// No step on this path is fallible as written, so the result is always `Ok`.
    pub fn convert_text_to_string(&self, input: &str) -> Result<String> {
        let tokens = read_plain_text(input);
        Ok(write_plain_text(self.run_buffered(tokens)))
    }

    /// Converts an HTML fragment and returns the result as a `String`.
    ///
    /// # Errors
    ///
    /// Propagates the error returned by `recover_input_tokens` for the
    /// configured recovery mode.
    #[cfg(feature = "html")]
    pub fn convert_html_fragment_to_string(&self, input: &str) -> Result<String> {
        let reader = try_read_html_fragment_iter_with_options(input, &self.html_reader_options);
        let tokens = recover_input_tokens(reader, self.options.recovery)?;
        Ok(write_html_fragment(self.run_buffered(tokens)))
    }

    /// Converts Markdown of the given `variant` and returns the result as a `String`.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `write_markdown`.
    #[cfg(feature = "markdown")]
    pub fn convert_markdown_to_string(
        &self,
        input: &str,
        variant: MarkdownVariant,
    ) -> Result<String> {
        let tokens = gukhanmun_markdown::read_markdown(input, variant);
        let markdown = write_markdown(self.run_buffered(tokens))?;
        Ok(markdown)
    }

    /// Lazily converts plain text, yielding rendered tokens.
    pub fn convert_text_iter<'b>(
        &'b self,
        input: &'b str,
    ) -> impl Iterator<Item = RenderedToken<PlainScopeData>> + 'b {
        let tokens = read_plain_text(input);
        self.convert_tokens(tokens)
    }

    /// Lazily converts an HTML fragment, yielding rendered tokens.
    ///
    /// # Errors
    ///
    /// Propagates the error returned by `recover_input_tokens` for the
    /// configured recovery mode.
    #[cfg(feature = "html")]
    pub fn convert_html_fragment_iter<'b>(
        &'b self,
        input: &'b str,
    ) -> Result<impl Iterator<Item = RenderedToken<HtmlScopeData>> + 'b> {
        let reader = try_read_html_fragment_iter_with_options(input, &self.html_reader_options);
        let tokens = recover_input_tokens(reader, self.options.recovery)?;
        Ok(self.convert_tokens(tokens))
    }

    /// Lazily converts Markdown of the given `variant`, yielding rendered tokens.
    #[cfg(feature = "markdown")]
    pub fn convert_markdown_iter<'b>(
        &'b self,
        input: &'b str,
        variant: MarkdownVariant,
    ) -> impl Iterator<Item = RenderedToken<MarkdownScopeData>> + 'b {
        self.convert_tokens(read_markdown_iter(input, variant))
    }

    /// Streams `input` through the whole pipeline: engine, homophone marking,
    /// first-occurrence filtering, user directives, then rendering.
    ///
    /// The returned iterator borrows the converter for `'b`.
    pub fn convert_tokens<'b, S, I>(
        &'b self,
        input: I,
    ) -> impl Iterator<Item = RenderedToken<S>> + 'b
    where
        S: ScopeData + 'b,
        I: IntoIterator<Item = InputToken<S>> + 'b,
        I::IntoIter: 'b,
    {
        // Build the stage state up front, in the same order the stages run.
        let upstream = input.into_iter();
        let engine = Engine::<S, _>::with_options(&self.dictionary, self.options.engine);
        let marker = HomophoneMarker::with_detection(
            &self.dictionary,
            self.options.homophone_window,
            self.options.homophone_detection,
        );
        let filter = FirstOccurrenceFilter::new(self.options.first_occurrence_window);

        // Wire the stages together; each one lazily pulls from the previous.
        let engine_stage = EngineIter {
            upstream,
            engine: Some(engine),
            buffer: Vec::new().into_iter(),
        };
        let homophone_stage = MiddlewareIter::new(
            engine_stage,
            marker,
            HomophoneMarker::push_token,
            HomophoneMarker::finish,
        );
        let first_occurrence_stage = MiddlewareIter::new(
            homophone_stage,
            filter,
            FirstOccurrenceFilter::push_token,
            FirstOccurrenceFilter::finish,
        );
        let directed = apply_user_directives_iter(first_occurrence_stage, &self.directives);
        render_tokens_iter(directed, self.options.rendering)
    }

    /// Runs the same stages as `convert_tokens` using the non-streaming core
    /// helpers and collects the rendered tokens into a `Vec`.
    fn run_buffered<S>(
        &self,
        input_tokens: impl IntoIterator<Item = InputToken<S>>,
    ) -> Vec<RenderedToken<S>>
    where
        S: ScopeData,
    {
        let engine_output =
            process_tokens_iter_with_options(input_tokens, &self.dictionary, self.options.engine);
        let marked = mark_homophones_with_detection(
            engine_output,
            &self.dictionary,
            self.options.homophone_window,
            self.options.homophone_detection,
        );
        let filtered = filter_first_occurrences(marked, self.options.first_occurrence_window);
        let directed = apply_user_directives(filtered, &self.directives);
        render_tokens_iter(directed, self.options.rendering).collect()
    }
}
/// Lazy iterator adapter that runs a stateful middleware over a stream of
/// output tokens.
///
/// Each upstream token is handed to the `push` callback together with the
/// middleware state; the tokens it returns are buffered and yielded. When the
/// upstream is exhausted, the `finish` callback consumes the state and its
/// output is yielded last.
struct MiddlewareIter<I, M, S, P, F>
where
I: Iterator<Item = OutputToken<S>>,
P: FnMut(&mut M, OutputToken<S>) -> Vec<OutputToken<S>>,
F: FnOnce(M) -> Vec<OutputToken<S>>,
S: ScopeData,
{
/// Source of tokens not yet passed to the middleware.
upstream: I,
/// Middleware state; becomes `None` once `finish` has consumed it.
middleware: Option<M>,
/// Callback invoked for every upstream token.
push: P,
/// One-shot callback invoked after the upstream is exhausted.
finish: Option<F>,
/// Output of the most recent callback that has not been yielded yet.
buffer: std::vec::IntoIter<OutputToken<S>>,
}
impl<I, M, S, P, F> MiddlewareIter<I, M, S, P, F>
where
    I: Iterator<Item = OutputToken<S>>,
    P: FnMut(&mut M, OutputToken<S>) -> Vec<OutputToken<S>>,
    F: FnOnce(M) -> Vec<OutputToken<S>>,
    S: ScopeData,
{
    /// Wraps `upstream` with the given middleware state and callbacks.
    ///
    /// The adapter starts with an empty output buffer; nothing is pulled from
    /// `upstream` until the iterator is advanced.
    fn new(upstream: I, middleware: M, push: P, finish: F) -> Self {
        let middleware = Some(middleware);
        let finish = Some(finish);
        let buffer = Vec::new().into_iter();
        Self {
            upstream,
            middleware,
            push,
            finish,
            buffer,
        }
    }
}
impl<I, M, S, P, F> Iterator for MiddlewareIter<I, M, S, P, F>
where
    I: Iterator<Item = OutputToken<S>>,
    P: FnMut(&mut M, OutputToken<S>) -> Vec<OutputToken<S>>,
    F: FnOnce(M) -> Vec<OutputToken<S>>,
    S: ScopeData,
{
    type Item = OutputToken<S>;

    /// Yields the next token produced by the middleware.
    ///
    /// Buffered output is drained first. Once the buffer is empty, the next
    /// upstream token is passed to `push`; when the upstream is exhausted the
    /// middleware state is consumed by `finish`. After that output has been
    /// drained, every further call returns `None`.
    fn next(&mut self) -> Option<Self::Item> {
        loop {
            // Hand out whatever the previous callback produced first.
            if let Some(ready) = self.buffer.next() {
                return Some(ready);
            }

            // No middleware left means `finish` already ran and its output is drained.
            let middleware = self.middleware.as_mut()?;

            let produced = match self.upstream.next() {
                Some(input) => (self.push)(middleware, input),
                None => {
                    // Upstream is done: consume the state exactly once.
                    let state = self.middleware.take().expect("middleware present");
                    let finish = self.finish.take().expect("finish callback present");
                    finish(state)
                }
            };
            // An empty batch simply sends us around the loop again.
            self.buffer = produced.into_iter();
        }
    }
}