pub mod japanese_base_form;
pub mod japanese_compound_word;
pub mod japanese_kana;
pub mod japanese_katakana_stem;
pub mod japanese_keep_tags;
pub mod japanese_number;
pub mod japanese_reading_form;
pub mod japanese_stop_tags;
pub mod keep_words;
pub mod korean_keep_tags;
pub mod korean_reading_form;
pub mod korean_stop_tags;
pub mod length;
pub mod lowercase;
pub mod mapping;
pub mod remove_diacritical_mark;
pub mod stop_words;
pub mod uppercase;
use serde_json::Value;
use std::ops::Deref;
use crate::parse_cli_flag;
use crate::token::Token;
use crate::token_filter::japanese_base_form::{
JAPANESE_BASE_FORM_TOKEN_FILTER_NAME, JapaneseBaseFormTokenFilter,
};
use crate::token_filter::japanese_compound_word::{
JAPANESE_COMPOUND_WORD_TOKEN_FILTER_NAME, JapaneseCompoundWordTokenFilter,
};
use crate::token_filter::japanese_kana::{
JAPANESE_KANA_TOKEN_FILTER_NAME, JapaneseKanaTokenFilter,
};
use crate::token_filter::japanese_katakana_stem::{
JAPANESE_KATAKANA_STEM_TOKEN_FILTER_NAME, JapaneseKatakanaStemTokenFilter,
};
use crate::token_filter::japanese_keep_tags::{
JAPANESE_KEEP_TAGS_TOKEN_FILTER_NAME, JapaneseKeepTagsTokenFilter,
};
use crate::token_filter::japanese_number::{
JAPANESE_NUMBER_TOKEN_FILTER_NAME, JapaneseNumberTokenFilter,
};
use crate::token_filter::japanese_reading_form::{
JAPANESE_READING_FORM_TOKEN_FILTER_NAME, JapaneseReadingFormTokenFilter,
};
use crate::token_filter::japanese_stop_tags::{
JAPANESE_STOP_TAGS_TOKEN_FILTER_NAME, JapaneseStopTagsTokenFilter,
};
use crate::token_filter::keep_words::{KEEP_WORDS_TOKEN_FILTER_NAME, KeepWordsTokenFilter};
use crate::token_filter::korean_keep_tags::{
KOREAN_KEEP_TAGS_TOKEN_FILTER_NAME, KoreanKeepTagsTokenFilter,
};
use crate::token_filter::korean_reading_form::{
KOREAN_READING_FORM_TOKEN_FILTER_NAME, KoreanReadingFormTokenFilter,
};
use crate::token_filter::korean_stop_tags::{
KOREAN_STOP_TAGS_TOKEN_FILTER_NAME, KoreanStopTagsTokenFilter,
};
use crate::token_filter::length::{LENGTH_TOKEN_FILTER_NAME, LengthTokenFilter};
use crate::token_filter::lowercase::{LOWERCASE_TOKEN_FILTER_NAME, LowercaseTokenFilter};
use crate::token_filter::mapping::{MAPPING_TOKEN_FILTER_NAME, MappingTokenFilter};
use crate::token_filter::remove_diacritical_mark::{
REMOVE_DIACRITICAL_TOKEN_FILTER_NAME, RemoveDiacriticalMarkTokenFilter,
};
use crate::token_filter::stop_words::{STOP_WORDS_TOKEN_FILTER_NAME, StopWordsTokenFilter};
use crate::token_filter::uppercase::{UPPERCASE_TOKEN_FILTER_NAME, UppercaseTokenFilter};
use crate::{LinderaErrorKind, LinderaResult};
/// Common interface implemented by every token filter in this module.
///
/// Implementors must be `Send + Sync + 'static` and must also provide
/// [`TokenFilterClone`], which is how a boxed filter is duplicated.
pub trait TokenFilter: TokenFilterClone + Send + Sync + 'static {
    /// Returns the filter's name as a static string.
    fn name(&self) -> &'static str;

    /// Runs the filter over `tokens`; the vector is borrowed mutably, so the
    /// filter may change it in place.
    ///
    /// # Errors
    ///
    /// Returns a `LinderaResult` error when the filter fails; the exact
    /// conditions depend on the implementation.
    fn apply(&self, tokens: &mut Vec<Token<'_>>) -> LinderaResult<()>;
}
/// Owned, type-erased token filter: a newtype around a boxed [`TokenFilter`]
/// trait object.
pub struct BoxTokenFilter(Box<dyn TokenFilter + Send + Sync + 'static>);
/// Allows a [`BoxTokenFilter`] to be used directly as a `dyn TokenFilter`.
impl Deref for BoxTokenFilter {
    type Target = dyn TokenFilter;

    /// Borrows the trait object stored inside the box.
    fn deref(&self) -> &Self::Target {
        self.0.as_ref()
    }
}
/// Wraps any concrete [`TokenFilter`] in a [`BoxTokenFilter`].
impl<T> From<T> for BoxTokenFilter
where
    T: TokenFilter,
{
    /// Moves `token_filter` onto the heap and erases its concrete type.
    fn from(token_filter: T) -> Self {
        Self(Box::new(token_filter))
    }
}
/// Cloning hook for token filters that are handled as trait objects.
///
/// `TokenFilter` names this trait as a supertrait, so `box_clone` is callable
/// on any `dyn TokenFilter`.
pub trait TokenFilterClone {
/// Returns a `BoxTokenFilter` for this filter; the blanket implementation in
/// this file produces it by cloning `self`.
fn box_clone(&self) -> BoxTokenFilter;
}
/// Blanket implementation: every [`TokenFilter`] that is also `Clone` gets
/// `box_clone` automatically.
impl<T> TokenFilterClone for T
where
    T: TokenFilter + Clone + 'static,
{
    /// Clones `self` and wraps the copy in a new [`BoxTokenFilter`].
    fn box_clone(&self) -> BoxTokenFilter {
        let duplicate: T = self.clone();
        duplicate.into()
    }
}
pub struct TokenFilterLoader {}
impl TokenFilterLoader {
pub fn load_from_value(kind: &str, value: &Value) -> LinderaResult<BoxTokenFilter> {
let token_filter = match kind {
JAPANESE_BASE_FORM_TOKEN_FILTER_NAME => {
BoxTokenFilter::from(JapaneseBaseFormTokenFilter::from_config(value)?)
}
JAPANESE_COMPOUND_WORD_TOKEN_FILTER_NAME => {
BoxTokenFilter::from(JapaneseCompoundWordTokenFilter::from_config(value)?)
}
JAPANESE_KANA_TOKEN_FILTER_NAME => {
BoxTokenFilter::from(JapaneseKanaTokenFilter::from_config(value)?)
}
JAPANESE_KATAKANA_STEM_TOKEN_FILTER_NAME => {
BoxTokenFilter::from(JapaneseKatakanaStemTokenFilter::from_config(value)?)
}
JAPANESE_KEEP_TAGS_TOKEN_FILTER_NAME => {
BoxTokenFilter::from(JapaneseKeepTagsTokenFilter::from_config(value)?)
}
JAPANESE_NUMBER_TOKEN_FILTER_NAME => {
BoxTokenFilter::from(JapaneseNumberTokenFilter::from_config(value)?)
}
JAPANESE_READING_FORM_TOKEN_FILTER_NAME => {
BoxTokenFilter::from(JapaneseReadingFormTokenFilter::from_config(value)?)
}
JAPANESE_STOP_TAGS_TOKEN_FILTER_NAME => {
BoxTokenFilter::from(JapaneseStopTagsTokenFilter::from_config(value)?)
}
KEEP_WORDS_TOKEN_FILTER_NAME => {
BoxTokenFilter::from(KeepWordsTokenFilter::from_config(value)?)
}
KOREAN_KEEP_TAGS_TOKEN_FILTER_NAME => {
BoxTokenFilter::from(KoreanKeepTagsTokenFilter::from_config(value)?)
}
KOREAN_READING_FORM_TOKEN_FILTER_NAME => {
BoxTokenFilter::from(KoreanReadingFormTokenFilter::from_config(value)?)
}
KOREAN_STOP_TAGS_TOKEN_FILTER_NAME => {
BoxTokenFilter::from(KoreanStopTagsTokenFilter::from_config(value)?)
}
LENGTH_TOKEN_FILTER_NAME => {
BoxTokenFilter::from(LengthTokenFilter::from_config(value)?)
}
LOWERCASE_TOKEN_FILTER_NAME => {
BoxTokenFilter::from(LowercaseTokenFilter::from_config(value)?)
}
MAPPING_TOKEN_FILTER_NAME => {
BoxTokenFilter::from(MappingTokenFilter::from_config(value)?)
}
REMOVE_DIACRITICAL_TOKEN_FILTER_NAME => {
BoxTokenFilter::from(RemoveDiacriticalMarkTokenFilter::from_config(value)?)
}
STOP_WORDS_TOKEN_FILTER_NAME => {
BoxTokenFilter::from(StopWordsTokenFilter::from_config(value)?)
}
UPPERCASE_TOKEN_FILTER_NAME => {
BoxTokenFilter::from(UppercaseTokenFilter::from_config(value)?)
}
_ => {
return Err(LinderaErrorKind::Deserialize
.with_error(anyhow::anyhow!("unsupported token filter: {kind}")));
}
};
Ok(token_filter)
}
pub fn load_from_cli_flag(cli_flag: &str) -> LinderaResult<BoxTokenFilter> {
let (kind, args) = parse_cli_flag(cli_flag)?;
let character_filter = Self::load_from_value(kind, &args)?;
Ok(character_filter)
}
}