lindera-filter 0.32.3

Character and token filters for Lindera.
Documentation
pub mod japanese_base_form;
pub mod japanese_compound_word;
pub mod japanese_kana;
pub mod japanese_katakana_stem;
pub mod japanese_keep_tags;
pub mod japanese_number;
pub mod japanese_reading_form;
pub mod japanese_stop_tags;
pub mod keep_words;
pub mod korean_keep_tags;
pub mod korean_reading_form;
pub mod korean_stop_tags;
pub mod length;
pub mod lowercase;
pub mod mapping;
pub mod stop_words;
pub mod uppercase;

use serde_json::Value;
use std::ops::Deref;

use lindera_core::error::LinderaErrorKind;
use lindera_core::LinderaResult;

use crate::parse_cli_flag;
use crate::token::Token;
use crate::token_filter::japanese_base_form::{
    JapaneseBaseFormTokenFilter, JapaneseBaseFormTokenFilterConfig,
    JAPANESE_BASE_FORM_TOKEN_FILTER_NAME,
};
use crate::token_filter::japanese_compound_word::{
    JapaneseCompoundWordTokenFilter, JapaneseCompoundWordTokenFilterConfig,
    JAPANESE_COMPOUND_WORD_TOKEN_FILTER_NAME,
};
use crate::token_filter::japanese_kana::{
    JapaneseKanaTokenFilter, JapaneseKanaTokenFilterConfig, JAPANESE_KANA_TOKEN_FILTER_NAME,
};
use crate::token_filter::japanese_katakana_stem::{
    JapaneseKatakanaStemTokenFilter, JapaneseKatakanaStemTokenFilterConfig,
    JAPANESE_KATAKANA_STEM_TOKEN_FILTER_NAME,
};
use crate::token_filter::japanese_keep_tags::{
    JapaneseKeepTagsTokenFilter, JapaneseKeepTagsTokenFilterConfig,
    JAPANESE_KEEP_TAGS_TOKEN_FILTER_NAME,
};
use crate::token_filter::japanese_number::{
    JapaneseNumberTokenFilter, JapaneseNumberTokenFilterConfig, JAPANESE_NUMBER_TOKEN_FILTER_NAME,
};
use crate::token_filter::japanese_reading_form::{
    JapaneseReadingFormTokenFilter, JapaneseReadingFormTokenFilterConfig,
    JAPANESE_READING_FORM_TOKEN_FILTER_NAME,
};
use crate::token_filter::japanese_stop_tags::{
    JapaneseStopTagsTokenFilter, JapaneseStopTagsTokenFilterConfig,
    JAPANESE_STOP_TAGS_TOKEN_FILTER_NAME,
};
use crate::token_filter::keep_words::{
    KeepWordsTokenFilter, KeepWordsTokenFilterConfig, KEEP_WORDS_TOKEN_FILTER_NAME,
};
use crate::token_filter::korean_keep_tags::{
    KoreanKeepTagsTokenFilter, KoreanKeepTagsTokenFilterConfig, KOREAN_KEEP_TAGS_TOKEN_FILTER_NAME,
};
use crate::token_filter::korean_reading_form::{
    KoreanReadingFormTokenFilter, KOREAN_READING_FORM_TOKEN_FILTER_NAME,
};
use crate::token_filter::korean_stop_tags::{
    KoreanStopTagsTokenFilter, KoreanStopTagsTokenFilterConfig, KOREAN_STOP_TAGS_TOKEN_FILTER_NAME,
};
use crate::token_filter::length::{
    LengthTokenFilter, LengthTokenFilterConfig, LENGTH_TOKEN_FILTER_NAME,
};
use crate::token_filter::lowercase::{LowercaseTokenFilter, LOWERCASE_TOKEN_FILTER_NAME};
use crate::token_filter::mapping::{
    MappingTokenFilter, MappingTokenFilterConfig, MAPPING_TOKEN_FILTER_NAME,
};
use crate::token_filter::stop_words::{
    StopWordsTokenFilter, StopWordsTokenFilterConfig, STOP_WORDS_TOKEN_FILTER_NAME,
};
use crate::token_filter::uppercase::{UppercaseTokenFilter, UPPERCASE_TOKEN_FILTER_NAME};

/// Common interface for token filters.
///
/// Implementors must be `'static + Send + Sync` and must also satisfy
/// [`TokenFilterClone`], which lets a filter be duplicated into a fresh
/// [`BoxTokenFilter`] even when only a trait object is available.
pub trait TokenFilter: 'static + Send + Sync + TokenFilterClone {
    /// Returns the name of this filter.
    // NOTE(review): presumably the same identifier as the matching
    // `*_TOKEN_FILTER_NAME` constant used by `TokenFilterLoader` — confirm
    // against the individual filter implementations.
    fn name(&self) -> &str;

    /// Applies the filter to `tokens`.
    ///
    /// The vector is passed by mutable reference, so implementations are free
    /// to modify it; failures are reported through the returned
    /// [`LinderaResult`].
    fn apply(&self, tokens: &mut Vec<Token>) -> LinderaResult<()>;
}

/// Owned, type-erased token filter: a newtype around a boxed
/// `dyn TokenFilter` trait object.
pub struct BoxTokenFilter(Box<dyn TokenFilter + 'static + Send + Sync>);

impl Deref for BoxTokenFilter {
    type Target = dyn TokenFilter;

    fn deref(&self) -> &dyn TokenFilter {
        &*self.0
    }
}

/// Wraps any concrete [`TokenFilter`] in a [`BoxTokenFilter`].
impl<T: TokenFilter> From<T> for BoxTokenFilter {
    /// Moves `token_filter` onto the heap and erases its concrete type.
    fn from(token_filter: T) -> Self {
        let boxed = Box::new(token_filter);
        Self(boxed)
    }
}

/// Helper trait that allows a token filter to be cloned into a new
/// [`BoxTokenFilter`].
///
/// It is a supertrait of [`TokenFilter`], so the method is callable through a
/// `dyn TokenFilter` trait object.
pub trait TokenFilterClone {
    /// Returns a copy of this filter wrapped in a [`BoxTokenFilter`].
    fn box_clone(&self) -> BoxTokenFilter;
}

/// Blanket implementation: every [`TokenFilter`] that is also `Clone`
/// automatically provides [`TokenFilterClone`].
impl<T: TokenFilter + Clone + 'static> TokenFilterClone for T {
    /// Clones the concrete filter and boxes the copy.
    fn box_clone(&self) -> BoxTokenFilter {
        let duplicate: T = self.clone();
        duplicate.into()
    }
}

/// Field-less type whose associated functions build a [`BoxTokenFilter`]
/// from a filter name plus a JSON configuration value, or from a CLI flag.
pub struct TokenFilterLoader {}

impl TokenFilterLoader {
    /// Builds the token filter identified by `kind`, configured from `value`.
    ///
    /// `kind` is matched against each filter's `*_TOKEN_FILTER_NAME` constant.
    /// For filters that take a configuration, `value` is parsed with that
    /// filter's `…Config::from_value`. The Korean reading form, lowercase and
    /// uppercase filters take no configuration, so `value` is ignored for them.
    ///
    /// # Errors
    ///
    /// * Propagates any error returned while parsing `value` into a config.
    /// * Propagates the error returned by `MappingTokenFilter::new`.
    /// * Returns a `LinderaErrorKind::Deserialize` error when `kind` matches
    ///   none of the known filter names.
    pub fn load_from_value(kind: &str, value: &Value) -> LinderaResult<BoxTokenFilter> {
        Ok(match kind {
            // --- Japanese filters ---
            JAPANESE_BASE_FORM_TOKEN_FILTER_NAME => BoxTokenFilter::from(
                JapaneseBaseFormTokenFilter::new(JapaneseBaseFormTokenFilterConfig::from_value(
                    value,
                )?),
            ),
            JAPANESE_COMPOUND_WORD_TOKEN_FILTER_NAME => {
                BoxTokenFilter::from(JapaneseCompoundWordTokenFilter::new(
                    JapaneseCompoundWordTokenFilterConfig::from_value(value)?,
                ))
            }
            JAPANESE_KANA_TOKEN_FILTER_NAME => BoxTokenFilter::from(JapaneseKanaTokenFilter::new(
                JapaneseKanaTokenFilterConfig::from_value(value)?,
            )),
            JAPANESE_KATAKANA_STEM_TOKEN_FILTER_NAME => {
                BoxTokenFilter::from(JapaneseKatakanaStemTokenFilter::new(
                    JapaneseKatakanaStemTokenFilterConfig::from_value(value)?,
                ))
            }
            JAPANESE_KEEP_TAGS_TOKEN_FILTER_NAME => BoxTokenFilter::from(
                JapaneseKeepTagsTokenFilter::new(JapaneseKeepTagsTokenFilterConfig::from_value(
                    value,
                )?),
            ),
            JAPANESE_NUMBER_TOKEN_FILTER_NAME => BoxTokenFilter::from(
                JapaneseNumberTokenFilter::new(JapaneseNumberTokenFilterConfig::from_value(value)?),
            ),
            JAPANESE_READING_FORM_TOKEN_FILTER_NAME => {
                BoxTokenFilter::from(JapaneseReadingFormTokenFilter::new(
                    JapaneseReadingFormTokenFilterConfig::from_value(value)?,
                ))
            }
            JAPANESE_STOP_TAGS_TOKEN_FILTER_NAME => BoxTokenFilter::from(
                JapaneseStopTagsTokenFilter::new(JapaneseStopTagsTokenFilterConfig::from_value(
                    value,
                )?),
            ),

            // --- Korean filters ---
            KOREAN_KEEP_TAGS_TOKEN_FILTER_NAME => BoxTokenFilter::from(
                KoreanKeepTagsTokenFilter::new(KoreanKeepTagsTokenFilterConfig::from_value(value)?),
            ),
            // No configuration: `value` is not consulted.
            KOREAN_READING_FORM_TOKEN_FILTER_NAME => {
                BoxTokenFilter::from(KoreanReadingFormTokenFilter::new())
            }
            KOREAN_STOP_TAGS_TOKEN_FILTER_NAME => BoxTokenFilter::from(
                KoreanStopTagsTokenFilter::new(KoreanStopTagsTokenFilterConfig::from_value(value)?),
            ),

            // --- Language-independent filters ---
            KEEP_WORDS_TOKEN_FILTER_NAME => BoxTokenFilter::from(KeepWordsTokenFilter::new(
                KeepWordsTokenFilterConfig::from_value(value)?,
            )),
            LENGTH_TOKEN_FILTER_NAME => BoxTokenFilter::from(LengthTokenFilter::new(
                LengthTokenFilterConfig::from_value(value)?,
            )),
            // No configuration: `value` is not consulted.
            LOWERCASE_TOKEN_FILTER_NAME => BoxTokenFilter::from(LowercaseTokenFilter::new()),
            // Unlike the other constructors, this one is itself fallible.
            MAPPING_TOKEN_FILTER_NAME => BoxTokenFilter::from(MappingTokenFilter::new(
                MappingTokenFilterConfig::from_value(value)?,
            )?),
            STOP_WORDS_TOKEN_FILTER_NAME => BoxTokenFilter::from(StopWordsTokenFilter::new(
                StopWordsTokenFilterConfig::from_value(value)?,
            )),
            // No configuration: `value` is not consulted.
            UPPERCASE_TOKEN_FILTER_NAME => BoxTokenFilter::from(UppercaseTokenFilter::new()),

            // Anything else is not a filter this loader knows about.
            unsupported => {
                return Err(LinderaErrorKind::Deserialize
                    .with_error(anyhow::anyhow!("unsupported token filter: {}", unsupported)));
            }
        })
    }

    /// Builds a token filter from a single command-line flag.
    ///
    /// The flag is split by `parse_cli_flag` into a filter kind and its
    /// arguments, which are then handed to [`Self::load_from_value`].
    ///
    /// # Errors
    ///
    /// Propagates any error from `parse_cli_flag` or from
    /// [`Self::load_from_value`].
    pub fn load_from_cli_flag(cli_flag: &str) -> LinderaResult<BoxTokenFilter> {
        let (kind, args) = parse_cli_flag(cli_flag)?;

        Self::load_from_value(kind, &args)
    }
}