use std::fmt::Debug;
use std::fmt::Formatter;
use std::sync::Arc;
use crate::analysis::analyzer::analyzer::Analyzer;
use crate::analysis::analyzer::pipeline::PipelineAnalyzer;
use crate::analysis::char_filter::japanese_iteration_mark::JapaneseIterationMarkCharFilter;
use crate::analysis::char_filter::unicode_normalize::NormalizationForm;
use crate::analysis::char_filter::unicode_normalize::UnicodeNormalizationCharFilter;
use crate::analysis::token::TokenStream;
use crate::analysis::token_filter::lowercase::LowercaseFilter;
use crate::analysis::token_filter::stop::{DEFAULT_JAPANESE_STOP_WORDS_SET, StopFilter};
use crate::analysis::tokenizer::lindera::LinderaTokenizer;
use crate::error::Result;
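
/// Analyzer for Japanese text, backed by a [`PipelineAnalyzer`].
///
/// The pipeline applies, in order:
/// 1. NFKC Unicode normalization
/// 2. Japanese iteration mark expansion
/// 3. Morphological tokenization via Lindera (embedded IPADIC dictionary)
/// 4. Lowercasing
/// 5. Japanese stop word removal
///
/// # Example
///
/// A minimal sketch of the call shape, mirroring the tests below; marked
/// `ignore` because the crate's public import path is not shown in this file.
///
/// ```ignore
/// let analyzer = JapaneseAnalyzer::new().unwrap();
/// let tokens: Vec<Token> = analyzer.analyze("日本語の形態素解析").unwrap().collect();
/// ```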
pub struct JapaneseAnalyzer {
    inner: PipelineAnalyzer,
}

impl JapaneseAnalyzer {
    /// Builds the default Japanese analysis pipeline.
    pub fn new() -> Result<Self> {
        // Lindera tokenizer in "normal" mode with the embedded IPADIC dictionary.
        let tokenizer = Arc::new(LinderaTokenizer::new("normal", "embedded://ipadic", None)?);
        let analyzer = PipelineAnalyzer::new(tokenizer)
            // Fold compatibility variants (e.g. full-width ASCII) before tokenization.
            .add_char_filter(Arc::new(UnicodeNormalizationCharFilter::new(
                NormalizationForm::NFKC,
            )))
            // Expand iteration marks such as 々 and ゝ.
            .add_char_filter(Arc::new(JapaneseIterationMarkCharFilter::new(true, true)))
            .add_filter(Arc::new(LowercaseFilter::new()))
            .add_filter(Arc::new(StopFilter::with_stop_words(
                DEFAULT_JAPANESE_STOP_WORDS_SET.clone(),
            )))
            .with_name("japanese".to_string());
        Ok(Self { inner: analyzer })
    }
}

impl Default for JapaneseAnalyzer {
    fn default() -> Self {
        Self::new().expect("Japanese analyzer should be creatable with default settings")
    }
}

impl Analyzer for JapaneseAnalyzer {
    fn analyze(&self, text: &str) -> Result<TokenStream> {
        self.inner.analyze(text)
    }

    fn name(&self) -> &'static str {
        "japanese"
    }

    fn as_any(&self) -> &dyn std::any::Any {
        self
    }
}

impl Debug for JapaneseAnalyzer {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("JapaneseAnalyzer")
            .field("inner", &self.inner)
            .finish()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::analysis::token::Token;

    #[test]
    fn test_japanese_analyzer() {
        let analyzer = JapaneseAnalyzer::new().unwrap();
        let text = "日本語の形態素解析を行うことができます。";
        let tokens: Vec<Token> = analyzer.analyze(text).unwrap().collect();
        assert_eq!(tokens.len(), 5);
        assert_eq!(tokens[0].text, "日本語");
        assert_eq!(tokens[1].text, "形態素");
        assert_eq!(tokens[2].text, "解析");
        assert_eq!(tokens[3].text, "行う");
        assert_eq!(tokens[4].text, "。");
    }

    #[test]
    fn test_japanese_analyzer_name() {
        let analyzer = JapaneseAnalyzer::new().unwrap();
        assert_eq!(analyzer.name(), "japanese");
    }
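
    // A minimal additional check: `Default` delegates to `new()`, so
    // construction via `default()` should yield the same named analyzer.
    #[test]
    fn test_japanese_analyzer_default() {
        let analyzer = JapaneseAnalyzer::default();
        assert_eq!(analyzer.name(), "japanese");
    }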
}