use std::fmt::Debug;
use std::fmt::Formatter;
use std::sync::Arc;
use crate::analysis::analyzer::analyzer::Analyzer;
use crate::analysis::analyzer::pipeline::PipelineAnalyzer;
use crate::analysis::char_filter::japanese_iteration_mark::JapaneseIterationMarkCharFilter;
use crate::analysis::char_filter::unicode_normalize::NormalizationForm;
use crate::analysis::char_filter::unicode_normalize::UnicodeNormalizationCharFilter;
use crate::analysis::token::TokenStream;
use crate::analysis::token_filter::lowercase::LowercaseFilter;
use crate::analysis::token_filter::stop::{DEFAULT_JAPANESE_STOP_WORDS_SET, StopFilter};
use crate::analysis::tokenizer::lindera::LinderaTokenizer;
use crate::error::Result;
/// Analyzer for Japanese text built on a Lindera morphological tokenizer.
///
/// The analysis pipeline applies, in order: NFKC Unicode normalization,
/// Japanese iteration-mark expansion, lowercasing, and Japanese stop-word
/// removal (see `build_pipeline`).
pub struct JapaneseAnalyzer {
    // Pre-assembled pipeline; all `analyze` calls delegate to it.
    inner: PipelineAnalyzer,
}
impl JapaneseAnalyzer {
    /// Builds a Japanese analyzer whose dictionary is resolved from `dict_uri`.
    ///
    /// # Errors
    /// Returns an error when the underlying Lindera tokenizer cannot be
    /// constructed (e.g. an unknown mode string or unresolvable dictionary URI).
    pub fn new(mode_str: &str, dict_uri: &str, user_dict_uri: Option<&str>) -> Result<Self> {
        let tokenizer = LinderaTokenizer::new(mode_str, dict_uri, user_dict_uri)?;
        let inner = Self::build_pipeline(Arc::new(tokenizer));
        Ok(Self { inner })
    }

    /// Builds a Japanese analyzer from in-memory dictionary components.
    ///
    /// Each byte slice corresponds to one artifact of a compiled Lindera
    /// dictionary (double-array trie, values, word index, words, connection
    /// matrix, character definitions, unknown-word handling).
    ///
    /// # Errors
    /// Returns an error when the tokenizer cannot be assembled from the
    /// provided byte slices.
    // The argument count mirrors the dictionary's on-disk layout one-to-one.
    #[allow(clippy::too_many_arguments)]
    pub fn from_bytes(
        mode_str: &str,
        metadata: &[u8],
        dict_da: &[u8],
        dict_vals: &[u8],
        dict_words_idx: &[u8],
        dict_words: &[u8],
        matrix_mtx: &[u8],
        char_def: &[u8],
        unk: &[u8],
    ) -> Result<Self> {
        let tokenizer = LinderaTokenizer::from_bytes(
            mode_str,
            metadata,
            dict_da,
            dict_vals,
            dict_words_idx,
            dict_words,
            matrix_mtx,
            char_def,
            unk,
        )?;
        let inner = Self::build_pipeline(Arc::new(tokenizer));
        Ok(Self { inner })
    }

    /// Assembles the analysis pipeline shared by both constructors.
    fn build_pipeline(tokenizer: Arc<LinderaTokenizer>) -> PipelineAnalyzer {
        // NFKC normalization runs first so full-width/half-width variants are
        // unified before tokenization.
        let normalizer = Arc::new(UnicodeNormalizationCharFilter::new(NormalizationForm::NFKC));
        // Expand iteration marks (e.g. 々) prior to tokenization.
        // NOTE(review): the two `true` flags presumably enable both kanji and
        // kana expansion — confirm against the char filter's constructor docs.
        let iteration_marks = Arc::new(JapaneseIterationMarkCharFilter::new(true, true));
        let lowercase = Arc::new(LowercaseFilter::new());
        let stop_words = Arc::new(StopFilter::with_stop_words(
            DEFAULT_JAPANESE_STOP_WORDS_SET.clone(),
        ));

        let mut pipeline = PipelineAnalyzer::new(tokenizer);
        pipeline = pipeline.add_char_filter(normalizer);
        pipeline = pipeline.add_char_filter(iteration_marks);
        pipeline = pipeline.add_filter(lowercase);
        pipeline = pipeline.add_filter(stop_words);
        pipeline.with_name("japanese".to_string())
    }
}
impl Analyzer for JapaneseAnalyzer {
    /// Runs the full pipeline (char filters, tokenizer, token filters) over `text`.
    fn analyze(&self, text: &str) -> Result<TokenStream> {
        let Self { inner } = self;
        inner.analyze(text)
    }

    /// Stable identifier for this analyzer; matches the pipeline's configured name.
    fn name(&self) -> &'static str {
        "japanese"
    }

    /// Enables downcasting back to the concrete `JapaneseAnalyzer` type.
    fn as_any(&self) -> &dyn std::any::Any {
        self as &dyn std::any::Any
    }
}
impl Debug for JapaneseAnalyzer {
    // Hand-written rather than derived so the struct can keep `inner` private
    // without requiring a blanket derive; output is identical to the derive.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let mut builder = f.debug_struct("JapaneseAnalyzer");
        builder.field("inner", &self.inner);
        builder.finish()
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::analysis::token::Token;

    /// The analyzer should segment a Japanese sentence into the expected
    /// surface forms, with stop words (の, を, こと, …) removed by the pipeline.
    #[test]
    fn test_japanese_analyzer_segmentation() {
        let analyzer = JapaneseAnalyzer::new("normal", "embedded://ipadic", None).unwrap();
        let input = "日本語の形態素解析を行うことができます。";
        let produced: Vec<Token> = analyzer.analyze(input).unwrap().collect();

        let expected = ["日本語", "形態素", "解析", "行う", "。"];
        assert_eq!(produced.len(), expected.len());
        for (token, want) in produced.iter().zip(expected.iter()) {
            assert_eq!(token.text, *want);
        }
    }

    /// `name()` must report the fixed identifier "japanese".
    #[test]
    fn test_japanese_analyzer_name() {
        let analyzer = JapaneseAnalyzer::new("normal", "embedded://ipadic", None).unwrap();
        assert_eq!(analyzer.name(), "japanese");
    }
}