use std::sync::Arc;
use crate::analysis::analyzer::analyzer::Analyzer;
use crate::analysis::char_filter::CharFilter;
use crate::analysis::token::TokenStream;
use crate::analysis::token_filter::Filter;
use crate::analysis::tokenizer::Tokenizer;
use crate::error::Result;
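/// An [`Analyzer`] that runs text through a three-stage pipeline:
/// char filters rewrite the raw text, a tokenizer splits it into
/// tokens, and token filters transform the resulting stream. When
/// char filters change the text, token offsets are mapped back to
/// byte positions in the original input.
///
/// A minimal usage sketch, mirroring the tests at the bottom of this
/// file (the concrete tokenizer and filter types are taken from this
/// crate's analysis module):
///
/// ```ignore
/// use std::sync::Arc;
///
/// let analyzer = PipelineAnalyzer::new(Arc::new(WhitespaceTokenizer::new()))
///     .add_filter(Arc::new(LowercaseFilter::new()));
/// let tokens: Vec<Token> = analyzer.analyze("Hello World").unwrap().collect();
/// assert_eq!(tokens[0].text, "hello");
/// ```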
#[derive(Clone)]
pub struct PipelineAnalyzer {
tokenizer: Arc<dyn Tokenizer>,
char_filters: Vec<Arc<dyn CharFilter>>,
filters: Vec<Arc<dyn Filter>>,
name: String,
}
impl PipelineAnalyzer {
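    /// Creates a pipeline with the given tokenizer and no filters.
    /// The analyzer's name defaults to `pipeline_<tokenizer name>`.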
pub fn new(tokenizer: Arc<dyn Tokenizer>) -> Self {
PipelineAnalyzer {
name: format!("pipeline_{}", tokenizer.name()),
tokenizer,
char_filters: Vec::new(),
filters: Vec::new(),
}
}
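    /// Appends a char filter. Char filters run before tokenization,
    /// in the order they were added.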
pub fn add_char_filter(mut self, char_filter: Arc<dyn CharFilter>) -> Self {
self.char_filters.push(char_filter);
self
}
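    /// Appends a token filter. Token filters run after tokenization,
    /// in the order they were added.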
pub fn add_filter(mut self, filter: Arc<dyn Filter>) -> Self {
self.filters.push(filter);
self
}
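    /// Overrides the default `pipeline_<tokenizer name>` name.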
pub fn with_name<S: Into<String>>(mut self, name: S) -> Self {
self.name = name.into();
self
}
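    /// Returns the tokenizer used by this pipeline.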
pub fn tokenizer(&self) -> &Arc<dyn Tokenizer> {
&self.tokenizer
}
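    /// Returns the char filters in application order.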
pub fn char_filters(&self) -> &[Arc<dyn CharFilter>] {
&self.char_filters
}
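    /// Returns the token filters in application order.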
pub fn filters(&self) -> &[Arc<dyn Filter>] {
&self.filters
}
}
impl Analyzer for PipelineAnalyzer {
fn analyze(&self, text: &str) -> Result<TokenStream> {
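        // Phase 1: run the text through each char filter in order,
        // keeping the transformations each filter reports so token
        // offsets can be mapped back to the original input afterwards.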
let mut filtered_text = text.to_string();
let mut filter_transformations = Vec::with_capacity(self.char_filters.len());
for char_filter in &self.char_filters {
let (new_text, transformations) = char_filter.filter(&filtered_text);
filtered_text = new_text;
filter_transformations.push(transformations);
}
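        // Phase 2: tokenize the fully filtered text.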
let mut tokens = self.tokenizer.tokenize(&filtered_text)?;
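        // Phase 3: run the token stream through each token filter.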
for filter in &self.filters {
tokens = filter.filter(tokens)?;
}
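        // If char filters rewrote the text, token offsets refer to the
        // filtered text; walk the transformation lists in reverse (last
        // filter first) to map them back to the original input.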
if !self.char_filters.is_empty() {
let collected: Vec<_> = tokens
.map(|mut token| {
for transformations in filter_transformations.iter().rev() {
token.start_offset =
Self::correct_offset(token.start_offset, transformations);
token.end_offset = Self::correct_offset(token.end_offset, transformations);
}
token
})
.collect();
return Ok(Box::new(collected.into_iter()));
}
Ok(tokens)
}
fn name(&self) -> &str {
&self.name
}
fn as_any(&self) -> &dyn std::any::Any {
self
}
}
impl PipelineAnalyzer {
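    /// Maps a byte offset in a char filter's output text back to the
    /// corresponding offset in that filter's input text, using the
    /// transformations the filter reported. Offsets past a replaced
    /// span are shifted by the replacement's length change; offsets
    /// inside a replaced span are interpolated proportionally.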
fn correct_offset(
offset: usize,
transformations: &[crate::analysis::char_filter::Transformation],
) -> usize {
let mut corrected = offset;
for t in transformations {
if offset >= t.new_end {
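                // The offset lies entirely after this replacement:
                // undo the replacement's length change.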
let original_len = t.original_end - t.original_start;
let new_len = t.new_end - t.new_start;
corrected = corrected
.saturating_sub(new_len)
.saturating_add(original_len);
} else if offset >= t.new_start {
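                // The offset lies inside this replacement: map it
                // proportionally into the original span.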
let offset_in_new = offset - t.new_start;
let new_len = t.new_end - t.new_start;
let original_len = t.original_end - t.original_start;
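                // Defensive: the branch above already catches
                // zero-length replacements, but avoid dividing by zero.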
if new_len == 0 {
return t.original_start;
}
let offset_in_original = (offset_in_new * original_len) / new_len;
return t.original_start + offset_in_original;
}
}
corrected
}
}
impl std::fmt::Debug for PipelineAnalyzer {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("PipelineAnalyzer")
.field("name", &self.name)
.field("tokenizer", &self.tokenizer.name())
            .field(
                "char_filters",
                &self
                    .char_filters
                    .iter()
                    .map(|cf| cf.name())
                    .collect::<Vec<_>>(),
            )
            .field(
                "filters",
                &self.filters.iter().map(|tf| tf.name()).collect::<Vec<_>>(),
            )
.finish()
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::analysis::char_filter::pattern_replace::PatternReplaceCharFilter;
use crate::analysis::char_filter::unicode_normalize::{
NormalizationForm, UnicodeNormalizationCharFilter,
};
use crate::analysis::token::Token;
use crate::analysis::token_filter::lowercase::LowercaseFilter;
use crate::analysis::token_filter::stop::StopFilter;
use crate::analysis::tokenizer::regex::RegexTokenizer;
use crate::analysis::tokenizer::whitespace::WhitespaceTokenizer;
#[test]
fn test_pipeline_analyzer() {
let tokenizer = Arc::new(RegexTokenizer::new().unwrap());
let analyzer = PipelineAnalyzer::new(tokenizer)
.add_filter(Arc::new(LowercaseFilter::new()))
.add_filter(Arc::new(StopFilter::from_words(vec!["the", "and"])));
let tokens: Vec<Token> = analyzer
.analyze("Hello THE world AND test")
.unwrap()
.collect();
assert_eq!(tokens.len(), 3);
assert_eq!(tokens[0].text, "hello");
assert_eq!(tokens[1].text, "world");
assert_eq!(tokens[2].text, "test");
}
#[test]
fn test_pipeline_with_char_filter() {
let tokenizer = Arc::new(WhitespaceTokenizer::new());
let analyzer = PipelineAnalyzer::new(tokenizer)
.add_char_filter(Arc::new(UnicodeNormalizationCharFilter::new(
NormalizationForm::NFKC,
)))
.add_filter(Arc::new(LowercaseFilter::new()));
let tokens: Vec<Token> = analyzer.analyze("\u{ff21}BC DEF").unwrap().collect();
assert_eq!(tokens.len(), 2);
assert_eq!(tokens[0].text, "abc");
assert_eq!(tokens[1].text, "def");
}
#[test]
fn test_pipeline_with_pattern_replace() {
let tokenizer = Arc::new(WhitespaceTokenizer::new());
let analyzer = PipelineAnalyzer::new(tokenizer)
.add_char_filter(Arc::new(PatternReplaceCharFilter::new(r"-", "").unwrap()));
let tokens: Vec<Token> = analyzer.analyze("123-456 789").unwrap().collect();
assert_eq!(tokens.len(), 2);
assert_eq!(tokens[0].text, "123456");
assert_eq!(tokens[1].text, "789");
}
#[test]
fn test_offset_correction_normalization() {
let tokenizer = Arc::new(WhitespaceTokenizer::new());
let analyzer = PipelineAnalyzer::new(tokenizer).add_char_filter(Arc::new(
UnicodeNormalizationCharFilter::new(NormalizationForm::NFKC),
));
let tokens: Vec<Token> = analyzer.analyze("㌂").unwrap().collect();
assert_eq!(tokens.len(), 1);
assert_eq!(tokens[0].text, "アンペア");
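        // Offsets are byte positions in the original text: "㌂" is a
        // single 3-byte character.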
assert_eq!(tokens[0].start_offset, 0);
assert_eq!(tokens[0].end_offset, 3);
}
#[test]
fn test_offset_correction_pattern_replace() {
let tokenizer = Arc::new(WhitespaceTokenizer::new());
let analyzer = PipelineAnalyzer::new(tokenizer)
.add_char_filter(Arc::new(PatternReplaceCharFilter::new(r"-", "").unwrap()));
let tokens: Vec<Token> = analyzer.analyze("foo-bar").unwrap().collect();
assert_eq!(tokens.len(), 1);
assert_eq!(tokens[0].text, "foobar");
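        // Offsets refer to the original "foo-bar", which is 7 bytes long.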
assert_eq!(tokens[0].start_offset, 0);
assert_eq!(tokens[0].end_offset, 7);
}
}