use std::mem;
use tantivy::tokenizer::{
AsciiFoldingFilter, BoxTokenStream, LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer, Token,
TokenFilter, TokenStream,
};
use unicode_normalization::UnicodeNormalization;
/// Name associated with the analyzer built by [`simple_normalized_tokenizer`].
// NOTE(review): presumably the key under which the analyzer is registered with
// the index's tokenizer manager — confirm at the registration site.
pub(super) const TOKENIZER_NAME: &str = "simple_normalized";
/// Builds the text analyzer: [`SimpleTokenizer`] followed by a chain of filters.
///
/// Filter order:
/// 1. [`RemoveLongFilter`] with a limit of 40 — applied to the raw token, before
///    any normalization, so oversized tokens are discarded without further work.
/// 2. `UnicodeNormalizationFilter` — rewrites non-ASCII tokens to NFKC.
/// 3. [`LowerCaser`] — lowercases the normalized text.
/// 4. [`AsciiFoldingFilter`] — folds remaining non-ASCII characters to ASCII
///    equivalents where they exist.
///
/// Lowercasing deliberately runs *after* NFKC normalization. Some compatibility
/// characters have no lowercase mapping of their own but normalize to an
/// uppercase ASCII letter (e.g. U+210C normalizes to `H`). Lowercasing first
/// would leave such tokens uppercase, so they would never match the same word
/// written with plain letters.
pub(super) fn simple_normalized_tokenizer() -> TextAnalyzer {
    TextAnalyzer::from(SimpleTokenizer)
        .filter(RemoveLongFilter::limit(40))
        // Normalize first so that letters exposed by compatibility
        // decomposition are still seen by the lowercasing step.
        .filter(UnicodeNormalizationFilter)
        .filter(LowerCaser)
        .filter(AsciiFoldingFilter)
}
/// Token filter that wraps a token stream so each non-ASCII token's text is
/// replaced by its NFKC-normalized form.
///
/// Stateless marker type; the per-stream state lives in
/// `UnicodeNormalizationTokenStream`.
#[derive(Clone)]
struct UnicodeNormalizationFilter;
impl TokenFilter for UnicodeNormalizationFilter {
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
From::from(UnicodeNormalizationTokenStream {
tail: token_stream,
buffer: String::with_capacity(100),
})
}
}
/// Token stream adapter that NFKC-normalizes the text of each token coming
/// from the wrapped stream.
struct UnicodeNormalizationTokenStream<'a> {
// Scratch space for the normalized text. After a token is normalized it is
// swapped with the token's text, so it then holds the previous (raw) text
// until it is cleared for the next normalization.
buffer: String,
// The wrapped upstream token stream; tokens are modified in place on it.
tail: BoxTokenStream<'a>,
}
impl<'a> TokenStream for UnicodeNormalizationTokenStream<'a> {
    /// Advances the wrapped stream and, if a token is available, replaces its
    /// text with the NFKC-normalized form.
    ///
    /// Returns `false` once the wrapped stream is exhausted.
    fn advance(&mut self) -> bool {
        let has_token = self.tail.advance();
        if has_token {
            let token = self.tail.token_mut();
            // Pure-ASCII text is left untouched; only other text is normalized.
            if !token.text.is_ascii() {
                normalize(&token.text, &mut self.buffer);
                // Swap rather than copy: the token takes the normalized string
                // and the scratch buffer keeps the old allocation for reuse.
                mem::swap(&mut token.text, &mut self.buffer);
            }
        }
        has_token
    }

    /// Returns the current token of the wrapped stream.
    fn token(&self) -> &Token {
        self.tail.token()
    }

    /// Returns the current token of the wrapped stream, mutably.
    fn token_mut(&mut self) -> &mut Token {
        self.tail.token_mut()
    }
}
/// Writes the NFKC-normalized form of `text` into `output`.
///
/// Any previous contents of `output` are discarded; its allocation is kept so
/// the same buffer can be reused across calls.
fn normalize(text: &str, output: &mut String) {
    output.clear();
    let normalized_chars = text.nfkc();
    output.extend(normalized_chars);
}