use std::io::{self, Read};
pub mod standard;
pub use standard::{LowerCaseFilter, StandardAnalyzer, StandardTokenizer};
/// An owned token: a piece of text plus the positional metadata attached to it.
///
/// This is the element type returned by [`Tokenizer::tokenize`],
/// [`TokenFilter::filter`] and [`Analyzer::analyze`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token {
    /// The token's text, owned by the token.
    pub text: String,
    /// Where the token starts.
    // NOTE(review): presumably an offset into the analyzed input (bytes vs. chars is not
    // established in this file) — confirm against the concrete tokenizers.
    pub start_offset: usize,
    /// Where the token ends.
    // NOTE(review): whether this bound is inclusive or exclusive is not shown here — confirm.
    pub end_offset: usize,
    /// Position increment associated with this token.
    // NOTE(review): presumably the distance in positions from the previous token — confirm.
    pub position_increment: u32,
}
/// Converts raw text into a list of [`Token`]s.
pub trait Tokenizer {
    /// Tokenizes `text` and returns the resulting tokens as owned values.
    fn tokenize(&self, text: &str) -> Vec<Token>;
}
/// Transforms one list of [`Token`]s into another.
pub trait TokenFilter {
    /// Consumes `tokens` and returns the filtered list.
    fn filter(&self, tokens: Vec<Token>) -> Vec<Token>;
}
/// A borrowed view of a token, handed to the callbacks of [`Analyzer::analyze_to`] and
/// [`Analyzer::analyze_reader`].
///
/// Carries the same fields as [`Token`], except that the text is borrowed for `'a`
/// instead of owned.
pub struct TokenRef<'a> {
    /// The token's text, borrowed from whoever produced the token.
    pub text: &'a str,
    /// Where the token starts (same meaning as `Token::start_offset`).
    pub start_offset: usize,
    /// Where the token ends (same meaning as `Token::end_offset`).
    pub end_offset: usize,
    /// Position increment associated with this token (same meaning as
    /// `Token::position_increment`).
    pub position_increment: u32,
}
pub trait Analyzer: Send + Sync {
fn analyze(&self, text: &str) -> Vec<Token>;
fn analyze_to(&self, text: &str, buf: &mut String, callback: &mut dyn FnMut(TokenRef<'_>)) {
let _ = buf; for token in self.analyze(text) {
callback(TokenRef {
text: &token.text,
start_offset: token.start_offset,
end_offset: token.end_offset,
position_increment: token.position_increment,
});
}
}
fn analyze_reader(
&self,
reader: &mut dyn Read,
buf: &mut String,
callback: &mut dyn FnMut(TokenRef<'_>),
) -> io::Result<()> {
buf.clear();
reader.read_to_string(buf)?;
let text = std::mem::take(buf);
self.analyze_to(&text, buf, callback);
*buf = text;
Ok(())
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Minimal analyzer that only implements `analyze`, so the tests below exercise the
    /// trait's default `analyze_to` / `analyze_reader` implementations.
    struct SplitAnalyzer;

    impl Analyzer for SplitAnalyzer {
        fn analyze(&self, text: &str) -> Vec<Token> {
            text.split_whitespace()
                .map(|word| Token {
                    text: word.to_string(),
                    // Offsets are placeholders: these tests only inspect token text.
                    start_offset: 0,
                    end_offset: word.len(),
                    position_increment: 1,
                })
                .collect()
        }
    }

    #[test]
    fn test_default_analyze_to() {
        let analyzer = SplitAnalyzer;
        let mut buf = String::new();
        let mut tokens: Vec<String> = Vec::new();
        analyzer.analyze_to("hello world", &mut buf, &mut |tr| {
            tokens.push(tr.text.to_string());
        });
        assert_eq!(tokens, vec!["hello", "world"]);
    }

    #[test]
    fn test_default_analyze_reader() {
        let analyzer = SplitAnalyzer;
        let mut buf = String::new();
        let mut tokens: Vec<String> = Vec::new();
        let mut cursor = std::io::Cursor::new(b"hello world");
        analyzer
            .analyze_reader(&mut cursor, &mut buf, &mut |tr| {
                tokens.push(tr.text.to_string());
            })
            .unwrap();
        assert_eq!(tokens, vec!["hello", "world"]);
    }

    #[test]
    fn test_default_analyze_reader_empty() {
        let analyzer = SplitAnalyzer;
        let mut buf = String::new();
        let mut tokens: Vec<String> = Vec::new();
        let mut cursor = std::io::Cursor::new(b"");
        analyzer
            .analyze_reader(&mut cursor, &mut buf, &mut |tr| {
                tokens.push(tr.text.to_string());
            })
            .unwrap();
        // Use the standard `assert!`: no `assert_is_empty!` macro is defined or imported
        // in this file, so the previous form did not compile.
        assert!(tokens.is_empty());
    }
}