use smartstring::{LazyCompact, SmartString};
use std::borrow::{Borrow, BorrowMut};
use std::iter;
use std::ops::{Deref, DerefMut};
use rustc_hash::FxHashSet;
use crate::fts::tokenizer::empty_tokenizer::EmptyTokenizer;
/// A single token exposed by a `TokenStream`, together with its location metadata.
///
/// The type is cloneable, comparable and (de)serializable through the derives below.
#[derive(Debug, Clone, serde_derive::Serialize, serde_derive::Deserialize, Eq, PartialEq)]
pub(crate) struct Token {
/// Start offset of the token in the source text.
/// NOTE(review): presumably a byte offset — confirm against the tokenizer implementations.
pub(crate) offset_from: usize,
/// End offset of the token in the source text.
/// NOTE(review): presumably an exclusive byte offset — confirm against the tokenizer implementations.
pub(crate) offset_to: usize,
/// Position of the token; `Token::default()` initialises it to `usize::MAX`.
/// NOTE(review): presumably counted in tokens rather than bytes — confirm.
pub(crate) position: usize,
/// The text of the token.
pub(crate) text: String,
/// `Token::default()` initialises this to `1`.
/// NOTE(review): presumably the number of positions the token spans — confirm.
pub(crate) position_length: usize,
}
impl Default for Token {
fn default() -> Token {
Token {
offset_from: 0,
offset_to: 0,
position: usize::MAX,
text: String::with_capacity(200),
position_length: 1,
}
}
}
/// Pairs a boxed tokenizer with an ordered list of token filters.
///
/// `TextAnalyzer::token_stream` asks the tokenizer for a stream and then passes
/// that stream through each filter in the order they are stored here.
pub(crate) struct TextAnalyzer {
/// The tokenizer that produces the initial token stream.
pub(crate) tokenizer: Box<dyn Tokenizer>,
/// Filters applied to the token stream, first to last.
pub(crate) token_filters: Vec<BoxTokenFilter>,
}
impl Default for TextAnalyzer {
    /// Builds an analyzer around `EmptyTokenizer` with no token filters.
    fn default() -> Self {
        Self::new(EmptyTokenizer, Vec::new())
    }
}
impl<T: Tokenizer> From<T> for TextAnalyzer {
fn from(tokenizer: T) -> Self {
TextAnalyzer::new(tokenizer, Vec::new())
}
}
impl TextAnalyzer {
    /// Creates an analyzer from a tokenizer and the filters to apply to its output.
    ///
    /// The tokenizer is boxed; the filters are stored in the given order, which is
    /// also the order in which `token_stream` applies them.
    pub(crate) fn new<T: Tokenizer>(
        tokenizer: T,
        token_filters: Vec<BoxTokenFilter>,
    ) -> TextAnalyzer {
        TextAnalyzer {
            tokenizer: Box::new(tokenizer),
            token_filters,
        }
    }

    /// Appends one more token filter (builder style) and returns the analyzer.
    #[allow(unused)]
    pub(crate) fn filter<F: Into<BoxTokenFilter>>(mut self, token_filter: F) -> Self {
        self.token_filters.push(token_filter.into());
        self
    }

    /// Tokenizes `text` and passes the resulting stream through every filter, in order.
    pub(crate) fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
        self.token_filters
            .iter()
            .fold(self.tokenizer.token_stream(text), |stream, token_filter| {
                token_filter.transform(stream)
            })
    }

    /// Returns the set of distinct `n`-grams of the tokens produced for `text`.
    ///
    /// * `n == 1`: every distinct token becomes a one-element n-gram.
    /// * `n >= number of tokens` (and `n != 1`): the whole token sequence is returned
    ///   as the single entry of the set. This includes the case where there are no
    ///   tokens at all, which yields a set containing one empty vector.
    /// * `n == 0` with at least one token: no n-grams exist, so the set is empty.
    /// * otherwise: every window of `n` consecutive tokens, deduplicated.
    pub(crate) fn unique_ngrams(&self, text: &str, n: usize) -> FxHashSet<Vec<SmartString<LazyCompact>>> {
        let mut token_stream = self.token_stream(text);
        let mut tokens: Vec<SmartString<LazyCompact>> = Vec::new();
        while let Some(token) = token_stream.next() {
            tokens.push(SmartString::from(token.text.as_str()));
        }

        if n == 1 {
            // The tokens are owned already, so move them instead of cloning each one.
            tokens.into_iter().map(|token| vec![token]).collect()
        } else if n >= tokens.len() {
            iter::once(tokens).collect()
        } else if n == 0 {
            // `tokens` is non-empty here and `slice::windows(0)` panics, so the
            // zero-width case is answered explicitly with "no n-grams".
            FxHashSet::default()
        } else {
            tokens.windows(n).map(|window| window.to_vec()).collect()
        }
    }
}
impl Clone for TextAnalyzer {
fn clone(&self) -> Self {
TextAnalyzer {
tokenizer: self.tokenizer.box_clone(),
token_filters: self
.token_filters
.iter()
.map(|token_filter| token_filter.box_clone())
.collect(),
}
}
}
/// Turns a piece of text into a stream of tokens.
///
/// Implementors must be `'static + Send + Sync` and must also provide
/// `TokenizerClone::box_clone`, which `TextAnalyzer`'s `Clone` impl relies on.
pub(crate) trait Tokenizer: 'static + Send + Sync + TokenizerClone {
/// Creates a token stream over `text`; the stream may borrow from `text` for `'a`.
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>;
}
/// Cloning support for tokenizers held as `Box<dyn Tokenizer>`.
///
/// A blanket impl in this file provides it for every `Tokenizer + Clone` type.
pub(crate) trait TokenizerClone {
/// Returns a boxed copy of this tokenizer.
fn box_clone(&self) -> Box<dyn Tokenizer>;
}
impl<T> TokenizerClone for T
where
    T: Tokenizer + Clone,
{
    /// Clones the concrete tokenizer and boxes the copy as a trait object.
    fn box_clone(&self) -> Box<dyn Tokenizer> {
        let duplicate: T = self.clone();
        Box::new(duplicate)
    }
}
/// Lets a boxed token stream be used wherever a `TokenStream` is expected by
/// forwarding every call to the trait object inside the box.
impl<'a> TokenStream for Box<dyn TokenStream + 'a> {
    fn advance(&mut self) -> bool {
        // Borrow the inner trait object explicitly so the call goes to the
        // boxed stream rather than back into this impl.
        <Self as BorrowMut<dyn TokenStream + 'a>>::borrow_mut(self).advance()
    }

    fn token<'b>(&'b self) -> &'b Token {
        <Self as Borrow<dyn TokenStream + 'a>>::borrow(self).token()
    }

    fn token_mut<'b>(&'b mut self) -> &'b mut Token {
        <Self as BorrowMut<dyn TokenStream + 'a>>::borrow_mut(self).token_mut()
    }
}
/// Owning wrapper around a boxed `TokenStream` trait object that is valid for `'a`.
///
/// It is built from any `TokenStream` via `From` and dereferences to the inner stream.
pub(crate) struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
impl<'a, T: TokenStream + 'a> From<T> for BoxTokenStream<'a> {
    /// Boxes a concrete token stream and wraps it as a `BoxTokenStream`.
    fn from(token_stream: T) -> Self {
        let boxed: Box<dyn TokenStream + 'a> = Box::new(token_stream);
        Self(boxed)
    }
}
/// Gives shared access to the wrapped stream, so `TokenStream` methods can be
/// called directly on a `BoxTokenStream`.
impl<'a> Deref for BoxTokenStream<'a> {
    type Target = dyn TokenStream + 'a;

    fn deref(&self) -> &Self::Target {
        self.0.as_ref()
    }
}
/// Gives mutable access to the wrapped stream, which is what allows calling
/// `advance`, `next` and `process` on a `BoxTokenStream`.
impl<'a> DerefMut for BoxTokenStream<'a> {
    fn deref_mut(&mut self) -> &mut Self::Target {
        self.0.as_mut()
    }
}
/// Owning wrapper around a boxed `TokenFilter` trait object.
///
/// It is built from any `TokenFilter` via `From` and dereferences to the inner filter.
pub(crate) struct BoxTokenFilter(Box<dyn TokenFilter>);
/// Gives shared access to the wrapped filter, so `transform` and `box_clone`
/// can be called directly on a `BoxTokenFilter`.
impl Deref for BoxTokenFilter {
    type Target = dyn TokenFilter;

    fn deref(&self) -> &dyn TokenFilter {
        self.0.as_ref()
    }
}
impl<T> From<T> for BoxTokenFilter
where
    T: TokenFilter,
{
    /// Boxes a concrete token filter and wraps it as a `BoxTokenFilter`.
    fn from(tokenizer: T) -> Self {
        let boxed: Box<dyn TokenFilter> = Box::new(tokenizer);
        Self(boxed)
    }
}
/// A cursor over tokens: `advance` moves to the next token, which is then
/// readable through `token` and writable through `token_mut`.
pub(crate) trait TokenStream {
    /// Moves to the next token; returns `false` once the stream has no more tokens.
    fn advance(&mut self) -> bool;

    /// Shared access to the current token.
    fn token(&self) -> &Token;

    /// Mutable access to the current token.
    fn token_mut(&mut self) -> &mut Token;

    /// Advances the stream and returns the new current token, or `None` when
    /// `advance` reports that the stream is exhausted.
    fn next(&mut self) -> Option<&Token> {
        if !self.advance() {
            return None;
        }
        Some(self.token())
    }

    /// Feeds every remaining token to `sink`, leaving the stream exhausted.
    fn process(&mut self, sink: &mut dyn FnMut(&Token)) {
        loop {
            if !self.advance() {
                break;
            }
            sink(self.token());
        }
    }
}
/// Cloning support for token filters held as trait objects.
///
/// A blanket impl in this file provides it for every `TokenFilter + Clone` type.
pub(crate) trait TokenFilterClone {
/// Returns a boxed copy of this filter.
fn box_clone(&self) -> BoxTokenFilter;
}
/// Wraps a token stream in another token stream.
///
/// Implementors must be `'static + Send + Sync` and must also provide
/// `TokenFilterClone::box_clone`, which `TextAnalyzer`'s `Clone` impl relies on.
pub(crate) trait TokenFilter: 'static + Send + Sync + TokenFilterClone {
/// Consumes `token_stream` and returns the stream that replaces it.
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a>;
}
impl<T> TokenFilterClone for T
where
    T: TokenFilter + Clone,
{
    /// Clones the concrete filter and wraps the copy as a `BoxTokenFilter`.
    fn box_clone(&self) -> BoxTokenFilter {
        let duplicate: T = self.clone();
        duplicate.into()
    }
}
#[cfg(test)]
mod test {
    use super::Token;

    /// Cloning a token must preserve every field, including `position_length`.
    #[test]
    fn clone() {
        let t1 = Token {
            position: 1,
            offset_from: 2,
            offset_to: 3,
            text: "abc".to_string(),
            position_length: 1,
        };
        let t2 = t1.clone();
        assert_eq!(t1.position, t2.position);
        assert_eq!(t1.offset_from, t2.offset_from);
        assert_eq!(t1.offset_to, t2.offset_to);
        assert_eq!(t1.text, t2.text);
        assert_eq!(t1.position_length, t2.position_length);
        // Whole-struct comparison keeps this test honest if fields are added later.
        assert_eq!(t1, t2);
    }
}