Skip to main content

lance_tokenizer/
analyzer.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3// SPDX-License-Identifier: MIT
4// Adapted from Tantivy v0.24.2 tokenizer analyzer.
5// Copyright (c) 2017-present Tantivy contributors.
6
7use crate::{BoxTokenStream, TokenFilter, Tokenizer};
8
9#[derive(Clone)]
10pub struct TextAnalyzer {
11    tokenizer: Box<dyn BoxableTokenizer>,
12}
13
14impl<T: Tokenizer + Clone> From<T> for TextAnalyzer {
15    fn from(tokenizer: T) -> Self {
16        Self::builder(tokenizer).build()
17    }
18}
19
20impl Default for TextAnalyzer {
21    fn default() -> Self {
22        Self::from(crate::RawTokenizer::default())
23    }
24}
25
26impl TextAnalyzer {
27    pub fn builder<T: Tokenizer>(tokenizer: T) -> TextAnalyzerBuilder<T> {
28        TextAnalyzerBuilder { tokenizer }
29    }
30
31    pub fn token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
32        self.tokenizer.token_stream(text)
33    }
34}
35
36pub trait BoxableTokenizer: 'static + Send + Sync {
37    fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a>;
38
39    fn box_clone(&self) -> Box<dyn BoxableTokenizer>;
40}
41
42impl<T: Tokenizer> BoxableTokenizer for T {
43    fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
44        BoxTokenStream::new(self.token_stream(text))
45    }
46
47    fn box_clone(&self) -> Box<dyn BoxableTokenizer> {
48        Box::new(self.clone())
49    }
50}
51
52impl Clone for Box<dyn BoxableTokenizer> {
53    fn clone(&self) -> Self {
54        (**self).box_clone()
55    }
56}
57
58impl Tokenizer for Box<dyn BoxableTokenizer> {
59    type TokenStream<'a> = BoxTokenStream<'a>;
60
61    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
62        (**self).box_token_stream(text)
63    }
64}
65
66pub struct TextAnalyzerBuilder<T = Box<dyn BoxableTokenizer>> {
67    tokenizer: T,
68}
69
70impl<T: Tokenizer> TextAnalyzerBuilder<T> {
71    pub fn filter<F: TokenFilter>(self, token_filter: F) -> TextAnalyzerBuilder<F::Tokenizer<T>> {
72        TextAnalyzerBuilder {
73            tokenizer: token_filter.transform(self.tokenizer),
74        }
75    }
76
77    pub fn dynamic(self) -> TextAnalyzerBuilder {
78        TextAnalyzerBuilder {
79            tokenizer: Box::new(self.tokenizer),
80        }
81    }
82
83    pub fn filter_dynamic<F: TokenFilter>(self, token_filter: F) -> TextAnalyzerBuilder {
84        self.filter(token_filter).dynamic()
85    }
86
87    pub fn build(self) -> TextAnalyzer {
88        TextAnalyzer {
89            tokenizer: Box::new(self.tokenizer),
90        }
91    }
92}