// lance_tokenizer/analyzer.rs
use crate::{BoxTokenStream, TokenFilter, Tokenizer};
/// A text analyzer: a tokenizer plus an optional chain of token
/// filters, type-erased behind a boxed [`BoxableTokenizer`].
#[derive(Clone)]
pub struct TextAnalyzer {
    // Type-erased pipeline; `Box<dyn BoxableTokenizer>` is cloneable
    // through the manual `Clone` impl in this module.
    tokenizer: Box<dyn BoxableTokenizer>,
}
13
14impl<T: Tokenizer + Clone> From<T> for TextAnalyzer {
15 fn from(tokenizer: T) -> Self {
16 Self::builder(tokenizer).build()
17 }
18}
19
20impl Default for TextAnalyzer {
21 fn default() -> Self {
22 Self::from(crate::RawTokenizer::default())
23 }
24}
25
26impl TextAnalyzer {
27 pub fn builder<T: Tokenizer>(tokenizer: T) -> TextAnalyzerBuilder<T> {
28 TextAnalyzerBuilder { tokenizer }
29 }
30
31 pub fn token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
32 self.tokenizer.token_stream(text)
33 }
34}
35
/// An object-safe (dyn-compatible) counterpart of [`Tokenizer`], so
/// differently-typed tokenizers can live behind `Box<dyn BoxableTokenizer>`.
pub trait BoxableTokenizer: 'static + Send + Sync {
    /// Type-erased form of `Tokenizer::token_stream`: produces a boxed
    /// token stream for `text`.
    fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a>;

    /// Clones `self` into a fresh boxed trait object; this is what
    /// powers `Clone` for `Box<dyn BoxableTokenizer>`.
    fn box_clone(&self) -> Box<dyn BoxableTokenizer>;
}
41
42impl<T: Tokenizer> BoxableTokenizer for T {
43 fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
44 BoxTokenStream::new(self.token_stream(text))
45 }
46
47 fn box_clone(&self) -> Box<dyn BoxableTokenizer> {
48 Box::new(self.clone())
49 }
50}
51
52impl Clone for Box<dyn BoxableTokenizer> {
53 fn clone(&self) -> Self {
54 (**self).box_clone()
55 }
56}
57
58impl Tokenizer for Box<dyn BoxableTokenizer> {
59 type TokenStream<'a> = BoxTokenStream<'a>;
60
61 fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
62 (**self).box_token_stream(text)
63 }
64}
65
/// Builder for [`TextAnalyzer`]. `T` is the statically-typed tokenizer
/// pipeline built so far; it defaults to the type-erased boxed form
/// produced by `dynamic()`.
pub struct TextAnalyzerBuilder<T = Box<dyn BoxableTokenizer>> {
    tokenizer: T,
}
69
70impl<T: Tokenizer> TextAnalyzerBuilder<T> {
71 pub fn filter<F: TokenFilter>(self, token_filter: F) -> TextAnalyzerBuilder<F::Tokenizer<T>> {
72 TextAnalyzerBuilder {
73 tokenizer: token_filter.transform(self.tokenizer),
74 }
75 }
76
77 pub fn dynamic(self) -> TextAnalyzerBuilder {
78 TextAnalyzerBuilder {
79 tokenizer: Box::new(self.tokenizer),
80 }
81 }
82
83 pub fn filter_dynamic<F: TokenFilter>(self, token_filter: F) -> TextAnalyzerBuilder {
84 self.filter(token_filter).dynamic()
85 }
86
87 pub fn build(self) -> TextAnalyzer {
88 TextAnalyzer {
89 tokenizer: Box::new(self.tokenizer),
90 }
91 }
92}