lance_tokenizer/
tokenizer_api.rs1use std::borrow::{Borrow, BorrowMut};
8use std::ops::{Deref, DerefMut};
9
/// A single token, as exposed by [`TokenStream::token`].
///
/// Every field is public so that tokenizers and filters can fill in or
/// rewrite a token in place.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Token {
    /// Offset at which the token starts.
    /// NOTE(review): presumably a byte offset into the tokenized text — confirm.
    pub offset_from: usize,
    /// Offset at which the token ends.
    /// NOTE(review): presumably an exclusive byte offset — confirm.
    pub offset_to: usize,
    /// Position of the token. `Default` and `reset` set this to `usize::MAX`.
    pub position: usize,
    /// The token's text.
    pub text: String,
    /// Length of the token expressed in positions. `Default` and `reset` set
    /// this to `1`.
    pub position_length: usize,
}
24
25impl Default for Token {
26 fn default() -> Self {
27 Self {
28 offset_from: 0,
29 offset_to: 0,
30 position: usize::MAX,
31 text: String::new(),
32 position_length: 1,
33 }
34 }
35}
36
37impl Token {
38 pub fn reset(&mut self) {
40 self.offset_from = 0;
41 self.offset_to = 0;
42 self.position = usize::MAX;
43 self.text.clear();
44 self.position_length = 1;
45 }
46}
47
48pub trait Tokenizer: 'static + Clone + Send + Sync {
50 type TokenStream<'a>: TokenStream;
52
53 fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a>;
55}
56
57pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
59
60impl<'a> BoxTokenStream<'a> {
61 pub fn new<T: TokenStream + 'a>(token_stream: T) -> Self {
62 Self(Box::new(token_stream))
63 }
64}
65
66impl TokenStream for BoxTokenStream<'_> {
67 fn advance(&mut self) -> bool {
68 self.0.advance()
69 }
70
71 fn token(&self) -> &Token {
72 self.0.token()
73 }
74
75 fn token_mut(&mut self) -> &mut Token {
76 self.0.token_mut()
77 }
78}
79
80impl<'a> Deref for BoxTokenStream<'a> {
81 type Target = dyn TokenStream + 'a;
82
83 fn deref(&self) -> &Self::Target {
84 &*self.0
85 }
86}
87
88impl DerefMut for BoxTokenStream<'_> {
89 fn deref_mut(&mut self) -> &mut Self::Target {
90 &mut *self.0
91 }
92}
93
94impl<'a> TokenStream for Box<dyn TokenStream + 'a> {
95 fn advance(&mut self) -> bool {
96 let token_stream: &mut dyn TokenStream = self.borrow_mut();
97 token_stream.advance()
98 }
99
100 fn token(&self) -> &Token {
101 let token_stream: &(dyn TokenStream + 'a) = self.borrow();
102 token_stream.token()
103 }
104
105 fn token_mut(&mut self) -> &mut Token {
106 let token_stream: &mut (dyn TokenStream + 'a) = self.borrow_mut();
107 token_stream.token_mut()
108 }
109}
110
111pub trait TokenStream {
113 fn advance(&mut self) -> bool;
115
116 fn token(&self) -> &Token;
118
119 fn token_mut(&mut self) -> &mut Token;
121
122 fn next(&mut self) -> Option<&Token> {
124 if self.advance() {
125 Some(self.token())
126 } else {
127 None
128 }
129 }
130
131 fn process(&mut self, sink: &mut dyn FnMut(&Token)) {
133 while self.advance() {
134 sink(self.token());
135 }
136 }
137}
138
139pub trait TokenFilter: 'static + Send + Sync {
141 type Tokenizer<T: Tokenizer>: Tokenizer;
143
144 fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T>;
146}