// summavy_tokenizer_api/lib.rs

//! Tokenizers are in charge of chopping text into a stream of tokens
//! ready for indexing. This is a separate crate from tantivy, so implementors don't need to update
//! it for each new tantivy version.
//!
//! To add support for a tokenizer, implement the [`Tokenizer`](crate::Tokenizer) trait.
//! Check out the [tantivy repo](https://github.com/quickwit-oss/tantivy/tree/main/src/tokenizer) for some examples.
7
8use std::borrow::{Borrow, BorrowMut};
9use std::ops::{Deref, DerefMut};
10
11use serde::{Deserialize, Serialize};
12
/// A single unit of text produced by tokenization, together with its
/// byte location in the original text and its position in the stream.
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
pub struct Token {
    /// Offset (byte index) of the first character of the token.
    /// Offsets shall not be modified by token filters.
    pub offset_from: usize,
    /// Offset (byte index) of the last character of the token + 1.
    /// The text that generated the token should be obtained by
    /// `&text[token.offset_from..token.offset_to]`.
    pub offset_to: usize,
    /// Position, expressed in number of tokens.
    pub position: usize,
    /// Actual text content of the token.
    pub text: String,
    /// Length of the token, expressed in number of original tokens
    /// it spans (normally 1).
    pub position_length: usize,
}
30
31impl Default for Token {
32    fn default() -> Token {
33        Token {
34            offset_from: 0,
35            offset_to: 0,
36            position: usize::MAX,
37            text: String::with_capacity(200),
38            position_length: 1,
39        }
40    }
41}
42
/// `Tokenizer`s are in charge of splitting text into a stream of tokens
/// before indexing.
///
/// # Warning
///
/// This API may change to use associated types.
pub trait Tokenizer: 'static + Send + Sync + TokenizerClone {
    /// Creates a token stream for a given `str`.
    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>;
}
53
/// Helper trait allowing boxed `Tokenizer` trait objects to be cloned.
///
/// A blanket implementation exists for every `Tokenizer` that also
/// implements `Clone`, so implementors normally never write this by hand.
pub trait TokenizerClone {
    /// Returns a boxed clone of this tokenizer.
    fn box_clone(&self) -> Box<dyn Tokenizer>;
}
57
58impl<T: Tokenizer + Clone> TokenizerClone for T {
59    fn box_clone(&self) -> Box<dyn Tokenizer> {
60        Box::new(self.clone())
61    }
62}
63
/// Simple wrapper of `Box<dyn TokenStream + 'a>`.
///
/// Dereferences to the wrapped stream; any `TokenStream` can be
/// converted into one via `From`/`Into`.
///
/// See [`TokenStream`] for more information.
pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
68
69impl<'a, T> From<T> for BoxTokenStream<'a>
70where T: TokenStream + 'a
71{
72    fn from(token_stream: T) -> BoxTokenStream<'a> {
73        BoxTokenStream(Box::new(token_stream))
74    }
75}
76
77impl<'a> Deref for BoxTokenStream<'a> {
78    type Target = dyn TokenStream + 'a;
79
80    fn deref(&self) -> &Self::Target {
81        &*self.0
82    }
83}
84impl<'a> DerefMut for BoxTokenStream<'a> {
85    fn deref_mut(&mut self) -> &mut Self::Target {
86        &mut *self.0
87    }
88}
89
90impl<'a> TokenStream for Box<dyn TokenStream + 'a> {
91    fn advance(&mut self) -> bool {
92        let token_stream: &mut dyn TokenStream = self.borrow_mut();
93        token_stream.advance()
94    }
95
96    fn token<'b>(&'b self) -> &'b Token {
97        let token_stream: &'b (dyn TokenStream + 'a) = self.borrow();
98        token_stream.token()
99    }
100
101    fn token_mut<'b>(&'b mut self) -> &'b mut Token {
102        let token_stream: &'b mut (dyn TokenStream + 'a) = self.borrow_mut();
103        token_stream.token_mut()
104    }
105}
106
107/// `TokenStream` is the result of the tokenization.
108///
109/// It consists consumable stream of `Token`s.
110pub trait TokenStream {
111    /// Advance to the next token
112    ///
113    /// Returns false if there are no other tokens.
114    fn advance(&mut self) -> bool;
115
116    /// Returns a reference to the current token.
117    fn token(&self) -> &Token;
118
119    /// Returns a mutable reference to the current token.
120    fn token_mut(&mut self) -> &mut Token;
121
122    /// Helper to iterate over tokens. It
123    /// simply combines a call to `.advance()`
124    /// and `.token()`.
125    fn next(&mut self) -> Option<&Token> {
126        if self.advance() {
127            Some(self.token())
128        } else {
129            None
130        }
131    }
132
133    /// Helper function to consume the entire `TokenStream`
134    /// and push the tokens to a sink function.
135    fn process(&mut self, sink: &mut dyn FnMut(&Token)) {
136        while self.advance() {
137            sink(self.token());
138        }
139    }
140}
141
/// Simple wrapper of `Box<dyn TokenFilter>`.
///
/// See [`TokenFilter`] for more information.
pub struct BoxTokenFilter(Box<dyn TokenFilter>);
146
147impl Deref for BoxTokenFilter {
148    type Target = dyn TokenFilter;
149
150    fn deref(&self) -> &dyn TokenFilter {
151        &*self.0
152    }
153}
154
155impl<T: TokenFilter> From<T> for BoxTokenFilter {
156    fn from(tokenizer: T) -> BoxTokenFilter {
157        BoxTokenFilter(Box::new(tokenizer))
158    }
159}
160
/// Helper trait allowing boxed `TokenFilter` trait objects to be cloned.
///
/// A blanket implementation exists for every `TokenFilter` that also
/// implements `Clone`.
pub trait TokenFilterClone {
    /// Returns a boxed clone of this token filter.
    fn box_clone(&self) -> BoxTokenFilter;
}
164
165/// Trait for the pluggable components of `Tokenizer`s.
166pub trait TokenFilter: 'static + Send + Sync + TokenFilterClone {
167    /// Wraps a token stream and returns the modified one.
168    fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a>;
169}
170
171impl<T: TokenFilter + Clone> TokenFilterClone for T {
172    fn box_clone(&self) -> BoxTokenFilter {
173        BoxTokenFilter::from(self.clone())
174    }
175}
176
#[cfg(test)]
mod test {
    use super::*;

    /// `Token::clone` must produce a field-for-field copy.
    #[test]
    fn clone() {
        let original = Token {
            offset_from: 2,
            offset_to: 3,
            position: 1,
            text: String::from("abc"),
            position_length: 1,
        };
        let copy = original.clone();

        assert_eq!(copy.position, original.position);
        assert_eq!(copy.offset_from, original.offset_from);
        assert_eq!(copy.offset_to, original.offset_to);
        assert_eq!(copy.text, original.text);
    }
}
197}