izihawa_tantivy_tokenizer_api/lib.rs

//! Tokenizers are in charge of chopping text into a stream of tokens
//! ready for indexing. This is a separate crate from tantivy, so implementors
//! don't need to update their code for each new tantivy version.
//!
//! To add support for a tokenizer, implement the [`Tokenizer`] trait.
//! Check out the [tantivy repo](https://github.com/quickwit-oss/tantivy/tree/main/src/tokenizer) for some examples.
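//!
//! For illustration, a minimal sketch of what an implementation can look
//! like: a tokenizer that emits its entire input as one token. The
//! `WholeStringTokenizer` and `WholeStringStream` types are hypothetical,
//! and the crate name in the `use` line is assumed from this crate's
//! directory name.
//!
//! ```
//! use izihawa_tantivy_tokenizer_api::{Token, TokenStream, Tokenizer};
//!
//! #[derive(Clone)]
//! struct WholeStringTokenizer;
//!
//! struct WholeStringStream {
//!     token: Token,
//!     done: bool,
//! }
//!
//! impl TokenStream for WholeStringStream {
//!     fn advance(&mut self) -> bool {
//!         // Emit exactly one token, then report exhaustion.
//!         !std::mem::replace(&mut self.done, true)
//!     }
//!
//!     fn token(&self) -> &Token {
//!         &self.token
//!     }
//!
//!     fn token_mut(&mut self) -> &mut Token {
//!         &mut self.token
//!     }
//! }
//!
//! impl Tokenizer for WholeStringTokenizer {
//!     type TokenStream<'a> = WholeStringStream;
//!
//!     fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
//!         WholeStringStream {
//!             token: Token {
//!                 offset_from: 0,
//!                 offset_to: text.len(),
//!                 position: 0,
//!                 text: text.to_string(),
//!                 position_length: 1,
//!             },
//!             done: false,
//!         }
//!     }
//! }
//!
//! let mut tokenizer = WholeStringTokenizer;
//! let mut stream = tokenizer.token_stream("hello world");
//! assert!(stream.advance());
//! assert_eq!(stream.token().text, "hello world");
//! assert!(!stream.advance());
//! ```
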
use std::borrow::{Borrow, BorrowMut};
use std::ops::{Deref, DerefMut};

use serde::{Deserialize, Serialize};

/// A single token produced by a tokenizer.
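///
/// For instance, per the field documentation below (the concrete values are
/// made up for illustration):
///
/// ```
/// use izihawa_tantivy_tokenizer_api::Token;
///
/// let text = "hello world";
/// let token = Token {
///     offset_from: 6,
///     offset_to: 11,
///     position: 1,
///     text: "world".to_string(),
///     position_length: 1,
/// };
/// // `offset_from..offset_to` are byte indices into the original text.
/// assert_eq!(&text[token.offset_from..token.offset_to], token.text);
/// ```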
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
pub struct Token {
    /// Offset (byte index) of the first character of the token.
    /// Offsets shall not be modified by token filters.
    pub offset_from: usize,
    /// Offset (byte index) of the last character of the token + 1.
    /// The text that generated the token can be obtained via
    /// `&text[token.offset_from..token.offset_to]`.
    pub offset_to: usize,
    /// Position, expressed in number of tokens.
    pub position: usize,
    /// Actual text content of the token.
    pub text: String,
    /// The length of the token, expressed in number of original tokens.
    pub position_length: usize,
}

impl Default for Token {
    fn default() -> Token {
        Token {
            offset_from: 0,
            offset_to: 0,
            position: usize::MAX,
            text: String::new(),
            position_length: 1,
        }
    }
}

impl Token {
    /// Resets the token to its default state.
    pub fn reset(&mut self) {
        self.offset_from = 0;
        self.offset_to = 0;
        self.position = usize::MAX;
        self.text.clear();
        self.position_length = 1;
    }
}

/// `Tokenizer`s are in charge of splitting text into a stream of tokens
/// before indexing.
pub trait Tokenizer: 'static + Clone + Send + Sync {
    /// The token stream returned by this `Tokenizer`.
    type TokenStream<'a>: TokenStream;
    /// Creates a token stream for a given `str`.
    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a>;
}

/// Simple wrapper of `Box<dyn TokenStream + 'a>`.
pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);

impl TokenStream for BoxTokenStream<'_> {
    fn advance(&mut self) -> bool {
        self.0.advance()
    }

    fn token(&self) -> &Token {
        self.0.token()
    }

    fn token_mut(&mut self) -> &mut Token {
        self.0.token_mut()
    }
}

impl<'a> BoxTokenStream<'a> {
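    /// Wraps a concrete `TokenStream` behind a boxed trait object.
    ///
    /// A usage sketch (the `EmptyStream` type below is hypothetical and
    /// yields no tokens):
    ///
    /// ```
    /// use izihawa_tantivy_tokenizer_api::{BoxTokenStream, Token, TokenStream};
    ///
    /// struct EmptyStream(Token);
    ///
    /// impl TokenStream for EmptyStream {
    ///     fn advance(&mut self) -> bool {
    ///         false
    ///     }
    ///
    ///     fn token(&self) -> &Token {
    ///         &self.0
    ///     }
    ///
    ///     fn token_mut(&mut self) -> &mut Token {
    ///         &mut self.0
    ///     }
    /// }
    ///
    /// // The concrete stream type is erased behind `dyn TokenStream`.
    /// let mut stream = BoxTokenStream::new(EmptyStream(Token::default()));
    /// assert!(stream.next().is_none());
    /// ```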
    pub fn new<T: TokenStream + 'a>(token_stream: T) -> BoxTokenStream<'a> {
        BoxTokenStream(Box::new(token_stream))
    }
}

impl<'a> Deref for BoxTokenStream<'a> {
    type Target = dyn TokenStream + 'a;

    fn deref(&self) -> &Self::Target {
        &*self.0
    }
}

impl DerefMut for BoxTokenStream<'_> {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut *self.0
    }
}

impl<'a> TokenStream for Box<dyn TokenStream + 'a> {
    fn advance(&mut self) -> bool {
        let token_stream: &mut dyn TokenStream = self.borrow_mut();
        token_stream.advance()
    }

    fn token<'b>(&'b self) -> &'b Token {
        let token_stream: &'b (dyn TokenStream + 'a) = self.borrow();
        token_stream.token()
    }

    fn token_mut<'b>(&'b mut self) -> &'b mut Token {
        let token_stream: &'b mut (dyn TokenStream + 'a) = self.borrow_mut();
        token_stream.token_mut()
    }
}

/// `TokenStream` is the result of the tokenization.
///
/// It is a consumable stream of `Token`s.
pub trait TokenStream {
    /// Advances to the next token.
    ///
    /// Returns `false` if there are no more tokens.
    fn advance(&mut self) -> bool;

    /// Returns a reference to the current token.
    fn token(&self) -> &Token;

    /// Returns a mutable reference to the current token.
    fn token_mut(&mut self) -> &mut Token;

    /// Helper to iterate over tokens. It
    /// simply combines a call to `.advance()`
    /// and `.token()`.
    fn next(&mut self) -> Option<&Token> {
        if self.advance() {
            Some(self.token())
        } else {
            None
        }
    }

    /// Helper function to consume the entire `TokenStream`
    /// and push the tokens to a sink function.
    fn process(&mut self, sink: &mut dyn FnMut(&Token)) {
        while self.advance() {
            sink(self.token());
        }
    }
}

/// Trait for the pluggable components of `Tokenizer`s.
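///
/// For illustration, a sketch of a filter that lowercases token text (the
/// `LowerCaser*` types here are hypothetical, not part of this crate):
///
/// ```
/// use izihawa_tantivy_tokenizer_api::{Token, TokenFilter, TokenStream, Tokenizer};
///
/// #[derive(Clone)]
/// struct LowerCaser;
///
/// #[derive(Clone)]
/// struct LowerCaserTokenizer<T>(T);
///
/// struct LowerCaserStream<S>(S);
///
/// impl TokenFilter for LowerCaser {
///     type Tokenizer<T: Tokenizer> = LowerCaserTokenizer<T>;
///
///     fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T> {
///         LowerCaserTokenizer(tokenizer)
///     }
/// }
///
/// impl<T: Tokenizer> Tokenizer for LowerCaserTokenizer<T> {
///     type TokenStream<'a> = LowerCaserStream<T::TokenStream<'a>>;
///
///     fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
///         LowerCaserStream(self.0.token_stream(text))
///     }
/// }
///
/// impl<S: TokenStream> TokenStream for LowerCaserStream<S> {
///     fn advance(&mut self) -> bool {
///         if !self.0.advance() {
///             return false;
///         }
///         // Rewrite the text in place; offsets must stay untouched.
///         let lowered = self.0.token().text.to_lowercase();
///         self.0.token_mut().text = lowered;
///         true
///     }
///
///     fn token(&self) -> &Token {
///         self.0.token()
///     }
///
///     fn token_mut(&mut self) -> &mut Token {
///         self.0.token_mut()
///     }
/// }
/// ```
///
/// Given any base `Tokenizer`, `LowerCaser.transform(base)` then yields a
/// tokenizer whose tokens are lowercased.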
pub trait TokenFilter: 'static + Send + Sync {
    /// The `Tokenizer` type returned by this filter, typically parametrized by the underlying
    /// `Tokenizer`.
    type Tokenizer<T: Tokenizer>: Tokenizer;
    /// Wraps a `Tokenizer` and returns a new one.
    fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T>;
}

#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn clone() {
        let t1 = Token {
            position: 1,
            offset_from: 2,
            offset_to: 3,
            text: "abc".to_string(),
            position_length: 1,
        };
        let t2 = t1.clone();

        assert_eq!(t1.position, t2.position);
        assert_eq!(t1.offset_from, t2.offset_from);
        assert_eq!(t1.offset_to, t2.offset_to);
        assert_eq!(t1.text, t2.text);
        assert_eq!(t1.position_length, t2.position_length);
    }
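
    // Added sanity check: `reset` should restore every field to its
    // `Default` value.
    #[test]
    fn reset() {
        let mut token = Token {
            position: 1,
            offset_from: 2,
            offset_to: 3,
            text: "abc".to_string(),
            position_length: 2,
        };
        token.reset();

        assert_eq!(token, Token::default());
    }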
}