//! Tokenizer API for `lance_tokenizer` (`tokenizer_api.rs`).
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
// SPDX-License-Identifier: MIT
// Adapted from Tantivy v0.24.2 tokenizer API.
// Copyright (c) 2017-present Tantivy contributors.

use std::borrow::{Borrow, BorrowMut};
use std::ops::{Deref, DerefMut};

/// Token emitted by a tokenizer.
///
/// Offsets are byte offsets into the original input string; `position`
/// counts tokens, not bytes.
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct Token {
    /// Byte offset of the first character of the token in the source text.
    pub offset_from: usize,
    /// Byte offset one past the last character of the token in the source text.
    pub offset_to: usize,
    /// Logical token position, counted in tokens.
    /// Defaults to `usize::MAX` (see `Default`) — presumably a
    /// "not yet assigned" sentinel; confirm against the tokenizers.
    pub position: usize,
    /// Token text.
    pub text: String,
    /// Position length measured in original tokens (defaults to 1).
    /// NOTE(review): values > 1 would mean the token spans several
    /// source positions — no producer of that is visible here; confirm.
    pub position_length: usize,
}
24
25impl Default for Token {
26    fn default() -> Self {
27        Self {
28            offset_from: 0,
29            offset_to: 0,
30            position: usize::MAX,
31            text: String::new(),
32            position_length: 1,
33        }
34    }
35}
36
37impl Token {
38    /// Reset the token to its default state.
39    pub fn reset(&mut self) {
40        self.offset_from = 0;
41        self.offset_to = 0;
42        self.position = usize::MAX;
43        self.text.clear();
44        self.position_length = 1;
45    }
46}
47
/// Tokenizer splits text into a token stream.
///
/// Implementations must be cheaply cloneable and usable across threads
/// (`'static + Clone + Send + Sync`).
pub trait Tokenizer: 'static + Clone + Send + Sync {
    /// Stream type emitted by the tokenizer; may borrow from the
    /// tokenizer and/or the input text for `'a`.
    type TokenStream<'a>: TokenStream;

    /// Create a token stream for the provided text.
    ///
    /// Takes `&mut self` — presumably so implementations can reuse
    /// internal buffers between calls; confirm with implementors.
    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a>;
}
56
57/// Token stream object-safe wrapper.
58pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
59
60impl<'a> BoxTokenStream<'a> {
61    pub fn new<T: TokenStream + 'a>(token_stream: T) -> Self {
62        Self(Box::new(token_stream))
63    }
64}
65
66impl TokenStream for BoxTokenStream<'_> {
67    fn advance(&mut self) -> bool {
68        self.0.advance()
69    }
70
71    fn token(&self) -> &Token {
72        self.0.token()
73    }
74
75    fn token_mut(&mut self) -> &mut Token {
76        self.0.token_mut()
77    }
78}
79
80impl<'a> Deref for BoxTokenStream<'a> {
81    type Target = dyn TokenStream + 'a;
82
83    fn deref(&self) -> &Self::Target {
84        &*self.0
85    }
86}
87
88impl DerefMut for BoxTokenStream<'_> {
89    fn deref_mut(&mut self) -> &mut Self::Target {
90        &mut *self.0
91    }
92}
93
94impl<'a> TokenStream for Box<dyn TokenStream + 'a> {
95    fn advance(&mut self) -> bool {
96        let token_stream: &mut dyn TokenStream = self.borrow_mut();
97        token_stream.advance()
98    }
99
100    fn token(&self) -> &Token {
101        let token_stream: &(dyn TokenStream + 'a) = self.borrow();
102        token_stream.token()
103    }
104
105    fn token_mut(&mut self) -> &mut Token {
106        let token_stream: &mut (dyn TokenStream + 'a) = self.borrow_mut();
107        token_stream.token_mut()
108    }
109}
110
111/// Consumable token stream.
112pub trait TokenStream {
113    /// Advance to the next token.
114    fn advance(&mut self) -> bool;
115
116    /// Access the current token.
117    fn token(&self) -> &Token;
118
119    /// Mutate the current token.
120    fn token_mut(&mut self) -> &mut Token;
121
122    /// Iterate to the next token and return it.
123    fn next(&mut self) -> Option<&Token> {
124        if self.advance() {
125            Some(self.token())
126        } else {
127            None
128        }
129    }
130
131    /// Consume the remaining stream into the provided sink.
132    fn process(&mut self, sink: &mut dyn FnMut(&Token)) {
133        while self.advance() {
134            sink(self.token());
135        }
136    }
137}
138
/// Filter that wraps a tokenizer with additional token-processing behavior.
///
/// A filter consumes itself (`transform` takes `self`) and yields a new
/// [`Tokenizer`] that wraps the given one.
pub trait TokenFilter: 'static + Send + Sync {
    /// Tokenizer produced by this filter when wrapping a tokenizer `T`.
    type Tokenizer<T: Tokenizer>: Tokenizer;

    /// Wrap the tokenizer, consuming the filter.
    fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T>;
}