Skip to main content

lance_tokenizer/
raw_tokenizer.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3// SPDX-License-Identifier: MIT
4// Adapted from Tantivy v0.24.2 raw tokenizer.
5// Copyright (c) 2017-present Tantivy contributors.
6
7use crate::{Token, TokenStream, Tokenizer};
8
9#[derive(Clone, Default)]
10pub struct RawTokenizer {
11    token: Token,
12}
13
14pub struct RawTokenStream<'a> {
15    token: &'a mut Token,
16    has_token: bool,
17}
18
19impl Tokenizer for RawTokenizer {
20    type TokenStream<'a> = RawTokenStream<'a>;
21
22    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
23        self.token.reset();
24        self.token.position = 0;
25        self.token.position_length = 1;
26        self.token.offset_from = 0;
27        self.token.offset_to = text.len();
28        self.token.text.clear();
29        self.token.text.push_str(text);
30        RawTokenStream {
31            token: &mut self.token,
32            has_token: true,
33        }
34    }
35}
36
37impl TokenStream for RawTokenStream<'_> {
38    fn advance(&mut self) -> bool {
39        let has_token = self.has_token;
40        self.has_token = false;
41        has_token
42    }
43
44    fn token(&self) -> &Token {
45        self.token
46    }
47
48    fn token_mut(&mut self) -> &mut Token {
49        self.token
50    }
51}