Skip to main content

lance_tokenizer/
alphanum_only.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3// SPDX-License-Identifier: MIT
4// Adapted from Tantivy v0.24.2 alphanum-only filter.
5// Copyright (c) 2017-present Tantivy contributors.
6
7use crate::{Token, TokenFilter, TokenStream, Tokenizer};
8
/// Token filter that keeps only tokens made up entirely of ASCII letters
/// and digits.
///
/// Any token whose text contains another character (punctuation,
/// whitespace, non-ASCII letters, ...) is dropped from the stream.
/// Apply it to a tokenizer via [`TokenFilter::transform`].
#[derive(Clone, Debug)]
pub struct AlphaNumOnlyFilter;
11
/// Token stream adapter that skips every token of the wrapped stream whose
/// text is not purely ASCII alphanumeric.
///
/// `Debug` is available whenever the wrapped stream type is `Debug`.
#[derive(Debug)]
pub struct AlphaNumOnlyFilterStream<T> {
    /// The wrapped (upstream) token stream being filtered.
    tail: T,
}
15
16impl<T> AlphaNumOnlyFilterStream<T> {
17    fn predicate(&self, token: &Token) -> bool {
18        token.text.chars().all(|ch| ch.is_ascii_alphanumeric())
19    }
20}
21
22impl TokenFilter for AlphaNumOnlyFilter {
23    type Tokenizer<T: Tokenizer> = AlphaNumOnlyFilterWrapper<T>;
24
25    fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T> {
26        AlphaNumOnlyFilterWrapper(tokenizer)
27    }
28}
29
/// Tokenizer wrapper produced by applying [`AlphaNumOnlyFilter`] to an
/// inner tokenizer.
///
/// Its token streams are those of the inner tokenizer with every token
/// that is not purely ASCII alphanumeric removed.
#[derive(Clone, Debug)]
pub struct AlphaNumOnlyFilterWrapper<T>(T);
32
33impl<T: Tokenizer> Tokenizer for AlphaNumOnlyFilterWrapper<T> {
34    type TokenStream<'a> = AlphaNumOnlyFilterStream<T::TokenStream<'a>>;
35
36    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
37        AlphaNumOnlyFilterStream {
38            tail: self.0.token_stream(text),
39        }
40    }
41}
42
43impl<T: TokenStream> TokenStream for AlphaNumOnlyFilterStream<T> {
44    fn advance(&mut self) -> bool {
45        while self.tail.advance() {
46            if self.predicate(self.tail.token()) {
47                return true;
48            }
49        }
50        false
51    }
52
53    fn token(&self) -> &Token {
54        self.tail.token()
55    }
56
57    fn token_mut(&mut self) -> &mut Token {
58        self.tail.token_mut()
59    }
60}