Skip to main content

lance_tokenizer/
remove_long.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3// SPDX-License-Identifier: MIT
4// Adapted from Tantivy v0.24.2 remove-long filter.
5// Copyright (c) 2017-present Tantivy contributors.
6
7use crate::{Token, TokenFilter, TokenStream, Tokenizer};
8
/// Token filter configuration that discards overly long tokens.
///
/// The filter itself only stores the length limit; the actual filtering is
/// performed by the stream produced once this filter has been applied to a
/// tokenizer through [`TokenFilter::transform`].
///
/// A token is kept only when `token.text.len()` is strictly smaller than the
/// limit, so a token whose length is exactly equal to the limit is removed.
#[derive(Clone)]
pub struct RemoveLongFilter {
    /// Exclusive upper bound on `token.text.len()` for a token to be kept.
    length_limit: usize,
}

impl RemoveLongFilter {
    /// Builds a filter with the given length limit.
    ///
    /// `length_limit` is compared against `token.text.len()` with a strict
    /// `<`; a limit of `0` therefore rejects every token.
    pub fn limit(length_limit: usize) -> Self {
        RemoveLongFilter { length_limit }
    }
}
19
20impl TokenFilter for RemoveLongFilter {
21    type Tokenizer<T: Tokenizer> = RemoveLongFilterWrapper<T>;
22
23    fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T> {
24        RemoveLongFilterWrapper {
25            length_limit: self.length_limit,
26            inner: tokenizer,
27        }
28    }
29}
30
31#[derive(Clone)]
32pub struct RemoveLongFilterWrapper<T: Tokenizer> {
33    length_limit: usize,
34    inner: T,
35}
36
37impl<T: Tokenizer> Tokenizer for RemoveLongFilterWrapper<T> {
38    type TokenStream<'a> = RemoveLongFilterStream<T::TokenStream<'a>>;
39
40    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
41        RemoveLongFilterStream {
42            token_length_limit: self.length_limit,
43            tail: self.inner.token_stream(text),
44        }
45    }
46}
47
48pub struct RemoveLongFilterStream<T> {
49    token_length_limit: usize,
50    tail: T,
51}
52
53impl<T> RemoveLongFilterStream<T> {
54    fn predicate(&self, token: &Token) -> bool {
55        token.text.len() < self.token_length_limit
56    }
57}
58
59impl<T: TokenStream> TokenStream for RemoveLongFilterStream<T> {
60    fn advance(&mut self) -> bool {
61        while self.tail.advance() {
62            if self.predicate(self.tail.token()) {
63                return true;
64            }
65        }
66        false
67    }
68
69    fn token(&self) -> &Token {
70        self.tail.token()
71    }
72
73    fn token_mut(&mut self) -> &mut Token {
74        self.tail.token_mut()
75    }
76}