// lance_tokenizer/remove_long.rs
use crate::{Token, TokenFilter, TokenStream, Tokenizer};
8
/// Token filter that removes tokens whose text is too long.
///
/// The configured limit is an *exclusive* upper bound: a token is kept only
/// when `token.text.len()` is strictly less than the limit, so a token whose
/// length equals the limit is removed as well.
///
/// NOTE(review): the length is whatever `Token::text.len()` reports —
/// presumably UTF-8 bytes if `text` is a `String`; confirm against `Token`.
#[derive(Clone, Debug)]
pub struct RemoveLongFilter {
    /// Exclusive upper bound on the length of a token's text.
    length_limit: usize,
}

impl RemoveLongFilter {
    /// Creates a `RemoveLongFilter` with the given length limit.
    ///
    /// Tokens whose text length is greater than or equal to `length_limit`
    /// are dropped by the streams this filter produces. A limit of `0`
    /// therefore drops every token.
    pub fn limit(length_limit: usize) -> Self {
        Self { length_limit }
    }
}
19
20impl TokenFilter for RemoveLongFilter {
21 type Tokenizer<T: Tokenizer> = RemoveLongFilterWrapper<T>;
22
23 fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T> {
24 RemoveLongFilterWrapper {
25 length_limit: self.length_limit,
26 inner: tokenizer,
27 }
28 }
29}
30
31#[derive(Clone)]
32pub struct RemoveLongFilterWrapper<T: Tokenizer> {
33 length_limit: usize,
34 inner: T,
35}
36
37impl<T: Tokenizer> Tokenizer for RemoveLongFilterWrapper<T> {
38 type TokenStream<'a> = RemoveLongFilterStream<T::TokenStream<'a>>;
39
40 fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
41 RemoveLongFilterStream {
42 token_length_limit: self.length_limit,
43 tail: self.inner.token_stream(text),
44 }
45 }
46}
47
48pub struct RemoveLongFilterStream<T> {
49 token_length_limit: usize,
50 tail: T,
51}
52
53impl<T> RemoveLongFilterStream<T> {
54 fn predicate(&self, token: &Token) -> bool {
55 token.text.len() < self.token_length_limit
56 }
57}
58
59impl<T: TokenStream> TokenStream for RemoveLongFilterStream<T> {
60 fn advance(&mut self) -> bool {
61 while self.tail.advance() {
62 if self.predicate(self.tail.token()) {
63 return true;
64 }
65 }
66 false
67 }
68
69 fn token(&self) -> &Token {
70 self.tail.token()
71 }
72
73 fn token_mut(&mut self) -> &mut Token {
74 self.tail.token_mut()
75 }
76}