// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
// SPDX-License-Identifier: MIT
// Adapted from Tantivy v0.24.2 lower caser.
// Copyright (c) 2017-present Tantivy contributors.
7use std::mem;
8
9use crate::{Token, TokenFilter, TokenStream, Tokenizer};
10
/// Token filter that lowercases every token's text.
///
/// ASCII-only tokens are lowercased in place; tokens containing non-ASCII
/// characters go through full Unicode lowercasing via a reusable scratch
/// buffer (see [`LowerCaserTokenStream`]).
#[derive(Clone)]
pub struct LowerCaser;
13
impl TokenFilter for LowerCaser {
    type Tokenizer<T: Tokenizer> = LowerCaserFilter<T>;

    /// Wraps `tokenizer` so that every token it emits is lowercased.
    fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T> {
        LowerCaserFilter {
            tokenizer,
            // Scratch space reused across tokens for the non-ASCII path,
            // avoiding a fresh allocation per token.
            buffer: String::new(),
        }
    }
}
24
/// [`Tokenizer`] produced by applying [`LowerCaser`] to an inner tokenizer.
#[derive(Clone)]
pub struct LowerCaserFilter<T> {
    // The wrapped tokenizer whose tokens get lowercased.
    tokenizer: T,
    // Scratch buffer lent to each token stream for non-ASCII lowercasing.
    buffer: String,
}
30
31impl<T: Tokenizer> Tokenizer for LowerCaserFilter<T> {
32    type TokenStream<'a> = LowerCaserTokenStream<'a, T::TokenStream<'a>>;
33
34    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
35        self.buffer.clear();
36        LowerCaserTokenStream {
37            buffer: &mut self.buffer,
38            tail: self.tokenizer.token_stream(text),
39        }
40    }
41}
42
/// Token stream that lowercases each token produced by the wrapped stream.
pub struct LowerCaserTokenStream<'a, T> {
    // Scratch buffer borrowed from the owning filter; used (and swapped
    // with the token text) only on the non-ASCII path.
    buffer: &'a mut String,
    // The underlying token stream being filtered.
    tail: T,
}
47
/// Lowercases `text` into `output`, replacing any previous contents.
///
/// Uses [`char::to_lowercase`], so a single input character may expand to
/// several output characters (e.g. 'İ' lowercases to "i\u{307}").
fn to_lowercase_unicode(text: &str, output: &mut String) {
    output.clear();
    // Reserve based on the input size instead of a fixed magic constant:
    // lowercasing rarely changes the byte length much, so this avoids most
    // reallocations regardless of how long the token is.
    output.reserve(text.len());
    output.extend(text.chars().flat_map(char::to_lowercase));
}
55
56impl<T: TokenStream> TokenStream for LowerCaserTokenStream<'_, T> {
57    fn advance(&mut self) -> bool {
58        if !self.tail.advance() {
59            return false;
60        }
61        if self.token_mut().text.is_ascii() {
62            self.token_mut().text.make_ascii_lowercase();
63        } else {
64            to_lowercase_unicode(&self.tail.token().text, self.buffer);
65            mem::swap(&mut self.tail.token_mut().text, self.buffer);
66        }
67        true
68    }
69
70    fn token(&self) -> &Token {
71        self.tail.token()
72    }
73
74    fn token_mut(&mut self) -> &mut Token {
75        self.tail.token_mut()
76    }
77}