use std::mem;

use crate::{Token, TokenFilter, TokenStream, Tokenizer};
/// Token filter that converts every token's text to lowercase.
///
/// Stateless marker type; the per-stream scratch buffer lives in the
/// [`LowerCaserFilter`] it produces via `transform`.
#[derive(Clone)]
pub struct LowerCaser;
13
14impl TokenFilter for LowerCaser {
15 type Tokenizer<T: Tokenizer> = LowerCaserFilter<T>;
16
17 fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T> {
18 LowerCaserFilter {
19 tokenizer,
20 buffer: String::new(),
21 }
22 }
23}
24
/// Tokenizer wrapper produced by `LowerCaser::transform`: lowercases the
/// text of every token emitted by the inner tokenizer.
#[derive(Clone)]
pub struct LowerCaserFilter<T> {
    /// The wrapped tokenizer supplying the raw tokens.
    tokenizer: T,
    /// Scratch buffer reused across tokens for non-ASCII lowercasing,
    /// so no allocation is needed per token.
    buffer: String,
}
30
31impl<T: Tokenizer> Tokenizer for LowerCaserFilter<T> {
32 type TokenStream<'a> = LowerCaserTokenStream<'a, T::TokenStream<'a>>;
33
34 fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
35 self.buffer.clear();
36 LowerCaserTokenStream {
37 buffer: &mut self.buffer,
38 tail: self.tokenizer.token_stream(text),
39 }
40 }
41}
42
/// Token stream that lowercases each token produced by the wrapped stream.
pub struct LowerCaserTokenStream<'a, T> {
    /// Scratch space, borrowed from the parent filter, used when a token
    /// contains non-ASCII characters.
    buffer: &'a mut String,
    /// The underlying token stream being filtered.
    tail: T,
}
47
/// Writes the Unicode lowercase form of `text` into `output`.
///
/// `output` is cleared first so the same buffer can be reused across calls.
/// A separate output buffer is required because `char::to_lowercase` may
/// yield more than one character (e.g. 'İ' lowercases to "i\u{307}"), so the
/// mapping cannot be done in place.
fn to_lowercase_unicode(text: &str, output: &mut String) {
    output.clear();
    // Lowercasing rarely shrinks the byte length, so the input length is a
    // sound lower-bound estimate. (Previously a hard-coded `reserve(50)` was
    // used, which under-reserved for long tokens and over-reserved for short
    // ones.)
    output.reserve(text.len());
    for ch in text.chars() {
        output.extend(ch.to_lowercase());
    }
}
55
56impl<T: TokenStream> TokenStream for LowerCaserTokenStream<'_, T> {
57 fn advance(&mut self) -> bool {
58 if !self.tail.advance() {
59 return false;
60 }
61 if self.token_mut().text.is_ascii() {
62 self.token_mut().text.make_ascii_lowercase();
63 } else {
64 to_lowercase_unicode(&self.tail.token().text, self.buffer);
65 mem::swap(&mut self.tail.token_mut().text, self.buffer);
66 }
67 true
68 }
69
70 fn token(&self) -> &Token {
71 self.tail.token()
72 }
73
74 fn token_mut(&mut self) -> &mut Token {
75 self.tail.token_mut()
76 }
77}