Skip to main content

lance_tokenizer/
stemmer.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3// SPDX-License-Identifier: MIT
4// Adapted from Tantivy v0.24.2 stemmer filter.
5// Copyright (c) 2017-present Tantivy contributors.
6
7use std::borrow::Cow;
8use std::mem;
9
10use rust_stemmers::Algorithm;
11use serde::{Deserialize, Serialize};
12
13use crate::{Token, TokenFilter, TokenStream, Tokenizer};
14
/// Languages for which a stemming algorithm can be selected.
///
/// Each variant maps one-to-one onto the `rust_stemmers::Algorithm` variant
/// of the same name (see `Language::algorithm`).
#[derive(Debug, Serialize, Deserialize, Eq, PartialEq, Copy, Clone)]
pub enum Language {
    Arabic,
    Danish,
    Dutch,
    English,
    Finnish,
    French,
    German,
    Greek,
    Hungarian,
    Italian,
    Norwegian,
    Portuguese,
    Romanian,
    Russian,
    Spanish,
    Swedish,
    Tamil,
    Turkish,
}
36
37impl Language {
38    fn algorithm(self) -> Algorithm {
39        match self {
40            Self::Arabic => Algorithm::Arabic,
41            Self::Danish => Algorithm::Danish,
42            Self::Dutch => Algorithm::Dutch,
43            Self::English => Algorithm::English,
44            Self::Finnish => Algorithm::Finnish,
45            Self::French => Algorithm::French,
46            Self::German => Algorithm::German,
47            Self::Greek => Algorithm::Greek,
48            Self::Hungarian => Algorithm::Hungarian,
49            Self::Italian => Algorithm::Italian,
50            Self::Norwegian => Algorithm::Norwegian,
51            Self::Portuguese => Algorithm::Portuguese,
52            Self::Romanian => Algorithm::Romanian,
53            Self::Russian => Algorithm::Russian,
54            Self::Spanish => Algorithm::Spanish,
55            Self::Swedish => Algorithm::Swedish,
56            Self::Tamil => Algorithm::Tamil,
57            Self::Turkish => Algorithm::Turkish,
58        }
59    }
60}
61
/// Token filter that stems tokens using a fixed stemming algorithm.
///
/// Applied to a tokenizer through its `TokenFilter` implementation, which
/// wraps the tokenizer in a `StemmerFilter`.
#[derive(Clone)]
pub struct Stemmer {
    /// Algorithm handed to `rust_stemmers` when a token stream is created.
    stemmer_algorithm: Algorithm,
}
66
67impl Stemmer {
68    pub fn new(language: Language) -> Self {
69        Self {
70            stemmer_algorithm: language.algorithm(),
71        }
72    }
73}
74
75impl Default for Stemmer {
76    fn default() -> Self {
77        Self::new(Language::English)
78    }
79}
80
81impl TokenFilter for Stemmer {
82    type Tokenizer<T: Tokenizer> = StemmerFilter<T>;
83
84    fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T> {
85        StemmerFilter {
86            stemmer_algorithm: self.stemmer_algorithm,
87            inner: tokenizer,
88        }
89    }
90}
91
/// Tokenizer wrapper produced by `Stemmer::transform`.
///
/// Its `Tokenizer` implementation delegates tokenization to `inner` and
/// stems every token the inner stream yields.
#[derive(Clone)]
pub struct StemmerFilter<T> {
    /// Algorithm used to build the `rust_stemmers::Stemmer` for each stream.
    stemmer_algorithm: Algorithm,
    /// The wrapped tokenizer whose tokens are stemmed.
    inner: T,
}
97
98impl<T: Tokenizer> Tokenizer for StemmerFilter<T> {
99    type TokenStream<'a> = StemmerTokenStream<T::TokenStream<'a>>;
100
101    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
102        StemmerTokenStream {
103            tail: self.inner.token_stream(text),
104            stemmer: rust_stemmers::Stemmer::create(self.stemmer_algorithm),
105            buffer: String::new(),
106        }
107    }
108}
109
/// Token stream that replaces the text of each token from `tail` with its stem.
pub struct StemmerTokenStream<T> {
    /// The wrapped token stream that supplies the tokens.
    tail: T,
    /// Stemmer applied to each token's text in `advance`.
    stemmer: rust_stemmers::Stemmer,
    /// Scratch string reused across tokens: when the stem is a borrowed slice
    /// of the token text, it is copied here and swapped into the token.
    buffer: String,
}
115
116impl<T: TokenStream> TokenStream for StemmerTokenStream<T> {
117    fn advance(&mut self) -> bool {
118        if !self.tail.advance() {
119            return false;
120        }
121        let token = self.tail.token_mut();
122        let stemmed = self.stemmer.stem(&token.text);
123        match stemmed {
124            Cow::Owned(stemmed) => token.text = stemmed,
125            Cow::Borrowed(stemmed) => {
126                self.buffer.clear();
127                self.buffer.push_str(stemmed);
128                mem::swap(&mut token.text, &mut self.buffer);
129            }
130        }
131        true
132    }
133
134    fn token(&self) -> &Token {
135        self.tail.token()
136    }
137
138    fn token_mut(&mut self) -> &mut Token {
139        self.tail.token_mut()
140    }
141}