Skip to main content

lingua/
ngram.rs

1/*
2 * Copyright © 2020-present Peter M. Stahl pemistahl@gmail.com
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17use std::fmt;
18use std::fmt::Display;
19
/// An owned n-gram, i.e. a short sequence of characters stored as a `String`.
///
/// Instances built through [`Ngram::new`] hold at most five characters.
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
pub(crate) struct Ngram {
    /// The text of the n-gram.
    pub(crate) value: String,
}

impl Ngram {
    /// Creates an owned n-gram by copying `value`.
    ///
    /// The length is measured in `char`s, not bytes, so multi-byte characters
    /// count as one.
    ///
    /// # Panics
    ///
    /// Panics if `value` contains six or more characters.
    pub(crate) fn new(value: &str) -> Self {
        let char_count = value.chars().count();
        assert!(
            char_count < 6,
            "length {char_count} of ngram '{value}' is not in range 0..6"
        );
        Self {
            value: value.to_owned(),
        }
    }
}

impl Display for Ngram {
    /// Writes the raw n-gram text; width and padding flags are not applied.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(&self.value)
    }
}
42
/// A borrowed n-gram: a string slice together with its cached character count.
///
/// Instances built through [`NgramRef::new`] hold at most five characters and
/// have `char_count` equal to the number of `char`s in `value`.
#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash)]
pub(crate) struct NgramRef<'a> {
    /// The borrowed text of the n-gram.
    pub(crate) value: &'a str,
    /// Number of `char`s (not bytes) in `value`.
    pub(crate) char_count: usize,
}

impl<'a> NgramRef<'a> {
    /// Creates a borrowed n-gram over `value` without copying the text.
    ///
    /// The length is measured in `char`s, not bytes, so multi-byte characters
    /// count as one.
    ///
    /// # Panics
    ///
    /// Panics if `value` contains six or more characters.
    pub(crate) fn new(value: &'a str) -> Self {
        let char_count = value.chars().count();
        assert!(
            char_count < 6,
            "length {char_count} of ngram '{value}' is not in range 0..6"
        );
        Self { value, char_count }
    }

    /// Returns an iterator that yields this n-gram first and then each
    /// successively shorter prefix, dropping one trailing character per step.
    ///
    /// The empty n-gram is never yielded.
    pub(crate) fn range_of_lower_order_ngrams(&self) -> NgramRefRange<'a> {
        NgramRefRange { start: *self }
    }
}

/// Iterator over an n-gram and its shorter prefixes, longest first.
///
/// Created by [`NgramRef::range_of_lower_order_ngrams`].
#[derive(Clone, Debug)]
pub(crate) struct NgramRefRange<'a> {
    /// The n-gram that the next call to `next` will yield (if non-empty).
    start: NgramRef<'a>,
}

impl<'a> Iterator for NgramRefRange<'a> {
    type Item = NgramRef<'a>;

    fn next(&mut self) -> Option<Self::Item> {
        // Byte offset where the last character begins; `None` means the
        // remaining text is empty and the iteration is finished.
        let (last_char_start, _) = self.start.value.char_indices().next_back()?;
        let current = self.start;
        // Cutting at a character boundary keeps the slice valid UTF-8 even
        // when the trailing character is multi-byte.
        self.start.value = &self.start.value[..last_char_start];
        self.start.char_count -= 1;
        Some(current)
    }

    /// Reports the number of n-grams still to be yielded.
    ///
    /// One n-gram is produced per remaining character, so the cached
    /// `char_count` is the exact remaining length. This holds as long as
    /// `char_count` matches the characters in `value`, which
    /// [`NgramRef::new`] guarantees.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = self.start.char_count;
        (remaining, Some(remaining))
    }
}
78
#[cfg(test)]
mod tests {
    use super::*;

    /// The range must yield the full n-gram first, then every shorter prefix
    /// in turn, and stop before the empty string. The leading `ä` is a
    /// multi-byte character.
    #[test]
    fn test_ngram_iterator() {
        let ngram = NgramRef::new("äbcde");
        let mut range = ngram.range_of_lower_order_ngrams();

        for expected in ["äbcde", "äbcd", "äbc", "äb", "ä"] {
            assert_eq!(range.next(), Some(NgramRef::new(expected)));
        }
        assert_eq!(range.next(), None);
    }
}
94}