rphonetic/
helper.rs

1/*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements.  See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License.  You may obtain a copy of the License at
8 *
9 *      http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17use std::fmt::{Display, Formatter};
18use std::ops::{Index, Range, RangeFrom, RangeFull, RangeInclusive, RangeTo, RangeToInclusive};
19
20use serde::{Deserialize, Serialize};
21
/// Collapse every run of a repeated char listed in `chars` into a single
/// ASCII-uppercased occurrence, the way a regex replacing "s+" by "S" would.
///
/// Chars that are not listed in `chars` are copied unchanged and end the
/// current run, so the same listed char appearing again afterwards is
/// emitted once more.
pub fn replace_compact_all_to_uppercase(string: String, chars: Vec<char>) -> String {
    let mut compacted = String::with_capacity(string.len());
    // Last listed char that was emitted, `None` when the previous char was not listed.
    let mut run: Option<char> = None;

    for current in string.chars() {
        if !chars.contains(&current) {
            compacted.push(current);
            run = None;
            continue;
        }

        // Only the first char of a run is written; the rest of the run is dropped.
        if run != Some(current) {
            compacted.push(current.to_ascii_uppercase());
            run = Some(current);
        }
    }

    compacted
}
46
/// Replace a trailing `pattern` by `to`.
///
/// When `string` does not end with `pattern` it is returned untouched.
pub fn replace_end<'a>(mut string: String, pattern: &'a str, to: &'a str) -> String {
    if !string.ends_with(pattern) {
        return string;
    }

    // Drop the suffix, then append its replacement.
    let kept = string.len() - pattern.len();
    string.truncate(kept);
    string.push_str(to);
    string
}
54
/// Tell whether `c` holds a vowel.
///
/// Only the lowercase chars `a`, `e`, `i`, `o` and `u` are vowels; `y` is
/// one as well when `include_y` is `true`. `None` is never a vowel.
pub fn is_vowel(c: Option<char>, include_y: bool) -> bool {
    match c {
        Some('a' | 'e' | 'i' | 'o' | 'u') => true,
        Some('y') => include_y,
        _ => false,
    }
}
62
/// Build a new string by passing every `(position, char)` pair of `string`
/// to `f` and keeping the char it returns.
///
/// The position is the index of the char in the string (counted in chars,
/// not in bytes), starting at 0.
pub fn replace_char<F>(string: String, mut f: F) -> String
where
    F: FnMut((usize, char)) -> char,
{
    let mut replaced = String::new();
    for indexed in string.chars().enumerate() {
        replaced.push(f(indexed));
    }
    replaced
}
69
/// Strip from `string` every char that is not a lowercase letter.
///
/// A char is kept only when [char::is_lowercase] is `true` for it, so
/// uppercase letters are removed as well as digits, spaces and punctuation.
pub fn remove_all_non_letter(string: String) -> String {
    let mut letters = string;
    letters.retain(char::is_lowercase);
    letters
}
76
77/// This struct is a wrapper around an `&str` allowing
78/// to slice by char.
79///
80/// It implements [Index], allowing to slice according to
81/// [char]. Please note that it is not really efficient as
82/// it uses [CharIndices](std::str::CharIndices).
83#[derive(Debug, Clone, Hash, Ord, PartialOrd, Eq, PartialEq, Serialize, Deserialize)]
84pub struct CharSequence<'a> {
85    inner: &'a str,
86    len_in_char: usize,
87}
88
89impl Display for CharSequence<'_> {
90    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
91        write!(f, "{}", self.inner)
92    }
93}
94
95impl CharSequence<'_> {
96    /// Return the length of the string in terme of
97    /// [char] instead of byte.
98    pub fn len(&self) -> usize {
99        self.len_in_char
100    }
101
102    /// Return `true` if the string contains no `char`.
103    pub fn is_empty(&self) -> bool {
104        self.len_in_char == 0
105    }
106
107    /// Return the inner string.
108    pub fn as_str(&self) -> &str {
109        self.inner
110    }
111}
112
113impl<'a> From<&'a str> for CharSequence<'a> {
114    fn from(original: &'a str) -> Self {
115        let len_in_char = original.chars().count();
116        Self {
117            inner: original,
118            len_in_char,
119        }
120    }
121}
122
123impl<'a> From<CharSequence<'a>> for &'a str {
124    fn from(value: CharSequence<'a>) -> Self {
125        value.inner
126    }
127}
128
129impl<'a> Index<Range<usize>> for CharSequence<'a> {
130    type Output = str;
131
132    // To make this faster, at the cost of an increase of memory usage,
133    // we could store an array in an array of size chars().count()
134    // the index of each char().
135    fn index(&self, index: Range<usize>) -> &'a Self::Output {
136        let mut iterator = self.inner.char_indices().skip(index.start);
137
138        let start: Option<(usize, _)> = iterator.next();
139        let skip = if index.end > index.start {
140            index.end - (index.start + 1)
141        } else {
142            return "";
143        };
144        let mut iterator = iterator.skip(skip);
145        let end: Option<(usize, _)> = iterator.next();
146
147        let start = match start {
148            None => return "",
149            Some((s, _)) => s,
150        };
151
152        match end {
153            None => &self.inner[start..],
154            Some((s, _)) => &self.inner[start..s],
155        }
156    }
157}
158
159impl Index<RangeFrom<usize>> for CharSequence<'_> {
160    type Output = str;
161
162    fn index(&self, index: RangeFrom<usize>) -> &Self::Output {
163        &self[index.start..self.len_in_char]
164    }
165}
166
167impl Index<RangeFull> for CharSequence<'_> {
168    type Output = str;
169
170    fn index(&self, _: RangeFull) -> &Self::Output {
171        &self[0..self.len_in_char]
172    }
173}
174
175impl Index<RangeInclusive<usize>> for CharSequence<'_> {
176    type Output = str;
177
178    fn index(&self, index: RangeInclusive<usize>) -> &Self::Output {
179        &self[*index.start()..*index.end() + 1]
180    }
181}
182
183impl Index<RangeTo<usize>> for CharSequence<'_> {
184    type Output = str;
185
186    fn index(&self, index: RangeTo<usize>) -> &Self::Output {
187        &self[0..index.end]
188    }
189}
190
191impl Index<RangeToInclusive<usize>> for CharSequence<'_> {
192    type Output = str;
193
194    fn index(&self, index: RangeToInclusive<usize>) -> &Self::Output {
195        &self[0..=index.end]
196    }
197}
198
#[cfg(test)]
mod tests {
    // Bring every item of the parent module (private ones included) into scope.
    use super::*;

    /// Builds a one-`char` [CharSequence] for each `char` in
    /// `'\u{0000}'..'\u{ffff}'`, checks that its length in chars is 1 and
    /// hands it to `check` together with the text it was built from.
    fn for_each_single_char(check: impl Fn(&CharSequence<'_>, &str)) {
        for ch in '\u{0000}'..'\u{ffff}' {
            let text = ch.to_string();
            let sequence = CharSequence::from(text.as_str());
            assert_eq!(sequence.len_in_char, 1);
            check(&sequence, text.as_str());
        }
    }

    #[test]
    fn test_vowel() {
        for vowel in ['a', 'e', 'i', 'o', 'u'] {
            assert!(is_vowel(Some(vowel), false));
        }
        for other in ['b', 'd', 'p', 'q', 'z', 'A', 'I', '3'] {
            assert!(!is_vowel(Some(other), false));
        }

        // 'y' only counts as a vowel on request.
        assert!(!is_vowel(Some('y'), false));
        assert!(is_vowel(Some('y'), true));

        assert!(!is_vowel(None, false));
    }

    #[test]
    fn test_replace_compact_all_to_uppercase_nothing_to_compact() {
        let input = "aaaabbbbccccdddd".to_string();
        let compacted = replace_compact_all_to_uppercase(input, vec!['e', 'f', 'g']);
        assert_eq!(compacted, "aaaabbbbccccdddd");
    }

    #[test]
    fn test_replace_compact_all_to_uppercase_compact_all() {
        let input = "aaaabbbbccccdddd".to_string();
        let compacted = replace_compact_all_to_uppercase(input, vec!['a', 'b', 'c', 'd']);
        assert_eq!(compacted, "ABCD");
    }

    #[test]
    fn test_replace_compact_all_to_uppercase() {
        let input = "aaaabbbbccccdddd".to_string();
        let compacted = replace_compact_all_to_uppercase(input, vec!['b', 'd']);
        assert_eq!(compacted, "aaaaBccccD");
    }

    #[test]
    fn test_char_sequence_all_char_range() {
        for_each_single_char(|sequence, text| assert_eq!(&sequence[0..1], text));
    }

    #[test]
    fn test_char_sequence_all_char_range_from() {
        for_each_single_char(|sequence, text| assert_eq!(&sequence[0..], text));
    }

    #[test]
    fn test_char_sequence_all_char_range_full() {
        for_each_single_char(|sequence, text| assert_eq!(&sequence[..], text));
    }

    #[test]
    fn test_char_sequence_all_char_range_inclusive() {
        for_each_single_char(|sequence, text| assert_eq!(&sequence[0..=0], text));
    }

    #[test]
    fn test_char_sequence_all_char_range_to() {
        for_each_single_char(|sequence, text| assert_eq!(&sequence[..1], text));
    }

    #[test]
    fn test_char_sequence_all_char_range_to_inclusive() {
        for_each_single_char(|sequence, text| assert_eq!(&sequence[..=0], text));
    }

    #[test]
    fn test_char_sequence_index_with_ascii() {
        // With ASCII-only text, char positions and byte offsets coincide, so
        // slicing the sequence must agree with slicing the `&str` directly.
        let text = "This is the string to test.";
        let sequence = CharSequence::from(text);

        assert_eq!(sequence[5..7], text[5..7]);
        assert_eq!(sequence[5..], text[5..]);
        assert_eq!(sequence[..], text[..]);
        assert_eq!(sequence[5..=6], text[5..=6]);
        assert_eq!(sequence[..6], text[..6]);
        assert_eq!(sequence[..=6], text[..=6]);
    }

    #[test]
    fn test_char_sequence_chinese() {
        let text = "每个人都有他的作战策略,直到脸上中了一拳。";
        // The byte length differs from the length in chars.
        assert_ne!(text.len(), 21);
        let sequence = CharSequence::from(text);
        assert_eq!(sequence.len(), 21);

        assert_eq!(&sequence[6..9], "的作战");
        assert_eq!(&sequence[6..], "的作战策略,直到脸上中了一拳。");
        assert_eq!(&sequence[..], "每个人都有他的作战策略,直到脸上中了一拳。");
        assert_eq!(&sequence[6..=9], "的作战策");
        assert_eq!(&sequence[..9], "每个人都有他的作战");
        assert_eq!(&sequence[..=9], "每个人都有他的作战策");
    }

    #[test]
    fn test_char_sequence_to_0() {
        let sequence = CharSequence::from("azerty");

        assert_eq!(&sequence[..0], "");
    }
}