lindera_filter/
character_filter.rs

1pub mod japanese_iteration_mark;
2pub mod mapping;
3pub mod regex;
4pub mod unicode_normalize;
5
6use serde_json::Value;
7use std::ops::Deref;
8
9use lindera_core::error::LinderaErrorKind;
10use lindera_core::LinderaResult;
11
12use crate::character_filter::japanese_iteration_mark::{
13    JapaneseIterationMarkCharacterFilter, JapaneseIterationMarkCharacterFilterConfig,
14    JAPANESE_ITERATION_MARK_CHARACTER_FILTER_NAME,
15};
16use crate::character_filter::mapping::{
17    MappingCharacterFilter, MappingCharacterFilterConfig, MAPPING_CHARACTER_FILTER_NAME,
18};
19use crate::character_filter::regex::{
20    RegexCharacterFilter, RegexCharacterFilterConfig, REGEX_CHARACTER_FILTER_NAME,
21};
22use crate::character_filter::unicode_normalize::{
23    UnicodeNormalizeCharacterFilter, UnicodeNormalizeCharacterFilterConfig,
24    UNICODE_NORMALIZE_CHARACTER_FILTER_NAME,
25};
26use crate::parse_cli_flag;
27
28pub trait CharacterFilter: 'static + Send + Sync + CharacterFilterClone {
29    fn name(&self) -> &str;
30    fn apply(&self, text: &str) -> LinderaResult<(String, Vec<usize>, Vec<i64>)>;
31}
32
33pub struct BoxCharacterFilter(Box<dyn CharacterFilter + 'static + Send + Sync>);
34
35impl Deref for BoxCharacterFilter {
36    type Target = dyn CharacterFilter;
37
38    fn deref(&self) -> &dyn CharacterFilter {
39        &*self.0
40    }
41}
42
43impl<T: CharacterFilter> From<T> for BoxCharacterFilter {
44    fn from(character_filter: T) -> BoxCharacterFilter {
45        BoxCharacterFilter(Box::new(character_filter))
46    }
47}
48
49pub trait CharacterFilterClone {
50    fn box_clone(&self) -> BoxCharacterFilter;
51}
52
53impl<T: CharacterFilter + Clone + 'static> CharacterFilterClone for T {
54    fn box_clone(&self) -> BoxCharacterFilter {
55        BoxCharacterFilter::from(self.clone())
56    }
57}
58
/// Records `diff` for `offset` in the parallel `offsets` / `diffs` vectors.
///
/// * If the most recently recorded offset equals `offset`, no new offset is
///   added and the last diff is replaced by `diff`.
/// * Otherwise (including when `offsets` is empty) `offset` and `diff` are
///   both appended.
///
/// No ordering check is performed; only the last entry of `offsets` is
/// inspected.
pub fn add_offset_diff(offsets: &mut Vec<usize>, diffs: &mut Vec<i64>, offset: usize, diff: i64) {
    let repeats_last_offset = offsets.last() == Some(&offset);

    if repeats_last_offset {
        // Same position as the previous entry: drop its diff so that the
        // push below replaces it.
        diffs.pop();
    } else {
        // New position (or very first entry).
        offsets.push(offset);
    }

    diffs.push(diff);
}
78
/// Adjusts `offset` by the diff recorded for its position and returns the
/// result.
///
/// `offsets` is binary-searched, so it must be sorted in ascending order;
/// `diffs[i]` is the adjustment that belongs to `offsets[i]`. The unit test
/// in this module uses the function to translate a position in filtered text
/// back to the corresponding position in the unfiltered text.
///
/// Which diff is used:
///
/// * `offsets` is empty: `offset` is returned unchanged.
/// * `offset` is found in `offsets`: the diff at that index.
/// * `offset` lies after at least one entry: the diff of the closest entry
///   below it.
/// * `offset` lies before every entry: `offset` is returned unchanged,
///   unless `text_len` is `0`, in which case the first diff is applied.
///
/// The sum is cast back with `as usize` without a range check, so a negative
/// sum is not rejected.
///
/// # Panics
///
/// Panics if the selected index is out of bounds for `diffs`.
pub fn correct_offset(offset: usize, offsets: &[usize], diffs: &[i64], text_len: usize) -> usize {
    // Nothing was recorded, so there is nothing to adjust.
    if offsets.is_empty() {
        return offset;
    }

    let diff_index = match offsets.binary_search(&offset) {
        // Exact hit: this entry's diff applies.
        Ok(found) => found,
        // `offset` precedes every recorded entry and the text is empty:
        // fall back to the first diff.
        Err(0) if text_len == 0 => 0,
        // `offset` precedes every recorded entry: it needs no adjustment.
        Err(0) => return offset,
        // Otherwise the entry just before the insertion point applies.
        Err(insert_at) => insert_at - 1,
    };

    let shifted = offset as i64 + diffs[diff_index];
    shifted as usize
}
105
/// Field-less namespace type whose associated functions build
/// `BoxCharacterFilter` values from a filter name plus configuration.
pub struct CharacterFilterLoader {}
107
108impl CharacterFilterLoader {
109    pub fn load_from_value(kind: &str, value: &Value) -> LinderaResult<BoxCharacterFilter> {
110        let character_filter = match kind {
111            JAPANESE_ITERATION_MARK_CHARACTER_FILTER_NAME => {
112                BoxCharacterFilter::from(JapaneseIterationMarkCharacterFilter::new(
113                    JapaneseIterationMarkCharacterFilterConfig::from_value(value)?,
114                ))
115            }
116            MAPPING_CHARACTER_FILTER_NAME => {
117                let config = MappingCharacterFilterConfig::from_value(value)?;
118                BoxCharacterFilter::from(MappingCharacterFilter::new(config)?)
119            }
120            REGEX_CHARACTER_FILTER_NAME => {
121                let config = RegexCharacterFilterConfig::from_value(value)?;
122                BoxCharacterFilter::from(RegexCharacterFilter::new(config)?)
123            }
124            UNICODE_NORMALIZE_CHARACTER_FILTER_NAME => {
125                let config = UnicodeNormalizeCharacterFilterConfig::from_value(value)?;
126                BoxCharacterFilter::from(UnicodeNormalizeCharacterFilter::new(config))
127            }
128            _ => {
129                return Err(LinderaErrorKind::Deserialize
130                    .with_error(anyhow::anyhow!("unsupported character filter: {}", kind)));
131            }
132        };
133
134        Ok(character_filter)
135    }
136
137    pub fn load_from_cli_flag(cli_flag: &str) -> LinderaResult<BoxCharacterFilter> {
138        let (kind, args) = parse_cli_flag(cli_flag)?;
139
140        let character_filter = Self::load_from_value(kind, &args)?;
141
142        Ok(character_filter)
143    }
144}
145
#[cfg(test)]
mod tests {
    use super::correct_offset;

    /// Checks that byte ranges in the filtered text are mapped back to the
    /// matching byte ranges in the unfiltered text.
    #[test]
    fn test_correct_offset() {
        let text = "ABCDEFG";
        let filtered_text = "AbbbCdddFgggg";

        let text_len = filtered_text.len();
        let offsets: Vec<usize> = vec![2, 3, 7, 10, 11, 12];
        let diffs: Vec<i64> = vec![-1, -2, -3, -4, -5, -6];

        // (filtered start, filtered end, filtered slice,
        //  expected start, expected end, expected slice)
        let cases: [(usize, usize, &str, usize, usize, &str); 3] = [
            (1, 4, "bbb", 1, 2, "B"),
            (9, 13, "gggg", 6, 7, "G"),
            (0, 13, "AbbbCdddFgggg", 0, 7, "ABCDEFG"),
        ];

        for &(start, end, filtered_slice, want_start, want_end, want_slice) in cases.iter() {
            // Sanity-check the fixture itself.
            assert_eq!(filtered_slice, &filtered_text[start..end]);

            let correct_start = correct_offset(start, &offsets, &diffs, text_len);
            let correct_end = correct_offset(end, &offsets, &diffs, text_len);

            assert_eq!(want_start, correct_start);
            assert_eq!(want_end, correct_end);
            assert_eq!(want_slice, &text[correct_start..correct_end]);
        }
    }
}