lindera_filter/
character_filter.rs1pub mod japanese_iteration_mark;
2pub mod mapping;
3pub mod regex;
4pub mod unicode_normalize;
5
6use serde_json::Value;
7use std::ops::Deref;
8
9use lindera_core::error::LinderaErrorKind;
10use lindera_core::LinderaResult;
11
12use crate::character_filter::japanese_iteration_mark::{
13 JapaneseIterationMarkCharacterFilter, JapaneseIterationMarkCharacterFilterConfig,
14 JAPANESE_ITERATION_MARK_CHARACTER_FILTER_NAME,
15};
16use crate::character_filter::mapping::{
17 MappingCharacterFilter, MappingCharacterFilterConfig, MAPPING_CHARACTER_FILTER_NAME,
18};
19use crate::character_filter::regex::{
20 RegexCharacterFilter, RegexCharacterFilterConfig, REGEX_CHARACTER_FILTER_NAME,
21};
22use crate::character_filter::unicode_normalize::{
23 UnicodeNormalizeCharacterFilter, UnicodeNormalizeCharacterFilterConfig,
24 UNICODE_NORMALIZE_CHARACTER_FILTER_NAME,
25};
26use crate::parse_cli_flag;
27
28pub trait CharacterFilter: 'static + Send + Sync + CharacterFilterClone {
29 fn name(&self) -> &str;
30 fn apply(&self, text: &str) -> LinderaResult<(String, Vec<usize>, Vec<i64>)>;
31}
32
33pub struct BoxCharacterFilter(Box<dyn CharacterFilter + 'static + Send + Sync>);
34
35impl Deref for BoxCharacterFilter {
36 type Target = dyn CharacterFilter;
37
38 fn deref(&self) -> &dyn CharacterFilter {
39 &*self.0
40 }
41}
42
43impl<T: CharacterFilter> From<T> for BoxCharacterFilter {
44 fn from(character_filter: T) -> BoxCharacterFilter {
45 BoxCharacterFilter(Box::new(character_filter))
46 }
47}
48
49pub trait CharacterFilterClone {
50 fn box_clone(&self) -> BoxCharacterFilter;
51}
52
53impl<T: CharacterFilter + Clone + 'static> CharacterFilterClone for T {
54 fn box_clone(&self) -> BoxCharacterFilter {
55 BoxCharacterFilter::from(self.clone())
56 }
57}
58
/// Records that `diff` applies from `offset` onwards.
///
/// `offsets` and `diffs` are parallel tables. When the most recently recorded
/// offset equals `offset`, the last diff is replaced instead of appending a
/// second entry for the same offset; otherwise a new `(offset, diff)` pair is
/// appended to the end of both tables.
pub fn add_offset_diff(offsets: &mut Vec<usize>, diffs: &mut Vec<i64>, offset: usize, diff: i64) {
    if offsets.last() == Some(&offset) {
        // Same position as the previous entry: drop its diff so the push
        // below overwrites it.
        diffs.pop();
    } else {
        offsets.push(offset);
    }
    diffs.push(diff);
}
78
/// Maps `offset` in filtered text back to the corresponding offset in the
/// original text, using the parallel `offsets` / `diffs` tables.
///
/// * With no recorded offsets, `offset` is returned unchanged.
/// * Otherwise the entry at `offset`, or the closest entry before it, is
///   looked up with a binary search and its diff is added to `offset`.
/// * If `offset` lies before the first recorded offset it is returned
///   unchanged, except when `text_len` is `0`, in which case the first diff
///   is applied.
///
/// The sum is computed as `i64` and converted back with an `as` cast.
///
/// # Panics
///
/// Panics if `diffs` has no element at the selected index.
pub fn correct_offset(offset: usize, offsets: &[usize], diffs: &[i64], text_len: usize) -> usize {
    if offsets.is_empty() {
        return offset;
    }

    let index = match offsets.binary_search(&offset) {
        // Exact hit: this entry's diff applies.
        Ok(exact) => exact,
        // Before the first entry while `text_len` is zero: use the first diff.
        Err(0) if text_len == 0 => 0,
        // Before the first entry: nothing has shifted yet.
        Err(0) => return offset,
        // Between entries or past the end: the preceding entry applies.
        Err(insert_at) => insert_at - 1,
    };

    (offset as i64 + diffs[index]) as usize
}
105
106pub struct CharacterFilterLoader {}
107
108impl CharacterFilterLoader {
109 pub fn load_from_value(kind: &str, value: &Value) -> LinderaResult<BoxCharacterFilter> {
110 let character_filter = match kind {
111 JAPANESE_ITERATION_MARK_CHARACTER_FILTER_NAME => {
112 BoxCharacterFilter::from(JapaneseIterationMarkCharacterFilter::new(
113 JapaneseIterationMarkCharacterFilterConfig::from_value(value)?,
114 ))
115 }
116 MAPPING_CHARACTER_FILTER_NAME => {
117 let config = MappingCharacterFilterConfig::from_value(value)?;
118 BoxCharacterFilter::from(MappingCharacterFilter::new(config)?)
119 }
120 REGEX_CHARACTER_FILTER_NAME => {
121 let config = RegexCharacterFilterConfig::from_value(value)?;
122 BoxCharacterFilter::from(RegexCharacterFilter::new(config)?)
123 }
124 UNICODE_NORMALIZE_CHARACTER_FILTER_NAME => {
125 let config = UnicodeNormalizeCharacterFilterConfig::from_value(value)?;
126 BoxCharacterFilter::from(UnicodeNormalizeCharacterFilter::new(config))
127 }
128 _ => {
129 return Err(LinderaErrorKind::Deserialize
130 .with_error(anyhow::anyhow!("unsupported character filter: {}", kind)));
131 }
132 };
133
134 Ok(character_filter)
135 }
136
137 pub fn load_from_cli_flag(cli_flag: &str) -> LinderaResult<BoxCharacterFilter> {
138 let (kind, args) = parse_cli_flag(cli_flag)?;
139
140 let character_filter = Self::load_from_value(kind, &args)?;
141
142 Ok(character_filter)
143 }
144}
145
#[cfg(test)]
mod tests {
    use super::correct_offset;

    /// Checks that spans in the filtered text map back to the matching spans
    /// of the original text.
    #[test]
    fn test_correct_offset() {
        let text = "ABCDEFG";
        let filtered_text = "AbbbCdddFgggg";

        let text_len = filtered_text.len();
        let offsets = vec![2, 3, 7, 10, 11, 12];
        let diffs = vec![-1, -2, -3, -4, -5, -6];

        // (filtered start, filtered end, filtered slice,
        //  expected original start, expected original end, original slice)
        let cases = [
            (1, 4, "bbb", 1, 2, "B"),
            (9, 13, "gggg", 6, 7, "G"),
            (0, 13, "AbbbCdddFgggg", 0, 7, "ABCDEFG"),
        ];

        for &(start, end, filtered, want_start, want_end, original) in cases.iter() {
            assert_eq!(filtered, &filtered_text[start..end]);

            let corrected_start = correct_offset(start, &offsets, &diffs, text_len);
            let corrected_end = correct_offset(end, &offsets, &diffs, text_len);

            assert_eq!(want_start, corrected_start);
            assert_eq!(want_end, corrected_end);
            assert_eq!(original, &text[corrected_start..corrected_end]);
        }
    }
}