1#![cfg_attr(docsrs, feature(doc_cfg))] #[cfg(feature = "serde")]
47extern crate serde_crate as serde;
48#[cfg(feature = "serde")]
49use serde::{Deserialize, Serialize};
50
51use std::{collections::HashMap, ops::Range};
52
53use regex::Regex;
54
55#[cfg(feature = "from_xml")]
56mod from_xml;
57#[cfg(feature = "from_xml")]
58mod utils;
59#[cfg(feature = "from_xml")]
60pub use from_xml::Error;
61
/// Newtype wrapping the name of a language as a plain `String`.
///
/// Values of this type are used as hash-map keys elsewhere in this file, which
/// is why the full set of comparison and hashing traits is derived.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(
    feature = "serde",
    derive(Serialize, Deserialize),
    serde(crate = "serde_crate")
)]
pub struct Language(pub String);
70
71#[cfg_attr(
75 feature = "serde",
76 derive(Serialize, Deserialize),
77 serde(crate = "serde_crate")
78)]
79#[derive(Debug, Clone)]
80#[non_exhaustive]
81struct Rule {
82 #[cfg_attr(feature = "serde", serde(with = "serde_regex"))]
83 regex: Regex,
84 do_break: bool,
85}
86
87impl Rule {
88 fn match_indices<'a>(&'a self, text: &'a str) -> impl Iterator<Item = usize> + 'a {
91 self.regex.captures_iter(text).filter_map(|x| {
92 x.get(1).map(|x| x.start())
96 })
97 }
98
99 fn do_break(&self) -> bool {
101 self.do_break
102 }
103}
104
105#[cfg_attr(
110 feature = "serde",
111 derive(Serialize, Deserialize),
112 serde(crate = "serde_crate")
113)]
114#[derive(Debug, Clone, Default)]
115pub struct Rules {
116 rules: Vec<Rule>,
117}
118
119impl Rules {
120 pub fn split_ranges(&self, text: &str) -> Vec<Range<usize>> {
122 let mut segments = Vec::new();
123
124 let mut masked_bytes: Vec<Option<bool>> = vec![None; text.len()];
126
127 'outer: for rule in &self.rules {
128 for byte_index in rule.match_indices(text) {
129 if byte_index >= text.len() {
130 continue 'outer;
131 }
132
133 if masked_bytes[byte_index].is_none() {
134 masked_bytes[byte_index] = Some(rule.do_break());
135 }
136 }
137 }
138
139 let mut prev_byte_pos = 0;
140
141 for (byte_pos, _c) in text.char_indices() {
143 if let Some(true) = masked_bytes[byte_pos] {
144 segments.push(prev_byte_pos..byte_pos);
145 prev_byte_pos = byte_pos;
146 }
147 }
148
149 if text[prev_byte_pos..].chars().next().is_some() {
152 segments.push(prev_byte_pos..text.len());
153 }
154
155 segments
156 }
157
158 pub fn split<'a, 'b>(&self, text: &'a str) -> impl Iterator<Item = &'a str> + 'b
160 where
161 'a: 'b,
162 {
163 self.split_ranges(text)
164 .into_iter()
165 .map(move |range| &text[range])
166 }
167
168 pub fn is_empty(&self) -> bool {
169 self.rules.is_empty()
170 }
171
172 pub fn len(&self) -> usize {
173 self.rules.len()
174 }
175}
176
177#[cfg_attr(
180 feature = "serde",
181 derive(Serialize, Deserialize),
182 serde(crate = "serde_crate")
183)]
184#[derive(Debug, Clone)]
185struct LanguageRegex {
186 #[cfg_attr(feature = "serde", serde(with = "serde_regex"))]
187 regex: Regex,
188 language: Language,
189}
190
191#[cfg_attr(
194 feature = "serde",
195 derive(Serialize, Deserialize),
196 serde(crate = "serde_crate")
197)]
198#[derive(Debug, Clone)]
199pub struct SRX {
200 cascade: bool,
201 map: Vec<LanguageRegex>,
202 rules: HashMap<Language, Vec<Rule>>,
203 errors: HashMap<Language, Vec<String>>,
204}
205
206impl SRX {
207 pub fn language_rules<S: AsRef<str>>(&self, lang_code: S) -> Rules {
213 let mut rules = Vec::new();
214
215 for item in &self.map {
216 if item.regex.is_match(lang_code.as_ref()) {
217 rules.extend(self.rules.get(&item.language).expect("languagerulename in <languagemap> must have a corresponding entry in <languagerules>").iter().cloned());
218 if !self.cascade {
219 break;
220 }
221 }
222 }
223
224 Rules { rules }
225 }
226
227 pub fn errors(&self) -> &HashMap<Language, Vec<String>> {
229 &self.errors
230 }
231}
232
// These tests parse SRX files with `SRX::from_str` and build rules with
// `Rule::new`. Neither is defined in this file; presumably both come from the
// `from_xml` module (whose `Error` type is only re-exported under that
// feature). The module is therefore only compiled when `from_xml` is enabled,
// so that testing without the feature does not fail to build.
#[cfg(all(test, feature = "from_xml"))]
mod tests {
    use super::*;
    use quickcheck_macros::quickcheck;
    use std::{fs, str::FromStr};

    /// Reads and parses the SRX file at `path`, panicking with the path in the
    /// message if the file cannot be read or parsed.
    fn load_srx(path: &str) -> SRX {
        let xml = fs::read_to_string(path)
            .unwrap_or_else(|err| panic!("failed to read {}: {}", path, err));

        SRX::from_str(&xml).unwrap_or_else(|err| panic!("failed to parse {}: {:?}", path, err))
    }

    /// Loads the SRX file at `path` and returns the rules selected for `"en"`.
    fn english_rules(path: &str) -> Rules {
        load_srx(path).language_rules("en")
    }

    /// The segments returned by `split` must add up to the length of the input.
    #[quickcheck]
    fn length_invariant(text: String) {
        let rules = english_rules("data/example.srx");

        let total: usize = rules.split(&text).map(str::len).sum();
        assert_eq!(text.len(), total);
    }

    #[test]
    fn match_indices_correct() {
        let rule = Rule::new(Some("abc"), Some("d+fg"), true).expect("test rule is valid");

        assert_eq!(
            rule.match_indices("abcddfgxxx").collect::<Vec<_>>(),
            vec![3_usize]
        );
    }

    #[test]
    fn example_splits_correct() {
        let rules = english_rules("data/example.srx");

        let text =
            "The U.K. Prime Minister, Mr. Blair, was seen out with his family today. He is well.";
        assert_eq!(
            rules.split(text).collect::<Vec<_>>(),
            vec![
                "The U.K. Prime Minister, Mr. Blair, was seen out with his family today.",
                " He is well."
            ]
        );
    }

    #[test]
    fn example_splits_correct_multi_emoji() {
        let rules = english_rules("data/segment.srx");

        let text = "e.g. U.K. and Mr. do not split. SRX is a 👒🍏🍱-based format 🐱";
        assert_eq!(
            rules.split(text).collect::<Vec<_>>(),
            vec![
                "e.g. U.K. and Mr. do not split. ",
                "SRX is a 👒🍏🍱-based format 🐱"
            ]
        );
    }

    /// Only checks that splitting this input does not panic; judging by the
    /// test name, it is meant to cover a match index at the very end of the
    /// text.
    #[test]
    fn ignores_last_match_index() {
        let rules = english_rules("data/segment.srx");

        let _ = rules.split("Hello! ").collect::<Vec<_>>();
    }

    #[test]
    fn errors_reported() {
        let srx = load_srx("data/segment.srx");

        assert!(!srx.errors().is_empty());
        assert_eq!(srx.errors().values().flatten().count(), 49);
    }
}