srx/
lib.rs

1//! A simple and reasonably fast Rust implementation of the
2//! [Segmentation Rules eXchange 2.0 standard](https://www.unicode.org/uli/pas/srx/srx20.html)
3//! for text segmentation. `srx` is *not* fully compliant with the standard.
4//!
5//! This crate is intended for segmentation of plaintext so markup information (`<formathandle>` and `segmentsubflows`)
6//! is ignored.
7//!
//! Contrary to the SRX spec, overlapping matches of the same `<rule>` are not found, which could
//! lead to different behavior in a few edge cases.
10//!
11//! ## Example
12//!
13//! ```
14//! use std::{fs, str::FromStr};
15//! use srx::SRX;
16//!
17//! let srx = SRX::from_str(&fs::read_to_string("data/segment.srx").unwrap())?;
18//! let english_rules = srx.language_rules("en");
19//!
20//! assert_eq!(
21//!     english_rules.split("e.g. U.K. and Mr. do not split. SRX is a rule-based format.").collect::<Vec<_>>(),
22//!     vec!["e.g. U.K. and Mr. do not split. ", "SRX is a rule-based format."]
23//! );
24//! # Ok::<(), srx::Error>(())
25//! ```
26//!
27//! ## Features
28//!
29//! - `serde`: Serde serialization and deserialization support for [SRX].
30//! - `from_xml`: [SRX::from_reader] method and [std::str::FromStr] implementation to load from an XML file in SRX format.
31//!
32//! ## A note on regular expressions
33//!
34//! This crate uses the [`regex` crate](https://github.com/rust-lang/regex) for parsing and executing
35//! regular expressions. The `regex` crate is mostly compatible with the
36//! [regular expression standard](https://www.unicode.org/uli/pas/srx/srx20.html#Intro_RegExp) from the SRX specification.
37//! However, some metacharacters such as `\Q` and `\E` are not supported.
38//!
39//! To still be able to use files containing unsupported rules and to parse useful SRX files
40//! such as
41//! [`segment.srx` from LanguageTool](https://github.com/languagetool-org/languagetool/blob/master/languagetool-core/src/main/resources/org/languagetool/resource/segment.srx)
42//! which does not comply with the standard by e. g. using look-ahead and look-behind, `srx`
43//! ignores `<rule>` elements with invalid regular expressions and provides information about
44//! them via the [SRX::errors] function.
45#![cfg_attr(docsrs, feature(doc_cfg))] // see https://stackoverflow.com/a/61417700
46#[cfg(feature = "serde")]
47extern crate serde_crate as serde;
48#[cfg(feature = "serde")]
49use serde::{Deserialize, Serialize};
50
51use std::{collections::HashMap, ops::Range};
52
53use regex::Regex;
54
55#[cfg(feature = "from_xml")]
56mod from_xml;
57#[cfg(feature = "from_xml")]
58mod utils;
59#[cfg(feature = "from_xml")]
60pub use from_xml::Error;
61
/// Name of a set of language rules (the `languagerulename` attribute in SRX).
///
/// A thin newtype around the raw name; the wrapped `String` is public.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(
    feature = "serde",
    derive(Serialize, Deserialize),
    serde(crate = "serde_crate")
)]
pub struct Language(pub String);
70
71/// A single SRX rule. In SRX, consists of one `before_break` and one `after_break` Regex.
72/// For efficiency this crate compiles these regexes into one regex of the form `before_break(after_break)`
73/// and uses the start of the first capture group as the split index.
74#[cfg_attr(
75    feature = "serde",
76    derive(Serialize, Deserialize),
77    serde(crate = "serde_crate")
78)]
79#[derive(Debug, Clone)]
80#[non_exhaustive]
81struct Rule {
82    #[cfg_attr(feature = "serde", serde(with = "serde_regex"))]
83    regex: Regex,
84    do_break: bool,
85}
86
87impl Rule {
88    /// Gets all byte indices in the text at which this rule matches.
89    /// Contrary to the SRX 2.0 spec this does not find overlapping matches.
90    fn match_indices<'a>(&'a self, text: &'a str) -> impl Iterator<Item = usize> + 'a {
91        self.regex.captures_iter(text).filter_map(|x| {
92            // generally it is guaranteed that a regex has
93            // at least one match, but be lenient about
94            // errors in the srx xml files and drop those without
95            x.get(1).map(|x| x.start())
96        })
97    }
98
99    /// Whether this rule breaks or prevents breaking.
100    fn do_break(&self) -> bool {
101        self.do_break
102    }
103}
104
105/// An ordered set of rules.
106/// Rules are executed in order.
107/// Once a rule matches on an index, no other rule can match at the same index.
108/// Each rule either breaks (i. e. splits the text at this index) or prevents breaking.
109#[cfg_attr(
110    feature = "serde",
111    derive(Serialize, Deserialize),
112    serde(crate = "serde_crate")
113)]
114#[derive(Debug, Clone, Default)]
115pub struct Rules {
116    rules: Vec<Rule>,
117}
118
119impl Rules {
120    /// Obtain the ranges for text segments. Guaranteed to be at character bounds.
121    pub fn split_ranges(&self, text: &str) -> Vec<Range<usize>> {
122        let mut segments = Vec::new();
123
124        // TODO use a proper tri-state enum here
125        let mut masked_bytes: Vec<Option<bool>> = vec![None; text.len()];
126
127        'outer: for rule in &self.rules {
128            for byte_index in rule.match_indices(text) {
129                if byte_index >= text.len() {
130                    continue 'outer;
131                }
132
133                if masked_bytes[byte_index].is_none() {
134                    masked_bytes[byte_index] = Some(rule.do_break());
135                }
136            }
137        }
138
139        let mut prev_byte_pos = 0;
140
141        // Iterate over characters, we don't want no half characters in the output ranges
142        for (byte_pos, _c) in text.char_indices() {
143            if let Some(true) = masked_bytes[byte_pos] {
144                segments.push(prev_byte_pos..byte_pos);
145                prev_byte_pos = byte_pos;
146            }
147        }
148
149        // Deal with the trailing element, which is by definition
150        // not required to be suffixed by a gap char.
151        if text[prev_byte_pos..].chars().next().is_some() {
152            segments.push(prev_byte_pos..text.len());
153        }
154
155        segments
156    }
157
158    /// Split text into segments.
159    pub fn split<'a, 'b>(&self, text: &'a str) -> impl Iterator<Item = &'a str> + 'b
160    where
161        'a: 'b,
162    {
163        self.split_ranges(text)
164            .into_iter()
165            .map(move |range| &text[range])
166    }
167
168    pub fn is_empty(&self) -> bool {
169        self.rules.is_empty()
170    }
171
172    pub fn len(&self) -> usize {
173        self.rules.len()
174    }
175}
176
177/// An entry of the `<maprules>` element.
178/// Associates a regex with a [Language].
179#[cfg_attr(
180    feature = "serde",
181    derive(Serialize, Deserialize),
182    serde(crate = "serde_crate")
183)]
184#[derive(Debug, Clone)]
185struct LanguageRegex {
186    #[cfg_attr(feature = "serde", serde(with = "serde_regex"))]
187    regex: Regex,
188    language: Language,
189}
190
191/// The SRX root.
192/// Does not execute rules on is own.
193#[cfg_attr(
194    feature = "serde",
195    derive(Serialize, Deserialize),
196    serde(crate = "serde_crate")
197)]
198#[derive(Debug, Clone)]
199pub struct SRX {
200    cascade: bool,
201    map: Vec<LanguageRegex>,
202    rules: HashMap<Language, Vec<Rule>>,
203    errors: HashMap<Language, Vec<String>>,
204}
205
206impl SRX {
207    /// Gets the rules for a language code by
208    /// - aggregating rules from all [Language]s with a matching `<languagepattern>` (if the SRX is set to be cascading)
209    /// - finding the first matching `<languagepattern>` (if the SRX is set to be not cascading)
210    ///
211    /// Result should be cached instead of calling this repeatedly as it clones the rules.
212    pub fn language_rules<S: AsRef<str>>(&self, lang_code: S) -> Rules {
213        let mut rules = Vec::new();
214
215        for item in &self.map {
216            if item.regex.is_match(lang_code.as_ref()) {
217                rules.extend(self.rules.get(&item.language).expect("languagerulename in <languagemap> must have a corresponding entry in <languagerules>").iter().cloned());
218                if !self.cascade {
219                    break;
220                }
221            }
222        }
223
224        Rules { rules }
225    }
226
227    /// Maps [Language]s to a vector of string representations of errors which occured during parsing regular expressions for this language.
228    pub fn errors(&self) -> &HashMap<Language, Vec<String>> {
229        &self.errors
230    }
231}
232
#[cfg(test)]
mod tests {
    use super::*;
    use quickcheck_macros::quickcheck;
    use std::{fs, str::FromStr};

    /// Loads the SRX file at `path` and returns its rules for the language code `en`.
    fn english_rules(path: &str) -> Rules {
        let xml = fs::read_to_string(path).expect("example file exists");
        let srx = SRX::from_str(&xml).expect("example file is valid");
        srx.language_rules("en")
    }

    /// Splitting must neither drop nor duplicate any byte of the input.
    #[quickcheck]
    fn length_invariant(text: String) {
        let rules = english_rules("data/example.srx");
        let total: usize = rules.split(&text).map(str::len).sum();

        assert_eq!(text.len(), total);
    }

    /// The reported index is the start of the `after_break` part of the match.
    #[test]
    fn match_indices_correct() {
        let rule = Rule::new(Some("abc"), Some("d+fg"), true).expect("test rule is valid");
        let found: Vec<usize> = rule.match_indices("abcddfgxxx").collect();

        assert_eq!(found, vec![3_usize]);
    }

    /// Example from the spec: https://www.unicode.org/uli/pas/srx/srx20.html#AppExample
    #[test]
    fn example_splits_correct() {
        let rules = english_rules("data/example.srx");
        let text =
            "The U.K. Prime Minister, Mr. Blair, was seen out with his family today. He is well.";

        let segments: Vec<&str> = rules.split(text).collect();

        assert_eq!(
            segments,
            vec![
                "The U.K. Prime Minister, Mr. Blair, was seen out with his family today.",
                " He is well."
            ]
        );
    }

    /// Multi-byte characters must not confuse the byte-based bookkeeping.
    #[test]
    fn example_splits_correct_multi_emoji() {
        let rules = english_rules("data/segment.srx");
        let text = "e.g. U.K. and Mr. do not split. SRX is a 👒🍏🍱-based format 🐱";

        let segments: Vec<&str> = rules.split(text).collect();

        assert_eq!(
            segments,
            vec![
                "e.g. U.K. and Mr. do not split. ",
                "SRX is a 👒🍏🍱-based format 🐱"
            ]
        );
    }

    /// A match index at the very end of the text must not cause a panic.
    #[test]
    fn ignores_last_match_index() {
        let rules = english_rules("data/segment.srx");

        let _ = rules.split("Hello! ").collect::<Vec<_>>();
    }

    /// Rules with unsupported regexes are skipped but recorded as errors.
    #[test]
    fn errors_reported() {
        let xml = fs::read_to_string("data/segment.srx").expect("segment file exists");
        let srx = SRX::from_str(&xml).expect("segment file is valid");
        let errors = srx.errors();

        assert!(!errors.is_empty());
        assert_eq!(errors.values().map(Vec::len).sum::<usize>(), 49);
    }
}
316}