#![cfg_attr(docsrs, feature(doc_cfg))] #[cfg(feature = "serde")]
extern crate serde_crate as serde;
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
use std::{collections::HashMap, ops::Range};
use regex::Regex;
#[cfg(feature = "from_xml")]
mod from_xml;
#[cfg(feature = "from_xml")]
mod utils;
#[cfg(feature = "from_xml")]
pub use from_xml::Error;
/// Name of a language rule set, wrapped in a newtype around `String`.
///
/// Values of this type are the keys of the per-language rule and error
/// tables stored in [`SRX`]. Comparison, ordering and hashing all delegate to
/// the wrapped string.
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(
    feature = "serde",
    derive(Serialize, Deserialize),
    serde(crate = "serde_crate")
)]
pub struct Language(pub String);
/// A single segmentation rule: a compiled pattern plus the decision that a
/// match of that pattern stands for.
#[derive(Clone, Debug)]
#[cfg_attr(
    feature = "serde",
    derive(Serialize, Deserialize),
    serde(crate = "serde_crate")
)]
#[non_exhaustive]
struct Rule {
    /// Compiled pattern. The start of its capture group 1 is the byte offset
    /// that `match_indices` reports for each match.
    #[cfg_attr(feature = "serde", serde(with = "serde_regex"))]
    regex: Regex,
    /// `true` if a match marks a segment boundary, `false` if a match marks a
    /// position at which the text must not be split.
    do_break: bool,
}
impl Rule {
    /// Yields, for every match of this rule's regex in `text`, the byte offset
    /// at which capture group 1 starts.
    ///
    /// Matches in which group 1 did not participate contribute nothing. The
    /// iterator is lazy and borrows both the rule and `text`.
    fn match_indices<'a>(&'a self, text: &'a str) -> impl Iterator<Item = usize> + 'a {
        self.regex
            .captures_iter(text)
            .filter_map(|captures| Some(captures.get(1)?.start()))
    }

    /// Whether a match of this rule denotes a break (`true`) or suppresses
    /// one (`false`).
    fn do_break(&self) -> bool {
        self.do_break
    }
}
/// An ordered collection of segmentation rules that can split text.
///
/// The order matters: when several rules match at the same byte offset, the
/// rule that comes first in the list decides what happens at that offset.
/// The default value contains no rules.
#[derive(Clone, Debug, Default)]
#[cfg_attr(
    feature = "serde",
    derive(Serialize, Deserialize),
    serde(crate = "serde_crate")
)]
pub struct Rules {
    /// Rules in order of decreasing precedence.
    rules: Vec<Rule>,
}
impl Rules {
    /// Computes the byte ranges of the segments that `text` is split into.
    ///
    /// Every rule is run over the whole text, in order. For each byte offset
    /// the first rule that matches there fixes the decision (break or no
    /// break); later rules cannot override it.
    ///
    /// The returned ranges are consecutive: each one starts where the previous
    /// one ended, and the last one ends at `text.len()`. An empty `text`
    /// produces no ranges. A break decided at offset 0 produces an empty
    /// leading range `0..0`.
    pub fn split_ranges(&self, text: &str) -> Vec<Range<usize>> {
        let text_len = text.len();

        // One slot per byte of `text`: `None` means no rule has matched at
        // that offset yet, `Some(flag)` records the first matching rule's
        // break flag.
        let mut decisions: Vec<Option<bool>> = vec![None; text_len];

        for rule in &self.rules {
            for offset in rule.match_indices(text) {
                if offset >= text_len {
                    // An offset at the very end of the text has no slot in the
                    // table; the remaining matches of this rule are skipped
                    // and processing moves on to the next rule.
                    break;
                }
                // Only the first decision for an offset is kept.
                decisions[offset] = decisions[offset].or_else(|| Some(rule.do_break()));
            }
        }

        // Only offsets that start a character are inspected when cutting.
        let cuts = text
            .char_indices()
            .map(|(pos, _)| pos)
            .filter(|&pos| decisions[pos] == Some(true));

        let mut ranges = Vec::new();
        let mut start = 0;
        for cut in cuts {
            ranges.push(start..cut);
            start = cut;
        }

        // Whatever follows the last cut forms the final segment, unless
        // nothing is left.
        if start < text_len {
            ranges.push(start..text_len);
        }

        ranges
    }

    /// Splits `text` into segments and yields them as string slices.
    ///
    /// The slices are exactly the ranges computed by
    /// [`split_ranges`](Self::split_ranges). The ranges are computed up front,
    /// so the returned iterator borrows only `text`, not `self`.
    pub fn split<'a, 'b>(&self, text: &'a str) -> impl Iterator<Item = &'a str> + 'b
    where
        'a: 'b,
    {
        let ranges = self.split_ranges(text);
        ranges.into_iter().map(move |span| &text[span])
    }

    /// Returns `true` if this collection holds no rules.
    pub fn is_empty(&self) -> bool {
        self.rules.is_empty()
    }

    /// Returns the number of rules in this collection.
    pub fn len(&self) -> usize {
        self.rules.len()
    }
}
/// One entry of the language map: a pattern for language codes and the rule
/// set it selects.
#[derive(Clone, Debug)]
#[cfg_attr(
    feature = "serde",
    derive(Serialize, Deserialize),
    serde(crate = "serde_crate")
)]
struct LanguageRegex {
    /// Pattern tested against the language code passed to
    /// `SRX::language_rules`.
    #[cfg_attr(feature = "serde", serde(with = "serde_regex"))]
    regex: Regex,
    /// Key of the rule set that is selected when `regex` matches.
    language: Language,
}
/// In-memory representation of an SRX rule file: a language map plus the
/// rule sets it refers to.
#[derive(Clone, Debug)]
#[cfg_attr(
    feature = "serde",
    derive(Serialize, Deserialize),
    serde(crate = "serde_crate")
)]
pub struct SRX {
    /// If `false`, only the first language map entry matching a language code
    /// contributes rules; if `true`, every matching entry does, in map order.
    cascade: bool,
    /// Language map entries, in the order in which they are tried.
    map: Vec<LanguageRegex>,
    /// Rule sets keyed by the language name the map entries refer to.
    rules: HashMap<Language, Vec<Rule>>,
    /// Error messages keyed by language, exposed through `errors()`.
    // NOTE(review): presumably one message per rule that could not be loaded;
    // the code that fills this table is outside this view, so confirm there.
    errors: HashMap<Language, Vec<String>>,
}
impl SRX {
    /// Collects the rules that apply to the language code `lang_code`.
    ///
    /// The language map is scanned in order and the rule set of every entry
    /// whose pattern matches `lang_code` is appended to the result. When
    /// cascading is disabled the scan stops after the first matching entry.
    /// If no entry matches, the returned [`Rules`] is empty.
    ///
    /// # Panics
    ///
    /// Panics if a matching language map entry names a rule set that has no
    /// entry in the rule table.
    pub fn language_rules<S: AsRef<str>>(&self, lang_code: S) -> Rules {
        let code = lang_code.as_ref();
        let mut selected = Vec::new();

        for entry in self.map.iter().filter(|entry| entry.regex.is_match(code)) {
            let language_rules = self
                .rules
                .get(&entry.language)
                .expect("languagerulename in <languagemap> must have a corresponding entry in <languagerules>");
            selected.extend_from_slice(language_rules);

            if !self.cascade {
                // Without cascading only the first matching entry counts.
                break;
            }
        }

        Rules { rules: selected }
    }

    /// Returns the error messages stored in this value, grouped by language.
    pub fn errors(&self) -> &HashMap<Language, Vec<String>> {
        &self.errors
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use quickcheck_macros::quickcheck;
    use std::{fs, str::FromStr};

    /// Reads and parses the SRX file at `path` and returns the rules selected
    /// for the language code "en".
    fn english_rules(path: &str) -> Rules {
        let xml = fs::read_to_string(path).expect("example file exists");
        let srx = SRX::from_str(&xml).expect("example file is valid");
        srx.language_rules("en")
    }

    /// Splitting must neither drop nor duplicate bytes: the segment lengths
    /// add up to the length of the input, for arbitrary input.
    #[quickcheck]
    fn length_invariant(text: String) {
        let rules = english_rules("data/example.srx");
        let total: usize = rules.split(&text).map(str::len).sum();
        assert_eq!(text.len(), total);
    }

    /// The reported index is where the after-break part of the rule starts.
    #[test]
    fn match_indices_correct() {
        let rule = Rule::new(Some("abc"), Some("d+fg"), true).expect("test rule is valid");
        let found: Vec<usize> = rule.match_indices("abcddfgxxx").collect();
        assert_eq!(found, vec![3_usize]);
    }

    /// Abbreviations such as "U.K." and "Mr." must not end a segment.
    #[test]
    fn example_splits_correct() {
        let rules = english_rules("data/example.srx");
        let text =
            "The U.K. Prime Minister, Mr. Blair, was seen out with his family today. He is well.";
        let segments: Vec<&str> = rules.split(text).collect();
        assert_eq!(
            segments,
            vec![
                "The U.K. Prime Minister, Mr. Blair, was seen out with his family today.",
                " He is well."
            ]
        );
    }

    /// Multi-byte characters in the text must not disturb the split positions.
    #[test]
    fn example_splits_correct_multi_emoji() {
        let rules = english_rules("data/segment.srx");
        let text = "e.g. U.K. and Mr. do not split. SRX is a 👒🍏🍱-based format 🐱";
        let segments: Vec<&str> = rules.split(text).collect();
        assert_eq!(
            segments,
            vec![
                "e.g. U.K. and Mr. do not split. ",
                "SRX is a 👒🍏🍱-based format 🐱"
            ]
        );
    }

    /// Splitting this input only has to complete without panicking; the
    /// resulting segments are not inspected.
    #[test]
    fn ignores_last_match_index() {
        let rules = english_rules("data/segment.srx");
        let _ = rules.split("Hello! ").collect::<Vec<_>>();
    }

    /// Loading `data/segment.srx` succeeds but records errors; their total
    /// number across all languages is pinned.
    #[test]
    fn errors_reported() {
        let xml = fs::read_to_string("data/segment.srx").expect("segment file exists");
        let srx = SRX::from_str(&xml).expect("segment file is valid");
        let reported = srx.errors();
        assert!(!reported.is_empty());
        assert_eq!(reported.values().flatten().count(), 49);
    }
}