use unicode_linebreak::linebreaks;
/// Splits `text` into paragraphs at every hard line terminator.
///
/// The recognised terminators are LF, CR, CRLF (counted as a single two-byte
/// terminator), VT (U+000B), FF (U+000C), NEL (U+0085), LS (U+2028) and
/// PS (U+2029).
///
/// Returns one `(start, content)` pair per paragraph, where `start` is the
/// byte offset of the paragraph inside `text` and `content` is the paragraph
/// with its terminator stripped. The result is never empty: text without any
/// terminator yields a single paragraph, and text ending in a terminator
/// yields a trailing empty paragraph.
pub(crate) fn split_paragraphs(text: &str) -> Vec<(usize, &str)> {
    let mut paragraphs = Vec::new();
    let mut para_start = 0;
    let mut cursor = text.char_indices().peekable();

    while let Some((pos, ch)) = cursor.next() {
        let is_terminator = matches!(
            ch,
            '\n' | '\r' | '\u{000B}' | '\u{000C}' | '\u{0085}' | '\u{2028}' | '\u{2029}'
        );
        if !is_terminator {
            continue;
        }

        let mut resume_at = pos + ch.len_utf8();
        // A CR directly followed by LF is one terminator: consume the LF as
        // well so it does not open an extra, empty paragraph.
        if ch == '\r' && cursor.next_if(|&(_, next)| next == '\n').is_some() {
            resume_at += 1;
        }

        paragraphs.push((para_start, &text[para_start..pos]));
        para_start = resume_at;
    }

    // Whatever follows the last terminator (possibly nothing) is the final
    // paragraph.
    paragraphs.push((para_start, &text[para_start..]));
    paragraphs
}
/// Collects the byte offsets strictly inside `text` that `linebreaks` reports
/// as break opportunities.
///
/// Offsets `0` and `text.len()` are dropped, so only interior positions
/// remain. The kind of each opportunity is not inspected: every reported
/// interior offset is kept, in the order `linebreaks` yields it.
pub(crate) fn break_offsets(text: &str) -> Vec<usize> {
    let end = text.len();
    linebreaks(text)
        .map(|(offset, _kind)| offset)
        .filter(|&offset| 0 < offset && offset < end)
        .collect()
}
/// Reports whether `offset` is one of the entries of `breaks`.
///
/// The lookup is a binary search, so `breaks` has to be sorted in ascending
/// order; on an unsorted slice the answer is unspecified.
// NOTE(review): `breaks` is presumably the output of `break_offsets` — confirm
// against callers.
#[inline]
pub(crate) fn is_break_before(breaks: &[usize], offset: usize) -> bool {
    let lookup = breaks.binary_search(&offset);
    lookup.is_ok()
}
/// Test-only helper: the offsets before the end of `text` at which
/// `linebreaks` reports a `BreakOpportunity::Mandatory` break.
///
/// Offset `text.len()` is excluded; offset `0` is not filtered out.
#[cfg(test)]
fn mandatory_offsets(text: &str) -> Vec<usize> {
    use unicode_linebreak::BreakOpportunity;

    let end = text.len();
    linebreaks(text)
        .filter(|(offset, kind)| *kind == BreakOpportunity::Mandatory && *offset < end)
        .map(|(offset, _kind)| offset)
        .collect()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Expected result for inputs that offer no interior break at all.
    const NO_BREAKS: Vec<usize> = Vec::new();

    /// In plain ASCII text the only opportunities follow the spaces.
    #[test]
    fn ascii_breaks_after_spaces_only() {
        let breaks = break_offsets("a b c");
        assert_eq!(breaks, vec![2, 4]);
    }

    /// Two words separated by one space give exactly one opportunity.
    #[test]
    fn single_space_one_break() {
        let breaks = break_offsets("Hello world!");
        assert_eq!(breaks, vec![6]);
    }

    /// A single word has no interior opportunity.
    #[test]
    fn no_break_inside_a_word() {
        assert_eq!(break_offsets("abc"), NO_BREAKS);
    }

    /// Empty input produces an empty list.
    #[test]
    fn empty_text_has_no_breaks() {
        assert_eq!(break_offsets(""), NO_BREAKS);
    }

    /// The opportunity sits after the hyphen, not before it.
    #[test]
    fn hyphen_offers_a_break_after_it() {
        let breaks = break_offsets("foo-bar");
        assert_eq!(breaks, vec![4]);
    }

    /// Each ideograph is three bytes long, so breaks fall at 3 and 6.
    #[test]
    fn cjk_breaks_between_every_ideograph() {
        let breaks = break_offsets("日本語");
        assert_eq!(breaks, vec![3, 6]);
    }

    /// Every reported offset must be a valid `str` slicing position.
    #[test]
    fn cjk_offsets_land_on_char_boundaries() {
        let text = "日本語のテキスト";
        for b in break_offsets(text) {
            assert!(text.is_char_boundary(b), "break {b} split a code point");
        }
    }

    /// Without an explicit terminator no interior break is mandatory.
    #[test]
    fn newline_free_line_has_no_interior_mandatory_break() {
        assert!(mandatory_offsets("a b c").is_empty());
        assert!(mandatory_offsets("日本語").is_empty());
    }

    /// A line feed forces a break right after itself.
    #[test]
    fn explicit_newline_is_a_mandatory_interior_break() {
        let mandatory = mandatory_offsets("a\nb");
        assert_eq!(mandatory, vec![2]);
    }

    /// Reference splitter built on `str::split('\n')`: pairs every piece with
    /// the byte offset at which it starts.
    fn split_newline_reference(text: &str) -> Vec<(usize, &str)> {
        text.split('\n')
            .scan(0usize, |next_start, para| {
                let start = *next_start;
                // Skip the paragraph itself plus the one-byte '\n' after it.
                *next_start = start + para.len() + 1;
                Some((start, para))
            })
            .collect()
    }

    /// On LF-only input `split_paragraphs` must agree with the reference.
    #[test]
    fn split_paragraphs_matches_split_newline_on_lf_only() {
        let cases = ["", "a", "a\nb", "a\n", "\n", "a\n\nb", "日本語\nx"];
        for s in cases {
            assert_eq!(
                split_paragraphs(s),
                split_newline_reference(s),
                "split_paragraphs disagreed with split('\\n') reference for {s:?}"
            );
        }
    }

    /// CRLF counts as a single two-byte terminator.
    #[test]
    fn crlf_is_one_terminator() {
        let spans = split_paragraphs("a\r\nb");
        assert_eq!(spans, vec![(0, "a"), (3, "b")]);
    }

    /// A CR that is not followed by LF still terminates the paragraph.
    #[test]
    fn lone_cr_breaks() {
        let spans = split_paragraphs("a\rb");
        assert_eq!(spans, vec![(0, "a"), (2, "b")]);
    }

    /// VT, FF, NEL, LS and PS all terminate a paragraph; the start of the
    /// following paragraph depends on the separator's UTF-8 width.
    #[test]
    fn unicode_separators_break() {
        let cases: [(char, usize); 5] = [
            ('\u{000B}', 2),
            ('\u{000C}', 2),
            ('\u{0085}', 3),
            ('\u{2028}', 4),
            ('\u{2029}', 4),
        ];
        for (separator, second_start) in cases {
            let text = format!("a{separator}b");
            assert_eq!(
                split_paragraphs(&text),
                vec![(0, "a"), (second_start, "b")]
            );
        }
    }

    /// Text that ends in a terminator yields a final empty paragraph.
    #[test]
    fn trailing_terminator_yields_empty_paragraph() {
        let spans = split_paragraphs("a\r\n");
        assert_eq!(spans, vec![(0, "a"), (3, "")]);
    }

    /// No paragraph content may contain any of the recognised terminators.
    #[test]
    fn content_excludes_terminator() {
        const SEPARATORS: [char; 7] = [
            '\n', '\r', '\u{000B}', '\u{000C}', '\u{0085}', '\u{2028}', '\u{2029}',
        ];
        let spans = split_paragraphs("a\r\nb\nc\u{2028}d\u{0085}\u{000C}e");
        for (_, content) in spans {
            let is_clean = content.chars().all(|c| !SEPARATORS.contains(&c));
            assert!(
                is_clean,
                "content {content:?} contained a separator byte"
            );
        }
    }

    /// Different terminators in one text: offsets account for each width and
    /// the paragraph starts are strictly increasing.
    #[test]
    fn mixed_terminators() {
        let spans = split_paragraphs("a\r\nb\nc\u{2028}d");
        assert_eq!(spans, vec![(0, "a"), (3, "b"), (5, "c"), (9, "d")]);
        for pair in spans.windows(2) {
            let (earlier, later) = (pair[0].0, pair[1].0);
            assert!(earlier < later, "starts must strictly ascend");
        }
    }
}