use super::ProseRange;
/// Reports whether `c` is allowed to appear in the gap between two words
/// without splitting them into separate prose chunks.
///
/// Accepted characters are ASCII whitespace plus a fixed punctuation set:
/// clause/sentence marks, quotes, hyphen and en/em dashes, round, square and
/// curly brackets, and `~`.
const fn is_bridge_char(c: char) -> bool {
    match c {
        // Clause and sentence punctuation.
        ',' | '.' | ';' | ':' | '!' | '?' => true,
        // Straight quotes.
        '\'' | '"' => true,
        // Hyphen, en dash (U+2013), em dash (U+2014).
        '-' | '\u{2013}' | '\u{2014}' => true,
        // Bracket pairs.
        '(' | ')' | '[' | ']' | '{' | '}' => true,
        '~' => true,
        // Anything else bridges only if it is ASCII whitespace.
        _ => c.is_ascii_whitespace(),
    }
}
/// Groups word spans into prose ranges, joining consecutive words whose
/// separating text is bridgeable.
///
/// * `words` - `(start, end)` byte offsets into `text`, visited in slice order.
/// * `text` - the source the offsets refer to.
/// * `strip_noise` - forwarded to the gap check to clean a gap before it is
///   inspected.
/// * `collect_exclusions` - invoked for every bridgeable gap with the gap
///   text, the byte offset at which the gap starts, and the exclusion list of
///   the range currently being built.
///
/// Returns one `ProseRange` per run of bridged words; an empty `words` slice
/// yields an empty vector.
///
/// # Panics
///
/// The gap is taken with `text[prev_end..next_start]`, so this panics if a
/// word starts before the previous one ends or an offset is not on a `char`
/// boundary of `text`.
pub fn merge_ranges(
    words: &[(usize, usize)],
    text: &str,
    strip_noise: fn(&str) -> String,
    collect_exclusions: fn(&str, usize, &mut Vec<(usize, usize)>),
) -> Vec<ProseRange> {
    let Some((&(first_start, first_end), rest)) = words.split_first() else {
        return Vec::new();
    };

    let mut merged = Vec::new();
    // Exclusions gathered for the range that is still open.
    let mut pending: Vec<(usize, usize)> = Vec::new();
    let (mut open_start, mut open_end) = (first_start, first_end);

    for &(word_start, word_end) in rest {
        let between = &text[open_end..word_start];
        if is_bridgeable_gap(between, strip_noise) {
            collect_exclusions(between, open_end, &mut pending);
        } else {
            // The gap breaks the chunk: flush what we have and start afresh.
            merged.push(ProseRange {
                start_byte: open_start,
                end_byte: open_end,
                exclusions: std::mem::take(&mut pending),
            });
            open_start = word_start;
        }
        open_end = word_end;
    }

    // The final chunk is always still open when the words run out.
    merged.push(ProseRange {
        start_byte: open_start,
        end_byte: open_end,
        exclusions: pending,
    });
    merged
}
/// Decides whether the text between two words may be absorbed into one range.
///
/// A gap is rejected when it contains a blank line (`"\n\n"` or
/// `"\r\n\r\n"`), either as written or after `strip_noise` has been applied.
/// Otherwise it is bridgeable only if every character left after stripping is
/// a bridge character. `strip_noise` is not called when the raw gap already
/// contains a blank line.
fn is_bridgeable_gap(gap: &str, strip_noise: fn(&str) -> String) -> bool {
    fn has_blank_line(s: &str) -> bool {
        s.contains("\n\n") || s.contains("\r\n\r\n")
    }

    if has_blank_line(gap) {
        return false;
    }
    let cleaned = strip_noise(gap);
    !has_blank_line(&cleaned) && cleaned.chars().all(is_bridge_char)
}
/// Scans forward from `i` until the delimiter opened just before `i` is
/// balanced, and returns the index one past the matching `close` byte.
///
/// The scan starts at nesting depth 1, so `i` is expected to point just after
/// an `open` byte. Each `open` raises the depth and each `close` lowers it.
/// When `escape` is set, an escape byte that is not the last byte causes it
/// and the following byte to be skipped without being interpreted.
///
/// If the input ends before the depth returns to zero, the index where
/// scanning stopped is returned (`bytes.len()` when `i` started in bounds).
pub fn skip_balanced_bytes(
    bytes: &[u8],
    mut i: usize,
    open: u8,
    close: u8,
    escape: Option<u8>,
) -> usize {
    let mut depth: u32 = 1;
    while depth > 0 {
        let Some(&byte) = bytes.get(i) else { break };

        // An escape consumes itself plus the next byte, but only when a next
        // byte exists; a trailing escape is treated like any other byte.
        if escape == Some(byte) && i + 1 < bytes.len() {
            i += 2;
            continue;
        }

        if byte == open {
            depth += 1;
        } else if byte == close {
            depth -= 1;
        }
        i += 1;
    }
    i
}
/// `char`-slice counterpart of the balanced-delimiter scan, without escape
/// handling.
///
/// Starting at index `i` with nesting depth 1, walks `chars` until the depth
/// drops to zero and returns the index one past the matching `close`. If the
/// input runs out first, returns `chars.len()`, or `i` itself when `i` was
/// already past the end.
pub fn skip_balanced_chars(chars: &[char], i: usize, open: char, close: char) -> usize {
    let mut depth: u32 = 1;
    for (idx, &ch) in chars.iter().enumerate().skip(i) {
        if ch == open {
            depth += 1;
        } else if ch == close {
            depth -= 1;
            if depth == 0 {
                return idx + 1;
            }
        }
    }
    // Unterminated: report where scanning stopped.
    chars.len().max(i)
}
/// Skips a run of consecutive delimited argument groups starting at `i`.
///
/// `pairs` lists the accepted `(open, close)` delimiters. While the byte at
/// `i` is one of the opening delimiters (the first matching pair wins), the
/// whole balanced group is skipped via `skip_balanced_bytes` with no escape
/// byte. Returns the index of the first byte that does not open a group, or
/// the end of scanning if the input runs out.
pub fn skip_command_args_bytes(bytes: &[u8], mut i: usize, pairs: &[(u8, u8)]) -> usize {
    while let Some(&(open, close)) = bytes
        .get(i)
        .and_then(|&lead| pairs.iter().find(|&&(opener, _)| opener == lead))
    {
        i = skip_balanced_bytes(bytes, i + 1, open, close, None);
    }
    i
}
/// `char`-slice counterpart of the argument-group skipper.
///
/// While the character at `i` matches the opening half of one of `pairs`
/// (the first matching pair wins), the balanced group is skipped via
/// `skip_balanced_chars`. Returns the index of the first character that does
/// not open a group, or the end of scanning if the input runs out.
pub fn skip_command_args_chars(chars: &[char], mut i: usize, pairs: &[(char, char)]) -> usize {
    while let Some(&(open, close)) = chars
        .get(i)
        .and_then(|&lead| pairs.iter().find(|&&(opener, _)| opener == lead))
    {
        i = skip_balanced_chars(chars, i + 1, open, close);
    }
    i
}
/// Records every skip span that overlaps a range as an exclusion on that
/// range.
///
/// For each range, each `(start, end)` in `skips` that intersects
/// `[start_byte, end_byte)` is clipped to the range. The clipped start is then
/// passed through `absorb_linebreak_left` and the clipped end through
/// `absorb_linebreak_right` (both bounded by the range), and the resulting
/// pair is appended to `range.exclusions`. Skips that end at or before the
/// range start, or begin at or after the range end, are ignored.
///
/// Exclusions are appended in `skips` order; no sorting or merging happens
/// here.
pub fn install_skip_exclusions(ranges: &mut [ProseRange], skips: &[(usize, usize)], text: &[u8]) {
    for range in ranges {
        let (lo, hi) = (range.start_byte, range.end_byte);
        let clipped = skips
            .iter()
            .filter(|&&(skip_start, skip_end)| skip_end > lo && skip_start < hi)
            .map(|&(skip_start, skip_end)| {
                (
                    absorb_linebreak_left(text, lo, skip_start.max(lo)),
                    absorb_linebreak_right(text, hi, skip_end.min(hi)),
                )
            });
        range.exclusions.extend(clipped);
    }
}
/// Extends an exclusion start leftwards over adjacent whitespace, but only
/// when that whitespace includes a line break.
///
/// Looks at the run of ASCII whitespace immediately before `from`, never going
/// below `lower_bound`. If the run contains `\n` or `\r`, the start of the run
/// is returned; otherwise `from` is returned unchanged, so purely inline
/// spacing is left alone.
fn absorb_linebreak_left(text: &[u8], lower_bound: usize, from: usize) -> usize {
    // Walk backwards to the nearest non-whitespace byte; the run starts just
    // after it, or at the lower bound when everything scanned is whitespace.
    let run_start = (lower_bound..from)
        .rev()
        .find(|&idx| !text[idx].is_ascii_whitespace())
        .map_or(lower_bound.min(from), |idx| idx + 1);

    let crosses_line = text[run_start..from]
        .iter()
        .any(|&byte| matches!(byte, b'\n' | b'\r'));

    if crosses_line {
        run_start
    } else {
        from
    }
}
/// Extends an exclusion end rightwards over adjacent whitespace, but only
/// when that whitespace includes a line break.
///
/// Looks at the run of ASCII whitespace starting at `from`, never going past
/// `upper_bound`. If the run contains `\n` or `\r`, the end of the run is
/// returned; otherwise `from` is returned unchanged, so purely inline spacing
/// is left alone.
fn absorb_linebreak_right(text: &[u8], upper_bound: usize, from: usize) -> usize {
    // The run ends at the first non-whitespace byte, or at the upper bound
    // when everything scanned is whitespace.
    let run_end = (from..upper_bound)
        .find(|&idx| !text[idx].is_ascii_whitespace())
        .unwrap_or(upper_bound.max(from));

    let crosses_line = text[from..run_end]
        .iter()
        .any(|&byte| matches!(byte, b'\n' | b'\r'));

    if crosses_line {
        run_end
    } else {
        from
    }
}
/// Normalizes the exclusion list of every range: sorts it by start offset and
/// merges spans that overlap or touch.
///
/// After the call each `exclusions` vector is ordered by start and no span
/// begins at or before the end of the one preceding it.
pub fn dedup_exclusions(ranges: &mut [ProseRange]) {
    for range in ranges {
        let spans = &mut range.exclusions;
        spans.sort_unstable_by_key(|span| span.0);
        // `dedup_by` hands us the element under inspection first and the last
        // retained element second; returning `true` drops the former, so any
        // overlap is folded into the retained span before the drop.
        spans.dedup_by(|candidate, kept| {
            let overlaps = candidate.0 <= kept.1;
            if overlaps {
                kept.1 = kept.1.max(candidate.1);
            }
            overlaps
        });
    }
}
/// Reports whether the union of `range.exclusions` covers the whole of
/// `[start_byte, end_byte)`.
///
/// A range with no exclusions is never considered fully excluded.
///
/// The exclusions may be given in any order and may overlap. Lists that are
/// already ordered by start offset are scanned in place; otherwise a sorted
/// copy is made first, so the answer does not depend on whether the list has
/// been normalized beforehand. Exclusions lying beyond `end_byte` do not
/// affect the result once the range itself is covered.
pub fn is_fully_excluded(range: &ProseRange) -> bool {
    if range.exclusions.is_empty() {
        return false;
    }

    // The sweep below needs spans ordered by start; only pay for a copy when
    // the stored list is not already in that order.
    let already_ordered = range.exclusions.windows(2).all(|pair| pair[0].0 <= pair[1].0);
    let sorted_copy;
    let spans: &[(usize, usize)] = if already_ordered {
        &range.exclusions
    } else {
        let mut copy = range.exclusions.clone();
        copy.sort_unstable_by_key(|&(start, _)| start);
        sorted_copy = copy;
        &sorted_copy
    };

    // Sweep left to right, tracking how far coverage extends from the start.
    let mut covered = range.start_byte;
    for &(start, end) in spans {
        if covered >= range.end_byte {
            // Everything inside the range is covered; later spans lie outside.
            return true;
        }
        if start > covered {
            // Uncovered hole inside the range.
            return false;
        }
        covered = covered.max(end);
    }
    covered >= range.end_byte
}
/// Joins neighbouring ranges that belong to the same stretch of prose.
///
/// Ranges are stably sorted by `start_byte` and then folded left to right. A
/// range is merged into the one before it when both start inside the same
/// entry of `force_regions`, or when the pair passes the natural-continuation
/// heuristic. On a merge:
///
/// * the text between the two ranges (if any) is recorded as an exclusion,
/// * the later range's exclusions are appended to the earlier one, and
/// * the earlier range's `end_byte` becomes the later range's `end_byte`.
///
/// Inputs with fewer than two ranges are returned as given.
#[must_use]
pub fn merge_continuations(
    mut ranges: Vec<ProseRange>,
    text: &str,
    force_regions: &[std::ops::Range<usize>],
) -> Vec<ProseRange> {
    if ranges.len() < 2 {
        return ranges;
    }
    ranges.sort_by_key(|r| r.start_byte);

    let mut merged: Vec<ProseRange> = Vec::with_capacity(ranges.len());
    for candidate in ranges {
        // The force-region check runs first and short-circuits the heuristic.
        let joins_previous = match merged.last() {
            Some(tail) => {
                in_same_force_region(tail, &candidate, force_regions)
                    || is_natural_continuation(tail, &candidate, text)
            }
            None => false,
        };
        if !joins_previous {
            merged.push(candidate);
            continue;
        }

        let tail = merged
            .last_mut()
            .expect("merge implies a previous range");
        if tail.end_byte < candidate.start_byte {
            // Keep the separating text out of the merged prose.
            tail.exclusions.push((tail.end_byte, candidate.start_byte));
        }
        tail.end_byte = candidate.end_byte;
        tail.exclusions.extend(candidate.exclusions);
    }
    merged
}
/// Reports whether some entry of `force_regions` contains the start offsets
/// of both `prev` and `next`.
///
/// Only `start_byte` of each range is consulted; region ends are exclusive.
fn in_same_force_region(
    prev: &ProseRange,
    next: &ProseRange,
    force_regions: &[std::ops::Range<usize>],
) -> bool {
    let anchors = [prev.start_byte, next.start_byte];
    force_regions
        .iter()
        .any(|region| anchors.iter().all(|offset| region.contains(offset)))
}
/// Heuristic: does `next` read as the continuation of the sentence that
/// `prev` left unfinished?
///
/// Returns `true` only when all of the following hold:
///
/// * the text between the two ranges contains no blank line (`"\n\n"` or
///   `"\r\n\r\n"`),
/// * the text extracted for `prev`, with trailing whitespace trimmed, is
///   non-empty and does not end in `.`, `!` or `?`, and
/// * the text extracted for `next`, with leading whitespace trimmed, begins
///   with a lowercase character.
///
/// # Panics
///
/// The gap is taken with `text[prev.end_byte..next.start_byte]`, so this
/// panics if `next` starts before `prev` ends or an offset is not on a `char`
/// boundary of `text`.
fn is_natural_continuation(prev: &ProseRange, next: &ProseRange, text: &str) -> bool {
    let between = &text[prev.end_byte..next.start_byte];
    if ["\n\n", "\r\n\r\n"].iter().any(|&brk| between.contains(brk)) {
        return false;
    }

    let prev_text = prev.extract_text(text);
    let last_char = prev_text.trim_end().chars().next_back();
    // Nothing to continue, or the previous sentence is already terminated.
    if matches!(last_char, None | Some('.' | '!' | '?')) {
        return false;
    }

    let next_text = next.extract_text(text);
    next_text
        .trim_start()
        .chars()
        .next()
        .is_some_and(char::is_lowercase)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a `ProseRange` covering `start..end` with no exclusions.
    fn range(start: usize, end: usize) -> ProseRange {
        ProseRange {
            start_byte: start,
            end_byte: end,
            exclusions: Vec::new(),
        }
    }

    // --- install_skip_exclusions -------------------------------------------

    // The skip `#{G}` (bytes 3..7) is flanked by plain spaces only, so neither
    // bound may be widened.
    #[test]
    fn install_skip_keeps_inline_space_bounds_tight() {
        let text = "ab #{G} cd";
        let mut ranges = [range(0, text.len())];
        install_skip_exclusions(&mut ranges, &[(3, 7)], text.as_bytes());
        assert_eq!(ranges[0].exclusions, vec![(3, 7)]);
    }

    // The skip `##` (bytes 3..5) sits on its own line; the newline on each
    // side is pulled into the exclusion.
    #[test]
    fn install_skip_absorbs_flanking_newline() {
        let text = "ab\n##\ncd";
        let mut ranges = [range(0, text.len())];
        install_skip_exclusions(&mut ranges, &[(3, 5)], text.as_bytes());
        assert_eq!(ranges[0].exclusions, vec![(2, 6)]);
    }

    // --- merge_continuations -----------------------------------------------

    // "Here is something" occupies bytes 0..17, a one-byte space follows, and
    // "continuation." runs from byte 18 to the end of the text. The offsets
    // are tied to the literal so the second range cannot overrun it.
    #[test]
    fn continuation_merges_lowercase_after_no_terminator() {
        let text = "Here is something continuation.";
        let merged = merge_continuations(vec![range(0, 17), range(18, text.len())], text, &[]);
        assert_eq!(merged.len(), 1, "blocks should merge into one");
        assert_eq!(
            (merged[0].start_byte, merged[0].end_byte),
            (0, text.len())
        );
        assert!(
            merged[0].exclusions.contains(&(17, 18)),
            "gap recorded as exclusion"
        );
    }

    #[test]
    fn no_merge_when_prev_ends_in_terminator() {
        let text = "First sentence. Second one.";
        let merged = merge_continuations(vec![range(0, 15), range(16, 27)], text, &[]);
        assert_eq!(merged.len(), 2, "terminal '.' blocks the merge");
    }

    #[test]
    fn no_merge_when_next_starts_uppercase() {
        let text = "here we go Now more";
        let merged = merge_continuations(vec![range(0, 10), range(11, 19)], text, &[]);
        assert_eq!(merged.len(), 2, "uppercase next start blocks the merge");
    }

    #[test]
    fn no_merge_across_blank_line() {
        let text = "here we go\n\nmore stuff";
        let merged = merge_continuations(vec![range(0, 10), range(12, 22)], text, &[]);
        assert_eq!(merged.len(), 2, "a blank line is a paragraph break");
    }

    // Same fixture as the terminator test, but a force region spanning the
    // whole text makes the ranges merge anyway.
    #[test]
    fn force_region_overrides_heuristic() {
        let text = "First sentence. Second one.";
        let merged =
            merge_continuations(vec![range(0, 15), range(16, 27)], text, &[0..text.len()]);
        assert_eq!(
            merged.len(),
            1,
            "force region merges regardless of heuristic"
        );
    }

    // --- skip_balanced_bytes / skip_balanced_chars --------------------------

    #[test]
    fn test_skip_balanced_bytes_simple() {
        let b = b"{hello}";
        assert_eq!(skip_balanced_bytes(b, 1, b'{', b'}', None), 7);
    }

    #[test]
    fn test_skip_balanced_bytes_nested() {
        let b = b"{a{b{c}d}e}rest";
        assert_eq!(skip_balanced_bytes(b, 1, b'{', b'}', None), 11);
    }

    // The escaped `}` must not close the group; the second one does.
    #[test]
    fn test_skip_balanced_bytes_with_escape() {
        let b = br"{\}}";
        assert_eq!(skip_balanced_bytes(b, 1, b'{', b'}', Some(b'\\')), 4);
    }

    // With no closing delimiter the scan stops at the end of the input.
    #[test]
    fn test_skip_balanced_bytes_unterminated() {
        let b = b"{abc";
        assert_eq!(skip_balanced_bytes(b, 1, b'{', b'}', None), 4);
    }

    #[test]
    fn test_skip_balanced_chars_simple() {
        let chars: Vec<char> = "{hello}".chars().collect();
        assert_eq!(skip_balanced_chars(&chars, 1, '{', '}'), 7);
    }

    #[test]
    fn test_skip_balanced_chars_nested() {
        let chars: Vec<char> = "{a{b}c}rest".chars().collect();
        assert_eq!(skip_balanced_chars(&chars, 1, '{', '}'), 7);
    }

    // --- skip_command_args_* -------------------------------------------------

    // Three back-to-back groups using two delimiter kinds are skipped as one
    // run; scanning stops at `rest`.
    #[test]
    fn test_skip_command_args_bytes_multi() {
        let b = b"{arg1}[opt]{arg2}rest";
        let end = skip_command_args_bytes(b, 0, &[(b'{', b'}'), (b'[', b']')]);
        assert_eq!(end, 17);
    }

    #[test]
    fn test_skip_command_args_bytes_no_args() {
        let b = b"rest";
        assert_eq!(skip_command_args_bytes(b, 0, &[(b'{', b'}')]), 0);
    }

    #[test]
    fn test_skip_command_args_chars_multi() {
        let chars: Vec<char> = "{x}[y]{z}tail".chars().collect();
        let end = skip_command_args_chars(&chars, 0, &[('{', '}'), ('[', ']')]);
        assert_eq!(end, 9);
    }

    // --- dedup_exclusions ----------------------------------------------------

    #[test]
    fn test_dedup_exclusions_merges_overlapping() {
        let mut ranges = vec![ProseRange {
            start_byte: 0,
            end_byte: 100,
            exclusions: vec![(10, 30), (10, 25), (20, 40), (50, 60)],
        }];
        dedup_exclusions(&mut ranges);
        assert_eq!(ranges[0].exclusions, vec![(10, 40), (50, 60)]);
    }

    // Spans that merely touch are merged as well.
    #[test]
    fn test_dedup_exclusions_adjacent() {
        let mut ranges = vec![ProseRange {
            start_byte: 0,
            end_byte: 100,
            exclusions: vec![(10, 20), (20, 30)],
        }];
        dedup_exclusions(&mut ranges);
        assert_eq!(ranges[0].exclusions, vec![(10, 30)]);
    }

    // --- is_fully_excluded ---------------------------------------------------

    #[test]
    fn test_is_fully_excluded_covered() {
        let r = ProseRange {
            start_byte: 10,
            end_byte: 50,
            exclusions: vec![(10, 50)],
        };
        assert!(is_fully_excluded(&r));
    }

    // Bytes 30..35 are left uncovered.
    #[test]
    fn test_is_fully_excluded_gap() {
        let r = ProseRange {
            start_byte: 10,
            end_byte: 50,
            exclusions: vec![(10, 30), (35, 50)],
        };
        assert!(!is_fully_excluded(&r));
    }

    #[test]
    fn test_is_fully_excluded_empty() {
        let r = ProseRange {
            start_byte: 10,
            end_byte: 50,
            exclusions: vec![],
        };
        assert!(!is_fully_excluded(&r));
    }
}