use crate::types::Language;
/// Largest word count a comma segment may have to be accepted as an item by
/// the tight walk-back and by the parenthesised-list check.
const MAX_SEGMENT_WORDS: usize = 2;
/// Largest word count a comma segment may have to be accepted as an item by
/// the relaxed ("rhythmic") walk-back.
const MAX_RHYTHMIC_SEGMENT_WORDS: usize = 4;
/// Largest allowed difference between the longest and the shortest item (in
/// words) for a run to count as rhythmically regular.
const MAX_RHYTHMIC_SPREAD: usize = 2;
/// Minimum number of items a run needs before the rhythmic walk-back accepts it.
const MIN_ITEMS_FOR_RHYTHMIC_DETECTION: usize = 5;
/// English words that stop the rhythmic walk-back when a comma segment starts
/// with one of them. Entries must be lowercase: the first word of the segment
/// is lowercased before it is compared against this table.
const EN_CLAUSE_ONSET_MARKERS: &[&str] = &[
// Subject pronouns.
"we",
"i",
"you",
"he",
"she",
"it",
"they",
// Demonstratives.
"this",
"that",
"these",
"those",
// Subordinating conjunctions.
"although",
"though",
"while",
"when",
"since",
"because",
"if",
// Conjunctive adverbs.
"however",
"moreover",
"furthermore",
// Coordinating conjunctions.
"but",
"yet",
"so",
];
/// French words that stop the rhythmic walk-back when a comma segment starts
/// with one of them. Entries must be lowercase: the first word of the segment
/// is lowercased before it is compared against this table.
const FR_CLAUSE_ONSET_MARKERS: &[&str] = &[
// Subject pronouns.
"nous",
"je",
"tu",
"il",
"elle",
"on",
"ils",
"elles",
// Demonstratives.
"ce",
"cet",
"cette",
"ces",
// Subordinators; only the first word is listed because only the first word
// of a segment is compared (presumably for "bien que", "alors que",
// "tandis que", "parce que" — confirm with the author).
"bien",
"alors",
"tandis",
"quand",
"puisque",
"parce",
"si",
// Coordinating conjunctions.
"mais",
"or",
"donc",
"car",
// Conjunctive adverbs.
"cependant",
"toutefois",
"néanmoins",
];
/// Minimum number of items (comma segments) a run must contain to be reported,
/// both for connector-closed enumerations and for parenthesised lists.
const MIN_ITEMS_FOR_DETECTION: u32 = 3;
/// One enumeration found in a sentence by `detect_enumerations`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Enumeration {
/// Byte offset into the sentence where the first item's comma segment starts.
pub start: usize,
/// Byte offset into the sentence where the closing connector segment ends
/// (exclusive).
pub end: usize,
/// Number of comma segments in the run, including the connector segment.
pub items: u32,
/// Number of commas between the items; `detect_enumerations` sets it to
/// `items - 1`.
pub commas: u32,
}
/// Finds comma-separated enumerations that are closed by a connector word.
///
/// The sentence is split on ASCII commas. Every segment that starts with one
/// of the language's connectors (`and`/`or`/`plus` for English,
/// `et`/`ou`/`plus` for French) is treated as a candidate final item. From
/// that segment the preceding segments are walked backwards, first with the
/// tight limit (`walk_back_under_tight_limit`) and, if that finds nothing,
/// with the relaxed rhythmic limit (`walk_back_under_rhythmic_limit`).
///
/// Returns the detected runs in sentence order. `start` and `end` are byte
/// offsets into `sentence`. `Language::Unknown` always yields an empty vector.
#[must_use]
pub fn detect_enumerations(sentence: &str, language: Language) -> Vec<Enumeration> {
    const EN_CONNECTORS: &[&str] = &["and", "or", "plus"];
    const FR_CONNECTORS: &[&str] = &["et", "ou", "plus"];

    // Resolve both language-dependent tables in a single match so the
    // unsupported-language exit exists exactly once and happens before any
    // work is done on the sentence.
    let (connectors, clause_onsets) = match language {
        Language::En => (EN_CONNECTORS, EN_CLAUSE_ONSET_MARKERS),
        Language::Fr => (FR_CONNECTORS, FR_CLAUSE_ONSET_MARKERS),
        Language::Unknown => return Vec::new(),
    };

    let segments = split_comma_segments(sentence);
    let mut result: Vec<Enumeration> = Vec::new();

    for (i, seg) in segments.iter().enumerate() {
        // Only a segment opening with a connector can close an enumeration.
        if !segment_starts_with_connector(&sentence[seg.range.clone()], connectors) {
            continue;
        }

        // Prefer the tight pass; fall back to the rhythmic pass only when the
        // tight pass finds no run.
        let Some(start_idx) = walk_back_under_tight_limit(&segments, sentence, i, connectors)
            .or_else(|| {
                walk_back_under_rhythmic_limit(&segments, sentence, i, connectors, clause_onsets)
            })
        else {
            continue;
        };

        let items = u32::try_from(i - start_idx + 1).unwrap_or(u32::MAX);
        // Both passes already enforce their own minimum; this guard keeps the
        // global floor in force should either pass change.
        if items < MIN_ITEMS_FOR_DETECTION {
            continue;
        }

        push_or_merge(
            &mut result,
            Enumeration {
                start: segments[start_idx].range.start,
                end: seg.range.end,
                items,
                // N items are separated by N - 1 commas.
                commas: items.saturating_sub(1),
            },
        );
    }

    result
}
/// Tight pass: counts how many segments directly before `connector_idx` look
/// like short list items and returns the index of the first one.
///
/// A preceding segment qualifies when it has between one and
/// `MAX_SEGMENT_WORDS` words and does not itself start with a connector. The
/// run ends at the first segment that fails either test.
///
/// Returns `Some(start_idx)` when the run plus the connector segment reaches
/// `MIN_ITEMS_FOR_DETECTION` items, otherwise `None`.
fn walk_back_under_tight_limit(
    segments: &[Segment],
    sentence: &str,
    connector_idx: usize,
    connectors: &[&str],
) -> Option<usize> {
    let is_tight_item = |segment: &Segment| {
        let text = &sentence[segment.range.clone()];
        (1..=MAX_SEGMENT_WORDS).contains(&word_count(text))
            && !segment_starts_with_connector(text, connectors)
    };

    // Length of the unbroken run of qualifying segments ending right before
    // the connector segment.
    let run_len = segments[..connector_idx]
        .iter()
        .rev()
        .take_while(|&segment| is_tight_item(segment))
        .count();

    // The connector segment itself is the final item.
    let item_total = run_len + 1;
    (item_total >= MIN_ITEMS_FOR_DETECTION as usize).then_some(connector_idx - run_len)
}
/// Relaxed pass: like the tight pass but allows items of up to
/// `MAX_RHYTHMIC_SEGMENT_WORDS` words, in exchange for stricter acceptance.
///
/// A preceding segment joins the run when it has between one and
/// `MAX_RHYTHMIC_SEGMENT_WORDS` words, does not start with a connector, and
/// does not start with a clause-onset marker. The run must then contain at
/// least `MIN_ITEMS_FOR_RHYTHMIC_DETECTION` items (connector segment
/// included) and its per-item word counts must pass
/// `run_is_rhythmically_regular`. For the connector segment the connector
/// word itself is not counted.
///
/// Returns the index of the first segment of the run, or `None`.
fn walk_back_under_rhythmic_limit(
    segments: &[Segment],
    sentence: &str,
    connector_idx: usize,
    connectors: &[&str],
    clause_onsets: &[&str],
) -> Option<usize> {
    let may_join_run = |segment: &Segment| {
        let text = &sentence[segment.range.clone()];
        let words = word_count(text);
        words != 0
            && words <= MAX_RHYTHMIC_SEGMENT_WORDS
            && !segment_starts_with_connector(text, connectors)
            && !segment_starts_with_word_in(text, clause_onsets)
    };

    let run_len = segments[..connector_idx]
        .iter()
        .rev()
        .take_while(|&segment| may_join_run(segment))
        .count();
    if run_len + 1 < MIN_ITEMS_FOR_RHYTHMIC_DETECTION {
        return None;
    }
    let start_idx = connector_idx - run_len;

    // Word count of the last item, without the leading connector word.
    let closing_item_words = connector_segment_item_word_count(
        &sentence[segments[connector_idx].range.clone()],
        connectors,
    );
    let counts: Vec<usize> = segments[start_idx..connector_idx]
        .iter()
        .map(|segment| word_count(&sentence[segment.range.clone()]))
        .chain(std::iter::once(closing_item_words))
        .collect();

    run_is_rhythmically_regular(&counts).then_some(start_idx)
}
/// Reports whether a run's per-item word counts are even enough to be a list.
///
/// The run is regular when it is non-empty, no item is empty, no item exceeds
/// `MAX_RHYTHMIC_SEGMENT_WORDS`, and the longest and shortest items differ by
/// at most `MAX_RHYTHMIC_SPREAD` words.
fn run_is_rhythmically_regular(counts: &[usize]) -> bool {
    let Some(&first) = counts.first() else {
        return false;
    };
    // Single pass tracking the shortest and longest item.
    let (shortest, longest) = counts
        .iter()
        .fold((first, first), |(lo, hi), &words| (lo.min(words), hi.max(words)));

    shortest != 0
        && longest <= MAX_RHYTHMIC_SEGMENT_WORDS
        && longest - shortest <= MAX_RHYTHMIC_SPREAD
}
/// Counts the whitespace-separated words in `segment`; leading, trailing and
/// repeated whitespace never produce a word.
fn word_count(segment: &str) -> usize {
    segment
        .split(char::is_whitespace)
        .filter(|piece| !piece.is_empty())
        .count()
}
fn connector_segment_item_word_count(segment: &str, connectors: &[&str]) -> usize {
let trimmed = segment.trim_start();
let lower = trimmed.to_lowercase();
for connector in connectors {
if let Some(rest) = lower.strip_prefix(connector) {
if rest.chars().next().is_some_and(char::is_whitespace) {
return word_count(rest);
}
}
}
word_count(segment)
}
/// Reports whether the first word of `segment` is one of `markers`.
///
/// Leading non-alphabetic characters are skipped. The word then runs until the
/// first character that is neither alphabetic nor an ASCII apostrophe, and is
/// lowercased before the comparison. Because the ASCII apostrophe stays inside
/// the word, a contraction such as `it's` is compared whole and does not match
/// the marker `it`.
fn segment_starts_with_word_in(segment: &str, markers: &[&str]) -> bool {
    // No alphabetic character at all means there is no word to compare.
    let Some(word_start) = segment.find(char::is_alphabetic) else {
        return false;
    };
    let tail = &segment[word_start..];
    let word_len = tail
        .find(|c: char| !c.is_alphabetic() && c != '\'')
        .unwrap_or(tail.len());
    let lowered = tail[..word_len].to_lowercase();

    markers.iter().any(|&marker| marker == lowered)
}
/// Total number of commas that belong to enumerations detected in `sentence`.
///
/// Runs `detect_enumerations` and adds up the `commas` field of every result.
/// The addition saturates at `u32::MAX`, matching how
/// `parenthesised_list_comma_count` accumulates its total, so the sum can
/// never overflow.
#[must_use]
pub fn enumeration_comma_count(sentence: &str, language: Language) -> u32 {
    detect_enumerations(sentence, language)
        .iter()
        .fold(0u32, |total, enumeration| {
            total.saturating_add(enumeration.commas)
        })
}
/// Counts the commas inside parenthesised groups that look like short lists.
///
/// The sentence is scanned left to right for `(`. For each one the matching
/// `)` is located with nesting taken into account. A group that contains a
/// nested `(` contributes nothing; any other group contributes whatever
/// `parenthesised_run_comma_count` returns for its inner text. Scanning stops
/// at the first `(` that has no matching `)`. The total saturates at
/// `u32::MAX`.
#[must_use]
pub fn parenthesised_list_comma_count(sentence: &str) -> u32 {
    let mut comma_total = 0u32;
    // Byte offset where the search for the next opening parenthesis resumes.
    let mut cursor = 0usize;

    while let Some(offset) = sentence[cursor..].find('(') {
        let inner_start = cursor + offset + 1;

        // Walk forward until the nesting depth returns to zero.
        let mut depth = 1usize;
        let close_offset = sentence.as_bytes()[inner_start..]
            .iter()
            .position(|&byte| {
                match byte {
                    b'(' => depth += 1,
                    b')' => depth -= 1,
                    _ => {}
                }
                depth == 0
            });
        let Some(close_offset) = close_offset else {
            // Unbalanced: nothing after this point is considered.
            break;
        };

        let inner = &sentence[inner_start..inner_start + close_offset];
        if !inner.contains('(') {
            comma_total = comma_total.saturating_add(parenthesised_run_comma_count(inner));
        }
        cursor = inner_start + close_offset + 1;
    }

    comma_total
}
/// Comma count for the inner text of one parenthesised group, or `0` when the
/// group does not look like a list.
///
/// The text is split on commas. It counts as a list when there are at least
/// `MIN_ITEMS_FOR_DETECTION` pieces and no piece has more than
/// `MAX_SEGMENT_WORDS` words; empty pieces are allowed. The result is then the
/// number of pieces minus one.
fn parenthesised_run_comma_count(inner: &str) -> u32 {
    let mut piece_total = 0usize;
    for piece in inner.split(',') {
        // One over-long piece disqualifies the whole group.
        if piece.split_whitespace().count() > MAX_SEGMENT_WORDS {
            return 0;
        }
        piece_total += 1;
    }

    if piece_total < MIN_ITEMS_FOR_DETECTION as usize {
        return 0;
    }
    u32::try_from(piece_total - 1).unwrap_or(u32::MAX)
}
/// One comma-delimited piece of a sentence, stored as a byte range rather
/// than as a borrowed string.
struct Segment {
/// Byte range into the sentence; `split_comma_segments` leaves the
/// delimiting commas out of every range.
range: std::ops::Range<usize>,
}
/// Splits `sentence` at every ASCII comma and returns the byte range of each
/// piece, commas excluded.
///
/// There is always one more segment than there are commas, so an empty
/// sentence yields a single empty segment. Whitespace around a piece is kept.
fn split_comma_segments(sentence: &str) -> Vec<Segment> {
    // Start of the piece currently being collected.
    let mut piece_start = 0usize;
    let mut pieces: Vec<Segment> = sentence
        .match_indices(',')
        .map(|(comma_at, _)| {
            let range = piece_start..comma_at;
            piece_start = comma_at + 1;
            Segment { range }
        })
        .collect();

    // Whatever follows the last comma (or the whole sentence) is the tail.
    pieces.push(Segment {
        range: piece_start..sentence.len(),
    });
    pieces
}
/// Reports whether `segment` opens with one of `connectors` as a whole word.
///
/// Leading whitespace is ignored and the comparison is case-insensitive. The
/// connector must be followed directly by a whitespace character, so a
/// connector standing alone at the end of the segment, or one that is only
/// the prefix of a longer word, does not count.
fn segment_starts_with_connector(segment: &str, connectors: &[&str]) -> bool {
    let lowered = segment.trim_start().to_lowercase();
    connectors.iter().any(|connector| {
        lowered
            .strip_prefix(*connector)
            .and_then(|tail| tail.chars().next())
            .is_some_and(char::is_whitespace)
    })
}
/// Appends `candidate` to `out`, or folds it into the last entry when the two
/// touch or overlap.
///
/// They touch or overlap when `candidate.start` is not past the last entry's
/// `end`. In that case the last entry keeps its `start`, and its `end`,
/// `items` and `commas` each become the larger of the two values.
fn push_or_merge(out: &mut Vec<Enumeration>, candidate: Enumeration) {
    let touches_previous = out
        .last()
        .is_some_and(|previous| candidate.start <= previous.end);
    if !touches_previous {
        out.push(candidate);
        return;
    }

    if let Some(previous) = out.last_mut() {
        previous.end = previous.end.max(candidate.end);
        previous.items = previous.items.max(candidate.items);
        previous.commas = previous.commas.max(candidate.commas);
    }
}
// Unit tests for connector-closed enumeration detection, the rhythmic
// relaxation, and parenthesised-list comma counting.
#[cfg(test)]
mod tests {
use super::*;
// --- Tight pass: short items closed by a connector after a comma ---
#[test]
fn detects_english_oxford_enumeration() {
let s = "red, green, blue, and yellow";
let enums = detect_enumerations(s, Language::En);
assert_eq!(enums.len(), 1);
assert_eq!(enums[0].items, 4);
assert_eq!(enums[0].commas, 3);
}
#[test]
fn detects_french_oxford_enumeration() {
let s = "rouge, vert, bleu, et jaune";
let enums = detect_enumerations(s, Language::Fr);
assert_eq!(enums.len(), 1);
assert_eq!(enums[0].items, 4);
}
#[test]
fn detects_or_as_connector() {
let s = "a, b, c, or d";
let enums = detect_enumerations(s, Language::En);
assert_eq!(enums.len(), 1);
assert_eq!(enums[0].items, 4);
}
// The connector must open its own comma segment; "blue and yellow" does not.
#[test]
fn ignores_non_oxford_form() {
let s = "red, green, blue and yellow";
assert!(detect_enumerations(s, Language::En).is_empty());
}
#[test]
fn needs_at_least_three_items() {
let s = "a, and b";
assert!(detect_enumerations(s, Language::En).is_empty());
}
#[test]
fn rejects_long_segments() {
let s =
"first, the second segment is way too long to count as a short item, third, and fourth";
assert!(detect_enumerations(s, Language::En).is_empty());
}
#[test]
fn does_not_swallow_surrounding_subordinates() {
let s = "Note, although we agreed, to pack the red, green, and blue files, carefully";
assert!(detect_enumerations(s, Language::En).is_empty());
}
#[test]
fn enumeration_comma_count_sums_all_enumerations() {
let s = "red, green, blue, and yellow";
assert_eq!(enumeration_comma_count(s, Language::En), 3);
}
#[test]
fn unknown_language_disables_detection() {
let s = "red, green, blue, and yellow";
assert!(detect_enumerations(s, Language::Unknown).is_empty());
}
// --- Parenthesised lists ---
#[test]
fn parenthesised_list_counts_inner_commas() {
let s = "Pick a colour (red, green, blue, yellow) for the frame.";
assert_eq!(parenthesised_list_comma_count(s), 3);
}
#[test]
fn parenthesised_list_needs_three_segments() {
let s = "The pair (foo, bar) matters.";
assert_eq!(parenthesised_list_comma_count(s), 0);
}
#[test]
fn parenthesised_list_rejects_long_segments() {
let s = "See (red, a very long qualifying clause here, blue, yellow).";
assert_eq!(parenthesised_list_comma_count(s), 0);
}
#[test]
fn parenthesised_list_ignores_nested_parens() {
let s = "See (red, green (emerald), blue, yellow) here.";
assert_eq!(parenthesised_list_comma_count(s), 0);
}
#[test]
fn parenthesised_list_ignores_unbalanced_parens() {
let s = "See (red, green, blue here.";
assert_eq!(parenthesised_list_comma_count(s), 0);
}
#[test]
fn parenthesised_list_handles_multiple_runs() {
let s = "Digits (`1`, `2`, `3`) and spellings (`one`, `two`, `three`) differ.";
assert_eq!(parenthesised_list_comma_count(s), 4);
}
// Empty pieces still count as items, so a group left with only commas is a list.
#[test]
fn parenthesised_list_counts_empty_segments_from_stripped_code() {
let s = "The tokens (, , , ) are listed.";
assert_eq!(parenthesised_list_comma_count(s), 3);
}
#[test]
fn parenthesised_list_is_language_agnostic() {
let s = "Voyelles (`a`, `e`, `i`, `o`, `u`) courantes.";
assert_eq!(parenthesised_list_comma_count(s), 4);
}
#[test]
fn case_insensitive_connector() {
let s = "red, green, blue, And yellow";
assert_eq!(detect_enumerations(s, Language::En).len(), 1);
}
// --- Rhythmic pass: longer items, at least five of them, even lengths ---
#[test]
fn rhythmic_three_to_four_word_oxford_run_is_detected() {
let s = "category, severity, default weight, parameters per profile, EN/FR examples, \
and suppression";
let enums = detect_enumerations(s, Language::En);
assert_eq!(enums.len(), 1);
assert_eq!(enums[0].items, 6);
assert_eq!(enums[0].commas, 5);
}
#[test]
fn rhythmic_run_in_french_is_detected() {
let s =
"categorie, severite, poids par defaut, seuils par profil, exemples, et neutralisation";
let enums = detect_enumerations(s, Language::Fr);
assert_eq!(enums.len(), 1);
assert_eq!(enums[0].items, 6);
}
#[test]
fn arrhythmic_subordination_pile_is_rejected() {
let s = "the team decided, after much debate among stakeholders, to revise the palette, \
before shipping, and despite the tight deadline";
assert!(detect_enumerations(s, Language::En).is_empty());
}
#[test]
fn five_item_run_at_rhythmic_word_cap_is_accepted() {
let s = "alpha beta gamma delta, epsilon zeta eta theta, iota kappa lambda mu, \
nu xi omicron pi, and rho sigma tau upsilon";
let enums = detect_enumerations(s, Language::En);
assert_eq!(enums.len(), 1);
assert_eq!(enums[0].items, 5);
}
#[test]
fn rhythmic_run_above_word_cap_is_rejected() {
let s = "first item, second item, the third item is five words, fourth item, \
and fifth";
assert!(detect_enumerations(s, Language::En).is_empty());
}
#[test]
fn rhythmic_spread_above_two_is_rejected() {
let s = "alpha, beta gamma delta epsilon, zeta, eta theta iota kappa, and mu";
assert!(detect_enumerations(s, Language::En).is_empty());
}
#[test]
fn rhythmic_run_below_five_items_is_rejected() {
let s = "first new item, second new item, third new item, and fourth new item";
assert!(detect_enumerations(s, Language::En).is_empty());
}
// --- Clause-onset markers stop the rhythmic walk-back ---
#[test]
fn clause_onset_marker_stops_relaxed_walk_back() {
let s = "Note, although we agreed, we packed red, green, and blue, carefully, and quietly";
assert!(detect_enumerations(s, Language::En).is_empty());
}
#[test]
fn clause_onset_marker_works_in_french() {
let s = "Note, alors que nous avions accepté, nous avons emballé rouge, vert, et bleu";
assert!(detect_enumerations(s, Language::Fr).is_empty());
}
#[test]
fn tight_pass_still_accepts_pure_one_word_run() {
let s = "red, green, blue, and yellow";
let enums = detect_enumerations(s, Language::En);
assert_eq!(enums.len(), 1);
assert_eq!(enums[0].items, 4);
}
#[test]
fn rhythmic_relaxation_does_not_break_subordinate_guard() {
let s = "Note, although we agreed, to pack the red, green, and blue files, carefully";
assert!(detect_enumerations(s, Language::En).is_empty());
}
// --- "plus" as a closing connector ---
#[test]
fn plus_closes_an_english_enumeration() {
let s = "profile, format, min-score, plus working-directory and args";
let enums = detect_enumerations(s, Language::En);
assert_eq!(enums.len(), 1);
assert_eq!(enums[0].items, 4);
assert_eq!(enums[0].commas, 3);
}
#[test]
fn plus_closes_a_french_enumeration() {
let s = "profil, format, score minimal, plus repertoire et arguments";
let enums = detect_enumerations(s, Language::Fr);
assert_eq!(enums.len(), 1);
assert_eq!(enums[0].items, 4);
}
// A segment that already starts with a connector ends the walk-back, so the
// trailing "plus" segment cannot absorb the run closed by "and".
#[test]
fn plus_after_oxford_does_not_extend_the_oxford_run() {
let s = "apples, oranges, and bananas, plus laughed";
let enums = detect_enumerations(s, Language::En);
assert_eq!(enums.len(), 1);
assert_eq!(enums[0].items, 3);
}
#[test]
fn plus_without_a_preceding_run_does_not_trigger() {
let s = "we shopped today, plus we ordered grapes";
assert!(detect_enumerations(s, Language::En).is_empty());
}
// Direct checks of the regularity predicate: accepted runs first, then the
// rejected ones (empty, spread too wide, empty item, item above the word cap).
#[test]
fn run_is_rhythmically_regular_unit() {
assert!(run_is_rhythmically_regular(&[1, 1, 2, 3, 2, 1])); assert!(run_is_rhythmically_regular(&[2, 2, 2]));
assert!(run_is_rhythmically_regular(&[1, 2, 3]));
assert!(!run_is_rhythmically_regular(&[])); assert!(!run_is_rhythmically_regular(&[1, 2, 4])); assert!(!run_is_rhythmically_regular(&[0, 1, 2])); assert!(!run_is_rhythmically_regular(&[1, 2, 5])); }
}