use crate::types::Language;
const MAX_SEGMENT_WORDS: usize = 2;
const MIN_ITEMS_FOR_DETECTION: u32 = 3;
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Enumeration {
pub start: usize,
pub end: usize,
pub items: u32,
pub commas: u32,
}
#[must_use]
pub fn detect_enumerations(sentence: &str, language: Language) -> Vec<Enumeration> {
let connectors: &[&str] = match language {
Language::En => &["and", "or"],
Language::Fr => &["et", "ou"],
Language::Unknown => return Vec::new(),
};
let segments = split_comma_segments(sentence);
let mut result: Vec<Enumeration> = Vec::new();
for (i, seg) in segments.iter().enumerate() {
if !segment_starts_with_connector(&sentence[seg.range.clone()], connectors) {
continue;
}
let mut start_idx = i;
while start_idx > 0 {
let prev_text = &sentence[segments[start_idx - 1].range.clone()];
if !is_short_segment(prev_text) {
break;
}
if segment_starts_with_connector(prev_text, connectors) {
break;
}
start_idx -= 1;
}
let items = u32::try_from(i - start_idx + 1).unwrap_or(u32::MAX);
if items < MIN_ITEMS_FOR_DETECTION {
continue;
}
let start_byte = segments[start_idx].range.start;
let end_byte = seg.range.end;
let commas = items.saturating_sub(1);
push_or_merge(
&mut result,
Enumeration {
start: start_byte,
end: end_byte,
items,
commas,
},
);
}
result
}
#[must_use]
pub fn enumeration_comma_count(sentence: &str, language: Language) -> u32 {
detect_enumerations(sentence, language)
.iter()
.map(|e| e.commas)
.sum()
}
#[must_use]
pub fn parenthesised_list_comma_count(sentence: &str) -> u32 {
let bytes = sentence.as_bytes();
let mut total: u32 = 0;
let mut i = 0;
while i < bytes.len() {
if bytes[i] != b'(' {
i += 1;
continue;
}
let start = i + 1;
let mut depth = 1usize;
let mut j = start;
while j < bytes.len() {
match bytes[j] {
b'(' => depth += 1,
b')' => {
depth -= 1;
if depth == 0 {
break;
}
},
_ => {},
}
j += 1;
}
if depth != 0 {
break;
}
let inner = &sentence[start..j];
if !inner.contains('(') {
total = total.saturating_add(parenthesised_run_comma_count(inner));
}
i = j + 1;
}
total
}
fn parenthesised_run_comma_count(inner: &str) -> u32 {
let segments: Vec<&str> = inner.split(',').collect();
if segments.len() < MIN_ITEMS_FOR_DETECTION as usize {
return 0;
}
if !segments
.iter()
.all(|s| s.split_whitespace().count() <= MAX_SEGMENT_WORDS)
{
return 0;
}
u32::try_from(segments.len() - 1).unwrap_or(u32::MAX)
}
struct Segment {
range: std::ops::Range<usize>,
}
fn split_comma_segments(sentence: &str) -> Vec<Segment> {
let mut segments = Vec::new();
let bytes = sentence.as_bytes();
let mut start = 0;
for (idx, &b) in bytes.iter().enumerate() {
if b == b',' {
segments.push(Segment { range: start..idx });
start = idx + 1;
}
}
segments.push(Segment {
range: start..bytes.len(),
});
segments
}
fn segment_starts_with_connector(segment: &str, connectors: &[&str]) -> bool {
let trimmed = segment.trim_start();
let lower = trimmed.to_lowercase();
for connector in connectors {
if let Some(rest) = lower.strip_prefix(connector) {
if rest.chars().next().is_some_and(char::is_whitespace) {
return true;
}
}
}
false
}
fn is_short_segment(segment: &str) -> bool {
let words = segment.split_whitespace().count();
words > 0 && words <= MAX_SEGMENT_WORDS
}
fn push_or_merge(out: &mut Vec<Enumeration>, candidate: Enumeration) {
if let Some(last) = out.last_mut() {
if candidate.start <= last.end {
last.end = candidate.end.max(last.end);
last.items = last.items.max(candidate.items);
last.commas = last.commas.max(candidate.commas);
return;
}
}
out.push(candidate);
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn detects_english_oxford_enumeration() {
let s = "red, green, blue, and yellow";
let enums = detect_enumerations(s, Language::En);
assert_eq!(enums.len(), 1);
assert_eq!(enums[0].items, 4);
assert_eq!(enums[0].commas, 3);
}
#[test]
fn detects_french_oxford_enumeration() {
let s = "rouge, vert, bleu, et jaune";
let enums = detect_enumerations(s, Language::Fr);
assert_eq!(enums.len(), 1);
assert_eq!(enums[0].items, 4);
}
#[test]
fn detects_or_as_connector() {
let s = "a, b, c, or d";
let enums = detect_enumerations(s, Language::En);
assert_eq!(enums.len(), 1);
assert_eq!(enums[0].items, 4);
}
#[test]
fn ignores_non_oxford_form() {
let s = "red, green, blue and yellow";
assert!(detect_enumerations(s, Language::En).is_empty());
}
#[test]
fn needs_at_least_three_items() {
let s = "a, and b";
assert!(detect_enumerations(s, Language::En).is_empty());
}
#[test]
fn rejects_long_segments() {
let s =
"first, the second segment is way too long to count as a short item, third, and fourth";
assert!(detect_enumerations(s, Language::En).is_empty());
}
#[test]
fn does_not_swallow_surrounding_subordinates() {
let s = "Note, although we agreed, to pack the red, green, and blue files, carefully";
assert!(detect_enumerations(s, Language::En).is_empty());
}
#[test]
fn enumeration_comma_count_sums_all_enumerations() {
let s = "red, green, blue, and yellow";
assert_eq!(enumeration_comma_count(s, Language::En), 3);
}
#[test]
fn unknown_language_disables_detection() {
let s = "red, green, blue, and yellow";
assert!(detect_enumerations(s, Language::Unknown).is_empty());
}
#[test]
fn parenthesised_list_counts_inner_commas() {
let s = "Pick a colour (red, green, blue, yellow) for the frame.";
assert_eq!(parenthesised_list_comma_count(s), 3);
}
#[test]
fn parenthesised_list_needs_three_segments() {
let s = "The pair (foo, bar) matters.";
assert_eq!(parenthesised_list_comma_count(s), 0);
}
#[test]
fn parenthesised_list_rejects_long_segments() {
let s = "See (red, a very long qualifying clause here, blue, yellow).";
assert_eq!(parenthesised_list_comma_count(s), 0);
}
#[test]
fn parenthesised_list_ignores_nested_parens() {
let s = "See (red, green (emerald), blue, yellow) here.";
assert_eq!(parenthesised_list_comma_count(s), 0);
}
#[test]
fn parenthesised_list_ignores_unbalanced_parens() {
let s = "See (red, green, blue here.";
assert_eq!(parenthesised_list_comma_count(s), 0);
}
#[test]
fn parenthesised_list_handles_multiple_runs() {
let s = "Digits (`1`, `2`, `3`) and spellings (`one`, `two`, `three`) differ.";
assert_eq!(parenthesised_list_comma_count(s), 4);
}
#[test]
fn parenthesised_list_counts_empty_segments_from_stripped_code() {
let s = "The tokens (, , , ) are listed.";
assert_eq!(parenthesised_list_comma_count(s), 3);
}
#[test]
fn parenthesised_list_is_language_agnostic() {
let s = "Voyelles (`a`, `e`, `i`, `o`, `u`) courantes.";
assert_eq!(parenthesised_list_comma_count(s), 4);
}
#[test]
fn case_insensitive_connector() {
let s = "red, green, blue, And yellow";
assert_eq!(detect_enumerations(s, Language::En).len(), 1);
}
}