const MAX_ALL_CAPS_LINE_LEN: usize = 80;
const MAX_TITLE_LINE_LEN: usize = 60;
const MAX_TITLE_WORD_COUNT: usize = 8;
const TITLE_UPPERCASE_RATIO: (usize, usize) = (3, 5);
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DetectedBoundary {
pub byte_offset: usize,
pub is_header: bool,
}
pub fn detect_plain_text_boundaries(text: &str) -> Vec<DetectedBoundary> {
let mut boundaries = Vec::new();
let mut byte_offset: usize = 0;
for line in text.split('\n') {
let trimmed = line.trim();
if !trimmed.is_empty() && (is_all_caps_header(trimmed) || is_numbered_section(line) || is_title_line(trimmed)) {
boundaries.push(DetectedBoundary {
byte_offset,
is_header: true,
});
}
byte_offset += line.len() + 1; }
boundaries
}
fn is_all_caps_header(line: &str) -> bool {
if line.len() >= MAX_ALL_CAPS_LINE_LEN {
return false;
}
let mut alpha_count = 0;
for ch in line.chars() {
if ch.is_alphabetic() {
if !ch.is_uppercase() {
return false;
}
alpha_count += 1;
}
}
alpha_count >= 2
}
fn is_numbered_section(line: &str) -> bool {
if line.starts_with(' ') || line.starts_with('\t') {
return false;
}
let trimmed = line.trim();
if trimmed.is_empty() {
return false;
}
let prefix_end = trimmed
.find(|ch: char| !ch.is_ascii_digit() && ch != '.')
.unwrap_or(trimmed.len());
let prefix = &trimmed[..prefix_end];
if !prefix.contains('.') {
return false;
}
if !prefix.starts_with(|ch: char| ch.is_ascii_digit()) {
return false;
}
if prefix_end >= trimmed.len() {
return false;
}
if !trimmed[prefix_end..].starts_with(' ') {
return false;
}
if trimmed[prefix_end..].trim().is_empty() {
return false;
}
if prefix.contains("..") {
return false;
}
true
}
fn is_title_line(line: &str) -> bool {
if line.len() >= MAX_TITLE_LINE_LEN {
return false;
}
let first = match line.chars().next() {
Some(ch) => ch,
None => return false,
};
if !first.is_uppercase() {
return false;
}
let words: Vec<&str> = line.split_whitespace().collect();
if words.len() < 2 || words.len() > MAX_TITLE_WORD_COUNT {
return false;
}
let upper_start_count = words
.iter()
.filter(|w| w.chars().next().is_some_and(|c| c.is_uppercase()))
.count();
let (num, den) = TITLE_UPPERCASE_RATIO;
if upper_start_count * den < words.len() * num {
return false;
}
if line.ends_with(['.', '!', '?', ';', ':']) {
return false;
}
true
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn all_caps_introduction() {
assert!(is_all_caps_header("INTRODUCTION"));
}
#[test]
fn all_caps_chapter_one() {
assert!(is_all_caps_header("CHAPTER ONE"));
}
#[test]
fn all_caps_terms_and_conditions() {
assert!(is_all_caps_header("TERMS AND CONDITIONS"));
}
#[test]
fn not_all_caps_mixed_case() {
assert!(!is_all_caps_header("Introduction"));
}
#[test]
fn not_all_caps_single_char() {
assert!(!is_all_caps_header("a"));
}
#[test]
fn not_all_caps_too_long() {
let long = "A".repeat(100);
assert!(!is_all_caps_header(&long));
}
#[test]
fn not_all_caps_digits_only() {
assert!(!is_all_caps_header("12345"));
}
#[test]
fn numbered_simple() {
assert!(is_numbered_section("1. Introduction"));
}
#[test]
fn numbered_two_level() {
assert!(is_numbered_section("1.2 Background"));
}
#[test]
fn numbered_three_level() {
assert!(is_numbered_section("1.2.3 Methodology"));
}
#[test]
fn not_numbered_normal_text() {
assert!(!is_numbered_section("This is normal text."));
}
#[test]
fn not_numbered_no_dot() {
assert!(!is_numbered_section("1234 just a number"));
}
#[test]
fn not_numbered_indented() {
assert!(!is_numbered_section(" 1.2 Indented"));
}
#[test]
fn not_numbered_bare_single_level() {
assert!(!is_numbered_section("1."));
}
#[test]
fn not_numbered_bare_two_level() {
assert!(!is_numbered_section("1.2."));
}
#[test]
fn not_numbered_whitespace_only_after_prefix() {
assert!(!is_numbered_section("1. "));
}
#[test]
fn title_executive_summary() {
assert!(is_title_line("Executive Summary"));
}
#[test]
fn not_title_single_word() {
assert!(!is_title_line("Background"));
assert!(!is_title_line("Note"));
assert!(!is_title_line("Introduction"));
}
#[test]
fn title_risk_factors() {
assert!(is_title_line("Risk Factors"));
}
#[test]
fn not_title_ends_with_period() {
assert!(!is_title_line("This is a sentence."));
}
#[test]
fn not_title_ends_with_exclamation() {
assert!(!is_title_line("Watch out!"));
}
#[test]
fn not_title_too_long() {
let long = format!("A{}", "b".repeat(60));
assert!(!is_title_line(&long));
}
#[test]
fn not_title_indented() {
assert!(!is_title_line(" Indented Title"));
}
#[test]
fn not_title_ordinary_prose() {
assert!(!is_title_line("The cat sat on the mat"));
assert!(!is_title_line("I agree with you"));
assert!(!is_title_line("Call me later"));
assert!(!is_title_line("Total revenue was $5M"));
}
#[test]
fn not_title_starts_lowercase() {
assert!(!is_title_line("background"));
}
#[test]
fn not_title_too_many_words() {
assert!(!is_title_line("This Is A Title With Way Too Many Words In It"));
}
#[test]
fn detect_empty_text() {
let result = detect_plain_text_boundaries("");
assert!(result.is_empty());
}
#[test]
fn detect_no_headers() {
let text = "This is a normal sentence.\nAnother normal sentence here.\nNothing special going on.";
let result = detect_plain_text_boundaries(text);
assert!(result.is_empty());
}
#[test]
fn detect_byte_offsets_correct() {
let text = "INTRODUCTION\nSome body text here.\nCONCLUSION";
let result = detect_plain_text_boundaries(text);
assert_eq!(result.len(), 2);
assert_eq!(result[0].byte_offset, 0);
assert!(result[0].is_header);
let expected_offset = "INTRODUCTION".len() + 1 + "Some body text here.".len() + 1;
assert_eq!(result[1].byte_offset, expected_offset);
assert!(result[1].is_header);
}
#[test]
fn detect_mixed_boundary_types() {
let text = "CHAPTER ONE\n\n1.2 Background\n\nExecutive Summary\n\nThis is body text.";
let result = detect_plain_text_boundaries(text);
assert_eq!(result.len(), 3);
assert!(result.iter().all(|b| b.is_header));
for window in result.windows(2) {
assert!(window[0].byte_offset < window[1].byte_offset);
}
}
#[test]
fn not_title_starts_with_space() {
assert!(!is_title_line(" Leading space"));
}
#[test]
fn not_title_ends_with_colon() {
assert!(!is_title_line("Section:"));
}
#[test]
fn not_title_ends_with_semicolon() {
assert!(!is_title_line("Section;"));
}
#[test]
fn not_title_ends_with_question() {
assert!(!is_title_line("Is this a title?"));
}
#[test]
fn detect_all_caps_and_numbered_in_same_document() {
let text = "OVERVIEW\n\nSome body text.\n\n1.1 Subsection\n\nMore body text.";
let result = detect_plain_text_boundaries(text);
assert_eq!(result.len(), 2, "should detect both ALL CAPS and numbered headers");
}
#[test]
fn unicode_cjk_not_all_caps() {
assert!(!is_all_caps_header("Chinese characters are not uppercase Latin"));
assert!(!is_all_caps_header("日本語テスト")); }
#[test]
fn unicode_hangul_not_all_caps() {
assert!(!is_all_caps_header("한국어 테스트"));
}
#[test]
fn unicode_arabic_not_all_caps() {
assert!(!is_all_caps_header("مرحبا بالعالم"));
}
#[test]
fn numbers_and_special_chars_not_headers() {
assert!(!is_all_caps_header("12345"));
assert!(!is_all_caps_header("---"));
assert!(!is_all_caps_header("***"));
assert!(!is_all_caps_header("12.34.56"));
}
#[test]
fn detect_only_numbers_line_not_boundary() {
let text = "12345\n\nSome body text.\n\n67890";
let result = detect_plain_text_boundaries(text);
assert!(
result.is_empty(),
"pure number/special-char lines should not be boundaries"
);
}
#[test]
fn detect_all_three_header_types_in_one_document() {
let text = "ABSTRACT\n\nSome abstract text here.\n\n1.1 Methods\n\nMethodology description.\n\nFinal Remarks\n\nFinal remarks about the study.";
let result = detect_plain_text_boundaries(text);
assert_eq!(
result.len(),
3,
"should detect all three header types, got {} boundaries: {:?}",
result.len(),
result
);
for window in result.windows(2) {
assert!(
window[0].byte_offset < window[1].byte_offset,
"boundaries should be ordered by offset"
);
}
}
#[test]
fn unicode_chinese_text_not_all_caps() {
assert!(!is_all_caps_header("\u{4e2d}\u{6587}\u{6d4b}\u{8bd5}")); assert!(!is_all_caps_header("\u{7b2c}\u{4e00}\u{7ae0}")); }
#[test]
fn detect_chinese_text_not_boundary() {
let text = "\u{7b2c}\u{4e00}\u{7ae0}\n\n\u{8fd9}\u{662f}\u{6b63}\u{6587}\u{5185}\u{5bb9}\u{3002}";
let result = detect_plain_text_boundaries(text);
for boundary in &result {
let line_at_offset = text[boundary.byte_offset..].lines().next().unwrap_or("");
assert!(
!is_all_caps_header(line_at_offset.trim()),
"Chinese text should not be detected as ALL CAPS: {:?}",
line_at_offset
);
}
}
}