use crate::text::document_structure::{SectionNumber, SectionNumberFormat};
use regex::Regex;
use std::sync::OnceLock;
/// Namespace for stateless text-analysis helpers.
///
/// The type carries no data; every helper is an associated function that
/// works purely on the string slice it is given.
#[derive(Debug, Clone, Copy, Default)]
pub struct TextAnalyzer;
impl TextAnalyzer {
/// Estimates the heading level of a single line.
///
/// The heuristics are tried in this order and the first match wins:
///
/// 1. Markdown ATX syntax: one to six `#` characters followed by a space
///    yield the number of `#` characters.
/// 2. All-caps text of at least five characters yields level 1, 2 or 3
///    depending on how long the line is (shorter lines rank higher).
/// 3. A leading section number (see [`Self::extract_section_number`])
///    yields its `depth()` when that lies in `1..=6`.
///
/// Returns `None` for blank lines and for lines matching none of the above.
pub fn detect_heading_level(line: &str) -> Option<u8> {
    let trimmed = line.trim();
    if trimmed.is_empty() {
        return None;
    }

    // Markdown ATX heading. Lines with more than six markers, or without the
    // separating space, fall through to the remaining heuristics.
    let hashes = trimmed.chars().take_while(|&c| c == '#').count();
    if (1..=6).contains(&hashes) && trimmed.chars().nth(hashes) == Some(' ') {
        // `hashes` is at most 6 here, so the narrowing cast cannot truncate.
        return Some(hashes as u8);
    }

    // Measure in characters rather than bytes so that non-ASCII uppercase
    // text is held to the same thresholds as ASCII text.
    let char_len = trimmed.chars().count();
    if char_len >= 5 && Self::is_all_caps(trimmed) {
        let level = if char_len < 20 {
            1
        } else if char_len < 40 {
            2
        } else {
            3
        };
        return Some(level);
    }

    if let Some(section_num) = Self::extract_section_number(trimmed) {
        let level = section_num.depth();
        if level > 0 && level <= 6 {
            return Some(level);
        }
    }
    None
}
/// Reports whether `text` contains at least one alphabetic character and
/// every alphabetic character in it is uppercase.
///
/// Digits, punctuation and whitespace are ignored.
fn is_all_caps(text: &str) -> bool {
    let mut alphabetic = text.chars().filter(|c| c.is_alphabetic()).peekable();
    // An input without any letters is not considered "all caps".
    if alphabetic.peek().is_none() {
        return false;
    }
    alphabetic.all(|c| c.is_uppercase())
}
/// Parses a section number from the start of `text`.
///
/// The following shapes are tried in order; the first that yields a value
/// is returned:
///
/// 1. Decimal, e.g. `"1. Intro"` or `"1.2.3 Details"` — one component per
///    dot-separated number.
/// 2. Keyword + number, e.g. `"Chapter 1"`, `"Part IV"`, `"Appendix B"` —
///    reported as [`SectionNumberFormat::Mixed`] with a single component.
///    The keyword is matched case-insensitively, but only digits,
///    uppercase Roman numerals and a single uppercase ASCII letter are
///    converted to a value; anything else falls through.
/// 3. Bare uppercase Roman numeral, e.g. `"IV. Title"`.
/// 4. Bare single uppercase letter, e.g. `"A. Title"` (`A` = 1).
///
/// Because Roman numerals are tried before letters, a letter such as `C`
/// is interpreted as the Roman numeral 100.
///
/// Returns `None` when no shape matches.
pub fn extract_section_number(text: &str) -> Option<SectionNumber> {
    static DECIMAL_REGEX: OnceLock<Regex> = OnceLock::new();
    static ROMAN_REGEX: OnceLock<Regex> = OnceLock::new();
    static ALPHA_REGEX: OnceLock<Regex> = OnceLock::new();
    static CHAPTER_REGEX: OnceLock<Regex> = OnceLock::new();

    // Each pattern is compiled once on first use and then reused.
    let decimal_re = DECIMAL_REGEX.get_or_init(|| {
        Regex::new(r"^(\d+(?:\.\d+)*)\s*[.:]?\s").expect("static regex literal")
    });
    let roman_re = ROMAN_REGEX
        .get_or_init(|| Regex::new(r"^([IVXLCDM]+)[.:]?\s").expect("static regex literal"));
    let alpha_re = ALPHA_REGEX
        .get_or_init(|| Regex::new(r"^([A-Z])[.:]?\s").expect("static regex literal"));
    let chapter_re = CHAPTER_REGEX.get_or_init(|| {
        Regex::new(r"(?i)^(chapter|section|part|appendix)\s+(\d+|[IVXLCDM]+|[A-Z])\b")
            .expect("static regex literal")
    });

    if let Some(caps) = decimal_re.captures(text) {
        if let Some(num_str) = caps.get(1) {
            // All-or-nothing: if any component cannot be represented (for
            // example it overflows `usize`), reject the whole number rather
            // than silently dropping that component and reporting a
            // shallower section than was written.
            let parsed: Option<Vec<usize>> = num_str
                .as_str()
                .split('.')
                .map(|part| part.parse::<usize>().ok())
                .collect();
            if let Some(components) = parsed {
                return Some(SectionNumber {
                    raw: num_str.as_str().to_string(),
                    format: SectionNumberFormat::Decimal,
                    components,
                });
            }
        }
    }

    if let Some(caps) = chapter_re.captures(text) {
        if let (Some(keyword), Some(num_match)) = (caps.get(1), caps.get(2)) {
            let num_str = num_match.as_str();
            // Try digits, then a Roman numeral, then a single uppercase
            // ASCII letter (A = 1, B = 2, ...).
            let value = num_str
                .parse::<usize>()
                .ok()
                .or_else(|| Self::parse_roman_numeral(num_str))
                .or_else(|| {
                    let mut chars = num_str.chars();
                    match (chars.next(), chars.next()) {
                        (Some(ch), None) if ch.is_ascii_uppercase() => {
                            Some((ch as usize) - ('A' as usize) + 1)
                        }
                        _ => None,
                    }
                });
            if let Some(num) = value {
                return Some(SectionNumber {
                    raw: format!("{} {}", keyword.as_str(), num_str),
                    format: SectionNumberFormat::Mixed,
                    components: vec![num],
                });
            }
        }
    }

    if let Some(caps) = roman_re.captures(text) {
        if let Some(roman_str) = caps.get(1) {
            if let Some(num) = Self::parse_roman_numeral(roman_str.as_str()) {
                return Some(SectionNumber {
                    raw: roman_str.as_str().to_string(),
                    format: SectionNumberFormat::Roman,
                    components: vec![num],
                });
            }
        }
    }

    if let Some(caps) = alpha_re.captures(text) {
        if let Some(letter) = caps.get(1) {
            // The pattern only admits `A`-`Z`, so the subtraction cannot
            // underflow.
            let ch = letter.as_str().chars().next()?;
            let num = (ch as usize) - ('A' as usize) + 1;
            return Some(SectionNumber {
                raw: letter.as_str().to_string(),
                format: SectionNumberFormat::Alphabetic,
                components: vec![num],
            });
        }
    }

    None
}
/// Converts an uppercase Roman numeral to its integer value.
///
/// The numeral is read right to left: a symbol smaller than the one to its
/// right is subtracted, any other symbol is added. The parser is lenient
/// and does not reject non-canonical spellings such as `"IIII"`.
///
/// Returns `None` for an empty string or when any character is not one of
/// `I`, `V`, `X`, `L`, `C`, `D`, `M`.
fn parse_roman_numeral(roman: &str) -> Option<usize> {
    // An empty string is not a numeral; without this guard it would parse
    // as `Some(0)`.
    if roman.is_empty() {
        return None;
    }

    let mut total: usize = 0;
    let mut right_neighbour: usize = 0;
    for symbol in roman.chars().rev() {
        let value = match symbol {
            'I' => 1,
            'V' => 5,
            'X' => 10,
            'L' => 50,
            'C' => 100,
            'D' => 500,
            'M' => 1000,
            _ => return None,
        };
        if value < right_neighbour {
            // Subtractive notation, e.g. the `I` in `IV`.
            total -= value;
        } else {
            total += value;
        }
        right_neighbour = value;
    }
    Some(total)
}
/// Returns the byte offset at which each run of blank lines begins.
///
/// A line is blank when it is empty or contains only whitespace. For
/// consecutive blank lines only the offset of the first one is reported.
/// Offsets are byte positions into `text` and are correct for both `\n`
/// and `\r\n` line endings.
pub fn find_blank_line_positions(text: &str) -> Vec<usize> {
    let mut positions = Vec::new();
    let mut offset = 0;
    let mut prev_was_blank = false;

    // `split_inclusive` keeps each line's terminator, so advancing by the
    // segment length stays exact whether the line ends in "\n" or "\r\n".
    for segment in text.split_inclusive('\n') {
        let is_blank = segment.trim().is_empty();
        if is_blank && !prev_was_blank {
            positions.push(offset);
        }
        prev_was_blank = is_blank;
        offset += segment.len();
    }
    positions
}
/// Computes simple size and readability statistics for `text`.
///
/// * Words are whitespace-separated tokens.
/// * The sentence count is the number of `.`, `!` and `?` characters,
///   but never less than 1.
/// * Paragraphs are runs of non-blank lines separated by one or more blank
///   (empty or whitespace-only) lines; the count is never less than 1.
///   Both `\n` and `\r\n` line endings are handled.
/// * `char_count` counts every character of `text`, whitespace included.
/// * `avg_word_length` is the mean number of characters per word, counting
///   only characters that belong to words; it is `0.0` when there are no
///   words.
pub fn calculate_statistics(text: &str) -> TextStats {
    // One pass over the words: how many there are and how many characters
    // they contain in total (whitespace between words is not counted).
    let (word_count, word_chars) = text
        .split_whitespace()
        .fold((0usize, 0usize), |(count, chars), word| {
            (count + 1, chars + word.chars().count())
        });

    let sentence_endings = ['.', '!', '?'];
    let sentence_count = text
        .chars()
        .filter(|c| sentence_endings.contains(c))
        .count()
        .max(1);
    // `sentence_count` is clamped to at least 1, so this cannot divide by zero.
    let avg_sentence_length = word_count as f32 / sentence_count as f32;

    // Count transitions from "outside a paragraph" to a non-blank line.
    // Working line by line is independent of the line-ending style and of
    // stray whitespace on otherwise empty separator lines.
    let mut paragraph_count = 0usize;
    let mut in_paragraph = false;
    for line in text.lines() {
        let has_content = !line.trim().is_empty();
        if has_content && !in_paragraph {
            paragraph_count += 1;
        }
        in_paragraph = has_content;
    }
    let paragraph_count = paragraph_count.max(1);

    let char_count = text.chars().count();
    let avg_word_length = if word_count > 0 {
        word_chars as f32 / word_count as f32
    } else {
        0.0
    };

    TextStats {
        word_count,
        sentence_count,
        paragraph_count,
        char_count,
        avg_sentence_length,
        avg_word_length,
    }
}
/// Classifies a line made up entirely of one underline character.
///
/// After trimming, a line of at least three bytes consisting solely of `=`
/// maps to `Some(1)`, solely of `-` to `Some(2)` and solely of `_` to
/// `Some(3)`. Every other line yields `None`.
pub fn is_underline(line: &str) -> Option<u8> {
    let candidate = line.trim();
    if candidate.len() < 3 {
        return None;
    }

    // The first character decides which level the line could represent;
    // the rest of the line must then repeat that same character.
    let marker = candidate.chars().next()?;
    let level = match marker {
        '=' => 1,
        '-' => 2,
        '_' => 3,
        _ => return None,
    };

    if candidate.chars().all(|c| c == marker) {
        Some(level)
    } else {
        None
    }
}
/// Picks a document title from the first ten lines of `text`.
///
/// Blank lines are skipped. For each remaining line, in order:
///
/// 1. An all-caps line shorter than 100 bytes is returned; a leading
///    Markdown heading marker (`#` … `######` plus space) is removed first.
/// 2. A line recognised by [`Self::detect_heading_level`] is returned with
///    leading `#` characters and a leading run of digits and dots removed,
///    provided something is left.
/// 3. Any line longer than five bytes is returned as is.
///
/// Returns `None` when none of the inspected lines qualifies.
pub fn extract_title(text: &str) -> Option<String> {
    for line in text.lines().take(10) {
        let trimmed = line.trim();
        if trimmed.is_empty() {
            continue;
        }

        if Self::is_all_caps(trimmed) && trimmed.len() < 100 {
            // An all-caps Markdown heading such as "# MAIN TITLE" must not
            // keep its marker in the returned title.
            let unmarked = trimmed.trim_start_matches('#');
            let hashes = trimmed.len() - unmarked.len();
            let title = if (1..=6).contains(&hashes) && unmarked.starts_with(' ') {
                unmarked.trim_start()
            } else {
                trimmed
            };
            return Some(title.to_string());
        }

        if Self::detect_heading_level(line).is_some() {
            let clean = trimmed
                .trim_start_matches('#')
                .trim_start_matches(|c: char| c.is_numeric() || c == '.')
                .trim();
            if !clean.is_empty() {
                return Some(clean.to_string());
            }
        }

        // Fallback: the first reasonably long line.
        if trimmed.len() > 5 {
            return Some(trimmed.to_string());
        }
    }
    None
}
}
/// Summary statistics produced by [`TextAnalyzer::calculate_statistics`].
#[derive(Debug, Clone, PartialEq, Default)]
pub struct TextStats {
    /// Number of whitespace-separated words.
    pub word_count: usize,
    /// Number of sentences detected in the text.
    pub sentence_count: usize,
    /// Number of paragraphs detected in the text.
    pub paragraph_count: usize,
    /// Total number of characters in the text.
    pub char_count: usize,
    /// Average number of words per sentence.
    pub avg_sentence_length: f32,
    /// Average word length in characters.
    pub avg_word_length: f32,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Markdown ATX headings map to the number of leading `#` characters.
    #[test]
    fn test_markdown_heading_detection() {
        assert_eq!(TextAnalyzer::detect_heading_level("# Chapter 1"), Some(1));
        assert_eq!(
            TextAnalyzer::detect_heading_level("## Section 1.1"),
            Some(2)
        );
        assert_eq!(
            TextAnalyzer::detect_heading_level("### Subsection 1.1.1"),
            Some(3)
        );
        assert_eq!(TextAnalyzer::detect_heading_level("#### Level 4"), Some(4));
        assert_eq!(
            TextAnalyzer::detect_heading_level("###### Level 6"),
            Some(6)
        );
        assert_eq!(TextAnalyzer::detect_heading_level("#No space"), None);
        assert_eq!(
            TextAnalyzer::detect_heading_level("####### Too deep"),
            None
        );
        assert_eq!(TextAnalyzer::detect_heading_level("   "), None);
    }

    /// All-caps lines are headings whose level depends on their length.
    #[test]
    fn test_all_caps_detection() {
        assert_eq!(TextAnalyzer::detect_heading_level("CHAPTER ONE"), Some(1));
        assert_eq!(
            TextAnalyzer::detect_heading_level("INTRODUCTION TO MACHINE LEARNING"),
            Some(2)
        );
        assert_eq!(
            TextAnalyzer::detect_heading_level("This is not ALL CAPS"),
            None
        );
    }

    /// Decimal, keyword-prefixed and Roman section numbers are recognised.
    #[test]
    fn test_section_number_extraction() {
        let sec1 = TextAnalyzer::extract_section_number("1. Introduction").unwrap();
        assert_eq!(sec1.components, vec![1]);
        assert_eq!(sec1.format, SectionNumberFormat::Decimal);

        let sec2 = TextAnalyzer::extract_section_number("1.2.3 Subsection").unwrap();
        assert_eq!(sec2.components, vec![1, 2, 3]);

        let sec3 = TextAnalyzer::extract_section_number("Chapter 1 Introduction").unwrap();
        assert_eq!(sec3.components, vec![1]);
        assert_eq!(sec3.format, SectionNumberFormat::Mixed);

        let sec4 = TextAnalyzer::extract_section_number("I. First Chapter").unwrap();
        assert_eq!(sec4.components, vec![1]);
        assert_eq!(sec4.format, SectionNumberFormat::Roman);

        let sec5 = TextAnalyzer::extract_section_number("IV. Fourth Chapter").unwrap();
        assert_eq!(sec5.components, vec![4]);

        assert!(TextAnalyzer::extract_section_number("plain sentence").is_none());
    }

    /// Additive and subtractive Roman numerals parse; other letters do not.
    #[test]
    fn test_roman_numeral_parsing() {
        assert_eq!(TextAnalyzer::parse_roman_numeral("I"), Some(1));
        assert_eq!(TextAnalyzer::parse_roman_numeral("IV"), Some(4));
        assert_eq!(TextAnalyzer::parse_roman_numeral("IX"), Some(9));
        assert_eq!(TextAnalyzer::parse_roman_numeral("XL"), Some(40));
        assert_eq!(TextAnalyzer::parse_roman_numeral("MCMXCIV"), Some(1994));
        assert_eq!(TextAnalyzer::parse_roman_numeral("ABC"), None);
    }

    /// Only the first line of each blank run is reported, at its byte offset.
    #[test]
    fn test_blank_line_detection() {
        let text = "Line 1\n\nLine 2\n\n\nLine 3";
        let positions = TextAnalyzer::find_blank_line_positions(text);
        assert_eq!(positions, vec![7, 15]);
        assert!(TextAnalyzer::find_blank_line_positions("").is_empty());
    }

    /// Counts for a two-sentence, single-paragraph input.
    #[test]
    fn test_text_statistics() {
        let text = "This is a test. It has two sentences.";
        let stats = TextAnalyzer::calculate_statistics(text);
        assert_eq!(stats.sentence_count, 2);
        assert_eq!(stats.word_count, 8);
        assert_eq!(stats.paragraph_count, 1);
        assert_eq!(stats.char_count, 37);
        assert!((stats.avg_sentence_length - 4.0).abs() < 1e-6);
        assert!(stats.avg_word_length > 0.0);
    }

    /// Lines of a single repeated underline character map to fixed levels.
    #[test]
    fn test_underline_detection() {
        assert_eq!(TextAnalyzer::is_underline("====="), Some(1));
        assert_eq!(TextAnalyzer::is_underline("-----"), Some(2));
        assert_eq!(TextAnalyzer::is_underline("_____"), Some(3));
        assert_eq!(TextAnalyzer::is_underline("  ---  "), Some(2));
        assert_eq!(TextAnalyzer::is_underline("===---"), None);
        assert_eq!(TextAnalyzer::is_underline("=="), None);
    }

    /// Titles come from Markdown headings, all-caps lines or the first
    /// sufficiently long line.
    #[test]
    fn test_title_extraction() {
        let text = "# Main Title\n\nSome content here.";
        let title = TextAnalyzer::extract_title(text);
        assert_eq!(title, Some("Main Title".to_string()));

        let text2 = "INTRODUCTION\n\nThis is the intro.";
        let title2 = TextAnalyzer::extract_title(text2);
        assert_eq!(title2, Some("INTRODUCTION".to_string()));

        let text3 = "\n\nPlain first line\nmore text";
        let title3 = TextAnalyzer::extract_title(text3);
        assert_eq!(title3, Some("Plain first line".to_string()));

        assert_eq!(TextAnalyzer::extract_title(""), None);
    }
}