graphrag_core/text/
analysis.rs1use crate::text::document_structure::{SectionNumber, SectionNumberFormat};
7use regex::Regex;
8use std::sync::OnceLock;
9
/// Stateless namespace for the text-analysis heuristics in this module.
///
/// Every operation is an associated function, so the type carries no data.
/// The derives make it cheap to copy, embed in other `Debug`/`Default`
/// structures, and log.
#[derive(Debug, Clone, Copy, Default)]
pub struct TextAnalyzer;
12
13impl TextAnalyzer {
14 pub fn detect_heading_level(line: &str) -> Option<u8> {
23 let trimmed = line.trim();
24
25 if trimmed.is_empty() {
26 return None;
27 }
28
29 if trimmed.starts_with('#') {
31 let level = trimmed.chars().take_while(|&c| c == '#').count();
32 if level > 0 && level <= 6 {
33 if trimmed.len() > level && trimmed.chars().nth(level) == Some(' ') {
35 return Some(level.min(255) as u8);
36 }
37 }
38 }
39
40 if trimmed.len() >= 5 && Self::is_all_caps(trimmed) {
42 let level = if trimmed.len() < 20 {
44 1 } else if trimmed.len() < 40 {
46 2 } else {
48 3 };
50 return Some(level);
51 }
52
53 if let Some(section_num) = Self::extract_section_number(trimmed) {
55 let level = section_num.depth();
56 if level > 0 && level <= 6 {
57 return Some(level);
58 }
59 }
60
61 None
62 }
63
/// Returns `true` when `text` contains at least one alphabetic character and
/// every alphabetic character in it is uppercase.
///
/// Digits, punctuation and whitespace are ignored. Alphabetic characters
/// that have no uppercase form (for example CJK ideographs) count as "not
/// uppercase" and therefore make the result `false`.
fn is_all_caps(text: &str) -> bool {
    // Walk the letters lazily instead of collecting them into a temporary
    // `String`; `peek` answers "is there at least one letter?" without
    // consuming it.
    let mut letters = text.chars().filter(|c| c.is_alphabetic()).peekable();
    letters.peek().is_some() && letters.all(|c| c.is_uppercase())
}
69
70 pub fn extract_section_number(text: &str) -> Option<SectionNumber> {
79 static DECIMAL_REGEX: OnceLock<Regex> = OnceLock::new();
80 static ROMAN_REGEX: OnceLock<Regex> = OnceLock::new();
81 static ALPHA_REGEX: OnceLock<Regex> = OnceLock::new();
82 static CHAPTER_REGEX: OnceLock<Regex> = OnceLock::new();
83
84 let decimal_re = DECIMAL_REGEX.get_or_init(|| {
85 Regex::new(r"^(\d+(?:\.\d+)*)\s*[.:]?\s").expect("static regex literal")
86 });
87
88 let roman_re = ROMAN_REGEX
89 .get_or_init(|| Regex::new(r"^([IVXLCDM]+)[.:]?\s").expect("static regex literal"));
90
91 let alpha_re = ALPHA_REGEX
92 .get_or_init(|| Regex::new(r"^([A-Z])[.:]?\s").expect("static regex literal"));
93
94 let chapter_re = CHAPTER_REGEX.get_or_init(|| {
95 Regex::new(r"(?i)^(chapter|section|part|appendix)\s+(\d+|[IVXLCDM]+|[A-Z])\b")
96 .expect("static regex literal")
97 });
98
99 if let Some(caps) = decimal_re.captures(text) {
101 if let Some(num_str) = caps.get(1) {
102 let components: Vec<usize> = num_str
103 .as_str()
104 .split('.')
105 .filter_map(|s| s.parse().ok())
106 .collect();
107
108 if !components.is_empty() {
109 return Some(SectionNumber {
110 raw: num_str.as_str().to_string(),
111 format: SectionNumberFormat::Decimal,
112 components,
113 });
114 }
115 }
116 }
117
118 if let Some(caps) = chapter_re.captures(text) {
120 if let Some(num_match) = caps.get(2) {
121 let num_str = num_match.as_str();
122
123 if let Ok(num) = num_str.parse::<usize>() {
125 return Some(SectionNumber {
126 raw: format!("{} {}", caps.get(1)?.as_str(), num_str),
127 format: SectionNumberFormat::Mixed,
128 components: vec![num],
129 });
130 }
131
132 if let Some(num) = Self::parse_roman_numeral(num_str) {
134 return Some(SectionNumber {
135 raw: format!("{} {}", caps.get(1)?.as_str(), num_str),
136 format: SectionNumberFormat::Mixed,
137 components: vec![num],
138 });
139 }
140
141 if num_str.len() == 1 {
143 if let Some(ch) = num_str.chars().next() {
144 if ch.is_ascii_uppercase() {
145 let num = (ch as usize) - ('A' as usize) + 1;
146 return Some(SectionNumber {
147 raw: format!("{} {}", caps.get(1)?.as_str(), num_str),
148 format: SectionNumberFormat::Mixed,
149 components: vec![num],
150 });
151 }
152 }
153 }
154 }
155 }
156
157 if let Some(caps) = roman_re.captures(text) {
159 if let Some(roman_str) = caps.get(1) {
160 if let Some(num) = Self::parse_roman_numeral(roman_str.as_str()) {
161 return Some(SectionNumber {
162 raw: roman_str.as_str().to_string(),
163 format: SectionNumberFormat::Roman,
164 components: vec![num],
165 });
166 }
167 }
168 }
169
170 if let Some(caps) = alpha_re.captures(text) {
172 if let Some(letter) = caps.get(1) {
173 let ch = letter.as_str().chars().next()?;
174 let num = (ch as usize) - ('A' as usize) + 1;
175 return Some(SectionNumber {
176 raw: letter.as_str().to_string(),
177 format: SectionNumberFormat::Alphabetic,
178 components: vec![num],
179 });
180 }
181 }
182
183 None
184 }
185
/// Converts an uppercase Roman numeral into its integer value.
///
/// The string is scanned right-to-left; a symbol smaller than the symbol to
/// its right is subtracted (the `IV`, `IX`, `XL` … rule), any other symbol
/// is added. The parser is lenient: it does not reject non-canonical
/// spellings such as `IIII`.
///
/// Returns `None` when `roman` is empty or contains any character other
/// than `I`, `V`, `X`, `L`, `C`, `D` or `M` (lowercase is not accepted).
fn parse_roman_numeral(roman: &str) -> Option<usize> {
    // Roman numerals have no representation for zero, so an empty string is
    // not a numeral rather than "0".
    if roman.is_empty() {
        return None;
    }

    let mut total: usize = 0;
    let mut right_neighbour: usize = 0;

    for symbol in roman.chars().rev() {
        let value: usize = match symbol {
            'I' => 1,
            'V' => 5,
            'X' => 10,
            'L' => 50,
            'C' => 100,
            'D' => 500,
            'M' => 1000,
            _ => return None,
        };

        // The right-most symbol is always added (right_neighbour starts at
        // 0). A run of subtractions is strictly decreasing and its sum is
        // smaller than the added symbol that precedes it in the scan, so
        // `total` cannot underflow.
        if value < right_neighbour {
            total -= value;
        } else {
            total += value;
        }
        right_neighbour = value;
    }

    Some(total)
}
213
/// Returns the byte offset at which each run of blank lines begins.
///
/// A line is blank when it is empty or contains only whitespace. For
/// consecutive blank lines only the offset of the first one is reported.
/// Offsets are byte indices into `text` and are correct for both `\n` and
/// `\r\n` line endings.
pub fn find_blank_line_positions(text: &str) -> Vec<usize> {
    let mut positions = Vec::new();
    let mut offset: usize = 0;
    let mut in_blank_run = false;

    // `split_inclusive` keeps each line's terminator, so adding the chunk
    // length advances by the real number of bytes consumed. Assuming a
    // one-byte terminator would drift by one byte per line on CRLF input.
    for raw_line in text.split_inclusive('\n') {
        let is_blank = raw_line.trim().is_empty();

        if is_blank && !in_blank_run {
            positions.push(offset);
        }

        in_blank_run = is_blank;
        offset += raw_line.len();
    }

    positions
}
235
236 pub fn calculate_statistics(text: &str) -> TextStats {
238 let words: Vec<&str> = text.split_whitespace().collect();
239 let word_count = words.len();
240
241 let sentence_endings = ['.', '!', '?'];
243 let sentence_count = text
244 .chars()
245 .filter(|c| sentence_endings.contains(c))
246 .count()
247 .max(1); let avg_sentence_length = if sentence_count > 0 {
250 word_count as f32 / sentence_count as f32
251 } else {
252 0.0
253 };
254
255 let paragraph_count = text
257 .split("\n\n")
258 .filter(|p| !p.trim().is_empty())
259 .count()
260 .max(1); let char_count = text.chars().count();
263
264 TextStats {
265 word_count,
266 sentence_count,
267 paragraph_count,
268 char_count,
269 avg_sentence_length,
270 avg_word_length: if word_count > 0 {
271 char_count as f32 / word_count as f32
272 } else {
273 0.0
274 },
275 }
276 }
277
/// Recognises a heading underline (setext style) and returns its level.
///
/// After trimming, the line must be at least 3 bytes long and consist of a
/// single repeated character: `=` gives level 1, `-` level 2 and `_`
/// level 3. Anything else yields `None`.
pub fn is_underline(line: &str) -> Option<u8> {
    let candidate = line.trim();
    if candidate.len() < 3 {
        return None;
    }

    // The first character decides which level is possible at all; the rest
    // of the line only has to repeat it.
    let mut symbols = candidate.chars();
    let marker = symbols.next()?;
    let level = match marker {
        '=' => 1,
        '-' => 2,
        '_' => 3,
        _ => return None,
    };

    symbols.all(|c| c == marker).then_some(level)
}
299
300 pub fn extract_title(text: &str) -> Option<String> {
302 for line in text.lines().take(10) {
303 let trimmed = line.trim();
305
306 if trimmed.is_empty() {
307 continue;
308 }
309
310 if Self::is_all_caps(trimmed) && trimmed.len() < 100 {
312 return Some(trimmed.to_string());
313 }
314
315 if Self::detect_heading_level(line).is_some() {
317 let clean = trimmed
319 .trim_start_matches('#')
320 .trim_start_matches(|c: char| c.is_numeric() || c == '.')
321 .trim();
322 if !clean.is_empty() {
323 return Some(clean.to_string());
324 }
325 }
326
327 if trimmed.len() > 5 {
329 return Some(trimmed.to_string());
330 }
331 }
332
333 None
334 }
335}
336
/// Summary statistics about a piece of text, as produced by
/// `TextAnalyzer::calculate_statistics`.
///
/// `PartialEq` and `Default` are derived so that values can be compared in
/// assertions and used as an all-zero starting point.
#[derive(Debug, Clone, PartialEq, Default)]
pub struct TextStats {
    /// Number of whitespace-separated words.
    pub word_count: usize,
    /// Number of sentence-ending punctuation marks (`.`, `!`, `?`) found.
    pub sentence_count: usize,
    /// Number of non-blank paragraphs (chunks separated by a blank line).
    pub paragraph_count: usize,
    /// Number of `char`s in the text.
    pub char_count: usize,
    /// Average number of words per sentence.
    pub avg_sentence_length: f32,
    /// Average word length in characters; `0.0` when there are no words.
    pub avg_word_length: f32,
}
353
#[cfg(test)]
mod tests {
    use super::*;

    /// Markdown ATX headings: the level equals the number of leading `#`.
    #[test]
    fn test_markdown_heading_detection() {
        let cases = [
            ("# Chapter 1", Some(1)),
            ("## Section 1.1", Some(2)),
            ("### Subsection 1.1.1", Some(3)),
            ("#### Level 4", Some(4)),
            ("#No space", None),
            ("####### Too deep", None),
            ("", None),
        ];
        for (line, expected) in cases {
            assert_eq!(
                TextAnalyzer::detect_heading_level(line),
                expected,
                "line: {line:?}"
            );
        }
    }

    /// All-caps lines are headings whose level depends on their length.
    #[test]
    fn test_all_caps_detection() {
        assert_eq!(TextAnalyzer::detect_heading_level("CHAPTER ONE"), Some(1));
        assert_eq!(
            TextAnalyzer::detect_heading_level("INTRODUCTION TO MACHINE LEARNING"),
            Some(2)
        );
        assert_eq!(
            TextAnalyzer::detect_heading_level("This is not ALL CAPS"),
            None
        );
    }

    /// Each supported numbering scheme yields the expected format,
    /// components and raw text.
    #[test]
    fn test_section_number_extraction() {
        let sec1 = TextAnalyzer::extract_section_number("1. Introduction").unwrap();
        assert_eq!(sec1.components, vec![1]);
        assert_eq!(sec1.format, SectionNumberFormat::Decimal);
        assert_eq!(sec1.raw, "1");

        let sec2 = TextAnalyzer::extract_section_number("1.2.3 Subsection").unwrap();
        assert_eq!(sec2.components, vec![1, 2, 3]);
        assert_eq!(sec2.format, SectionNumberFormat::Decimal);
        assert_eq!(sec2.raw, "1.2.3");

        let sec3 = TextAnalyzer::extract_section_number("Chapter 1 Introduction").unwrap();
        assert_eq!(sec3.components, vec![1]);
        assert_eq!(sec3.format, SectionNumberFormat::Mixed);
        assert_eq!(sec3.raw, "Chapter 1");

        let sec4 = TextAnalyzer::extract_section_number("I. First Chapter").unwrap();
        assert_eq!(sec4.components, vec![1]);
        assert_eq!(sec4.format, SectionNumberFormat::Roman);

        let sec5 = TextAnalyzer::extract_section_number("IV. Fourth Chapter").unwrap();
        assert_eq!(sec5.components, vec![4]);
        assert_eq!(sec5.format, SectionNumberFormat::Roman);

        let sec6 = TextAnalyzer::extract_section_number("Appendix B Tables").unwrap();
        assert_eq!(sec6.components, vec![2]);
        assert_eq!(sec6.format, SectionNumberFormat::Mixed);

        assert!(TextAnalyzer::extract_section_number("plain text here").is_none());
    }

    /// Additive and subtractive Roman numerals, plus rejection of other text.
    #[test]
    fn test_roman_numeral_parsing() {
        assert_eq!(TextAnalyzer::parse_roman_numeral("I"), Some(1));
        assert_eq!(TextAnalyzer::parse_roman_numeral("IV"), Some(4));
        assert_eq!(TextAnalyzer::parse_roman_numeral("IX"), Some(9));
        assert_eq!(TextAnalyzer::parse_roman_numeral("XL"), Some(40));
        assert_eq!(TextAnalyzer::parse_roman_numeral("MCMXCIV"), Some(1994));
        assert_eq!(TextAnalyzer::parse_roman_numeral("ABC"), None);
    }

    /// Runs of blank lines are reported once, at the byte offset where the
    /// run starts.
    #[test]
    fn test_blank_line_detection() {
        let text = "Line 1\n\nLine 2\n\n\nLine 3";
        let positions = TextAnalyzer::find_blank_line_positions(text);
        assert_eq!(positions, vec![7, 15]);
        assert!(TextAnalyzer::find_blank_line_positions("").is_empty());
    }

    /// Word, sentence, paragraph and character counts for a two-sentence text.
    #[test]
    fn test_text_statistics() {
        let text = "This is a test. It has two sentences.";
        let stats = TextAnalyzer::calculate_statistics(text);

        assert_eq!(stats.sentence_count, 2);
        assert_eq!(stats.word_count, 8);
        assert_eq!(stats.paragraph_count, 1);
        assert_eq!(stats.char_count, 37);
        assert!((stats.avg_sentence_length - 4.0).abs() < f32::EPSILON);
        assert!(stats.avg_word_length > 0.0);
    }

    /// `=`, `-` and `_` underlines map to levels 1, 2 and 3.
    #[test]
    fn test_underline_detection() {
        assert_eq!(TextAnalyzer::is_underline("====="), Some(1));
        assert_eq!(TextAnalyzer::is_underline("-----"), Some(2));
        assert_eq!(TextAnalyzer::is_underline("_____"), Some(3));
        assert_eq!(TextAnalyzer::is_underline("===---"), None);
        assert_eq!(TextAnalyzer::is_underline("=="), None);
    }

    /// Titles come from markdown headings, all-caps lines, or the first
    /// sufficiently long line.
    #[test]
    fn test_title_extraction() {
        let text = "# Main Title\n\nSome content here.";
        let title = TextAnalyzer::extract_title(text);
        assert_eq!(title, Some("Main Title".to_string()));

        let text2 = "INTRODUCTION\n\nThis is the intro.";
        let title2 = TextAnalyzer::extract_title(text2);
        assert_eq!(title2, Some("INTRODUCTION".to_string()));

        let text3 = "tiny\nsome ordinary sentence";
        let title3 = TextAnalyzer::extract_title(text3);
        assert_eq!(title3, Some("some ordinary sentence".to_string()));

        assert_eq!(TextAnalyzer::extract_title(""), None);
    }
}