graphrag_core/text/
analysis.rs1use crate::text::document_structure::{SectionNumber, SectionNumberFormat};
7use regex::Regex;
8use std::sync::OnceLock;
9
/// Namespace for stateless text-analysis helpers: heading detection, section-number
/// parsing, blank-line scanning, basic statistics and title extraction.
///
/// Every operation is an associated function, so no instance is ever required; the
/// derived traits only make the type usable where a value is expected
/// (e.g. as a field of a `#[derive(Debug)]` struct).
#[derive(Debug, Clone, Copy, Default)]
pub struct TextAnalyzer;
12
13impl TextAnalyzer {
14 pub fn detect_heading_level(line: &str) -> Option<u8> {
23 let trimmed = line.trim();
24
25 if trimmed.is_empty() {
26 return None;
27 }
28
29 if trimmed.starts_with('#') {
31 let level = trimmed.chars().take_while(|&c| c == '#').count();
32 if level > 0 && level <= 6 {
33 if trimmed.len() > level && trimmed.chars().nth(level) == Some(' ') {
35 return Some(level.min(255) as u8);
36 }
37 }
38 }
39
40 if trimmed.len() >= 5 && Self::is_all_caps(trimmed) {
42 let level = if trimmed.len() < 20 {
44 1 } else if trimmed.len() < 40 {
46 2 } else {
48 3 };
50 return Some(level);
51 }
52
53 if let Some(section_num) = Self::extract_section_number(trimmed) {
55 let level = section_num.depth();
56 if level > 0 && level <= 6 {
57 return Some(level);
58 }
59 }
60
61 None
62 }
63
/// Returns `true` when `text` contains at least one alphabetic character and every
/// alphabetic character is uppercase.
///
/// Digits, punctuation and whitespace are ignored. Alphabetic characters that have
/// no uppercase form (for example CJK ideographs) make the result `false`.
fn is_all_caps(text: &str) -> bool {
    // Inspect the letters lazily instead of copying them into a temporary `String`.
    let mut letters = text.chars().filter(|c| c.is_alphabetic()).peekable();
    letters.peek().is_some() && letters.all(char::is_uppercase)
}
69
70 pub fn extract_section_number(text: &str) -> Option<SectionNumber> {
79 static DECIMAL_REGEX: OnceLock<Regex> = OnceLock::new();
80 static ROMAN_REGEX: OnceLock<Regex> = OnceLock::new();
81 static ALPHA_REGEX: OnceLock<Regex> = OnceLock::new();
82 static CHAPTER_REGEX: OnceLock<Regex> = OnceLock::new();
83
84 let decimal_re =
85 DECIMAL_REGEX.get_or_init(|| Regex::new(r"^(\d+(?:\.\d+)*)\s*[.:]?\s").unwrap());
86
87 let roman_re = ROMAN_REGEX.get_or_init(|| Regex::new(r"^([IVXLCDM]+)[.:]?\s").unwrap());
88
89 let alpha_re = ALPHA_REGEX.get_or_init(|| Regex::new(r"^([A-Z])[.:]?\s").unwrap());
90
91 let chapter_re = CHAPTER_REGEX.get_or_init(|| {
92 Regex::new(r"(?i)^(chapter|section|part|appendix)\s+(\d+|[IVXLCDM]+|[A-Z])\b").unwrap()
93 });
94
95 if let Some(caps) = decimal_re.captures(text) {
97 if let Some(num_str) = caps.get(1) {
98 let components: Vec<usize> = num_str
99 .as_str()
100 .split('.')
101 .filter_map(|s| s.parse().ok())
102 .collect();
103
104 if !components.is_empty() {
105 return Some(SectionNumber {
106 raw: num_str.as_str().to_string(),
107 format: SectionNumberFormat::Decimal,
108 components,
109 });
110 }
111 }
112 }
113
114 if let Some(caps) = chapter_re.captures(text) {
116 if let Some(num_match) = caps.get(2) {
117 let num_str = num_match.as_str();
118
119 if let Ok(num) = num_str.parse::<usize>() {
121 return Some(SectionNumber {
122 raw: format!("{} {}", caps.get(1)?.as_str(), num_str),
123 format: SectionNumberFormat::Mixed,
124 components: vec![num],
125 });
126 }
127
128 if let Some(num) = Self::parse_roman_numeral(num_str) {
130 return Some(SectionNumber {
131 raw: format!("{} {}", caps.get(1)?.as_str(), num_str),
132 format: SectionNumberFormat::Mixed,
133 components: vec![num],
134 });
135 }
136
137 if num_str.len() == 1 {
139 if let Some(ch) = num_str.chars().next() {
140 if ch.is_ascii_uppercase() {
141 let num = (ch as usize) - ('A' as usize) + 1;
142 return Some(SectionNumber {
143 raw: format!("{} {}", caps.get(1)?.as_str(), num_str),
144 format: SectionNumberFormat::Mixed,
145 components: vec![num],
146 });
147 }
148 }
149 }
150 }
151 }
152
153 if let Some(caps) = roman_re.captures(text) {
155 if let Some(roman_str) = caps.get(1) {
156 if let Some(num) = Self::parse_roman_numeral(roman_str.as_str()) {
157 return Some(SectionNumber {
158 raw: roman_str.as_str().to_string(),
159 format: SectionNumberFormat::Roman,
160 components: vec![num],
161 });
162 }
163 }
164 }
165
166 if let Some(caps) = alpha_re.captures(text) {
168 if let Some(letter) = caps.get(1) {
169 let ch = letter.as_str().chars().next()?;
170 let num = (ch as usize) - ('A' as usize) + 1;
171 return Some(SectionNumber {
172 raw: letter.as_str().to_string(),
173 format: SectionNumberFormat::Alphabetic,
174 components: vec![num],
175 });
176 }
177 }
178
179 None
180 }
181
/// Parses an uppercase Roman numeral written in standard subtractive notation.
///
/// Returns `None` for an empty string, for any character outside `IVXLCDM`, and for
/// letter sequences that are not a well-formed numeral — `"IIII"`, `"VX"`, `"IM"`,
/// or English words such as `"CIVIL"` that merely consist of numeral letters.
/// Thousands are unbounded, so `"MMMM"` parses as 4000.
fn parse_roman_numeral(roman: &str) -> Option<usize> {
    /// Symbols of the canonical encoding, largest first.
    const SYMBOLS: [(usize, &str); 13] = [
        (1000, "M"),
        (900, "CM"),
        (500, "D"),
        (400, "CD"),
        (100, "C"),
        (90, "XC"),
        (50, "L"),
        (40, "XL"),
        (10, "X"),
        (9, "IX"),
        (5, "V"),
        (4, "IV"),
        (1, "I"),
    ];

    if roman.is_empty() {
        return None;
    }

    // Evaluate right to left: a symbol smaller than its right-hand neighbour is
    // subtracted, anything else is added.
    let mut total = 0usize;
    let mut prev = 0usize;
    for ch in roman.chars().rev() {
        let value = match ch {
            'I' => 1,
            'V' => 5,
            'X' => 10,
            'L' => 50,
            'C' => 100,
            'D' => 500,
            'M' => 1000,
            _ => return None,
        };
        total = if value < prev {
            total.checked_sub(value)?
        } else {
            total.checked_add(value)?
        };
        prev = value;
    }

    // The evaluation above assigns a value to *any* sequence of numeral letters.
    // Accept the input only if it is exactly the canonical spelling of that value.
    let mut rest = roman;
    let mut remaining = total;
    for &(value, symbol) in &SYMBOLS {
        while remaining >= value {
            rest = rest.strip_prefix(symbol)?;
            remaining -= value;
        }
    }

    if rest.is_empty() {
        Some(total)
    } else {
        None
    }
}
209
/// Returns the byte offset at which each run of blank lines begins.
///
/// A line is blank when it is empty or contains only whitespace. Consecutive blank
/// lines form a single run and contribute one offset: the start of the first blank
/// line of the run. Offsets are byte indices into `text` and are exact for both
/// `\n` and `\r\n` line endings.
pub fn find_blank_line_positions(text: &str) -> Vec<usize> {
    let mut positions = Vec::new();
    let mut offset = 0usize;
    let mut in_blank_run = false;

    // `split_inclusive` keeps the terminator inside each segment, so segment lengths
    // add up to real byte offsets. `lines()` strips "\r\n" as well as "\n", which
    // makes a fixed `+ 1` per line drift on CRLF input.
    for segment in text.split_inclusive('\n') {
        let is_blank = segment.trim().is_empty();
        if is_blank && !in_blank_run {
            positions.push(offset);
        }
        in_blank_run = is_blank;
        offset += segment.len();
    }

    positions
}
231
232 pub fn calculate_statistics(text: &str) -> TextStats {
234 let words: Vec<&str> = text.split_whitespace().collect();
235 let word_count = words.len();
236
237 let sentence_endings = ['.', '!', '?'];
239 let sentence_count = text
240 .chars()
241 .filter(|c| sentence_endings.contains(c))
242 .count()
243 .max(1); let avg_sentence_length = if sentence_count > 0 {
246 word_count as f32 / sentence_count as f32
247 } else {
248 0.0
249 };
250
251 let paragraph_count = text
253 .split("\n\n")
254 .filter(|p| !p.trim().is_empty())
255 .count()
256 .max(1); let char_count = text.chars().count();
259
260 TextStats {
261 word_count,
262 sentence_count,
263 paragraph_count,
264 char_count,
265 avg_sentence_length,
266 avg_word_length: if word_count > 0 {
267 char_count as f32 / word_count as f32
268 } else {
269 0.0
270 },
271 }
272 }
273
/// Recognises a setext-style underline and maps it to a heading level.
///
/// After trimming, the line must be at least three bytes long and consist of a
/// single repeated marker: `=` gives level 1, `-` level 2 and `_` level 3.
/// Anything else yields `None`.
pub fn is_underline(line: &str) -> Option<u8> {
    let rule = line.trim().as_bytes();
    if rule.len() < 3 {
        return None;
    }

    let (&marker, rest) = rule.split_first()?;
    let level = match marker {
        b'=' => 1,
        b'-' => 2,
        b'_' => 3,
        _ => return None,
    };

    // Byte comparison is equivalent to char comparison here: the markers are ASCII
    // and no multi-byte UTF-8 sequence contains an ASCII byte.
    if rest.iter().all(|&byte| byte == marker) {
        Some(level)
    } else {
        None
    }
}
295
296 pub fn extract_title(text: &str) -> Option<String> {
298 for line in text.lines().take(10) {
299 let trimmed = line.trim();
301
302 if trimmed.is_empty() {
303 continue;
304 }
305
306 if Self::is_all_caps(trimmed) && trimmed.len() < 100 {
308 return Some(trimmed.to_string());
309 }
310
311 if Self::detect_heading_level(line).is_some() {
313 let clean = trimmed
315 .trim_start_matches('#')
316 .trim_start_matches(|c: char| c.is_numeric() || c == '.')
317 .trim();
318 if !clean.is_empty() {
319 return Some(clean.to_string());
320 }
321 }
322
323 if trimmed.len() > 5 {
325 return Some(trimmed.to_string());
326 }
327 }
328
329 None
330 }
331}
332
/// Size statistics for a piece of text, as produced by
/// [`TextAnalyzer::calculate_statistics`].
///
/// `PartialEq` is derived so results can be compared by value, e.g. in assertions.
#[derive(Debug, Clone, PartialEq)]
pub struct TextStats {
    /// Number of whitespace-separated words.
    pub word_count: usize,
    /// Number of sentences, estimated from terminating punctuation.
    pub sentence_count: usize,
    /// Number of paragraphs (blocks of text separated by blank lines).
    pub paragraph_count: usize,
    /// Number of characters (Unicode scalar values) in the text.
    pub char_count: usize,
    /// Average number of words per sentence.
    pub avg_sentence_length: f32,
    /// Average word length in characters; `0.0` when there are no words.
    pub avg_word_length: f32,
}
349
#[cfg(test)]
mod tests {
    use super::*;

    /// Markdown markers: one to six `#` followed by a space.
    #[test]
    fn test_markdown_heading_detection() {
        assert_eq!(TextAnalyzer::detect_heading_level("# Chapter 1"), Some(1));
        assert_eq!(
            TextAnalyzer::detect_heading_level("## Section 1.1"),
            Some(2)
        );
        assert_eq!(
            TextAnalyzer::detect_heading_level("### Subsection 1.1.1"),
            Some(3)
        );
        assert_eq!(TextAnalyzer::detect_heading_level("#### Level 4"), Some(4));
        assert_eq!(
            TextAnalyzer::detect_heading_level("###### Level 6"),
            Some(6)
        );
        assert_eq!(TextAnalyzer::detect_heading_level("#No space"), None);
        // Seven markers exceed the supported maximum of six.
        assert_eq!(TextAnalyzer::detect_heading_level("####### Level 7"), None);
        assert_eq!(TextAnalyzer::detect_heading_level("   "), None);
    }

    /// All-caps lines are headings whose level depends on their length.
    #[test]
    fn test_all_caps_detection() {
        assert_eq!(TextAnalyzer::detect_heading_level("CHAPTER ONE"), Some(1));
        assert_eq!(
            TextAnalyzer::detect_heading_level("INTRODUCTION TO MACHINE LEARNING"),
            Some(2)
        );
        assert_eq!(
            TextAnalyzer::detect_heading_level(
                "A VERY LONG HEADING WRITTEN ENTIRELY IN CAPITAL LETTERS"
            ),
            Some(3)
        );
        assert_eq!(
            TextAnalyzer::detect_heading_level("This is not ALL CAPS"),
            None
        );
        // Shorter than five bytes, so the all-caps rule does not apply.
        assert_eq!(TextAnalyzer::detect_heading_level("ABC"), None);
    }

    /// Each supported numbering style, plus a line without any number.
    #[test]
    fn test_section_number_extraction() {
        let sec1 = TextAnalyzer::extract_section_number("1. Introduction").unwrap();
        assert_eq!(sec1.raw, "1");
        assert_eq!(sec1.components, vec![1]);
        assert_eq!(sec1.format, SectionNumberFormat::Decimal);

        let sec2 = TextAnalyzer::extract_section_number("1.2.3 Subsection").unwrap();
        assert_eq!(sec2.raw, "1.2.3");
        assert_eq!(sec2.components, vec![1, 2, 3]);
        assert_eq!(sec2.format, SectionNumberFormat::Decimal);

        let sec3 = TextAnalyzer::extract_section_number("Chapter 1 Introduction").unwrap();
        assert_eq!(sec3.raw, "Chapter 1");
        assert_eq!(sec3.components, vec![1]);
        assert_eq!(sec3.format, SectionNumberFormat::Mixed);

        let sec4 = TextAnalyzer::extract_section_number("I. First Chapter").unwrap();
        assert_eq!(sec4.components, vec![1]);
        assert_eq!(sec4.format, SectionNumberFormat::Roman);

        let sec5 = TextAnalyzer::extract_section_number("IV. Fourth Chapter").unwrap();
        assert_eq!(sec5.components, vec![4]);
        assert_eq!(sec5.format, SectionNumberFormat::Roman);

        let sec6 = TextAnalyzer::extract_section_number("Appendix B").unwrap();
        assert_eq!(sec6.raw, "Appendix B");
        assert_eq!(sec6.components, vec![2]);
        assert_eq!(sec6.format, SectionNumberFormat::Mixed);

        let sec7 = TextAnalyzer::extract_section_number("A. Overview").unwrap();
        assert_eq!(sec7.components, vec![1]);
        assert_eq!(sec7.format, SectionNumberFormat::Alphabetic);

        assert!(TextAnalyzer::extract_section_number("No number here").is_none());
        assert!(TextAnalyzer::extract_section_number("").is_none());
    }

    #[test]
    fn test_roman_numeral_parsing() {
        assert_eq!(TextAnalyzer::parse_roman_numeral("I"), Some(1));
        assert_eq!(TextAnalyzer::parse_roman_numeral("IV"), Some(4));
        assert_eq!(TextAnalyzer::parse_roman_numeral("IX"), Some(9));
        assert_eq!(TextAnalyzer::parse_roman_numeral("XL"), Some(40));
        assert_eq!(TextAnalyzer::parse_roman_numeral("MMXXIV"), Some(2024));
        assert_eq!(TextAnalyzer::parse_roman_numeral("MCMXCIV"), Some(1994));
        assert_eq!(TextAnalyzer::parse_roman_numeral("ABC"), None);
    }

    /// Checks the offsets themselves, not just how many runs were found.
    #[test]
    fn test_blank_line_detection() {
        let text = "Line 1\n\nLine 2\n\n\nLine 3";
        let positions = TextAnalyzer::find_blank_line_positions(text);
        // Two runs: one after "Line 1\n" (7 bytes), one after "Line 2\n" (byte 15).
        assert_eq!(positions, vec![7, 15]);

        assert!(TextAnalyzer::find_blank_line_positions("").is_empty());
        assert!(TextAnalyzer::find_blank_line_positions("a\nb\nc").is_empty());
    }

    #[test]
    fn test_text_statistics() {
        let text = "This is a test. It has two sentences.";
        let stats = TextAnalyzer::calculate_statistics(text);

        assert_eq!(stats.sentence_count, 2);
        assert_eq!(stats.word_count, 8);
        assert_eq!(stats.paragraph_count, 1);
        assert_eq!(stats.char_count, 37);
        // 8 words over 2 sentences is exactly representable.
        assert!((stats.avg_sentence_length - 4.0).abs() < f32::EPSILON);
        assert!(stats.avg_word_length > 0.0);

        let two_paragraphs = TextAnalyzer::calculate_statistics("First.\n\nSecond.");
        assert_eq!(two_paragraphs.paragraph_count, 2);
    }

    #[test]
    fn test_underline_detection() {
        assert_eq!(TextAnalyzer::is_underline("====="), Some(1));
        assert_eq!(TextAnalyzer::is_underline("-----"), Some(2));
        assert_eq!(TextAnalyzer::is_underline("_____"), Some(3));
        assert_eq!(TextAnalyzer::is_underline("  ---  "), Some(2));
        assert_eq!(TextAnalyzer::is_underline("===---"), None);
        // Fewer than three markers is too short to be an underline.
        assert_eq!(TextAnalyzer::is_underline("=="), None);
    }

    #[test]
    fn test_title_extraction() {
        let text = "# Main Title\n\nSome content here.";
        let title = TextAnalyzer::extract_title(text);
        assert_eq!(title, Some("Main Title".to_string()));

        let text2 = "INTRODUCTION\n\nThis is the intro.";
        let title2 = TextAnalyzer::extract_title(text2);
        assert_eq!(title2, Some("INTRODUCTION".to_string()));

        assert_eq!(TextAnalyzer::extract_title(""), None);
        assert_eq!(TextAnalyzer::extract_title("\n   \n"), None);
    }
}