graphrag_core/text/
analysis.rs1use crate::text::document_structure::{SectionNumber, SectionNumberFormat};
7use regex::Regex;
8use std::sync::OnceLock;
9
/// Stateless namespace for the text-analysis helpers implemented below.
///
/// The type carries no data; every helper is an associated function that is
/// called as `TextAnalyzer::name(...)`.
#[derive(Debug, Clone, Copy, Default)]
pub struct TextAnalyzer;
12
13impl TextAnalyzer {
14 pub fn detect_heading_level(line: &str) -> Option<u8> {
23 let trimmed = line.trim();
24
25 if trimmed.is_empty() {
26 return None;
27 }
28
29 if trimmed.starts_with('#') {
31 let level = trimmed.chars().take_while(|&c| c == '#').count();
32 if level > 0 && level <= 6 {
33 if trimmed.len() > level && trimmed.chars().nth(level) == Some(' ') {
35 return Some(level.min(255) as u8);
36 }
37 }
38 }
39
40 if trimmed.len() >= 5 && Self::is_all_caps(trimmed) {
42 let level = if trimmed.len() < 20 {
44 1 } else if trimmed.len() < 40 {
46 2 } else {
48 3 };
50 return Some(level);
51 }
52
53 if let Some(section_num) = Self::extract_section_number(trimmed) {
55 let level = section_num.depth();
56 if level > 0 && level <= 6 {
57 return Some(level);
58 }
59 }
60
61 None
62 }
63
/// Returns `true` when `text` contains at least one alphabetic character and
/// every alphabetic character in it is uppercase.
///
/// Digits, punctuation and whitespace are ignored. The letters are inspected
/// lazily, so no intermediate `String` is allocated.
fn is_all_caps(text: &str) -> bool {
    let mut letters = text.chars().filter(|ch| ch.is_alphabetic()).peekable();
    // `peek` rejects letter-free input before `all` would vacuously succeed.
    letters.peek().is_some() && letters.all(|ch| ch.is_uppercase())
}
69
70 pub fn extract_section_number(text: &str) -> Option<SectionNumber> {
79 static DECIMAL_REGEX: OnceLock<Regex> = OnceLock::new();
80 static ROMAN_REGEX: OnceLock<Regex> = OnceLock::new();
81 static ALPHA_REGEX: OnceLock<Regex> = OnceLock::new();
82 static CHAPTER_REGEX: OnceLock<Regex> = OnceLock::new();
83
84 let decimal_re = DECIMAL_REGEX.get_or_init(|| {
85 Regex::new(r"^(\d+(?:\.\d+)*)\s*[.:]?\s").unwrap()
86 });
87
88 let roman_re = ROMAN_REGEX.get_or_init(|| {
89 Regex::new(r"^([IVXLCDM]+)[.:]?\s").unwrap()
90 });
91
92 let alpha_re = ALPHA_REGEX.get_or_init(|| {
93 Regex::new(r"^([A-Z])[.:]?\s").unwrap()
94 });
95
96 let chapter_re = CHAPTER_REGEX.get_or_init(|| {
97 Regex::new(r"(?i)^(chapter|section|part|appendix)\s+(\d+|[IVXLCDM]+|[A-Z])\b").unwrap()
98 });
99
100 if let Some(caps) = decimal_re.captures(text) {
102 if let Some(num_str) = caps.get(1) {
103 let components: Vec<usize> = num_str
104 .as_str()
105 .split('.')
106 .filter_map(|s| s.parse().ok())
107 .collect();
108
109 if !components.is_empty() {
110 return Some(SectionNumber {
111 raw: num_str.as_str().to_string(),
112 format: SectionNumberFormat::Decimal,
113 components,
114 });
115 }
116 }
117 }
118
119 if let Some(caps) = chapter_re.captures(text) {
121 if let Some(num_match) = caps.get(2) {
122 let num_str = num_match.as_str();
123
124 if let Ok(num) = num_str.parse::<usize>() {
126 return Some(SectionNumber {
127 raw: format!("{} {}", caps.get(1)?.as_str(), num_str),
128 format: SectionNumberFormat::Mixed,
129 components: vec![num],
130 });
131 }
132
133 if let Some(num) = Self::parse_roman_numeral(num_str) {
135 return Some(SectionNumber {
136 raw: format!("{} {}", caps.get(1)?.as_str(), num_str),
137 format: SectionNumberFormat::Mixed,
138 components: vec![num],
139 });
140 }
141
142 if num_str.len() == 1 {
144 if let Some(ch) = num_str.chars().next() {
145 if ch.is_ascii_uppercase() {
146 let num = (ch as usize) - ('A' as usize) + 1;
147 return Some(SectionNumber {
148 raw: format!("{} {}", caps.get(1)?.as_str(), num_str),
149 format: SectionNumberFormat::Mixed,
150 components: vec![num],
151 });
152 }
153 }
154 }
155 }
156 }
157
158 if let Some(caps) = roman_re.captures(text) {
160 if let Some(roman_str) = caps.get(1) {
161 if let Some(num) = Self::parse_roman_numeral(roman_str.as_str()) {
162 return Some(SectionNumber {
163 raw: roman_str.as_str().to_string(),
164 format: SectionNumberFormat::Roman,
165 components: vec![num],
166 });
167 }
168 }
169 }
170
171 if let Some(caps) = alpha_re.captures(text) {
173 if let Some(letter) = caps.get(1) {
174 let ch = letter.as_str().chars().next()?;
175 let num = (ch as usize) - ('A' as usize) + 1;
176 return Some(SectionNumber {
177 raw: letter.as_str().to_string(),
178 format: SectionNumberFormat::Alphabetic,
179 components: vec![num],
180 });
181 }
182 }
183
184 None
185 }
186
/// Converts a string of uppercase Roman numeral symbols into its value.
///
/// Symbols are processed right to left; a symbol smaller than the one to its
/// right is subtracted (`IV` = 4), otherwise it is added. The parser is
/// lenient: it does not reject non-canonical sequences such as `IIII`.
///
/// Returns `None` for an empty string or when any character is not one of
/// `I`, `V`, `X`, `L`, `C`, `D`, `M`.
fn parse_roman_numeral(roman: &str) -> Option<usize> {
    // An empty string is not a numeral; without this guard it would yield 0.
    if roman.is_empty() {
        return None;
    }

    let mut total: usize = 0;
    let mut right_neighbour: usize = 0;

    for symbol in roman.chars().rev() {
        let value = match symbol {
            'I' => 1,
            'V' => 5,
            'X' => 10,
            'L' => 50,
            'C' => 100,
            'D' => 500,
            'M' => 1000,
            _ => return None,
        };

        if value < right_neighbour {
            total -= value;
        } else {
            total += value;
        }
        right_neighbour = value;
    }

    Some(total)
}
214
/// Returns the byte offset at which each run of blank lines starts.
///
/// A line is blank when it is empty or contains only whitespace. Consecutive
/// blank lines count as one run, and only the offset of the first line of the
/// run is reported. Offsets index into `text` and are correct for both `\n`
/// and `\r\n` line endings.
pub fn find_blank_line_positions(text: &str) -> Vec<usize> {
    let mut positions = Vec::new();
    let mut offset = 0;
    let mut prev_was_blank = false;

    // `split_inclusive` keeps each line's terminator, so `line.len()` is the
    // exact number of bytes to advance whether the line ends in "\n" or "\r\n".
    for line in text.split_inclusive('\n') {
        let is_blank = line.trim().is_empty();

        if is_blank && !prev_was_blank {
            positions.push(offset);
        }

        prev_was_blank = is_blank;
        offset += line.len();
    }

    positions
}
236
237 pub fn calculate_statistics(text: &str) -> TextStats {
239 let words: Vec<&str> = text.split_whitespace().collect();
240 let word_count = words.len();
241
242 let sentence_endings = ['.', '!', '?'];
244 let sentence_count = text
245 .chars()
246 .filter(|c| sentence_endings.contains(c))
247 .count()
248 .max(1); let avg_sentence_length = if sentence_count > 0 {
251 word_count as f32 / sentence_count as f32
252 } else {
253 0.0
254 };
255
256 let paragraph_count = text
258 .split("\n\n")
259 .filter(|p| !p.trim().is_empty())
260 .count()
261 .max(1); let char_count = text.chars().count();
264
265 TextStats {
266 word_count,
267 sentence_count,
268 paragraph_count,
269 char_count,
270 avg_sentence_length,
271 avg_word_length: if word_count > 0 {
272 char_count as f32 / word_count as f32
273 } else {
274 0.0
275 },
276 }
277 }
278
/// Classifies a line made up of a single repeated underline character.
///
/// After trimming, the line must be at least three bytes long and consist
/// solely of one of these characters:
///
/// * `=` gives `Some(1)`
/// * `-` gives `Some(2)`
/// * `_` gives `Some(3)`
///
/// Anything else, including mixed characters, yields `None`.
pub fn is_underline(line: &str) -> Option<u8> {
    let rule = line.trim();
    if rule.len() < 3 {
        return None;
    }

    // The first character decides the candidate level; the rest must match it.
    let marker = rule.chars().next()?;
    let level = match marker {
        '=' => 1,
        '-' => 2,
        '_' => 3,
        _ => return None,
    };

    rule.chars().all(|ch| ch == marker).then_some(level)
}
300
301 pub fn extract_title(text: &str) -> Option<String> {
303 for line in text.lines().take(10) {
304 let trimmed = line.trim();
306
307 if trimmed.is_empty() {
308 continue;
309 }
310
311 if Self::is_all_caps(trimmed) && trimmed.len() < 100 {
313 return Some(trimmed.to_string());
314 }
315
316 if Self::detect_heading_level(line).is_some() {
318 let clean = trimmed
320 .trim_start_matches('#')
321 .trim_start_matches(|c: char| c.is_numeric() || c == '.')
322 .trim();
323 if !clean.is_empty() {
324 return Some(clean.to_string());
325 }
326 }
327
328 if trimmed.len() > 5 {
330 return Some(trimmed.to_string());
331 }
332 }
333
334 None
335 }
336}
337
/// Size statistics describing a piece of text.
#[derive(Debug, Clone, PartialEq)]
pub struct TextStats {
    /// Number of words.
    pub word_count: usize,
    /// Number of sentences.
    pub sentence_count: usize,
    /// Number of paragraphs.
    pub paragraph_count: usize,
    /// Number of characters.
    pub char_count: usize,
    /// Average number of words per sentence.
    pub avg_sentence_length: f32,
    /// Average number of characters per word.
    pub avg_word_length: f32,
}
354
#[cfg(test)]
mod tests {
    use super::*;

    // Markdown headings need 1-6 `#` characters followed by a space.
    #[test]
    fn test_markdown_heading_detection() {
        assert_eq!(TextAnalyzer::detect_heading_level("# Chapter 1"), Some(1));
        assert_eq!(TextAnalyzer::detect_heading_level("## Section 1.1"), Some(2));
        assert_eq!(
            TextAnalyzer::detect_heading_level("### Subsection 1.1.1"),
            Some(3)
        );
        assert_eq!(TextAnalyzer::detect_heading_level("#### Level 4"), Some(4));
        assert_eq!(TextAnalyzer::detect_heading_level("#No space"), None);
        assert_eq!(TextAnalyzer::detect_heading_level(""), None);
        assert_eq!(TextAnalyzer::detect_heading_level("   "), None);
    }

    // All-caps lines map to levels 1-3 depending on their length.
    #[test]
    fn test_all_caps_detection() {
        assert_eq!(TextAnalyzer::detect_heading_level("CHAPTER ONE"), Some(1));
        assert_eq!(
            TextAnalyzer::detect_heading_level("INTRODUCTION TO MACHINE LEARNING"),
            Some(2)
        );
        assert_eq!(TextAnalyzer::detect_heading_level("This is not ALL CAPS"), None);
    }

    #[test]
    fn test_section_number_extraction() {
        let sec1 = TextAnalyzer::extract_section_number("1. Introduction").unwrap();
        assert_eq!(sec1.components, vec![1]);
        assert_eq!(sec1.format, SectionNumberFormat::Decimal);

        let sec2 = TextAnalyzer::extract_section_number("1.2.3 Subsection").unwrap();
        assert_eq!(sec2.components, vec![1, 2, 3]);
        assert_eq!(sec2.raw, "1.2.3");

        let sec3 = TextAnalyzer::extract_section_number("Chapter 1 Introduction").unwrap();
        assert_eq!(sec3.components, vec![1]);
        assert_eq!(sec3.format, SectionNumberFormat::Mixed);

        let sec4 = TextAnalyzer::extract_section_number("I. First Chapter").unwrap();
        assert_eq!(sec4.components, vec![1]);
        assert_eq!(sec4.format, SectionNumberFormat::Roman);

        let sec5 = TextAnalyzer::extract_section_number("IV. Fourth Chapter").unwrap();
        assert_eq!(sec5.components, vec![4]);

        let sec6 = TextAnalyzer::extract_section_number("Chapter IV Results").unwrap();
        assert_eq!(sec6.components, vec![4]);
        assert_eq!(sec6.format, SectionNumberFormat::Mixed);
        assert_eq!(sec6.raw, "Chapter IV");

        assert!(TextAnalyzer::extract_section_number("No numbering here").is_none());
    }

    #[test]
    fn test_roman_numeral_parsing() {
        assert_eq!(TextAnalyzer::parse_roman_numeral("I"), Some(1));
        assert_eq!(TextAnalyzer::parse_roman_numeral("IV"), Some(4));
        assert_eq!(TextAnalyzer::parse_roman_numeral("IX"), Some(9));
        assert_eq!(TextAnalyzer::parse_roman_numeral("XL"), Some(40));
        assert_eq!(TextAnalyzer::parse_roman_numeral("MCMXCIV"), Some(1994));
        assert_eq!(TextAnalyzer::parse_roman_numeral("ABC"), None);
    }

    // Pins the exact byte offsets, not just the number of blank runs.
    #[test]
    fn test_blank_line_detection() {
        let text = "Line 1\n\nLine 2\n\n\nLine 3";
        let positions = TextAnalyzer::find_blank_line_positions(text);
        assert_eq!(positions, vec![7, 15]);

        assert!(TextAnalyzer::find_blank_line_positions("").is_empty());
    }

    #[test]
    fn test_text_statistics() {
        let text = "This is a test. It has two sentences.";
        let stats = TextAnalyzer::calculate_statistics(text);

        assert_eq!(stats.sentence_count, 2);
        assert_eq!(stats.word_count, 8);
        assert_eq!(stats.paragraph_count, 1);
        assert_eq!(stats.char_count, 37);
        assert!((stats.avg_sentence_length - 4.0).abs() < f32::EPSILON);
    }

    #[test]
    fn test_underline_detection() {
        assert_eq!(TextAnalyzer::is_underline("====="), Some(1));
        assert_eq!(TextAnalyzer::is_underline("-----"), Some(2));
        assert_eq!(TextAnalyzer::is_underline("_____"), Some(3));
        assert_eq!(TextAnalyzer::is_underline("  ---  "), Some(2));
        assert_eq!(TextAnalyzer::is_underline("===---"), None);
        assert_eq!(TextAnalyzer::is_underline("=="), None);
    }

    #[test]
    fn test_title_extraction() {
        let text = "# Main Title\n\nSome content here.";
        let title = TextAnalyzer::extract_title(text);
        assert_eq!(title, Some("Main Title".to_string()));

        let text2 = "INTRODUCTION\n\nThis is the intro.";
        let title2 = TextAnalyzer::extract_title(text2);
        assert_eq!(title2, Some("INTRODUCTION".to_string()));

        assert_eq!(TextAnalyzer::extract_title(""), None);
        assert_eq!(TextAnalyzer::extract_title("\n\n"), None);
    }
}