Skip to main content

graphrag_core/text/
analysis.rs

//! Text analysis utilities for document structure detection.
//!
//! This module provides heuristics for analyzing text structure, including
//! heading detection, section-number extraction, and basic text statistics.
5
6use crate::text::document_structure::{SectionNumber, SectionNumberFormat};
7use regex::Regex;
8use std::sync::OnceLock;
9
/// Stateless namespace for text-structure heuristics.
///
/// Every operation is an associated function (none takes `self`), so the type
/// never needs to be instantiated: call for example
/// `TextAnalyzer::detect_heading_level(line)`.
#[derive(Debug, Clone, Copy, Default)]
pub struct TextAnalyzer;
12
13impl TextAnalyzer {
14    /// Detect if a line is a heading and determine its level
15    ///
16    /// Supports multiple heading formats:
17    /// - Markdown: #, ##, ###, etc.
18    /// - Plain text: ALL CAPS, numeric prefixes
19    /// - Underlined text (detected by caller)
20    ///
21    /// Returns Some(level) if detected, where level 1 is highest (chapter)
22    pub fn detect_heading_level(line: &str) -> Option<u8> {
23        let trimmed = line.trim();
24
25        if trimmed.is_empty() {
26            return None;
27        }
28
29        // Markdown heading detection: # ## ### etc.
30        if trimmed.starts_with('#') {
31            let level = trimmed.chars().take_while(|&c| c == '#').count();
32            if level > 0 && level <= 6 {
33                // Verify there's a space after the hashes (proper markdown)
34                if trimmed.len() > level && trimmed.chars().nth(level) == Some(' ') {
35                    return Some(level.min(255) as u8);
36                }
37            }
38        }
39
40        // ALL CAPS detection (likely chapter/section heading)
41        if trimmed.len() >= 5 && Self::is_all_caps(trimmed) {
42            // Shorter ALL CAPS lines are more likely to be high-level headings
43            let level = if trimmed.len() < 20 {
44                1 // Short ALL CAPS = chapter
45            } else if trimmed.len() < 40 {
46                2 // Medium ALL CAPS = section
47            } else {
48                3 // Long ALL CAPS = subsection
49            };
50            return Some(level);
51        }
52
53        // Numbered heading detection: "1.", "1.1", "Chapter 1", etc.
54        if let Some(section_num) = Self::extract_section_number(trimmed) {
55            let level = section_num.depth();
56            if level > 0 && level <= 6 {
57                return Some(level);
58            }
59        }
60
61        None
62    }
63
64    /// Check if text is ALL CAPS (ignoring non-alphabetic characters)
65    fn is_all_caps(text: &str) -> bool {
66        let letters: String = text.chars().filter(|c| c.is_alphabetic()).collect();
67        !letters.is_empty() && letters.chars().all(|c| c.is_uppercase())
68    }
69
70    /// Extract section number from heading text
71    ///
72    /// Recognizes patterns like:
73    /// - "1.", "2.", "3."
74    /// - "1.1", "1.2.3", "2.3.4.5"
75    /// - "Chapter 1", "Section 2.1"
76    /// - "I.", "II.", "III." (Roman numerals)
77    /// - "A.", "B.", "C." (Alphabetic)
78    pub fn extract_section_number(text: &str) -> Option<SectionNumber> {
79        static DECIMAL_REGEX: OnceLock<Regex> = OnceLock::new();
80        static ROMAN_REGEX: OnceLock<Regex> = OnceLock::new();
81        static ALPHA_REGEX: OnceLock<Regex> = OnceLock::new();
82        static CHAPTER_REGEX: OnceLock<Regex> = OnceLock::new();
83
84        let decimal_re =
85            DECIMAL_REGEX.get_or_init(|| Regex::new(r"^(\d+(?:\.\d+)*)\s*[.:]?\s").unwrap());
86
87        let roman_re = ROMAN_REGEX.get_or_init(|| Regex::new(r"^([IVXLCDM]+)[.:]?\s").unwrap());
88
89        let alpha_re = ALPHA_REGEX.get_or_init(|| Regex::new(r"^([A-Z])[.:]?\s").unwrap());
90
91        let chapter_re = CHAPTER_REGEX.get_or_init(|| {
92            Regex::new(r"(?i)^(chapter|section|part|appendix)\s+(\d+|[IVXLCDM]+|[A-Z])\b").unwrap()
93        });
94
95        // Try decimal numbering (most common)
96        if let Some(caps) = decimal_re.captures(text) {
97            if let Some(num_str) = caps.get(1) {
98                let components: Vec<usize> = num_str
99                    .as_str()
100                    .split('.')
101                    .filter_map(|s| s.parse().ok())
102                    .collect();
103
104                if !components.is_empty() {
105                    return Some(SectionNumber {
106                        raw: num_str.as_str().to_string(),
107                        format: SectionNumberFormat::Decimal,
108                        components,
109                    });
110                }
111            }
112        }
113
114        // Try chapter/section keywords
115        if let Some(caps) = chapter_re.captures(text) {
116            if let Some(num_match) = caps.get(2) {
117                let num_str = num_match.as_str();
118
119                // Try parsing as decimal
120                if let Ok(num) = num_str.parse::<usize>() {
121                    return Some(SectionNumber {
122                        raw: format!("{} {}", caps.get(1)?.as_str(), num_str),
123                        format: SectionNumberFormat::Mixed,
124                        components: vec![num],
125                    });
126                }
127
128                // Try parsing as Roman numeral
129                if let Some(num) = Self::parse_roman_numeral(num_str) {
130                    return Some(SectionNumber {
131                        raw: format!("{} {}", caps.get(1)?.as_str(), num_str),
132                        format: SectionNumberFormat::Mixed,
133                        components: vec![num],
134                    });
135                }
136
137                // Try parsing as alphabetic
138                if num_str.len() == 1 {
139                    if let Some(ch) = num_str.chars().next() {
140                        if ch.is_ascii_uppercase() {
141                            let num = (ch as usize) - ('A' as usize) + 1;
142                            return Some(SectionNumber {
143                                raw: format!("{} {}", caps.get(1)?.as_str(), num_str),
144                                format: SectionNumberFormat::Mixed,
145                                components: vec![num],
146                            });
147                        }
148                    }
149                }
150            }
151        }
152
153        // Try Roman numerals
154        if let Some(caps) = roman_re.captures(text) {
155            if let Some(roman_str) = caps.get(1) {
156                if let Some(num) = Self::parse_roman_numeral(roman_str.as_str()) {
157                    return Some(SectionNumber {
158                        raw: roman_str.as_str().to_string(),
159                        format: SectionNumberFormat::Roman,
160                        components: vec![num],
161                    });
162                }
163            }
164        }
165
166        // Try alphabetic
167        if let Some(caps) = alpha_re.captures(text) {
168            if let Some(letter) = caps.get(1) {
169                let ch = letter.as_str().chars().next()?;
170                let num = (ch as usize) - ('A' as usize) + 1;
171                return Some(SectionNumber {
172                    raw: letter.as_str().to_string(),
173                    format: SectionNumberFormat::Alphabetic,
174                    components: vec![num],
175                });
176            }
177        }
178
179        None
180    }
181
182    /// Parse Roman numeral to decimal
183    fn parse_roman_numeral(roman: &str) -> Option<usize> {
184        let mut result = 0;
185        let mut prev_value = 0;
186
187        for ch in roman.chars().rev() {
188            let value = match ch {
189                'I' => 1,
190                'V' => 5,
191                'X' => 10,
192                'L' => 50,
193                'C' => 100,
194                'D' => 500,
195                'M' => 1000,
196                _ => return None,
197            };
198
199            if value < prev_value {
200                result -= value;
201            } else {
202                result += value;
203            }
204            prev_value = value;
205        }
206
207        Some(result)
208    }
209
210    /// Find positions of blank lines (paragraph separators)
211    ///
212    /// Returns character offsets where blank lines occur
213    pub fn find_blank_line_positions(text: &str) -> Vec<usize> {
214        let mut positions = Vec::new();
215        let mut current_offset = 0;
216        let mut prev_was_blank = false;
217
218        for line in text.lines() {
219            let is_blank = line.trim().is_empty();
220
221            if is_blank && !prev_was_blank {
222                positions.push(current_offset);
223            }
224
225            prev_was_blank = is_blank;
226            current_offset += line.len() + 1; // +1 for newline
227        }
228
229        positions
230    }
231
232    /// Calculate statistics about text
233    pub fn calculate_statistics(text: &str) -> TextStats {
234        let words: Vec<&str> = text.split_whitespace().collect();
235        let word_count = words.len();
236
237        // Count sentences (simple heuristic)
238        let sentence_endings = ['.', '!', '?'];
239        let sentence_count = text
240            .chars()
241            .filter(|c| sentence_endings.contains(c))
242            .count()
243            .max(1); // At least 1 sentence
244
245        let avg_sentence_length = if sentence_count > 0 {
246            word_count as f32 / sentence_count as f32
247        } else {
248            0.0
249        };
250
251        // Count paragraphs (separated by blank lines)
252        let paragraph_count = text
253            .split("\n\n")
254            .filter(|p| !p.trim().is_empty())
255            .count()
256            .max(1); // At least 1 paragraph
257
258        let char_count = text.chars().count();
259
260        TextStats {
261            word_count,
262            sentence_count,
263            paragraph_count,
264            char_count,
265            avg_sentence_length,
266            avg_word_length: if word_count > 0 {
267                char_count as f32 / word_count as f32
268            } else {
269                0.0
270            },
271        }
272    }
273
274    /// Detect if a line is underlined (for plain text heading detection)
275    ///
276    /// Checks if next_line consists entirely of underline characters (=, -, _)
277    pub fn is_underline(line: &str) -> Option<u8> {
278        let trimmed = line.trim();
279
280        if trimmed.len() < 3 {
281            return None;
282        }
283
284        // Check if line is all underline characters
285        if trimmed.chars().all(|c| c == '=') {
286            Some(1) // === is level 1 (chapter)
287        } else if trimmed.chars().all(|c| c == '-') {
288            Some(2) // --- is level 2 (section)
289        } else if trimmed.chars().all(|c| c == '_') {
290            Some(3) // ___ is level 3 (subsection)
291        } else {
292            None
293        }
294    }
295
296    /// Extract potential title from text (first non-empty line or ALL CAPS line)
297    pub fn extract_title(text: &str) -> Option<String> {
298        for line in text.lines().take(10) {
299            // Check first 10 lines
300            let trimmed = line.trim();
301
302            if trimmed.is_empty() {
303                continue;
304            }
305
306            // If it's ALL CAPS and reasonably short, it's likely the title
307            if Self::is_all_caps(trimmed) && trimmed.len() < 100 {
308                return Some(trimmed.to_string());
309            }
310
311            // If it looks like a heading, use it
312            if Self::detect_heading_level(line).is_some() {
313                // Strip heading markers
314                let clean = trimmed
315                    .trim_start_matches('#')
316                    .trim_start_matches(|c: char| c.is_numeric() || c == '.')
317                    .trim();
318                if !clean.is_empty() {
319                    return Some(clean.to_string());
320                }
321            }
322
323            // Otherwise, first non-empty line
324            if trimmed.len() > 5 {
325                return Some(trimmed.to_string());
326            }
327        }
328
329        None
330    }
331}
332
/// Summary statistics for a piece of text, as produced by
/// [`TextAnalyzer::calculate_statistics`].
///
/// `Default` yields all-zero statistics. Note that
/// `calculate_statistics` itself never reports fewer than one sentence or
/// one paragraph, even for empty text.
#[derive(Debug, Clone, PartialEq, Default)]
pub struct TextStats {
    /// Number of whitespace-separated words.
    pub word_count: usize,
    /// Number of sentences (heuristic count of `.`, `!` and `?`).
    pub sentence_count: usize,
    /// Number of paragraphs (blocks of text separated by blank lines).
    pub paragraph_count: usize,
    /// Number of characters (`char`s) in the text, whitespace included.
    pub char_count: usize,
    /// Average number of words per sentence.
    pub avg_sentence_length: f32,
    /// Average number of characters per word; `0.0` when there are no words.
    pub avg_word_length: f32,
}
349
/// Unit tests for [`TextAnalyzer`].
///
/// Assertions pin exact values (offsets, counts) rather than loose bounds so
/// that regressions in the heuristics are caught.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_markdown_heading_detection() {
        assert_eq!(TextAnalyzer::detect_heading_level("# Chapter 1"), Some(1));
        assert_eq!(
            TextAnalyzer::detect_heading_level("## Section 1.1"),
            Some(2)
        );
        assert_eq!(
            TextAnalyzer::detect_heading_level("### Subsection 1.1.1"),
            Some(3)
        );
        assert_eq!(TextAnalyzer::detect_heading_level("#### Level 4"), Some(4));
        assert_eq!(
            TextAnalyzer::detect_heading_level("###### Level 6"),
            Some(6)
        );

        // Not Markdown headings: missing space, too many hashes, blank line.
        assert_eq!(TextAnalyzer::detect_heading_level("#No space"), None);
        assert_eq!(
            TextAnalyzer::detect_heading_level("####### Too deep"),
            None
        );
        assert_eq!(TextAnalyzer::detect_heading_level("   "), None);
    }

    #[test]
    fn test_all_caps_detection() {
        // Level depends on length: < 20 chars, < 40 chars, longer.
        assert_eq!(TextAnalyzer::detect_heading_level("CHAPTER ONE"), Some(1));
        assert_eq!(
            TextAnalyzer::detect_heading_level("INTRODUCTION TO MACHINE LEARNING"),
            Some(2)
        );
        assert_eq!(
            TextAnalyzer::detect_heading_level(
                "THE QUICK BROWN FOX JUMPS OVER THE LAZY DOG TODAY"
            ),
            Some(3)
        );
        assert_eq!(
            TextAnalyzer::detect_heading_level("This is not ALL CAPS"),
            None
        );
    }

    #[test]
    fn test_section_number_extraction() {
        // Decimal numbering
        let sec1 = TextAnalyzer::extract_section_number("1. Introduction").unwrap();
        assert_eq!(sec1.components, vec![1]);
        assert_eq!(sec1.format, SectionNumberFormat::Decimal);
        assert_eq!(sec1.raw, "1");

        let sec2 = TextAnalyzer::extract_section_number("1.2.3 Subsection").unwrap();
        assert_eq!(sec2.components, vec![1, 2, 3]);
        assert_eq!(sec2.format, SectionNumberFormat::Decimal);
        assert_eq!(sec2.raw, "1.2.3");

        // Chapter/Section keywords
        let sec3 = TextAnalyzer::extract_section_number("Chapter 1 Introduction").unwrap();
        assert_eq!(sec3.components, vec![1]);
        assert_eq!(sec3.format, SectionNumberFormat::Mixed);
        assert_eq!(sec3.raw, "Chapter 1");

        // Roman numerals
        let sec4 = TextAnalyzer::extract_section_number("I. First Chapter").unwrap();
        assert_eq!(sec4.components, vec![1]);
        assert_eq!(sec4.format, SectionNumberFormat::Roman);
        assert_eq!(sec4.raw, "I");

        let sec5 = TextAnalyzer::extract_section_number("IV. Fourth Chapter").unwrap();
        assert_eq!(sec5.components, vec![4]);
        assert_eq!(sec5.format, SectionNumberFormat::Roman);

        // Alphabetic markers
        let sec6 = TextAnalyzer::extract_section_number("A. Appendix").unwrap();
        assert_eq!(sec6.components, vec![1]);
        assert_eq!(sec6.format, SectionNumberFormat::Alphabetic);

        // Plain prose has no section number
        assert!(TextAnalyzer::extract_section_number("Plain sentence.").is_none());
    }

    #[test]
    fn test_roman_numeral_parsing() {
        assert_eq!(TextAnalyzer::parse_roman_numeral("I"), Some(1));
        assert_eq!(TextAnalyzer::parse_roman_numeral("IV"), Some(4));
        assert_eq!(TextAnalyzer::parse_roman_numeral("IX"), Some(9));
        assert_eq!(TextAnalyzer::parse_roman_numeral("XL"), Some(40));
        assert_eq!(TextAnalyzer::parse_roman_numeral("MCMXCIV"), Some(1994));
        assert_eq!(TextAnalyzer::parse_roman_numeral("ABC"), None);
    }

    #[test]
    fn test_blank_line_detection() {
        let text = "Line 1\n\nLine 2\n\n\nLine 3";
        let positions = TextAnalyzer::find_blank_line_positions(text);
        // One entry per run of blank lines, at the offset where the run starts.
        assert_eq!(positions, vec![7, 15]);

        assert!(TextAnalyzer::find_blank_line_positions("").is_empty());
    }

    #[test]
    fn test_text_statistics() {
        let text = "This is a test. It has two sentences.";
        let stats = TextAnalyzer::calculate_statistics(text);

        assert_eq!(stats.word_count, 8);
        assert_eq!(stats.sentence_count, 2);
        assert_eq!(stats.paragraph_count, 1);
        assert_eq!(stats.char_count, 37);
        assert_eq!(stats.avg_sentence_length, 4.0);
    }

    #[test]
    fn test_underline_detection() {
        assert_eq!(TextAnalyzer::is_underline("====="), Some(1));
        assert_eq!(TextAnalyzer::is_underline("-----"), Some(2));
        assert_eq!(TextAnalyzer::is_underline("_____"), Some(3));
        assert_eq!(TextAnalyzer::is_underline("===---"), None);
        // Too short to be an underline
        assert_eq!(TextAnalyzer::is_underline("=="), None);
    }

    #[test]
    fn test_title_extraction() {
        let text = "# Main Title\n\nSome content here.";
        let title = TextAnalyzer::extract_title(text);
        assert_eq!(title, Some("Main Title".to_string()));

        let text2 = "INTRODUCTION\n\nThis is the intro.";
        let title2 = TextAnalyzer::extract_title(text2);
        assert_eq!(title2, Some("INTRODUCTION".to_string()));

        assert_eq!(TextAnalyzer::extract_title(""), None);
    }
}