Skip to main content

graphrag_core/text/
analysis.rs

1//! Text analysis utilities for document structure detection
2//!
3//! This module provides algorithms for analyzing text structure, including
4//! heading detection, section numbering extraction, and statistical analysis.
5
6use crate::text::document_structure::{SectionNumber, SectionNumberFormat};
7use regex::Regex;
8use std::sync::OnceLock;
9
/// Text analyzer for structural analysis
///
/// A field-less marker type: it holds no state and serves only as a namespace.
/// Every operation is an associated function called as `TextAnalyzer::name(..)`.
pub struct TextAnalyzer;
12
13impl TextAnalyzer {
14    /// Detect if a line is a heading and determine its level
15    ///
16    /// Supports multiple heading formats:
17    /// - Markdown: #, ##, ###, etc.
18    /// - Plain text: ALL CAPS, numeric prefixes
19    /// - Underlined text (detected by caller)
20    ///
21    /// Returns Some(level) if detected, where level 1 is highest (chapter)
22    pub fn detect_heading_level(line: &str) -> Option<u8> {
23        let trimmed = line.trim();
24
25        if trimmed.is_empty() {
26            return None;
27        }
28
29        // Markdown heading detection: # ## ### etc.
30        if trimmed.starts_with('#') {
31            let level = trimmed.chars().take_while(|&c| c == '#').count();
32            if level > 0 && level <= 6 {
33                // Verify there's a space after the hashes (proper markdown)
34                if trimmed.len() > level && trimmed.chars().nth(level) == Some(' ') {
35                    return Some(level.min(255) as u8);
36                }
37            }
38        }
39
40        // ALL CAPS detection (likely chapter/section heading)
41        if trimmed.len() >= 5 && Self::is_all_caps(trimmed) {
42            // Shorter ALL CAPS lines are more likely to be high-level headings
43            let level = if trimmed.len() < 20 {
44                1 // Short ALL CAPS = chapter
45            } else if trimmed.len() < 40 {
46                2 // Medium ALL CAPS = section
47            } else {
48                3 // Long ALL CAPS = subsection
49            };
50            return Some(level);
51        }
52
53        // Numbered heading detection: "1.", "1.1", "Chapter 1", etc.
54        if let Some(section_num) = Self::extract_section_number(trimmed) {
55            let level = section_num.depth();
56            if level > 0 && level <= 6 {
57                return Some(level);
58            }
59        }
60
61        None
62    }
63
64    /// Check if text is ALL CAPS (ignoring non-alphabetic characters)
65    fn is_all_caps(text: &str) -> bool {
66        let letters: String = text.chars().filter(|c| c.is_alphabetic()).collect();
67        !letters.is_empty() && letters.chars().all(|c| c.is_uppercase())
68    }
69
70    /// Extract section number from heading text
71    ///
72    /// Recognizes patterns like:
73    /// - "1.", "2.", "3."
74    /// - "1.1", "1.2.3", "2.3.4.5"
75    /// - "Chapter 1", "Section 2.1"
76    /// - "I.", "II.", "III." (Roman numerals)
77    /// - "A.", "B.", "C." (Alphabetic)
78    pub fn extract_section_number(text: &str) -> Option<SectionNumber> {
79        static DECIMAL_REGEX: OnceLock<Regex> = OnceLock::new();
80        static ROMAN_REGEX: OnceLock<Regex> = OnceLock::new();
81        static ALPHA_REGEX: OnceLock<Regex> = OnceLock::new();
82        static CHAPTER_REGEX: OnceLock<Regex> = OnceLock::new();
83
84        let decimal_re = DECIMAL_REGEX.get_or_init(|| {
85            Regex::new(r"^(\d+(?:\.\d+)*)\s*[.:]?\s").unwrap()
86        });
87
88        let roman_re = ROMAN_REGEX.get_or_init(|| {
89            Regex::new(r"^([IVXLCDM]+)[.:]?\s").unwrap()
90        });
91
92        let alpha_re = ALPHA_REGEX.get_or_init(|| {
93            Regex::new(r"^([A-Z])[.:]?\s").unwrap()
94        });
95
96        let chapter_re = CHAPTER_REGEX.get_or_init(|| {
97            Regex::new(r"(?i)^(chapter|section|part|appendix)\s+(\d+|[IVXLCDM]+|[A-Z])\b").unwrap()
98        });
99
100        // Try decimal numbering (most common)
101        if let Some(caps) = decimal_re.captures(text) {
102            if let Some(num_str) = caps.get(1) {
103                let components: Vec<usize> = num_str
104                    .as_str()
105                    .split('.')
106                    .filter_map(|s| s.parse().ok())
107                    .collect();
108
109                if !components.is_empty() {
110                    return Some(SectionNumber {
111                        raw: num_str.as_str().to_string(),
112                        format: SectionNumberFormat::Decimal,
113                        components,
114                    });
115                }
116            }
117        }
118
119        // Try chapter/section keywords
120        if let Some(caps) = chapter_re.captures(text) {
121            if let Some(num_match) = caps.get(2) {
122                let num_str = num_match.as_str();
123
124                // Try parsing as decimal
125                if let Ok(num) = num_str.parse::<usize>() {
126                    return Some(SectionNumber {
127                        raw: format!("{} {}", caps.get(1)?.as_str(), num_str),
128                        format: SectionNumberFormat::Mixed,
129                        components: vec![num],
130                    });
131                }
132
133                // Try parsing as Roman numeral
134                if let Some(num) = Self::parse_roman_numeral(num_str) {
135                    return Some(SectionNumber {
136                        raw: format!("{} {}", caps.get(1)?.as_str(), num_str),
137                        format: SectionNumberFormat::Mixed,
138                        components: vec![num],
139                    });
140                }
141
142                // Try parsing as alphabetic
143                if num_str.len() == 1 {
144                    if let Some(ch) = num_str.chars().next() {
145                        if ch.is_ascii_uppercase() {
146                            let num = (ch as usize) - ('A' as usize) + 1;
147                            return Some(SectionNumber {
148                                raw: format!("{} {}", caps.get(1)?.as_str(), num_str),
149                                format: SectionNumberFormat::Mixed,
150                                components: vec![num],
151                            });
152                        }
153                    }
154                }
155            }
156        }
157
158        // Try Roman numerals
159        if let Some(caps) = roman_re.captures(text) {
160            if let Some(roman_str) = caps.get(1) {
161                if let Some(num) = Self::parse_roman_numeral(roman_str.as_str()) {
162                    return Some(SectionNumber {
163                        raw: roman_str.as_str().to_string(),
164                        format: SectionNumberFormat::Roman,
165                        components: vec![num],
166                    });
167                }
168            }
169        }
170
171        // Try alphabetic
172        if let Some(caps) = alpha_re.captures(text) {
173            if let Some(letter) = caps.get(1) {
174                let ch = letter.as_str().chars().next()?;
175                let num = (ch as usize) - ('A' as usize) + 1;
176                return Some(SectionNumber {
177                    raw: letter.as_str().to_string(),
178                    format: SectionNumberFormat::Alphabetic,
179                    components: vec![num],
180                });
181            }
182        }
183
184        None
185    }
186
187    /// Parse Roman numeral to decimal
188    fn parse_roman_numeral(roman: &str) -> Option<usize> {
189        let mut result = 0;
190        let mut prev_value = 0;
191
192        for ch in roman.chars().rev() {
193            let value = match ch {
194                'I' => 1,
195                'V' => 5,
196                'X' => 10,
197                'L' => 50,
198                'C' => 100,
199                'D' => 500,
200                'M' => 1000,
201                _ => return None,
202            };
203
204            if value < prev_value {
205                result -= value;
206            } else {
207                result += value;
208            }
209            prev_value = value;
210        }
211
212        Some(result)
213    }
214
215    /// Find positions of blank lines (paragraph separators)
216    ///
217    /// Returns character offsets where blank lines occur
218    pub fn find_blank_line_positions(text: &str) -> Vec<usize> {
219        let mut positions = Vec::new();
220        let mut current_offset = 0;
221        let mut prev_was_blank = false;
222
223        for line in text.lines() {
224            let is_blank = line.trim().is_empty();
225
226            if is_blank && !prev_was_blank {
227                positions.push(current_offset);
228            }
229
230            prev_was_blank = is_blank;
231            current_offset += line.len() + 1; // +1 for newline
232        }
233
234        positions
235    }
236
237    /// Calculate statistics about text
238    pub fn calculate_statistics(text: &str) -> TextStats {
239        let words: Vec<&str> = text.split_whitespace().collect();
240        let word_count = words.len();
241
242        // Count sentences (simple heuristic)
243        let sentence_endings = ['.', '!', '?'];
244        let sentence_count = text
245            .chars()
246            .filter(|c| sentence_endings.contains(c))
247            .count()
248            .max(1); // At least 1 sentence
249
250        let avg_sentence_length = if sentence_count > 0 {
251            word_count as f32 / sentence_count as f32
252        } else {
253            0.0
254        };
255
256        // Count paragraphs (separated by blank lines)
257        let paragraph_count = text
258            .split("\n\n")
259            .filter(|p| !p.trim().is_empty())
260            .count()
261            .max(1); // At least 1 paragraph
262
263        let char_count = text.chars().count();
264
265        TextStats {
266            word_count,
267            sentence_count,
268            paragraph_count,
269            char_count,
270            avg_sentence_length,
271            avg_word_length: if word_count > 0 {
272                char_count as f32 / word_count as f32
273            } else {
274                0.0
275            },
276        }
277    }
278
279    /// Detect if a line is underlined (for plain text heading detection)
280    ///
281    /// Checks if next_line consists entirely of underline characters (=, -, _)
282    pub fn is_underline(line: &str) -> Option<u8> {
283        let trimmed = line.trim();
284
285        if trimmed.len() < 3 {
286            return None;
287        }
288
289        // Check if line is all underline characters
290        if trimmed.chars().all(|c| c == '=') {
291            Some(1) // === is level 1 (chapter)
292        } else if trimmed.chars().all(|c| c == '-') {
293            Some(2) // --- is level 2 (section)
294        } else if trimmed.chars().all(|c| c == '_') {
295            Some(3) // ___ is level 3 (subsection)
296        } else {
297            None
298        }
299    }
300
301    /// Extract potential title from text (first non-empty line or ALL CAPS line)
302    pub fn extract_title(text: &str) -> Option<String> {
303        for line in text.lines().take(10) {
304            // Check first 10 lines
305            let trimmed = line.trim();
306
307            if trimmed.is_empty() {
308                continue;
309            }
310
311            // If it's ALL CAPS and reasonably short, it's likely the title
312            if Self::is_all_caps(trimmed) && trimmed.len() < 100 {
313                return Some(trimmed.to_string());
314            }
315
316            // If it looks like a heading, use it
317            if Self::detect_heading_level(line).is_some() {
318                // Strip heading markers
319                let clean = trimmed
320                    .trim_start_matches('#')
321                    .trim_start_matches(|c: char| c.is_numeric() || c == '.')
322                    .trim();
323                if !clean.is_empty() {
324                    return Some(clean.to_string());
325                }
326            }
327
328            // Otherwise, first non-empty line
329            if trimmed.len() > 5 {
330                return Some(trimmed.to_string());
331            }
332        }
333
334        None
335    }
336}
337
/// Text statistics
///
/// Plain data record returned by `TextAnalyzer::calculate_statistics`.
#[derive(Debug, Clone)]
pub struct TextStats {
    /// Total word count (whitespace-separated tokens)
    pub word_count: usize,
    /// Total sentence count, estimated from `.`, `!` and `?` characters;
    /// never less than 1
    pub sentence_count: usize,
    /// Total paragraph count (blocks separated by blank lines); never less
    /// than 1
    pub paragraph_count: usize,
    /// Total character count (Unicode scalar values, whitespace included)
    pub char_count: usize,
    /// Average words per sentence (`word_count / sentence_count`)
    pub avg_sentence_length: f32,
    /// Average characters per word; `0.0` when there are no words
    pub avg_word_length: f32,
}
354
#[cfg(test)]
mod tests {
    use super::*;

    /// Markdown headings map the hash count to the level; a missing space
    /// after the hashes disqualifies the line.
    #[test]
    fn test_markdown_heading_detection() {
        let cases = [
            ("# Chapter 1", Some(1)),
            ("## Section 1.1", Some(2)),
            ("### Subsection 1.1.1", Some(3)),
            ("#### Level 4", Some(4)),
            ("#No space", None),
        ];

        for (line, expected) in cases {
            assert_eq!(TextAnalyzer::detect_heading_level(line), expected);
        }
    }

    /// ALL CAPS lines are ranked by length; mixed-case lines are ignored.
    #[test]
    fn test_all_caps_detection() {
        let cases = [
            ("CHAPTER ONE", Some(1)),
            ("INTRODUCTION TO MACHINE LEARNING", Some(2)),
            ("This is not ALL CAPS", None),
        ];

        for (line, expected) in cases {
            assert_eq!(TextAnalyzer::detect_heading_level(line), expected);
        }
    }

    /// Each numbering style yields the expected components and format.
    #[test]
    fn test_section_number_extraction() {
        let extract = |heading: &str| TextAnalyzer::extract_section_number(heading).unwrap();

        // Decimal numbering
        let single = extract("1. Introduction");
        assert_eq!(single.components, vec![1]);
        assert_eq!(single.format, SectionNumberFormat::Decimal);

        let nested = extract("1.2.3 Subsection");
        assert_eq!(nested.components, vec![1, 2, 3]);

        // Chapter/Section keywords
        let keyword = extract("Chapter 1 Introduction");
        assert_eq!(keyword.components, vec![1]);
        assert_eq!(keyword.format, SectionNumberFormat::Mixed);

        // Roman numerals
        let roman_one = extract("I. First Chapter");
        assert_eq!(roman_one.components, vec![1]);
        assert_eq!(roman_one.format, SectionNumberFormat::Roman);

        let roman_four = extract("IV. Fourth Chapter");
        assert_eq!(roman_four.components, vec![4]);
    }

    /// Additive and subtractive numerals parse; non-numeral letters do not.
    #[test]
    fn test_roman_numeral_parsing() {
        let cases = [
            ("I", Some(1)),
            ("IV", Some(4)),
            ("IX", Some(9)),
            ("XL", Some(40)),
            ("MCMXCIV", Some(1994)),
            ("ABC", None),
        ];

        for (numeral, expected) in cases {
            assert_eq!(TextAnalyzer::parse_roman_numeral(numeral), expected);
        }
    }

    /// A run of consecutive blank lines is reported once.
    #[test]
    fn test_blank_line_detection() {
        let positions =
            TextAnalyzer::find_blank_line_positions("Line 1\n\nLine 2\n\n\nLine 3");
        assert_eq!(positions.len(), 2);
    }

    /// Sentence and word counts for a two-sentence sample.
    #[test]
    fn test_text_statistics() {
        let stats = TextAnalyzer::calculate_statistics("This is a test. It has two sentences.");

        assert_eq!(stats.sentence_count, 2);
        assert!(stats.word_count >= 7);
        assert!(stats.avg_sentence_length > 0.0);
    }

    /// Each underline character maps to its level; mixed lines are rejected.
    #[test]
    fn test_underline_detection() {
        let cases = [
            ("=====", Some(1)),
            ("-----", Some(2)),
            ("_____", Some(3)),
            ("===---", None),
        ];

        for (line, expected) in cases {
            assert_eq!(TextAnalyzer::is_underline(line), expected);
        }
    }

    /// Markdown markers are stripped from titles; ALL CAPS titles are kept.
    #[test]
    fn test_title_extraction() {
        let cases = [
            ("# Main Title\n\nSome content here.", "Main Title"),
            ("INTRODUCTION\n\nThis is the intro.", "INTRODUCTION"),
        ];

        for (text, expected) in cases {
            assert_eq!(TextAnalyzer::extract_title(text), Some(expected.to_string()));
        }
    }
}