graphrag_core/text/
analysis.rs

//! Text analysis utilities for document structure detection
//!
//! This module provides algorithms for analyzing text structure, including
//! heading detection, section numbering extraction, and statistical analysis.
5
6use crate::text::document_structure::{SectionNumber, SectionNumberFormat};
7use regex::Regex;
8use std::sync::OnceLock;
9
/// Text analyzer for structural analysis.
///
/// A stateless unit type: every operation is an associated function, so no
/// instance is needed to call them. The derives make the type usable in
/// debug output, comparisons, and `Default`-based construction.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct TextAnalyzer;
12
13impl TextAnalyzer {
14    /// Detect if a line is a heading and determine its level
15    ///
16    /// Supports multiple heading formats:
17    /// - Markdown: #, ##, ###, etc.
18    /// - Plain text: ALL CAPS, numeric prefixes
19    /// - Underlined text (detected by caller)
20    ///
21    /// Returns Some(level) if detected, where level 1 is highest (chapter)
22    pub fn detect_heading_level(line: &str) -> Option<u8> {
23        let trimmed = line.trim();
24
25        if trimmed.is_empty() {
26            return None;
27        }
28
29        // Markdown heading detection: # ## ### etc.
30        if trimmed.starts_with('#') {
31            let level = trimmed.chars().take_while(|&c| c == '#').count();
32            if level > 0 && level <= 6 {
33                // Verify there's a space after the hashes (proper markdown)
34                if trimmed.len() > level && trimmed.chars().nth(level) == Some(' ') {
35                    return Some(level.min(255) as u8);
36                }
37            }
38        }
39
40        // ALL CAPS detection (likely chapter/section heading)
41        if trimmed.len() >= 5 && Self::is_all_caps(trimmed) {
42            // Shorter ALL CAPS lines are more likely to be high-level headings
43            let level = if trimmed.len() < 20 {
44                1 // Short ALL CAPS = chapter
45            } else if trimmed.len() < 40 {
46                2 // Medium ALL CAPS = section
47            } else {
48                3 // Long ALL CAPS = subsection
49            };
50            return Some(level);
51        }
52
53        // Numbered heading detection: "1.", "1.1", "Chapter 1", etc.
54        if let Some(section_num) = Self::extract_section_number(trimmed) {
55            let level = section_num.depth();
56            if level > 0 && level <= 6 {
57                return Some(level);
58            }
59        }
60
61        None
62    }
63
64    /// Check if text is ALL CAPS (ignoring non-alphabetic characters)
65    fn is_all_caps(text: &str) -> bool {
66        let letters: String = text.chars().filter(|c| c.is_alphabetic()).collect();
67        !letters.is_empty() && letters.chars().all(|c| c.is_uppercase())
68    }
69
70    /// Extract section number from heading text
71    ///
72    /// Recognizes patterns like:
73    /// - "1.", "2.", "3."
74    /// - "1.1", "1.2.3", "2.3.4.5"
75    /// - "Chapter 1", "Section 2.1"
76    /// - "I.", "II.", "III." (Roman numerals)
77    /// - "A.", "B.", "C." (Alphabetic)
78    pub fn extract_section_number(text: &str) -> Option<SectionNumber> {
79        static DECIMAL_REGEX: OnceLock<Regex> = OnceLock::new();
80        static ROMAN_REGEX: OnceLock<Regex> = OnceLock::new();
81        static ALPHA_REGEX: OnceLock<Regex> = OnceLock::new();
82        static CHAPTER_REGEX: OnceLock<Regex> = OnceLock::new();
83
84        let decimal_re = DECIMAL_REGEX.get_or_init(|| {
85            Regex::new(r"^(\d+(?:\.\d+)*)\s*[.:]?\s").expect("static regex literal")
86        });
87
88        let roman_re = ROMAN_REGEX
89            .get_or_init(|| Regex::new(r"^([IVXLCDM]+)[.:]?\s").expect("static regex literal"));
90
91        let alpha_re = ALPHA_REGEX
92            .get_or_init(|| Regex::new(r"^([A-Z])[.:]?\s").expect("static regex literal"));
93
94        let chapter_re = CHAPTER_REGEX.get_or_init(|| {
95            Regex::new(r"(?i)^(chapter|section|part|appendix)\s+(\d+|[IVXLCDM]+|[A-Z])\b")
96                .expect("static regex literal")
97        });
98
99        // Try decimal numbering (most common)
100        if let Some(caps) = decimal_re.captures(text) {
101            if let Some(num_str) = caps.get(1) {
102                let components: Vec<usize> = num_str
103                    .as_str()
104                    .split('.')
105                    .filter_map(|s| s.parse().ok())
106                    .collect();
107
108                if !components.is_empty() {
109                    return Some(SectionNumber {
110                        raw: num_str.as_str().to_string(),
111                        format: SectionNumberFormat::Decimal,
112                        components,
113                    });
114                }
115            }
116        }
117
118        // Try chapter/section keywords
119        if let Some(caps) = chapter_re.captures(text) {
120            if let Some(num_match) = caps.get(2) {
121                let num_str = num_match.as_str();
122
123                // Try parsing as decimal
124                if let Ok(num) = num_str.parse::<usize>() {
125                    return Some(SectionNumber {
126                        raw: format!("{} {}", caps.get(1)?.as_str(), num_str),
127                        format: SectionNumberFormat::Mixed,
128                        components: vec![num],
129                    });
130                }
131
132                // Try parsing as Roman numeral
133                if let Some(num) = Self::parse_roman_numeral(num_str) {
134                    return Some(SectionNumber {
135                        raw: format!("{} {}", caps.get(1)?.as_str(), num_str),
136                        format: SectionNumberFormat::Mixed,
137                        components: vec![num],
138                    });
139                }
140
141                // Try parsing as alphabetic
142                if num_str.len() == 1 {
143                    if let Some(ch) = num_str.chars().next() {
144                        if ch.is_ascii_uppercase() {
145                            let num = (ch as usize) - ('A' as usize) + 1;
146                            return Some(SectionNumber {
147                                raw: format!("{} {}", caps.get(1)?.as_str(), num_str),
148                                format: SectionNumberFormat::Mixed,
149                                components: vec![num],
150                            });
151                        }
152                    }
153                }
154            }
155        }
156
157        // Try Roman numerals
158        if let Some(caps) = roman_re.captures(text) {
159            if let Some(roman_str) = caps.get(1) {
160                if let Some(num) = Self::parse_roman_numeral(roman_str.as_str()) {
161                    return Some(SectionNumber {
162                        raw: roman_str.as_str().to_string(),
163                        format: SectionNumberFormat::Roman,
164                        components: vec![num],
165                    });
166                }
167            }
168        }
169
170        // Try alphabetic
171        if let Some(caps) = alpha_re.captures(text) {
172            if let Some(letter) = caps.get(1) {
173                let ch = letter.as_str().chars().next()?;
174                let num = (ch as usize) - ('A' as usize) + 1;
175                return Some(SectionNumber {
176                    raw: letter.as_str().to_string(),
177                    format: SectionNumberFormat::Alphabetic,
178                    components: vec![num],
179                });
180            }
181        }
182
183        None
184    }
185
186    /// Parse Roman numeral to decimal
187    fn parse_roman_numeral(roman: &str) -> Option<usize> {
188        let mut result = 0;
189        let mut prev_value = 0;
190
191        for ch in roman.chars().rev() {
192            let value = match ch {
193                'I' => 1,
194                'V' => 5,
195                'X' => 10,
196                'L' => 50,
197                'C' => 100,
198                'D' => 500,
199                'M' => 1000,
200                _ => return None,
201            };
202
203            if value < prev_value {
204                result -= value;
205            } else {
206                result += value;
207            }
208            prev_value = value;
209        }
210
211        Some(result)
212    }
213
214    /// Find positions of blank lines (paragraph separators)
215    ///
216    /// Returns character offsets where blank lines occur
217    pub fn find_blank_line_positions(text: &str) -> Vec<usize> {
218        let mut positions = Vec::new();
219        let mut current_offset = 0;
220        let mut prev_was_blank = false;
221
222        for line in text.lines() {
223            let is_blank = line.trim().is_empty();
224
225            if is_blank && !prev_was_blank {
226                positions.push(current_offset);
227            }
228
229            prev_was_blank = is_blank;
230            current_offset += line.len() + 1; // +1 for newline
231        }
232
233        positions
234    }
235
236    /// Calculate statistics about text
237    pub fn calculate_statistics(text: &str) -> TextStats {
238        let words: Vec<&str> = text.split_whitespace().collect();
239        let word_count = words.len();
240
241        // Count sentences (simple heuristic)
242        let sentence_endings = ['.', '!', '?'];
243        let sentence_count = text
244            .chars()
245            .filter(|c| sentence_endings.contains(c))
246            .count()
247            .max(1); // At least 1 sentence
248
249        let avg_sentence_length = if sentence_count > 0 {
250            word_count as f32 / sentence_count as f32
251        } else {
252            0.0
253        };
254
255        // Count paragraphs (separated by blank lines)
256        let paragraph_count = text
257            .split("\n\n")
258            .filter(|p| !p.trim().is_empty())
259            .count()
260            .max(1); // At least 1 paragraph
261
262        let char_count = text.chars().count();
263
264        TextStats {
265            word_count,
266            sentence_count,
267            paragraph_count,
268            char_count,
269            avg_sentence_length,
270            avg_word_length: if word_count > 0 {
271                char_count as f32 / word_count as f32
272            } else {
273                0.0
274            },
275        }
276    }
277
278    /// Detect if a line is underlined (for plain text heading detection)
279    ///
280    /// Checks if next_line consists entirely of underline characters (=, -, _)
281    pub fn is_underline(line: &str) -> Option<u8> {
282        let trimmed = line.trim();
283
284        if trimmed.len() < 3 {
285            return None;
286        }
287
288        // Check if line is all underline characters
289        if trimmed.chars().all(|c| c == '=') {
290            Some(1) // === is level 1 (chapter)
291        } else if trimmed.chars().all(|c| c == '-') {
292            Some(2) // --- is level 2 (section)
293        } else if trimmed.chars().all(|c| c == '_') {
294            Some(3) // ___ is level 3 (subsection)
295        } else {
296            None
297        }
298    }
299
300    /// Extract potential title from text (first non-empty line or ALL CAPS line)
301    pub fn extract_title(text: &str) -> Option<String> {
302        for line in text.lines().take(10) {
303            // Check first 10 lines
304            let trimmed = line.trim();
305
306            if trimmed.is_empty() {
307                continue;
308            }
309
310            // If it's ALL CAPS and reasonably short, it's likely the title
311            if Self::is_all_caps(trimmed) && trimmed.len() < 100 {
312                return Some(trimmed.to_string());
313            }
314
315            // If it looks like a heading, use it
316            if Self::detect_heading_level(line).is_some() {
317                // Strip heading markers
318                let clean = trimmed
319                    .trim_start_matches('#')
320                    .trim_start_matches(|c: char| c.is_numeric() || c == '.')
321                    .trim();
322                if !clean.is_empty() {
323                    return Some(clean.to_string());
324                }
325            }
326
327            // Otherwise, first non-empty line
328            if trimmed.len() > 5 {
329                return Some(trimmed.to_string());
330            }
331        }
332
333        None
334    }
335}
336
/// Text statistics.
///
/// Plain data holder for counts and averages describing a piece of text.
/// `Default` yields all-zero statistics and `PartialEq` allows two results
/// to be compared directly.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct TextStats {
    /// Total word count
    pub word_count: usize,
    /// Total sentence count
    pub sentence_count: usize,
    /// Total paragraph count
    pub paragraph_count: usize,
    /// Total character count
    pub char_count: usize,
    /// Average words per sentence
    pub avg_sentence_length: f32,
    /// Average characters per word
    pub avg_word_length: f32,
}
353
#[cfg(test)]
mod tests {
    //! Unit tests for `TextAnalyzer`. Assertions pin exact values wherever
    //! the expected result is unambiguous.

    use super::*;

    #[test]
    fn test_markdown_heading_detection() {
        assert_eq!(TextAnalyzer::detect_heading_level("# Chapter 1"), Some(1));
        assert_eq!(
            TextAnalyzer::detect_heading_level("## Section 1.1"),
            Some(2)
        );
        assert_eq!(
            TextAnalyzer::detect_heading_level("### Subsection 1.1.1"),
            Some(3)
        );
        assert_eq!(TextAnalyzer::detect_heading_level("#### Level 4"), Some(4));
        assert_eq!(TextAnalyzer::detect_heading_level("#No space"), None);
    }

    #[test]
    fn test_blank_line_is_not_a_heading() {
        assert_eq!(TextAnalyzer::detect_heading_level(""), None);
        assert_eq!(TextAnalyzer::detect_heading_level("   \t "), None);
    }

    #[test]
    fn test_all_caps_detection() {
        assert_eq!(TextAnalyzer::detect_heading_level("CHAPTER ONE"), Some(1));
        assert_eq!(
            TextAnalyzer::detect_heading_level("INTRODUCTION TO MACHINE LEARNING"),
            Some(2)
        );
        assert_eq!(
            TextAnalyzer::detect_heading_level("This is not ALL CAPS"),
            None
        );
    }

    #[test]
    fn test_section_number_extraction() {
        // Decimal numbering
        let sec1 = TextAnalyzer::extract_section_number("1. Introduction").unwrap();
        assert_eq!(sec1.components, vec![1]);
        assert_eq!(sec1.format, SectionNumberFormat::Decimal);
        assert_eq!(sec1.raw, "1");

        let sec2 = TextAnalyzer::extract_section_number("1.2.3 Subsection").unwrap();
        assert_eq!(sec2.components, vec![1, 2, 3]);
        assert_eq!(sec2.format, SectionNumberFormat::Decimal);

        // Chapter/Section keywords
        let sec3 = TextAnalyzer::extract_section_number("Chapter 1 Introduction").unwrap();
        assert_eq!(sec3.components, vec![1]);
        assert_eq!(sec3.format, SectionNumberFormat::Mixed);
        assert_eq!(sec3.raw, "Chapter 1");

        // Roman numerals
        let sec4 = TextAnalyzer::extract_section_number("I. First Chapter").unwrap();
        assert_eq!(sec4.components, vec![1]);
        assert_eq!(sec4.format, SectionNumberFormat::Roman);

        let sec5 = TextAnalyzer::extract_section_number("IV. Fourth Chapter").unwrap();
        assert_eq!(sec5.components, vec![4]);
        assert_eq!(sec5.format, SectionNumberFormat::Roman);

        // Plain prose carries no section number
        assert!(TextAnalyzer::extract_section_number("This is a sentence.").is_none());
    }

    #[test]
    fn test_roman_numeral_parsing() {
        assert_eq!(TextAnalyzer::parse_roman_numeral("I"), Some(1));
        assert_eq!(TextAnalyzer::parse_roman_numeral("IV"), Some(4));
        assert_eq!(TextAnalyzer::parse_roman_numeral("IX"), Some(9));
        assert_eq!(TextAnalyzer::parse_roman_numeral("XL"), Some(40));
        assert_eq!(TextAnalyzer::parse_roman_numeral("MCMXCIV"), Some(1994));
        assert_eq!(TextAnalyzer::parse_roman_numeral("ABC"), None);
    }

    #[test]
    fn test_blank_line_detection() {
        let text = "Line 1\n\nLine 2\n\n\nLine 3";
        let positions = TextAnalyzer::find_blank_line_positions(text);
        // Only the first blank line of each run is reported.
        assert_eq!(positions, vec![7, 15]);

        assert!(TextAnalyzer::find_blank_line_positions("").is_empty());
    }

    #[test]
    fn test_text_statistics() {
        let text = "This is a test. It has two sentences.";
        let stats = TextAnalyzer::calculate_statistics(text);

        assert_eq!(stats.sentence_count, 2);
        assert_eq!(stats.word_count, 8);
        assert_eq!(stats.paragraph_count, 1);
        assert_eq!(stats.char_count, text.chars().count());
        assert!((stats.avg_sentence_length - 4.0).abs() < f32::EPSILON);
        assert!(stats.avg_word_length > 0.0);
    }

    #[test]
    fn test_underline_detection() {
        assert_eq!(TextAnalyzer::is_underline("====="), Some(1));
        assert_eq!(TextAnalyzer::is_underline("-----"), Some(2));
        assert_eq!(TextAnalyzer::is_underline("_____"), Some(3));
        assert_eq!(TextAnalyzer::is_underline("===---"), None);
        assert_eq!(TextAnalyzer::is_underline("=="), None);
        assert_eq!(TextAnalyzer::is_underline(""), None);
    }

    #[test]
    fn test_title_extraction() {
        let text = "# Main Title\n\nSome content here.";
        let title = TextAnalyzer::extract_title(text);
        assert_eq!(title, Some("Main Title".to_string()));

        let text2 = "INTRODUCTION\n\nThis is the intro.";
        let title2 = TextAnalyzer::extract_title(text2);
        assert_eq!(title2, Some("INTRODUCTION".to_string()));

        assert_eq!(TextAnalyzer::extract_title(""), None);
    }
}
graphrag_core/text/analysis.rs

graphrag_core/text/
analysis.rs