Skip to main content

graphrag_core/text/parsers/
plaintext.rs

1//! Plain text layout parser with heuristic-based detection
2
3use crate::text::{
4    document_structure::{DocumentStructure, Heading, HeadingHierarchy, Section},
5    layout_parser::LayoutParser,
6    TextAnalyzer,
7};
8
/// Parser for plain text documents using heuristics.
///
/// Plain text has no explicit markup, so headings are inferred from the
/// shape of the text (underlined lines and lines that look like titles).
/// The parser is a stateless unit struct, so it is cheap to copy and can be
/// printed for diagnostics.
#[derive(Debug, Clone, Copy)]
pub struct PlainTextLayoutParser;
11
12impl PlainTextLayoutParser {
13    /// Create new plain text parser
14    pub fn new() -> Self {
15        Self
16    }
17
18    /// Build sections from headings
19    fn build_sections(&self, headings: &[Heading], content: &str) -> Vec<Section> {
20        let mut sections = Vec::new();
21
22        for (i, heading) in headings.iter().enumerate() {
23            let content_start = heading.end_offset;
24            let content_end = headings
25                .get(i + 1)
26                .map(|h| h.start_offset)
27                .unwrap_or(content.len());
28
29            sections.push(Section::new(heading.clone(), content_start, content_end));
30        }
31
32        sections
33    }
34
35    /// Build hierarchy from sections
36    fn build_hierarchy(&self, sections: &mut [Section]) -> HeadingHierarchy {
37        let mut hierarchy = HeadingHierarchy::new();
38        let mut stack: Vec<usize> = Vec::new();
39
40        for idx in 0..sections.len() {
41            let section_level = sections[idx].heading.level;
42
43            while let Some(&parent_idx) = stack.last() {
44                if sections[parent_idx].heading.level < section_level {
45                    break;
46                }
47                stack.pop();
48            }
49
50            if let Some(&parent_idx) = stack.last() {
51                sections[parent_idx].child_sections.push(idx);
52                sections[idx].parent_section = Some(parent_idx);
53            } else {
54                hierarchy.root_sections.push(idx);
55            }
56
57            stack.push(idx);
58        }
59
60        // Build depth map
61        for (idx, section) in sections.iter().enumerate() {
62            let mut depth = 0;
63            let mut current = section.parent_section;
64            while let Some(parent_idx) = current {
65                depth += 1;
66                current = sections[parent_idx].parent_section;
67            }
68            hierarchy.depth_map.insert(idx, depth);
69        }
70
71        hierarchy
72    }
73}
74
75impl Default for PlainTextLayoutParser {
76    fn default() -> Self {
77        Self::new()
78    }
79}
80
81impl LayoutParser for PlainTextLayoutParser {
82    fn parse(&self, content: &str) -> DocumentStructure {
83        let mut headings = Vec::new();
84        let lines: Vec<&str> = content.lines().collect();
85        let mut current_offset = 0;
86
87        let mut i = 0;
88        while i < lines.len() {
89            let line = lines[i];
90            let trimmed = line.trim();
91
92            // Skip empty lines
93            if trimmed.is_empty() {
94                current_offset += line.len() + 1;
95                i += 1;
96                continue;
97            }
98
99            let mut detected_level: Option<u8> = None;
100            let heading_text = trimmed.to_string();
101
102            // Heuristic 1: Check if next line is underline
103            if i + 1 < lines.len() {
104                let next_line = lines[i + 1].trim();
105                if let Some(level) = TextAnalyzer::is_underline(next_line) {
106                    detected_level = Some(level);
107                    // Skip the underline in next iteration
108                    i += 1;
109                    current_offset += line.len() + 1;
110                    current_offset += next_line.len() + 1;
111                }
112            }
113
114            // Heuristic 2: ALL CAPS detection
115            if detected_level.is_none() {
116                if let Some(level) = TextAnalyzer::detect_heading_level(line) {
117                    detected_level = Some(level);
118                }
119            }
120
121            // If heading detected, add it
122            if let Some(level) = detected_level {
123                let heading = Heading::new(
124                    level,
125                    heading_text,
126                    current_offset,
127                    current_offset + line.len(),
128                )
129                .with_line_number(i);
130
131                headings.push(heading);
132            }
133
134            if detected_level.is_none() {
135                current_offset += line.len() + 1;
136            }
137
138            i += 1;
139        }
140
141        let mut sections = self.build_sections(&headings, content);
142        let hierarchy = self.build_hierarchy(&mut sections);
143
144        DocumentStructure {
145            headings,
146            sections,
147            hierarchy,
148        }
149    }
150
151    fn supports_format(&self, format: &str) -> bool {
152        matches!(format.to_lowercase().as_str(), "text" | "txt" | "plain")
153    }
154
155    fn name(&self) -> &'static str {
156        "PlainTextLayoutParser"
157    }
158}
159
#[cfg(test)]
mod tests {
    use super::*;

    /// Run a freshly constructed parser over `text`.
    fn parse_text(text: &str) -> DocumentStructure {
        PlainTextLayoutParser::new().parse(text)
    }

    /// Lines followed by `===` / `---` underlines are reported as headings.
    #[test]
    fn test_underline_detection() {
        let structure = parse_text(
            "Chapter One\n===========\n\nSome text\n\nSection 1.1\n-----------\n\nMore text",
        );

        assert!(structure.headings.len() >= 2);

        let first = &structure.headings[0];
        assert_eq!(first.level, 1);
        assert_eq!(first.text, "Chapter One");
    }

    /// Upper-case lines standing on their own are reported as headings.
    #[test]
    fn test_all_caps_detection() {
        let structure =
            parse_text("INTRODUCTION\n\nThis is the intro.\n\nBACKGROUND\n\nSome background.");

        assert!(structure.headings.len() >= 2);
        assert!(structure.headings[0].text.contains("INTRODUCTION"));
    }

    /// Numbered lines such as `1.` / `1.1` yield at least one heading.
    #[test]
    fn test_numbered_sections() {
        let structure =
            parse_text("1. First Chapter\n\nText here.\n\n1.1 Subsection\n\nMore text.");

        // Should detect numbered headings
        assert!(!structure.headings.is_empty());
    }
}