graphrag_core/text/parsers/
plaintext.rs1use crate::text::{
4 document_structure::{DocumentStructure, Heading, HeadingHierarchy, Section},
5 layout_parser::LayoutParser,
6 TextAnalyzer,
7};
8
/// Layout parser for plain-text documents.
///
/// This is a unit type with no fields, so it holds no configuration or
/// per-document state; all behaviour lives in its inherent methods and in its
/// [`LayoutParser`] implementation.
#[derive(Debug, Clone, Copy)]
pub struct PlainTextLayoutParser;
11
12impl PlainTextLayoutParser {
13 pub fn new() -> Self {
15 Self
16 }
17
18 fn build_sections(&self, headings: &[Heading], content: &str) -> Vec<Section> {
20 let mut sections = Vec::new();
21
22 for (i, heading) in headings.iter().enumerate() {
23 let content_start = heading.end_offset;
24 let content_end = headings
25 .get(i + 1)
26 .map(|h| h.start_offset)
27 .unwrap_or(content.len());
28
29 sections.push(Section::new(heading.clone(), content_start, content_end));
30 }
31
32 sections
33 }
34
35 fn build_hierarchy(&self, sections: &mut [Section]) -> HeadingHierarchy {
37 let mut hierarchy = HeadingHierarchy::new();
38 let mut stack: Vec<usize> = Vec::new();
39
40 for idx in 0..sections.len() {
41 let section_level = sections[idx].heading.level;
42
43 while let Some(&parent_idx) = stack.last() {
44 if sections[parent_idx].heading.level < section_level {
45 break;
46 }
47 stack.pop();
48 }
49
50 if let Some(&parent_idx) = stack.last() {
51 sections[parent_idx].child_sections.push(idx);
52 sections[idx].parent_section = Some(parent_idx);
53 } else {
54 hierarchy.root_sections.push(idx);
55 }
56
57 stack.push(idx);
58 }
59
60 for (idx, section) in sections.iter().enumerate() {
62 let mut depth = 0;
63 let mut current = section.parent_section;
64 while let Some(parent_idx) = current {
65 depth += 1;
66 current = sections[parent_idx].parent_section;
67 }
68 hierarchy.depth_map.insert(idx, depth);
69 }
70
71 hierarchy
72 }
73}
74
75impl Default for PlainTextLayoutParser {
76 fn default() -> Self {
77 Self::new()
78 }
79}
80
81impl LayoutParser for PlainTextLayoutParser {
82 fn parse(&self, content: &str) -> DocumentStructure {
83 let mut headings = Vec::new();
84 let lines: Vec<&str> = content.lines().collect();
85 let mut current_offset = 0;
86
87 let mut i = 0;
88 while i < lines.len() {
89 let line = lines[i];
90 let trimmed = line.trim();
91
92 if trimmed.is_empty() {
94 current_offset += line.len() + 1;
95 i += 1;
96 continue;
97 }
98
99 let mut detected_level: Option<u8> = None;
100 let heading_text = trimmed.to_string();
101
102 if i + 1 < lines.len() {
104 let next_line = lines[i + 1].trim();
105 if let Some(level) = TextAnalyzer::is_underline(next_line) {
106 detected_level = Some(level);
107 i += 1;
109 current_offset += line.len() + 1;
110 current_offset += next_line.len() + 1;
111 }
112 }
113
114 if detected_level.is_none() {
116 if let Some(level) = TextAnalyzer::detect_heading_level(line) {
117 detected_level = Some(level);
118 }
119 }
120
121 if let Some(level) = detected_level {
123 let heading = Heading::new(
124 level,
125 heading_text,
126 current_offset,
127 current_offset + line.len(),
128 )
129 .with_line_number(i);
130
131 headings.push(heading);
132 }
133
134 if detected_level.is_none() {
135 current_offset += line.len() + 1;
136 }
137
138 i += 1;
139 }
140
141 let mut sections = self.build_sections(&headings, content);
142 let hierarchy = self.build_hierarchy(&mut sections);
143
144 DocumentStructure {
145 headings,
146 sections,
147 hierarchy,
148 }
149 }
150
151 fn supports_format(&self, format: &str) -> bool {
152 matches!(format.to_lowercase().as_str(), "text" | "txt" | "plain")
153 }
154
155 fn name(&self) -> &'static str {
156 "PlainTextLayoutParser"
157 }
158}
159
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs a freshly constructed parser over `content`.
    fn parse_plain(content: &str) -> DocumentStructure {
        PlainTextLayoutParser::new().parse(content)
    }

    /// Titles followed by `===` / `---` lines are picked up as headings, and
    /// the `===` one is reported at level 1 with its trimmed title text.
    #[test]
    fn test_underline_detection() {
        let structure = parse_plain(
            "Chapter One\n===========\n\nSome text\n\nSection 1.1\n-----------\n\nMore text",
        );

        assert!(structure.headings.len() >= 2);

        let first = &structure.headings[0];
        assert_eq!(first.level, 1);
        assert_eq!(first.text, "Chapter One");
    }

    /// Stand-alone upper-case lines are picked up as headings.
    #[test]
    fn test_all_caps_detection() {
        let structure =
            parse_plain("INTRODUCTION\n\nThis is the intro.\n\nBACKGROUND\n\nSome background.");

        assert!(structure.headings.len() >= 2);
        assert!(structure.headings[0].text.contains("INTRODUCTION"));
    }

    /// Numbered lines such as `1.` / `1.1` yield at least one heading.
    #[test]
    fn test_numbered_sections() {
        let structure =
            parse_plain("1. First Chapter\n\nText here.\n\n1.1 Subsection\n\nMore text.");

        assert!(!structure.headings.is_empty());
    }
}