Skip to main content

graphrag_core/text/
document_structure.rs

1//! Document structure representation for hierarchical parsing
2//!
3//! This module provides data structures to represent the hierarchical structure
4//! of documents, including headings, sections, and their relationships.
5
6use serde::{Deserialize, Serialize};
7use std::collections::HashMap;
8
9/// A heading in a document (e.g., chapter, section, subsection)
10#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
11pub struct Heading {
12    /// Hierarchical level (1 = chapter/h1, 2 = section/h2, 3 = subsection/h3, etc.)
13    pub level: u8,
14
15    /// Text content of the heading
16    pub text: String,
17
18    /// Starting character offset in the original document
19    pub start_offset: usize,
20
21    /// Ending character offset in the original document
22    pub end_offset: usize,
23
24    /// Line number in the document (if applicable)
25    pub line_number: usize,
26
27    /// Optional section number (e.g., "1.2.3", "Chapter 1")
28    pub section_number: Option<String>,
29}
30
31impl Heading {
32    /// Create a new heading
33    pub fn new(level: u8, text: String, start_offset: usize, end_offset: usize) -> Self {
34        Self {
35            level,
36            text,
37            start_offset,
38            end_offset,
39            line_number: 0,
40            section_number: None,
41        }
42    }
43
44    /// Create heading with line number
45    pub fn with_line_number(mut self, line_number: usize) -> Self {
46        self.line_number = line_number;
47        self
48    }
49
50    /// Create heading with section number
51    pub fn with_section_number(mut self, section_number: String) -> Self {
52        self.section_number = Some(section_number);
53        self
54    }
55
56    /// Get a display string for the heading
57    pub fn display_string(&self) -> String {
58        if let Some(ref num) = self.section_number {
59            format!("{} {}", num, self.text)
60        } else {
61            self.text.clone()
62        }
63    }
64}
65
66/// A section in a document, defined by a heading and its content range
67#[derive(Debug, Clone, Serialize, Deserialize)]
68pub struct Section {
69    /// The heading that starts this section
70    pub heading: Heading,
71
72    /// Starting offset of the section content (after the heading)
73    pub content_start: usize,
74
75    /// Ending offset of the section content (before next heading or end of document)
76    pub content_end: usize,
77
78    /// Index of parent section in the sections array (None if root level)
79    pub parent_section: Option<usize>,
80
81    /// Indices of child sections in the sections array
82    pub child_sections: Vec<usize>,
83}
84
85impl Section {
86    /// Create a new section
87    pub fn new(heading: Heading, content_start: usize, content_end: usize) -> Self {
88        Self {
89            heading,
90            content_start,
91            content_end,
92            parent_section: None,
93            child_sections: Vec::new(),
94        }
95    }
96
97    /// Get the length of the section content in characters
98    pub fn content_length(&self) -> usize {
99        self.content_end.saturating_sub(self.content_start)
100    }
101
102    /// Check if this section contains the given offset
103    pub fn contains_offset(&self, offset: usize) -> bool {
104        offset >= self.heading.start_offset && offset < self.content_end
105    }
106
107    /// Check if this section is a root section (no parent)
108    pub fn is_root(&self) -> bool {
109        self.parent_section.is_none()
110    }
111
112    /// Check if this section has children
113    pub fn has_children(&self) -> bool {
114        !self.child_sections.is_empty()
115    }
116}
117
118/// Hierarchical structure of a document
119#[derive(Debug, Clone, Serialize, Deserialize)]
120pub struct HeadingHierarchy {
121    /// Indices of root-level sections
122    pub root_sections: Vec<usize>,
123
124    /// Mapping from section index to depth (0 = root, 1 = child of root, etc.)
125    pub depth_map: HashMap<usize, usize>,
126}
127
128impl HeadingHierarchy {
129    /// Create a new empty hierarchy
130    pub fn new() -> Self {
131        Self {
132            root_sections: Vec::new(),
133            depth_map: HashMap::new(),
134        }
135    }
136
137    /// Get the depth of a section in the hierarchy
138    pub fn get_depth(&self, section_idx: usize) -> Option<usize> {
139        self.depth_map.get(&section_idx).copied()
140    }
141
142    /// Check if a section is at root level
143    pub fn is_root(&self, section_idx: usize) -> bool {
144        self.root_sections.contains(&section_idx)
145    }
146}
147
148impl Default for HeadingHierarchy {
149    fn default() -> Self {
150        Self::new()
151    }
152}
153
154/// Complete document structure with headings and sections
155#[derive(Debug, Clone, Serialize, Deserialize)]
156pub struct DocumentStructure {
157    /// All headings found in the document, in order of appearance
158    pub headings: Vec<Heading>,
159
160    /// All sections derived from headings
161    pub sections: Vec<Section>,
162
163    /// Hierarchical relationships between sections
164    pub hierarchy: HeadingHierarchy,
165}
166
167impl DocumentStructure {
168    /// Create a new empty document structure
169    pub fn new() -> Self {
170        Self {
171            headings: Vec::new(),
172            sections: Vec::new(),
173            hierarchy: HeadingHierarchy::new(),
174        }
175    }
176
177    /// Find the section index that contains the given offset
178    pub fn find_section_containing_offset(&self, offset: usize) -> Option<usize> {
179        self.sections
180            .iter()
181            .position(|section| section.contains_offset(offset))
182    }
183
184    /// Get the heading path from root to the given section
185    ///
186    /// Returns a vector of heading texts in hierarchical order
187    /// Example: ["Chapter 1", "Section 1.1", "Subsection 1.1.1"]
188    pub fn get_heading_path(&self, section_idx: usize) -> Vec<String> {
189        let mut path = Vec::new();
190        let mut current_idx = Some(section_idx);
191
192        // Traverse up to root, collecting headings
193        while let Some(idx) = current_idx {
194            if idx < self.sections.len() {
195                let section = &self.sections[idx];
196                path.push(section.heading.display_string());
197                current_idx = section.parent_section;
198            } else {
199                break;
200            }
201        }
202
203        // Reverse to get root-to-leaf order
204        path.reverse();
205        path
206    }
207
208    /// Get all sections at a specific hierarchical level
209    pub fn get_sections_at_level(&self, level: u8) -> Vec<&Section> {
210        self.sections
211            .iter()
212            .filter(|s| s.heading.level == level)
213            .collect()
214    }
215
216    /// Get the total number of sections
217    pub fn section_count(&self) -> usize {
218        self.sections.len()
219    }
220
221    /// Get the maximum depth of the hierarchy
222    pub fn max_depth(&self) -> usize {
223        self.hierarchy
224            .depth_map
225            .values()
226            .max()
227            .copied()
228            .unwrap_or(0)
229    }
230
231    /// Check if the document has any structure
232    pub fn has_structure(&self) -> bool {
233        !self.headings.is_empty()
234    }
235
236    /// Get statistics about the document structure
237    pub fn get_statistics(&self) -> StructureStatistics {
238        let mut level_counts: HashMap<u8, usize> = HashMap::new();
239        for heading in &self.headings {
240            *level_counts.entry(heading.level).or_insert(0) += 1;
241        }
242
243        StructureStatistics {
244            total_headings: self.headings.len(),
245            total_sections: self.sections.len(),
246            max_depth: self.max_depth(),
247            level_counts,
248            root_sections: self.hierarchy.root_sections.len(),
249        }
250    }
251}
252
253impl Default for DocumentStructure {
254    fn default() -> Self {
255        Self::new()
256    }
257}
258
259/// Statistics about document structure
260#[derive(Debug, Clone, Serialize, Deserialize)]
261pub struct StructureStatistics {
262    /// Total number of headings
263    pub total_headings: usize,
264
265    /// Total number of sections
266    pub total_sections: usize,
267
268    /// Maximum depth of the hierarchy
269    pub max_depth: usize,
270
271    /// Count of headings at each level
272    pub level_counts: HashMap<u8, usize>,
273
274    /// Number of root-level sections
275    pub root_sections: usize,
276}
277
278impl StructureStatistics {
279    /// Print a human-readable summary of the statistics
280    pub fn print_summary(&self) {
281        println!("Document Structure Statistics:");
282        println!("  Total headings: {}", self.total_headings);
283        println!("  Total sections: {}", self.total_sections);
284        println!("  Max depth: {}", self.max_depth);
285        println!("  Root sections: {}", self.root_sections);
286        println!("  Headings by level:");
287        let mut levels: Vec<_> = self.level_counts.iter().collect();
288        levels.sort_by_key(|(level, _)| *level);
289        for (level, count) in levels {
290            println!("    Level {}: {} headings", level, count);
291        }
292    }
293}
294
295/// Section numbering format (e.g., "1.2.3", "Chapter 1", "I.A.1")
296#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
297pub enum SectionNumberFormat {
298    /// Numeric: 1, 2, 3
299    Numeric,
300    /// Decimal: 1.1, 1.2, 2.1
301    Decimal,
302    /// Roman: I, II, III
303    Roman,
304    /// Alphabetic: A, B, C
305    Alphabetic,
306    /// Mixed: Chapter 1, Section A, etc.
307    Mixed,
308}
309
310/// Parsed section number with format information
311#[derive(Debug, Clone, Serialize, Deserialize)]
312pub struct SectionNumber {
313    /// Original string representation
314    pub raw: String,
315
316    /// Detected format
317    pub format: SectionNumberFormat,
318
319    /// Numeric components (e.g., [1, 2, 3] for "1.2.3")
320    pub components: Vec<usize>,
321}
322
323impl SectionNumber {
324    /// Get the depth of the section number (number of components)
325    pub fn depth(&self) -> u8 {
326        self.components.len().min(255) as u8
327    }
328
329    /// Check if this section number is deeper than another
330    pub fn is_deeper_than(&self, other: &SectionNumber) -> bool {
331        self.depth() > other.depth()
332    }
333}
334
335#[cfg(test)]
336mod tests {
337    use super::*;
338
339    #[test]
340    fn test_heading_creation() {
341        let heading = Heading::new(1, "Chapter 1".to_string(), 0, 9)
342            .with_line_number(5)
343            .with_section_number("1".to_string());
344
345        assert_eq!(heading.level, 1);
346        assert_eq!(heading.text, "Chapter 1");
347        assert_eq!(heading.line_number, 5);
348        assert_eq!(heading.display_string(), "1 Chapter 1");
349    }
350
351    #[test]
352    fn test_section_contains_offset() {
353        let heading = Heading::new(1, "Test".to_string(), 0, 10);
354        let section = Section::new(heading, 10, 100);
355
356        assert!(section.contains_offset(0));
357        assert!(section.contains_offset(50));
358        assert!(!section.contains_offset(100));
359        assert!(!section.contains_offset(150));
360    }
361
362    #[test]
363    fn test_document_structure() {
364        let mut structure = DocumentStructure::new();
365
366        let h1 = Heading::new(1, "Chapter 1".to_string(), 0, 9);
367        let h2 = Heading::new(2, "Section 1.1".to_string(), 50, 61);
368
369        structure.headings.push(h1.clone());
370        structure.headings.push(h2.clone());
371
372        let s1 = Section::new(h1, 10, 50);
373        let mut s2 = Section::new(h2, 62, 100);
374        s2.parent_section = Some(0);
375
376        structure.sections.push(s1);
377        structure.sections.push(s2);
378        structure.hierarchy.root_sections.push(0);
379
380        assert_eq!(structure.section_count(), 2);
381        assert!(structure.has_structure());
382        assert_eq!(structure.find_section_containing_offset(25), Some(0));
383        assert_eq!(structure.find_section_containing_offset(75), Some(1));
384    }
385
386    #[test]
387    fn test_heading_path() {
388        let mut structure = DocumentStructure::new();
389
390        let h1 = Heading::new(1, "Chapter 1".to_string(), 0, 9);
391        let h2 = Heading::new(2, "Section 1.1".to_string(), 50, 61);
392        let h3 = Heading::new(3, "Subsection 1.1.1".to_string(), 100, 116);
393
394        structure.headings.push(h1.clone());
395        structure.headings.push(h2.clone());
396        structure.headings.push(h3.clone());
397
398        let s1 = Section::new(h1, 10, 50);
399        let mut s2 = Section::new(h2, 62, 100);
400        s2.parent_section = Some(0);
401        let mut s3 = Section::new(h3, 117, 200);
402        s3.parent_section = Some(1);
403
404        structure.sections.push(s1);
405        structure.sections.push(s2);
406        structure.sections.push(s3);
407
408        let path = structure.get_heading_path(2);
409        assert_eq!(path.len(), 3);
410        assert_eq!(path[0], "Chapter 1");
411        assert_eq!(path[1], "Section 1.1");
412        assert_eq!(path[2], "Subsection 1.1.1");
413    }
414
415    #[test]
416    fn test_structure_statistics() {
417        let mut structure = DocumentStructure::new();
418
419        structure.headings.push(Heading::new(1, "H1".to_string(), 0, 2));
420        structure.headings.push(Heading::new(2, "H2".to_string(), 10, 12));
421        structure.headings.push(Heading::new(2, "H2b".to_string(), 20, 23));
422
423        let stats = structure.get_statistics();
424        assert_eq!(stats.total_headings, 3);
425        assert_eq!(stats.level_counts.get(&1), Some(&1));
426        assert_eq!(stats.level_counts.get(&2), Some(&2));
427    }
428
429    #[test]
430    fn test_section_number_depth() {
431        let section_num = SectionNumber {
432            raw: "1.2.3".to_string(),
433            format: SectionNumberFormat::Decimal,
434            components: vec![1, 2, 3],
435        };
436
437        assert_eq!(section_num.depth(), 3);
438    }
439}