Skip to main content

edgeparse_core/utils/
xref_index.rs

1//! Cross-reference builder — creates an index mapping element IDs to
2//! their locations and relationships across the document.
3
4use std::collections::HashMap;
5
6use crate::models::content::ContentElement;
7
/// A cross-reference entry describing a single document element.
///
/// Entries are plain data: every field is public and the type can be cloned
/// and compared for equality.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct XRefEntry {
    /// Element type tag (the name of the element's variant, e.g. `"Heading"`).
    pub element_type: String,
    /// Index of the element. When the entry is produced by
    /// `CrossReferenceIndex::from_pages`, this is the element's global,
    /// document-wide index (the same value used as its ID in the index),
    /// not its position within a single page.
    pub element_index: usize,
    /// Page number (1-based, if known).
    pub page_number: Option<u32>,
    /// Heading text (only set for heading elements with non-blank text).
    pub heading_text: Option<String>,
    /// Heading level (only set for heading elements, and only if the
    /// element itself carries a level).
    pub heading_level: Option<u32>,
}
22
23/// A cross-reference index for the entire document.
24#[derive(Debug, Clone)]
25pub struct CrossReferenceIndex {
26    /// Map from element ID to its cross-reference entry.
27    entries: HashMap<usize, XRefEntry>,
28    /// Map from heading text (lowercase) to element ID.
29    heading_index: HashMap<String, Vec<usize>>,
30    /// Map from page number to element IDs on that page.
31    page_index: HashMap<u32, Vec<usize>>,
32}
33
34impl CrossReferenceIndex {
35    /// Build a cross-reference index from document pages.
36    pub fn from_pages(pages: &[Vec<ContentElement>]) -> Self {
37        let mut entries = HashMap::new();
38        let mut heading_index: HashMap<String, Vec<usize>> = HashMap::new();
39        let mut page_index: HashMap<u32, Vec<usize>> = HashMap::new();
40        let mut global_idx = 0usize;
41
42        for (page_idx, page) in pages.iter().enumerate() {
43            let page_num = (page_idx + 1) as u32;
44
45            for elem in page {
46                let entry = build_entry(elem, global_idx, page_num);
47
48                if let Some(ref heading) = entry.heading_text {
49                    heading_index
50                        .entry(heading.to_lowercase())
51                        .or_default()
52                        .push(global_idx);
53                }
54
55                page_index.entry(page_num).or_default().push(global_idx);
56                entries.insert(global_idx, entry);
57                global_idx += 1;
58            }
59        }
60
61        Self {
62            entries,
63            heading_index,
64            page_index,
65        }
66    }
67
68    /// Look up an element by its global index.
69    pub fn get(&self, id: usize) -> Option<&XRefEntry> {
70        self.entries.get(&id)
71    }
72
73    /// Find elements by heading text (case-insensitive substring match).
74    pub fn find_by_heading(&self, query: &str) -> Vec<(usize, &XRefEntry)> {
75        let query_lower = query.to_lowercase();
76        self.heading_index
77            .iter()
78            .filter(|(key, _)| key.contains(&query_lower))
79            .flat_map(|(_, ids)| {
80                ids.iter()
81                    .filter_map(|id| self.entries.get(id).map(|e| (*id, e)))
82            })
83            .collect()
84    }
85
86    /// Get all elements on a given page.
87    pub fn elements_on_page(&self, page_number: u32) -> Vec<(usize, &XRefEntry)> {
88        self.page_index
89            .get(&page_number)
90            .map(|ids| {
91                ids.iter()
92                    .filter_map(|id| self.entries.get(id).map(|e| (*id, e)))
93                    .collect()
94            })
95            .unwrap_or_default()
96    }
97
98    /// Total number of indexed elements.
99    pub fn len(&self) -> usize {
100        self.entries.len()
101    }
102
103    /// Whether the index is empty.
104    pub fn is_empty(&self) -> bool {
105        self.entries.is_empty()
106    }
107
108    /// Number of headings indexed.
109    pub fn heading_count(&self) -> usize {
110        self.heading_index.values().map(|v| v.len()).sum()
111    }
112
113    /// Number of pages with indexed elements.
114    pub fn page_count(&self) -> usize {
115        self.page_index.len()
116    }
117}
118
119fn build_entry(elem: &ContentElement, index: usize, page_number: u32) -> XRefEntry {
120    let element_type = element_type_name(elem).to_string();
121    let (heading_text, heading_level) = extract_heading_info(elem);
122
123    XRefEntry {
124        element_type,
125        element_index: index,
126        page_number: Some(page_number),
127        heading_text,
128        heading_level,
129    }
130}
131
132fn extract_heading_info(elem: &ContentElement) -> (Option<String>, Option<u32>) {
133    match elem {
134        ContentElement::Heading(h) => {
135            let text = h.base.base.value().trim().to_string();
136            let level = h.heading_level;
137            (if text.is_empty() { None } else { Some(text) }, level)
138        }
139        ContentElement::NumberHeading(nh) => {
140            let text = nh.base.base.base.value().trim().to_string();
141            let level = nh.base.heading_level;
142            (if text.is_empty() { None } else { Some(text) }, level)
143        }
144        _ => (None, None),
145    }
146}
147
148fn element_type_name(elem: &ContentElement) -> &'static str {
149    match elem {
150        ContentElement::TextChunk(_) => "TextChunk",
151        ContentElement::TextLine(_) => "TextLine",
152        ContentElement::TextBlock(_) => "TextBlock",
153        ContentElement::Paragraph(_) => "Paragraph",
154        ContentElement::Heading(_) => "Heading",
155        ContentElement::NumberHeading(_) => "NumberHeading",
156        ContentElement::Table(_) => "Table",
157        ContentElement::Figure(_) => "Figure",
158        ContentElement::Formula(_) => "Formula",
159        ContentElement::Picture(_) => "Picture",
160        ContentElement::Caption(_) => "Caption",
161        ContentElement::HeaderFooter(_) => "HeaderFooter",
162        ContentElement::Image(_) => "Image",
163        ContentElement::Line(_) => "Line",
164        ContentElement::LineArt(_) => "LineArt",
165        ContentElement::List(_) => "List",
166        ContentElement::TableBorder(_) => "TableBorder",
167    }
168}
169
#[cfg(test)]
mod tests {
    use super::*;
    use crate::models::bbox::BoundingBox;
    use crate::models::chunks::TextChunk;
    use crate::models::enums::{PdfLayer, TextFormat, TextType};

    /// Fixture: a `TextChunk` element whose only varying part is its text.
    fn make_text(text: &str) -> ContentElement {
        let chunk = TextChunk {
            value: String::from(text),
            bbox: BoundingBox::new(None, 0.0, 0.0, 100.0, 10.0),
            font_name: String::from("F"),
            font_size: 12.0,
            font_weight: 400.0,
            italic_angle: 0.0,
            font_color: String::from("#000"),
            contrast_ratio: 21.0,
            symbol_ends: Vec::new(),
            text_format: TextFormat::Normal,
            text_type: TextType::Regular,
            pdf_layer: PdfLayer::Main,
            ocg_visible: true,
            index: None,
            page_number: Some(1),
            level: None,
            mcid: None,
        };
        ContentElement::TextChunk(chunk)
    }

    /// Three elements spread over two pages are all indexed.
    #[test]
    fn test_build_index() {
        let first_page = vec![make_text("Hello"), make_text("World")];
        let second_page = vec![make_text("Page 2")];
        let xref = CrossReferenceIndex::from_pages(&[first_page, second_page]);

        assert_eq!(xref.len(), 3);
        assert_eq!(xref.page_count(), 2);
    }

    /// The first element gets ID 0 and is recorded on page 1.
    #[test]
    fn test_get_element() {
        let xref = CrossReferenceIndex::from_pages(&[vec![make_text("Test")]]);
        let first = xref.get(0).unwrap();

        assert_eq!(first.element_type, "TextChunk");
        assert_eq!(first.page_number, Some(1));
    }

    /// Per-page lookups return only that page's elements; unknown pages are empty.
    #[test]
    fn test_elements_on_page() {
        let document = vec![
            vec![make_text("A"), make_text("B")],
            vec![make_text("C")],
        ];
        let xref = CrossReferenceIndex::from_pages(&document);

        let counts: Vec<usize> = (1..=3)
            .map(|page| xref.elements_on_page(page).len())
            .collect();
        assert_eq!(counts, vec![2, 1, 0]);
    }

    /// An index built from no pages has no elements and no headings.
    #[test]
    fn test_empty_index() {
        let no_pages: Vec<Vec<ContentElement>> = Vec::new();
        let xref = CrossReferenceIndex::from_pages(&no_pages);

        assert!(xref.is_empty());
        assert_eq!(xref.heading_count(), 0);
    }
}
231        let index = CrossReferenceIndex::from_pages(&pages);
232        assert!(index.is_empty());
233        assert_eq!(index.heading_count(), 0);
234    }
235}