1use std::collections::HashMap;
5
6use crate::models::content::ContentElement;
7
/// Metadata recorded for one content element of an indexed document.
///
/// Entries compare by value, so two entries are equal exactly when every field matches.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct XRefEntry {
    /// Name of the `ContentElement` variant the entry was built from (e.g. `"Heading"`).
    pub element_type: String,
    /// Position of the element in document order, counted from zero across all pages.
    pub element_index: usize,
    /// 1-based number of the page that holds the element, when known.
    pub page_number: Option<u32>,
    /// Trimmed heading text; `None` for non-heading elements and for blank headings.
    pub heading_text: Option<String>,
    /// Heading level reported by the source element; `None` for non-heading elements.
    pub heading_level: Option<u32>,
}
22
23#[derive(Debug, Clone)]
25pub struct CrossReferenceIndex {
26 entries: HashMap<usize, XRefEntry>,
28 heading_index: HashMap<String, Vec<usize>>,
30 page_index: HashMap<u32, Vec<usize>>,
32}
33
34impl CrossReferenceIndex {
35 pub fn from_pages(pages: &[Vec<ContentElement>]) -> Self {
37 let mut entries = HashMap::new();
38 let mut heading_index: HashMap<String, Vec<usize>> = HashMap::new();
39 let mut page_index: HashMap<u32, Vec<usize>> = HashMap::new();
40 let mut global_idx = 0usize;
41
42 for (page_idx, page) in pages.iter().enumerate() {
43 let page_num = (page_idx + 1) as u32;
44
45 for elem in page {
46 let entry = build_entry(elem, global_idx, page_num);
47
48 if let Some(ref heading) = entry.heading_text {
49 heading_index
50 .entry(heading.to_lowercase())
51 .or_default()
52 .push(global_idx);
53 }
54
55 page_index.entry(page_num).or_default().push(global_idx);
56 entries.insert(global_idx, entry);
57 global_idx += 1;
58 }
59 }
60
61 Self {
62 entries,
63 heading_index,
64 page_index,
65 }
66 }
67
68 pub fn get(&self, id: usize) -> Option<&XRefEntry> {
70 self.entries.get(&id)
71 }
72
73 pub fn find_by_heading(&self, query: &str) -> Vec<(usize, &XRefEntry)> {
75 let query_lower = query.to_lowercase();
76 self.heading_index
77 .iter()
78 .filter(|(key, _)| key.contains(&query_lower))
79 .flat_map(|(_, ids)| {
80 ids.iter()
81 .filter_map(|id| self.entries.get(id).map(|e| (*id, e)))
82 })
83 .collect()
84 }
85
86 pub fn elements_on_page(&self, page_number: u32) -> Vec<(usize, &XRefEntry)> {
88 self.page_index
89 .get(&page_number)
90 .map(|ids| {
91 ids.iter()
92 .filter_map(|id| self.entries.get(id).map(|e| (*id, e)))
93 .collect()
94 })
95 .unwrap_or_default()
96 }
97
98 pub fn len(&self) -> usize {
100 self.entries.len()
101 }
102
103 pub fn is_empty(&self) -> bool {
105 self.entries.is_empty()
106 }
107
108 pub fn heading_count(&self) -> usize {
110 self.heading_index.values().map(|v| v.len()).sum()
111 }
112
113 pub fn page_count(&self) -> usize {
115 self.page_index.len()
116 }
117}
118
119fn build_entry(elem: &ContentElement, index: usize, page_number: u32) -> XRefEntry {
120 let element_type = element_type_name(elem).to_string();
121 let (heading_text, heading_level) = extract_heading_info(elem);
122
123 XRefEntry {
124 element_type,
125 element_index: index,
126 page_number: Some(page_number),
127 heading_text,
128 heading_level,
129 }
130}
131
132fn extract_heading_info(elem: &ContentElement) -> (Option<String>, Option<u32>) {
133 match elem {
134 ContentElement::Heading(h) => {
135 let text = h.base.base.value().trim().to_string();
136 let level = h.heading_level;
137 (if text.is_empty() { None } else { Some(text) }, level)
138 }
139 ContentElement::NumberHeading(nh) => {
140 let text = nh.base.base.base.value().trim().to_string();
141 let level = nh.base.heading_level;
142 (if text.is_empty() { None } else { Some(text) }, level)
143 }
144 _ => (None, None),
145 }
146}
147
148fn element_type_name(elem: &ContentElement) -> &'static str {
149 match elem {
150 ContentElement::TextChunk(_) => "TextChunk",
151 ContentElement::TextLine(_) => "TextLine",
152 ContentElement::TextBlock(_) => "TextBlock",
153 ContentElement::Paragraph(_) => "Paragraph",
154 ContentElement::Heading(_) => "Heading",
155 ContentElement::NumberHeading(_) => "NumberHeading",
156 ContentElement::Table(_) => "Table",
157 ContentElement::Figure(_) => "Figure",
158 ContentElement::Formula(_) => "Formula",
159 ContentElement::Picture(_) => "Picture",
160 ContentElement::Caption(_) => "Caption",
161 ContentElement::HeaderFooter(_) => "HeaderFooter",
162 ContentElement::Image(_) => "Image",
163 ContentElement::Line(_) => "Line",
164 ContentElement::LineArt(_) => "LineArt",
165 ContentElement::List(_) => "List",
166 ContentElement::TableBorder(_) => "TableBorder",
167 }
168}
169
#[cfg(test)]
mod tests {
    use super::*;
    use crate::models::bbox::BoundingBox;
    use crate::models::chunks::TextChunk;
    use crate::models::enums::{PdfLayer, TextFormat, TextType};

    /// Wraps `text` in a `TextChunk` element with fixed placeholder geometry and styling.
    fn make_text(text: &str) -> ContentElement {
        let chunk = TextChunk {
            value: text.to_string(),
            bbox: BoundingBox::new(None, 0.0, 0.0, 100.0, 10.0),
            font_name: "F".to_string(),
            font_size: 12.0,
            font_weight: 400.0,
            italic_angle: 0.0,
            font_color: "#000".to_string(),
            contrast_ratio: 21.0,
            symbol_ends: vec![],
            text_format: TextFormat::Normal,
            text_type: TextType::Regular,
            pdf_layer: PdfLayer::Main,
            ocg_visible: true,
            index: None,
            page_number: Some(1),
            level: None,
            mcid: None,
        };
        ContentElement::TextChunk(chunk)
    }

    /// Three elements spread over two pages are all indexed.
    #[test]
    fn test_build_index() {
        let first_page = vec![make_text("Hello"), make_text("World")];
        let second_page = vec![make_text("Page 2")];
        let index = CrossReferenceIndex::from_pages(&[first_page, second_page]);

        assert_eq!(index.len(), 3);
        assert_eq!(index.page_count(), 2);
    }

    /// The first element is stored under index 0 with its type name and page number.
    #[test]
    fn test_get_element() {
        let index = CrossReferenceIndex::from_pages(&[vec![make_text("Test")]]);

        let entry = index.get(0).unwrap();
        assert_eq!(entry.element_type, "TextChunk");
        assert_eq!(entry.page_number, Some(1));
    }

    /// Page lookups return per-page element counts, and nothing for an unknown page.
    #[test]
    fn test_elements_on_page() {
        let pages = vec![vec![make_text("A"), make_text("B")], vec![make_text("C")]];
        let index = CrossReferenceIndex::from_pages(&pages);

        let expected_counts = [(1u32, 2usize), (2, 1), (3, 0)];
        for &(page, count) in expected_counts.iter() {
            assert_eq!(index.elements_on_page(page).len(), count);
        }
    }

    /// An index built from no pages is empty and holds no headings.
    #[test]
    fn test_empty_index() {
        let pages: Vec<Vec<ContentElement>> = Vec::new();
        let index = CrossReferenceIndex::from_pages(&pages);

        assert!(index.is_empty());
        assert_eq!(index.heading_count(), 0);
    }
}
235}