1use crate::error::{MemvidError, Result};
7
/// A run of text together with its axis-aligned bounding rectangle on a page.
///
/// The rectangle is anchored at (`x`, `y`) and extends by `width` along the
/// x axis and by `height` along the y axis, so `y + height` is treated as the
/// top edge.
#[derive(Debug, Clone)]
pub struct TextBox {
    /// Text content of the box.
    pub text: String,
    /// X coordinate of the anchor corner (the left edge for a non-negative width).
    pub x: f32,
    /// Y coordinate of the anchor corner (the bottom edge for a non-negative height).
    pub y: f32,
    /// Horizontal extent of the box.
    pub width: f32,
    /// Vertical extent of the box.
    pub height: f32,
    /// Font size associated with the text.
    pub font_size: f32,
    /// Number of the page this box belongs to.
    pub page: u32,
}

impl TextBox {
    /// X coordinate of the edge opposite to `x` (`x + width`).
    #[must_use]
    pub fn right(&self) -> f32 {
        let left = self.x;
        left + self.width
    }

    /// Y coordinate of the edge opposite to `y` (`y + height`).
    #[must_use]
    pub fn top(&self) -> f32 {
        let bottom = self.y;
        bottom + self.height
    }

    /// X coordinate of the horizontal centre of the box.
    #[must_use]
    pub fn center_x(&self) -> f32 {
        let half_width = self.width / 2.0;
        self.x + half_width
    }

    /// Y coordinate of the vertical centre of the box.
    #[must_use]
    pub fn center_y(&self) -> f32 {
        let half_height = self.height / 2.0;
        self.y + half_height
    }

    /// Returns `true` when the two rectangles share interior area.
    ///
    /// All comparisons are strict, so boxes that merely touch along an edge
    /// do not count as overlapping. The `page` field is not consulted.
    #[must_use]
    pub fn overlaps(&self, other: &Self) -> bool {
        let shares_x_range = self.x < other.right() && other.x < self.right();
        let shares_y_range = self.y < other.top() && other.y < self.top();
        shares_x_range && shares_y_range
    }
}
61
/// A straight line between two points on a page.
#[derive(Debug, Clone)]
pub struct LineSegment {
    /// X coordinate of the first endpoint.
    pub x1: f32,
    /// Y coordinate of the first endpoint.
    pub y1: f32,
    /// X coordinate of the second endpoint.
    pub x2: f32,
    /// Y coordinate of the second endpoint.
    pub y2: f32,
    /// Number of the page this segment belongs to.
    pub page: u32,
}

impl LineSegment {
    /// Returns `true` when the endpoints' y coordinates differ by at most
    /// `tolerance`.
    #[must_use]
    pub fn is_horizontal(&self, tolerance: f32) -> bool {
        let rise = (self.y1 - self.y2).abs();
        rise <= tolerance
    }

    /// Returns `true` when the endpoints' x coordinates differ by at most
    /// `tolerance`.
    #[must_use]
    pub fn is_vertical(&self, tolerance: f32) -> bool {
        let run = (self.x1 - self.x2).abs();
        run <= tolerance
    }

    /// Euclidean distance between the two endpoints.
    #[must_use]
    pub fn length(&self) -> f32 {
        let dx = self.x2 - self.x1;
        let dy = self.y2 - self.y1;
        (dx.powi(2) + dy.powi(2)).sqrt()
    }

    /// Midpoint of the two y coordinates; the representative y position of a
    /// (near-)horizontal segment.
    #[must_use]
    pub fn y_coord(&self) -> f32 {
        self.y1.midpoint(self.y2)
    }

    /// Midpoint of the two x coordinates; the representative x position of a
    /// (near-)vertical segment.
    #[must_use]
    pub fn x_coord(&self) -> f32 {
        self.x1.midpoint(self.x2)
    }
}
108
109#[derive(Debug, Clone)]
111pub struct PageLayout {
112 pub page_number: u32,
114 pub width: f32,
116 pub height: f32,
118 pub text_boxes: Vec<TextBox>,
120 pub lines: Vec<LineSegment>,
122}
123
124impl PageLayout {
125 #[must_use]
127 pub fn new(page_number: u32, width: f32, height: f32) -> Self {
128 Self {
129 page_number,
130 width,
131 height,
132 text_boxes: Vec::new(),
133 lines: Vec::new(),
134 }
135 }
136
137 #[must_use]
139 pub fn is_empty(&self) -> bool {
140 self.text_boxes.is_empty() && self.lines.is_empty()
141 }
142
143 #[must_use]
145 pub fn horizontal_lines(&self, tolerance: f32) -> Vec<&LineSegment> {
146 self.lines
147 .iter()
148 .filter(|l| l.is_horizontal(tolerance))
149 .collect()
150 }
151
152 #[must_use]
154 pub fn vertical_lines(&self, tolerance: f32) -> Vec<&LineSegment> {
155 self.lines
156 .iter()
157 .filter(|l| l.is_vertical(tolerance))
158 .collect()
159 }
160
161 #[must_use]
163 pub fn has_ruled_structure(&self, min_lines: usize, tolerance: f32) -> bool {
164 let h_count = self.horizontal_lines(tolerance).len();
165 let v_count = self.vertical_lines(tolerance).len();
166 h_count >= min_lines && v_count >= min_lines
167 }
168}
169
/// Extracts text boxes and straight line segments from a PDF using Pdfium.
///
/// `max_pages` caps how many leading pages are processed; `0` means every
/// page. One [`PageLayout`] is returned per processed page, with 1-based page
/// numbers.
///
/// # Errors
///
/// Returns [`MemvidError::TableExtraction`] when the document cannot be loaded
/// or a page cannot be fetched.
#[cfg(feature = "pdfium")]
pub fn extract_pdf_layout(bytes: &[u8], max_pages: usize) -> Result<Vec<PageLayout>> {
    use pdfium_render::prelude::*;

    let table_error = |reason: String| MemvidError::TableExtraction { reason };

    let pdfium = Pdfium::default();
    let document = pdfium
        .load_pdf_from_byte_slice(bytes, None)
        .map_err(|e| table_error(format!("failed to load PDF: {e}")))?;

    let page_count = document.pages().len() as usize;
    let pages_to_process = match max_pages {
        0 => page_count,
        limit => page_count.min(limit),
    };

    let mut layouts = Vec::with_capacity(pages_to_process);

    for page_idx in 0..pages_to_process {
        let page_number = (page_idx + 1) as u32;
        let page = document
            .pages()
            .get(page_idx as u16)
            .map_err(|e| table_error(format!("failed to get page {}: {e}", page_idx + 1)))?;

        let mut layout = PageLayout::new(page_number, page.width().value, page.height().value);

        for object in page.objects().iter() {
            // Text objects become text boxes; objects whose bounds cannot be
            // read, or whose text is only whitespace, are skipped.
            let text_box = object.as_text_object().and_then(|text_obj| {
                let bounds = object.bounds().ok()?;
                let text = text_obj.text();
                if text.trim().is_empty() {
                    return None;
                }

                let left = bounds.left().value;
                let bottom = bounds.bottom().value;
                Some(TextBox {
                    text,
                    x: left,
                    y: bottom,
                    width: bounds.right().value - left,
                    height: bounds.top().value - bottom,
                    font_size: text_obj.unscaled_font_size().value,
                    page: page_number,
                })
            });
            layout.text_boxes.extend(text_box);

            // Path objects contribute their straight segments.
            if let Some(path_obj) = object.as_path_object() {
                extract_lines_from_path(&path_obj, page_number, &mut layout.lines);
            }
        }

        layouts.push(layout);
    }

    Ok(layouts)
}
240
/// Appends the straight segments of `path` to `lines`, tagged with `page`.
///
/// The path is walked segment by segment while tracking the current pen
/// position (starting at the origin). Only `LineTo` segments longer than
/// 5.0 units are recorded; `MoveTo` and `BezierTo` segments just move the pen,
/// and any other segment type is ignored without moving it.
#[cfg(feature = "pdfium")]
fn extract_lines_from_path(
    path: &pdfium_render::prelude::PdfPagePathObject,
    page: u32,
    lines: &mut Vec<LineSegment>,
) {
    use pdfium_render::prelude::*;

    /// Segments at or below this length are not recorded.
    const MIN_SEGMENT_LENGTH: f32 = 5.0;

    let mut pen = (0.0f32, 0.0f32);

    for segment in path.segments().iter() {
        match segment.segment_type() {
            PdfPathSegmentType::LineTo => {
                let target = (segment.x().value, segment.y().value);
                let dx = target.0 - pen.0;
                let dy = target.1 - pen.1;

                if (dx.powi(2) + dy.powi(2)).sqrt() > MIN_SEGMENT_LENGTH {
                    lines.push(LineSegment {
                        x1: pen.0,
                        y1: pen.1,
                        x2: target.0,
                        y2: target.1,
                        page,
                    });
                }

                pen = target;
            }
            // Curves are not recorded, but the pen still follows them so the
            // next straight segment starts from the right place.
            PdfPathSegmentType::MoveTo | PdfPathSegmentType::BezierTo => {
                pen = (segment.x().value, segment.y().value);
            }
            _ => {}
        }
    }
}
299
300#[cfg(not(feature = "pdfium"))]
306pub fn extract_pdf_layout(bytes: &[u8], max_pages: usize) -> Result<Vec<PageLayout>> {
307 use lopdf::Document;
308
309 let document = Document::load_mem(bytes).map_err(|e| MemvidError::TableExtraction {
310 reason: format!("failed to load PDF with lopdf: {e}"),
311 })?;
312
313 let page_count = document.get_pages().len();
314 let pages_to_process = if max_pages > 0 {
315 page_count.min(max_pages)
316 } else {
317 page_count
318 };
319
320 let mut layouts = Vec::with_capacity(pages_to_process);
321
322 for page_idx in 0..pages_to_process {
323 let page_number = u32::try_from(page_idx + 1).unwrap_or(0);
324
325 let (width, height) = get_page_dimensions(&document, page_idx).unwrap_or((612.0, 792.0));
327
328 let mut layout = PageLayout::new(page_number, width, height);
329
330 if let Ok(text) = document.extract_text(&[page_number]) {
332 let lines: Vec<&str> = text.lines().collect();
333 let line_height = if lines.is_empty() {
334 12.0
335 } else {
336 (height - 144.0) / lines.len() as f32 };
338
339 for (line_idx, line) in lines.iter().enumerate() {
340 if line.trim().is_empty() {
341 continue;
342 }
343
344 let text_boxes = parse_line_into_columns(
347 line,
348 line_idx,
349 page_number,
350 width,
351 height,
352 line_height,
353 );
354
355 layout.text_boxes.extend(text_boxes);
356 }
357 }
358
359 layout.lines = Vec::new();
361
362 layouts.push(layout);
363 }
364
365 Ok(layouts)
366}
367
368#[cfg(not(feature = "pdfium"))]
373fn parse_line_into_columns(
374 line: &str,
375 line_idx: usize,
376 page: u32,
377 page_width: f32,
378 page_height: f32,
379 line_height: f32,
380) -> Vec<TextBox> {
381 let mut boxes = Vec::new();
382 let y = page_height - 72.0 - (line_idx as f32 * line_height);
383
384 let re_split: Vec<&str> = line.split(" ").collect();
386
387 if re_split.len() > 1 {
388 let usable_width = page_width - 144.0; let col_width = usable_width / re_split.len() as f32;
391
392 for (col_idx, col_text) in re_split.iter().enumerate() {
393 let trimmed = col_text.trim();
394 if !trimmed.is_empty() {
395 let x = 72.0 + (col_idx as f32 * col_width);
396 boxes.push(TextBox {
397 text: trimmed.to_string(),
398 x,
399 y,
400 width: col_width * 0.9, height: line_height,
402 font_size: 12.0,
403 page,
404 });
405 }
406 }
407 } else {
408 let tab_split: Vec<&str> = line.split('\t').collect();
410
411 if tab_split.len() > 1 {
412 let usable_width = page_width - 144.0;
414 let col_width = usable_width / tab_split.len() as f32;
415
416 for (col_idx, col_text) in tab_split.iter().enumerate() {
417 let trimmed = col_text.trim();
418 if !trimmed.is_empty() {
419 let x = 72.0 + (col_idx as f32 * col_width);
420 boxes.push(TextBox {
421 text: trimmed.to_string(),
422 x,
423 y,
424 width: col_width * 0.9,
425 height: line_height,
426 font_size: 12.0,
427 page,
428 });
429 }
430 }
431 } else {
432 let trimmed = line.trim();
434 if !trimmed.is_empty() {
435 boxes.push(TextBox {
436 text: trimmed.to_string(),
437 x: 72.0,
438 y,
439 width: page_width - 144.0,
440 height: line_height,
441 font_size: 12.0,
442 page,
443 });
444 }
445 }
446 }
447
448 boxes
449}
450
451#[cfg(not(feature = "pdfium"))]
453fn get_page_dimensions(document: &lopdf::Document, page_idx: usize) -> Option<(f32, f32)> {
454 let pages = document.get_pages();
455 let page_id = *pages.get(&u32::try_from(page_idx + 1).unwrap_or(0))?;
456
457 if let Ok(page) = document.get_dictionary(page_id) {
458 if let Ok(media_box) = page.get(b"MediaBox") {
459 if let lopdf::Object::Array(arr) = media_box {
460 if arr.len() >= 4 {
461 let width = match &arr[2] {
462 lopdf::Object::Integer(n) => *n as f32,
463 lopdf::Object::Real(n) => *n,
464 _ => return None,
465 };
466 let height = match &arr[3] {
467 lopdf::Object::Integer(n) => *n as f32,
468 lopdf::Object::Real(n) => *n,
469 _ => return None,
470 };
471 return Some((width, height));
472 }
473 }
474 }
475 }
476 None
477}
478
/// Groups nearby values and returns the mean of each group, in ascending
/// order.
///
/// The values are sorted, then walked once: a value joins the current cluster
/// when it is at most `threshold` above the previous value, otherwise it
/// starts a new cluster. Because the comparison is against the previous value
/// (not the cluster mean), a long chain of closely spaced values forms a
/// single cluster even if its ends are further apart than `threshold`.
///
/// Sorting uses [`f32::total_cmp`], which is a total order even when NaN is
/// present; a comparator built from `partial_cmp` is not, and the standard
/// sort is allowed to panic on such comparators. A NaN never satisfies the
/// threshold test, so every NaN ends up as its own NaN-valued cluster.
///
/// Returns an empty vector for empty input.
#[must_use]
pub fn cluster_values(values: &[f32], threshold: f32) -> Vec<f32> {
    let mut sorted: Vec<f32> = values.to_vec();
    sorted.sort_unstable_by(f32::total_cmp);

    let mut remaining = sorted.into_iter();
    let Some(first) = remaining.next() else {
        return Vec::new();
    };

    // Only the running sum, the member count and the most recent value of the
    // current cluster are needed; the members themselves are not stored.
    let mut centers = Vec::new();
    let mut sum = first;
    let mut count = 1usize;
    let mut last = first;

    for val in remaining {
        if val - last <= threshold {
            sum += val;
            count += 1;
        } else {
            centers.push(sum / count as f32);
            sum = val;
            count = 1;
        }
        last = val;
    }

    // The final cluster is always non-empty at this point.
    centers.push(sum / count as f32);
    centers
}
515
/// Keeps the candidates that are supported by enough reference values.
///
/// A candidate is kept when at least `min_occurrences` entries of
/// `reference_values` lie within `threshold` of it (inclusive). The relative
/// order of the surviving candidates is preserved.
#[allow(dead_code)]
pub fn filter_consistent_values(
    candidates: &[f32],
    reference_values: &[f32],
    threshold: f32,
    min_occurrences: usize,
) -> Vec<f32> {
    let mut kept = Vec::new();

    for &candidate in candidates {
        let mut support = 0usize;
        for &reference in reference_values {
            if (reference - candidate).abs() <= threshold {
                support += 1;
            }
        }

        if support >= min_occurrences {
            kept.push(candidate);
        }
    }

    kept
}
536
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a line segment on page 1.
    fn segment(x1: f32, y1: f32, x2: f32, y2: f32) -> LineSegment {
        LineSegment {
            x1,
            y1,
            x2,
            y2,
            page: 1,
        }
    }

    /// Asserts that `actual` is strictly within `eps` of `expected`.
    fn assert_close(actual: f32, expected: f32, eps: f32) {
        assert!((actual - expected).abs() < eps);
    }

    #[test]
    fn test_text_box_geometry() {
        let tbox = TextBox {
            text: "Hello".to_string(),
            x: 100.0,
            y: 200.0,
            width: 50.0,
            height: 20.0,
            font_size: 12.0,
            page: 1,
        };

        assert_close(tbox.right(), 150.0, 0.001);
        assert_close(tbox.top(), 220.0, 0.001);
        assert_close(tbox.center_x(), 125.0, 0.001);
        assert_close(tbox.center_y(), 210.0, 0.001);
    }

    #[test]
    fn test_line_segment_orientation() {
        let h_line = segment(0.0, 100.0, 200.0, 100.0);
        assert!(h_line.is_horizontal(1.0));
        assert!(!h_line.is_vertical(1.0));

        let v_line = segment(100.0, 0.0, 100.0, 200.0);
        assert!(!v_line.is_horizontal(1.0));
        assert!(v_line.is_vertical(1.0));
    }

    #[test]
    fn test_cluster_values() {
        let values = vec![10.0, 11.0, 12.0, 50.0, 51.0, 100.0];
        let clusters = cluster_values(&values, 5.0);

        assert_eq!(clusters.len(), 3);
        // Expected centres of the three groups, each checked to within 1.0.
        for (center, expected) in clusters.iter().zip([11.0, 50.5, 100.0]) {
            assert_close(*center, expected, 1.0);
        }
    }

    #[test]
    fn test_page_layout_line_filtering() {
        let mut layout = PageLayout::new(1, 612.0, 792.0);
        layout.lines.extend([
            // Horizontal.
            segment(0.0, 100.0, 200.0, 100.0),
            // Vertical.
            segment(100.0, 0.0, 100.0, 200.0),
            // Diagonal: must match neither filter.
            segment(0.0, 0.0, 200.0, 200.0),
        ]);

        assert_eq!(layout.horizontal_lines(2.0).len(), 1);
        assert_eq!(layout.vertical_lines(2.0).len(), 1);
    }
}