1use lopdf::{Document, Object};
7
8use crate::models::bbox::{BoundingBox, Vertex};
9use crate::models::chunks::{LineArtChunk, LineChunk};
10use crate::pdf::graphics_state::GraphicsStateStack;
11use crate::EdgePdfError;
12
/// Smallest size treated as significant by the classifiers in this file:
/// paths whose line width is below it are ignored, individual segments shorter
/// than it are skipped, and a rectangle whose bounding box is narrower or
/// shorter than it is rejected.
// NOTE(review): presumably expressed in the same units as the transformed page
// coordinates — confirm.
const MIN_LINE_WIDTH: f64 = 0.1;

/// Factor by which a segment's extent along its main axis must exceed its
/// extent along the other axis for it to count as horizontal or vertical.
const LINE_ASPECT_RATIO: f64 = 3.0;

/// Exclusive upper bound on a segment's extent across its main axis; a segment
/// that deviates by this much or more is not treated as a horizontal/vertical line.
const MAX_LINE_THICKNESS: f64 = 10.0;
21
22pub fn extract_line_chunks(
24 doc: &Document,
25 page_number: u32,
26 page_id: lopdf::ObjectId,
27) -> Result<(Vec<LineChunk>, Vec<LineArtChunk>), EdgePdfError> {
28 let page_dict = doc
29 .get_object(page_id)
30 .map_err(|e| EdgePdfError::PipelineError {
31 stage: 1,
32 message: format!("Failed to get page {}: {}", page_number, e),
33 })?
34 .as_dict()
35 .map_err(|e| EdgePdfError::PipelineError {
36 stage: 1,
37 message: format!("Page {} is not a dictionary: {}", page_number, e),
38 })?;
39
40 let content_bytes = crate::pdf::text_extractor::get_page_content(doc, page_dict)?;
41 if content_bytes.is_empty() {
42 return Ok((Vec::new(), Vec::new()));
43 }
44
45 let content = lopdf::content::Content::decode(&content_bytes).map_err(|e| {
46 EdgePdfError::PipelineError {
47 stage: 1,
48 message: format!(
49 "Failed to decode content stream for page {}: {}",
50 page_number, e
51 ),
52 }
53 })?;
54
55 let mut gs_stack = GraphicsStateStack::default();
56 let mut line_width: f64 = 1.0;
57
58 let mut current_path: Vec<PathSegment> = Vec::new();
60 let mut subpath_start: Option<(f64, f64)> = None;
61 let mut current_point: Option<(f64, f64)> = None;
62
63 let mut line_chunks: Vec<LineChunk> = Vec::new();
64 let mut line_art_chunks: Vec<LineArtChunk> = Vec::new();
65 let mut line_index = 0u32;
66
67 for op in &content.operations {
68 match op.operator.as_str() {
69 "q" => gs_stack.save(),
71 "Q" => gs_stack.restore(),
72 "cm" => {
73 if op.operands.len() >= 6 {
74 let vals: Vec<f64> = op.operands.iter().filter_map(get_number).collect();
75 if vals.len() >= 6 {
76 gs_stack.concat_ctm(vals[0], vals[1], vals[2], vals[3], vals[4], vals[5]);
77 }
78 }
79 }
80 "w" => {
82 if let Some(w) = op.operands.first().and_then(get_number) {
83 line_width = w;
84 }
85 }
86 "m" => {
88 if op.operands.len() >= 2 {
90 if let (Some(x), Some(y)) = (
91 op.operands.first().and_then(get_number),
92 op.operands.get(1).and_then(get_number),
93 ) {
94 let (tx, ty) = transform_point(&gs_stack, x, y);
95 subpath_start = Some((tx, ty));
96 current_point = Some((tx, ty));
97 }
98 }
99 }
100 "l" => {
101 if op.operands.len() >= 2 {
103 if let (Some(x), Some(y)) = (
104 op.operands.first().and_then(get_number),
105 op.operands.get(1).and_then(get_number),
106 ) {
107 let (tx, ty) = transform_point(&gs_stack, x, y);
108 if let Some((cx, cy)) = current_point {
109 current_path.push(PathSegment::Line {
110 x1: cx,
111 y1: cy,
112 x2: tx,
113 y2: ty,
114 });
115 }
116 current_point = Some((tx, ty));
117 }
118 }
119 }
120 "c" => {
121 if op.operands.len() >= 6 {
123 let vals: Vec<f64> = op.operands.iter().filter_map(get_number).collect();
124 if vals.len() >= 6 {
125 let (tx, ty) = transform_point(&gs_stack, vals[4], vals[5]);
126 if let Some((cx, cy)) = current_point {
127 let (cp1x, cp1y) = transform_point(&gs_stack, vals[0], vals[1]);
128 let (cp2x, cp2y) = transform_point(&gs_stack, vals[2], vals[3]);
129 current_path.push(PathSegment::Curve {
130 x1: cx,
131 y1: cy,
132 cp1x,
133 cp1y,
134 cp2x,
135 cp2y,
136 x2: tx,
137 y2: ty,
138 });
139 }
140 current_point = Some((tx, ty));
141 }
142 }
143 }
144 "v" => {
145 if op.operands.len() >= 4 {
147 let vals: Vec<f64> = op.operands.iter().filter_map(get_number).collect();
148 if vals.len() >= 4 {
149 let (tx, ty) = transform_point(&gs_stack, vals[2], vals[3]);
150 if let Some((cx, cy)) = current_point {
151 let (cp2x, cp2y) = transform_point(&gs_stack, vals[0], vals[1]);
152 current_path.push(PathSegment::Curve {
153 x1: cx,
154 y1: cy,
155 cp1x: cx,
156 cp1y: cy,
157 cp2x,
158 cp2y,
159 x2: tx,
160 y2: ty,
161 });
162 }
163 current_point = Some((tx, ty));
164 }
165 }
166 }
167 "y" => {
168 if op.operands.len() >= 4 {
170 let vals: Vec<f64> = op.operands.iter().filter_map(get_number).collect();
171 if vals.len() >= 4 {
172 let (tx, ty) = transform_point(&gs_stack, vals[2], vals[3]);
173 if let Some((cx, cy)) = current_point {
174 let (cp1x, cp1y) = transform_point(&gs_stack, vals[0], vals[1]);
175 current_path.push(PathSegment::Curve {
176 x1: cx,
177 y1: cy,
178 cp1x,
179 cp1y,
180 cp2x: tx,
181 cp2y: ty,
182 x2: tx,
183 y2: ty,
184 });
185 }
186 current_point = Some((tx, ty));
187 }
188 }
189 }
190 "h" => {
191 if let (Some((sx, sy)), Some((cx, cy))) = (subpath_start, current_point) {
193 if (sx - cx).abs() > 0.01 || (sy - cy).abs() > 0.01 {
194 current_path.push(PathSegment::Line {
195 x1: cx,
196 y1: cy,
197 x2: sx,
198 y2: sy,
199 });
200 }
201 current_point = subpath_start;
202 }
203 }
204 "re" => {
205 if op.operands.len() >= 4 {
207 let vals: Vec<f64> = op.operands.iter().filter_map(get_number).collect();
208 if vals.len() >= 4 {
209 let (x, y, w, h) = (vals[0], vals[1], vals[2], vals[3]);
210 let (x1, y1) = transform_point(&gs_stack, x, y);
211 let (x2, y2) = transform_point(&gs_stack, x + w, y);
212 let (x3, y3) = transform_point(&gs_stack, x + w, y + h);
213 let (x4, y4) = transform_point(&gs_stack, x, y + h);
214 current_path.push(PathSegment::Line { x1, y1, x2, y2 });
215 current_path.push(PathSegment::Line {
216 x1: x2,
217 y1: y2,
218 x2: x3,
219 y2: y3,
220 });
221 current_path.push(PathSegment::Line {
222 x1: x3,
223 y1: y3,
224 x2: x4,
225 y2: y4,
226 });
227 current_path.push(PathSegment::Line {
228 x1: x4,
229 y1: y4,
230 x2: x1,
231 y2: y1,
232 });
233 subpath_start = Some((x1, y1));
234 current_point = Some((x1, y1));
235 }
236 }
237 }
238 "S" | "s" => {
240 if op.operator == "s" {
241 if let (Some((sx, sy)), Some((cx, cy))) = (subpath_start, current_point) {
243 if (sx - cx).abs() > 0.01 || (sy - cy).abs() > 0.01 {
244 current_path.push(PathSegment::Line {
245 x1: cx,
246 y1: cy,
247 x2: sx,
248 y2: sy,
249 });
250 }
251 }
252 }
253 classify_path(
254 ¤t_path,
255 line_width,
256 page_number,
257 &mut line_chunks,
258 &mut line_art_chunks,
259 &mut line_index,
260 );
261 current_path.clear();
262 subpath_start = None;
263 current_point = None;
264 }
265 "f" | "F" | "f*" => {
267 classify_path(
268 ¤t_path,
269 line_width,
270 page_number,
271 &mut line_chunks,
272 &mut line_art_chunks,
273 &mut line_index,
274 );
275 current_path.clear();
276 subpath_start = None;
277 current_point = None;
278 }
279 "B" | "B*" | "b" | "b*" => {
281 classify_path(
282 ¤t_path,
283 line_width,
284 page_number,
285 &mut line_chunks,
286 &mut line_art_chunks,
287 &mut line_index,
288 );
289 current_path.clear();
290 subpath_start = None;
291 current_point = None;
292 }
293 "n" => {
295 current_path.clear();
296 subpath_start = None;
297 current_point = None;
298 }
299 _ => {}
300 }
301 }
302
303 Ok((line_chunks, line_art_chunks))
304}
305
/// One piece of the path being built while the content stream is replayed.
/// Coordinates are stored as produced by `transform_point`.
#[derive(Debug, Clone)]
enum PathSegment {
    /// Straight segment from `(x1, y1)` to `(x2, y2)`.
    Line {
        x1: f64,
        y1: f64,
        x2: f64,
        y2: f64,
    },
    /// Curve from `(x1, y1)` to `(x2, y2)` with two control points.
    // The control points are recorded but never read in this file (only the
    // end points are used during classification), hence the lint allowance.
    #[allow(dead_code)]
    Curve {
        x1: f64,
        y1: f64,
        cp1x: f64,
        cp1y: f64,
        cp2x: f64,
        cp2y: f64,
        x2: f64,
        y2: f64,
    },
}
327
328fn transform_point(gs_stack: &GraphicsStateStack, x: f64, y: f64) -> (f64, f64) {
330 let ctm = &gs_stack.current.ctm;
331 ctm.transform_point(x, y)
332}
333
334fn classify_path(
336 segments: &[PathSegment],
337 line_width: f64,
338 page_number: u32,
339 line_chunks: &mut Vec<LineChunk>,
340 line_art_chunks: &mut Vec<LineArtChunk>,
341 index: &mut u32,
342) {
343 if segments.is_empty() {
344 return;
345 }
346
347 if line_width < MIN_LINE_WIDTH {
348 return;
349 }
350
351 let has_curves = segments
352 .iter()
353 .any(|s| matches!(s, PathSegment::Curve { .. }));
354
355 if !has_curves && segments.len() <= 4 {
356 let mut classified_lines = Vec::new();
358 for seg in segments {
359 if let PathSegment::Line { x1, y1, x2, y2 } = seg {
360 let dx = (x2 - x1).abs();
361 let dy = (y2 - y1).abs();
362 let length = (dx * dx + dy * dy).sqrt();
363
364 if length < MIN_LINE_WIDTH {
365 continue;
366 }
367
368 let is_horizontal = dy < MAX_LINE_THICKNESS && dx > dy * LINE_ASPECT_RATIO;
369 let is_vertical = dx < MAX_LINE_THICKNESS && dy > dx * LINE_ASPECT_RATIO;
370
371 if is_horizontal || is_vertical {
372 *index += 1;
373 let min_x = x1.min(*x2);
374 let max_x = x1.max(*x2);
375 let min_y = y1.min(*y2);
376 let max_y = y1.max(*y2);
377
378 let half_w = line_width / 2.0;
380 let bbox = BoundingBox::new(
381 Some(page_number),
382 min_x - if is_vertical { half_w } else { 0.0 },
383 min_y - if is_horizontal { half_w } else { 0.0 },
384 max_x + if is_vertical { half_w } else { 0.0 },
385 max_y + if is_horizontal { half_w } else { 0.0 },
386 );
387
388 classified_lines.push(LineChunk {
389 bbox,
390 index: Some(*index),
391 level: None,
392 start: Vertex {
393 x: *x1,
394 y: *y1,
395 radius: 0.0,
396 },
397 end: Vertex {
398 x: *x2,
399 y: *y2,
400 radius: 0.0,
401 },
402 width: line_width,
403 is_horizontal_line: is_horizontal,
404 is_vertical_line: is_vertical,
405 is_square: false,
406 });
407 }
408 }
409 }
410
411 if !classified_lines.is_empty() {
412 line_chunks.extend(classified_lines);
413 return;
414 }
415 }
416
417 if !has_curves && segments.len() == 4 {
419 if let Some(rect) = try_classify_rectangle(segments, line_width, page_number, index) {
420 line_chunks.push(rect);
421 return;
422 }
423 }
424
425 if segments.len() >= 2 {
427 let mut art_lines = Vec::new();
428 let mut min_x = f64::MAX;
429 let mut min_y = f64::MAX;
430 let mut max_x = f64::MIN;
431 let mut max_y = f64::MIN;
432
433 for seg in segments {
434 let (sx, sy, ex, ey) = match seg {
435 PathSegment::Line { x1, y1, x2, y2 } => (*x1, *y1, *x2, *y2),
436 PathSegment::Curve { x1, y1, x2, y2, .. } => (*x1, *y1, *x2, *y2),
437 };
438 min_x = min_x.min(sx).min(ex);
439 min_y = min_y.min(sy).min(ey);
440 max_x = max_x.max(sx).max(ex);
441 max_y = max_y.max(sy).max(ey);
442
443 *index += 1;
444 let lbbox = BoundingBox::new(
445 Some(page_number),
446 sx.min(ex),
447 sy.min(ey),
448 sx.max(ex),
449 sy.max(ey),
450 );
451 art_lines.push(LineChunk {
452 bbox: lbbox,
453 index: Some(*index),
454 level: None,
455 start: Vertex {
456 x: sx,
457 y: sy,
458 radius: 0.0,
459 },
460 end: Vertex {
461 x: ex,
462 y: ey,
463 radius: 0.0,
464 },
465 width: line_width,
466 is_horizontal_line: false,
467 is_vertical_line: false,
468 is_square: false,
469 });
470 }
471
472 *index += 1;
473 let art_bbox = BoundingBox::new(Some(page_number), min_x, min_y, max_x, max_y);
474 line_art_chunks.push(LineArtChunk {
475 bbox: art_bbox,
476 index: Some(*index),
477 level: None,
478 line_chunks: art_lines,
479 });
480 }
481}
482
483fn try_classify_rectangle(
485 segments: &[PathSegment],
486 _line_width: f64,
487 page_number: u32,
488 index: &mut u32,
489) -> Option<LineChunk> {
490 let mut min_x = f64::MAX;
491 let mut min_y = f64::MAX;
492 let mut max_x = f64::MIN;
493 let mut max_y = f64::MIN;
494
495 for seg in segments {
496 if let PathSegment::Line { x1, y1, x2, y2 } = seg {
497 min_x = min_x.min(*x1).min(*x2);
498 min_y = min_y.min(*y1).min(*y2);
499 max_x = max_x.max(*x1).max(*x2);
500 max_y = max_y.max(*y1).max(*y2);
501 } else {
502 return None;
503 }
504 }
505
506 let w = max_x - min_x;
507 let h = max_y - min_y;
508
509 if w < MIN_LINE_WIDTH || h < MIN_LINE_WIDTH {
510 return None;
511 }
512
513 let is_square = (w - h).abs() / w.max(h) < 0.3;
515
516 *index += 1;
517 Some(LineChunk {
518 bbox: BoundingBox::new(Some(page_number), min_x, min_y, max_x, max_y),
519 index: Some(*index),
520 level: None,
521 start: Vertex {
522 x: min_x,
523 y: min_y,
524 radius: 0.0,
525 },
526 end: Vertex {
527 x: max_x,
528 y: max_y,
529 radius: 0.0,
530 },
531 width: w.min(h),
532 is_horizontal_line: w > h * LINE_ASPECT_RATIO,
533 is_vertical_line: h > w * LINE_ASPECT_RATIO,
534 is_square,
535 })
536}
537
538fn get_number(obj: &Object) -> Option<f64> {
539 match obj {
540 Object::Integer(i) => Some(*i as f64),
541 Object::Real(f) => Some(*f),
542 _ => None,
543 }
544}
545
#[cfg(test)]
mod tests {
    use super::*;
    use lopdf::{content::Content, content::Operation, dictionary, Stream};

    /// Builds a one-page document whose content stream encodes `operations`
    /// and returns it together with the page number and page object id
    /// reported by `Document::get_pages`.
    fn create_doc_with_content(operations: Vec<Operation>) -> (Document, u32, lopdf::ObjectId) {
        let mut doc = Document::with_version("1.5");
        let pages_id = doc.new_object_id();

        let content = Content { operations };
        let encoded = content.encode().unwrap();
        let content_id = doc.add_object(Stream::new(dictionary! {}, encoded));

        let page_id = doc.add_object(dictionary! {
            "Type" => "Page",
            "Parent" => pages_id,
            "Contents" => content_id,
            "MediaBox" => vec![0.into(), 0.into(), 595.into(), 842.into()],
        });

        let pages = dictionary! {
            "Type" => "Pages",
            "Kids" => vec![page_id.into()],
            "Count" => 1,
        };
        doc.objects.insert(pages_id, Object::Dictionary(pages));

        let catalog_id = doc.add_object(dictionary! {
            "Type" => "Catalog",
            "Pages" => pages_id,
        });
        doc.trailer.set("Root", catalog_id);

        let pages_map = doc.get_pages();
        let (&page_num, &pid) = pages_map.iter().next().unwrap();
        (doc, page_num, pid)
    }

    #[test]
    fn test_empty_page_no_lines() {
        let (doc, page_num, pid) = create_doc_with_content(vec![]);
        let (lines, arts) = extract_line_chunks(&doc, page_num, pid).unwrap();
        assert!(lines.is_empty());
        assert!(arts.is_empty());
    }

    #[test]
    fn test_horizontal_line() {
        let ops = vec![
            Operation::new("w", vec![Object::Real(1.0)]),
            Operation::new("m", vec![72.into(), 400.into()]),
            Operation::new("l", vec![500.into(), 400.into()]),
            Operation::new("S", vec![]),
        ];
        let (doc, page_num, pid) = create_doc_with_content(ops);
        let (lines, arts) = extract_line_chunks(&doc, page_num, pid).unwrap();
        assert_eq!(lines.len(), 1);
        assert!(lines[0].is_horizontal_line);
        assert!(!lines[0].is_vertical_line);
        assert!(arts.is_empty());
    }

    #[test]
    fn test_vertical_line() {
        let ops = vec![
            Operation::new("w", vec![Object::Real(1.0)]),
            Operation::new("m", vec![200.into(), 100.into()]),
            Operation::new("l", vec![200.into(), 700.into()]),
            Operation::new("S", vec![]),
        ];
        let (doc, page_num, pid) = create_doc_with_content(ops);
        let (lines, arts) = extract_line_chunks(&doc, page_num, pid).unwrap();
        assert_eq!(lines.len(), 1);
        assert!(!lines[0].is_horizontal_line);
        assert!(lines[0].is_vertical_line);
        assert!(arts.is_empty());
    }

    #[test]
    fn test_rectangle() {
        let ops = vec![
            Operation::new("w", vec![Object::Real(1.0)]),
            Operation::new(
                "re",
                vec![
                    Object::Real(100.0),
                    Object::Real(200.0),
                    Object::Real(300.0),
                    Object::Real(400.0),
                ],
            ),
            Operation::new("S", vec![]),
        ];
        let (doc, page_num, pid) = create_doc_with_content(ops);
        let (lines, arts) = extract_line_chunks(&doc, page_num, pid).unwrap();
        // Each side of an axis-aligned rectangle is reported as its own line.
        assert_eq!(lines.len(), 4);
        assert_eq!(lines.iter().filter(|l| l.is_horizontal_line).count(), 2);
        assert_eq!(lines.iter().filter(|l| l.is_vertical_line).count(), 2);
        assert!(arts.is_empty());
    }

    #[test]
    fn test_close_and_stroke() {
        let ops = vec![
            Operation::new("w", vec![Object::Real(1.0)]),
            Operation::new("m", vec![72.into(), 400.into()]),
            Operation::new("l", vec![500.into(), 400.into()]),
            Operation::new("l", vec![500.into(), 410.into()]),
            Operation::new("s", vec![]),
        ];
        let (doc, page_num, pid) = create_doc_with_content(ops);
        let (lines, arts) = extract_line_chunks(&doc, page_num, pid).unwrap();
        // The two drawn segments are a horizontal and a vertical line; the
        // closing segment added by `s` is slanted and is therefore dropped.
        assert_eq!(lines.len(), 2);
        assert!(lines[0].is_horizontal_line);
        assert!(lines[1].is_vertical_line);
        assert!(arts.is_empty());
    }
}