Skip to main content

edgeparse_core/pdf/
line_extractor.rs

//! PDF line segment extraction — extract stroked/filled paths as LineChunks.
//!
//! Parses content stream path-construction operators (m, l, c, v, y, h, re)
//! and path-painting operators (S, s, f, F, f*, B, B*, b, b*, n), and
//! classifies the resulting paths as horizontal lines, vertical lines, or shapes.
5
6use lopdf::{Document, Object};
7
8use crate::models::bbox::{BoundingBox, Vertex};
9use crate::models::chunks::{LineArtChunk, LineChunk};
10use crate::pdf::graphics_state::GraphicsStateStack;
11use crate::EdgePdfError;
12
/// Smallest stroke width (in points) for which a painted path is considered at all.
///
/// The same threshold is reused as the minimum length of an individual
/// segment and as the minimum side length of a candidate rectangle.
const MIN_LINE_WIDTH: f64 = 0.1;

/// Dominance ratio between the two extents of a segment or rectangle.
///
/// A shape counts as horizontal when its x-extent exceeds its y-extent times
/// this ratio, and as vertical when the y-extent exceeds the x-extent times it.
const LINE_ASPECT_RATIO: f64 = 3.0;

/// Upper bound on the extent of a segment across its dominant axis; a segment
/// that deviates further than this is not reported as a horizontal/vertical line.
const MAX_LINE_THICKNESS: f64 = 10.0;
21
22/// Extract line segments and line art from a PDF page's content stream.
23pub fn extract_line_chunks(
24    doc: &Document,
25    page_number: u32,
26    page_id: lopdf::ObjectId,
27) -> Result<(Vec<LineChunk>, Vec<LineArtChunk>), EdgePdfError> {
28    let page_dict = doc
29        .get_object(page_id)
30        .map_err(|e| EdgePdfError::PipelineError {
31            stage: 1,
32            message: format!("Failed to get page {}: {}", page_number, e),
33        })?
34        .as_dict()
35        .map_err(|e| EdgePdfError::PipelineError {
36            stage: 1,
37            message: format!("Page {} is not a dictionary: {}", page_number, e),
38        })?;
39
40    let content_bytes = crate::pdf::text_extractor::get_page_content(doc, page_dict)?;
41    if content_bytes.is_empty() {
42        return Ok((Vec::new(), Vec::new()));
43    }
44
45    let content = lopdf::content::Content::decode(&content_bytes).map_err(|e| {
46        EdgePdfError::PipelineError {
47            stage: 1,
48            message: format!(
49                "Failed to decode content stream for page {}: {}",
50                page_number, e
51            ),
52        }
53    })?;
54
55    let mut gs_stack = GraphicsStateStack::default();
56    let mut line_width: f64 = 1.0;
57
58    // Current path state
59    let mut current_path: Vec<PathSegment> = Vec::new();
60    let mut subpath_start: Option<(f64, f64)> = None;
61    let mut current_point: Option<(f64, f64)> = None;
62
63    let mut line_chunks: Vec<LineChunk> = Vec::new();
64    let mut line_art_chunks: Vec<LineArtChunk> = Vec::new();
65    let mut line_index = 0u32;
66
67    for op in &content.operations {
68        match op.operator.as_str() {
69            // Graphics state
70            "q" => gs_stack.save(),
71            "Q" => gs_stack.restore(),
72            "cm" => {
73                if op.operands.len() >= 6 {
74                    let vals: Vec<f64> = op.operands.iter().filter_map(get_number).collect();
75                    if vals.len() >= 6 {
76                        gs_stack.concat_ctm(vals[0], vals[1], vals[2], vals[3], vals[4], vals[5]);
77                    }
78                }
79            }
80            // Line width
81            "w" => {
82                if let Some(w) = op.operands.first().and_then(get_number) {
83                    line_width = w;
84                }
85            }
86            // Path construction
87            "m" => {
88                // moveto
89                if op.operands.len() >= 2 {
90                    if let (Some(x), Some(y)) = (
91                        op.operands.first().and_then(get_number),
92                        op.operands.get(1).and_then(get_number),
93                    ) {
94                        let (tx, ty) = transform_point(&gs_stack, x, y);
95                        subpath_start = Some((tx, ty));
96                        current_point = Some((tx, ty));
97                    }
98                }
99            }
100            "l" => {
101                // lineto
102                if op.operands.len() >= 2 {
103                    if let (Some(x), Some(y)) = (
104                        op.operands.first().and_then(get_number),
105                        op.operands.get(1).and_then(get_number),
106                    ) {
107                        let (tx, ty) = transform_point(&gs_stack, x, y);
108                        if let Some((cx, cy)) = current_point {
109                            current_path.push(PathSegment::Line {
110                                x1: cx,
111                                y1: cy,
112                                x2: tx,
113                                y2: ty,
114                            });
115                        }
116                        current_point = Some((tx, ty));
117                    }
118                }
119            }
120            "c" => {
121                // curveto (cubic Bézier)
122                if op.operands.len() >= 6 {
123                    let vals: Vec<f64> = op.operands.iter().filter_map(get_number).collect();
124                    if vals.len() >= 6 {
125                        let (tx, ty) = transform_point(&gs_stack, vals[4], vals[5]);
126                        if let Some((cx, cy)) = current_point {
127                            let (cp1x, cp1y) = transform_point(&gs_stack, vals[0], vals[1]);
128                            let (cp2x, cp2y) = transform_point(&gs_stack, vals[2], vals[3]);
129                            current_path.push(PathSegment::Curve {
130                                x1: cx,
131                                y1: cy,
132                                cp1x,
133                                cp1y,
134                                cp2x,
135                                cp2y,
136                                x2: tx,
137                                y2: ty,
138                            });
139                        }
140                        current_point = Some((tx, ty));
141                    }
142                }
143            }
144            "v" => {
145                // curveto (initial point replicated)
146                if op.operands.len() >= 4 {
147                    let vals: Vec<f64> = op.operands.iter().filter_map(get_number).collect();
148                    if vals.len() >= 4 {
149                        let (tx, ty) = transform_point(&gs_stack, vals[2], vals[3]);
150                        if let Some((cx, cy)) = current_point {
151                            let (cp2x, cp2y) = transform_point(&gs_stack, vals[0], vals[1]);
152                            current_path.push(PathSegment::Curve {
153                                x1: cx,
154                                y1: cy,
155                                cp1x: cx,
156                                cp1y: cy,
157                                cp2x,
158                                cp2y,
159                                x2: tx,
160                                y2: ty,
161                            });
162                        }
163                        current_point = Some((tx, ty));
164                    }
165                }
166            }
167            "y" => {
168                // curveto (final point replicated)
169                if op.operands.len() >= 4 {
170                    let vals: Vec<f64> = op.operands.iter().filter_map(get_number).collect();
171                    if vals.len() >= 4 {
172                        let (tx, ty) = transform_point(&gs_stack, vals[2], vals[3]);
173                        if let Some((cx, cy)) = current_point {
174                            let (cp1x, cp1y) = transform_point(&gs_stack, vals[0], vals[1]);
175                            current_path.push(PathSegment::Curve {
176                                x1: cx,
177                                y1: cy,
178                                cp1x,
179                                cp1y,
180                                cp2x: tx,
181                                cp2y: ty,
182                                x2: tx,
183                                y2: ty,
184                            });
185                        }
186                        current_point = Some((tx, ty));
187                    }
188                }
189            }
190            "h" => {
191                // closepath
192                if let (Some((sx, sy)), Some((cx, cy))) = (subpath_start, current_point) {
193                    if (sx - cx).abs() > 0.01 || (sy - cy).abs() > 0.01 {
194                        current_path.push(PathSegment::Line {
195                            x1: cx,
196                            y1: cy,
197                            x2: sx,
198                            y2: sy,
199                        });
200                    }
201                    current_point = subpath_start;
202                }
203            }
204            "re" => {
205                // rectangle
206                if op.operands.len() >= 4 {
207                    let vals: Vec<f64> = op.operands.iter().filter_map(get_number).collect();
208                    if vals.len() >= 4 {
209                        let (x, y, w, h) = (vals[0], vals[1], vals[2], vals[3]);
210                        let (x1, y1) = transform_point(&gs_stack, x, y);
211                        let (x2, y2) = transform_point(&gs_stack, x + w, y);
212                        let (x3, y3) = transform_point(&gs_stack, x + w, y + h);
213                        let (x4, y4) = transform_point(&gs_stack, x, y + h);
214                        current_path.push(PathSegment::Line { x1, y1, x2, y2 });
215                        current_path.push(PathSegment::Line {
216                            x1: x2,
217                            y1: y2,
218                            x2: x3,
219                            y2: y3,
220                        });
221                        current_path.push(PathSegment::Line {
222                            x1: x3,
223                            y1: y3,
224                            x2: x4,
225                            y2: y4,
226                        });
227                        current_path.push(PathSegment::Line {
228                            x1: x4,
229                            y1: y4,
230                            x2: x1,
231                            y2: y1,
232                        });
233                        subpath_start = Some((x1, y1));
234                        current_point = Some((x1, y1));
235                    }
236                }
237            }
238            // Path painting — stroke
239            "S" | "s" => {
240                if op.operator == "s" {
241                    // close and stroke — close first
242                    if let (Some((sx, sy)), Some((cx, cy))) = (subpath_start, current_point) {
243                        if (sx - cx).abs() > 0.01 || (sy - cy).abs() > 0.01 {
244                            current_path.push(PathSegment::Line {
245                                x1: cx,
246                                y1: cy,
247                                x2: sx,
248                                y2: sy,
249                            });
250                        }
251                    }
252                }
253                classify_path(
254                    &current_path,
255                    line_width,
256                    page_number,
257                    &mut line_chunks,
258                    &mut line_art_chunks,
259                    &mut line_index,
260                );
261                current_path.clear();
262                subpath_start = None;
263                current_point = None;
264            }
265            // Fill (also acts as implicit close)
266            "f" | "F" | "f*" => {
267                classify_path(
268                    &current_path,
269                    line_width,
270                    page_number,
271                    &mut line_chunks,
272                    &mut line_art_chunks,
273                    &mut line_index,
274                );
275                current_path.clear();
276                subpath_start = None;
277                current_point = None;
278            }
279            // Fill and stroke
280            "B" | "B*" | "b" | "b*" => {
281                classify_path(
282                    &current_path,
283                    line_width,
284                    page_number,
285                    &mut line_chunks,
286                    &mut line_art_chunks,
287                    &mut line_index,
288                );
289                current_path.clear();
290                subpath_start = None;
291                current_point = None;
292            }
293            // End path without painting
294            "n" => {
295                current_path.clear();
296                subpath_start = None;
297                current_point = None;
298            }
299            _ => {}
300        }
301    }
302
303    Ok((line_chunks, line_art_chunks))
304}
305
/// One piece of the path currently under construction.
///
/// All coordinates are stored exactly as supplied by the code that builds the
/// segment; `(x1, y1)` is the segment's start and `(x2, y2)` its end.
#[derive(Debug, Clone)]
enum PathSegment {
    /// Straight segment from `(x1, y1)` to `(x2, y2)`.
    Line {
        x1: f64,
        y1: f64,
        x2: f64,
        y2: f64,
    },
    /// Cubic Bézier segment with control points `(cp1x, cp1y)` and `(cp2x, cp2y)`.
    // The control points are recorded but never read: classification matches
    // curves with `..` and only looks at the end points, hence the allow.
    #[allow(dead_code)]
    Curve {
        x1: f64,
        y1: f64,
        cp1x: f64,
        cp1y: f64,
        cp2x: f64,
        cp2y: f64,
        x2: f64,
        y2: f64,
    },
}
327
328/// Transform a point through the current CTM.
329fn transform_point(gs_stack: &GraphicsStateStack, x: f64, y: f64) -> (f64, f64) {
330    let ctm = &gs_stack.current.ctm;
331    ctm.transform_point(x, y)
332}
333
334/// Classify a completed path as line chunks or line art.
335fn classify_path(
336    segments: &[PathSegment],
337    line_width: f64,
338    page_number: u32,
339    line_chunks: &mut Vec<LineChunk>,
340    line_art_chunks: &mut Vec<LineArtChunk>,
341    index: &mut u32,
342) {
343    if segments.is_empty() {
344        return;
345    }
346
347    if line_width < MIN_LINE_WIDTH {
348        return;
349    }
350
351    let has_curves = segments
352        .iter()
353        .any(|s| matches!(s, PathSegment::Curve { .. }));
354
355    if !has_curves && segments.len() <= 4 {
356        // Try to classify individual segments as line chunks
357        let mut classified_lines = Vec::new();
358        for seg in segments {
359            if let PathSegment::Line { x1, y1, x2, y2 } = seg {
360                let dx = (x2 - x1).abs();
361                let dy = (y2 - y1).abs();
362                let length = (dx * dx + dy * dy).sqrt();
363
364                if length < MIN_LINE_WIDTH {
365                    continue;
366                }
367
368                let is_horizontal = dy < MAX_LINE_THICKNESS && dx > dy * LINE_ASPECT_RATIO;
369                let is_vertical = dx < MAX_LINE_THICKNESS && dy > dx * LINE_ASPECT_RATIO;
370
371                if is_horizontal || is_vertical {
372                    *index += 1;
373                    let min_x = x1.min(*x2);
374                    let max_x = x1.max(*x2);
375                    let min_y = y1.min(*y2);
376                    let max_y = y1.max(*y2);
377
378                    // Expand bbox by half the line width
379                    let half_w = line_width / 2.0;
380                    let bbox = BoundingBox::new(
381                        Some(page_number),
382                        min_x - if is_vertical { half_w } else { 0.0 },
383                        min_y - if is_horizontal { half_w } else { 0.0 },
384                        max_x + if is_vertical { half_w } else { 0.0 },
385                        max_y + if is_horizontal { half_w } else { 0.0 },
386                    );
387
388                    classified_lines.push(LineChunk {
389                        bbox,
390                        index: Some(*index),
391                        level: None,
392                        start: Vertex {
393                            x: *x1,
394                            y: *y1,
395                            radius: 0.0,
396                        },
397                        end: Vertex {
398                            x: *x2,
399                            y: *y2,
400                            radius: 0.0,
401                        },
402                        width: line_width,
403                        is_horizontal_line: is_horizontal,
404                        is_vertical_line: is_vertical,
405                        is_square: false,
406                    });
407                }
408            }
409        }
410
411        if !classified_lines.is_empty() {
412            line_chunks.extend(classified_lines);
413            return;
414        }
415    }
416
417    // Check for rectangle (4 lines forming a box)
418    if !has_curves && segments.len() == 4 {
419        if let Some(rect) = try_classify_rectangle(segments, line_width, page_number, index) {
420            line_chunks.push(rect);
421            return;
422        }
423    }
424
425    // Complex path → LineArtChunk
426    if segments.len() >= 2 {
427        let mut art_lines = Vec::new();
428        let mut min_x = f64::MAX;
429        let mut min_y = f64::MAX;
430        let mut max_x = f64::MIN;
431        let mut max_y = f64::MIN;
432
433        for seg in segments {
434            let (sx, sy, ex, ey) = match seg {
435                PathSegment::Line { x1, y1, x2, y2 } => (*x1, *y1, *x2, *y2),
436                PathSegment::Curve { x1, y1, x2, y2, .. } => (*x1, *y1, *x2, *y2),
437            };
438            min_x = min_x.min(sx).min(ex);
439            min_y = min_y.min(sy).min(ey);
440            max_x = max_x.max(sx).max(ex);
441            max_y = max_y.max(sy).max(ey);
442
443            *index += 1;
444            let lbbox = BoundingBox::new(
445                Some(page_number),
446                sx.min(ex),
447                sy.min(ey),
448                sx.max(ex),
449                sy.max(ey),
450            );
451            art_lines.push(LineChunk {
452                bbox: lbbox,
453                index: Some(*index),
454                level: None,
455                start: Vertex {
456                    x: sx,
457                    y: sy,
458                    radius: 0.0,
459                },
460                end: Vertex {
461                    x: ex,
462                    y: ey,
463                    radius: 0.0,
464                },
465                width: line_width,
466                is_horizontal_line: false,
467                is_vertical_line: false,
468                is_square: false,
469            });
470        }
471
472        *index += 1;
473        let art_bbox = BoundingBox::new(Some(page_number), min_x, min_y, max_x, max_y);
474        line_art_chunks.push(LineArtChunk {
475            bbox: art_bbox,
476            index: Some(*index),
477            level: None,
478            line_chunks: art_lines,
479        });
480    }
481}
482
483/// Try to classify 4 line segments as a rectangle.
484fn try_classify_rectangle(
485    segments: &[PathSegment],
486    _line_width: f64,
487    page_number: u32,
488    index: &mut u32,
489) -> Option<LineChunk> {
490    let mut min_x = f64::MAX;
491    let mut min_y = f64::MAX;
492    let mut max_x = f64::MIN;
493    let mut max_y = f64::MIN;
494
495    for seg in segments {
496        if let PathSegment::Line { x1, y1, x2, y2 } = seg {
497            min_x = min_x.min(*x1).min(*x2);
498            min_y = min_y.min(*y1).min(*y2);
499            max_x = max_x.max(*x1).max(*x2);
500            max_y = max_y.max(*y1).max(*y2);
501        } else {
502            return None;
503        }
504    }
505
506    let w = max_x - min_x;
507    let h = max_y - min_y;
508
509    if w < MIN_LINE_WIDTH || h < MIN_LINE_WIDTH {
510        return None;
511    }
512
513    // Determine if it's a thin rectangle (line) or a square-like shape
514    let is_square = (w - h).abs() / w.max(h) < 0.3;
515
516    *index += 1;
517    Some(LineChunk {
518        bbox: BoundingBox::new(Some(page_number), min_x, min_y, max_x, max_y),
519        index: Some(*index),
520        level: None,
521        start: Vertex {
522            x: min_x,
523            y: min_y,
524            radius: 0.0,
525        },
526        end: Vertex {
527            x: max_x,
528            y: max_y,
529            radius: 0.0,
530        },
531        width: w.min(h),
532        is_horizontal_line: w > h * LINE_ASPECT_RATIO,
533        is_vertical_line: h > w * LINE_ASPECT_RATIO,
534        is_square,
535    })
536}
537
538fn get_number(obj: &Object) -> Option<f64> {
539    match obj {
540        Object::Integer(i) => Some(*i as f64),
541        Object::Real(f) => Some(*f),
542        _ => None,
543    }
544}
545
#[cfg(test)]
mod tests {
    use super::*;
    use lopdf::{content::Content, content::Operation, dictionary, Stream};

    /// Build a one-page document whose content stream consists of `operations`.
    ///
    /// Returns the document together with the page number and object id of its
    /// only page, as reported by `Document::get_pages`.
    fn create_doc_with_content(operations: Vec<Operation>) -> (Document, u32, lopdf::ObjectId) {
        let mut doc = Document::with_version("1.5");
        let pages_id = doc.new_object_id();

        let content = Content { operations };
        let encoded = content.encode().unwrap();
        let content_id = doc.add_object(Stream::new(dictionary! {}, encoded));

        let page_id = doc.add_object(dictionary! {
            "Type" => "Page",
            "Parent" => pages_id,
            "Contents" => content_id,
            "MediaBox" => vec![0.into(), 0.into(), 595.into(), 842.into()],
        });

        let pages = dictionary! {
            "Type" => "Pages",
            "Kids" => vec![page_id.into()],
            "Count" => 1,
        };
        doc.objects.insert(pages_id, Object::Dictionary(pages));

        let catalog_id = doc.add_object(dictionary! {
            "Type" => "Catalog",
            "Pages" => pages_id,
        });
        doc.trailer.set("Root", catalog_id);

        let pages_map = doc.get_pages();
        let (&page_num, &pid) = pages_map.iter().next().unwrap();
        (doc, page_num, pid)
    }

    #[test]
    fn test_empty_page_no_lines() {
        let (doc, page_num, pid) = create_doc_with_content(vec![]);
        let (lines, arts) = extract_line_chunks(&doc, page_num, pid).unwrap();
        assert!(lines.is_empty());
        assert!(arts.is_empty());
    }

    #[test]
    fn test_horizontal_line() {
        let ops = vec![
            Operation::new("w", vec![Object::Real(1.0)]),
            Operation::new("m", vec![72.into(), 400.into()]),
            Operation::new("l", vec![500.into(), 400.into()]),
            Operation::new("S", vec![]),
        ];
        let (doc, page_num, pid) = create_doc_with_content(ops);
        let (lines, arts) = extract_line_chunks(&doc, page_num, pid).unwrap();
        assert_eq!(lines.len(), 1);
        assert!(lines[0].is_horizontal_line);
        assert!(!lines[0].is_vertical_line);
        assert!(arts.is_empty());
    }

    #[test]
    fn test_vertical_line() {
        let ops = vec![
            Operation::new("w", vec![Object::Real(1.0)]),
            Operation::new("m", vec![200.into(), 100.into()]),
            Operation::new("l", vec![200.into(), 700.into()]),
            Operation::new("S", vec![]),
        ];
        let (doc, page_num, pid) = create_doc_with_content(ops);
        let (lines, arts) = extract_line_chunks(&doc, page_num, pid).unwrap();
        assert_eq!(lines.len(), 1);
        assert!(!lines[0].is_horizontal_line);
        assert!(lines[0].is_vertical_line);
        // A path classified as a plain line must not also be reported as line art.
        assert!(arts.is_empty());
    }

    #[test]
    fn test_rectangle() {
        let ops = vec![
            Operation::new("w", vec![Object::Real(1.0)]),
            Operation::new(
                "re",
                vec![
                    Object::Real(100.0),
                    Object::Real(200.0),
                    Object::Real(300.0),
                    Object::Real(400.0),
                ],
            ),
            Operation::new("S", vec![]),
        ];
        let (doc, page_num, pid) = create_doc_with_content(ops);
        let (lines, _arts) = extract_line_chunks(&doc, page_num, pid).unwrap();
        // A stroked rectangle must yield at least one line chunk.
        assert!(!lines.is_empty());
    }

    #[test]
    fn test_close_and_stroke() {
        let ops = vec![
            Operation::new("w", vec![Object::Real(1.0)]),
            Operation::new("m", vec![72.into(), 400.into()]),
            Operation::new("l", vec![500.into(), 400.into()]),
            Operation::new("l", vec![500.into(), 410.into()]),
            Operation::new("s", vec![]), // close-and-stroke
        ];
        let (doc, page_num, pid) = create_doc_with_content(ops);
        let (lines, arts) = extract_line_chunks(&doc, page_num, pid).unwrap();
        // The closed three-segment path must be reported one way or the other.
        assert!(!lines.is_empty() || !arts.is_empty());
    }
}