Skip to main content

edgeparse_core/models/
chunks.rs

//! Chunk types — the atomic units of content extracted from a PDF.
2
3use serde::{Deserialize, Serialize};
4
5use super::bbox::{BoundingBox, Vertex};
6use super::enums::{PdfLayer, TextFormat, TextType};
7
8/// Atomic text fragment — one font run in the PDF content stream.
9#[derive(Debug, Clone, Serialize, Deserialize)]
10pub struct TextChunk {
11    /// Decoded Unicode text content
12    pub value: String,
13    /// Bounding box in page coordinates
14    pub bbox: BoundingBox,
15    /// Font name (base font name like "Helvetica")
16    pub font_name: String,
17    /// Font size in points (effective, after matrix transforms)
18    pub font_size: f64,
19    /// Font weight (100.0 - 900.0)
20    pub font_weight: f64,
21    /// Italic angle from font descriptor
22    pub italic_angle: f64,
23    /// Text color as hex string (e.g. "#000000")
24    pub font_color: String,
25    /// Contrast ratio against background (1.0-21.0)
26    pub contrast_ratio: f64,
27    /// X-coordinate of each glyph end position
28    pub symbol_ends: Vec<f64>,
29    /// Text baseline format (normal, superscript, subscript)
30    pub text_format: TextFormat,
31    /// Text type classification
32    pub text_type: TextType,
33    /// Processing layer that produced this chunk
34    pub pdf_layer: PdfLayer,
35    /// Whether the OCG (Optional Content Group) is visible
36    pub ocg_visible: bool,
37    /// Global index in extraction order
38    pub index: Option<usize>,
39    /// Page number (1-based)
40    pub page_number: Option<u32>,
41    /// Nesting level (from structure tree)
42    pub level: Option<String>,
43    /// Marked content identifier (from BDC/BMC operators in the content stream).
44    /// Links this chunk to a structure tree node for semantic tagging.
45    #[serde(skip_serializing_if = "Option::is_none")]
46    pub mcid: Option<i64>,
47}
48
49impl TextChunk {
50    /// Whether the entire text value is whitespace.
51    pub fn is_white_space_chunk(&self) -> bool {
52        self.value.chars().all(|c| c.is_whitespace())
53    }
54
55    /// Collapse consecutive spaces into single space.
56    pub fn compress_spaces(&mut self) {
57        let mut result = String::with_capacity(self.value.len());
58        let mut last_was_space = false;
59        for ch in self.value.chars() {
60            if ch == ' ' {
61                if !last_was_space {
62                    result.push(' ');
63                }
64                last_was_space = true;
65            } else {
66                result.push(ch);
67                last_was_space = false;
68            }
69        }
70        self.value = result;
71    }
72
73    /// Number of characters in the text.
74    pub fn text_length(&self) -> usize {
75        self.value.chars().count()
76    }
77
78    /// Average width per symbol.
79    pub fn average_symbol_width(&self) -> f64 {
80        let len = self.text_length();
81        if len == 0 {
82            return 0.0;
83        }
84        self.bbox.width() / len as f64
85    }
86
87    /// Get the X coordinate where the symbol at `idx` starts.
88    pub fn symbol_start_coordinate(&self, idx: usize) -> f64 {
89        if idx == 0 {
90            self.bbox.left_x
91        } else if idx <= self.symbol_ends.len() {
92            self.symbol_ends[idx - 1]
93        } else {
94            self.bbox.right_x
95        }
96    }
97
98    /// Get the X coordinate where the symbol at `idx` ends.
99    pub fn symbol_end_coordinate(&self, idx: usize) -> f64 {
100        if idx < self.symbol_ends.len() {
101            self.symbol_ends[idx]
102        } else {
103            self.bbox.right_x
104        }
105    }
106}
107
108/// Image bounding box — actual pixel data extracted at output time.
109#[derive(Debug, Clone, Serialize, Deserialize)]
110pub struct ImageChunk {
111    /// Bounding box in page coordinates
112    pub bbox: BoundingBox,
113    /// Global index
114    pub index: Option<u32>,
115    /// Nesting level
116    pub level: Option<String>,
117}
118
119/// Line segment — used for table border detection.
120#[derive(Debug, Clone, Serialize, Deserialize)]
121pub struct LineChunk {
122    /// Bounding box in page coordinates
123    pub bbox: BoundingBox,
124    /// Global index
125    pub index: Option<u32>,
126    /// Nesting level
127    pub level: Option<String>,
128    /// Start vertex
129    pub start: Vertex,
130    /// End vertex
131    pub end: Vertex,
132    /// Line width in points
133    pub width: f64,
134    /// Whether this is a horizontal line
135    pub is_horizontal_line: bool,
136    /// Whether this is a vertical line
137    pub is_vertical_line: bool,
138    /// Whether this is a square-like shape
139    pub is_square: bool,
140}
141
142/// Vector graphic — collection of line segments forming bullets, decorations, etc.
143#[derive(Debug, Clone, Serialize, Deserialize)]
144pub struct LineArtChunk {
145    /// Bounding box encompassing the line art
146    pub bbox: BoundingBox,
147    /// Global index
148    pub index: Option<u32>,
149    /// Nesting level
150    pub level: Option<String>,
151    /// Component line segments
152    pub line_chunks: Vec<LineChunk>,
153}
154
/// Tolerance applied when comparing sizes during line art classification.
// NOTE(review): presumably expressed in the same units as the bounding boxes — confirm at call sites.
pub const LINE_ART_SIZE_EPSILON: f64 = 1.0;
157
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a `TextChunk` holding `value` with fixed, unremarkable metadata
    /// and an empty `symbol_ends` list.
    ///
    /// `test_average_symbol_width` relies on the bounding box built here
    /// reporting a width of 100.
    fn make_text_chunk(value: &str) -> TextChunk {
        TextChunk {
            value: value.to_string(),
            bbox: BoundingBox::new(Some(1), 0.0, 0.0, 100.0, 12.0),
            font_name: "Helvetica".to_string(),
            font_size: 12.0,
            font_weight: 400.0,
            italic_angle: 0.0,
            font_color: "#000000".to_string(),
            contrast_ratio: 21.0,
            symbol_ends: vec![],
            text_format: TextFormat::Normal,
            text_type: TextType::Regular,
            pdf_layer: PdfLayer::Main,
            ocg_visible: true,
            index: None,
            page_number: Some(1),
            level: None,
            mcid: None,
        }
    }

    /// Float comparison helper for coordinates that are passed through unchanged.
    fn approx_eq(a: f64, b: f64) -> bool {
        (a - b).abs() < 1e-9
    }

    #[test]
    fn test_is_white_space_chunk() {
        assert!(make_text_chunk("   ").is_white_space_chunk());
        assert!(make_text_chunk("\t\n ").is_white_space_chunk());
        assert!(!make_text_chunk("hello").is_white_space_chunk());
        assert!(!make_text_chunk("  x ").is_white_space_chunk());
        // An empty value has no non-whitespace character either.
        assert!(make_text_chunk("").is_white_space_chunk());
    }

    #[test]
    fn test_compress_spaces() {
        let mut chunk = make_text_chunk("hello   world   test");
        chunk.compress_spaces();
        assert_eq!(chunk.value, "hello world test");
    }

    #[test]
    fn test_compress_spaces_edge_cases() {
        // Leading/trailing runs shrink to one space; tabs are not touched.
        let mut chunk = make_text_chunk("  a \t  b  ");
        chunk.compress_spaces();
        assert_eq!(chunk.value, " a \t b ");

        let mut empty = make_text_chunk("");
        empty.compress_spaces();
        assert_eq!(empty.value, "");
    }

    #[test]
    fn test_text_length() {
        assert_eq!(make_text_chunk("hello").text_length(), 5);
        assert_eq!(make_text_chunk("").text_length(), 0);
        // Length is counted in chars, not UTF-8 bytes.
        assert_eq!(make_text_chunk("héllo").text_length(), 5);
    }

    #[test]
    fn test_average_symbol_width() {
        let chunk = make_text_chunk("hello");
        assert!((chunk.average_symbol_width() - 20.0).abs() < 0.01);
    }

    #[test]
    fn test_average_symbol_width_empty_text() {
        // Empty text must short-circuit to 0.0 instead of dividing by zero.
        assert!(approx_eq(make_text_chunk("").average_symbol_width(), 0.0));
    }

    #[test]
    fn test_symbol_start_coordinate() {
        let mut chunk = make_text_chunk("abc");
        chunk.symbol_ends = vec![10.0, 25.0, 40.0];
        let left = chunk.bbox.left_x;
        let right = chunk.bbox.right_x;

        // First symbol starts at the box's left edge.
        assert!(approx_eq(chunk.symbol_start_coordinate(0), left));
        // Later symbols start where the previous one ends.
        assert!(approx_eq(chunk.symbol_start_coordinate(1), 10.0));
        assert!(approx_eq(chunk.symbol_start_coordinate(3), 40.0));
        // Past the recorded glyph ends, fall back to the right edge.
        assert!(approx_eq(chunk.symbol_start_coordinate(4), right));
    }

    #[test]
    fn test_symbol_end_coordinate() {
        let mut chunk = make_text_chunk("abc");
        chunk.symbol_ends = vec![10.0, 25.0, 40.0];
        let right = chunk.bbox.right_x;

        assert!(approx_eq(chunk.symbol_end_coordinate(0), 10.0));
        assert!(approx_eq(chunk.symbol_end_coordinate(2), 40.0));
        // Out-of-range index falls back to the right edge.
        assert!(approx_eq(chunk.symbol_end_coordinate(3), right));
    }

    #[test]
    fn test_symbol_coordinates_without_symbol_ends() {
        let chunk = make_text_chunk("abc");
        let left = chunk.bbox.left_x;
        let right = chunk.bbox.right_x;

        assert!(approx_eq(chunk.symbol_start_coordinate(0), left));
        assert!(approx_eq(chunk.symbol_start_coordinate(1), right));
        assert!(approx_eq(chunk.symbol_end_coordinate(0), right));
    }
}