Skip to main content

pdfluent_lopdf/
object_stream.rs

1use crate::parser::{self, ParserInput};
2use crate::{Document, Error, Object, ObjectId, Result, Stream};
3use std::collections::BTreeMap;
4use std::num::TryFromIntError;
5use std::str::FromStr;
6
7use log::warn;
8#[cfg(feature = "rayon")]
9use rayon::prelude::*;
10
11#[derive(Debug)]
12pub struct ObjectStream {
13    pub objects: BTreeMap<ObjectId, Object>,
14    max_objects: usize,
15    compression_level: u32,
16}
17
/// Collects the settings for a new, initially empty `ObjectStream`.
#[derive(Debug, Clone)]
pub struct ObjectStreamBuilder {
    /// Capacity handed to the built stream (maximum number of objects).
    max_objects: usize,
    /// Compression level handed to the built stream.
    compression_level: u32,
}
23
/// Plain-data settings describing how objects should be packed into object
/// streams.
// NOTE(review): nothing in this file reads this type; presumably it is consumed
// by the document-saving code elsewhere in the crate — confirm.
#[derive(Debug, Clone)]
pub struct ObjectStreamConfig {
    /// Maximum number of objects placed in a single object stream.
    pub max_objects_per_stream: usize,
    /// Compression level to use for the generated streams.
    pub compression_level: u32,
}
29
30impl Default for ObjectStreamConfig {
31    fn default() -> Self {
32        Self {
33            max_objects_per_stream: 100,
34            compression_level: 6,
35        }
36    }
37}
38
39impl ObjectStream {
40    /// Parse an existing object stream
41    pub fn new(stream: &mut Stream) -> Result<ObjectStream> {
42        let _ = stream.decompress();
43
44        if stream.content.is_empty() {
45            return Ok(ObjectStream {
46                objects: BTreeMap::new(),
47                max_objects: 100,
48                compression_level: 6,
49            });
50        }
51
52        let first_offset = stream
53            .dict
54            .get(b"First")
55            .and_then(Object::as_i64)?
56            .try_into()
57            .map_err(|e: TryFromIntError| Error::NumericCast(e.to_string()))?;
58        let index_block = stream
59            .content
60            .get(..first_offset)
61            .ok_or(Error::InvalidOffset(first_offset))?;
62
63        let numbers_str = std::str::from_utf8(index_block)
64            .map_err(|e| Error::InvalidObjectStream(e.to_string()))?;
65        let numbers: Vec<_> = numbers_str
66            .split_whitespace()
67            .map(|number| u32::from_str(number).ok())
68            .collect();
69        let len = numbers.len() / 2 * 2; // Ensure only pairs.
70
71        let n = stream.dict.get(b"N").and_then(Object::as_i64)?;
72        if numbers.len().try_into().ok() != n.checked_mul(2) {
73            warn!("object stream: the object stream dictionary specifies a wrong number of objects")
74        }
75
76        let chunks_filter_map = |chunk: &[_]| {
77            let id = chunk[0]?;
78            let offset = first_offset + chunk[1]? as usize;
79
80            if offset >= stream.content.len() {
81                warn!("out-of-bounds offset in object stream");
82                return None;
83            }
84            let object = parser::direct_object(ParserInput::new_extra(
85                &stream.content[offset..],
86                "direct object",
87            ))?;
88
89            Some(((id, 0), object))
90        };
91        #[cfg(feature = "rayon")]
92        let objects = numbers[..len]
93            .par_chunks(2)
94            .filter_map(chunks_filter_map)
95            .collect();
96        #[cfg(not(feature = "rayon"))]
97        let objects = numbers[..len]
98            .chunks(2)
99            .filter_map(chunks_filter_map)
100            .collect();
101
102        Ok(ObjectStream {
103            objects,
104            max_objects: 100,
105            compression_level: 6,
106        })
107    }
108
109    /// Create a builder for constructing new object streams
110    pub fn builder() -> ObjectStreamBuilder {
111        ObjectStreamBuilder {
112            max_objects: 100,
113            compression_level: 6,
114        }
115    }
116
117    /// Add an object to the stream
118    pub fn add_object(&mut self, id: ObjectId, obj: Object) -> Result<()> {
119        // Check if object can be added to stream
120        if matches!(obj, Object::Stream(_)) {
121            return Err(Error::InvalidObjectStream(
122                "Stream objects cannot be stored in object streams".into(),
123            ));
124        }
125
126        // Check capacity
127        if self.objects.len() >= self.max_objects {
128            return Err(Error::InvalidObjectStream(format!(
129                "Object stream has reached maximum capacity of {} objects",
130                self.max_objects
131            )));
132        }
133
134        self.objects.insert(id, obj);
135        Ok(())
136    }
137
138    /// Get the number of objects in the stream
139    pub fn object_count(&self) -> usize {
140        self.objects.len()
141    }
142
143    /// Build the stream content in the format required by PDF spec
144    pub fn build_stream_content(&self) -> Result<Vec<u8>> {
145        if self.objects.is_empty() {
146            return Ok(Vec::new());
147        }
148
149        // Sort objects by ID for consistent output
150        let mut sorted_objects: Vec<_> = self.objects.iter().collect();
151        sorted_objects.sort_by_key(|(id, _)| *id);
152
153        // First build the offset table to know its size
154        let mut offset_entries = Vec::new();
155        let mut current_offset = 0;
156
157        for ((obj_num, _gen), obj) in &sorted_objects {
158            // Store the object number and its offset
159            offset_entries.push(format!("{obj_num} {current_offset}"));
160
161            // Calculate size of this object's serialization
162            let mut obj_bytes = Vec::new();
163            crate::writer::Writer::write_object(&mut obj_bytes, obj)?;
164            current_offset += obj_bytes.len() + 1; // +1 for space separator
165        }
166
167        // Build the complete offset table with proper spacing
168        let offset_table = offset_entries.join(" ") + " ";
169
170        // Now build the final content
171        let mut content = Vec::new();
172        content.extend_from_slice(offset_table.as_bytes());
173
174        // Add serialized objects with space separators
175        for ((_, _), obj) in &sorted_objects {
176            let mut obj_bytes = Vec::new();
177            crate::writer::Writer::write_object(&mut obj_bytes, obj)?;
178            content.extend_from_slice(&obj_bytes);
179            content.push(b' '); // Space separator between objects
180        }
181
182        Ok(content)
183    }
184
185    /// Convert to a Stream object ready for insertion into a PDF
186    pub fn to_stream_object(&self) -> Result<Stream> {
187        let content = self.build_stream_content()?;
188
189        // Calculate where the first object starts
190        // We need to find the size of the offset table
191        let mut sorted_objects: Vec<_> = self.objects.iter().collect();
192        sorted_objects.sort_by_key(|(id, _)| *id);
193
194        // Build the offset entries to calculate exact size
195        let mut offset_entries = Vec::new();
196        let mut current_offset = 0;
197
198        for ((obj_num, _gen), obj) in &sorted_objects {
199            offset_entries.push(format!("{obj_num} {current_offset}"));
200
201            // Calculate size of this object's serialization
202            let mut obj_bytes = Vec::new();
203            crate::writer::Writer::write_object(&mut obj_bytes, obj)?;
204            current_offset += obj_bytes.len() + 1; // +1 for space separator
205        }
206
207        // The offset table is joined with spaces and has a trailing space
208        let offset_table = offset_entries.join(" ") + " ";
209        let first_offset = offset_table.len();
210
211        let dict = dictionary! {
212            "Type" => "ObjStm",
213            "N" => self.objects.len() as i64,
214            "First" => first_offset as i64,
215        };
216
217        let mut stream = Stream::new(dict, content);
218
219        // Apply compression - object streams should always be compressed
220        if self.compression_level > 0 {
221            // Force compression by setting Filter directly
222            use flate2::Compression;
223            use flate2::write::ZlibEncoder;
224            use std::io::prelude::*;
225
226            let compression = match self.compression_level {
227                0 => Compression::none(),
228                1..=3 => Compression::fast(),
229                4..=6 => Compression::default(),
230                _ => Compression::best(),
231            };
232
233            let mut encoder = ZlibEncoder::new(Vec::new(), compression);
234            encoder.write_all(&stream.content)?;
235            let compressed = encoder.finish()?;
236
237            stream.dict.set("Filter", "FlateDecode");
238            stream.set_content(compressed);
239        }
240
241        Ok(stream)
242    }
243
244    /// Check if an object can be compressed into an object stream
245    pub fn can_be_compressed(id: ObjectId, obj: &Object, doc: &Document) -> bool {
246        // Rule 1: Stream objects cannot be compressed
247        if matches!(obj, Object::Stream(_)) {
248            return false;
249        }
250
251        // Rule 2: Objects with non-zero generation cannot be compressed
252        if id.1 != 0 {
253            return false;
254        }
255
256        // Rule 3: Only encryption dictionary cannot be compressed from trailer references
257        if let Ok(Object::Reference(encrypt_ref)) = doc.trailer.get(b"Encrypt") {
258            if id == *encrypt_ref {
259                return false;
260            }
261        }
262
263        // Rule 4: Specific object types that cannot be compressed
264        if let Object::Dictionary(dict) = obj {
265            if let Ok(type_obj) = dict.get(b"Type") {
266                if let Ok(type_name) = type_obj.as_name() {
267                    match type_name {
268                        // Cross-reference streams and object streams cannot be compressed
269                        b"XRef" => return false,
270                        b"ObjStm" => return false,
271
272                        // Catalog can only be excluded in linearized PDFs
273                        b"Catalog"
274                            // Check if PDF is linearized
275                            if Self::is_linearized(doc) => {
276                                return false;
277                            }
278
279                        // Page, Pages, and all other types CAN be compressed
280                        _ => {}
281                    }
282                }
283            }
284        }
285
286        // Default: Allow compression
287        true
288    }
289
290    /// Check if a PDF document is linearized
291    fn is_linearized(doc: &Document) -> bool {
292        // In a linearized PDF, the first object after the header should be a
293        // linearization dictionary with /Linearized entry
294        // For simplicity, we check if any object has a /Linearized entry
295        for obj in doc.objects.values() {
296            if let Object::Dictionary(dict) = obj {
297                if dict.has(b"Linearized") {
298                    return true;
299                }
300            }
301        }
302        false
303    }
304}
305
306impl ObjectStreamBuilder {
307    /// Set the maximum number of objects per stream
308    pub fn max_objects(mut self, max: usize) -> Self {
309        self.max_objects = max;
310        self
311    }
312
313    /// Set the compression level (0-9)
314    pub fn compression_level(mut self, level: u32) -> Self {
315        self.compression_level = level;
316        self
317    }
318
319    /// Build the ObjectStream
320    pub fn build(self) -> ObjectStream {
321        ObjectStream {
322            objects: BTreeMap::new(),
323            max_objects: self.max_objects,
324            compression_level: self.compression_level,
325        }
326    }
327
328    /// Get the current max_objects setting
329    pub fn get_max_objects(&self) -> usize {
330        self.max_objects
331    }
332
333    /// Get the current compression_level setting
334    pub fn get_compression_level(&self) -> u32 {
335        self.compression_level
336    }
337}