Skip to main content

lopdf/
object_stream.rs

1use crate::parser;
2use crate::{Document, Error, Object, ObjectId, Result, Stream};
3use std::collections::BTreeMap;
4use std::num::TryFromIntError;
5use std::str::FromStr;
6
7use log::warn;
8#[cfg(feature = "rayon")]
9use rayon::prelude::*;
10
/// A collection of PDF objects keyed by [`ObjectId`], together with the
/// settings used when the collection is written back out as a stream.
#[derive(Debug)]
pub struct ObjectStream {
    /// Objects held by this stream, ordered by object id.
    pub objects: BTreeMap<ObjectId, Object>,
    /// Upper bound on the number of entries `add_object` will accept.
    max_objects: usize,
    /// Compression level consulted by `to_stream_object`; `0` leaves the
    /// content uncompressed.
    compression_level: u32,
}
17
/// Builder for an empty [`ObjectStream`]; obtained from `ObjectStream::builder`.
#[derive(Debug, Clone)]
pub struct ObjectStreamBuilder {
    /// Value copied into the built stream's `max_objects`.
    max_objects: usize,
    /// Value copied into the built stream's `compression_level`.
    compression_level: u32,
}
23
/// Settings describing how object streams should be produced.
///
/// NOTE(review): nothing in this file reads this type; presumably it is
/// consumed by the document-saving code elsewhere in the crate — confirm.
#[derive(Debug, Clone)]
pub struct ObjectStreamConfig {
    /// Maximum number of objects to place in a single object stream.
    pub max_objects_per_stream: usize,
    /// Compression level to use for generated object streams.
    pub compression_level: u32,
}

impl Default for ObjectStreamConfig {
    /// Returns the default settings: 100 objects per stream, compression level 6.
    fn default() -> Self {
        let (max_objects_per_stream, compression_level) = (100, 6);
        ObjectStreamConfig {
            max_objects_per_stream,
            compression_level,
        }
    }
}
38
39impl ObjectStream {
40    /// Parse an existing object stream
41    pub fn new(stream: &mut Stream) -> Result<ObjectStream> {
42        let _ = stream.decompress();
43
44        if stream.content.is_empty() {
45            return Ok(ObjectStream {
46                objects: BTreeMap::new(),
47                max_objects: 100,
48                compression_level: 6,
49            });
50        }
51
52        let first_offset = stream
53            .dict
54            .get(b"First")
55            .and_then(Object::as_i64)?
56            .try_into()
57            .map_err(|e: TryFromIntError| Error::NumericCast(e.to_string()))?;
58        let index_block = stream
59            .content
60            .get(..first_offset)
61            .ok_or(Error::InvalidOffset(first_offset))?;
62
63        let numbers_str = std::str::from_utf8(index_block).map_err(|e| Error::InvalidObjectStream(e.to_string()))?;
64        let numbers: Vec<_> = numbers_str
65            .split_whitespace()
66            .map(|number| u32::from_str(number).ok())
67            .collect();
68        let len = numbers.len() / 2 * 2; // Ensure only pairs.
69
70        let n = stream.dict.get(b"N").and_then(Object::as_i64)?;
71        if numbers.len().try_into().ok() != n.checked_mul(2) {
72            warn!("object stream: the object stream dictionary specifies a wrong number of objects")
73        }
74
75        let chunks_filter_map = |chunk: &[_]| {
76            let id = chunk[0]?;
77            let offset = first_offset + chunk[1]? as usize;
78
79            if offset >= stream.content.len() {
80                warn!("out-of-bounds offset in object stream");
81                return None;
82            }
83            // Skip leading whitespace — some PDFs emit newlines before objects in ObjStm
84            let mut start = offset;
85            while start < stream.content.len() && stream.content[start].is_ascii_whitespace() {
86                start += 1;
87            }
88            if start >= stream.content.len() {
89                warn!("only whitespace after offset in object stream");
90                return None;
91            }
92            let object = parser::direct_object(&stream.content[start..])?;
93
94            Some(((id, 0), object))
95        };
96        #[cfg(feature = "rayon")]
97        let objects = numbers[..len].par_chunks(2).filter_map(chunks_filter_map).collect();
98        #[cfg(not(feature = "rayon"))]
99        let objects = numbers[..len].chunks(2).filter_map(chunks_filter_map).collect();
100
101        Ok(ObjectStream {
102            objects,
103            max_objects: 100,
104            compression_level: 6,
105        })
106    }
107
108    /// Create a builder for constructing new object streams
109    pub fn builder() -> ObjectStreamBuilder {
110        ObjectStreamBuilder {
111            max_objects: 100,
112            compression_level: 6,
113        }
114    }
115
116    /// Add an object to the stream
117    pub fn add_object(&mut self, id: ObjectId, obj: Object) -> Result<()> {
118        // Check if object can be added to stream
119        if matches!(obj, Object::Stream(_)) {
120            return Err(Error::InvalidObjectStream(
121                "Stream objects cannot be stored in object streams".into(),
122            ));
123        }
124
125        // Check capacity
126        if self.objects.len() >= self.max_objects {
127            return Err(Error::InvalidObjectStream(format!(
128                "Object stream has reached maximum capacity of {} objects",
129                self.max_objects
130            )));
131        }
132
133        self.objects.insert(id, obj);
134        Ok(())
135    }
136
137    /// Get the number of objects in the stream
138    pub fn object_count(&self) -> usize {
139        self.objects.len()
140    }
141
142    /// Build the stream content in the format required by PDF spec
143    pub fn build_stream_content(&self) -> Result<Vec<u8>> {
144        if self.objects.is_empty() {
145            return Ok(Vec::new());
146        }
147
148        // Sort objects by ID for consistent output
149        let mut sorted_objects: Vec<_> = self.objects.iter().collect();
150        sorted_objects.sort_by_key(|(id, _)| *id);
151
152        // First build the offset table to know its size
153        let mut offset_entries = Vec::new();
154        let mut current_offset = 0;
155
156        for ((obj_num, _gen), obj) in &sorted_objects {
157            // Store the object number and its offset
158            offset_entries.push(format!("{obj_num} {current_offset}"));
159
160            // Calculate size of this object's serialization
161            let mut obj_bytes = Vec::new();
162            crate::writer::Writer::write_object(&mut obj_bytes, obj)?;
163            current_offset += obj_bytes.len() + 1; // +1 for space separator
164        }
165
166        // Build the complete offset table with proper spacing
167        let offset_table = offset_entries.join(" ") + " ";
168
169        // Now build the final content
170        let mut content = Vec::new();
171        content.extend_from_slice(offset_table.as_bytes());
172
173        // Add serialized objects with space separators
174        for ((_, _), obj) in &sorted_objects {
175            let mut obj_bytes = Vec::new();
176            crate::writer::Writer::write_object(&mut obj_bytes, obj)?;
177            content.extend_from_slice(&obj_bytes);
178            content.push(b' '); // Space separator between objects
179        }
180
181        Ok(content)
182    }
183
184    /// Convert to a Stream object ready for insertion into a PDF
185    pub fn to_stream_object(&self) -> Result<Stream> {
186        let content = self.build_stream_content()?;
187
188        // Calculate where the first object starts
189        // We need to find the size of the offset table
190        let mut sorted_objects: Vec<_> = self.objects.iter().collect();
191        sorted_objects.sort_by_key(|(id, _)| *id);
192
193        // Build the offset entries to calculate exact size
194        let mut offset_entries = Vec::new();
195        let mut current_offset = 0;
196
197        for ((obj_num, _gen), obj) in &sorted_objects {
198            offset_entries.push(format!("{obj_num} {current_offset}"));
199
200            // Calculate size of this object's serialization
201            let mut obj_bytes = Vec::new();
202            crate::writer::Writer::write_object(&mut obj_bytes, obj)?;
203            current_offset += obj_bytes.len() + 1; // +1 for space separator
204        }
205
206        // The offset table is joined with spaces and has a trailing space
207        let offset_table = offset_entries.join(" ") + " ";
208        let first_offset = offset_table.len();
209
210        let dict = dictionary! {
211            "Type" => "ObjStm",
212            "N" => self.objects.len() as i64,
213            "First" => first_offset as i64,
214        };
215
216        let mut stream = Stream::new(dict, content);
217
218        // Apply compression - object streams should always be compressed
219        if self.compression_level > 0 {
220            // Force compression by setting Filter directly
221            use flate2::Compression;
222            use flate2::write::ZlibEncoder;
223            use std::io::prelude::*;
224
225            let compression = match self.compression_level {
226                0 => Compression::none(),
227                1..=3 => Compression::fast(),
228                4..=6 => Compression::default(),
229                _ => Compression::best(),
230            };
231
232            let mut encoder = ZlibEncoder::new(Vec::new(), compression);
233            encoder.write_all(&stream.content)?;
234            let compressed = encoder.finish()?;
235
236            stream.dict.set("Filter", "FlateDecode");
237            stream.set_content(compressed);
238        }
239
240        Ok(stream)
241    }
242
243    /// Check if an object can be compressed into an object stream
244    pub fn can_be_compressed(id: ObjectId, obj: &Object, doc: &Document) -> bool {
245        // Rule 1: Stream objects cannot be compressed
246        if matches!(obj, Object::Stream(_)) {
247            return false;
248        }
249
250        // Rule 2: Objects with non-zero generation cannot be compressed
251        if id.1 != 0 {
252            return false;
253        }
254
255        // Rule 3: Only encryption dictionary cannot be compressed from trailer references
256        if let Ok(Object::Reference(encrypt_ref)) = doc.trailer.get(b"Encrypt") {
257            if id == *encrypt_ref {
258                return false;
259            }
260        }
261
262        // Rule 4: Specific object types that cannot be compressed
263        if let Object::Dictionary(dict) = obj {
264            if let Ok(type_obj) = dict.get(b"Type") {
265                if let Ok(type_name) = type_obj.as_name() {
266                    match type_name {
267                        // Cross-reference streams and object streams cannot be compressed
268                        b"XRef" => return false,
269                        b"ObjStm" => return false,
270
271                        // Catalog can only be excluded in linearized PDFs
272                        b"Catalog" if Self::is_linearized(doc) => {
273                            return false;
274                        }
275                        b"Catalog" => {}
276
277                        // Page, Pages, and all other types CAN be compressed
278                        _ => {}
279                    }
280                }
281            }
282        }
283
284        // Default: Allow compression
285        true
286    }
287
288    /// Check if a PDF document is linearized
289    fn is_linearized(doc: &Document) -> bool {
290        // In a linearized PDF, the first object after the header should be a
291        // linearization dictionary with /Linearized entry
292        // For simplicity, we check if any object has a /Linearized entry
293        for obj in doc.objects.values() {
294            if let Object::Dictionary(dict) = obj {
295                if dict.has(b"Linearized") {
296                    return true;
297                }
298            }
299        }
300        false
301    }
302}
303
304impl ObjectStreamBuilder {
305    /// Set the maximum number of objects per stream
306    pub fn max_objects(mut self, max: usize) -> Self {
307        self.max_objects = max;
308        self
309    }
310
311    /// Set the compression level (0-9)
312    pub fn compression_level(mut self, level: u32) -> Self {
313        self.compression_level = level;
314        self
315    }
316
317    /// Build the ObjectStream
318    pub fn build(self) -> ObjectStream {
319        ObjectStream {
320            objects: BTreeMap::new(),
321            max_objects: self.max_objects,
322            compression_level: self.compression_level,
323        }
324    }
325
326    /// Get the current max_objects setting
327    pub fn get_max_objects(&self) -> usize {
328        self.max_objects
329    }
330
331    /// Get the current compression_level setting
332    pub fn get_compression_level(&self) -> u32 {
333        self.compression_level
334    }
335}