Skip to main content

folio_cos/
document.rs

1//! CosDoc — low-level PDF document access.
2//!
3//! Provides access to the PDF's object graph via the cross-reference table.
4
5use crate::object::{ObjectId, PdfObject, PdfStream};
6use crate::parser;
7use crate::serialize;
8use crate::tokenizer::{Token, Tokenizer};
9use crate::xref::{self, XrefEntry, XrefTable};
10use folio_core::{FolioError, Result};
11use indexmap::IndexMap;
12use std::collections::HashMap;
13
14/// A low-level PDF document providing access to the COS object graph.
15pub struct CosDoc {
16    /// The raw PDF data (for reading existing documents).
17    data: Vec<u8>,
18    /// The cross-reference table.
19    xref: XrefTable,
20    /// Cache of already-parsed objects.
21    object_cache: HashMap<u32, PdfObject>,
22    /// Newly created or modified objects (not yet saved).
23    modified_objects: HashMap<u32, PdfObject>,
24    /// Next available object number.
25    next_obj_num: u32,
26    /// Whether the document has been modified.
27    is_modified: bool,
28}
29
30impl CosDoc {
31    /// Open a PDF document from raw bytes.
32    pub fn open(data: Vec<u8>) -> Result<Self> {
33        // Verify PDF header
34        if !data.starts_with(b"%PDF-") {
35            return Err(FolioError::Parse {
36                offset: 0,
37                message: "Not a PDF file (missing %PDF- header)".into(),
38            });
39        }
40
41        // Parse cross-reference table(s)
42        let xref = xref::parse_all_xrefs(&data)?;
43
44        let next_obj_num = xref
45            .trailer
46            .get(b"Size".as_slice())
47            .and_then(|o| o.as_i64())
48            .unwrap_or(1) as u32;
49
50        Ok(Self {
51            data,
52            xref,
53            object_cache: HashMap::new(),
54            modified_objects: HashMap::new(),
55            next_obj_num,
56            is_modified: false,
57        })
58    }
59
60    /// Open a PDF document from a file path.
61    pub fn open_file(path: &str) -> Result<Self> {
62        let data = std::fs::read(path)?;
63        Self::open(data)
64    }
65
66    /// Create a new empty PDF document.
67    pub fn new() -> Self {
68        let mut trailer = IndexMap::new();
69        trailer.insert(b"Size".to_vec(), PdfObject::Integer(1));
70
71        Self {
72            data: Vec::new(),
73            xref: XrefTable {
74                entries: IndexMap::new(),
75                trailer,
76            },
77            object_cache: HashMap::new(),
78            modified_objects: HashMap::new(),
79            next_obj_num: 1,
80            is_modified: true,
81        }
82    }
83
84    /// Get the trailer dictionary.
85    pub fn trailer(&self) -> &IndexMap<Vec<u8>, PdfObject> {
86        &self.xref.trailer
87    }
88
89    /// Get a mutable reference to the trailer dictionary.
90    pub fn trailer_mut(&mut self) -> &mut IndexMap<Vec<u8>, PdfObject> {
91        &mut self.xref.trailer
92    }
93
94    /// Get an object by its object number.
95    ///
96    /// Returns the object directly (resolves the xref entry to load from file).
97    pub fn get_object(&mut self, obj_num: u32) -> Result<Option<&PdfObject>> {
98        // Check modified objects first
99        if self.modified_objects.contains_key(&obj_num) {
100            return Ok(self.modified_objects.get(&obj_num));
101        }
102
103        // Check cache
104        if self.object_cache.contains_key(&obj_num) {
105            return Ok(self.object_cache.get(&obj_num));
106        }
107
108        // Load from xref
109        let entry = match self.xref.entries.get(&obj_num) {
110            Some(e) => *e,
111            None => return Ok(None),
112        };
113
114        match entry {
115            XrefEntry::InUse { offset, .. } => {
116                let (_id, obj) = parser::parse_indirect_object_at(&self.data, offset as usize)?;
117                self.object_cache.insert(obj_num, obj);
118                Ok(self.object_cache.get(&obj_num))
119            }
120            XrefEntry::Free { .. } => Ok(None),
121            XrefEntry::Compressed { stream_obj, .. } => {
122                // Object is stored in an Object Stream (/Type /ObjStm).
123                // We need to load the stream, decompress it, and extract the object.
124                self.load_object_stream(stream_obj)?;
125                Ok(self.object_cache.get(&obj_num))
126            }
127        }
128    }
129
130    /// Resolve a PdfObject::Reference to the referenced object.
131    /// Returns the object itself if it's not a reference.
132    pub fn resolve(&mut self, obj: &PdfObject) -> Result<PdfObject> {
133        match obj {
134            PdfObject::Reference(id) => match self.get_object(id.num)? {
135                Some(resolved) => Ok(resolved.clone()),
136                None => Ok(PdfObject::Null),
137            },
138            _ => Ok(obj.clone()),
139        }
140    }
141
142    /// Load and decompress an Object Stream (/Type /ObjStm), caching all
143    /// contained objects into `object_cache`.
144    ///
145    /// An object stream packs multiple non-stream objects into a single stream.
146    /// Format: the stream data begins with N pairs of "obj_num offset" integers,
147    /// followed by the serialized objects. /First gives the byte offset in the
148    /// decoded data where the objects begin (after the integer pairs).
149    fn load_object_stream(&mut self, stream_obj_num: u32) -> Result<()> {
150        // Avoid infinite recursion: if we're already loading this stream, bail
151        if self.object_cache.contains_key(&stream_obj_num) {
152            return Ok(());
153        }
154
155        // Load the object stream itself (it must be a regular InUse object)
156        let entry = match self.xref.entries.get(&stream_obj_num) {
157            Some(XrefEntry::InUse { offset, .. }) => *offset,
158            _ => {
159                return Err(FolioError::InvalidObject(format!(
160                    "Object stream {} not found or not InUse",
161                    stream_obj_num
162                )));
163            }
164        };
165
166        let (_id, stream_obj) = parser::parse_indirect_object_at(&self.data, entry as usize)?;
167        let stream = match &stream_obj {
168            PdfObject::Stream(s) => s,
169            _ => {
170                return Err(FolioError::InvalidObject(format!(
171                    "Object {} is not a stream (expected ObjStm)",
172                    stream_obj_num
173                )));
174            }
175        };
176
177        // Cache the stream object itself
178        self.object_cache.insert(stream_obj_num, stream_obj.clone());
179
180        // Get /N (number of objects) and /First (byte offset of first object)
181        let n = stream
182            .dict
183            .get(b"N".as_slice())
184            .and_then(|o| o.as_i64())
185            .unwrap_or(0) as usize;
186        let first = stream
187            .dict
188            .get(b"First".as_slice())
189            .and_then(|o| o.as_i64())
190            .unwrap_or(0) as usize;
191
192        // Decode the stream data
193        let decoded = self.decode_stream(stream)?;
194
195        if decoded.is_empty() || n == 0 {
196            return Ok(());
197        }
198
199        // Parse the N pairs of (obj_num, offset) from the beginning of decoded data
200        let header = &decoded[..first.min(decoded.len())];
201        let mut tokenizer = Tokenizer::new_at(header, 0);
202        let mut obj_entries: Vec<(u32, usize)> = Vec::with_capacity(n);
203
204        for _ in 0..n {
205            let obj_num = match tokenizer.next_token()? {
206                Some(Token::Integer(num)) => num as u32,
207                _ => break,
208            };
209            let offset = match tokenizer.next_token()? {
210                Some(Token::Integer(off)) => off as usize,
211                _ => break,
212            };
213            obj_entries.push((obj_num, offset));
214        }
215
216        // Parse each object from the data section (starting at /First offset)
217        let objects_data = &decoded[first.min(decoded.len())..];
218
219        for (i, &(obj_num, offset)) in obj_entries.iter().enumerate() {
220            // Determine the end of this object's data
221            let end = if i + 1 < obj_entries.len() {
222                obj_entries[i + 1].1
223            } else {
224                objects_data.len()
225            };
226
227            if offset >= objects_data.len() {
228                continue;
229            }
230
231            let obj_data = &objects_data[offset..end.min(objects_data.len())];
232            let mut obj_tokenizer = Tokenizer::new_at(obj_data, 0);
233
234            match parser::parse_object(&mut obj_tokenizer) {
235                Ok(Some(obj)) => {
236                    self.object_cache.insert(obj_num, obj);
237                }
238                Ok(None) => {
239                    self.object_cache.insert(obj_num, PdfObject::Null);
240                }
241                Err(e) => {
242                    log::warn!(
243                        "Failed to parse object {} from ObjStm {}: {}",
244                        obj_num,
245                        stream_obj_num,
246                        e
247                    );
248                }
249            }
250        }
251
252        Ok(())
253    }
254
255    /// Create a new indirect object and return its ObjectId.
256    pub fn create_indirect(&mut self, obj: PdfObject) -> ObjectId {
257        let id = ObjectId::new(self.next_obj_num, 0);
258        self.modified_objects.insert(self.next_obj_num, obj);
259        self.next_obj_num += 1;
260        self.is_modified = true;
261        id
262    }
263
264    /// Update an existing indirect object.
265    pub fn update_object(&mut self, obj_num: u32, obj: PdfObject) {
266        self.modified_objects.insert(obj_num, obj);
267        self.is_modified = true;
268    }
269
270    /// Get the number of entries in the xref table.
271    pub fn xref_size(&self) -> u32 {
272        self.next_obj_num
273    }
274
275    /// Check if the document has been modified.
276    pub fn is_modified(&self) -> bool {
277        self.is_modified
278    }
279
280    /// Save the document to bytes (full save, not incremental).
281    ///
282    /// All objects are written as a flat traditional xref table,
283    /// even if the original used xref streams or object streams.
284    pub fn save_to_bytes(&mut self) -> Result<Vec<u8>> {
285        // First, eagerly load all compressed objects into cache
286        let compressed_entries: Vec<(u32, u32)> = self
287            .xref
288            .entries
289            .iter()
290            .filter_map(|(&num, entry)| match entry {
291                XrefEntry::Compressed { stream_obj, .. } => Some((num, *stream_obj)),
292                _ => None,
293            })
294            .collect();
295
296        for (_obj_num, stream_obj) in &compressed_entries {
297            if !self.object_cache.contains_key(stream_obj) {
298                let _ = self.load_object_stream(*stream_obj);
299            }
300        }
301
302        let mut objects: Vec<(ObjectId, PdfObject)> = Vec::new();
303        let mut seen = std::collections::HashSet::new();
304
305        // Collect objects from xref entries
306        for (&obj_num, entry) in &self.xref.entries {
307            if seen.contains(&obj_num) {
308                continue;
309            }
310
311            match entry {
312                XrefEntry::InUse { offset, .. } => {
313                    let obj = if let Some(modified) = self.modified_objects.get(&obj_num) {
314                        modified.clone()
315                    } else if let Some(cached) = self.object_cache.get(&obj_num) {
316                        cached.clone()
317                    } else if let Ok((_id, obj)) =
318                        parser::parse_indirect_object_at(&self.data, *offset as usize)
319                    {
320                        obj
321                    } else {
322                        continue;
323                    };
324
325                    // Skip object streams and xref streams — their contents are
326                    // written as regular objects in the flat output.
327                    let is_objstm_or_xref = obj
328                        .dict_get_name(b"Type")
329                        .is_some_and(|t| t == b"ObjStm" || t == b"XRef");
330                    if !is_objstm_or_xref {
331                        objects.push((ObjectId::new(obj_num, 0), obj));
332                        seen.insert(obj_num);
333                    }
334                }
335                XrefEntry::Compressed { .. } => {
336                    // Objects from object streams — should now be in cache
337                    if let Some(obj) = self.object_cache.get(&obj_num) {
338                        objects.push((ObjectId::new(obj_num, 0), obj.clone()));
339                        seen.insert(obj_num);
340                    }
341                }
342                XrefEntry::Free { .. } => {}
343            }
344        }
345
346        // Add newly created objects
347        for (&obj_num, obj) in &self.modified_objects {
348            if !seen.contains(&obj_num) {
349                objects.push((ObjectId::new(obj_num, 0), obj.clone()));
350            }
351        }
352
353        objects.sort_by_key(|(id, _)| id.num);
354
355        // Clean the trailer: remove xref-stream-specific keys and /Prev
356        // (since we're writing a traditional xref table, not a stream)
357        let mut clean_trailer = self.xref.trailer.clone();
358        for key in &[
359            b"Prev".as_slice(),
360            b"W".as_slice(),
361            b"Index".as_slice(),
362            b"Filter".as_slice(),
363            b"DecodeParms".as_slice(),
364            b"Length".as_slice(),
365            b"Type".as_slice(),
366            b"XRefStm".as_slice(),
367        ] {
368            clean_trailer.shift_remove(*key);
369        }
370
371        serialize::serialize_pdf(&objects, &clean_trailer)
372    }
373
374    /// Save the document to a file.
375    pub fn save_to_file(&mut self, path: &str) -> Result<()> {
376        let data = self.save_to_bytes()?;
377        std::fs::write(path, data)?;
378        Ok(())
379    }
380
381    /// Decode a stream object's data using its filters.
382    pub fn decode_stream(&self, stream: &PdfStream) -> Result<Vec<u8>> {
383        if stream.decoded {
384            return Ok(stream.data.clone());
385        }
386
387        let filter_names = self.get_stream_filters(stream);
388        let params = self.get_stream_filter_params(stream);
389
390        if filter_names.is_empty() {
391            return Ok(stream.data.clone());
392        }
393
394        folio_filters::decode_filter_chain(&filter_names, &stream.data, &params)
395    }
396
397    /// Get the filter names for a stream.
398    fn get_stream_filters(&self, stream: &PdfStream) -> Vec<Vec<u8>> {
399        match stream.dict.get(b"Filter".as_slice()) {
400            Some(PdfObject::Name(name)) => vec![name.clone()],
401            Some(PdfObject::Array(arr)) => arr
402                .iter()
403                .filter_map(|obj| obj.as_name().map(|n| n.to_vec()))
404                .collect(),
405            _ => vec![],
406        }
407    }
408
409    /// Get the decode parameters for a stream's filters.
410    fn get_stream_filter_params(
411        &self,
412        stream: &PdfStream,
413    ) -> Vec<Option<folio_filters::FilterParams>> {
414        let filters = self.get_stream_filters(stream);
415        let params_obj = stream.dict.get(b"DecodeParms".as_slice());
416
417        match params_obj {
418            Some(PdfObject::Dict(d)) => {
419                vec![Some(dict_to_filter_params(d)); filters.len().max(1)]
420            }
421            Some(PdfObject::Array(arr)) => arr
422                .iter()
423                .map(|obj| obj.as_dict().map(dict_to_filter_params))
424                .collect(),
425            _ => vec![None; filters.len()],
426        }
427    }
428}
429
430/// Convert a PDF dictionary to FilterParams.
431fn dict_to_filter_params(dict: &IndexMap<Vec<u8>, PdfObject>) -> folio_filters::FilterParams {
432    folio_filters::FilterParams {
433        predictor: dict
434            .get(b"Predictor".as_slice())
435            .and_then(|o| o.as_i64())
436            .unwrap_or(1) as i32,
437        colors: dict
438            .get(b"Colors".as_slice())
439            .and_then(|o| o.as_i64())
440            .unwrap_or(1) as i32,
441        bits_per_component: dict
442            .get(b"BitsPerComponent".as_slice())
443            .and_then(|o| o.as_i64())
444            .unwrap_or(8) as i32,
445        columns: dict
446            .get(b"Columns".as_slice())
447            .and_then(|o| o.as_i64())
448            .unwrap_or(1) as i32,
449        early_change: dict
450            .get(b"EarlyChange".as_slice())
451            .and_then(|o| o.as_i64())
452            .unwrap_or(1) as i32,
453    }
454}
455
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly created document starts at object number 1 and is dirty.
    #[test]
    fn test_new_empty() {
        let fresh = CosDoc::new();
        assert!(fresh.is_modified());
        assert_eq!(fresh.xref_size(), 1);
    }

    /// The first created object gets id `1 0` and can be read back.
    #[test]
    fn test_create_indirect() {
        let mut doc = CosDoc::new();

        let assigned = doc.create_indirect(PdfObject::Integer(42));
        assert_eq!(assigned.num, 1);
        assert_eq!(assigned.gen_num, 0);

        let stored = doc.get_object(1).unwrap().unwrap();
        assert_eq!(stored.as_i64(), Some(42));
    }

    /// A hand-built two-object PDF opens, and its catalog is reachable
    /// through the trailer's /Root reference.
    #[test]
    fn test_open_minimal_pdf() {
        let mut doc = CosDoc::open(build_minimal_pdf()).unwrap();

        // The trailer points at object 1 as the document root.
        let root_ref = doc
            .trailer()
            .get(b"Root".as_slice())
            .unwrap()
            .as_reference()
            .unwrap();
        assert_eq!(root_ref.num, 1);

        // Object 1 is the catalog dictionary.
        let catalog = doc.get_object(1).unwrap().unwrap();
        assert_eq!(catalog.dict_get_name(b"Type"), Some(b"Catalog".as_slice()));
    }

    /// Assemble a minimal PDF: header, a catalog, an empty page tree, a
    /// classic xref table with the recorded object offsets, and a trailer.
    fn build_minimal_pdf() -> Vec<u8> {
        const OBJECTS: [&[u8]; 2] = [
            b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n",
            b"2 0 obj\n<< /Type /Pages /Kids [] /Count 0 >>\nendobj\n",
        ];

        let mut pdf = b"%PDF-1.4\n".to_vec();

        // Write each object, remembering where it starts for the xref table.
        let mut offsets = Vec::with_capacity(OBJECTS.len());
        for body in OBJECTS {
            offsets.push(pdf.len());
            pdf.extend_from_slice(body);
        }

        let xref_offset = pdf.len();
        pdf.extend_from_slice(b"xref\n0 3\n");
        pdf.extend_from_slice(b"0000000000 65535 f \n");
        for offset in &offsets {
            pdf.extend_from_slice(format!("{:010} 00000 n \n", offset).as_bytes());
        }
        pdf.extend_from_slice(b"trailer\n<< /Size 3 /Root 1 0 R >>\n");
        pdf.extend_from_slice(format!("startxref\n{}\n%%EOF\n", xref_offset).as_bytes());

        pdf
    }
}