1use crate::object::{ObjectId, PdfObject, PdfStream};
6use crate::parser;
7use crate::serialize;
8use crate::tokenizer::{Token, Tokenizer};
9use crate::xref::{self, XrefEntry, XrefTable};
10use folio_core::{FolioError, Result};
11use indexmap::IndexMap;
12use std::collections::HashMap;
13
/// An in-memory COS (Carousel Object System) document: the low-level
/// object layer of a PDF file.
pub struct CosDoc {
    /// Raw bytes of the original file (empty for documents built via `new`).
    data: Vec<u8>,
    /// Merged cross-reference table plus the trailer dictionary.
    xref: XrefTable,
    /// Objects already parsed out of `data`, keyed by object number.
    object_cache: HashMap<u32, PdfObject>,
    /// Objects created or overwritten in memory; takes precedence over
    /// `object_cache` and the file on every lookup.
    modified_objects: HashMap<u32, PdfObject>,
    /// Object number to hand out on the next `create_indirect` call.
    next_obj_num: u32,
    /// True once any object has been created or updated since open/new.
    is_modified: bool,
}
29
30impl CosDoc {
31 pub fn open(data: Vec<u8>) -> Result<Self> {
33 if !data.starts_with(b"%PDF-") {
35 return Err(FolioError::Parse {
36 offset: 0,
37 message: "Not a PDF file (missing %PDF- header)".into(),
38 });
39 }
40
41 let xref = xref::parse_all_xrefs(&data)?;
43
44 let next_obj_num = xref
45 .trailer
46 .get(b"Size".as_slice())
47 .and_then(|o| o.as_i64())
48 .unwrap_or(1) as u32;
49
50 Ok(Self {
51 data,
52 xref,
53 object_cache: HashMap::new(),
54 modified_objects: HashMap::new(),
55 next_obj_num,
56 is_modified: false,
57 })
58 }
59
60 pub fn open_file(path: &str) -> Result<Self> {
62 let data = std::fs::read(path)?;
63 Self::open(data)
64 }
65
66 pub fn new() -> Self {
68 let mut trailer = IndexMap::new();
69 trailer.insert(b"Size".to_vec(), PdfObject::Integer(1));
70
71 Self {
72 data: Vec::new(),
73 xref: XrefTable {
74 entries: IndexMap::new(),
75 trailer,
76 },
77 object_cache: HashMap::new(),
78 modified_objects: HashMap::new(),
79 next_obj_num: 1,
80 is_modified: true,
81 }
82 }
83
    /// Borrow the trailer dictionary (keys are raw PDF name bytes,
    /// e.g. `b"Root"`, `b"Size"`).
    pub fn trailer(&self) -> &IndexMap<Vec<u8>, PdfObject> {
        &self.xref.trailer
    }
88
    /// Mutably borrow the trailer dictionary so callers can add or change
    /// entries (e.g. set `/Root`) before saving.
    pub fn trailer_mut(&mut self) -> &mut IndexMap<Vec<u8>, PdfObject> {
        &mut self.xref.trailer
    }
93
    /// Fetch object `obj_num`, parsing it from the file on first access and
    /// caching the result. Returns `Ok(None)` for free or unknown numbers.
    ///
    /// Lookup precedence: in-memory modifications, then the parse cache,
    /// then the raw file via the xref entry. A `Compressed` entry triggers
    /// expansion of its containing object stream into the cache first.
    pub fn get_object(&mut self, obj_num: u32) -> Result<Option<&PdfObject>> {
        // contains_key + get (rather than returning from a single `get`)
        // sidesteps a borrow-checker limitation: an early return of the
        // borrowed value would pin `&mut self` for the rest of the function.
        if self.modified_objects.contains_key(&obj_num) {
            return Ok(self.modified_objects.get(&obj_num));
        }

        if self.object_cache.contains_key(&obj_num) {
            return Ok(self.object_cache.get(&obj_num));
        }

        // Copy the entry out so the borrow of self.xref ends here.
        let entry = match self.xref.entries.get(&obj_num) {
            Some(e) => *e,
            None => return Ok(None),
        };

        match entry {
            XrefEntry::InUse { offset, .. } => {
                let (_id, obj) = parser::parse_indirect_object_at(&self.data, offset as usize)?;
                self.object_cache.insert(obj_num, obj);
                Ok(self.object_cache.get(&obj_num))
            }
            XrefEntry::Free { .. } => Ok(None),
            XrefEntry::Compressed { stream_obj, .. } => {
                // Expanding the container populates the cache with all of its
                // objects; the one requested may still be absent if the
                // stream is malformed, hence the plain cache lookup after.
                self.load_object_stream(stream_obj)?;
                Ok(self.object_cache.get(&obj_num))
            }
        }
    }
129
130 pub fn resolve(&mut self, obj: &PdfObject) -> Result<PdfObject> {
133 match obj {
134 PdfObject::Reference(id) => match self.get_object(id.num)? {
135 Some(resolved) => Ok(resolved.clone()),
136 None => Ok(PdfObject::Null),
137 },
138 _ => Ok(obj.clone()),
139 }
140 }
141
142 fn load_object_stream(&mut self, stream_obj_num: u32) -> Result<()> {
150 if self.object_cache.contains_key(&stream_obj_num) {
152 return Ok(());
153 }
154
155 let entry = match self.xref.entries.get(&stream_obj_num) {
157 Some(XrefEntry::InUse { offset, .. }) => *offset,
158 _ => {
159 return Err(FolioError::InvalidObject(format!(
160 "Object stream {} not found or not InUse",
161 stream_obj_num
162 )));
163 }
164 };
165
166 let (_id, stream_obj) = parser::parse_indirect_object_at(&self.data, entry as usize)?;
167 let stream = match &stream_obj {
168 PdfObject::Stream(s) => s,
169 _ => {
170 return Err(FolioError::InvalidObject(format!(
171 "Object {} is not a stream (expected ObjStm)",
172 stream_obj_num
173 )));
174 }
175 };
176
177 self.object_cache.insert(stream_obj_num, stream_obj.clone());
179
180 let n = stream
182 .dict
183 .get(b"N".as_slice())
184 .and_then(|o| o.as_i64())
185 .unwrap_or(0) as usize;
186 let first = stream
187 .dict
188 .get(b"First".as_slice())
189 .and_then(|o| o.as_i64())
190 .unwrap_or(0) as usize;
191
192 let decoded = self.decode_stream(stream)?;
194
195 if decoded.is_empty() || n == 0 {
196 return Ok(());
197 }
198
199 let header = &decoded[..first.min(decoded.len())];
201 let mut tokenizer = Tokenizer::new_at(header, 0);
202 let mut obj_entries: Vec<(u32, usize)> = Vec::with_capacity(n);
203
204 for _ in 0..n {
205 let obj_num = match tokenizer.next_token()? {
206 Some(Token::Integer(num)) => num as u32,
207 _ => break,
208 };
209 let offset = match tokenizer.next_token()? {
210 Some(Token::Integer(off)) => off as usize,
211 _ => break,
212 };
213 obj_entries.push((obj_num, offset));
214 }
215
216 let objects_data = &decoded[first.min(decoded.len())..];
218
219 for (i, &(obj_num, offset)) in obj_entries.iter().enumerate() {
220 let end = if i + 1 < obj_entries.len() {
222 obj_entries[i + 1].1
223 } else {
224 objects_data.len()
225 };
226
227 if offset >= objects_data.len() {
228 continue;
229 }
230
231 let obj_data = &objects_data[offset..end.min(objects_data.len())];
232 let mut obj_tokenizer = Tokenizer::new_at(obj_data, 0);
233
234 match parser::parse_object(&mut obj_tokenizer) {
235 Ok(Some(obj)) => {
236 self.object_cache.insert(obj_num, obj);
237 }
238 Ok(None) => {
239 self.object_cache.insert(obj_num, PdfObject::Null);
240 }
241 Err(e) => {
242 log::warn!(
243 "Failed to parse object {} from ObjStm {}: {}",
244 obj_num,
245 stream_obj_num,
246 e
247 );
248 }
249 }
250 }
251
252 Ok(())
253 }
254
255 pub fn create_indirect(&mut self, obj: PdfObject) -> ObjectId {
257 let id = ObjectId::new(self.next_obj_num, 0);
258 self.modified_objects.insert(self.next_obj_num, obj);
259 self.next_obj_num += 1;
260 self.is_modified = true;
261 id
262 }
263
    /// Overwrite (or create) object `obj_num` in memory; the change is only
    /// persisted by a later `save_to_bytes`/`save_to_file` call.
    pub fn update_object(&mut self, obj_num: u32, obj: PdfObject) {
        self.modified_objects.insert(obj_num, obj);
        self.is_modified = true;
    }
269
    /// Current xref `/Size` value: one past the highest object number this
    /// document will allocate (i.e. `next_obj_num`).
    pub fn xref_size(&self) -> u32 {
        self.next_obj_num
    }
274
    /// Whether any object has been created or updated since the document
    /// was opened (always true for documents built via `new`).
    pub fn is_modified(&self) -> bool {
        self.is_modified
    }
279
    /// Serialize the document to a complete, rewritten PDF (not an
    /// incremental update).
    ///
    /// All object streams are expanded first so every reachable object can
    /// be written as a plain indirect object; ObjStm/XRef container objects
    /// themselves are dropped from the output. Modified objects take
    /// precedence over cached and on-disk copies.
    ///
    /// NOTE(review): every written object gets generation 0 — confirm no
    /// caller depends on original generation numbers surviving a save.
    pub fn save_to_bytes(&mut self) -> Result<Vec<u8>> {
        // Collect (obj_num, container) pairs for all compressed entries so
        // we can expand each container exactly once below.
        let compressed_entries: Vec<(u32, u32)> = self
            .xref
            .entries
            .iter()
            .filter_map(|(&num, entry)| match entry {
                XrefEntry::Compressed { stream_obj, .. } => Some((num, *stream_obj)),
                _ => None,
            })
            .collect();

        for (_obj_num, stream_obj) in &compressed_entries {
            if !self.object_cache.contains_key(stream_obj) {
                // Best effort: a broken ObjStm only loses its own objects.
                let _ = self.load_object_stream(*stream_obj);
            }
        }

        let mut objects: Vec<(ObjectId, PdfObject)> = Vec::new();
        let mut seen = std::collections::HashSet::new();

        for (&obj_num, entry) in &self.xref.entries {
            if seen.contains(&obj_num) {
                continue;
            }

            match entry {
                XrefEntry::InUse { offset, .. } => {
                    // Precedence: modified > cached > parsed from the file.
                    let obj = if let Some(modified) = self.modified_objects.get(&obj_num) {
                        modified.clone()
                    } else if let Some(cached) = self.object_cache.get(&obj_num) {
                        cached.clone()
                    } else if let Ok((_id, obj)) =
                        parser::parse_indirect_object_at(&self.data, *offset as usize)
                    {
                        obj
                    } else {
                        // Unparseable object: silently dropped from output.
                        continue;
                    };

                    // Skip ObjStm/XRef containers — the output uses a classic
                    // xref table and plain objects, so they are obsolete.
                    let is_objstm_or_xref = obj
                        .dict_get_name(b"Type")
                        .is_some_and(|t| t == b"ObjStm" || t == b"XRef");
                    if !is_objstm_or_xref {
                        objects.push((ObjectId::new(obj_num, 0), obj));
                        seen.insert(obj_num);
                    }
                }
                XrefEntry::Compressed { .. } => {
                    // Expanded above; absent from the cache means the
                    // container failed to load, so the object is dropped.
                    if let Some(obj) = self.object_cache.get(&obj_num) {
                        objects.push((ObjectId::new(obj_num, 0), obj.clone()));
                        seen.insert(obj_num);
                    }
                }
                XrefEntry::Free { .. } => {}
            }
        }

        // Newly created objects have no xref entry yet; append them here.
        for (&obj_num, obj) in &self.modified_objects {
            if !seen.contains(&obj_num) {
                objects.push((ObjectId::new(obj_num, 0), obj.clone()));
            }
        }

        // HashMap iteration order above is arbitrary; sorting restores a
        // deterministic, ascending object order for the writer.
        objects.sort_by_key(|(id, _)| id.num);

        // Drop trailer keys that describe the *previous* file layout
        // (xref-stream bookkeeping and chain pointers); the serializer
        // writes fresh values for the new file.
        let mut clean_trailer = self.xref.trailer.clone();
        for key in &[
            b"Prev".as_slice(),
            b"W".as_slice(),
            b"Index".as_slice(),
            b"Filter".as_slice(),
            b"DecodeParms".as_slice(),
            b"Length".as_slice(),
            b"Type".as_slice(),
            b"XRefStm".as_slice(),
        ] {
            clean_trailer.shift_remove(*key);
        }

        serialize::serialize_pdf(&objects, &clean_trailer)
    }
373
374 pub fn save_to_file(&mut self, path: &str) -> Result<()> {
376 let data = self.save_to_bytes()?;
377 std::fs::write(path, data)?;
378 Ok(())
379 }
380
381 pub fn decode_stream(&self, stream: &PdfStream) -> Result<Vec<u8>> {
383 if stream.decoded {
384 return Ok(stream.data.clone());
385 }
386
387 let filter_names = self.get_stream_filters(stream);
388 let params = self.get_stream_filter_params(stream);
389
390 if filter_names.is_empty() {
391 return Ok(stream.data.clone());
392 }
393
394 folio_filters::decode_filter_chain(&filter_names, &stream.data, ¶ms)
395 }
396
397 fn get_stream_filters(&self, stream: &PdfStream) -> Vec<Vec<u8>> {
399 match stream.dict.get(b"Filter".as_slice()) {
400 Some(PdfObject::Name(name)) => vec![name.clone()],
401 Some(PdfObject::Array(arr)) => arr
402 .iter()
403 .filter_map(|obj| obj.as_name().map(|n| n.to_vec()))
404 .collect(),
405 _ => vec![],
406 }
407 }
408
409 fn get_stream_filter_params(
411 &self,
412 stream: &PdfStream,
413 ) -> Vec<Option<folio_filters::FilterParams>> {
414 let filters = self.get_stream_filters(stream);
415 let params_obj = stream.dict.get(b"DecodeParms".as_slice());
416
417 match params_obj {
418 Some(PdfObject::Dict(d)) => {
419 vec![Some(dict_to_filter_params(d)); filters.len().max(1)]
420 }
421 Some(PdfObject::Array(arr)) => arr
422 .iter()
423 .map(|obj| obj.as_dict().map(dict_to_filter_params))
424 .collect(),
425 _ => vec![None; filters.len()],
426 }
427 }
428}
429
430fn dict_to_filter_params(dict: &IndexMap<Vec<u8>, PdfObject>) -> folio_filters::FilterParams {
432 folio_filters::FilterParams {
433 predictor: dict
434 .get(b"Predictor".as_slice())
435 .and_then(|o| o.as_i64())
436 .unwrap_or(1) as i32,
437 colors: dict
438 .get(b"Colors".as_slice())
439 .and_then(|o| o.as_i64())
440 .unwrap_or(1) as i32,
441 bits_per_component: dict
442 .get(b"BitsPerComponent".as_slice())
443 .and_then(|o| o.as_i64())
444 .unwrap_or(8) as i32,
445 columns: dict
446 .get(b"Columns".as_slice())
447 .and_then(|o| o.as_i64())
448 .unwrap_or(1) as i32,
449 early_change: dict
450 .get(b"EarlyChange".as_slice())
451 .and_then(|o| o.as_i64())
452 .unwrap_or(1) as i32,
453 }
454}
455
#[cfg(test)]
mod tests {
    use super::*;

    // A fresh document starts numbering at 1 and is already "dirty"
    // (it has never been saved to disk).
    #[test]
    fn test_new_empty() {
        let doc = CosDoc::new();
        assert_eq!(doc.xref_size(), 1);
        assert!(doc.is_modified());
    }

    // create_indirect hands out sequential object numbers (generation 0)
    // and the stored object is immediately readable back.
    #[test]
    fn test_create_indirect() {
        let mut doc = CosDoc::new();
        let id = doc.create_indirect(PdfObject::Integer(42));
        assert_eq!(id.num, 1);
        assert_eq!(id.gen_num, 0);

        let obj = doc.get_object(1).unwrap().unwrap();
        assert_eq!(obj.as_i64(), Some(42));
    }

    // End-to-end open: a hand-built two-object PDF parses, the trailer
    // /Root reference points at object 1, and that catalog's /Type reads
    // back as /Catalog.
    #[test]
    fn test_open_minimal_pdf() {
        let pdf = build_minimal_pdf();
        let mut doc = CosDoc::open(pdf).unwrap();

        let root_ref = doc
            .trailer()
            .get(b"Root".as_slice())
            .unwrap()
            .as_reference()
            .unwrap();
        assert_eq!(root_ref.num, 1);

        let catalog = doc.get_object(1).unwrap().unwrap();
        assert_eq!(catalog.dict_get_name(b"Type"), Some(b"Catalog".as_slice()));
    }

    // Builds the smallest valid classic-xref PDF: header, catalog, empty
    // page tree, a three-entry xref table (object 0 free), trailer, and
    // startxref/%%EOF — with byte offsets computed as the buffer grows.
    fn build_minimal_pdf() -> Vec<u8> {
        let mut buf = Vec::new();
        buf.extend_from_slice(b"%PDF-1.4\n");

        let obj1_offset = buf.len();
        buf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");

        let obj2_offset = buf.len();
        buf.extend_from_slice(b"2 0 obj\n<< /Type /Pages /Kids [] /Count 0 >>\nendobj\n");

        let xref_offset = buf.len();
        buf.extend_from_slice(b"xref\n0 3\n");
        buf.extend_from_slice(b"0000000000 65535 f \n");
        buf.extend_from_slice(format!("{:010} 00000 n \n", obj1_offset).as_bytes());
        buf.extend_from_slice(format!("{:010} 00000 n \n", obj2_offset).as_bytes());
        buf.extend_from_slice(b"trailer\n<< /Size 3 /Root 1 0 R >>\n");
        buf.extend_from_slice(format!("startxref\n{}\n%%EOF\n", xref_offset).as_bytes());

        buf
    }
}