Skip to main content

justpdf_core/object/
mod.rs

1mod types;
2
3pub use types::{IndirectRef, PdfDict, PdfObject};
4
5use crate::error::{JustPdfError, Result};
6use crate::tokenizer::Tokenizer;
7use crate::tokenizer::token::{Keyword, Token};
8
9/// Parse a single PDF object from the tokenizer's current position.
10/// Does NOT handle `N M obj ... endobj` wrappers — see `parse_indirect_object`.
11pub fn parse_object(tokenizer: &mut Tokenizer<'_>) -> Result<PdfObject> {
12    let offset = tokenizer.pos();
13    let Some(token) = tokenizer.next_token()? else {
14        return Err(JustPdfError::UnexpectedEof { offset });
15    };
16
17    match token {
18        Token::Keyword(Keyword::Null) => Ok(PdfObject::Null),
19        Token::Keyword(Keyword::True) => Ok(PdfObject::Bool(true)),
20        Token::Keyword(Keyword::False) => Ok(PdfObject::Bool(false)),
21        Token::Integer(v) => {
22            // Peek ahead to check for "N M R" (indirect reference)
23            let saved = tokenizer.pos();
24            match tokenizer.next_token() {
25                Ok(Some(Token::Integer(gen_val))) => match tokenizer.next_token() {
26                    Ok(Some(Token::Keyword(Keyword::R))) => Ok(PdfObject::Reference(IndirectRef {
27                        obj_num: v as u32,
28                        gen_num: gen_val as u16,
29                    })),
30                    _ => {
31                        tokenizer.seek(saved);
32                        Ok(PdfObject::Integer(v))
33                    }
34                },
35                _ => {
36                    tokenizer.seek(saved);
37                    Ok(PdfObject::Integer(v))
38                }
39            }
40        }
41        Token::Real(v) => Ok(PdfObject::Real(v)),
42        Token::LiteralString(v) => Ok(PdfObject::String(v)),
43        Token::HexString(v) => Ok(PdfObject::String(v)),
44        Token::Name(v) => Ok(PdfObject::Name(v)),
45        Token::ArrayBegin => {
46            let mut arr = Vec::new();
47            loop {
48                let peek_pos = tokenizer.pos();
49                match tokenizer.next_token()? {
50                    Some(Token::ArrayEnd) => break,
51                    Some(_tok) => {
52                        tokenizer.seek(peek_pos);
53                        arr.push(parse_object(tokenizer)?);
54                    }
55                    None => {
56                        return Err(JustPdfError::UnexpectedEof { offset });
57                    }
58                }
59            }
60            Ok(PdfObject::Array(arr))
61        }
62        Token::DictBegin => {
63            let dict = parse_dict_body(tokenizer, offset)?;
64            Ok(PdfObject::Dict(dict))
65        }
66        _ => Err(JustPdfError::InvalidObject {
67            offset,
68            detail: format!("unexpected token: {token:?}"),
69        }),
70    }
71}
72
73/// Parse dict entries until `>>`. Assumes `<<` has already been consumed.
74fn parse_dict_body(tokenizer: &mut Tokenizer<'_>, start: usize) -> Result<PdfDict> {
75    let mut dict = PdfDict::new();
76    loop {
77        let peek_pos = tokenizer.pos();
78        match tokenizer.next_token()? {
79            Some(Token::DictEnd) => break,
80            Some(Token::Name(key)) => {
81                let value = parse_object(tokenizer)?;
82                dict.insert(key, value);
83            }
84            Some(tok) => {
85                return Err(JustPdfError::InvalidObject {
86                    offset: peek_pos,
87                    detail: format!("expected name or >> in dict, got: {tok:?}"),
88                });
89            }
90            None => {
91                return Err(JustPdfError::UnexpectedEof { offset: start });
92            }
93        }
94    }
95    Ok(dict)
96}
97
98/// Parse an indirect object: `N M obj <object> endobj`.
99/// Returns (IndirectRef, PdfObject).
100pub fn parse_indirect_object(tokenizer: &mut Tokenizer<'_>) -> Result<(IndirectRef, PdfObject)> {
101    let offset = tokenizer.pos();
102
103    let obj_num = match tokenizer.next_token()? {
104        Some(Token::Integer(n)) => n as u32,
105        _ => {
106            return Err(JustPdfError::InvalidObject {
107                offset,
108                detail: "expected object number".into(),
109            });
110        }
111    };
112
113    let gen_num = match tokenizer.next_token()? {
114        Some(Token::Integer(n)) => n as u16,
115        _ => {
116            return Err(JustPdfError::InvalidObject {
117                offset,
118                detail: "expected generation number".into(),
119            });
120        }
121    };
122
123    match tokenizer.next_token()? {
124        Some(Token::Keyword(Keyword::Obj)) => {}
125        _ => {
126            return Err(JustPdfError::InvalidObject {
127                offset,
128                detail: "expected 'obj' keyword".into(),
129            });
130        }
131    }
132
133    let obj = parse_object(tokenizer)?;
134
135    // Check for stream
136    let saved = tokenizer.pos();
137    let result = match tokenizer.next_token()? {
138        Some(Token::Keyword(Keyword::Stream)) => {
139            // Stream: the dict we just parsed must be the stream dict
140            let dict = match obj {
141                PdfObject::Dict(d) => d,
142                _ => {
143                    return Err(JustPdfError::InvalidObject {
144                        offset,
145                        detail: "stream must be preceded by a dictionary".into(),
146                    });
147                }
148            };
149
150            let stream_data = read_stream_data(tokenizer, &dict, offset)?;
151            let stream_obj = PdfObject::Stream {
152                dict,
153                data: stream_data,
154            };
155
156            // Consume endstream
157            // (read_stream_data positions us after the data, now skip to endobj)
158            // The 'endstream' keyword may have been consumed by position-based reading
159            // Try to consume endstream if present
160            let saved2 = tokenizer.pos();
161            if let Ok(Some(Token::Keyword(Keyword::EndStream))) = tokenizer.next_token() {
162                // good
163            } else {
164                tokenizer.seek(saved2);
165            }
166
167            stream_obj
168        }
169        Some(Token::Keyword(Keyword::EndObj)) => {
170            return Ok((IndirectRef { obj_num, gen_num }, obj));
171        }
172        _ => {
173            tokenizer.seek(saved);
174            obj
175        }
176    };
177
178    // Consume endobj
179    let saved = tokenizer.pos();
180    if let Ok(Some(Token::Keyword(Keyword::EndObj))) = tokenizer.next_token() {
181        // good
182    } else {
183        tokenizer.seek(saved);
184    }
185
186    Ok((IndirectRef { obj_num, gen_num }, result))
187}
188
189/// Read raw stream data based on /Length in the dict.
190fn read_stream_data(
191    tokenizer: &mut Tokenizer<'_>,
192    dict: &PdfDict,
193    start_offset: usize,
194) -> Result<Vec<u8>> {
195    // Skip the newline after 'stream' keyword
196    let data = tokenizer.reader().data();
197    let mut pos = tokenizer.pos();
198
199    // PDF spec: stream keyword followed by \r\n or \n
200    if pos < data.len() && data[pos] == b'\r' {
201        pos += 1;
202    }
203    if pos < data.len() && data[pos] == b'\n' {
204        pos += 1;
205    }
206
207    // Get length from dict
208    let length = match dict.get(b"Length") {
209        Some(PdfObject::Integer(n)) => *n as usize,
210        _ => {
211            // Try to find endstream by scanning
212            return find_stream_data_by_endstream(data, pos, start_offset);
213        }
214    };
215
216    if pos + length > data.len() {
217        return Err(JustPdfError::UnexpectedEof { offset: pos });
218    }
219
220    let stream_data = data[pos..pos + length].to_vec();
221    tokenizer.seek(pos + length);
222
223    Ok(stream_data)
224}
225
226/// Fallback: scan for 'endstream' to determine stream length.
227fn find_stream_data_by_endstream(data: &[u8], start: usize, err_offset: usize) -> Result<Vec<u8>> {
228    let needle = b"endstream";
229    for i in start..data.len().saturating_sub(needle.len()) {
230        if &data[i..i + needle.len()] == needle {
231            // Remove trailing \r\n or \n before endstream
232            let mut end = i;
233            if end > start && data[end - 1] == b'\n' {
234                end -= 1;
235            }
236            if end > start && data[end - 1] == b'\r' {
237                end -= 1;
238            }
239            return Ok(data[start..end].to_vec());
240        }
241    }
242    Err(JustPdfError::InvalidObject {
243        offset: err_offset,
244        detail: "could not find endstream".into(),
245    })
246}
247
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `src` as a single object, panicking if the parser fails.
    fn parse(src: &[u8]) -> PdfObject {
        let mut tokenizer = Tokenizer::new(src);
        parse_object(&mut tokenizer).unwrap()
    }

    /// Shorthand for the `IndirectRef` a test expects.
    fn reference(obj_num: u32, gen_num: u16) -> IndirectRef {
        IndirectRef { obj_num, gen_num }
    }

    #[test]
    fn test_parse_null() {
        assert_eq!(parse(b"null"), PdfObject::Null);
    }

    #[test]
    fn test_parse_bool() {
        assert_eq!(parse(b"true"), PdfObject::Bool(true));
        assert_eq!(parse(b"false"), PdfObject::Bool(false));
    }

    #[test]
    fn test_parse_numbers() {
        assert_eq!(parse(b"42"), PdfObject::Integer(42));
        assert_eq!(parse(b"3.15"), PdfObject::Real(3.15));
    }

    #[test]
    fn test_parse_string() {
        let expected = PdfObject::String(b"Hello".to_vec());
        assert_eq!(parse(b"(Hello)"), expected);
    }

    #[test]
    fn test_parse_name() {
        let expected = PdfObject::Name(b"Type".to_vec());
        assert_eq!(parse(b"/Type"), expected);
    }

    #[test]
    fn test_parse_array() {
        let expected = PdfObject::Array(vec![
            PdfObject::Integer(1),
            PdfObject::Integer(2),
            PdfObject::Integer(3),
        ]);
        assert_eq!(parse(b"[1 2 3]"), expected);
    }

    #[test]
    fn test_parse_dict() {
        let obj = parse(b"<< /Type /Catalog /Pages 2 0 R >>");
        let PdfObject::Dict(d) = &obj else {
            panic!("expected dict, got {obj:?}");
        };

        assert_eq!(d.get(b"Type"), Some(&PdfObject::Name(b"Catalog".to_vec())));
        assert_eq!(
            d.get(b"Pages"),
            Some(&PdfObject::Reference(reference(2, 0)))
        );
    }

    #[test]
    fn test_parse_reference() {
        assert_eq!(parse(b"10 0 R"), PdfObject::Reference(reference(10, 0)));
    }

    #[test]
    fn test_parse_indirect_object() {
        let mut tokenizer = Tokenizer::new(b"1 0 obj\n<< /Type /Catalog >>\nendobj");
        let (iref, obj) = parse_indirect_object(&mut tokenizer).unwrap();

        assert_eq!(iref, reference(1, 0));
        assert!(matches!(obj, PdfObject::Dict(_)));
    }

    #[test]
    fn test_parse_nested() {
        let obj = parse(b"<< /Kids [ 1 0 R 2 0 R ] /Count 2 >>");
        let PdfObject::Dict(d) = &obj else {
            panic!("expected dict");
        };

        assert_eq!(d.get(b"Count"), Some(&PdfObject::Integer(2)));
        let Some(PdfObject::Array(kids)) = d.get(b"Kids") else {
            panic!("expected array");
        };
        assert_eq!(kids.len(), 2);
    }
}