// folio_cos/parser.rs

//! PDF object parser.
//!
//! Parses PDF objects from a token stream produced by the tokenizer.
//! Handles direct objects, indirect object definitions, and object references.

use crate::object::{ObjectId, PdfObject, PdfStream};
use crate::tokenizer::{Token, Tokenizer};
use folio_core::{FolioError, Result};
use indexmap::IndexMap;
11/// Parse a single PDF object from the tokenizer.
12///
13/// This may consume multiple tokens (e.g., for arrays, dicts, or references).
14/// Returns None if there are no more tokens.
15pub fn parse_object(tokenizer: &mut Tokenizer) -> Result<Option<PdfObject>> {
16    let token = match tokenizer.next_token()? {
17        Some(t) => t,
18        None => return Ok(None),
19    };
20
21    match token {
22        Token::Integer(n) => {
23            // Could be: integer, or start of "N G R" reference, or "N G obj" definition
24            let saved_pos = tokenizer.pos();
25            match tokenizer.next_token()? {
26                Some(Token::Integer(g)) => {
27                    let _saved_pos2 = tokenizer.pos();
28                    match tokenizer.next_token()? {
29                        Some(Token::Keyword(ref kw)) if kw == b"R" => Ok(Some(
30                            PdfObject::Reference(ObjectId::new(n as u32, g as u16)),
31                        )),
32                        Some(Token::Keyword(ref kw)) if kw == b"obj" => {
33                            // Indirect object definition — parse the contained object
34                            let obj = parse_object(tokenizer)?.unwrap_or(PdfObject::Null);
35                            // Skip 'endobj'
36                            skip_keyword(tokenizer, b"endobj");
37                            Ok(Some(obj))
38                        }
39                        _ => {
40                            // Not a reference or obj — put back both tokens
41                            tokenizer.set_pos(saved_pos);
42                            Ok(Some(PdfObject::Integer(n)))
43                        }
44                    }
45                }
46                _ => {
47                    tokenizer.set_pos(saved_pos);
48                    Ok(Some(PdfObject::Integer(n)))
49                }
50            }
51        }
52        Token::Real(n) => Ok(Some(PdfObject::Real(n))),
53        Token::LiteralString(s) => Ok(Some(PdfObject::Str(s))),
54        Token::HexString(s) => Ok(Some(PdfObject::Str(s))),
55        Token::Name(n) => Ok(Some(PdfObject::Name(n))),
56        Token::Keyword(ref kw) => match kw.as_slice() {
57            b"true" => Ok(Some(PdfObject::Bool(true))),
58            b"false" => Ok(Some(PdfObject::Bool(false))),
59            b"null" => Ok(Some(PdfObject::Null)),
60            _ => {
61                // Unknown keyword — return as-is for caller to handle
62                // (e.g., endobj, endstream, etc.)
63                Ok(None)
64            }
65        },
66        Token::ArrayBegin => parse_array(tokenizer).map(Some),
67        Token::DictBegin => parse_dict_or_stream(tokenizer).map(Some),
68        Token::ArrayEnd | Token::DictEnd => {
69            // These are handled by the array/dict parsers
70            Ok(None)
71        }
72    }
73}
74
75/// Parse an array (tokens between [ and ]).
76fn parse_array(tokenizer: &mut Tokenizer) -> Result<PdfObject> {
77    let mut items = Vec::new();
78
79    loop {
80        tokenizer.skip_whitespace_and_comments();
81
82        if tokenizer.is_eof() {
83            return Err(FolioError::Parse {
84                offset: tokenizer.pos() as u64,
85                message: "Unterminated array".into(),
86            });
87        }
88
89        // Check for ] without consuming it via next_token
90        if tokenizer.peek_byte() == Some(b']') {
91            tokenizer.set_pos(tokenizer.pos() + 1);
92            return Ok(PdfObject::Array(items));
93        }
94
95        match parse_object(tokenizer)? {
96            Some(obj) => items.push(obj),
97            None => {
98                // Could be ] consumed as keyword, or end of input
99                return Ok(PdfObject::Array(items));
100            }
101        }
102    }
103}

/// Parse a dictionary or stream (tokens after <<).
///
/// Reads /Key value pairs until the closing `>>`. If the `>>` is immediately
/// followed (after whitespace/comments) by the `stream` keyword, the result
/// is a stream object; otherwise a plain dictionary. Duplicate keys keep the
/// last value (IndexMap insert semantics), and insertion order is preserved.
fn parse_dict_or_stream(tokenizer: &mut Tokenizer) -> Result<PdfObject> {
    let mut dict = IndexMap::new();

    loop {
        tokenizer.skip_whitespace_and_comments();

        if tokenizer.is_eof() {
            return Err(FolioError::Parse {
                offset: tokenizer.pos() as u64,
                message: "Unterminated dictionary".into(),
            });
        }

        // Check for >> by raw byte peeking, so we control the exact position
        // for the subsequent 'stream' lookahead.
        if tokenizer.peek_byte() == Some(b'>') {
            let pos = tokenizer.pos();
            if pos + 1 < tokenizer.data().len() && tokenizer.data()[pos + 1] == b'>' {
                tokenizer.set_pos(pos + 2);

                // Check if followed by 'stream' keyword. Save the position
                // right after '>>' so it can be restored if this is a plain
                // dictionary — the whitespace skip below must not be visible
                // to the caller in that case.
                let saved_pos = tokenizer.pos();
                tokenizer.skip_whitespace_and_comments();
                let might_be_stream = tokenizer.pos();

                if might_be_stream + 6 <= tokenizer.data().len()
                    && &tokenizer.data()[might_be_stream..might_be_stream + 6] == b"stream"
                {
                    // Check the byte after "stream" to confirm it's the keyword
                    // and not a longer name (the spec requires an EOL here).
                    let after = tokenizer.data().get(might_be_stream + 6).copied();
                    if after == Some(b'\n') || after == Some(b'\r') {
                        return parse_stream(tokenizer, dict, might_be_stream);
                    }
                }

                // Not a stream — restore position to just after '>>'.
                tokenizer.set_pos(saved_pos);
                return Ok(PdfObject::Dict(dict));
            }
            // Lone '>' falls through to next_token, which will report it.
        }

        // Read key (must be a Name).
        let key = match tokenizer.next_token()? {
            Some(Token::Name(n)) => n,
            // DictEnd here covers a '>>' the byte-peek above didn't catch.
            Some(Token::DictEnd) => return Ok(PdfObject::Dict(dict)),
            Some(other) => {
                return Err(FolioError::Parse {
                    offset: tokenizer.pos() as u64,
                    message: format!("Expected name key in dict, got {:?}", other),
                });
            }
            None => return Ok(PdfObject::Dict(dict)),
        };

        // Read value; a missing value (e.g. stray keyword) degrades to Null.
        let value = parse_object(tokenizer)?.unwrap_or(PdfObject::Null);
        dict.insert(key, value);
    }
}
164
165/// Parse stream data after dict and 'stream' keyword.
166fn parse_stream(
167    tokenizer: &mut Tokenizer,
168    dict: IndexMap<Vec<u8>, PdfObject>,
169    stream_keyword_pos: usize,
170) -> Result<PdfObject> {
171    // Position past 'stream'
172    let mut pos = stream_keyword_pos + 6;
173
174    // Skip the EOL after 'stream' (required: either \r\n or \n)
175    if pos < tokenizer.data().len() && tokenizer.data()[pos] == b'\r' {
176        pos += 1;
177    }
178    if pos < tokenizer.data().len() && tokenizer.data()[pos] == b'\n' {
179        pos += 1;
180    }
181
182    // Get stream length from dictionary
183    let length = dict
184        .get(b"Length".as_slice())
185        .and_then(|obj| obj.as_i64())
186        .unwrap_or(0) as usize;
187
188    let end_pos = (pos + length).min(tokenizer.data().len());
189    let data = tokenizer.data()[pos..end_pos].to_vec();
190
191    // Skip past the data + 'endstream'
192    let mut search_pos = end_pos;
193    // Skip whitespace before endstream
194    while search_pos < tokenizer.data().len()
195        && (tokenizer.data()[search_pos] == b'\r'
196            || tokenizer.data()[search_pos] == b'\n'
197            || tokenizer.data()[search_pos] == b' ')
198    {
199        search_pos += 1;
200    }
201    // Skip 'endstream' keyword
202    if search_pos + 9 <= tokenizer.data().len()
203        && &tokenizer.data()[search_pos..search_pos + 9] == b"endstream"
204    {
205        search_pos += 9;
206    }
207
208    tokenizer.set_pos(search_pos);
209
210    Ok(PdfObject::Stream(PdfStream {
211        dict,
212        data,
213        decoded: false,
214    }))
215}
216
217/// Skip an expected keyword (non-fatal if not found).
218fn skip_keyword(tokenizer: &mut Tokenizer, expected: &[u8]) {
219    let saved = tokenizer.pos();
220    tokenizer.skip_whitespace_and_comments();
221    if let Ok(Some(Token::Keyword(kw))) = tokenizer.next_token() {
222        if kw == expected {
223            return;
224        }
225    }
226    tokenizer.set_pos(saved);
227}
228
229/// Parse an indirect object at a given byte offset.
230/// Returns (ObjectId, PdfObject).
231pub fn parse_indirect_object_at(data: &[u8], offset: usize) -> Result<(ObjectId, PdfObject)> {
232    let mut tokenizer = Tokenizer::new_at(data, offset);
233
234    let obj_num = match tokenizer.next_token()? {
235        Some(Token::Integer(n)) => n as u32,
236        other => {
237            return Err(FolioError::Parse {
238                offset: offset as u64,
239                message: format!("Expected object number, got {:?}", other),
240            });
241        }
242    };
243
244    let gen_num = match tokenizer.next_token()? {
245        Some(Token::Integer(n)) => n as u16,
246        other => {
247            return Err(FolioError::Parse {
248                offset: offset as u64,
249                message: format!("Expected generation number, got {:?}", other),
250            });
251        }
252    };
253
254    match tokenizer.next_token()? {
255        Some(Token::Keyword(ref kw)) if kw == b"obj" => {}
256        other => {
257            return Err(FolioError::Parse {
258                offset: offset as u64,
259                message: format!("Expected 'obj' keyword, got {:?}", other),
260            });
261        }
262    }
263
264    let obj = parse_object(&mut tokenizer)?.unwrap_or(PdfObject::Null);
265
266    // Skip 'endobj'
267    skip_keyword(&mut tokenizer, b"endobj");
268
269    Ok((ObjectId::new(obj_num, gen_num), obj))
270}

#[cfg(test)]
mod tests {
    use super::*;

    // Convenience wrapper: parse one object from raw bytes, panicking on
    // failure or empty input (fine in tests).
    fn parse(input: &[u8]) -> PdfObject {
        let mut t = Tokenizer::new(input);
        parse_object(&mut t).unwrap().unwrap()
    }

    #[test]
    fn test_primitives() {
        assert_eq!(parse(b"42"), PdfObject::Integer(42));
        assert_eq!(parse(b"3.14"), PdfObject::Real(3.14));
        assert_eq!(parse(b"true"), PdfObject::Bool(true));
        assert_eq!(parse(b"false"), PdfObject::Bool(false));
        assert_eq!(parse(b"null"), PdfObject::Null);
    }

    #[test]
    fn test_name() {
        // Names drop the leading slash.
        assert_eq!(parse(b"/Type"), PdfObject::Name(b"Type".to_vec()));
    }

    #[test]
    fn test_string() {
        // Literal and hex strings decode to the same Str variant.
        assert_eq!(parse(b"(Hello)"), PdfObject::Str(b"Hello".to_vec()));
        assert_eq!(parse(b"<48656C6C6F>"), PdfObject::Str(b"Hello".to_vec()));
    }

    #[test]
    fn test_array() {
        let obj = parse(b"[1 2 3]");
        let arr = obj.as_array().unwrap();
        assert_eq!(arr.len(), 3);
        assert_eq!(arr[0].as_i64(), Some(1));
        assert_eq!(arr[2].as_i64(), Some(3));
    }

    #[test]
    fn test_dict() {
        let obj = parse(b"<< /Type /Page /Count 5 >>");
        let dict = obj.as_dict().unwrap();
        assert_eq!(dict.len(), 2);
        assert_eq!(
            dict.get(b"Type".as_slice()).unwrap().as_name(),
            Some(b"Page".as_slice())
        );
        assert_eq!(dict.get(b"Count".as_slice()).unwrap().as_i64(), Some(5));
    }

    #[test]
    fn test_reference() {
        // "N G R" collapses into a single Reference object.
        let obj = parse(b"3 0 R");
        assert_eq!(obj.as_reference(), Some(ObjectId::new(3, 0)));
    }

    #[test]
    fn test_nested() {
        // References inside an array inside a dict.
        let obj = parse(b"<< /Kids [1 0 R 2 0 R] /Count 2 >>");
        let kids = obj.dict_get(b"Kids").unwrap().as_array().unwrap();
        assert_eq!(kids.len(), 2);
        assert_eq!(kids[0].as_reference(), Some(ObjectId::new(1, 0)));
    }

    #[test]
    fn test_indirect_object() {
        let input = b"1 0 obj\n<< /Type /Catalog >>\nendobj";
        let (id, obj) = parse_indirect_object_at(input, 0).unwrap();
        assert_eq!(id, ObjectId::new(1, 0));
        assert_eq!(obj.dict_get_name(b"Type"), Some(b"Catalog".as_slice()));
    }

    #[test]
    fn test_stream() {
        // /Length 5 selects exactly "Hello"; the EOLs around it are framing.
        let input = b"<< /Length 5 >>\nstream\nHello\nendstream";
        let obj = parse(input);
        let stream = obj.as_stream().unwrap();
        assert_eq!(&stream.data, b"Hello");
        assert_eq!(
            stream.dict.get(b"Length".as_slice()).unwrap().as_i64(),
            Some(5)
        );
    }
}