use crate::object::{ObjectId, PdfObject, PdfStream};
use crate::tokenizer::{Token, Tokenizer};
use folio_core::{FolioError, Result};
use indexmap::IndexMap;

11pub fn parse_object(tokenizer: &mut Tokenizer) -> Result<Option<PdfObject>> {
16 let token = match tokenizer.next_token()? {
17 Some(t) => t,
18 None => return Ok(None),
19 };
20
21 match token {
22 Token::Integer(n) => {
23 let saved_pos = tokenizer.pos();
25 match tokenizer.next_token()? {
26 Some(Token::Integer(g)) => {
27 let _saved_pos2 = tokenizer.pos();
28 match tokenizer.next_token()? {
29 Some(Token::Keyword(ref kw)) if kw == b"R" => Ok(Some(
30 PdfObject::Reference(ObjectId::new(n as u32, g as u16)),
31 )),
32 Some(Token::Keyword(ref kw)) if kw == b"obj" => {
33 let obj = parse_object(tokenizer)?.unwrap_or(PdfObject::Null);
35 skip_keyword(tokenizer, b"endobj");
37 Ok(Some(obj))
38 }
39 _ => {
40 tokenizer.set_pos(saved_pos);
42 Ok(Some(PdfObject::Integer(n)))
43 }
44 }
45 }
46 _ => {
47 tokenizer.set_pos(saved_pos);
48 Ok(Some(PdfObject::Integer(n)))
49 }
50 }
51 }
52 Token::Real(n) => Ok(Some(PdfObject::Real(n))),
53 Token::LiteralString(s) => Ok(Some(PdfObject::Str(s))),
54 Token::HexString(s) => Ok(Some(PdfObject::Str(s))),
55 Token::Name(n) => Ok(Some(PdfObject::Name(n))),
56 Token::Keyword(ref kw) => match kw.as_slice() {
57 b"true" => Ok(Some(PdfObject::Bool(true))),
58 b"false" => Ok(Some(PdfObject::Bool(false))),
59 b"null" => Ok(Some(PdfObject::Null)),
60 _ => {
61 Ok(None)
64 }
65 },
66 Token::ArrayBegin => parse_array(tokenizer).map(Some),
67 Token::DictBegin => parse_dict_or_stream(tokenizer).map(Some),
68 Token::ArrayEnd | Token::DictEnd => {
69 Ok(None)
71 }
72 }
73}
74
75fn parse_array(tokenizer: &mut Tokenizer) -> Result<PdfObject> {
77 let mut items = Vec::new();
78
79 loop {
80 tokenizer.skip_whitespace_and_comments();
81
82 if tokenizer.is_eof() {
83 return Err(FolioError::Parse {
84 offset: tokenizer.pos() as u64,
85 message: "Unterminated array".into(),
86 });
87 }
88
89 if tokenizer.peek_byte() == Some(b']') {
91 tokenizer.set_pos(tokenizer.pos() + 1);
92 return Ok(PdfObject::Array(items));
93 }
94
95 match parse_object(tokenizer)? {
96 Some(obj) => items.push(obj),
97 None => {
98 return Ok(PdfObject::Array(items));
100 }
101 }
102 }
103}
104
/// Parses dictionary entries after the opening `<<` has been consumed.
///
/// After the closing `>>`, looks ahead for a `stream` keyword; if one is
/// present and followed by an end-of-line byte, the dictionary becomes the
/// stream dictionary and parsing hands off to `parse_stream`. Otherwise the
/// lookahead is unwound and a plain dictionary is returned.
///
/// # Errors
/// Returns `FolioError::Parse` on unterminated input or a non-name key.
fn parse_dict_or_stream(tokenizer: &mut Tokenizer) -> Result<PdfObject> {
    let mut dict = IndexMap::new();

    loop {
        tokenizer.skip_whitespace_and_comments();

        if tokenizer.is_eof() {
            return Err(FolioError::Parse {
                offset: tokenizer.pos() as u64,
                message: "Unterminated dictionary".into(),
            });
        }

        // Byte-level check for the `>>` terminator; a lone `>` falls through
        // to next_token() below.
        if tokenizer.peek_byte() == Some(b'>') {
            let pos = tokenizer.pos();
            if pos + 1 < tokenizer.data().len() && tokenizer.data()[pos + 1] == b'>' {
                tokenizer.set_pos(pos + 2);

                // Checkpoint right after `>>` so we can undo the whitespace
                // skip if this turns out not to be a stream.
                let saved_pos = tokenizer.pos();
                tokenizer.skip_whitespace_and_comments();
                let might_be_stream = tokenizer.pos();

                if might_be_stream + 6 <= tokenizer.data().len()
                    && &tokenizer.data()[might_be_stream..might_be_stream + 6] == b"stream"
                {
                    // Only treat it as a stream when the keyword is followed
                    // by an EOL byte; this rejects identifiers like "streamX".
                    let after = tokenizer.data().get(might_be_stream + 6).copied();
                    if after == Some(b'\n') || after == Some(b'\r') {
                        return parse_stream(tokenizer, dict, might_be_stream);
                    }
                }

                // Not a stream: rewind the lookahead and return the dict.
                tokenizer.set_pos(saved_pos);
                return Ok(PdfObject::Dict(dict));
            }
        }

        let key = match tokenizer.next_token()? {
            Some(Token::Name(n)) => n,
            // Tokenizer-level dict end (defensive; usually caught above).
            Some(Token::DictEnd) => return Ok(PdfObject::Dict(dict)),
            Some(other) => {
                return Err(FolioError::Parse {
                    offset: tokenizer.pos() as u64,
                    message: format!("Expected name key in dict, got {:?}", other),
                });
            }
            // EOF mid-dict: return what we have (lenient).
            None => return Ok(PdfObject::Dict(dict)),
        };

        // Missing value degrades to Null rather than erroring.
        let value = parse_object(tokenizer)?.unwrap_or(PdfObject::Null);
        dict.insert(key, value);
    }
}
164
165fn parse_stream(
167 tokenizer: &mut Tokenizer,
168 dict: IndexMap<Vec<u8>, PdfObject>,
169 stream_keyword_pos: usize,
170) -> Result<PdfObject> {
171 let mut pos = stream_keyword_pos + 6;
173
174 if pos < tokenizer.data().len() && tokenizer.data()[pos] == b'\r' {
176 pos += 1;
177 }
178 if pos < tokenizer.data().len() && tokenizer.data()[pos] == b'\n' {
179 pos += 1;
180 }
181
182 let length = dict
184 .get(b"Length".as_slice())
185 .and_then(|obj| obj.as_i64())
186 .unwrap_or(0) as usize;
187
188 let end_pos = (pos + length).min(tokenizer.data().len());
189 let data = tokenizer.data()[pos..end_pos].to_vec();
190
191 let mut search_pos = end_pos;
193 while search_pos < tokenizer.data().len()
195 && (tokenizer.data()[search_pos] == b'\r'
196 || tokenizer.data()[search_pos] == b'\n'
197 || tokenizer.data()[search_pos] == b' ')
198 {
199 search_pos += 1;
200 }
201 if search_pos + 9 <= tokenizer.data().len()
203 && &tokenizer.data()[search_pos..search_pos + 9] == b"endstream"
204 {
205 search_pos += 9;
206 }
207
208 tokenizer.set_pos(search_pos);
209
210 Ok(PdfObject::Stream(PdfStream {
211 dict,
212 data,
213 decoded: false,
214 }))
215}
216
217fn skip_keyword(tokenizer: &mut Tokenizer, expected: &[u8]) {
219 let saved = tokenizer.pos();
220 tokenizer.skip_whitespace_and_comments();
221 if let Ok(Some(Token::Keyword(kw))) = tokenizer.next_token() {
222 if kw == expected {
223 return;
224 }
225 }
226 tokenizer.set_pos(saved);
227}
228
229pub fn parse_indirect_object_at(data: &[u8], offset: usize) -> Result<(ObjectId, PdfObject)> {
232 let mut tokenizer = Tokenizer::new_at(data, offset);
233
234 let obj_num = match tokenizer.next_token()? {
235 Some(Token::Integer(n)) => n as u32,
236 other => {
237 return Err(FolioError::Parse {
238 offset: offset as u64,
239 message: format!("Expected object number, got {:?}", other),
240 });
241 }
242 };
243
244 let gen_num = match tokenizer.next_token()? {
245 Some(Token::Integer(n)) => n as u16,
246 other => {
247 return Err(FolioError::Parse {
248 offset: offset as u64,
249 message: format!("Expected generation number, got {:?}", other),
250 });
251 }
252 };
253
254 match tokenizer.next_token()? {
255 Some(Token::Keyword(ref kw)) if kw == b"obj" => {}
256 other => {
257 return Err(FolioError::Parse {
258 offset: offset as u64,
259 message: format!("Expected 'obj' keyword, got {:?}", other),
260 });
261 }
262 }
263
264 let obj = parse_object(&mut tokenizer)?.unwrap_or(PdfObject::Null);
265
266 skip_keyword(&mut tokenizer, b"endobj");
268
269 Ok((ObjectId::new(obj_num, gen_num), obj))
270}
271
#[cfg(test)]
mod tests {
    use super::*;

    // Parse exactly one object from `input`; panics if parsing errors or
    // yields nothing.
    fn parse(input: &[u8]) -> PdfObject {
        let mut tok = Tokenizer::new(input);
        parse_object(&mut tok)
            .expect("parse error")
            .expect("expected an object")
    }

    #[test]
    fn test_primitives() {
        // Table-driven: each scalar literal maps to its PdfObject variant.
        for (input, expected) in [
            (&b"42"[..], PdfObject::Integer(42)),
            (&b"3.14"[..], PdfObject::Real(3.14)),
            (&b"true"[..], PdfObject::Bool(true)),
            (&b"false"[..], PdfObject::Bool(false)),
            (&b"null"[..], PdfObject::Null),
        ] {
            assert_eq!(parse(input), expected);
        }
    }

    #[test]
    fn test_name() {
        let obj = parse(b"/Type");
        assert_eq!(obj, PdfObject::Name(b"Type".to_vec()));
    }

    #[test]
    fn test_string() {
        // Literal and hex notations decode to the same bytes.
        let expected = PdfObject::Str(b"Hello".to_vec());
        assert_eq!(parse(b"(Hello)"), expected);
        assert_eq!(parse(b"<48656C6C6F>"), expected);
    }

    #[test]
    fn test_array() {
        let parsed = parse(b"[1 2 3]");
        let items = parsed.as_array().unwrap();
        assert_eq!(items.len(), 3);
        assert_eq!(items[0].as_i64(), Some(1));
        assert_eq!(items[2].as_i64(), Some(3));
    }

    #[test]
    fn test_dict() {
        let parsed = parse(b"<< /Type /Page /Count 5 >>");
        let dict = parsed.as_dict().unwrap();
        assert_eq!(dict.len(), 2);
        let ty = dict.get(b"Type".as_slice()).unwrap();
        assert_eq!(ty.as_name(), Some(b"Page".as_slice()));
        let count = dict.get(b"Count".as_slice()).unwrap();
        assert_eq!(count.as_i64(), Some(5));
    }

    #[test]
    fn test_reference() {
        let parsed = parse(b"3 0 R");
        assert_eq!(parsed.as_reference(), Some(ObjectId::new(3, 0)));
    }

    #[test]
    fn test_nested() {
        // References nested inside an array inside a dict.
        let parsed = parse(b"<< /Kids [1 0 R 2 0 R] /Count 2 >>");
        let kids = parsed.dict_get(b"Kids").unwrap().as_array().unwrap();
        assert_eq!(kids.len(), 2);
        assert_eq!(kids[0].as_reference(), Some(ObjectId::new(1, 0)));
    }

    #[test]
    fn test_indirect_object() {
        let bytes = b"1 0 obj\n<< /Type /Catalog >>\nendobj";
        let (id, obj) = parse_indirect_object_at(bytes, 0).unwrap();
        assert_eq!(id, ObjectId::new(1, 0));
        assert_eq!(obj.dict_get_name(b"Type"), Some(b"Catalog".as_slice()));
    }

    #[test]
    fn test_stream() {
        let parsed = parse(b"<< /Length 5 >>\nstream\nHello\nendstream");
        let stream = parsed.as_stream().unwrap();
        assert_eq!(&stream.data, b"Hello");
        let len = stream.dict.get(b"Length".as_slice()).unwrap();
        assert_eq!(len.as_i64(), Some(5));
    }
}