// justpdf_core/object/mod.rs
mod types;
2
3pub use types::{IndirectRef, PdfDict, PdfObject};
4
5use crate::error::{JustPdfError, Result};
6use crate::tokenizer::Tokenizer;
7use crate::tokenizer::token::{Keyword, Token};
8
9pub fn parse_object(tokenizer: &mut Tokenizer<'_>) -> Result<PdfObject> {
12 let offset = tokenizer.pos();
13 let Some(token) = tokenizer.next_token()? else {
14 return Err(JustPdfError::UnexpectedEof { offset });
15 };
16
17 match token {
18 Token::Keyword(Keyword::Null) => Ok(PdfObject::Null),
19 Token::Keyword(Keyword::True) => Ok(PdfObject::Bool(true)),
20 Token::Keyword(Keyword::False) => Ok(PdfObject::Bool(false)),
21 Token::Integer(v) => {
22 let saved = tokenizer.pos();
24 match tokenizer.next_token() {
25 Ok(Some(Token::Integer(gen_val))) => match tokenizer.next_token() {
26 Ok(Some(Token::Keyword(Keyword::R))) => Ok(PdfObject::Reference(IndirectRef {
27 obj_num: v as u32,
28 gen_num: gen_val as u16,
29 })),
30 _ => {
31 tokenizer.seek(saved);
32 Ok(PdfObject::Integer(v))
33 }
34 },
35 _ => {
36 tokenizer.seek(saved);
37 Ok(PdfObject::Integer(v))
38 }
39 }
40 }
41 Token::Real(v) => Ok(PdfObject::Real(v)),
42 Token::LiteralString(v) => Ok(PdfObject::String(v)),
43 Token::HexString(v) => Ok(PdfObject::String(v)),
44 Token::Name(v) => Ok(PdfObject::Name(v)),
45 Token::ArrayBegin => {
46 let mut arr = Vec::new();
47 loop {
48 let peek_pos = tokenizer.pos();
49 match tokenizer.next_token()? {
50 Some(Token::ArrayEnd) => break,
51 Some(_tok) => {
52 tokenizer.seek(peek_pos);
53 arr.push(parse_object(tokenizer)?);
54 }
55 None => {
56 return Err(JustPdfError::UnexpectedEof { offset });
57 }
58 }
59 }
60 Ok(PdfObject::Array(arr))
61 }
62 Token::DictBegin => {
63 let dict = parse_dict_body(tokenizer, offset)?;
64 Ok(PdfObject::Dict(dict))
65 }
66 _ => Err(JustPdfError::InvalidObject {
67 offset,
68 detail: format!("unexpected token: {token:?}"),
69 }),
70 }
71}
72
73fn parse_dict_body(tokenizer: &mut Tokenizer<'_>, start: usize) -> Result<PdfDict> {
75 let mut dict = PdfDict::new();
76 loop {
77 let peek_pos = tokenizer.pos();
78 match tokenizer.next_token()? {
79 Some(Token::DictEnd) => break,
80 Some(Token::Name(key)) => {
81 let value = parse_object(tokenizer)?;
82 dict.insert(key, value);
83 }
84 Some(tok) => {
85 return Err(JustPdfError::InvalidObject {
86 offset: peek_pos,
87 detail: format!("expected name or >> in dict, got: {tok:?}"),
88 });
89 }
90 None => {
91 return Err(JustPdfError::UnexpectedEof { offset: start });
92 }
93 }
94 }
95 Ok(dict)
96}
97
/// Parses one complete indirect object:
/// `<num> <gen> obj <object> endobj`, including stream objects whose
/// body is `<< dict >> stream ... endstream`.
///
/// Parsing is lenient: a missing `endstream` or `endobj` keyword is
/// tolerated (the position is rewound instead of erroring), which helps
/// with mildly damaged files.
///
/// # Errors
///
/// Fails if the `<num> <gen> obj` header is malformed, if `stream`
/// follows a non-dictionary object, or if the body object is invalid.
pub fn parse_indirect_object(tokenizer: &mut Tokenizer<'_>) -> Result<(IndirectRef, PdfObject)> {
    let offset = tokenizer.pos();

    // Object number. NOTE(review): `as u32` truncates out-of-range
    // values silently — assumes the tokenizer yields sane integers here.
    let obj_num = match tokenizer.next_token()? {
        Some(Token::Integer(n)) => n as u32,
        _ => {
            return Err(JustPdfError::InvalidObject {
                offset,
                detail: "expected object number".into(),
            });
        }
    };

    // Generation number (same truncation caveat as above).
    let gen_num = match tokenizer.next_token()? {
        Some(Token::Integer(n)) => n as u16,
        _ => {
            return Err(JustPdfError::InvalidObject {
                offset,
                detail: "expected generation number".into(),
            });
        }
    };

    // The `obj` keyword separates the header from the body.
    match tokenizer.next_token()? {
        Some(Token::Keyword(Keyword::Obj)) => {}
        _ => {
            return Err(JustPdfError::InvalidObject {
                offset,
                detail: "expected 'obj' keyword".into(),
            });
        }
    }

    let obj = parse_object(tokenizer)?;

    // Peek at the token after the body to decide between three cases:
    // a `stream` body, an immediate `endobj`, or anything else (rewind
    // and leave the token for the caller).
    let saved = tokenizer.pos();
    let result = match tokenizer.next_token()? {
        Some(Token::Keyword(Keyword::Stream)) => {
            // A stream's preceding object must be its dictionary.
            let dict = match obj {
                PdfObject::Dict(d) => d,
                _ => {
                    return Err(JustPdfError::InvalidObject {
                        offset,
                        detail: "stream must be preceded by a dictionary".into(),
                    });
                }
            };

            let stream_data = read_stream_data(tokenizer, &dict, offset)?;
            let stream_obj = PdfObject::Stream {
                dict,
                data: stream_data,
            };

            // Consume `endstream` if present; otherwise rewind and
            // carry on (lenient handling of a missing keyword).
            let saved2 = tokenizer.pos();
            if let Ok(Some(Token::Keyword(Keyword::EndStream))) = tokenizer.next_token() {
            } else {
                tokenizer.seek(saved2);
            }

            stream_obj
        }
        Some(Token::Keyword(Keyword::EndObj)) => {
            // `endobj` right after the body: done, nothing to rewind.
            return Ok((IndirectRef { obj_num, gen_num }, obj));
        }
        _ => {
            // Unrelated token — put it back for the caller.
            tokenizer.seek(saved);
            obj
        }
    };

    // Consume a trailing `endobj` if present; tolerate its absence.
    let saved = tokenizer.pos();
    if let Ok(Some(Token::Keyword(Keyword::EndObj))) = tokenizer.next_token() {
    } else {
        tokenizer.seek(saved);
    }

    Ok((IndirectRef { obj_num, gen_num }, result))
}
188
189fn read_stream_data(
191 tokenizer: &mut Tokenizer<'_>,
192 dict: &PdfDict,
193 start_offset: usize,
194) -> Result<Vec<u8>> {
195 let data = tokenizer.reader().data();
197 let mut pos = tokenizer.pos();
198
199 if pos < data.len() && data[pos] == b'\r' {
201 pos += 1;
202 }
203 if pos < data.len() && data[pos] == b'\n' {
204 pos += 1;
205 }
206
207 let length = match dict.get(b"Length") {
209 Some(PdfObject::Integer(n)) => *n as usize,
210 _ => {
211 return find_stream_data_by_endstream(data, pos, start_offset);
213 }
214 };
215
216 if pos + length > data.len() {
217 return Err(JustPdfError::UnexpectedEof { offset: pos });
218 }
219
220 let stream_data = data[pos..pos + length].to_vec();
221 tokenizer.seek(pos + length);
222
223 Ok(stream_data)
224}
225
226fn find_stream_data_by_endstream(data: &[u8], start: usize, err_offset: usize) -> Result<Vec<u8>> {
228 let needle = b"endstream";
229 for i in start..data.len().saturating_sub(needle.len()) {
230 if &data[i..i + needle.len()] == needle {
231 let mut end = i;
233 if end > start && data[end - 1] == b'\n' {
234 end -= 1;
235 }
236 if end > start && data[end - 1] == b'\r' {
237 end -= 1;
238 }
239 return Ok(data[start..end].to_vec());
240 }
241 }
242 Err(JustPdfError::InvalidObject {
243 offset: err_offset,
244 detail: "could not find endstream".into(),
245 })
246}
247
#[cfg(test)]
mod tests {
    use super::*;

    // Each test feeds a small byte buffer through a fresh Tokenizer
    // and checks the parsed result against the expected PdfObject.

    #[test]
    fn test_parse_null() {
        let mut t = Tokenizer::new(b"null");
        assert_eq!(parse_object(&mut t).unwrap(), PdfObject::Null);
    }

    #[test]
    fn test_parse_bool() {
        let mut t = Tokenizer::new(b"true");
        assert_eq!(parse_object(&mut t).unwrap(), PdfObject::Bool(true));

        let mut t = Tokenizer::new(b"false");
        assert_eq!(parse_object(&mut t).unwrap(), PdfObject::Bool(false));
    }

    #[test]
    fn test_parse_numbers() {
        let mut t = Tokenizer::new(b"42");
        assert_eq!(parse_object(&mut t).unwrap(), PdfObject::Integer(42));

        let mut t = Tokenizer::new(b"3.15");
        assert_eq!(parse_object(&mut t).unwrap(), PdfObject::Real(3.15));
    }

    #[test]
    fn test_parse_string() {
        let mut t = Tokenizer::new(b"(Hello)");
        assert_eq!(
            parse_object(&mut t).unwrap(),
            PdfObject::String(b"Hello".to_vec())
        );
    }

    #[test]
    fn test_parse_name() {
        let mut t = Tokenizer::new(b"/Type");
        assert_eq!(
            parse_object(&mut t).unwrap(),
            PdfObject::Name(b"Type".to_vec())
        );
    }

    #[test]
    fn test_parse_array() {
        let mut t = Tokenizer::new(b"[1 2 3]");
        assert_eq!(
            parse_object(&mut t).unwrap(),
            PdfObject::Array(vec![
                PdfObject::Integer(1),
                PdfObject::Integer(2),
                PdfObject::Integer(3),
            ])
        );
    }

    #[test]
    fn test_parse_dict() {
        // `2 0 R` inside the dict exercises the integer/reference
        // lookahead in value position.
        let mut t = Tokenizer::new(b"<< /Type /Catalog /Pages 2 0 R >>");
        let obj = parse_object(&mut t).unwrap();
        match &obj {
            PdfObject::Dict(d) => {
                assert_eq!(d.get(b"Type"), Some(&PdfObject::Name(b"Catalog".to_vec())));
                assert_eq!(
                    d.get(b"Pages"),
                    Some(&PdfObject::Reference(IndirectRef {
                        obj_num: 2,
                        gen_num: 0
                    }))
                );
            }
            _ => panic!("expected dict, got {obj:?}"),
        }
    }

    #[test]
    fn test_parse_reference() {
        let mut t = Tokenizer::new(b"10 0 R");
        assert_eq!(
            parse_object(&mut t).unwrap(),
            PdfObject::Reference(IndirectRef {
                obj_num: 10,
                gen_num: 0
            })
        );
    }

    #[test]
    fn test_parse_indirect_object() {
        let input = b"1 0 obj\n<< /Type /Catalog >>\nendobj";
        let mut t = Tokenizer::new(input);
        let (iref, obj) = parse_indirect_object(&mut t).unwrap();
        assert_eq!(
            iref,
            IndirectRef {
                obj_num: 1,
                gen_num: 0
            }
        );
        assert!(matches!(obj, PdfObject::Dict(_)));
    }

    #[test]
    fn test_parse_nested() {
        // Array of references nested inside a dictionary.
        let input = b"<< /Kids [ 1 0 R 2 0 R ] /Count 2 >>";
        let mut t = Tokenizer::new(input);
        let obj = parse_object(&mut t).unwrap();
        match &obj {
            PdfObject::Dict(d) => {
                assert_eq!(d.get(b"Count"), Some(&PdfObject::Integer(2)));
                match d.get(b"Kids") {
                    Some(PdfObject::Array(arr)) => assert_eq!(arr.len(), 2),
                    _ => panic!("expected array"),
                }
            }
            _ => panic!("expected dict"),
        }
    }
}