acroform_pdf/parser/
mod.rs

1//! Basic functionality for parsing a PDF file.
2
3mod lexer;
4mod parse_object;
5mod parse_xref;
6
7pub use self::lexer::*;
8pub use self::parse_object::*;
9pub use self::parse_xref::*;
10
11use crate::error::*;
12use crate::primitive::StreamInner;
13use crate::primitive::{Primitive, Dictionary, PdfStream, PdfString};
14use crate::object::{ObjNr, GenNr, PlainRef, Resolve};
15// ACROFORM-RS: Removed crypt::Decoder import - only supporting decrypted PDFs
16use bitflags::bitflags;
17use istring::{SmallBytes, SmallString, IBytes};
18
19const MAX_DEPTH: usize = 20;
20
21
22bitflags! {
23    #[repr(transparent)]
24    #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
25    pub struct ParseFlags: u16 {
26        const INTEGER = 1 << 0;
27        const STREAM = 1 << 1;
28        const DICT = 1 << 2;
29        const NUMBER = 1 << 3;
30        const NAME = 1 << 4;
31        const ARRAY = 1 << 5;
32        const STRING = 1 << 6;
33        const BOOL = 1 << 7;
34        const NULL = 1 << 8;
35        const REF = 1 << 9;
36        const ANY = (1 << 10) - 1;
37    }
38}
39
40
41pub struct Context<'a> {
42    pub id: PlainRef,
43    _phantom: std::marker::PhantomData<&'a ()>,
44}
45impl<'a> Context<'a> {
46    pub fn decrypt<'buf>(&self, data: &'buf mut [u8]) -> Result<&'buf [u8]> {
47        // ACROFORM-RS: No decryption - only supporting decrypted PDFs
48        Ok(data)
49    }
50    #[cfg(test)]
51    fn fake() -> Self {
52        Context {
53            id: PlainRef { id: 0, gen: 0 },
54            _phantom: std::marker::PhantomData,
55        }
56    }
57}
58
59/// Can parse stream but only if its dictionary does not contain indirect references.
60/// Use `parse_stream` if this is insufficient.
61pub fn parse(data: &[u8], r: &impl Resolve, flags: ParseFlags) -> Result<Primitive> {
62    parse_with_lexer(&mut Lexer::new(data), r, flags)
63}
64
65/// Recursive. Can parse stream but only if its dictionary does not contain indirect references.
66/// Use `parse_stream` if this is not sufficient.
67pub fn parse_with_lexer(lexer: &mut Lexer, r: &impl Resolve, flags: ParseFlags) -> Result<Primitive> {
68    parse_with_lexer_ctx(lexer, r, None, flags, MAX_DEPTH)
69}
70
71fn parse_dictionary_object(lexer: &mut Lexer, r: &impl Resolve, ctx: Option<&Context>, max_depth: usize) -> Result<Dictionary> {
72    let mut dict = Dictionary::default();
73    loop {
74        // Expect a Name (and Object) or the '>>' delimiter
75        let token = t!(lexer.next());
76        if token.starts_with(b"/") {
77            let key = token.reslice(1..).to_name()?;
78            let obj = t!(parse_with_lexer_ctx(lexer, r, ctx, ParseFlags::ANY, max_depth));
79            dict.insert(key, obj);
80        } else if token.equals(b">>") {
81            break;
82        } else {
83            err!(PdfError::UnexpectedLexeme{ pos: lexer.get_pos(), lexeme: token.to_string(), expected: "/ or >>"});
84        }
85    }
86    Ok(dict)
87}
88
89fn parse_stream_object(dict: Dictionary, lexer: &mut Lexer, r: &impl Resolve, ctx: &Context) -> Result<PdfStream> {
90    t!(lexer.next_stream());
91
92    let length = match dict.get("Length") {
93        Some(&Primitive::Integer(n)) if n >= 0 => n as usize,
94        Some(&Primitive::Reference(reference)) => t!(t!(r.resolve_flags(reference, ParseFlags::INTEGER, 1)).as_usize()),
95        Some(other) => err!(PdfError::UnexpectedPrimitive { expected: "unsigned Integer or Reference", found: other.get_debug_name() }),
96        None => err!(PdfError::MissingEntry { typ: "<Stream>", field: "Length".into() }),
97    };
98
99    let stream_substr = lexer.read_n(length);
100
101    if stream_substr.len() != length {
102        err!(PdfError::EOF)
103    }
104
105    // Finish
106    t!(lexer.next_expect("endstream"));
107
108    Ok(PdfStream {
109        inner: StreamInner::InFile {
110            id: ctx.id,
111            file_range: stream_substr.file_range(),
112        },
113        info: dict,
114    })
115}
116
117#[inline]
118fn check(flags: ParseFlags, allowed: ParseFlags) -> Result<(), PdfError> {
119    if !flags.intersects(allowed) {
120        return Err(PdfError::PrimitiveNotAllowed { allowed, found: flags });
121    }
122    Ok(())
123}
124
125/// Recursive. Can parse stream but only if its dictionary does not contain indirect references.
126/// Use `parse_stream` if this is not sufficient.
127pub fn parse_with_lexer_ctx(lexer: &mut Lexer, r: &impl Resolve, ctx: Option<&Context>, flags: ParseFlags, max_depth: usize) -> Result<Primitive> {
128    let pos = lexer.get_pos();
129    match _parse_with_lexer_ctx(lexer, r, ctx, flags, max_depth) {
130        Ok(r) => Ok(r),
131        Err(e) => {
132            lexer.set_pos(pos);
133            Err(e)
134        }
135    }
136}
137fn _parse_with_lexer_ctx(lexer: &mut Lexer, r: &impl Resolve, ctx: Option<&Context>, flags: ParseFlags, max_depth: usize) -> Result<Primitive> {
138
139    let input = lexer.get_remaining_slice();
140    let first_lexeme = t!(lexer.next(), std::str::from_utf8(input));
141
142    let obj = if first_lexeme.equals(b"<<") {
143        check(flags, ParseFlags::DICT)?;
144
145        if max_depth == 0 {
146            return Err(PdfError::MaxDepth);
147        }
148        let dict = t!(parse_dictionary_object(lexer, r, ctx, max_depth-1));
149        // It might just be the dictionary in front of a stream.
150        if t!(lexer.peek()).equals(b"stream") {
151            let ctx = ctx.ok_or(PdfError::PrimitiveNotAllowed { allowed: ParseFlags::STREAM, found: flags })?;
152            Primitive::Stream(t!(parse_stream_object(dict, lexer, r, ctx)))
153        } else {
154            Primitive::Dictionary(dict)
155        }
156    } else if first_lexeme.is_integer() {
157        // May be Integer or Reference
158        check(flags, ParseFlags::INTEGER | ParseFlags::REF)?;
159
160        // First backup position
161        let pos_bk = lexer.get_pos();
162
163        let second_lexeme = t!(lexer.next());
164        if second_lexeme.is_integer() {
165            let third_lexeme = t!(lexer.next());
166            if third_lexeme.equals(b"R") {
167                // It is indeed a reference to an indirect object
168                check(flags, ParseFlags::REF)?;
169                Primitive::Reference (PlainRef {
170                    id: t!(first_lexeme.to::<ObjNr>()),
171                    gen: t!(second_lexeme.to::<GenNr>()),
172                })
173            } else {
174                check(flags, ParseFlags::INTEGER)?;
175                // We are probably in an array of numbers - it's not a reference anyway
176                lexer.set_pos(pos_bk); // (roll back the lexer first)
177                Primitive::Integer(t!(first_lexeme.to::<i32>()))
178            }
179        } else {
180            check(flags, ParseFlags::INTEGER)?;
181            // It is but a number
182            lexer.set_pos(pos_bk); // (roll back the lexer first)
183            Primitive::Integer(t!(first_lexeme.to::<i32>()))
184        }
185    } else if let Some(s) = first_lexeme.real_number() {
186        check(flags, ParseFlags::NUMBER)?;
187        // Real Number
188        Primitive::Number (t!(s.to::<f32>(), s.to_string()))
189    } else if first_lexeme.starts_with(b"/") {
190        check(flags, ParseFlags::NAME)?;
191        // Name
192
193        let mut rest: &[u8] = &first_lexeme.reslice(1..);
194        let s = if rest.contains(&b'#') {
195            let mut s = IBytes::new();
196            while let Some(idx) = rest.iter().position(|&b| b == b'#') {
197                use crate::enc::decode_nibble;
198                use std::convert::TryInto;
199                let [hi, lo]: [u8; 2] = rest.get(idx+1 .. idx+3).ok_or(PdfError::EOF)?.try_into().unwrap();
200                let byte = match (decode_nibble(lo), decode_nibble(hi)) {
201                    (Some(low), Some(high)) => low | high << 4,
202                    _ => return Err(PdfError::HexDecode { pos: idx, bytes: [hi, lo] }),
203                };
204                s.extend_from_slice(&rest[..idx]);
205                s.push(byte);
206                rest = &rest[idx+3..];
207            }
208            s.extend_from_slice(rest);
209            SmallBytes::from(s.as_slice())
210        } else {
211            SmallBytes::from(rest)
212        };
213        
214        Primitive::Name(SmallString::from_utf8(s)?)
215    } else if first_lexeme.equals(b"[") {
216        check(flags, ParseFlags::ARRAY)?;
217        if max_depth == 0 {
218            return Err(PdfError::MaxDepth);
219        }
220        let mut array = Vec::new();
221        // Array
222        loop {
223            // Exit if closing delimiter
224            if lexer.peek()?.equals(b"]") {
225                break;
226            }
227
228            let element = t!(parse_with_lexer_ctx(lexer, r, ctx, ParseFlags::ANY, max_depth-1));
229            array.push(element);
230        }
231        t!(lexer.next()); // Move beyond closing delimiter
232
233        Primitive::Array (array)
234    } else if first_lexeme.equals(b"(") {
235        check(flags, ParseFlags::STRING)?;
236        let mut string = IBytes::new();
237
238        let bytes_traversed = {
239            let mut string_lexer = StringLexer::new(lexer.get_remaining_slice());
240            for character in string_lexer.iter() {
241                string.push(t!(character));
242            }
243            string_lexer.get_offset()
244        };
245        // Advance to end of string
246        lexer.offset_pos(bytes_traversed);
247        // decrypt it
248        if let Some(ctx) = ctx {
249            string = t!(ctx.decrypt(&mut string)).into();
250        }
251        Primitive::String (PdfString::new(string))
252    } else if first_lexeme.equals(b"<") {
253        check(flags, ParseFlags::STRING)?;
254        let mut string = IBytes::new();
255
256        let bytes_traversed = {
257            let mut hex_string_lexer = HexStringLexer::new(lexer.get_remaining_slice());
258            for byte in hex_string_lexer.iter() {
259                string.push(t!(byte));
260            }
261            hex_string_lexer.get_offset()
262        };
263        // Advance to end of string
264        lexer.offset_pos(bytes_traversed);
265
266        // decrypt it
267        if let Some(ctx) = ctx {
268            string = t!(ctx.decrypt(&mut string)).into();
269        }
270        Primitive::String (PdfString::new(string))
271    } else if first_lexeme.equals(b"true") {
272        check(flags, ParseFlags::BOOL)?;
273        Primitive::Boolean (true)
274    } else if first_lexeme.equals(b"false") {
275        check(flags, ParseFlags::BOOL)?;
276        Primitive::Boolean (false)
277    } else if first_lexeme.equals(b"null") {
278        check(flags, ParseFlags::NULL)?;
279        Primitive::Null
280    } else {
281        err!(PdfError::UnknownType {pos: lexer.get_pos(), first_lexeme: first_lexeme.to_string(), rest: lexer.read_n(50).to_string()});
282    };
283
284    // trace!("Read object"; "Obj" => format!("{}", obj));
285
286    Ok(obj)
287}
288
289
290pub fn parse_stream(data: &[u8], resolve: &impl Resolve, ctx: &Context) -> Result<PdfStream> {
291    parse_stream_with_lexer(&mut Lexer::new(data), resolve, ctx)
292}
293
294
295fn parse_stream_with_lexer(lexer: &mut Lexer, r: &impl Resolve, ctx: &Context) -> Result<PdfStream> {
296    let first_lexeme = t!(lexer.next());
297
298    let obj = if first_lexeme.equals(b"<<") {
299        let dict = t!(parse_dictionary_object(lexer, r, None, MAX_DEPTH));
300        // It might just be the dictionary in front of a stream.
301        if t!(lexer.peek()).equals(b"stream") {
302            let ctx = Context {
303                // ACROFORM-RS: Removed decoder field
304                id: ctx.id,
305                _phantom: std::marker::PhantomData,
306            };
307            t!(parse_stream_object(dict, lexer, r, &ctx))
308        } else {
309            err!(PdfError::UnexpectedPrimitive { expected: "Stream", found: "Dictionary" });
310        }
311    } else {
312        err!(PdfError::UnexpectedPrimitive { expected: "Stream", found: "something else" });
313    };
314
315    Ok(obj)
316}
317
#[cfg(test)]
mod tests {
    use super::lexer::Lexer;
    use super::{parse, parse_stream, parse_with_lexer, Context, ParseFlags};
    use crate::object::NoResolve;
    use crate::primitive::{PdfString, Primitive};

    /// A bare `/` used as a dictionary value parses as the empty name.
    #[test]
    fn dict_with_empty_name_as_value() {
        // Plain dictionary.
        let plain = parse(b"<</App<</Name/>>>>", &NoResolve, ParseFlags::DICT)
            .unwrap()
            .into_dictionary()
            .unwrap();
        assert_eq!(plain.len(), 1);
        let app = plain.get("App").unwrap().clone().into_dictionary().unwrap();
        assert_eq!(app.len(), 1);
        assert_eq!(app.get("Name").unwrap().as_name().unwrap(), "");

        // Same entry inside a stream dictionary.
        let source = b"<</Length 0/App<</Name/>>>>stream\nendstream\n";
        let info = parse_stream(source, &NoResolve, &Context::fake()).unwrap().info;
        assert_eq!(info.len(), 2);
        let app = info.get("App").unwrap().clone().into_dictionary().unwrap();
        assert_eq!(app.len(), 1);
        assert_eq!(app.get("Name").unwrap().as_name().unwrap(), "");
    }

    /// A bare `/` used as a dictionary key parses as the empty name.
    #[test]
    fn dict_with_empty_name_as_key() {
        // Plain dictionary.
        let plain = parse(b"<</ true>>", &NoResolve, ParseFlags::DICT)
            .unwrap()
            .into_dictionary()
            .unwrap();
        assert_eq!(plain.len(), 1);
        assert!(plain.get("").unwrap().as_bool().unwrap());

        // Same entry inside a stream dictionary.
        let source = b"<</Length 0/ true>>stream\nendstream\n";
        let info = parse_stream(source, &NoResolve, &Context::fake()).unwrap().info;
        assert_eq!(info.len(), 2);
        assert!(info.get("").unwrap().as_bool().unwrap());
    }

    /// `[]` parses as an array without elements.
    #[test]
    fn empty_array() {
        let parsed = parse(b"[]", &NoResolve, ParseFlags::ARRAY).unwrap();
        assert!(parsed.into_array().unwrap().is_empty());
    }

    /// Strings and integers packed without separating whitespace are split
    /// correctly, and the lexer is left right after the closing `]`.
    #[test]
    fn compact_array() {
        let mut lx = Lexer::new(b"[(Complete L)20(egend for Physical and P)20(olitical Maps)]TJ");

        let expected = Primitive::Array(vec![
            Primitive::String(PdfString::new("Complete L".into())),
            Primitive::Integer(20),
            Primitive::String(PdfString::new("egend for Physical and P".into())),
            Primitive::Integer(20),
            Primitive::String(PdfString::new("olitical Maps".into())),
        ]);
        assert_eq!(parse_with_lexer(&mut lx, &NoResolve, ParseFlags::ANY).unwrap(), expected);

        // The operator after the array is still available, then the input ends.
        assert_eq!(lx.next().unwrap().as_str().unwrap(), "TJ");
        assert!(lx.next().unwrap_err().is_eof());
    }
}
403}