pdf/parser/
mod.rs

1//! Basic functionality for parsing a PDF file.
2
3mod lexer;
4mod parse_object;
5mod parse_xref;
6
7pub use self::lexer::*;
8pub use self::parse_object::*;
9pub use self::parse_xref::*;
10
11use crate::error::*;
12use crate::primitive::StreamInner;
13use crate::primitive::{Primitive, Dictionary, PdfStream, PdfString};
14use crate::object::{ObjNr, GenNr, PlainRef, Resolve};
15use crate::crypt::Decoder;
16use bitflags::bitflags;
17use istring::{SmallBytes, SmallString, IBytes};
18
19const MAX_DEPTH: usize = 20;
20
21
bitflags! {
    /// Bit set naming the kinds of primitive a caller is willing to accept.
    ///
    /// The parser compares the kind it encounters against this set and
    /// returns `PdfError::PrimitiveNotAllowed` when the two do not intersect.
    pub struct ParseFlags: u16 {
        /// Integer objects.
        const INTEGER = 1 << 0;
        /// Stream objects.
        const STREAM = 1 << 1;
        /// Dictionary objects (`<< ... >>`).
        const DICT = 1 << 2;
        /// Real numbers.
        const NUMBER = 1 << 3;
        /// Name objects (`/Name`).
        const NAME = 1 << 4;
        /// Array objects (`[ ... ]`).
        const ARRAY = 1 << 5;
        /// String objects, both literal `( ... )` and hexadecimal `< ... >`.
        const STRING = 1 << 6;
        /// The keywords `true` and `false`.
        const BOOL = 1 << 7;
        /// The keyword `null`.
        const NULL = 1 << 8;
        /// Indirect references (`<id> <gen> R`).
        const REF = 1 << 9;
        /// Union of all ten flags above (the low ten bits).
        const ANY = (1 << 10) - 1;
    }
}
37
38
39pub struct Context<'a> {
40    pub decoder: Option<&'a Decoder>,
41    pub id: PlainRef,
42}
43impl<'a> Context<'a> {
44    pub fn decrypt<'buf>(&self, data: &'buf mut [u8]) -> Result<&'buf [u8]> {
45        if let Some(decoder) = self.decoder {
46            decoder.decrypt(self.id, data)
47        } else {
48            Ok(data)
49        }
50    }
51    #[cfg(test)]
52    fn fake() -> Self {
53        Context {
54            decoder: None,
55            id: PlainRef { id: 0, gen: 0 }
56        }
57    }
58}
59
60/// Can parse stream but only if its dictionary does not contain indirect references.
61/// Use `parse_stream` if this is insufficient.
62pub fn parse(data: &[u8], r: &impl Resolve, flags: ParseFlags) -> Result<Primitive> {
63    parse_with_lexer(&mut Lexer::new(data), r, flags)
64}
65
66/// Recursive. Can parse stream but only if its dictionary does not contain indirect references.
67/// Use `parse_stream` if this is not sufficient.
68pub fn parse_with_lexer(lexer: &mut Lexer, r: &impl Resolve, flags: ParseFlags) -> Result<Primitive> {
69    parse_with_lexer_ctx(lexer, r, None, flags, MAX_DEPTH)
70}
71
72fn parse_dictionary_object(lexer: &mut Lexer, r: &impl Resolve, ctx: Option<&Context>, max_depth: usize) -> Result<Dictionary> {
73    let mut dict = Dictionary::default();
74    loop {
75        // Expect a Name (and Object) or the '>>' delimiter
76        let token = t!(lexer.next());
77        if token.starts_with(b"/") {
78            let key = token.reslice(1..).to_name()?;
79            let obj = t!(parse_with_lexer_ctx(lexer, r, ctx, ParseFlags::ANY, max_depth));
80            dict.insert(key, obj);
81        } else if token.equals(b">>") {
82            break;
83        } else {
84            err!(PdfError::UnexpectedLexeme{ pos: lexer.get_pos(), lexeme: token.to_string(), expected: "/ or >>"});
85        }
86    }
87    Ok(dict)
88}
89
90fn parse_stream_object(dict: Dictionary, lexer: &mut Lexer, r: &impl Resolve, ctx: &Context) -> Result<PdfStream> {
91    t!(lexer.next_stream());
92
93    let length = match dict.get("Length") {
94        Some(&Primitive::Integer(n)) if n >= 0 => n as usize,
95        Some(&Primitive::Reference(reference)) => t!(t!(r.resolve_flags(reference, ParseFlags::INTEGER, 1)).as_usize()),
96        Some(other) => err!(PdfError::UnexpectedPrimitive { expected: "unsigned Integer or Reference", found: other.get_debug_name() }),
97        None => err!(PdfError::MissingEntry { typ: "<Stream>", field: "Length".into() }),
98    };
99
100    let stream_substr = lexer.read_n(length);
101
102    if stream_substr.len() != length {
103        err!(PdfError::EOF)
104    }
105
106    // Finish
107    t!(lexer.next_expect("endstream"));
108
109    Ok(PdfStream {
110        inner: StreamInner::InFile {
111            id: ctx.id,
112            file_range: stream_substr.file_range(),
113        },
114        info: dict,
115    })
116}
117
118#[inline]
119fn check(flags: ParseFlags, allowed: ParseFlags) -> Result<(), PdfError> {
120    if !flags.intersects(allowed) {
121        return Err(PdfError::PrimitiveNotAllowed { allowed, found: flags });
122    }
123    Ok(())
124}
125
126/// Recursive. Can parse stream but only if its dictionary does not contain indirect references.
127/// Use `parse_stream` if this is not sufficient.
128pub fn parse_with_lexer_ctx(lexer: &mut Lexer, r: &impl Resolve, ctx: Option<&Context>, flags: ParseFlags, max_depth: usize) -> Result<Primitive> {
129    let pos = lexer.get_pos();
130    match _parse_with_lexer_ctx(lexer, r, ctx, flags, max_depth) {
131        Ok(r) => Ok(r),
132        Err(e) => {
133            lexer.set_pos(pos);
134            Err(e)
135        }
136    }
137}
138fn _parse_with_lexer_ctx(lexer: &mut Lexer, r: &impl Resolve, ctx: Option<&Context>, flags: ParseFlags, max_depth: usize) -> Result<Primitive> {
139
140    let input = lexer.get_remaining_slice();
141    let first_lexeme = t!(lexer.next(), std::str::from_utf8(input));
142
143    let obj = if first_lexeme.equals(b"<<") {
144        check(flags, ParseFlags::DICT)?;
145
146        if max_depth == 0 {
147            return Err(PdfError::MaxDepth);
148        }
149        let dict = t!(parse_dictionary_object(lexer, r, ctx, max_depth-1));
150        // It might just be the dictionary in front of a stream.
151        if t!(lexer.peek()).equals(b"stream") {
152            let ctx = ctx.ok_or(PdfError::PrimitiveNotAllowed { allowed: ParseFlags::STREAM, found: flags })?;
153            Primitive::Stream(t!(parse_stream_object(dict, lexer, r, ctx)))
154        } else {
155            Primitive::Dictionary(dict)
156        }
157    } else if first_lexeme.is_integer() {
158        // May be Integer or Reference
159        check(flags, ParseFlags::INTEGER | ParseFlags::REF)?;
160
161        // First backup position
162        let pos_bk = lexer.get_pos();
163
164        let second_lexeme = t!(lexer.next());
165        if second_lexeme.is_integer() {
166            let third_lexeme = t!(lexer.next());
167            if third_lexeme.equals(b"R") {
168                // It is indeed a reference to an indirect object
169                check(flags, ParseFlags::REF)?;
170                Primitive::Reference (PlainRef {
171                    id: t!(first_lexeme.to::<ObjNr>()),
172                    gen: t!(second_lexeme.to::<GenNr>()),
173                })
174            } else {
175                check(flags, ParseFlags::INTEGER)?;
176                // We are probably in an array of numbers - it's not a reference anyway
177                lexer.set_pos(pos_bk); // (roll back the lexer first)
178                Primitive::Integer(t!(first_lexeme.to::<i32>()))
179            }
180        } else {
181            check(flags, ParseFlags::INTEGER)?;
182            // It is but a number
183            lexer.set_pos(pos_bk); // (roll back the lexer first)
184            Primitive::Integer(t!(first_lexeme.to::<i32>()))
185        }
186    } else if let Some(s) = first_lexeme.real_number() {
187        check(flags, ParseFlags::NUMBER)?;
188        // Real Number
189        Primitive::Number (t!(s.to::<f32>(), s.to_string()))
190    } else if first_lexeme.starts_with(b"/") {
191        check(flags, ParseFlags::NAME)?;
192        // Name
193
194        let mut rest: &[u8] = &first_lexeme.reslice(1..);
195        let s = if rest.contains(&b'#') {
196            let mut s = IBytes::new();
197            while let Some(idx) = rest.iter().position(|&b| b == b'#') {
198                use crate::enc::decode_nibble;
199                use std::convert::TryInto;
200                let [hi, lo]: [u8; 2] = rest.get(idx+1 .. idx+3).ok_or(PdfError::EOF)?.try_into().unwrap();
201                let byte = match (decode_nibble(lo), decode_nibble(hi)) {
202                    (Some(low), Some(high)) => low | high << 4,
203                    _ => return Err(PdfError::HexDecode { pos: idx, bytes: [hi, lo] }),
204                };
205                s.extend_from_slice(&rest[..idx]);
206                s.push(byte);
207                rest = &rest[idx+3..];
208            }
209            s.extend_from_slice(rest);
210            SmallBytes::from(s.as_slice())
211        } else {
212            SmallBytes::from(rest)
213        };
214        
215        Primitive::Name(SmallString::from_utf8(s)?)
216    } else if first_lexeme.equals(b"[") {
217        check(flags, ParseFlags::ARRAY)?;
218        if max_depth == 0 {
219            return Err(PdfError::MaxDepth);
220        }
221        let mut array = Vec::new();
222        // Array
223        loop {
224            // Exit if closing delimiter
225            if lexer.peek()?.equals(b"]") {
226                break;
227            }
228
229            let element = t!(parse_with_lexer_ctx(lexer, r, ctx, ParseFlags::ANY, max_depth-1));
230            array.push(element);
231        }
232        t!(lexer.next()); // Move beyond closing delimiter
233
234        Primitive::Array (array)
235    } else if first_lexeme.equals(b"(") {
236        check(flags, ParseFlags::STRING)?;
237        let mut string = IBytes::new();
238
239        let bytes_traversed = {
240            let mut string_lexer = StringLexer::new(lexer.get_remaining_slice());
241            for character in string_lexer.iter() {
242                string.push(t!(character));
243            }
244            string_lexer.get_offset()
245        };
246        // Advance to end of string
247        lexer.offset_pos(bytes_traversed);
248        // decrypt it
249        if let Some(ctx) = ctx {
250            string = t!(ctx.decrypt(&mut string)).into();
251        }
252        Primitive::String (PdfString::new(string))
253    } else if first_lexeme.equals(b"<") {
254        check(flags, ParseFlags::STRING)?;
255        let mut string = IBytes::new();
256
257        let bytes_traversed = {
258            let mut hex_string_lexer = HexStringLexer::new(lexer.get_remaining_slice());
259            for byte in hex_string_lexer.iter() {
260                string.push(t!(byte));
261            }
262            hex_string_lexer.get_offset()
263        };
264        // Advance to end of string
265        lexer.offset_pos(bytes_traversed);
266
267        // decrypt it
268        if let Some(ctx) = ctx {
269            string = t!(ctx.decrypt(&mut string)).into();
270        }
271        Primitive::String (PdfString::new(string))
272    } else if first_lexeme.equals(b"true") {
273        check(flags, ParseFlags::BOOL)?;
274        Primitive::Boolean (true)
275    } else if first_lexeme.equals(b"false") {
276        check(flags, ParseFlags::BOOL)?;
277        Primitive::Boolean (false)
278    } else if first_lexeme.equals(b"null") {
279        check(flags, ParseFlags::NULL)?;
280        Primitive::Null
281    } else {
282        err!(PdfError::UnknownType {pos: lexer.get_pos(), first_lexeme: first_lexeme.to_string(), rest: lexer.read_n(50).to_string()});
283    };
284
285    // trace!("Read object"; "Obj" => format!("{}", obj));
286
287    Ok(obj)
288}
289
290
291pub fn parse_stream(data: &[u8], resolve: &impl Resolve, ctx: &Context) -> Result<PdfStream> {
292    parse_stream_with_lexer(&mut Lexer::new(data), resolve, ctx)
293}
294
295
296fn parse_stream_with_lexer(lexer: &mut Lexer, r: &impl Resolve, ctx: &Context) -> Result<PdfStream> {
297    let first_lexeme = t!(lexer.next());
298
299    let obj = if first_lexeme.equals(b"<<") {
300        let dict = parse_dictionary_object(lexer, r, None, MAX_DEPTH)?;
301        // It might just be the dictionary in front of a stream.
302        if t!(lexer.peek()).equals(b"stream") {
303            let ctx = Context {
304                decoder: None,
305                id: ctx.id
306            };
307            t!(parse_stream_object(dict, lexer, r, &ctx))
308        } else {
309            err!(PdfError::UnexpectedPrimitive { expected: "Stream", found: "Dictionary" });
310        }
311    } else {
312        err!(PdfError::UnexpectedPrimitive { expected: "Stream", found: "something else" });
313    };
314
315    Ok(obj)
316}
317
#[cfg(test)]
mod tests {
    use super::lexer::Lexer;
    use super::{parse, parse_stream, parse_with_lexer, Context, ParseFlags};
    use crate::object::NoResolve;
    use crate::primitive::{Dictionary, PdfString, Primitive};

    /// Asserts that `dict["App"]` is a one-entry dictionary whose `Name` is the empty name.
    fn assert_app_has_empty_name(dict: &Dictionary) {
        let app_dict = dict.get("App").unwrap().clone().into_dictionary().unwrap();
        assert_eq!(app_dict.len(), 1);
        let name = app_dict.get("Name").unwrap().as_name().unwrap();
        assert_eq!(name, "");
    }

    /// A bare `/` used as a value parses as the empty name, in a plain
    /// dictionary as well as in a stream dictionary.
    #[test]
    fn dict_with_empty_name_as_value() {
        // Plain dictionary.
        let primitive = parse(b"<</App<</Name/>>>>", &NoResolve, ParseFlags::DICT).unwrap();
        let dict = primitive.into_dictionary().unwrap();
        assert_eq!(dict.len(), 1);
        assert_app_has_empty_name(&dict);

        // Stream dictionary.
        let stream = parse_stream(
            b"<</Length 0/App<</Name/>>>>stream\nendstream\n",
            &NoResolve,
            &Context::fake(),
        )
        .unwrap();
        assert_eq!(stream.info.len(), 2);
        assert_app_has_empty_name(&stream.info);
    }

    /// A bare `/` used as a key parses as the empty name, in a plain
    /// dictionary as well as in a stream dictionary.
    #[test]
    fn dict_with_empty_name_as_key() {
        // Plain dictionary.
        let primitive = parse(b"<</ true>>", &NoResolve, ParseFlags::DICT).unwrap();
        let dict = primitive.into_dictionary().unwrap();
        assert_eq!(dict.len(), 1);
        assert!(dict.get("").unwrap().as_bool().unwrap());

        // Stream dictionary.
        let stream = parse_stream(
            b"<</Length 0/ true>>stream\nendstream\n",
            &NoResolve,
            &Context::fake(),
        )
        .unwrap();
        let info = stream.info;
        assert_eq!(info.len(), 2);
        assert!(info.get("").unwrap().as_bool().unwrap());
    }

    /// `[]` parses as an array without elements.
    #[test]
    fn empty_array() {
        let primitive = parse(b"[]", &NoResolve, ParseFlags::ARRAY).unwrap();
        let array = primitive.into_array().unwrap();
        assert!(array.is_empty());
    }

    /// Strings and integers packed without separating whitespace are split
    /// correctly, and the lexer is left right after the closing bracket.
    #[test]
    fn compact_array() {
        let mut lx = Lexer::new(b"[(Complete L)20(egend for Physical and P)20(olitical Maps)]TJ");

        let expected = Primitive::Array(vec![
            Primitive::String(PdfString::new("Complete L".into())),
            Primitive::Integer(20),
            Primitive::String(PdfString::new("egend for Physical and P".into())),
            Primitive::Integer(20),
            Primitive::String(PdfString::new("olitical Maps".into()))
        ]);
        assert_eq!(parse_with_lexer(&mut lx, &NoResolve, ParseFlags::ANY).unwrap(), expected);

        // Only the operator remains, followed by the end of the input.
        assert_eq!(lx.next().unwrap().as_str().unwrap(), "TJ");
        assert!(lx.next().unwrap_err().is_eof());
    }
}
403}