pdf/parser/
mod.rs

1//! Basic functionality for parsing a PDF file.
2
3mod lexer;
4mod parse_object;
5mod parse_xref;
6
7pub use self::lexer::*;
8pub use self::parse_object::*;
9pub use self::parse_xref::*;
10
11use crate::error::*;
12use crate::primitive::StreamInner;
13use crate::primitive::{Primitive, Dictionary, PdfStream, PdfString};
14use crate::object::{ObjNr, GenNr, PlainRef, Resolve};
15use crate::crypt::Decoder;
16use bitflags::bitflags;
17use istring::{SmallBytes, SmallString, IBytes};
18
/// Nesting budget handed to the recursive parser by the public entry points.
/// Each array or dictionary level consumes one unit; a container encountered
/// with a budget of zero is rejected with `PdfError::MaxDepth`.
const MAX_DEPTH: usize = 20;
20
21
22bitflags! {
23    #[repr(transparent)]
24    #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
25    pub struct ParseFlags: u16 {
26        const INTEGER = 1 << 0;
27        const STREAM = 1 << 1;
28        const DICT = 1 << 2;
29        const NUMBER = 1 << 3;
30        const NAME = 1 << 4;
31        const ARRAY = 1 << 5;
32        const STRING = 1 << 6;
33        const BOOL = 1 << 7;
34        const NULL = 1 << 8;
35        const REF = 1 << 9;
36        const ANY = (1 << 10) - 1;
37    }
38}
39
40
41pub struct Context<'a> {
42    pub decoder: Option<&'a Decoder>,
43    pub id: PlainRef,
44}
45impl<'a> Context<'a> {
46    pub fn decrypt<'buf>(&self, data: &'buf mut [u8]) -> Result<&'buf [u8]> {
47        if let Some(decoder) = self.decoder {
48            decoder.decrypt(self.id, data)
49        } else {
50            Ok(data)
51        }
52    }
53    #[cfg(test)]
54    fn fake() -> Self {
55        Context {
56            decoder: None,
57            id: PlainRef { id: 0, gen: 0 }
58        }
59    }
60}
61
62/// Can parse stream but only if its dictionary does not contain indirect references.
63/// Use `parse_stream` if this is insufficient.
64pub fn parse(data: &[u8], r: &impl Resolve, flags: ParseFlags) -> Result<Primitive> {
65    parse_with_lexer(&mut Lexer::new(data), r, flags)
66}
67
68/// Recursive. Can parse stream but only if its dictionary does not contain indirect references.
69/// Use `parse_stream` if this is not sufficient.
70pub fn parse_with_lexer(lexer: &mut Lexer, r: &impl Resolve, flags: ParseFlags) -> Result<Primitive> {
71    parse_with_lexer_ctx(lexer, r, None, flags, MAX_DEPTH)
72}
73
74fn parse_dictionary_object(lexer: &mut Lexer, r: &impl Resolve, ctx: Option<&Context>, max_depth: usize) -> Result<Dictionary> {
75    let mut dict = Dictionary::default();
76    loop {
77        // Expect a Name (and Object) or the '>>' delimiter
78        let token = t!(lexer.next());
79        if token.starts_with(b"/") {
80            let key = token.reslice(1..).to_name()?;
81            let obj = t!(parse_with_lexer_ctx(lexer, r, ctx, ParseFlags::ANY, max_depth));
82            dict.insert(key, obj);
83        } else if token.equals(b">>") {
84            break;
85        } else {
86            err!(PdfError::UnexpectedLexeme{ pos: lexer.get_pos(), lexeme: token.to_string(), expected: "/ or >>"});
87        }
88    }
89    Ok(dict)
90}
91
92fn parse_stream_object(dict: Dictionary, lexer: &mut Lexer, r: &impl Resolve, ctx: &Context) -> Result<PdfStream> {
93    t!(lexer.next_stream());
94
95    let length = match dict.get("Length") {
96        Some(&Primitive::Integer(n)) if n >= 0 => n as usize,
97        Some(&Primitive::Reference(reference)) => t!(t!(r.resolve_flags(reference, ParseFlags::INTEGER, 1)).as_usize()),
98        Some(other) => err!(PdfError::UnexpectedPrimitive { expected: "unsigned Integer or Reference", found: other.get_debug_name() }),
99        None => err!(PdfError::MissingEntry { typ: "<Stream>", field: "Length".into() }),
100    };
101
102    let stream_substr = lexer.read_n(length);
103
104    if stream_substr.len() != length {
105        err!(PdfError::EOF)
106    }
107
108    // Finish
109    t!(lexer.next_expect("endstream"));
110
111    Ok(PdfStream {
112        inner: StreamInner::InFile {
113            id: ctx.id,
114            file_range: stream_substr.file_range(),
115        },
116        info: dict,
117    })
118}
119
120#[inline]
121fn check(flags: ParseFlags, allowed: ParseFlags) -> Result<(), PdfError> {
122    if !flags.intersects(allowed) {
123        return Err(PdfError::PrimitiveNotAllowed { allowed, found: flags });
124    }
125    Ok(())
126}
127
128/// Recursive. Can parse stream but only if its dictionary does not contain indirect references.
129/// Use `parse_stream` if this is not sufficient.
130pub fn parse_with_lexer_ctx(lexer: &mut Lexer, r: &impl Resolve, ctx: Option<&Context>, flags: ParseFlags, max_depth: usize) -> Result<Primitive> {
131    let pos = lexer.get_pos();
132    match _parse_with_lexer_ctx(lexer, r, ctx, flags, max_depth) {
133        Ok(r) => Ok(r),
134        Err(e) => {
135            lexer.set_pos(pos);
136            Err(e)
137        }
138    }
139}
140fn _parse_with_lexer_ctx(lexer: &mut Lexer, r: &impl Resolve, ctx: Option<&Context>, flags: ParseFlags, max_depth: usize) -> Result<Primitive> {
141
142    let input = lexer.get_remaining_slice();
143    let first_lexeme = t!(lexer.next(), std::str::from_utf8(input));
144
145    let obj = if first_lexeme.equals(b"<<") {
146        check(flags, ParseFlags::DICT)?;
147
148        if max_depth == 0 {
149            return Err(PdfError::MaxDepth);
150        }
151        let dict = t!(parse_dictionary_object(lexer, r, ctx, max_depth-1));
152        // It might just be the dictionary in front of a stream.
153        if t!(lexer.peek()).equals(b"stream") {
154            let ctx = ctx.ok_or(PdfError::PrimitiveNotAllowed { allowed: ParseFlags::STREAM, found: flags })?;
155            Primitive::Stream(t!(parse_stream_object(dict, lexer, r, ctx)))
156        } else {
157            Primitive::Dictionary(dict)
158        }
159    } else if first_lexeme.is_integer() {
160        // May be Integer or Reference
161        check(flags, ParseFlags::INTEGER | ParseFlags::REF)?;
162
163        // First backup position
164        let pos_bk = lexer.get_pos();
165
166        let second_lexeme = t!(lexer.next());
167        if second_lexeme.is_integer() {
168            let third_lexeme = t!(lexer.next());
169            if third_lexeme.equals(b"R") {
170                // It is indeed a reference to an indirect object
171                check(flags, ParseFlags::REF)?;
172                Primitive::Reference (PlainRef {
173                    id: t!(first_lexeme.to::<ObjNr>()),
174                    gen: t!(second_lexeme.to::<GenNr>()),
175                })
176            } else {
177                check(flags, ParseFlags::INTEGER)?;
178                // We are probably in an array of numbers - it's not a reference anyway
179                lexer.set_pos(pos_bk); // (roll back the lexer first)
180                Primitive::Integer(t!(first_lexeme.to::<i32>()))
181            }
182        } else {
183            check(flags, ParseFlags::INTEGER)?;
184            // It is but a number
185            lexer.set_pos(pos_bk); // (roll back the lexer first)
186            Primitive::Integer(t!(first_lexeme.to::<i32>()))
187        }
188    } else if let Some(s) = first_lexeme.real_number() {
189        check(flags, ParseFlags::NUMBER)?;
190        // Real Number
191        Primitive::Number (t!(s.to::<f32>(), s.to_string()))
192    } else if first_lexeme.starts_with(b"/") {
193        check(flags, ParseFlags::NAME)?;
194        // Name
195
196        let mut rest: &[u8] = &first_lexeme.reslice(1..);
197        let s = if rest.contains(&b'#') {
198            let mut s = IBytes::new();
199            while let Some(idx) = rest.iter().position(|&b| b == b'#') {
200                use crate::enc::decode_nibble;
201                use std::convert::TryInto;
202                let [hi, lo]: [u8; 2] = rest.get(idx+1 .. idx+3).ok_or(PdfError::EOF)?.try_into().unwrap();
203                let byte = match (decode_nibble(lo), decode_nibble(hi)) {
204                    (Some(low), Some(high)) => low | high << 4,
205                    _ => return Err(PdfError::HexDecode { pos: idx, bytes: [hi, lo] }),
206                };
207                s.extend_from_slice(&rest[..idx]);
208                s.push(byte);
209                rest = &rest[idx+3..];
210            }
211            s.extend_from_slice(rest);
212            SmallBytes::from(s.as_slice())
213        } else {
214            SmallBytes::from(rest)
215        };
216        
217        Primitive::Name(SmallString::from_utf8(s)?)
218    } else if first_lexeme.equals(b"[") {
219        check(flags, ParseFlags::ARRAY)?;
220        if max_depth == 0 {
221            return Err(PdfError::MaxDepth);
222        }
223        let mut array = Vec::new();
224        // Array
225        loop {
226            // Exit if closing delimiter
227            if lexer.peek()?.equals(b"]") {
228                break;
229            }
230
231            let element = t!(parse_with_lexer_ctx(lexer, r, ctx, ParseFlags::ANY, max_depth-1));
232            array.push(element);
233        }
234        t!(lexer.next()); // Move beyond closing delimiter
235
236        Primitive::Array (array)
237    } else if first_lexeme.equals(b"(") {
238        check(flags, ParseFlags::STRING)?;
239        let mut string = IBytes::new();
240
241        let bytes_traversed = {
242            let mut string_lexer = StringLexer::new(lexer.get_remaining_slice());
243            for character in string_lexer.iter() {
244                string.push(t!(character));
245            }
246            string_lexer.get_offset()
247        };
248        // Advance to end of string
249        lexer.offset_pos(bytes_traversed);
250        // decrypt it
251        if let Some(ctx) = ctx {
252            string = t!(ctx.decrypt(&mut string)).into();
253        }
254        Primitive::String (PdfString::new(string))
255    } else if first_lexeme.equals(b"<") {
256        check(flags, ParseFlags::STRING)?;
257        let mut string = IBytes::new();
258
259        let bytes_traversed = {
260            let mut hex_string_lexer = HexStringLexer::new(lexer.get_remaining_slice());
261            for byte in hex_string_lexer.iter() {
262                string.push(t!(byte));
263            }
264            hex_string_lexer.get_offset()
265        };
266        // Advance to end of string
267        lexer.offset_pos(bytes_traversed);
268
269        // decrypt it
270        if let Some(ctx) = ctx {
271            string = t!(ctx.decrypt(&mut string)).into();
272        }
273        Primitive::String (PdfString::new(string))
274    } else if first_lexeme.equals(b"true") {
275        check(flags, ParseFlags::BOOL)?;
276        Primitive::Boolean (true)
277    } else if first_lexeme.equals(b"false") {
278        check(flags, ParseFlags::BOOL)?;
279        Primitive::Boolean (false)
280    } else if first_lexeme.equals(b"null") {
281        check(flags, ParseFlags::NULL)?;
282        Primitive::Null
283    } else {
284        err!(PdfError::UnknownType {pos: lexer.get_pos(), first_lexeme: first_lexeme.to_string(), rest: lexer.read_n(50).to_string()});
285    };
286
287    // trace!("Read object"; "Obj" => format!("{}", obj));
288
289    Ok(obj)
290}
291
292
293pub fn parse_stream(data: &[u8], resolve: &impl Resolve, ctx: &Context) -> Result<PdfStream> {
294    parse_stream_with_lexer(&mut Lexer::new(data), resolve, ctx)
295}
296
297
298fn parse_stream_with_lexer(lexer: &mut Lexer, r: &impl Resolve, ctx: &Context) -> Result<PdfStream> {
299    let first_lexeme = t!(lexer.next());
300
301    let obj = if first_lexeme.equals(b"<<") {
302        let dict = t!(parse_dictionary_object(lexer, r, None, MAX_DEPTH));
303        // It might just be the dictionary in front of a stream.
304        if t!(lexer.peek()).equals(b"stream") {
305            let ctx = Context {
306                decoder: None,
307                id: ctx.id
308            };
309            t!(parse_stream_object(dict, lexer, r, &ctx))
310        } else {
311            err!(PdfError::UnexpectedPrimitive { expected: "Stream", found: "Dictionary" });
312        }
313    } else {
314        err!(PdfError::UnexpectedPrimitive { expected: "Stream", found: "something else" });
315    };
316
317    Ok(obj)
318}
319
#[cfg(test)]
mod tests {
    use super::lexer::Lexer;
    use super::{parse, parse_stream, parse_with_lexer, Context, ParseFlags};
    use crate::object::NoResolve;
    use crate::primitive::{Dictionary, PdfString, Primitive};

    /// Asserts that `outer` holds an `App` dictionary whose only entry is `Name` with an empty name.
    fn assert_app_has_empty_name(outer: &Dictionary) {
        let app = outer.get("App").unwrap().clone().into_dictionary().unwrap();
        assert_eq!(app.len(), 1);
        let name = app.get("Name").unwrap().as_name().unwrap();
        assert_eq!(name, "");
    }

    /// An empty name (`/` directly followed by a delimiter) is accepted as a dictionary value.
    #[test]
    fn dict_with_empty_name_as_value() {
        // Plain dictionary.
        let parsed = parse(b"<</App<</Name/>>>>", &NoResolve, ParseFlags::DICT).unwrap();
        let plain = parsed.into_dictionary().unwrap();
        assert_eq!(plain.len(), 1);
        assert_app_has_empty_name(&plain);

        // Same dictionary in front of an empty stream.
        let stream = parse_stream(
            b"<</Length 0/App<</Name/>>>>stream\nendstream\n",
            &NoResolve,
            &Context::fake(),
        )
        .unwrap();
        assert_eq!(stream.info.len(), 2);
        assert_app_has_empty_name(&stream.info);
    }

    /// An empty name is accepted as a dictionary key.
    #[test]
    fn dict_with_empty_name_as_key() {
        // Plain dictionary.
        let parsed = parse(b"<</ true>>", &NoResolve, ParseFlags::DICT).unwrap();
        let plain = parsed.into_dictionary().unwrap();
        assert_eq!(plain.len(), 1);
        assert!(plain.get("").unwrap().as_bool().unwrap());

        // Same key inside a stream dictionary.
        let stream = parse_stream(
            b"<</Length 0/ true>>stream\nendstream\n",
            &NoResolve,
            &Context::fake(),
        )
        .unwrap();
        let info = stream.info;
        assert_eq!(info.len(), 2);
        assert!(info.get("").unwrap().as_bool().unwrap());
    }

    /// `[]` parses to an array without elements.
    #[test]
    fn empty_array() {
        let parsed = parse(b"[]", &NoResolve, ParseFlags::ARRAY).unwrap();
        let elements = parsed.into_array().unwrap();
        assert!(elements.is_empty());
    }

    /// Strings and integers written without separating whitespace are split correctly,
    /// and the lexer ends up right behind the array.
    #[test]
    fn compact_array() {
        let mut lexer = Lexer::new(b"[(Complete L)20(egend for Physical and P)20(olitical Maps)]TJ");

        let expected = Primitive::Array(vec![
            Primitive::String(PdfString::new("Complete L".into())),
            Primitive::Integer(20),
            Primitive::String(PdfString::new("egend for Physical and P".into())),
            Primitive::Integer(20),
            Primitive::String(PdfString::new("olitical Maps".into())),
        ]);
        let parsed = parse_with_lexer(&mut lexer, &NoResolve, ParseFlags::ANY).unwrap();
        assert_eq!(parsed, expected);

        // Only the operator is left, then the input ends.
        assert_eq!(lexer.next().unwrap().as_str().unwrap(), "TJ");
        assert!(lexer.next().unwrap_err().is_eof());
    }
}