pdf/parser/lexer/
mod.rs

//! Lexing an input file, in the sense of breaking it up into substrings based on delimiters and
//! whitespace.
3
4use std::str::FromStr;
5use std::ops::{Range, Deref, RangeFrom};
6use std::borrow::Cow;
7
8use crate::error::*;
9use crate::primitive::Name;
10
11mod str;
12pub use self::str::{StringLexer, HexStringLexer};
13
14
/// A cursor over a byte buffer that yields PDF lexemes and can be moved forwards, backwards or
/// to arbitrary positions.
#[derive(Copy, Clone)]
#[allow(dead_code)]
pub struct Lexer<'a> {
    /// Current cursor position, as an index into `buf`.
    pos: usize,
    /// The bytes being lexed.
    buf: &'a [u8],
    /// Added to buffer indices to form the `file_offset` of every `Substr` this lexer creates.
    file_offset: usize,
}
23
/// Scans backwards from `pos` and returns the index where the run of bytes satisfying
/// `condition` that ends just before `pos` begins.
///
/// Only `data[..pos]` is inspected. The result is one past the last byte for which `condition`
/// is false, or `0` when every byte before `pos` satisfies it.
///
/// # Panics
/// Panics if `pos > data.len()`.
#[inline]
fn boundary_rev(data: &[u8], pos: usize, condition: impl Fn(u8) -> bool) -> usize {
    data[..pos]
        .iter()
        .rposition(|&byte| !condition(byte))
        .map_or(0, |last_rejected| last_rejected + 1)
}
32
/// Scans forwards from `pos` and returns the index of the first byte for which `condition` is
/// false.
///
/// Only `data[pos..]` is inspected. When every remaining byte satisfies `condition` the result
/// is `data.len()`.
///
/// # Panics
/// Panics if `pos > data.len()`.
#[inline]
fn boundary(data: &[u8], pos: usize, condition: impl Fn(u8) -> bool) -> usize {
    data[pos..]
        .iter()
        .position(|&byte| !condition(byte))
        .map_or(data.len(), |run_len| pos + run_len)
}
41
/// Returns `true` if `b` is one of the six PDF white-space characters: NUL, horizontal tab,
/// line feed, form feed, carriage return or space.
#[inline]
fn is_whitespace(b: u8) -> bool {
    // 0x0C is form feed, which has no short escape in Rust byte literals.
    matches!(b, 0 | b'\t' | b'\n' | 0x0C | b'\r' | b' ')
}
/// Wraps the predicate `f` in a new predicate that returns the opposite answer.
#[inline]
fn not<T>(f: impl Fn(T) -> bool) -> impl Fn(T) -> bool {
    move |value| {
        let accepted = f(value);
        !accepted
    }
}
50impl<'a> Lexer<'a> {
51    pub fn new(buf: &'a [u8]) -> Lexer<'a> {
52        Lexer {
53            pos: 0,
54            buf,
55            file_offset: 0
56        }
57    }
58    pub fn with_offset(buf: &'a [u8], file_offset: usize) -> Lexer<'a> {
59        Lexer {
60            pos: 0,
61            buf,
62            file_offset
63        }
64    }
65
66    /// Returns next lexeme. Lexer moves to the next byte after the lexeme. (needs to be tested)
67    #[allow(clippy::should_implement_trait)]
68    pub fn next(&mut self) -> Result<Substr<'a>> {
69        let (lexeme, pos) = self.next_word()?;
70        self.pos = pos;
71        Ok(lexeme)
72    }
73
74    /// consume the whitespace sequence following the stream start
75    pub fn next_stream(&mut self) -> Result<()> {
76        let pos = self.skip_whitespace(self.pos)?;
77        if !self.buf[pos ..].starts_with(b"stream") {
78            // bail!("next token isn't 'stream'");
79        }
80        
81        let &b0 = self.buf.get(pos + 6).ok_or(PdfError::EOF)?;
82        if b0 == b'\n' {
83            self.pos = pos + 7;
84        } else if b0 == b'\r' {
85            let &b1 = self.buf.get(pos + 7).ok_or(PdfError::EOF)?;
86            if b1 != b'\n' {
87                bail!("invalid whitespace following 'stream'");
88                // bail!("invalid whitespace following 'stream'");
89            }
90            self.pos = pos + 8;
91        } else {
92            bail!("invalid whitespace");
93        }
94        Ok(())
95    }
96    /// Gives previous lexeme. Lexer moves to the first byte of this lexeme. (needs to be tested)
97    pub fn back(&mut self) -> Result<Substr<'a>> {
98        //println!("back: {:?}", String::from_utf8_lossy(&self.buf[self.pos.saturating_sub(20) .. self.pos]));
99        
100        // first reverse until we find non-whitespace
101        let end_pos = boundary_rev(self.buf, self.pos, is_whitespace);
102        let start_pos = boundary_rev(self.buf, end_pos, not(is_whitespace));
103        self.pos = start_pos;
104        
105        Ok(self.new_substr(start_pos .. end_pos))
106    }
107
108    /// Look at the next lexeme. Will return empty substr if the next character is EOF.
109    pub fn peek(&self) -> Result<Substr<'a>> {
110        match self.next_word() {
111            Ok((substr, _)) => Ok(substr),
112            Err(PdfError::EOF) => Ok(self.new_substr(self.pos..self.pos)),
113            Err(e) => Err(e),
114        }
115
116    }
117
118    /// Returns `Ok` if the next lexeme matches `expected` - else `Err`.
119    pub fn next_expect(&mut self, expected: &'static str) -> Result<()> {
120        let word = self.next()?;
121        if word.equals(expected.as_bytes()) {
122            Ok(())
123        } else {
124            Err(PdfError::UnexpectedLexeme {
125                pos: self.pos,
126                lexeme: word.to_string(),
127                expected
128            })
129        }
130    }
131
132    /// skip whitespaces and return the position of the first non-whitespace character
133    #[inline]
134    fn skip_whitespace(&self, pos: usize) -> Result<usize> {
135        // Move away from eventual whitespace
136        let pos = boundary(self.buf, pos, is_whitespace);
137        if pos >= self.buf.len() {
138            Err(PdfError::EOF)
139        } else {
140            Ok(pos)
141        }
142    }
143
144    /// Used by next, peek and back - returns substring and new position
145    /// If forward, places pointer at the next non-whitespace character.
146    /// If backward, places pointer at the start of the current word.
147    // TODO ^ backward case is actually not tested or.. thought about that well.
148    fn next_word(&self) -> Result<(Substr<'a>, usize)> {
149        if self.pos == self.buf.len() {
150            return Err(PdfError::EOF);
151        }
152        let mut pos = self.skip_whitespace(self.pos)?;
153        while self.buf.get(pos) == Some(&b'%') {
154            pos += 1;
155            if let Some(off) = self.buf[pos..].iter().position(|&b| b == b'\n') {
156                pos += off+1;
157            }
158            
159            // Move away from eventual whitespace
160            pos = self.skip_whitespace(pos)?;
161        }
162        
163        let start_pos = pos;
164
165        // If first character is delimiter, this lexeme only contains that character.
166        //  - except << and >> which go together, and / which marks the start of a
167        // name token.
168        if self.is_delimiter(pos) {
169            if self.buf[pos] == b'/' {
170                pos = self.advance_pos(pos)?;
171                while !self.is_whitespace(pos) && !self.is_delimiter(pos) {
172                    match self.advance_pos(pos) {
173                        Ok(p) => pos = p,
174                        Err(_) => break,
175                    }
176                }
177                return Ok((self.new_substr(start_pos..pos), pos));
178            }
179
180            if let Some(slice) = self.buf.get(pos..=pos+1) {
181                if slice == b"<<" || slice == b">>" {
182                    pos = self.advance_pos(pos)?;
183                }
184            }
185
186            pos = self.advance_pos(pos)?;
187            return Ok((self.new_substr(start_pos..pos), pos));
188        }
189
190        // Read to past the end of lexeme
191        while !self.is_whitespace(pos) && !self.is_delimiter(pos) {
192            match self.advance_pos(pos) {
193                Ok(p) => pos = p,
194                Err(_) => break,
195            }
196        }
197        let result = self.new_substr(start_pos..pos);
198
199        // Move away from whitespace again
200        //pos = self.skip_whitespace(pos)?;
201        Ok((result, pos))
202    }
203
204    /// Just a helper for next_word.
205    #[inline]
206    fn advance_pos(&self, pos: usize) -> Result<usize> {
207        if pos < self.buf.len() {
208            Ok(pos + 1)
209        } else {
210            Err(PdfError::EOF)
211        }
212    }
213
214    #[inline]
215    pub fn next_as<T>(&mut self) -> Result<T>
216        where T: FromStr, T::Err: std::error::Error + Send + Sync + 'static
217    {
218        self.next().and_then(|word| word.to::<T>())
219    }
220
221    #[inline]
222    pub fn get_pos(&self) -> usize {
223        self.pos
224    }
225
226    #[inline]
227    pub fn new_substr(&self, mut range: Range<usize>) -> Substr<'a> {
228        // if the range is backward, fix it
229        // start is inclusive, end is exclusive. keep that in mind
230        if range.start > range.end {
231            let new_end = range.start + 1;
232            range.start = range.end + 1;
233            range.end = new_end;
234        }
235
236        Substr {
237            file_offset: self.file_offset + range.start,
238            slice: &self.buf[range],
239        }
240    }
241
242    /// Just a helper function for set_pos, set_pos_from_end and offset_pos.
243    #[inline]
244    pub fn set_pos(&mut self, wanted_pos: usize) -> Substr<'a> {
245        let new_pos = wanted_pos.min(self.buf.len());
246        let range = if self.pos < new_pos {
247            self.pos..new_pos
248        } else {
249            new_pos..self.pos
250        };
251        self.pos = new_pos;
252        self.new_substr(range)
253    }
254
255    /// Returns the substr between the old and new positions
256    #[inline]
257    pub fn set_pos_from_end(&mut self, new_pos: usize) -> Substr<'a> {
258        self.set_pos(self.buf.len().saturating_sub(new_pos).saturating_sub(1))
259    }
260    /// Returns the substr between the old and new positions
261    #[inline]
262    pub fn offset_pos(&mut self, offset: usize) -> Substr<'a> {
263        self.set_pos(self.pos.wrapping_add(offset))
264    }
265
266    /// Moves pos to start of next line. Returns the skipped-over substring.
267    #[allow(dead_code)]
268    pub fn seek_newline(&mut self) -> Substr{
269        let start = self.pos;
270        while self.buf[self.pos] != b'\n' 
271            && self.incr_pos() { }
272        self.incr_pos();
273
274        self.new_substr(start..self.pos)
275    }
276
277
278    // TODO: seek_substr and seek_substr_back should use next() or back()?
279    /// Moves pos to after the found `substr`. Returns Substr with traversed text if `substr` is found.
280    #[allow(dead_code)]
281    pub fn seek_substr(&mut self, substr: impl AsRef<[u8]>) -> Option<Substr<'a>> {
282        //
283        let substr = substr.as_ref();
284        let start = self.pos;
285        let mut matched = 0;
286        loop {
287            if self.pos >= self.buf.len() {
288                return None
289            }
290            if self.buf[self.pos] == substr[matched] {
291                matched += 1;
292            } else {
293                matched = 0;
294            }
295            if matched == substr.len() {
296                break;
297            }
298            self.pos += 1;
299        }
300        self.pos += 1;
301        Some(self.new_substr(start..(self.pos - substr.len())))
302    }
303
304    //TODO perhaps seek_substr_back should, like back(), move to the first letter of the substr.
305    /// Searches for string backward. Moves to after the found `substr`, returns the traversed
306    /// Substr if found.
307    pub fn seek_substr_back(&mut self, substr: &[u8]) -> Result<Substr<'a>> {
308        let end = self.pos;
309        match self.buf[.. end].windows(substr.len()).rposition(|w| w == substr) {
310            Some(start) => {
311                self.pos = start + substr.len();
312                Ok(self.new_substr(self.pos .. end))
313            }
314            None => Err(PdfError::NotFound {word: String::from_utf8_lossy(substr).into() })
315        }
316    }
317
318    /// Read and return slice of at most n bytes.
319    #[allow(dead_code)]
320    pub fn read_n(&mut self, n: usize) -> Substr<'a> {
321        let start_pos = self.pos;
322        self.pos += n;
323        if self.pos >= self.buf.len() {
324            self.pos = self.buf.len() - 1;
325        }
326        if start_pos < self.buf.len() {
327            self.new_substr(start_pos..self.pos)
328        } else {
329            self.new_substr(0..0)
330        }
331    }
332
333    /// Returns slice from current position to end.
334    #[inline]
335    pub fn get_remaining_slice(&self) -> &'a [u8] {
336        &self.buf[self.pos..]
337    }
338
339    /// for debugging
340    pub fn ctx(&self) -> Cow<str> {
341        String::from_utf8_lossy(&self.buf[self.pos.saturating_sub(40)..self.buf.len().min(self.pos+40)])
342    }
343
344    #[inline]
345    fn incr_pos(&mut self) -> bool {
346        if self.pos >= self.buf.len() - 1 {
347            false
348        } else {
349            self.pos += 1;
350            true
351        }
352    }
353    #[inline]
354    fn is_whitespace(&self, pos: usize) -> bool {
355        self.buf.get(pos).map(|&b| is_whitespace(b)).unwrap_or(false)
356    }
357
358    #[inline]
359    fn is_delimiter(&self, pos: usize) -> bool {
360        self.buf.get(pos).map(|b| b"()<>[]{}/%".contains(b)).unwrap_or(false)
361    }
362
363}
364
365
366
/// A lexeme: a borrowed slice of the original input together with the offset it was found at.
#[derive(Copy, Clone, Debug)]
pub struct Substr<'a> {
    /// The bytes of the lexeme.
    slice: &'a [u8],
    /// Offset of the first byte of `slice`, as supplied by whoever created the substring.
    file_offset: usize,
}
373impl<'a> Substr<'a> {
374    pub fn new<T: AsRef<[u8]> + ?Sized>(data: &'a T, file_offset: usize) -> Self {
375        Substr { slice: data.as_ref(), file_offset }
376    }
377    // to: &S -> U. Possibly expensive conversion.
378    // as: &S -> &U. Cheap borrow conversion
379    // into: S -> U. Cheap ownership transfer conversion.
380
381    #[allow(clippy::inherent_to_string)]
382    pub fn to_string(&self) -> String {
383        String::from_utf8_lossy(self.as_slice()).into()
384    }
385    pub fn to_name(&self) -> Result<Name> {
386        Ok(Name(std::str::from_utf8(self.as_slice())?.into()))
387    }
388    pub fn to_vec(&self) -> Vec<u8> {
389        self.slice.to_vec()
390    }
391    pub fn to<T>(&self) -> Result<T>
392        where T: FromStr, T::Err: std::error::Error + Send + Sync + 'static
393    {
394        std::str::from_utf8(self.slice)?.parse::<T>().map_err(|e| PdfError::Parse { source: e.into() })
395    }
396    pub fn is_integer(&self) -> bool {
397        if self.slice.len() == 0 {
398            return false;
399        }
400        let mut slice = self.slice;
401        if slice[0] == b'-' {
402            if slice.len() < 2 {
403                return false;
404            }
405            slice = &slice[1..];
406        }
407        is_int(slice)
408    }
409    pub fn is_real_number(&self) -> bool {
410        self.real_number().is_some()
411    }
412    pub fn real_number(&self) -> Option<Self> {
413        if self.slice.len() == 0 {
414            return None;
415        }
416        let mut slice = self.slice;
417        if slice[0] == b'-' {
418            if slice.len() < 2 {
419                return None;
420            }
421            slice = &slice[1..];
422        }
423        if let Some(i) = slice.iter().position(|&b| b == b'.') {
424            if !is_int(&slice[..i]) {
425                return None;
426            }
427            slice = &slice[i+1..];
428        }
429        if let Some(len) = slice.iter().position(|&b| !b.is_ascii_digit()) {
430            if len == 0 {
431                return None;
432            }
433            let end = self.slice.len() - slice.len() + len;
434            Some(Substr {
435                file_offset: self.file_offset,
436                slice: &self.slice[..end]
437            })
438        } else {
439            Some(*self)
440        }
441    }
442
443    pub fn as_slice(&self) -> &'a [u8] {
444        self.slice
445    }
446    pub fn as_str(&self) -> Result<&str> {
447        std::str::from_utf8(self.slice).map_err(|e| PdfError::Parse { source: e.into() })
448    }
449
450    pub fn equals(&self, other: impl AsRef<[u8]>) -> bool {
451        self.slice == other.as_ref()
452    }
453
454    pub fn reslice(&self, range: RangeFrom<usize>) -> Substr<'a> {
455        Substr {
456            file_offset: self.file_offset + range.start,
457            slice: &self.slice[range],
458        }
459    }
460
461    pub fn file_range(&self) -> Range<usize> {
462        self.file_offset .. self.file_offset + self.slice.len()
463    }
464}
465
/// Returns `true` if `b` contains nothing but ASCII digits. An empty slice counts as `true`.
#[inline]
fn is_int(b: &[u8]) -> bool {
    !b.iter().any(|byte| !byte.is_ascii_digit())
}
470impl<'a> Deref for Substr<'a> {
471    type Target = [u8];
472    fn deref(&self) -> &[u8] {
473        self.as_slice()
474    }
475}
476impl<'a> PartialEq<&[u8]> for Substr<'a> {
477    fn eq(&self, rhs: &&[u8]) -> bool {
478        self.equals(rhs)
479    }
480}
481
482impl<'a> PartialEq<&str> for Substr<'a> {
483    fn eq(&self, rhs: &&str) -> bool {
484        self.equals(rhs.as_bytes())
485    }
486}
487
#[cfg(test)]
mod tests {
    use super::*;

    /// `boundary_rev` finds where the run ending before `pos` starts.
    #[test]
    fn test_boundary_rev() {
        let data = b" hello";
        assert_eq!(boundary_rev(data, 3, not(is_whitespace)), 1);
        assert_eq!(boundary_rev(data, 3, is_whitespace), 3);
    }

    /// `boundary` finds where the run starting at `pos` ends.
    #[test]
    fn test_boundary() {
        let skip_ws = |data: &[u8], from: usize| boundary(data, from, is_whitespace);
        let skip_word = |data: &[u8], from: usize| boundary(data, from, not(is_whitespace));

        assert_eq!(skip_word(b" hello ", 3), 6);
        assert_eq!(skip_ws(b" hello ", 3), 3);
        assert_eq!(skip_ws(b"01234  7orld", 5), 7);
        assert_eq!(skip_ws(b"01234  7orld", 7), 7);
        assert_eq!(skip_ws(b"q\n", 1), 2);
    }

    /// Number classification of lexemes.
    #[test]
    fn test_substr() {
        for text in &["123", "123.", "123.45", ".45", "-.45"] {
            assert!(Substr::new(*text, 0).is_real_number());
        }
        assert!(!Substr::new("123.45", 0).is_integer());
        assert!(Substr::new("123", 0).is_integer());
    }
}