pdf/parser/lexer/
str.rs

1use std::iter::Iterator;
2use crate::error::*;
3
/// A lexer for PDF literal strings (the `( ... )` form).
///
/// It walks a byte buffer and yields the string's content one byte (`u8`) at a time. After
/// iterating, `get_offset` reports how many bytes of the original buffer were traversed, so the
/// caller can resume parsing right behind the string.
///
/// The example below is illustrative only: `lexer` stands for an outer lexer owned by the
/// caller, and the `?` requires an enclosing function that returns `Result`.
///
/// ```ignore
/// let mut string: Vec<u8> = Vec::new();
/// let bytes_traversed = {
///     let mut string_lexer = StringLexer::new(lexer.get_remaining_slice());
///     for character in string_lexer.iter() {
///         let character = character?;
///         string.push(character);
///     }
///     string_lexer.get_offset() as i64
/// };
/// // bytes_traversed now holds the number of bytes in the original array traversed.
/// ```

#[derive(Clone)]
pub struct StringLexer<'a> {
    pos: usize, // index into `buf` of the next byte to be read
    nested: i32, // current `(` nesting depth; drops below zero on the `)` that ends the string
    buf: &'a [u8], // input bytes, starting right after the opening `(`
}
28
29impl<'a> StringLexer<'a> {
30    /// `buf` should start right after the `(` delimiter, and may span all the way to EOF. StringLexer
31    /// will determine the end of the string.
32    pub fn new(buf: &'a [u8]) -> StringLexer<'a> {
33        StringLexer {
34            pos: 0,
35            nested: 0,
36            buf,
37        }
38    }
39    pub fn iter<'b>(&'b mut self) -> StringLexerIter<'a, 'b> {
40        StringLexerIter {lexer: self}
41    }
42    /// Get offset/pos from start of string
43    pub fn get_offset(&self) -> usize {
44        self.pos
45    }
46
47    /// (mostly just used by Iterator, but might be useful)
48    pub fn next_lexeme(&mut self) -> Result<Option<u8>> {
49        let c = self.next_byte()?;
50        match c {
51            b'\\' => {
52                let c = self.next_byte()?;
53                Ok(
54                match c {
55                    b'n' => Some(b'\n'),
56                    b'r' => Some(b'\r'),
57                    b't' => Some(b'\t'),
58                    b'b' => Some(b'\x08'),
59                    b'f' => Some(b'\x0c'),
60                    b'(' => Some(b'('),
61                    b')' => Some(b')'),
62                    b'\n' => {
63                        // ignore end-of-line marker
64                        if let Ok(b'\r') = self.peek_byte() {
65                            let _ = self.next_byte();
66                        }
67                        self.next_lexeme()?
68                    }
69                    b'\r' => {
70                        // ignore end-of-line marker
71                        if let Ok(b'\n') = self.peek_byte() {
72                            let _ = self.next_byte();
73                        }
74                        self.next_lexeme()?
75                    }
76                    b'\\' => Some(b'\\'),
77
78                    _ => {
79                        self.back()?;
80                        let _start = self.get_offset();
81                        let mut char_code: u16 = 0;
82
83                        // A character code must follow. 1-3 numbers.
84                        for _ in 0..3 {
85                            let c = self.peek_byte()?;
86                            if (b'0'..=b'7').contains(&c) {
87                                self.next_byte()?;
88                                char_code = char_code * 8 + (c - b'0') as u16;
89                            } else {
90                                break;
91                            }
92                        }
93                        Some(char_code as u8)
94                    }
95                }
96                )
97            },
98
99            b'(' => {
100                self.nested += 1;
101                Ok(Some(b'('))
102            },
103            b')' => {
104                self.nested -= 1;
105                if self.nested < 0 {
106                    Ok(None)
107                } else {
108                    Ok(Some(b')'))
109                }
110            },
111
112            c => Ok(Some(c))
113
114        }
115    }
116
117    fn next_byte(&mut self) -> Result<u8> {
118        if self.pos < self.buf.len() {
119            self.pos += 1;
120            Ok(self.buf[self.pos-1])
121        } else {
122            Err(PdfError::EOF)
123        }
124    }
125    fn back(&mut self) -> Result<()> {
126        if self.pos > 0 {
127            self.pos -= 1;
128            Ok(())
129        } else {
130            Err(PdfError::EOF)
131        }
132    }
133    fn peek_byte(&mut self) -> Result<u8> {
134        if self.pos < self.buf.len() {
135            Ok(self.buf[self.pos])
136        } else {
137            Err(PdfError::EOF)
138        }
139    }
140}
141
142// "'a is valid for at least 'b"
143pub struct StringLexerIter<'a: 'b, 'b> {
144    lexer: &'b mut StringLexer<'a>,
145}
146
147impl<'a, 'b> Iterator for StringLexerIter<'a, 'b> {
148    type Item = Result<u8>;
149    fn next(&mut self) -> Option<Result<u8>> {
150        match self.lexer.next_lexeme() {
151            Err(e) => Some(Err(e)),
152            Ok(Some(s)) => Some(Ok(s)),
153            Ok(None) => None,
154        }
155    }
156}
157
/// A lexer for PDF hexadecimal strings (the `< ... >` form).
///
/// It walks a byte buffer and decodes pairs of hex digits into bytes, stopping at the closing
/// `>`. `get_offset` reports how many bytes of the buffer have been consumed.
pub struct HexStringLexer<'a> {
    pos: usize, // points to next byte
    buf: &'a [u8], // input bytes, starting right after the opening `<`
}
162
163impl<'a> HexStringLexer<'a> {
164    /// `buf` should start right after the `<` delimiter, and may span all the way to EOF.
165    /// HexStringLexer will determine the end of the string.
166    pub fn new(buf: &'a [u8]) -> HexStringLexer<'a> {
167        HexStringLexer { pos: 0, buf }
168    }
169
170    pub fn iter<'b>(&'b mut self) -> HexStringLexerIter<'a, 'b> {
171        HexStringLexerIter { lexer: self }
172    }
173
174    /// Get offset/position from start of string
175    pub fn get_offset(&self) -> usize {
176        self.pos
177    }
178
179    fn next_non_whitespace_char(&mut self) -> Result<u8> {
180        let mut byte = self.read_byte()?;
181        while byte == b' ' || byte == b'\t' || byte == b'\n' || byte == b'\r' || byte == b'\x0c' {
182            byte = self.read_byte()?;
183        }
184        Ok(byte)
185    }
186
187    pub fn next_hex_byte(&mut self) -> Result<Option<u8>> {
188        let c1 = self.next_non_whitespace_char()?;
189        let high_nibble: u8 = match c1 {
190            b'0' ..= b'9' => c1 - b'0',
191            b'A' ..= b'F' => c1 - b'A' + 0xA,
192            b'a' ..= b'f' => c1 - b'a' + 0xA,
193            b'>' => return Ok(None),
194            _ => return Err(PdfError::HexDecode {
195                pos: self.pos,
196                bytes: [c1, self.peek_byte().unwrap_or(0)]
197            }),
198        };
199        let c2 = self.next_non_whitespace_char()?;
200        let low_nibble: u8 = match c2 {
201            b'0' ..= b'9' => c2 - b'0',
202            b'A' ..= b'F' => c2 - b'A' + 0xA,
203            b'a' ..= b'f' => c2 - b'a' + 0xA,
204            b'>' => {
205                self.back()?;
206                0
207            }
208            _ => return Err(PdfError::HexDecode {
209                pos: self.pos,
210                bytes: [c1, c2]
211            }),
212        };
213        Ok(Some((high_nibble << 4) | low_nibble))
214    }
215
216    fn read_byte(&mut self) -> Result<u8> {
217        if self.pos < self.buf.len() {
218            self.pos += 1;
219            Ok(self.buf[self.pos - 1])
220        } else {
221            Err(PdfError::EOF)
222        }
223    }
224
225    fn back(&mut self) -> Result<()> {
226        if self.pos > 0 {
227            self.pos -= 1;
228            Ok(())
229        } else {
230            Err(PdfError::EOF)
231        }
232    }
233
234    fn peek_byte(&mut self) -> Result<u8> {
235        if self.pos < self.buf.len() {
236            Ok(self.buf[self.pos])
237        } else {
238            Err(PdfError::EOF)
239        }
240    }
241}
242
243pub struct HexStringLexerIter<'a: 'b, 'b> {
244    lexer: &'b mut HexStringLexer<'a>,
245}
246
247impl<'a, 'b> Iterator for HexStringLexerIter<'a, 'b> {
248    type Item = Result<u8>;
249
250    fn next(&mut self) -> Option<Result<u8>> {
251        match self.lexer.next_hex_byte() {
252            Err(e) => Some(Err(e)),
253            Ok(Some(s)) => Some(Ok(s)),
254            Ok(None) => None,
255        }
256    }
257}
258
#[cfg(test)]
mod tests {
    use crate::error::Result;
    use crate::parser::lexer::{HexStringLexer, StringLexer};

    /// Lexes `data` as a literal string body and collects the decoded bytes,
    /// panicking if the lexer reports an error.
    fn lex_literal(data: &[u8]) -> Vec<u8> {
        let mut lexer = StringLexer::new(data);
        let bytes: Vec<u8> = lexer.iter().map(Result::unwrap).collect();
        bytes
    }

    /// Lexes `data` as a hex string body and collects the decoded bytes,
    /// panicking if the lexer reports an error.
    fn lex_hex(data: &[u8]) -> Vec<u8> {
        let mut lexer = HexStringLexer::new(data);
        let bytes: Vec<u8> = lexer.iter().map(Result::unwrap).collect();
        bytes
    }

    #[test]
    fn tests() {
        // Lexing stops at the first unbalanced `)`; the tail is never looked at.
        assert_eq!(
            lex_literal(b"a\\nb\\rc\\td\\(f/)\\\\hei)"),
            b"a\nb\rc\td(f/"
        );
    }

    #[test]
    fn string_split_lines() {
        // A backslash followed by LF, CR or CR LF is a line continuation.
        assert_eq!(
            lex_literal(b"These \\\ntwo strings \\\nare the same.)"),
            b"These two strings are the same."
        );
        assert_eq!(
            lex_literal(b"These \\\rtwo strings \\\rare the same.)"),
            b"These two strings are the same."
        );
        assert_eq!(
            lex_literal(b"These \\\r\ntwo strings \\\r\nare the same.)"),
            b"These two strings are the same."
        );
    }

    #[test]
    fn octal_escape() {
        assert_eq!(
            lex_literal(b"This string contains\\245two octal characters\\307.)"),
            &b"This string contains\xa5two octal characters\xc7."[..]
        );
        // At most three digits belong to the escape.
        assert_eq!(lex_literal(b"\\0053)"), b"\x053");
        assert_eq!(lex_literal(b"\\053)"), b"+");
        assert_eq!(lex_literal(b"\\53)"), b"+");
        // overflow is ignored
        assert_eq!(lex_literal(b"\\541)"), b"a");
    }

    #[test]
    fn hex_test() {
        assert_eq!(lex_hex(b"901FA3>"), vec![0x90u8, 0x1f, 0xa3]);
        // An odd number of digits pads the last byte with a zero nibble.
        assert_eq!(lex_hex(b"901FA>"), vec![0x90u8, 0x1f, 0xa0]);
        // White-space between digits is skipped.
        assert_eq!(
            lex_hex(b"1 9F\t5\r\n4\x0c62a>"),
            vec![0x19u8, 0xf5, 0x46, 0x2a]
        );
    }
}