1use std::iter::Iterator;
2use crate::error::*;
3
/// Byte-level lexer for the body of a parenthesised literal string.
///
/// The lexer walks over a borrowed byte buffer, decoding escape sequences and
/// tracking parenthesis nesting so that it can tell where the string ends.
#[derive(Clone)]
pub struct StringLexer<'a> {
    /// Index into `buf` of the next byte that will be read.
    pos: usize,
    /// Unescaped `(` seen so far minus unescaped `)` seen so far.
    nested: i32,
    /// The raw bytes being lexed.
    buf: &'a [u8],
}
28
29impl<'a> StringLexer<'a> {
30 pub fn new(buf: &'a [u8]) -> StringLexer<'a> {
33 StringLexer {
34 pos: 0,
35 nested: 0,
36 buf,
37 }
38 }
39 pub fn iter<'b>(&'b mut self) -> StringLexerIter<'a, 'b> {
40 StringLexerIter {lexer: self}
41 }
42 pub fn get_offset(&self) -> usize {
44 self.pos
45 }
46
47 pub fn next_lexeme(&mut self) -> Result<Option<u8>> {
49 let c = self.next_byte()?;
50 match c {
51 b'\\' => {
52 let c = self.next_byte()?;
53 Ok(
54 match c {
55 b'n' => Some(b'\n'),
56 b'r' => Some(b'\r'),
57 b't' => Some(b'\t'),
58 b'b' => Some(b'\x08'),
59 b'f' => Some(b'\x0c'),
60 b'(' => Some(b'('),
61 b')' => Some(b')'),
62 b'\n' => {
63 if let Ok(b'\r') = self.peek_byte() {
65 let _ = self.next_byte();
66 }
67 self.next_lexeme()?
68 }
69 b'\r' => {
70 if let Ok(b'\n') = self.peek_byte() {
72 let _ = self.next_byte();
73 }
74 self.next_lexeme()?
75 }
76 b'\\' => Some(b'\\'),
77
78 _ => {
79 self.back()?;
80 let _start = self.get_offset();
81 let mut char_code: u16 = 0;
82
83 for _ in 0..3 {
85 let c = self.peek_byte()?;
86 if (b'0'..=b'7').contains(&c) {
87 self.next_byte()?;
88 char_code = char_code * 8 + (c - b'0') as u16;
89 } else {
90 break;
91 }
92 }
93 Some(char_code as u8)
94 }
95 }
96 )
97 },
98
99 b'(' => {
100 self.nested += 1;
101 Ok(Some(b'('))
102 },
103 b')' => {
104 self.nested -= 1;
105 if self.nested < 0 {
106 Ok(None)
107 } else {
108 Ok(Some(b')'))
109 }
110 },
111
112 c => Ok(Some(c))
113
114 }
115 }
116
117 fn next_byte(&mut self) -> Result<u8> {
118 if self.pos < self.buf.len() {
119 self.pos += 1;
120 Ok(self.buf[self.pos-1])
121 } else {
122 Err(PdfError::EOF)
123 }
124 }
125 fn back(&mut self) -> Result<()> {
126 if self.pos > 0 {
127 self.pos -= 1;
128 Ok(())
129 } else {
130 Err(PdfError::EOF)
131 }
132 }
133 fn peek_byte(&mut self) -> Result<u8> {
134 if self.pos < self.buf.len() {
135 Ok(self.buf[self.pos])
136 } else {
137 Err(PdfError::EOF)
138 }
139 }
140}
141
142pub struct StringLexerIter<'a: 'b, 'b> {
144 lexer: &'b mut StringLexer<'a>,
145}
146
147impl<'a, 'b> Iterator for StringLexerIter<'a, 'b> {
148 type Item = Result<u8>;
149 fn next(&mut self) -> Option<Result<u8>> {
150 match self.lexer.next_lexeme() {
151 Err(e) => Some(Err(e)),
152 Ok(Some(s)) => Some(Ok(s)),
153 Ok(None) => None,
154 }
155 }
156}
157
/// Byte-level lexer for the body of a hexadecimal string terminated by `>`.
pub struct HexStringLexer<'a> {
    /// Index into `buf` of the next byte that will be read.
    pos: usize,
    /// The raw bytes being lexed.
    buf: &'a [u8],
}
162
163impl<'a> HexStringLexer<'a> {
164 pub fn new(buf: &'a [u8]) -> HexStringLexer<'a> {
167 HexStringLexer { pos: 0, buf }
168 }
169
170 pub fn iter<'b>(&'b mut self) -> HexStringLexerIter<'a, 'b> {
171 HexStringLexerIter { lexer: self }
172 }
173
174 pub fn get_offset(&self) -> usize {
176 self.pos
177 }
178
179 fn next_non_whitespace_char(&mut self) -> Result<u8> {
180 let mut byte = self.read_byte()?;
181 while byte == b' ' || byte == b'\t' || byte == b'\n' || byte == b'\r' || byte == b'\x0c' {
182 byte = self.read_byte()?;
183 }
184 Ok(byte)
185 }
186
187 pub fn next_hex_byte(&mut self) -> Result<Option<u8>> {
188 let c1 = self.next_non_whitespace_char()?;
189 let high_nibble: u8 = match c1 {
190 b'0' ..= b'9' => c1 - b'0',
191 b'A' ..= b'F' => c1 - b'A' + 0xA,
192 b'a' ..= b'f' => c1 - b'a' + 0xA,
193 b'>' => return Ok(None),
194 _ => return Err(PdfError::HexDecode {
195 pos: self.pos,
196 bytes: [c1, self.peek_byte().unwrap_or(0)]
197 }),
198 };
199 let c2 = self.next_non_whitespace_char()?;
200 let low_nibble: u8 = match c2 {
201 b'0' ..= b'9' => c2 - b'0',
202 b'A' ..= b'F' => c2 - b'A' + 0xA,
203 b'a' ..= b'f' => c2 - b'a' + 0xA,
204 b'>' => {
205 self.back()?;
206 0
207 }
208 _ => return Err(PdfError::HexDecode {
209 pos: self.pos,
210 bytes: [c1, c2]
211 }),
212 };
213 Ok(Some((high_nibble << 4) | low_nibble))
214 }
215
216 fn read_byte(&mut self) -> Result<u8> {
217 if self.pos < self.buf.len() {
218 self.pos += 1;
219 Ok(self.buf[self.pos - 1])
220 } else {
221 Err(PdfError::EOF)
222 }
223 }
224
225 fn back(&mut self) -> Result<()> {
226 if self.pos > 0 {
227 self.pos -= 1;
228 Ok(())
229 } else {
230 Err(PdfError::EOF)
231 }
232 }
233
234 fn peek_byte(&mut self) -> Result<u8> {
235 if self.pos < self.buf.len() {
236 Ok(self.buf[self.pos])
237 } else {
238 Err(PdfError::EOF)
239 }
240 }
241}
242
243pub struct HexStringLexerIter<'a: 'b, 'b> {
244 lexer: &'b mut HexStringLexer<'a>,
245}
246
247impl<'a, 'b> Iterator for HexStringLexerIter<'a, 'b> {
248 type Item = Result<u8>;
249
250 fn next(&mut self) -> Option<Result<u8>> {
251 match self.lexer.next_hex_byte() {
252 Err(e) => Some(Err(e)),
253 Ok(Some(s)) => Some(Ok(s)),
254 Ok(None) => None,
255 }
256 }
257}
258
#[cfg(test)]
mod tests {
    use crate::error::Result;
    use crate::parser::lexer::{HexStringLexer, StringLexer};

    /// Runs a `StringLexer` over `input` and collects every byte it yields,
    /// panicking on the first lexer error.
    fn lex_literal(input: &[u8]) -> Vec<u8> {
        let mut lexer = StringLexer::new(input);
        let bytes: Vec<u8> = lexer.iter().map(Result::unwrap).collect();
        bytes
    }

    /// Runs a `HexStringLexer` over `input` and collects every byte it
    /// yields, panicking on the first lexer error.
    fn lex_hex(input: &[u8]) -> Vec<u8> {
        let mut lexer = HexStringLexer::new(input);
        let bytes: Vec<u8> = lexer.iter().map(Result::unwrap).collect();
        bytes
    }

    /// Asserts that lexing `input` as a literal string yields `expected`.
    fn check_literal(input: &[u8], expected: &[u8]) {
        assert_eq!(lex_literal(input), expected);
    }

    #[test]
    fn tests() {
        // The unescaped `)` after `f/` has no matching `(`, so lexing stops
        // there and the trailing `\\hei)` is never reached.
        check_literal(b"a\\nb\\rc\\td\\(f/)\\\\hei)", b"a\nb\rc\td(f/");
    }

    #[test]
    fn string_split_lines() {
        let expected = b"These two strings are the same.";
        // Backslash followed by LF, CR, or CR LF is a line continuation.
        check_literal(b"These \\\ntwo strings \\\nare the same.)", expected);
        check_literal(b"These \\\rtwo strings \\\rare the same.)", expected);
        check_literal(b"These \\\r\ntwo strings \\\r\nare the same.)", expected);
    }

    #[test]
    fn octal_escape() {
        check_literal(
            b"This string contains\\245two octal characters\\307.)",
            b"This string contains\xa5two octal characters\xc7.",
        );
        // At most three digits belong to the escape; the `3` is literal.
        check_literal(b"\\0053)", b"\x053");
        check_literal(b"\\053)", b"+");
        check_literal(b"\\53)", b"+");
        // 0o541 does not fit in a byte; only the low eight bits are kept.
        check_literal(b"\\541)", b"a");
    }

    #[test]
    fn hex_test() {
        assert_eq!(lex_hex(b"901FA3>"), vec![b'\x90', b'\x1f', b'\xa3']);

        // Odd number of digits: the missing low nibble is taken as zero.
        assert_eq!(lex_hex(b"901FA>"), vec![b'\x90', b'\x1f', b'\xa0']);

        // Whitespace between digits is skipped.
        assert_eq!(
            lex_hex(b"1 9F\t5\r\n4\x0c62a>"),
            vec![b'\x19', b'\xf5', b'\x46', b'\x2a']
        );
    }
}