1use std::str::FromStr;
5use std::ops::{Range, Deref, RangeFrom};
6use std::borrow::Cow;
7
8use crate::error::*;
9use crate::primitive::Name;
10
11mod str;
12pub use self::str::{StringLexer, HexStringLexer};
13
14
/// A copyable cursor over a borrowed byte buffer that hands out lexemes as [`Substr`] values.
#[derive(Copy, Clone)]
#[allow(dead_code)]
pub struct Lexer<'a> {
    /// Current cursor position, as an index into `buf`.
    pos: usize,
    /// The bytes being lexed.
    buf: &'a [u8],
    /// Added to a buffer index to obtain the `file_offset` stored in every `Substr` produced.
    file_offset: usize,
}
23
/// Scans backwards from `pos` (exclusive) while `condition` holds.
///
/// Returns the index just after the last byte before `pos` that fails `condition`,
/// or `0` when every byte in `data[..pos]` satisfies it.
///
/// # Panics
/// Panics if `pos > data.len()`.
#[inline]
fn boundary_rev(data: &[u8], pos: usize, condition: impl Fn(u8) -> bool) -> usize {
    data[..pos]
        .iter()
        .rposition(|&byte| !condition(byte))
        .map_or(0, |last_failing| last_failing + 1)
}
32
/// Scans forwards from `pos` while `condition` holds.
///
/// Returns the index of the first byte at or after `pos` that fails `condition`,
/// or `data.len()` when every remaining byte satisfies it.
///
/// # Panics
/// Panics if `pos > data.len()`.
#[inline]
fn boundary(data: &[u8], pos: usize, condition: impl Fn(u8) -> bool) -> usize {
    let tail = &data[pos..];
    let matching = tail.iter().take_while(|&&byte| condition(byte)).count();
    pos + matching
}
41
/// Returns `true` if `b` is one of the six PDF white-space bytes:
/// NUL, horizontal tab, line feed, form feed, carriage return or space.
#[inline]
fn is_whitespace(b: u8) -> bool {
    // 0x0C is FORM FEED, which has no character escape in Rust byte literals.
    matches!(b, 0 | b'\t' | b'\n' | 0x0C | b'\r' | b' ')
}
/// Wraps the predicate `f` in a new predicate that returns the opposite answer.
#[inline]
fn not<T>(f: impl Fn(T) -> bool) -> impl Fn(T) -> bool {
    move |value| {
        let accepted = f(value);
        !accepted
    }
}
50impl<'a> Lexer<'a> {
51 pub fn new(buf: &'a [u8]) -> Lexer<'a> {
52 Lexer {
53 pos: 0,
54 buf,
55 file_offset: 0
56 }
57 }
58 pub fn with_offset(buf: &'a [u8], file_offset: usize) -> Lexer<'a> {
59 Lexer {
60 pos: 0,
61 buf,
62 file_offset
63 }
64 }
65
66 #[allow(clippy::should_implement_trait)]
68 pub fn next(&mut self) -> Result<Substr<'a>> {
69 let (lexeme, pos) = self.next_word()?;
70 self.pos = pos;
71 Ok(lexeme)
72 }
73
74 pub fn next_stream(&mut self) -> Result<()> {
76 let pos = self.skip_whitespace(self.pos)?;
77 if !self.buf[pos ..].starts_with(b"stream") {
78 }
80
81 let &b0 = self.buf.get(pos + 6).ok_or(PdfError::EOF)?;
82 if b0 == b'\n' {
83 self.pos = pos + 7;
84 } else if b0 == b'\r' {
85 let &b1 = self.buf.get(pos + 7).ok_or(PdfError::EOF)?;
86 if b1 != b'\n' {
87 bail!("invalid whitespace following 'stream'");
88 }
90 self.pos = pos + 8;
91 } else {
92 bail!("invalid whitespace");
93 }
94 Ok(())
95 }
96 pub fn back(&mut self) -> Result<Substr<'a>> {
98 let end_pos = boundary_rev(self.buf, self.pos, is_whitespace);
102 let start_pos = boundary_rev(self.buf, end_pos, not(is_whitespace));
103 self.pos = start_pos;
104
105 Ok(self.new_substr(start_pos .. end_pos))
106 }
107
108 pub fn peek(&self) -> Result<Substr<'a>> {
110 match self.next_word() {
111 Ok((substr, _)) => Ok(substr),
112 Err(PdfError::EOF) => Ok(self.new_substr(self.pos..self.pos)),
113 Err(e) => Err(e),
114 }
115
116 }
117
118 pub fn next_expect(&mut self, expected: &'static str) -> Result<()> {
120 let word = self.next()?;
121 if word.equals(expected.as_bytes()) {
122 Ok(())
123 } else {
124 Err(PdfError::UnexpectedLexeme {
125 pos: self.pos,
126 lexeme: word.to_string(),
127 expected
128 })
129 }
130 }
131
132 #[inline]
134 fn skip_whitespace(&self, pos: usize) -> Result<usize> {
135 let pos = boundary(self.buf, pos, is_whitespace);
137 if pos >= self.buf.len() {
138 Err(PdfError::EOF)
139 } else {
140 Ok(pos)
141 }
142 }
143
144 fn next_word(&self) -> Result<(Substr<'a>, usize)> {
149 if self.pos == self.buf.len() {
150 return Err(PdfError::EOF);
151 }
152 let mut pos = self.skip_whitespace(self.pos)?;
153 while self.buf.get(pos) == Some(&b'%') {
154 pos += 1;
155 if let Some(off) = self.buf[pos..].iter().position(|&b| b == b'\n') {
156 pos += off+1;
157 }
158
159 pos = self.skip_whitespace(pos)?;
161 }
162
163 let start_pos = pos;
164
165 if self.is_delimiter(pos) {
169 if self.buf[pos] == b'/' {
170 pos = self.advance_pos(pos)?;
171 while !self.is_whitespace(pos) && !self.is_delimiter(pos) {
172 match self.advance_pos(pos) {
173 Ok(p) => pos = p,
174 Err(_) => break,
175 }
176 }
177 return Ok((self.new_substr(start_pos..pos), pos));
178 }
179
180 if let Some(slice) = self.buf.get(pos..=pos+1) {
181 if slice == b"<<" || slice == b">>" {
182 pos = self.advance_pos(pos)?;
183 }
184 }
185
186 pos = self.advance_pos(pos)?;
187 return Ok((self.new_substr(start_pos..pos), pos));
188 }
189
190 while !self.is_whitespace(pos) && !self.is_delimiter(pos) {
192 match self.advance_pos(pos) {
193 Ok(p) => pos = p,
194 Err(_) => break,
195 }
196 }
197 let result = self.new_substr(start_pos..pos);
198
199 Ok((result, pos))
202 }
203
204 #[inline]
206 fn advance_pos(&self, pos: usize) -> Result<usize> {
207 if pos < self.buf.len() {
208 Ok(pos + 1)
209 } else {
210 Err(PdfError::EOF)
211 }
212 }
213
214 #[inline]
215 pub fn next_as<T>(&mut self) -> Result<T>
216 where T: FromStr, T::Err: std::error::Error + Send + Sync + 'static
217 {
218 self.next().and_then(|word| word.to::<T>())
219 }
220
221 #[inline]
222 pub fn get_pos(&self) -> usize {
223 self.pos
224 }
225
226 #[inline]
227 pub fn new_substr(&self, mut range: Range<usize>) -> Substr<'a> {
228 if range.start > range.end {
231 let new_end = range.start + 1;
232 range.start = range.end + 1;
233 range.end = new_end;
234 }
235
236 Substr {
237 file_offset: self.file_offset + range.start,
238 slice: &self.buf[range],
239 }
240 }
241
242 #[inline]
244 pub fn set_pos(&mut self, wanted_pos: usize) -> Substr<'a> {
245 let new_pos = wanted_pos.min(self.buf.len());
246 let range = if self.pos < new_pos {
247 self.pos..new_pos
248 } else {
249 new_pos..self.pos
250 };
251 self.pos = new_pos;
252 self.new_substr(range)
253 }
254
255 #[inline]
257 pub fn set_pos_from_end(&mut self, new_pos: usize) -> Substr<'a> {
258 self.set_pos(self.buf.len().saturating_sub(new_pos).saturating_sub(1))
259 }
260 #[inline]
262 pub fn offset_pos(&mut self, offset: usize) -> Substr<'a> {
263 self.set_pos(self.pos.wrapping_add(offset))
264 }
265
266 #[allow(dead_code)]
268 pub fn seek_newline(&mut self) -> Substr{
269 let start = self.pos;
270 while self.buf[self.pos] != b'\n'
271 && self.incr_pos() { }
272 self.incr_pos();
273
274 self.new_substr(start..self.pos)
275 }
276
277
278 #[allow(dead_code)]
281 pub fn seek_substr(&mut self, substr: impl AsRef<[u8]>) -> Option<Substr<'a>> {
282 let substr = substr.as_ref();
284 let start = self.pos;
285 let mut matched = 0;
286 loop {
287 if self.pos >= self.buf.len() {
288 return None
289 }
290 if self.buf[self.pos] == substr[matched] {
291 matched += 1;
292 } else {
293 matched = 0;
294 }
295 if matched == substr.len() {
296 break;
297 }
298 self.pos += 1;
299 }
300 self.pos += 1;
301 Some(self.new_substr(start..(self.pos - substr.len())))
302 }
303
304 pub fn seek_substr_back(&mut self, substr: &[u8]) -> Result<Substr<'a>> {
308 let end = self.pos;
309 match self.buf[.. end].windows(substr.len()).rposition(|w| w == substr) {
310 Some(start) => {
311 self.pos = start + substr.len();
312 Ok(self.new_substr(self.pos .. end))
313 }
314 None => Err(PdfError::NotFound {word: String::from_utf8_lossy(substr).into() })
315 }
316 }
317
318 #[allow(dead_code)]
320 pub fn read_n(&mut self, n: usize) -> Substr<'a> {
321 let start_pos = self.pos;
322 self.pos += n;
323 if self.pos >= self.buf.len() {
324 self.pos = self.buf.len() - 1;
325 }
326 if start_pos < self.buf.len() {
327 self.new_substr(start_pos..self.pos)
328 } else {
329 self.new_substr(0..0)
330 }
331 }
332
333 #[inline]
335 pub fn get_remaining_slice(&self) -> &'a [u8] {
336 &self.buf[self.pos..]
337 }
338
339 pub fn ctx(&self) -> Cow<str> {
341 String::from_utf8_lossy(&self.buf[self.pos.saturating_sub(40)..self.buf.len().min(self.pos+40)])
342 }
343
344 #[inline]
345 fn incr_pos(&mut self) -> bool {
346 if self.pos >= self.buf.len() - 1 {
347 false
348 } else {
349 self.pos += 1;
350 true
351 }
352 }
353 #[inline]
354 fn is_whitespace(&self, pos: usize) -> bool {
355 self.buf.get(pos).map(|&b| is_whitespace(b)).unwrap_or(false)
356 }
357
358 #[inline]
359 fn is_delimiter(&self, pos: usize) -> bool {
360 self.buf.get(pos).map(|b| b"()<>[]{}/%".contains(b)).unwrap_or(false)
361 }
362
363}
364
365
366
/// A borrowed run of bytes together with the offset it is reported to start at.
#[derive(Copy, Clone, Debug)]
pub struct Substr<'a> {
    /// The bytes of this lexeme.
    slice: &'a [u8],
    /// Start of the range returned by `file_range`.
    file_offset: usize,
}
373impl<'a> Substr<'a> {
374 pub fn new<T: AsRef<[u8]> + ?Sized>(data: &'a T, file_offset: usize) -> Self {
375 Substr { slice: data.as_ref(), file_offset }
376 }
377 #[allow(clippy::inherent_to_string)]
382 pub fn to_string(&self) -> String {
383 String::from_utf8_lossy(self.as_slice()).into()
384 }
385 pub fn to_name(&self) -> Result<Name> {
386 Ok(Name(std::str::from_utf8(self.as_slice())?.into()))
387 }
388 pub fn to_vec(&self) -> Vec<u8> {
389 self.slice.to_vec()
390 }
391 pub fn to<T>(&self) -> Result<T>
392 where T: FromStr, T::Err: std::error::Error + Send + Sync + 'static
393 {
394 std::str::from_utf8(self.slice)?.parse::<T>().map_err(|e| PdfError::Parse { source: e.into() })
395 }
396 pub fn is_integer(&self) -> bool {
397 if self.slice.len() == 0 {
398 return false;
399 }
400 let mut slice = self.slice;
401 if slice[0] == b'-' {
402 if slice.len() < 2 {
403 return false;
404 }
405 slice = &slice[1..];
406 }
407 is_int(slice)
408 }
409 pub fn is_real_number(&self) -> bool {
410 self.real_number().is_some()
411 }
412 pub fn real_number(&self) -> Option<Self> {
413 if self.slice.len() == 0 {
414 return None;
415 }
416 let mut slice = self.slice;
417 if slice[0] == b'-' {
418 if slice.len() < 2 {
419 return None;
420 }
421 slice = &slice[1..];
422 }
423 if let Some(i) = slice.iter().position(|&b| b == b'.') {
424 if !is_int(&slice[..i]) {
425 return None;
426 }
427 slice = &slice[i+1..];
428 }
429 if let Some(len) = slice.iter().position(|&b| !b.is_ascii_digit()) {
430 if len == 0 {
431 return None;
432 }
433 let end = self.slice.len() - slice.len() + len;
434 Some(Substr {
435 file_offset: self.file_offset,
436 slice: &self.slice[..end]
437 })
438 } else {
439 Some(*self)
440 }
441 }
442
443 pub fn as_slice(&self) -> &'a [u8] {
444 self.slice
445 }
446 pub fn as_str(&self) -> Result<&str> {
447 std::str::from_utf8(self.slice).map_err(|e| PdfError::Parse { source: e.into() })
448 }
449
450 pub fn equals(&self, other: impl AsRef<[u8]>) -> bool {
451 self.slice == other.as_ref()
452 }
453
454 pub fn reslice(&self, range: RangeFrom<usize>) -> Substr<'a> {
455 Substr {
456 file_offset: self.file_offset + range.start,
457 slice: &self.slice[range],
458 }
459 }
460
461 pub fn file_range(&self) -> Range<usize> {
462 self.file_offset .. self.file_offset + self.slice.len()
463 }
464}
465
/// Returns `true` if every byte of `b` is an ASCII decimal digit.
///
/// The empty slice is accepted; callers that need at least one digit check for that
/// themselves.
#[inline]
fn is_int(b: &[u8]) -> bool {
    for byte in b {
        if !byte.is_ascii_digit() {
            return false;
        }
    }
    true
}
470impl<'a> Deref for Substr<'a> {
471 type Target = [u8];
472 fn deref(&self) -> &[u8] {
473 self.as_slice()
474 }
475}
476impl<'a> PartialEq<&[u8]> for Substr<'a> {
477 fn eq(&self, rhs: &&[u8]) -> bool {
478 self.equals(rhs)
479 }
480}
481
482impl<'a> PartialEq<&str> for Substr<'a> {
483 fn eq(&self, rhs: &&str) -> bool {
484 self.equals(rhs.as_bytes())
485 }
486}
487
#[cfg(test)]
mod tests {
    use super::*;

    /// `boundary_rev` walks backwards from `pos` while the predicate holds.
    #[test]
    fn test_boundary_rev() {
        assert_eq!(boundary_rev(b" hello", 3, not(is_whitespace)), 1);
        assert_eq!(boundary_rev(b" hello", 3, is_whitespace), 3);
        // No failing byte before `pos`: the scan reaches the start of the data.
        assert_eq!(boundary_rev(b"hello", 3, not(is_whitespace)), 0);
    }

    /// `boundary` walks forwards from `pos` while the predicate holds.
    #[test]
    fn test_boundary() {
        assert_eq!(boundary(b" hello ", 3, not(is_whitespace)), 6);
        assert_eq!(boundary(b" hello ", 3, is_whitespace), 3);
        // Indices 5 and 6 hold white-space (a space and a tab), so the digit `7` sits at
        // index 7. Using two different white-space bytes keeps the fixture's length
        // unambiguous.
        assert_eq!(boundary(b"01234 \t7orld", 5, is_whitespace), 7);
        assert_eq!(boundary(b"01234 \t7orld", 7, is_whitespace), 7);
        assert_eq!(boundary(b"q\n", 1, is_whitespace), 2);
    }

    /// Classification of numeric lexemes.
    #[test]
    fn test_substr() {
        assert!(Substr::new("123", 0).is_real_number());
        assert!(Substr::new("123.", 0).is_real_number());
        assert!(Substr::new("123.45", 0).is_real_number());
        assert!(Substr::new(".45", 0).is_real_number());
        assert!(Substr::new("-.45", 0).is_real_number());
        assert!(!Substr::new("123.45", 0).is_integer());
        assert!(Substr::new("123", 0).is_integer());
        assert!(Substr::new("-42", 0).is_integer());
        // Empty input and a bare sign are not numbers.
        assert!(!Substr::new("", 0).is_integer());
        assert!(!Substr::new("-", 0).is_real_number());
    }
}