use std::str::FromStr;
use std::ops::{Range, Deref, RangeFrom};
use std::borrow::Cow;
use crate::error::*;
use crate::primitive::Name;
mod str;
pub use self::str::{StringLexer, HexStringLexer};
/// Cursor-based lexer over a borrowed byte buffer.
///
/// The type is `Copy`, so the complete lexer state can be duplicated by value.
#[derive(Copy, Clone)]
#[allow(dead_code)]
pub struct Lexer<'a> {
/// Current read position, as an index into `buf`.
pos: usize,
/// The bytes being lexed.
buf: &'a [u8],
/// Added to a buffer index to form the `file_offset` of every `Substr` produced.
file_offset: usize,
}
/// Walks backwards from `pos` (exclusive) while `condition` holds.
///
/// Returns the index just after the last byte before `pos` that fails
/// `condition`, or `0` when every byte in `data[..pos]` satisfies it.
///
/// # Panics
/// Panics if `pos > data.len()`.
#[inline]
fn boundary_rev(data: &[u8], pos: usize, condition: impl Fn(u8) -> bool) -> usize {
    data[..pos]
        .iter()
        .rposition(|&byte| !condition(byte))
        .map_or(0, |last_failing| last_failing + 1)
}
/// Walks forwards from `pos` (inclusive) while `condition` holds.
///
/// Returns the index of the first byte at or after `pos` that fails
/// `condition`, or `data.len()` when every remaining byte satisfies it.
///
/// # Panics
/// Panics if `pos > data.len()`.
#[inline]
fn boundary(data: &[u8], pos: usize, condition: impl Fn(u8) -> bool) -> usize {
    data[pos..]
        .iter()
        .position(|&byte| !condition(byte))
        .map_or(data.len(), |offset| pos + offset)
}
/// Returns `true` if `b` is a PDF white-space character.
///
/// The PDF specification lists six white-space bytes: NUL (0x00), horizontal
/// tab (0x09), line feed (0x0A), form feed (0x0C), carriage return (0x0D) and
/// space (0x20). Form feed was previously missing from this set.
#[inline]
fn is_whitespace(b: u8) -> bool {
    matches!(b, 0 | b'\t' | b'\n' | 0x0C | b'\r' | b' ')
}
/// Wraps a predicate and returns a new predicate yielding the opposite answer.
#[inline]
fn not<T>(f: impl Fn(T) -> bool) -> impl Fn(T) -> bool {
    move |arg| {
        let verdict = f(arg);
        !verdict
    }
}
impl<'a> Lexer<'a> {
pub fn new(buf: &'a [u8]) -> Lexer<'a> {
Lexer {
pos: 0,
buf,
file_offset: 0
}
}
pub fn with_offset(buf: &'a [u8], file_offset: usize) -> Lexer<'a> {
Lexer {
pos: 0,
buf,
file_offset
}
}
/// Returns the next lexeme and moves the cursor just past it.
///
/// # Errors
/// Propagates any error from `next_word` (for example `PdfError::EOF`);
/// the cursor is left untouched in that case.
#[allow(clippy::should_implement_trait)]
pub fn next(&mut self) -> Result<Substr<'a>> {
    self.next_word().map(|(lexeme, end)| {
        self.pos = end;
        lexeme
    })
}
/// Consumes the `stream` keyword and the end-of-line marker that follows it.
///
/// Leading white-space is skipped first. The keyword must be followed by
/// either `\n` or `\r\n`; on success the cursor is placed on the first byte
/// after that marker.
///
/// # Errors
/// * `PdfError::EOF` if the buffer ends before the end-of-line marker.
/// * An error if the next bytes are not `stream` (previously this check had
///   an empty body and six arbitrary bytes were skipped).
/// * An error if the keyword is followed by anything other than `\n` or `\r\n`.
pub fn next_stream(&mut self) -> Result<()> {
    const KEYWORD: &[u8] = b"stream";

    let pos = self.skip_whitespace(self.pos)?;
    if !self.buf[pos..].starts_with(KEYWORD) {
        bail!("next token isn't 'stream'");
    }

    // Index of the byte immediately after the keyword.
    let eol = pos + KEYWORD.len();
    let &b0 = self.buf.get(eol).ok_or(PdfError::EOF)?;
    if b0 == b'\n' {
        self.pos = eol + 1;
    } else if b0 == b'\r' {
        // A lone carriage return is rejected; it must be part of `\r\n`.
        let &b1 = self.buf.get(eol + 1).ok_or(PdfError::EOF)?;
        if b1 != b'\n' {
            bail!("invalid whitespace following 'stream'");
        }
        self.pos = eol + 2;
    } else {
        bail!("invalid whitespace");
    }
    Ok(())
}
pub fn back(&mut self) -> Result<Substr<'a>> {
let end_pos = boundary_rev(self.buf, self.pos, is_whitespace);
let start_pos = boundary_rev(self.buf, end_pos, not(is_whitespace));
self.pos = start_pos;
Ok(self.new_substr(start_pos .. end_pos))
}
/// Returns the next lexeme without moving the cursor.
///
/// When `next_word` reports `PdfError::EOF`, an empty `Substr` located at
/// the current position is returned instead of an error. Every other error
/// is passed through.
pub fn peek(&self) -> Result<Substr<'a>> {
    self.next_word()
        .map(|(lexeme, _)| lexeme)
        .or_else(|err| match err {
            PdfError::EOF => Ok(self.new_substr(self.pos..self.pos)),
            other => Err(other),
        })
}
/// Consumes the next lexeme and checks that it equals `expected`.
///
/// # Errors
/// Propagates errors from `next`. Returns `PdfError::UnexpectedLexeme` when
/// the lexeme differs; the reported `pos` is the cursor position after the
/// lexeme has been consumed.
pub fn next_expect(&mut self, expected: &'static str) -> Result<()> {
    let word = self.next()?;
    if !word.equals(expected.as_bytes()) {
        return Err(PdfError::UnexpectedLexeme {
            pos: self.pos,
            lexeme: word.to_string(),
            expected,
        });
    }
    Ok(())
}
/// Returns the index of the first non-white-space byte at or after `pos`.
///
/// # Errors
/// Returns `PdfError::EOF` when only white-space (or nothing) remains.
#[inline]
fn skip_whitespace(&self, pos: usize) -> Result<usize> {
    match boundary(self.buf, pos, is_whitespace) {
        end if end >= self.buf.len() => Err(PdfError::EOF),
        first_token_byte => Ok(first_token_byte),
    }
}
/// Scans the next lexeme starting at the cursor, without moving the cursor.
///
/// Returns the lexeme together with the index just past it. Leading
/// white-space and `%` comments are skipped. Three kinds of lexeme are
/// produced:
/// * a name: `/` followed by regular (non-white-space, non-delimiter) bytes;
/// * a delimiter: a single delimiter byte, or the pairs `<<` / `>>`;
/// * a regular word: a run of regular bytes.
///
/// # Errors
/// Returns `PdfError::EOF` when nothing but white-space/comments remains.
fn next_word(&self) -> Result<(Substr<'a>, usize)> {
    if self.pos == self.buf.len() {
        return Err(PdfError::EOF);
    }

    let mut cursor = self.skip_whitespace(self.pos)?;

    // Skip any number of consecutive comments.
    // NOTE(review): a comment is only terminated by `\n` here; when no `\n`
    // follows, the cursor merely steps past the `%` and the comment text is
    // lexed as ordinary words. Confirm whether that is intended.
    while self.buf.get(cursor) == Some(&b'%') {
        cursor += 1;
        let newline = self.buf[cursor..].iter().position(|&b| b == b'\n');
        if let Some(offset) = newline {
            cursor += offset + 1;
        }
        cursor = self.skip_whitespace(cursor)?;
    }

    let start = cursor;

    // Advances over regular bytes and stops at white-space, a delimiter or
    // the end of the buffer.
    let scan_regular = |mut p: usize| {
        while p < self.buf.len() && !self.is_whitespace(p) && !self.is_delimiter(p) {
            p += 1;
        }
        p
    };

    let end = if !self.is_delimiter(start) {
        scan_regular(start)
    } else if self.buf[start] == b'/' {
        // A name: the slash plus the regular bytes that follow it.
        scan_regular(self.advance_pos(start)?)
    } else {
        // `<<` and `>>` are two-byte lexemes; every other delimiter is one byte.
        let is_pair = matches!(
            self.buf.get(start..=start + 1),
            Some(pair) if pair == b"<<" || pair == b">>"
        );
        let after_first = if is_pair { self.advance_pos(start)? } else { start };
        self.advance_pos(after_first)?
    };

    Ok((self.new_substr(start..end), end))
}
/// Returns `pos + 1` when `pos` is inside the buffer.
///
/// # Errors
/// Returns `PdfError::EOF` when `pos` is at or beyond the end of the buffer.
#[inline]
fn advance_pos(&self, pos: usize) -> Result<usize> {
    if pos >= self.buf.len() {
        return Err(PdfError::EOF);
    }
    Ok(pos + 1)
}
/// Consumes the next lexeme and parses it into `T` via `FromStr`.
///
/// # Errors
/// Propagates errors from `next` and from `Substr::to`.
#[inline]
pub fn next_as<T>(&mut self) -> Result<T>
    where T: FromStr, T::Err: std::error::Error + Send + Sync + 'static
{
    let word = self.next()?;
    word.to::<T>()
}
/// Returns the current cursor position (an index into the lexer's buffer).
#[inline]
pub fn get_pos(&self) -> usize {
self.pos
}
/// Builds a `Substr` covering `range` of the buffer.
///
/// A reversed range (`start > end`) is turned into
/// `(end + 1)..(start + 1)` before slicing.
///
/// # Panics
/// Panics if the (possibly adjusted) range lies outside the buffer.
#[inline]
pub fn new_substr(&self, mut range: Range<usize>) -> Substr<'a> {
    if range.start > range.end {
        range = (range.end + 1)..(range.start + 1);
    }
    let file_offset = self.file_offset + range.start;
    Substr {
        slice: &self.buf[range],
        file_offset,
    }
}
/// Moves the cursor to `wanted_pos`, clamped to the buffer length.
///
/// Returns the bytes between the old and the new cursor position,
/// whichever direction the cursor moved.
#[inline]
pub fn set_pos(&mut self, wanted_pos: usize) -> Substr<'a> {
    let target = wanted_pos.min(self.buf.len());
    let previous = self.pos;
    self.pos = target;
    self.new_substr(previous.min(target)..previous.max(target))
}
/// Moves the cursor to `len - new_pos - 1` (saturating at zero), where `len`
/// is the buffer length, and returns the bytes that were skipped over.
#[inline]
pub fn set_pos_from_end(&mut self, new_pos: usize) -> Substr<'a> {
    let from_start = self.buf.len().saturating_sub(new_pos).saturating_sub(1);
    self.set_pos(from_start)
}
/// Moves the cursor by `offset` (added with wrapping arithmetic, then clamped
/// by `set_pos`) and returns the bytes that were skipped over.
#[inline]
pub fn offset_pos(&mut self, offset: usize) -> Substr<'a> {
    let target = self.pos.wrapping_add(offset);
    self.set_pos(target)
}
#[allow(dead_code)]
pub fn seek_newline(&mut self) -> Substr{
let start = self.pos;
while self.buf[self.pos] != b'\n'
&& self.incr_pos() { }
self.incr_pos();
self.new_substr(start..self.pos)
}
/// Searches forward from the cursor for `substr`.
///
/// On success the cursor is placed just after the match and the bytes
/// between the old cursor and the start of the match are returned. On
/// failure `None` is returned and the cursor is left at the end of the
/// buffer.
///
/// The previous hand-rolled matcher reset its progress on a mismatch without
/// re-testing the current byte, so it missed matches such as `"ab"` in
/// `"aab"`. It also indexed out of bounds for an empty `substr`. An empty
/// `substr` now matches at the cursor without moving it.
#[allow(dead_code)]
pub fn seek_substr(&mut self, substr: impl AsRef<[u8]>) -> Option<Substr<'a>> {
    let needle = substr.as_ref();
    let start = self.pos;
    if start >= self.buf.len() {
        return None;
    }
    if needle.is_empty() {
        return Some(self.new_substr(start..start));
    }

    let found = self.buf[start..]
        .windows(needle.len())
        .position(|window| window == needle);
    match found {
        Some(offset) => {
            let match_start = start + offset;
            self.pos = match_start + needle.len();
            Some(self.new_substr(start..match_start))
        }
        None => {
            // Mirror the old behaviour: a failed search exhausts the buffer.
            self.pos = self.buf.len();
            None
        }
    }
}
/// Searches backwards from the cursor for the last occurrence of `substr`.
///
/// On success the cursor is placed just after the match and the bytes
/// between the end of the match and the old cursor are returned. An empty
/// `substr` matches at the cursor itself: the cursor stays put and an empty
/// `Substr` is returned (previously this panicked inside `windows(0)`).
///
/// # Errors
/// Returns `PdfError::NotFound` when `substr` does not occur before the
/// cursor; the cursor is left untouched in that case.
pub fn seek_substr_back(&mut self, substr: &[u8]) -> Result<Substr<'a>> {
    let end = self.pos;
    if substr.is_empty() {
        return Ok(self.new_substr(end..end));
    }
    match self.buf[..end].windows(substr.len()).rposition(|w| w == substr) {
        Some(start) => {
            self.pos = start + substr.len();
            Ok(self.new_substr(self.pos..end))
        }
        None => Err(PdfError::NotFound { word: String::from_utf8_lossy(substr).into() }),
    }
}
/// Reads up to `n` bytes from the cursor and advances past them.
///
/// Fewer than `n` bytes are returned when the buffer ends first; at the end
/// of the buffer the result is empty and the cursor does not move.
///
/// The previous version clamped the cursor to `len - 1`, which dropped the
/// final byte of the buffer, underflowed on an empty buffer, and could move
/// the cursor backwards when already at the end. The cursor is now clamped
/// to `len`, consistent with `set_pos`, and the addition saturates.
#[allow(dead_code)]
pub fn read_n(&mut self, n: usize) -> Substr<'a> {
    let len = self.buf.len();
    let start_pos = self.pos.min(len);
    self.pos = start_pos.saturating_add(n).min(len);
    self.new_substr(start_pos..self.pos)
}
/// Returns every byte from the cursor to the end of the buffer.
///
/// # Panics
/// Panics if the cursor is beyond the end of the buffer.
#[inline]
pub fn get_remaining_slice(&self) -> &'a [u8] {
    let (_consumed, remaining) = self.buf.split_at(self.pos);
    remaining
}
/// Returns up to 40 bytes on either side of the cursor as lossily decoded
/// UTF-8 text.
pub fn ctx(&self) -> Cow<'_, str> {
    let from = self.pos.saturating_sub(40);
    let to = self.buf.len().min(self.pos + 40);
    String::from_utf8_lossy(&self.buf[from..to])
}
/// Moves the cursor forward by one byte unless it is already on (or past)
/// the last byte of the buffer.
///
/// Returns `true` when the cursor moved. `saturating_sub` keeps the bound
/// from underflowing on an empty buffer, where the old `len() - 1` panicked
/// in debug builds and wrapped in release builds.
#[inline]
fn incr_pos(&mut self) -> bool {
    let last_index = self.buf.len().saturating_sub(1);
    if self.pos >= last_index {
        false
    } else {
        self.pos += 1;
        true
    }
}
/// Returns `true` if the byte at `pos` exists and is white-space.
#[inline]
fn is_whitespace(&self, pos: usize) -> bool {
    matches!(self.buf.get(pos), Some(&byte) if is_whitespace(byte))
}
/// Returns `true` if the byte at `pos` exists and is one of the delimiter
/// characters `( ) < > [ ] { } / %`.
#[inline]
fn is_delimiter(&self, pos: usize) -> bool {
    match self.buf.get(pos) {
        Some(byte) => b"()<>[]{}/%".contains(byte),
        None => false,
    }
}
}
/// A borrowed run of bytes together with the file offset it starts at.
#[derive(Copy, Clone, Debug)]
pub struct Substr<'a> {
/// The bytes this lexeme covers.
slice: &'a [u8],
/// Offset of the first byte of `slice`; `file_range` starts here.
file_offset: usize,
}
impl<'a> Substr<'a> {
pub fn new<T: AsRef<[u8]> + ?Sized>(data: &'a T, file_offset: usize) -> Self {
Substr { slice: data.as_ref(), file_offset }
}
/// Copies the bytes into an owned `String`, replacing invalid UTF-8
/// sequences with the Unicode replacement character.
#[allow(clippy::inherent_to_string)]
pub fn to_string(&self) -> String {
    let text = String::from_utf8_lossy(self.slice);
    text.into_owned()
}
/// Converts the bytes into a `Name`.
///
/// # Errors
/// Fails when the bytes are not valid UTF-8.
pub fn to_name(&self) -> Result<Name> {
    let text = std::str::from_utf8(self.slice)?;
    Ok(Name(text.into()))
}
/// Copies the bytes into a freshly allocated `Vec<u8>`.
pub fn to_vec(&self) -> Vec<u8> {
    Vec::from(self.slice)
}
/// Parses the bytes as UTF-8 text and then into `T` via `FromStr`.
///
/// # Errors
/// Fails when the bytes are not valid UTF-8, or with `PdfError::Parse` when
/// `T::from_str` rejects the text.
pub fn to<T>(&self) -> Result<T>
    where T: FromStr, T::Err: std::error::Error + Send + Sync + 'static
{
    let text = std::str::from_utf8(self.slice)?;
    match text.parse::<T>() {
        Ok(value) => Ok(value),
        Err(e) => Err(PdfError::Parse { source: e.into() }),
    }
}
/// Returns `true` if the bytes form an integer: one or more decimal digits,
/// optionally preceded by a single `+` or `-` sign.
///
/// A leading `+` is now accepted as well as `-`, because PDF integers may
/// carry either sign (e.g. `+17`). A sign on its own is not an integer.
pub fn is_integer(&self) -> bool {
    let digits = match self.slice.split_first() {
        None => return false,
        Some((&sign, rest)) if sign == b'-' || sign == b'+' => rest,
        Some(_) => self.slice,
    };
    // `is_int` accepts an empty slice, so a bare sign must be rejected here.
    !digits.is_empty() && is_int(digits)
}
/// Returns `true` when `real_number` finds a number at the start of the bytes.
pub fn is_real_number(&self) -> bool {
self.real_number().is_some()
}
/// Returns the leading part of the bytes that forms a real number, if any.
///
/// Accepted shape: an optional `+`/`-` sign, optional integer digits, an
/// optional `.`, and optional fraction digits, with at least one digit
/// overall (`123`, `123.`, `.45`, `-.45`, `+1.5`). When the digits are
/// followed by other bytes, only the numeric prefix is returned (for
/// example `12` for `12abc`); the returned `Substr` keeps this one's
/// `file_offset`.
///
/// Two defects are fixed compared to the previous version: a leading `+` is
/// accepted (PDF numbers may carry either sign), and a lone `.` (with or
/// without a sign) is no longer reported as a number.
pub fn real_number(&self) -> Option<Self> {
    let mut rest = self.slice;
    if rest.is_empty() {
        return None;
    }
    if rest[0] == b'-' || rest[0] == b'+' {
        rest = &rest[1..];
        if rest.is_empty() {
            return None;
        }
    }

    // Everything before the first `.` must be digits (possibly none).
    let mut no_integer_digits = false;
    if let Some(dot) = rest.iter().position(|&b| b == b'.') {
        if !is_int(&rest[..dot]) {
            return None;
        }
        no_integer_digits = dot == 0;
        rest = &rest[dot + 1..];
    }

    match rest.iter().position(|b| !b.is_ascii_digit()) {
        // No digit where one is required.
        Some(0) => None,
        // Digits followed by something else: return just the numeric prefix.
        Some(digits) => {
            let end = self.slice.len() - rest.len() + digits;
            Some(Substr {
                file_offset: self.file_offset,
                slice: &self.slice[..end],
            })
        }
        // A dot with no digits on either side is not a number.
        None if no_integer_digits && rest.is_empty() => None,
        None => Some(*self),
    }
}
/// Returns the underlying bytes, borrowed for the full lifetime `'a`.
pub fn as_slice(&self) -> &'a [u8] {
self.slice
}
/// Views the bytes as UTF-8 text.
///
/// # Errors
/// Returns `PdfError::Parse` when the bytes are not valid UTF-8.
pub fn as_str(&self) -> Result<&str> {
    match std::str::from_utf8(self.slice) {
        Ok(text) => Ok(text),
        Err(e) => Err(PdfError::Parse { source: e.into() }),
    }
}
/// Returns `true` when the bytes are exactly equal to `other`.
pub fn equals(&self, other: impl AsRef<[u8]>) -> bool {
    let expected: &[u8] = other.as_ref();
    expected == self.slice
}
/// Returns the tail of this `Substr` starting at `range.start`, with the
/// file offset shifted by the same amount.
///
/// # Panics
/// Panics if `range.start` is greater than the length of the bytes.
pub fn reslice(&self, range: RangeFrom<usize>) -> Substr<'a> {
    let RangeFrom { start } = range;
    Substr {
        file_offset: self.file_offset + start,
        slice: &self.slice[start..],
    }
}
/// Returns the range this `Substr` covers: it starts at `file_offset` and is
/// as long as the bytes.
pub fn file_range(&self) -> Range<usize> {
    let start = self.file_offset;
    start..start + self.slice.len()
}
}
/// Returns `true` when every byte of `b` is an ASCII decimal digit.
///
/// An empty slice yields `true`; callers must reject empty input themselves.
#[inline]
fn is_int(b: &[u8]) -> bool {
    b.iter().all(u8::is_ascii_digit)
}
/// Lets a `Substr` be used wherever a byte slice is expected.
impl<'a> Deref for Substr<'a> {
    type Target = [u8];

    fn deref(&self) -> &Self::Target {
        self.slice
    }
}
/// Byte-wise comparison of a `Substr` with a byte slice.
impl<'a> PartialEq<&[u8]> for Substr<'a> {
    fn eq(&self, rhs: &&[u8]) -> bool {
        self.slice == *rhs
    }
}
impl<'a> PartialEq<&str> for Substr<'a> {
fn eq(&self, rhs: &&str) -> bool {
self.equals(rhs.as_bytes())
}
}
/// Unit tests for the boundary helpers and the numeric checks on `Substr`.
#[cfg(test)]
mod tests {
    use super::*;

    /// `boundary_rev` scans backwards from `pos` while the predicate holds.
    #[test]
    fn test_boundary_rev() {
        assert_eq!(boundary_rev(&*b" hello", 3, not(is_whitespace)), 1);
        assert_eq!(boundary_rev(&*b" hello", 3, is_whitespace), 3);
    }

    /// `boundary` scans forwards from `pos` while the predicate holds.
    #[test]
    fn test_boundary() {
        assert_eq!(boundary(&*b" hello ", 3, not(is_whitespace)), 6);
        assert_eq!(boundary(&*b" hello ", 3, is_whitespace), 3);
        // Two spaces at indices 5 and 6 put the '7' at index 7. With a single
        // space the first assertion below would yield 6 and fail.
        assert_eq!(boundary(&*b"01234  7orld", 5, is_whitespace), 7);
        assert_eq!(boundary(&*b"01234  7orld", 7, is_whitespace), 7);
        assert_eq!(boundary(&*b"q\n", 1, is_whitespace), 2);
    }

    /// Number classification on `Substr`.
    #[test]
    fn test_substr() {
        assert!(Substr::new("123", 0).is_real_number());
        assert!(Substr::new("123.", 0).is_real_number());
        assert!(Substr::new("123.45", 0).is_real_number());
        assert!(Substr::new(".45", 0).is_real_number());
        assert!(Substr::new("-.45", 0).is_real_number());
        assert!(!Substr::new("123.45", 0).is_integer());
        assert!(Substr::new("123", 0).is_integer());
        assert!(Substr::new("-12", 0).is_integer());
        assert!(!Substr::new("-", 0).is_integer());
        assert!(!Substr::new("", 0).is_integer());
    }
}