use nom::{
branch::alt,
bytes::complete::{tag, take_till, take_while},
character::complete::{char, digit1, one_of},
combinator::{map, opt, value},
multi::many0,
sequence::{delimited, preceded},
IResult, Parser,
};
/// A single lexical token produced by [`token`].
///
/// String-like variants borrow from the input buffer (`'a`); names are owned
/// because `#xx` escapes are decoded into a fresh `String`.
#[derive(Debug, PartialEq, Clone)]
pub enum Token<'a> {
/// A number written without a decimal point.
Integer(i64),
/// A number written with a decimal point (`3.14`, `.5`, `5.`).
Real(f64),
/// Raw bytes between the outermost `(` and `)`; backslash escapes are NOT decoded.
LiteralString(&'a [u8]),
/// Raw bytes between `<` and `>`; the hex digits are NOT decoded and any
/// embedded whitespace is kept as-is.
HexString(&'a [u8]),
/// The text following `/`, with `#xx` escapes already decoded.
Name(String),
/// The keyword `true`.
True,
/// The keyword `false`.
False,
/// The keyword `null`.
Null,
/// The delimiter `[`.
ArrayStart,
/// The delimiter `]`.
ArrayEnd,
/// The delimiter `<<`.
DictStart,
/// The delimiter `>>`.
DictEnd,
/// The keyword `obj`.
ObjStart,
/// The keyword `endobj`.
ObjEnd,
/// The keyword `stream`.
StreamStart,
/// The keyword `endstream`.
StreamEnd,
/// The keyword `R` (not followed by another ASCII letter).
R,
}
fn whitespace(input: &[u8]) -> IResult<&[u8], ()> {
let (remaining, ws) =
take_while(|c| matches!(c, b' ' | b'\t' | b'\r' | b'\n' | 0x00 | 0x0C)).parse(input)?;
if ws.is_empty() {
return Err(nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::Space)));
}
Ok((remaining, ()))
}
/// Consumes a `%` comment up to, but not including, the next CR or LF.
///
/// Fails when `input` does not start with `%`. The line terminator itself is
/// left in the input for the caller to consume.
fn comment(input: &[u8]) -> IResult<&[u8], ()> {
    let (after_marker, _) = char('%').parse(input)?;
    let (rest, _) = take_till(|c: u8| c == b'\r' || c == b'\n').parse(after_marker)?;
    Ok((rest, ()))
}
/// Skips any interleaving of whitespace runs and `%` comments.
///
/// Never fails. The returned remainder starts at the first byte that is
/// neither whitespace nor part of a comment; the output value is the original
/// `input` slice, unchanged.
fn skip_ws(input: &[u8]) -> IResult<&[u8], &[u8]> {
    let mut cursor = input;
    loop {
        // Whitespace is tried first, then a comment; both consume at least one
        // byte on success, so the loop always terminates.
        let step = whitespace(cursor).or_else(|_| comment(cursor));
        match step {
            Ok((rest, ())) => cursor = rest,
            Err(_) => break,
        }
    }
    Ok((cursor, input))
}
fn parse_number(input: &[u8]) -> IResult<&[u8], Token<'_>> {
let (input, sign) = opt(one_of("+-")).parse(input)?;
let (input, int_part) = opt(digit1).parse(input)?;
let (input, frac_part) = opt(preceded(char('.'), opt(digit1))).parse(input)?;
if int_part.is_none() && frac_part.is_none() {
if sign.is_some() {
return Ok((input, Token::Integer(0)));
}
return Err(nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::Digit)));
}
if frac_part.is_some() {
let mut num_str = String::new();
if sign == Some('-') {
num_str.push('-');
}
if let Some(int) = int_part {
num_str.push_str(std::str::from_utf8(int).map_err(|_| {
nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::Digit))
})?);
} else {
num_str.push('0'); }
num_str.push('.');
if let Some(Some(frac)) = frac_part {
num_str.push_str(std::str::from_utf8(frac).map_err(|_| {
nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::Digit))
})?);
} else {
num_str.push('0'); }
let num: f64 = num_str.parse().map_err(|_| {
nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::Digit))
})?;
Ok((input, Token::Real(num)))
} else {
let int_bytes = int_part.ok_or_else(|| {
nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::Digit))
})?;
let int_str = std::str::from_utf8(int_bytes).map_err(|_| {
nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::Digit))
})?;
let mut num: i64 = int_str.parse().map_err(|_| {
nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::Digit))
})?;
if sign == Some('-') {
num = -num;
}
Ok((input, Token::Integer(num)))
}
}
/// Parses a parenthesised literal string and returns its raw, undecoded body.
///
/// Unescaped parentheses nest. A backslash shields the following byte (or a
/// run of up to three ASCII digits) from being interpreted, so `\(` and `\)`
/// do not affect nesting. Escape sequences are left in the returned slice
/// exactly as written.
///
/// # Errors
///
/// Fails if `input` does not start with `(`, or with `ErrorKind::Tag` at
/// `input` when the closing parenthesis is never found.
fn parse_literal_string(input: &[u8]) -> IResult<&[u8], Token<'_>> {
    let (body, _) = char('(').parse(input)?;
    let mut nesting = 1;
    let mut cursor = 0;
    while nesting > 0 && cursor < body.len() {
        let byte = body[cursor];
        cursor += 1;
        match byte {
            b'\\' => {
                // Up to three consecutive digits form one escape; anything
                // else is a single escaped byte (if one is left at all).
                let digit_run = body[cursor..]
                    .iter()
                    .take(3)
                    .take_while(|b| b.is_ascii_digit())
                    .count();
                if digit_run > 0 {
                    cursor += digit_run;
                } else if cursor < body.len() {
                    cursor += 1;
                }
            }
            b'(' => nesting += 1,
            b')' => nesting -= 1,
            _ => {}
        }
    }
    if nesting != 0 {
        return Err(nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::Tag)));
    }
    // `cursor` sits just past the closing ')', which is not part of the content.
    let (content, rest) = (&body[..cursor - 1], &body[cursor..]);
    Ok((rest, Token::LiteralString(content)))
}
/// Parses `<...>` and returns every byte between the angle brackets, verbatim.
///
/// Input starting with `<<` is rejected with `ErrorKind::Tag` so that it can
/// be recognised as a dictionary opener instead. The body is not validated or
/// decoded here.
fn parse_hex_string(input: &[u8]) -> IResult<&[u8], Token<'_>> {
    if input.starts_with(b"<<") {
        return Err(nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::Tag)));
    }
    let payload = delimited(char('<'), take_while(|c: u8| c != b'>'), char('>'));
    map(payload, Token::HexString).parse(input)
}
/// Decodes `#xx` escapes in a name, where `xx` is exactly two hexadecimal
/// digits (either case).
///
/// Each valid escape is replaced by the character whose code point equals the
/// byte value (`#20` becomes a space, `#E9` becomes U+00E9). A `#` that is not
/// followed by two hex digits is kept literally and scanning resumes with the
/// very next character, so a valid escape that starts right after a stray `#`
/// is still decoded (`##20` becomes `"# "`).
pub fn decode_name_escapes(name: &str) -> String {
    let mut result = String::with_capacity(name.len());
    let mut chars = name.chars();
    while let Some(ch) = chars.next() {
        if ch == '#' {
            // Look ahead on a copy so nothing is consumed unless both digits
            // are valid. `to_digit(16)` accepts only 0-9/a-f/A-F, unlike
            // `from_str_radix`, which would also accept a leading sign.
            let mut lookahead = chars.clone();
            let high = lookahead.next().and_then(|c| c.to_digit(16));
            let low = lookahead.next().and_then(|c| c.to_digit(16));
            let decoded = high
                .zip(low)
                .and_then(|(high, low)| char::from_u32(high * 16 + low));
            if let Some(decoded) = decoded {
                result.push(decoded);
                chars = lookahead;
                continue;
            }
        }
        result.push(ch);
    }
    result
}
/// Parses a `/Name` token.
///
/// The name runs from just after the `/` up to the next whitespace or
/// delimiter byte (`/ % ( ) < > [ ] { }`); it may be empty. `#xx` escapes are
/// decoded via [`decode_name_escapes`].
///
/// Bytes that form valid UTF-8 are decoded as UTF-8. If the name is not valid
/// UTF-8, each byte is mapped to the character with the same code point
/// instead, so the name is preserved rather than collapsing to an empty
/// string.
fn parse_name(input: &[u8]) -> IResult<&[u8], Token<'_>> {
    preceded(
        char('/'),
        map(
            take_while(|c: u8| {
                !matches!(
                    c,
                    b' ' | b'\t'
                        | b'\r'
                        | b'\n'
                        | 0x00
                        | 0x0C
                        | b'/'
                        | b'%'
                        | b'('
                        | b')'
                        | b'<'
                        | b'>'
                        | b'['
                        | b']'
                        | b'{'
                        | b'}'
                )
            }),
            |bytes: &[u8]| {
                let name = match std::str::from_utf8(bytes) {
                    Ok(text) => decode_name_escapes(text),
                    Err(_) => {
                        // Lossless fallback: one char per byte, matching how
                        // decoded `#xx` escapes are represented.
                        let byte_text: String = bytes.iter().copied().map(char::from).collect();
                        decode_name_escapes(&byte_text)
                    }
                };
                Token::Name(name)
            },
        ),
    )
    .parse(input)
}
/// Recognises the fixed keywords and delimiters of the token grammar.
///
/// Matching is by prefix only: apart from `R` (see [`parse_r_token`]) no check
/// is made on the byte that follows the keyword.
fn parse_keyword(input: &[u8]) -> IResult<&[u8], Token<'_>> {
    alt((
        // Word-like keywords.
        alt((
            value(Token::False, tag(&b"false"[..])),
            value(Token::True, tag(&b"true"[..])),
            value(Token::Null, tag(&b"null"[..])),
            value(Token::ObjStart, tag(&b"obj"[..])),
            value(Token::ObjEnd, tag(&b"endobj"[..])),
            value(Token::StreamEnd, tag(&b"endstream"[..])),
            value(Token::StreamStart, tag(&b"stream"[..])),
        )),
        // Structural delimiters.
        alt((
            value(Token::DictStart, tag(&b"<<"[..])),
            value(Token::DictEnd, tag(&b">>"[..])),
            value(Token::ArrayStart, tag(&b"["[..])),
            value(Token::ArrayEnd, tag(&b"]"[..])),
        )),
        // Indirect-reference marker, tried last.
        parse_r_token,
    ))
    .parse(input)
}
fn parse_r_token(input: &[u8]) -> IResult<&[u8], Token<'_>> {
if input.first() != Some(&b'R') {
return Err(nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::Tag)));
}
if input.len() > 1 && input[1].is_ascii_alphabetic() {
return Err(nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::Tag)));
}
Ok((&input[1..], Token::R))
}
/// Lexes a single token, skipping any leading whitespace and comments.
///
/// Alternatives are tried in a fixed order: keywords/delimiters, names,
/// numbers, literal strings, hex strings. Keywords come first so that `<<` is
/// reported as [`Token::DictStart`] rather than attempted as a hex string.
///
/// # Errors
///
/// Fails when, after skipping, the input does not begin with any known token
/// (this includes empty input).
pub fn token(input: &[u8]) -> IResult<&[u8], Token<'_>> {
    let (after_trivia, _skipped) = skip_ws(input)?;
    let mut lexeme = alt((
        parse_keyword,
        parse_name,
        parse_number,
        parse_literal_string,
        parse_hex_string,
    ));
    lexeme.parse(after_trivia)
}
/// Lexes as many consecutive tokens as possible from `input`.
///
/// Stops at the first position where [`token`] fails and returns the tokens
/// gathered so far together with the input remaining at that position.
pub fn tokens(input: &[u8]) -> IResult<&[u8], Vec<Token<'_>>> {
    let mut repeated = many0(token);
    repeated.parse(input)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `input` lexes to `expected` with no bytes left over.
    #[track_caller]
    fn assert_sole_token(input: &[u8], expected: Token<'_>) {
        assert_eq!(token(input), Ok((&b""[..], expected)));
    }

    /// Pulls tokens one at a time with `token` and checks them against
    /// `expected`, then checks that the input has been fully consumed.
    #[track_caller]
    fn assert_token_sequence(input: &[u8], expected: &[Token<'_>]) {
        let mut rest = input;
        for want in expected {
            let (next, got) = token(rest).unwrap();
            assert_eq!(&got, want);
            rest = next;
        }
        assert_eq!(rest, &b""[..]);
    }

    // ---- numbers -------------------------------------------------------

    #[test]
    fn test_parse_positive_integer() {
        assert_sole_token(b"42", Token::Integer(42));
    }

    #[test]
    fn test_parse_negative_integer() {
        assert_sole_token(b"-123", Token::Integer(-123));
    }

    #[test]
    fn test_parse_zero() {
        assert_sole_token(b"0", Token::Integer(0));
    }

    #[test]
    // 3.14 is the literal under test, not a sloppy approximation of PI.
    #[allow(clippy::approx_constant)]
    fn test_parse_positive_real() {
        assert_sole_token(b"3.14", Token::Real(3.14));
    }

    #[test]
    fn test_parse_negative_real() {
        assert_sole_token(b"-2.5", Token::Real(-2.5));
    }

    #[test]
    fn test_parse_real_starting_with_dot() {
        assert_sole_token(b".5", Token::Real(0.5));
    }

    #[test]
    fn test_parse_real_ending_with_dot() {
        assert_sole_token(b"5.", Token::Real(5.0));
    }

    #[test]
    fn test_parse_negative_real_starting_with_dot() {
        assert_sole_token(b"-.002", Token::Real(-0.002));
    }

    // ---- literal strings -----------------------------------------------

    #[test]
    fn test_parse_literal_string() {
        assert_sole_token(b"(Hello)", Token::LiteralString(b"Hello"));
    }

    #[test]
    fn test_parse_literal_string_with_spaces() {
        assert_sole_token(b"(Hello World)", Token::LiteralString(b"Hello World"));
    }

    #[test]
    fn test_parse_literal_string_with_nested_parens() {
        assert_sole_token(
            b"(Hello (nested) World)",
            Token::LiteralString(b"Hello (nested) World"),
        );
    }

    #[test]
    fn test_parse_literal_string_with_escape() {
        assert_sole_token(b"(Line1\\nLine2)", Token::LiteralString(b"Line1\\nLine2"));
    }

    #[test]
    fn test_parse_literal_string_with_escaped_paren() {
        assert_sole_token(
            b"(Open \\( Close \\))",
            Token::LiteralString(b"Open \\( Close \\)"),
        );
    }

    #[test]
    fn test_parse_empty_literal_string() {
        assert_sole_token(b"()", Token::LiteralString(b""));
    }

    // ---- hex strings ---------------------------------------------------

    #[test]
    fn test_parse_hex_string() {
        assert_sole_token(b"<48656C6C6F>", Token::HexString(b"48656C6C6F"));
    }

    #[test]
    fn test_parse_hex_string_with_whitespace() {
        assert_sole_token(b"<48 65 6C 6C 6F>", Token::HexString(b"48 65 6C 6C 6F"));
    }

    #[test]
    fn test_parse_empty_hex_string() {
        assert_sole_token(b"<>", Token::HexString(b""));
    }

    // ---- names ---------------------------------------------------------

    #[test]
    fn test_parse_name() {
        assert_sole_token(b"/Type", Token::Name("Type".to_string()));
    }

    #[test]
    fn test_parse_name_with_special_chars() {
        assert_sole_token(
            b"/A;Name_With-Various***Characters",
            Token::Name("A;Name_With-Various***Characters".to_string()),
        );
    }

    #[test]
    fn test_parse_empty_name() {
        // The trailing space terminates the name and is left unconsumed.
        let result = token(b"/ ");
        assert_eq!(result, Ok((&b" "[..], Token::Name("".to_string()))));
    }

    #[test]
    fn test_parse_name_with_hex_escape() {
        assert_sole_token(b"/A#20B", Token::Name("A B".to_string()));
    }

    #[test]
    fn test_parse_name_with_multiple_hex_escapes() {
        assert_sole_token(b"/A#20B#23C", Token::Name("A B#C".to_string()));
    }

    #[test]
    fn test_parse_name_with_invalid_hex_escape() {
        assert_sole_token(b"/A#ZZ", Token::Name("A#ZZ".to_string()));
    }

    #[test]
    fn test_decode_name_escapes_directly() {
        assert_eq!(decode_name_escapes("Type"), "Type");
        assert_eq!(decode_name_escapes("A#20B"), "A B");
        assert_eq!(decode_name_escapes("A#20B#23C"), "A B#C");
        assert_eq!(decode_name_escapes("A#"), "A#");
        assert_eq!(decode_name_escapes("A#2"), "A#2");
        assert_eq!(decode_name_escapes("A#ZZ"), "A#ZZ");
    }

    // ---- keywords and delimiters ----------------------------------------

    #[test]
    fn test_parse_true() {
        assert_sole_token(b"true", Token::True);
    }

    #[test]
    fn test_parse_false() {
        assert_sole_token(b"false", Token::False);
    }

    #[test]
    fn test_parse_null() {
        assert_sole_token(b"null", Token::Null);
    }

    #[test]
    fn test_parse_array_start() {
        assert_sole_token(b"[", Token::ArrayStart);
    }

    #[test]
    fn test_parse_array_end() {
        assert_sole_token(b"]", Token::ArrayEnd);
    }

    #[test]
    fn test_parse_dict_start() {
        assert_sole_token(b"<<", Token::DictStart);
    }

    #[test]
    fn test_parse_dict_end() {
        assert_sole_token(b">>", Token::DictEnd);
    }

    #[test]
    fn test_parse_obj_start() {
        assert_sole_token(b"obj", Token::ObjStart);
    }

    #[test]
    fn test_parse_obj_end() {
        assert_sole_token(b"endobj", Token::ObjEnd);
    }

    #[test]
    fn test_parse_stream_start() {
        assert_sole_token(b"stream", Token::StreamStart);
    }

    #[test]
    fn test_parse_stream_end() {
        assert_sole_token(b"endstream", Token::StreamEnd);
    }

    #[test]
    fn test_parse_reference_marker() {
        assert_sole_token(b"R", Token::R);
    }

    // ---- whitespace and comments ----------------------------------------

    #[test]
    fn test_skip_leading_whitespace() {
        assert_sole_token(b" \n\t42", Token::Integer(42));
    }

    #[test]
    fn test_skip_comment() {
        assert_sole_token(b"% This is a comment\n42", Token::Integer(42));
    }

    #[test]
    fn test_skip_multiple_comments() {
        assert_sole_token(b"% Comment 1\n% Comment 2\n42", Token::Integer(42));
    }

    #[test]
    fn test_skip_mixed_whitespace_and_comments() {
        assert_sole_token(b" % Comment\n \t% Another\n 42", Token::Integer(42));
    }

    // ---- token streams --------------------------------------------------

    #[test]
    fn test_multiple_tokens() {
        assert_token_sequence(
            b"42 /Type (Hello) true",
            &[
                Token::Integer(42),
                Token::Name("Type".to_string()),
                Token::LiteralString(b"Hello"),
                Token::True,
            ],
        );
    }

    #[test]
    fn test_tokens_function() {
        let (remaining, toks) = tokens(b"42 /Type (Hello) true").unwrap();
        assert_eq!(remaining, &b""[..]);
        assert_eq!(
            toks,
            vec![
                Token::Integer(42),
                Token::Name("Type".to_string()),
                Token::LiteralString(b"Hello"),
                Token::True,
            ]
        );
    }

    #[test]
    fn test_dict_vs_hex_string() {
        assert_sole_token(b"<<", Token::DictStart);
        assert_sole_token(b"<ABC>", Token::HexString(b"ABC"));
    }

    #[test]
    fn test_complex_pdf_snippet() {
        assert_token_sequence(
            b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj",
            &[
                Token::Integer(1),
                Token::Integer(0),
                Token::ObjStart,
                Token::DictStart,
                Token::Name("Type".to_string()),
                Token::Name("Catalog".to_string()),
                Token::Name("Pages".to_string()),
                Token::Integer(2),
                Token::Integer(0),
                Token::R,
                Token::DictEnd,
                Token::ObjEnd,
            ],
        );
    }

    #[test]
    fn test_real_vs_integer_distinction() {
        assert!(matches!(token(b"0").unwrap().1, Token::Integer(0)));
        assert!(matches!(token(b"42").unwrap().1, Token::Integer(42)));
        assert!(matches!(token(b"-123").unwrap().1, Token::Integer(-123)));
        assert!(matches!(token(b"0.0").unwrap().1, Token::Real(_)));
        assert!(matches!(token(b"3.14").unwrap().1, Token::Real(_)));
        assert!(matches!(token(b".5").unwrap().1, Token::Real(_)));
        assert!(matches!(token(b"5.").unwrap().1, Token::Real(_)));
    }
}