Skip to main content

pdf_ast/parser/
lexer.rs

1use nom::{
2    branch::alt,
3    bytes::complete::{tag, take_until, take_while, take_while1},
4    character::complete::{char, digit1, multispace0, multispace1, one_of},
5    combinator::{map, map_res, opt, recognize, value},
6    multi::many0,
7    sequence::{delimited, pair, preceded, tuple},
8    IResult,
9};
10
11pub fn skip_whitespace(input: &[u8]) -> IResult<&[u8], ()> {
12    value((), multispace0)(input)
13}
14
15pub fn skip_whitespace_and_comments(input: &[u8]) -> IResult<&[u8], ()> {
16    value((), many0(alt((value((), multispace1), value((), comment)))))(input)
17}
18
19pub fn comment(input: &[u8]) -> IResult<&[u8], &[u8]> {
20    preceded(
21        char('%'),
22        alt((take_until("\n"), take_until("\r"), nom::combinator::rest)),
23    )(input)
24}
25
26pub fn pdf_header(input: &[u8]) -> IResult<&[u8], (u8, u8)> {
27    let (input, _) = tag(b"%PDF-")(input)?;
28    let (input, major) = map_res(digit1, |s: &[u8]| {
29        std::str::from_utf8(s).unwrap().parse::<u8>()
30    })(input)?;
31    let (input, _) = char('.')(input)?;
32    let (input, minor) = map_res(digit1, |s: &[u8]| {
33        std::str::from_utf8(s).unwrap().parse::<u8>()
34    })(input)?;
35    Ok((input, (major, minor)))
36}
37
38pub fn pdf_eof(input: &[u8]) -> IResult<&[u8], &[u8]> {
39    tag(b"%%EOF")(input)
40}
41
/// Returns `true` for the six bytes this lexer treats as white-space:
/// NUL, horizontal tab, line feed, form feed, carriage return and space.
pub fn is_whitespace(c: u8) -> bool {
    const WHITE_SPACE: [u8; 6] = [b'\0', b'\t', b'\n', b'\x0C', b'\r', b' '];
    WHITE_SPACE.contains(&c)
}
45
/// Returns `true` for the ten delimiter bytes:
/// `(` `)` `<` `>` `[` `]` `{` `}` `/` and `%`.
pub fn is_delimiter(c: u8) -> bool {
    b"()<>[]{}/%".contains(&c)
}
52
53pub fn is_regular_char(c: u8) -> bool {
54    !is_whitespace(c) && !is_delimiter(c)
55}
56
57pub fn regular_chars(input: &[u8]) -> IResult<&[u8], &[u8]> {
58    take_while1(is_regular_char)(input)
59}
60
61pub fn keyword(input: &[u8]) -> IResult<&[u8], &[u8]> {
62    alt((
63        tag(b"true"),
64        tag(b"false"),
65        tag(b"null"),
66        tag(b"obj"),
67        tag(b"endobj"),
68        tag(b"stream"),
69        tag(b"endstream"),
70        tag(b"xref"),
71        tag(b"startxref"),
72        tag(b"trailer"),
73        tag(b"R"),
74        tag(b"n"),
75        tag(b"f"),
76    ))(input)
77}
78
79pub fn integer(input: &[u8]) -> IResult<&[u8], i64> {
80    map_res(recognize(pair(opt(one_of("+-")), digit1)), |s: &[u8]| {
81        std::str::from_utf8(s).unwrap().parse::<i64>()
82    })(input)
83}
84
85pub fn real(input: &[u8]) -> IResult<&[u8], f64> {
86    map_res(
87        recognize(tuple((
88            opt(one_of("+-")),
89            alt((
90                recognize(tuple((digit1, char('.'), opt(digit1)))),
91                recognize(tuple((opt(digit1), char('.'), digit1))),
92            )),
93        ))),
94        |s: &[u8]| std::str::from_utf8(s).unwrap().parse::<f64>(),
95    )(input)
96}
97
98pub fn hex_string(input: &[u8]) -> IResult<&[u8], Vec<u8>> {
99    delimited(
100        char('<'),
101        map(
102            take_while(|c: u8| c.is_ascii_hexdigit() || is_whitespace(c)),
103            |hex: &[u8]| {
104                let hex_str: String = hex
105                    .iter()
106                    .filter(|&&c| !is_whitespace(c))
107                    .map(|&c| c as char)
108                    .collect();
109
110                let mut result = Vec::new();
111                let mut chars = hex_str.chars();
112
113                while let Some(c1) = chars.next() {
114                    let c2 = chars.next().unwrap_or('0');
115                    if let Ok(byte) = u8::from_str_radix(&format!("{}{}", c1, c2), 16) {
116                        result.push(byte);
117                    }
118                }
119
120                result
121            },
122        ),
123        char('>'),
124    )(input)
125}
126
127pub fn literal_string(input: &[u8]) -> IResult<&[u8], Vec<u8>> {
128    delimited(
129        char('('),
130        map(
131            many0(alt((
132                preceded(char('\\'), escape_sequence),
133                map(take_while1(|c| c != b')' && c != b'\\'), |s: &[u8]| {
134                    s.to_vec()
135                }),
136            ))),
137            |parts| parts.into_iter().flatten().collect(),
138        ),
139        char(')'),
140    )(input)
141}
142
143fn escape_sequence(input: &[u8]) -> IResult<&[u8], Vec<u8>> {
144    alt((
145        value(vec![b'\n'], char('n')),
146        value(vec![b'\r'], char('r')),
147        value(vec![b'\t'], char('t')),
148        value(vec![b'\x08'], char('b')),
149        value(vec![b'\x0C'], char('f')),
150        value(vec![b'('], char('(')),
151        value(vec![b')'], char(')')),
152        value(vec![b'\\'], char('\\')),
153        map(octal_escape, |b| vec![b]),
154    ))(input)
155}
156
157fn octal_escape(input: &[u8]) -> IResult<&[u8], u8> {
158    map_res(
159        recognize(tuple((
160            one_of("01234567"),
161            opt(one_of("01234567")),
162            opt(one_of("01234567")),
163        ))),
164        |s: &[u8]| u8::from_str_radix(std::str::from_utf8(s).unwrap(), 8),
165    )(input)
166}
167
168pub fn name(input: &[u8]) -> IResult<&[u8], String> {
169    preceded(
170        char('/'),
171        map(
172            take_while(|c: u8| !is_whitespace(c) && !is_delimiter(c)),
173            |bytes: &[u8]| {
174                let mut result = String::new();
175                let mut chars = bytes.iter();
176
177                while let Some(&c) = chars.next() {
178                    if c == b'#' {
179                        if let (Some(&c1), Some(&c2)) = (chars.next(), chars.next()) {
180                            if let Ok(byte) =
181                                u8::from_str_radix(&format!("{}{}", c1 as char, c2 as char), 16)
182                            {
183                                result.push(byte as char);
184                                continue;
185                            }
186                        }
187                        result.push('#');
188                    } else {
189                        result.push(c as char);
190                    }
191                }
192
193                format!("/{}", result)
194            },
195        ),
196    )(input)
197}