misc_conf/apache/
lexer.rs

1use nom::character::{complete::space0, streaming::anychar};
2#[allow(unused_imports)]
3use nom::{
4    branch::alt,
5    bytes::complete::{escaped, tag, take_till, take_until, take_while, take_while_m_n},
6    character::{
7        complete::{alphanumeric1, char as cchar, multispace0, multispace1, none_of, one_of},
8        is_alphabetic, is_newline, is_space,
9        streaming::space1,
10    },
11    combinator::{fail, map, map_res, opt, value},
12    number::complete::be_u8,
13    sequence::{delimited, tuple},
14    IResult,
15};
16
17use crate::lexer::Literal;
18
19#[derive(Debug, PartialEq, Eq, Clone, Copy)]
20pub enum Token<'a> {
21    OpenTag,
22    EndTag,
23    CloseTag,
24    NewLine,
25    Eof,
26    Literal(Literal<'a>),
27}
28
29impl<'a> Token<'a> {
30    pub fn ident(&self) -> Option<&'a str> {
31        // TODO: check ident rule
32        self.raw_string()
33    }
34
35    pub fn raw_string(&self) -> Option<&'a str> {
36        self.literal().map(|l| l.raw)
37    }
38
39    pub fn literal(&self) -> Option<Literal<'a>> {
40        match self {
41            Self::Literal(l) => Some(*l),
42            _ => None,
43        }
44    }
45}
46
47fn opentag(input: &[u8]) -> IResult<&[u8], Token> {
48    value(Token::OpenTag, tag(b"<"))(input)
49}
50
51fn endtag(input: &[u8]) -> IResult<&[u8], Token> {
52    value(Token::EndTag, tag(b">"))(input)
53}
54
55fn newline(input: &[u8]) -> IResult<&[u8], Token> {
56    value(Token::NewLine, tuple((opt(tag(b"\r")), tag(b"\n"))))(input)
57}
58
59fn closetag(input: &[u8]) -> IResult<&[u8], Token> {
60    value(Token::CloseTag, tag(b"</"))(input)
61}
62
63fn comment(input: &[u8]) -> IResult<&[u8], &[u8]> {
64    map(
65        tuple((
66            tag("#"),
67            take_till(is_newline),
68            opt(tag(b"\r")),
69            opt(tag(b"\n")),
70        )),
71        |x| x.1,
72    )(input)
73}
74
75fn literal(input: &[u8]) -> IResult<&[u8], Token> {
76    let (_, mut first) = be_u8(input)?;
77
78    let (input, raw) = match first {
79        b'"' => map_res(
80            // for empty string
81            map(
82                delimited(
83                    cchar(first as _),
84                    opt(escaped(none_of(r#"\""#), '\\', anychar)),
85                    cchar(first as _),
86                ),
87                Option::unwrap_or_default,
88            ),
89            std::str::from_utf8,
90        )(input),
91        b'\'' => map_res(
92            // for empty string
93            map(
94                delimited(
95                    cchar(first as _),
96                    opt(escaped(none_of(r#"\'"#), '\\', anychar)),
97                    cchar(first as _),
98                ),
99                Option::unwrap_or_default,
100            ),
101            std::str::from_utf8,
102        )(input),
103        _ => {
104            first = 0;
105            map_res(
106                escaped(none_of(" \t\r\n<>'\"\\"), '\\', anychar),
107                std::str::from_utf8,
108            )(input)
109        }
110    }?;
111    Ok((input, Token::Literal(Literal { raw, quote: first })))
112}
113
114pub fn tokenizer(input: &[u8]) -> IResult<&[u8], Token> {
115    inner_tokenizer::<false>(input)
116}
117
118fn space_and_comment<const NL: bool>(input: &[u8]) -> IResult<&[u8], Option<&[u8]>> {
119    let space = if NL { space0 } else { multispace0 };
120    map(tuple((space, opt(comment))), |x| x.1)(input)
121}
122
123pub fn inner_tokenizer<const NL: bool>(mut input: &[u8]) -> IResult<&[u8], Token> {
124    loop {
125        let (rest, cmt) = space_and_comment::<NL>(input)?;
126        input = rest;
127        if cmt.is_some() {
128            // println!("[cmt] {:?}", std::str::from_utf8(cmt.unwrap()));
129        } else {
130            break;
131        }
132    }
133    if input.len() == 0 {
134        return Ok((input, Token::Eof));
135    }
136    alt((closetag, opentag, endtag, newline, literal))(input)
137}