sentinel_modsec/parser/
lexer.rs

1//! Lexer for ModSecurity configuration syntax.
2
3use std::iter::Peekable;
4use std::str::Chars;
5
6/// Token produced by the lexer.
7#[derive(Debug, Clone)]
8pub struct Token {
9    /// The type of token.
10    pub kind: TokenKind,
11    /// Line number (1-indexed).
12    pub line: usize,
13    /// Column number (1-indexed).
14    pub column: usize,
15}
16
/// Types of tokens.
///
/// Every payload is a `String`, so equality is total and the type is `Eq` as
/// well as `PartialEq`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TokenKind {
    /// A directive name (e.g., SecRule, SecAction).
    Directive(String),
    /// An unquoted word.
    Word(String),
    /// A quoted string (single or double quotes); the payload excludes the
    /// surrounding quotes.
    QuotedString(String),
    /// A comment (starting with #). The comment text is not retained.
    Comment,
    /// A newline.
    Newline,
    /// End of input.
    ///
    /// The lexer in this file reports end of input by returning `None` from
    /// `next_token` and does not construct this variant itself.
    Eof,
}
33
/// Lexer for ModSecurity configuration.
///
/// Holds a peekable character cursor over the borrowed input together with
/// the position bookkeeping needed to stamp tokens. Cloning a lexer yields an
/// independent cursor at the same position.
#[derive(Debug, Clone)]
pub struct Lexer<'a> {
    /// Remaining input, one `char` at a time, with single-character lookahead.
    input: Peekable<Chars<'a>>,
    /// Line (1-indexed) of the next unread character.
    line: usize,
    /// Column (1-indexed) of the next unread character.
    column: usize,
    /// True while only whitespace has been read since the start of the line.
    at_line_start: bool,
}
41
42impl<'a> Lexer<'a> {
43    /// Create a new lexer for the given input.
44    pub fn new(input: &'a str) -> Self {
45        Self {
46            input: input.chars().peekable(),
47            line: 1,
48            column: 1,
49            at_line_start: true,
50        }
51    }
52
53    /// Peek at the next character without consuming it.
54    pub fn peek(&mut self) -> Option<char> {
55        self.input.peek().copied()
56    }
57
58    /// Consume the next character.
59    fn advance(&mut self) -> Option<char> {
60        let c = self.input.next();
61        if let Some(ch) = c {
62            if ch == '\n' {
63                self.line += 1;
64                self.column = 1;
65                self.at_line_start = true;
66            } else {
67                self.column += 1;
68                if !ch.is_whitespace() {
69                    self.at_line_start = false;
70                }
71            }
72        }
73        c
74    }
75
76    /// Skip whitespace (but not newlines), including backslash-newline continuations.
77    pub fn skip_whitespace(&mut self) {
78        loop {
79            match self.input.peek() {
80                Some(&' ') | Some(&'\t') => {
81                    self.advance();
82                }
83                Some(&'\\') => {
84                    // Check for line continuation
85                    let mut chars = self.input.clone();
86                    chars.next(); // consume the backslash
87                    if chars.peek() == Some(&'\n') {
88                        self.advance(); // consume backslash
89                        self.advance(); // consume newline
90                        // Continue skipping whitespace on next line
91                    } else if chars.peek() == Some(&'\r') {
92                        chars.next();
93                        if chars.peek() == Some(&'\n') {
94                            self.advance(); // consume backslash
95                            self.advance(); // consume \r
96                            self.advance(); // consume \n
97                        } else {
98                            break;
99                        }
100                    } else {
101                        break;
102                    }
103                }
104                _ => break,
105            }
106        }
107    }
108
109    /// Skip whitespace including newlines.
110    fn skip_all_whitespace(&mut self) {
111        while let Some(&c) = self.input.peek() {
112            if c.is_whitespace() {
113                self.advance();
114            } else {
115                break;
116            }
117        }
118    }
119
120    /// Get the next token.
121    pub fn next_token(&mut self) -> Option<Token> {
122        self.skip_whitespace();
123
124        let line = self.line;
125        let column = self.column;
126
127        match self.peek()? {
128            '\n' => {
129                self.advance();
130                Some(Token {
131                    kind: TokenKind::Newline,
132                    line,
133                    column,
134                })
135            }
136            '#' => {
137                // Comment - skip to end of line
138                while let Some(c) = self.advance() {
139                    if c == '\n' {
140                        break;
141                    }
142                }
143                Some(Token {
144                    kind: TokenKind::Comment,
145                    line,
146                    column,
147                })
148            }
149            '"' | '\'' => {
150                let quote = self.advance().unwrap();
151                let s = self.read_quoted_string(quote);
152                Some(Token {
153                    kind: TokenKind::QuotedString(s),
154                    line,
155                    column,
156                })
157            }
158            '\\' => {
159                // Line continuation
160                self.advance();
161                if self.peek() == Some('\n') {
162                    self.advance();
163                }
164                self.next_token()
165            }
166            _ => {
167                // Capture at_line_start BEFORE reading the word (since advance() will set it to false)
168                let was_at_line_start = self.at_line_start;
169                let word = self.read_word();
170                if word.is_empty() {
171                    return None;
172                }
173
174                // Check if this is a directive (at start of line, starts with Sec or Include)
175                let kind = if was_at_line_start
176                    && (word.to_lowercase().starts_with("sec")
177                        || word.to_lowercase() == "include")
178                {
179                    TokenKind::Directive(word)
180                } else {
181                    TokenKind::Word(word)
182                };
183
184                Some(Token { kind, line, column })
185            }
186        }
187    }
188
189    /// Read a quoted string, handling line continuation.
190    fn read_quoted_string(&mut self, quote: char) -> String {
191        let mut s = String::new();
192        let mut escaped = false;
193
194        while let Some(c) = self.advance() {
195            if escaped {
196                match c {
197                    '\n' => {
198                        // Line continuation inside quoted string - skip newline and leading whitespace
199                        while self.peek().map(|c| c == ' ' || c == '\t').unwrap_or(false) {
200                            self.advance();
201                        }
202                    }
203                    '\r' => {
204                        // Handle Windows line endings
205                        if self.peek() == Some('\n') {
206                            self.advance();
207                        }
208                        // Skip leading whitespace
209                        while self.peek().map(|c| c == ' ' || c == '\t').unwrap_or(false) {
210                            self.advance();
211                        }
212                    }
213                    'n' => s.push('\n'),
214                    't' => s.push('\t'),
215                    'r' => s.push('\r'),
216                    '\\' => s.push('\\'),
217                    '"' => s.push('"'),
218                    '\'' => s.push('\''),
219                    _ => {
220                        s.push('\\');
221                        s.push(c);
222                    }
223                }
224                escaped = false;
225            } else if c == '\\' {
226                escaped = true;
227            } else if c == quote {
228                break;
229            } else {
230                s.push(c);
231            }
232        }
233
234        s
235    }
236
237    /// Read an unquoted word, handling backslash-newline continuation.
238    fn read_word(&mut self) -> String {
239        let mut s = String::new();
240
241        while let Some(&c) = self.input.peek() {
242            if c == '\\' {
243                // Check for line continuation
244                self.advance();
245                if self.peek() == Some('\n') {
246                    self.advance();
247                    // Continue reading on next line
248                    continue;
249                } else {
250                    // Not a line continuation, include the backslash
251                    s.push('\\');
252                    continue;
253                }
254            }
255            if c.is_whitespace() || c == '"' || c == '\'' || c == '#' {
256                break;
257            }
258            s.push(c);
259            self.advance();
260        }
261
262        s
263    }
264}
265
#[cfg(test)]
mod tests {
    use super::*;

    /// Pull the next token from `lexer` and return its kind; panics if the
    /// lexer has no more tokens.
    fn next_kind(lexer: &mut Lexer<'_>) -> TokenKind {
        lexer.next_token().unwrap().kind
    }

    /// Expected kind for a directive named `name`.
    fn directive(name: &str) -> TokenKind {
        TokenKind::Directive(name.to_string())
    }

    /// Expected kind for an unquoted word `text`.
    fn word(text: &str) -> TokenKind {
        TokenKind::Word(text.to_string())
    }

    /// Expected kind for a quoted string whose unescaped content is `text`.
    fn quoted(text: &str) -> TokenKind {
        TokenKind::QuotedString(text.to_string())
    }

    #[test]
    fn test_lex_directive() {
        let mut lexer = Lexer::new("SecRule");
        assert_eq!(next_kind(&mut lexer), directive("SecRule"));
    }

    #[test]
    fn test_lex_quoted_string() {
        let mut lexer = Lexer::new(r#""hello world""#);
        assert_eq!(next_kind(&mut lexer), quoted("hello world"));
    }

    #[test]
    fn test_lex_escaped_quote() {
        let mut lexer = Lexer::new(r#""hello \"world\"""#);
        assert_eq!(next_kind(&mut lexer), quoted(r#"hello "world""#));
    }

    #[test]
    fn test_lex_comment() {
        let mut lexer = Lexer::new("# this is a comment\nSecRule");
        assert_eq!(next_kind(&mut lexer), TokenKind::Comment);
        assert_eq!(next_kind(&mut lexer), directive("SecRule"));
    }

    #[test]
    fn test_lex_line_continuation() {
        // A continuation in the middle of a word joins both halves, and the
        // joined word is still recognized as a directive.
        let mut lexer = Lexer::new("Sec\\\nRule");
        assert_eq!(next_kind(&mut lexer), directive("SecRule"));
    }

    #[test]
    fn test_lex_line_continuation_between_tokens() {
        // A continuation between two tokens behaves like plain whitespace.
        let mut lexer = Lexer::new("SecRule \\\n  REQUEST_URI");
        assert_eq!(next_kind(&mut lexer), directive("SecRule"));
        assert_eq!(next_kind(&mut lexer), word("REQUEST_URI"));
    }

    #[test]
    fn test_lex_full_rule() {
        let mut lexer = Lexer::new(r#"SecRule REQUEST_URI "@contains /admin" "id:1,deny""#);

        let expected = [
            directive("SecRule"),
            word("REQUEST_URI"),
            quoted("@contains /admin"),
            quoted("id:1,deny"),
        ];
        for want in expected.iter() {
            assert_eq!(&next_kind(&mut lexer), want);
        }
    }
}