// nginx_lint_parser — parser.rs

//! Rowan-based recursive-descent parser for nginx configuration files.
//!
//! Takes the token sequence from [`lexer_rowan::tokenize`](crate::lexer_rowan::tokenize)
//! and builds a lossless green tree using [`rowan::GreenNodeBuilder`].
5
6use crate::syntax_kind::SyntaxKind;
7use rowan::GreenNode;
8use rowan::GreenNodeBuilder;
9
/// A parse error collected during tree construction.
///
/// Errors never abort parsing: the parser records them and keeps building
/// the tree, so a single run can report several of them.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SyntaxError {
    /// Human-readable description of the problem (e.g. `expected ';' or '{'`).
    pub message: String,
    /// Byte offset into the source at which the error was detected, i.e. the
    /// total byte length of all tokens consumed before the error was raised.
    pub offset: usize,
}
16
17/// Parse a flat token list into a rowan green tree.
18///
19/// Returns the root green node and any errors encountered during parsing.
20pub fn parse(tokens: Vec<(SyntaxKind, &str)>) -> (GreenNode, Vec<SyntaxError>) {
21    let mut parser = Parser::new(tokens);
22    parser.parse_root();
23    (parser.builder.finish(), parser.errors)
24}
25
26// ── Parser ──────────────────────────────────────────────────────────────
27
28struct Parser<'a> {
29    tokens: Vec<(SyntaxKind, &'a str)>,
30    pos: usize,
31    builder: GreenNodeBuilder<'static>,
32    errors: Vec<SyntaxError>,
33    /// Byte offset into the original source (sum of consumed token lengths).
34    offset: usize,
35}
36
37impl<'a> Parser<'a> {
38    fn new(tokens: Vec<(SyntaxKind, &'a str)>) -> Self {
39        Self {
40            tokens,
41            pos: 0,
42            builder: GreenNodeBuilder::new(),
43            errors: Vec::new(),
44            offset: 0,
45        }
46    }
47
48    // ── Helpers ──────────────────────────────────────────────────────
49
50    /// Current token kind, or `None` at EOF.
51    fn current(&self) -> Option<SyntaxKind> {
52        self.tokens.get(self.pos).map(|(k, _)| *k)
53    }
54
55    /// Current token text.
56    fn current_text(&self) -> &'a str {
57        self.tokens.get(self.pos).map(|(_, t)| *t).unwrap_or("")
58    }
59
60    /// Check if the current token is `kind`.
61    fn at(&self, kind: SyntaxKind) -> bool {
62        self.current() == Some(kind)
63    }
64
65    /// Check if we're at end of file.
66    fn at_end(&self) -> bool {
67        self.pos >= self.tokens.len()
68    }
69
70    /// Consume the current token and add it as a leaf to the builder.
71    fn bump(&mut self) {
72        if let Some(&(kind, text)) = self.tokens.get(self.pos) {
73            self.builder.token(kind.into(), text);
74            self.offset += text.len();
75            self.pos += 1;
76        }
77    }
78
79    /// Consume whitespace and newline tokens (trivia), adding them to the tree.
80    fn eat_trivia(&mut self) {
81        while let Some(kind) = self.current() {
82            if kind == SyntaxKind::WHITESPACE || kind == SyntaxKind::NEWLINE {
83                self.bump();
84            } else {
85                break;
86            }
87        }
88    }
89
90    /// Peek at the next non-trivia token kind.
91    fn peek_non_trivia(&self) -> Option<SyntaxKind> {
92        let mut i = self.pos;
93        while i < self.tokens.len() {
94            let kind = self.tokens[i].0;
95            if kind != SyntaxKind::WHITESPACE && kind != SyntaxKind::NEWLINE {
96                return Some(kind);
97            }
98            i += 1;
99        }
100        None
101    }
102
103    fn error(&mut self, message: impl Into<String>) {
104        self.errors.push(SyntaxError {
105            message: message.into(),
106            offset: self.offset,
107        });
108    }
109
110    // ── Grammar rules ───────────────────────────────────────────────
111
112    /// ROOT → item*
113    fn parse_root(&mut self) {
114        self.builder.start_node(SyntaxKind::ROOT.into());
115        self.parse_items(false);
116        self.builder.finish_node();
117    }
118
119    /// Parse items until EOF (if `in_block` is false) or R_BRACE (if `in_block` is true).
120    fn parse_items(&mut self, in_block: bool) {
121        loop {
122            match self.current() {
123                None => break,
124                Some(SyntaxKind::R_BRACE) if in_block => break,
125                Some(SyntaxKind::R_BRACE) => {
126                    // Unexpected '}' at top level — wrap in ERROR node.
127                    self.error("unexpected '}'");
128                    self.builder.start_node(SyntaxKind::ERROR.into());
129                    self.bump();
130                    self.builder.finish_node();
131                }
132                Some(SyntaxKind::WHITESPACE) => {
133                    // Check for blank line: WHITESPACE followed by NEWLINE,
134                    // where previous token was NEWLINE (or start of input).
135                    if self.is_blank_line_start() {
136                        self.parse_blank_line();
137                    } else {
138                        self.bump(); // plain leading whitespace
139                    }
140                }
141                Some(SyntaxKind::NEWLINE) => {
142                    // Could be a blank line (consecutive newlines) or just a newline.
143                    // If the next token is also NEWLINE or WHITESPACE+NEWLINE, it's blank.
144                    self.bump();
145                }
146                Some(SyntaxKind::COMMENT) => {
147                    self.bump();
148                }
149                Some(kind) if is_directive_start(kind) => {
150                    self.parse_directive();
151                }
152                Some(SyntaxKind::ERROR) => {
153                    self.error("unexpected token");
154                    self.bump();
155                }
156                Some(_) => {
157                    // Any other token at item level is an error.
158                    self.error(format!("unexpected token: {:?}", self.current().unwrap()));
159                    self.builder.start_node(SyntaxKind::ERROR.into());
160                    self.bump();
161                    self.builder.finish_node();
162                }
163            }
164        }
165    }
166
167    /// Check if current WHITESPACE token starts a blank line.
168    /// A blank line is WHITESPACE followed by NEWLINE where we're at the start
169    /// of a line (previous token was NEWLINE or we're at position 0).
170    fn is_blank_line_start(&self) -> bool {
171        if !self.at(SyntaxKind::WHITESPACE) {
172            return false;
173        }
174        // Check that next token is NEWLINE
175        let next = self.tokens.get(self.pos + 1).map(|(k, _)| *k);
176        if next != Some(SyntaxKind::NEWLINE) {
177            return false;
178        }
179        // Check that we're at start of a line
180        if self.pos == 0 {
181            return true;
182        }
183        let prev = self.tokens[self.pos - 1].0;
184        prev == SyntaxKind::NEWLINE
185    }
186
187    /// Parse a BLANK_LINE node: WHITESPACE NEWLINE
188    fn parse_blank_line(&mut self) {
189        self.builder.start_node(SyntaxKind::BLANK_LINE.into());
190        self.bump(); // WHITESPACE
191        self.bump(); // NEWLINE
192        self.builder.finish_node();
193    }
194
195    /// DIRECTIVE → (IDENT | argument-token) argument* (SEMICOLON | block) COMMENT?
196    ///
197    /// Most directives start with IDENT, but inside `map`/`geo`/`split_clients`
198    /// blocks, entries can start with ARGUMENT, quoted strings, or VARIABLE.
199    fn parse_directive(&mut self) {
200        self.builder.start_node(SyntaxKind::DIRECTIVE.into());
201
202        // Directive name (or first token of a map/geo entry)
203        let name = self.current_text().to_string();
204        self.bump(); // IDENT or argument-like token
205
206        // Arguments (consume whitespace + argument tokens)
207        self.parse_arguments();
208
209        // Check for lua block
210        let is_lua_block = name.ends_with("_by_lua_block");
211
212        // Terminator: semicolon or block
213        match self.peek_non_trivia() {
214            Some(SyntaxKind::SEMICOLON) => {
215                self.eat_trivia();
216                self.bump(); // SEMICOLON
217                // Consume trailing whitespace + comment on same line
218                self.eat_trailing_comment();
219            }
220            Some(SyntaxKind::L_BRACE) => {
221                self.eat_trivia();
222                if is_lua_block {
223                    self.parse_raw_block();
224                } else {
225                    self.parse_block();
226                }
227            }
228            _ => {
229                // Missing terminator — error recovery
230                self.error("expected ';' or '{'");
231            }
232        }
233
234        self.builder.finish_node(); // DIRECTIVE
235    }
236
237    /// Parse directive arguments: sequences of ARGUMENT, IDENT, VARIABLE,
238    /// DOUBLE_QUOTED_STRING, SINGLE_QUOTED_STRING separated by whitespace/newlines.
239    ///
240    /// nginx treats newlines as whitespace between tokens, so arguments can
241    /// span multiple lines (e.g. `log_format ... '...'\n    "...";`).
242    fn parse_arguments(&mut self) {
243        loop {
244            // Peek past whitespace and newlines to see if next meaningful token
245            // is an argument.
246            let mut lookahead = self.pos;
247            while lookahead < self.tokens.len() {
248                let kind = self.tokens[lookahead].0;
249                if kind == SyntaxKind::WHITESPACE || kind == SyntaxKind::NEWLINE {
250                    lookahead += 1;
251                } else {
252                    break;
253                }
254            }
255            if lookahead >= self.tokens.len() {
256                break;
257            }
258            let next_kind = self.tokens[lookahead].0;
259
260            if is_argument_kind(next_kind) {
261                // Consume trivia (whitespace + newlines) before the argument
262                self.eat_trivia();
263                self.bump(); // the argument token
264            } else {
265                break;
266            }
267        }
268    }
269
270    /// Eat optional trailing whitespace + comment on the same line as a semicolon.
271    ///
272    /// WHITESPACE is only consumed when followed by COMMENT — bare trailing
273    /// whitespace belongs to the directive's `space_before_terminator` or
274    /// `trailing_whitespace` and is handled by the AST conversion layer.
275    fn eat_trailing_comment(&mut self) {
276        if self.at(SyntaxKind::WHITESPACE) {
277            let next = self.tokens.get(self.pos + 1).map(|(k, _)| *k);
278            if next == Some(SyntaxKind::COMMENT) {
279                self.bump(); // WHITESPACE
280                self.bump(); // COMMENT
281            }
282        }
283    }
284
285    /// BLOCK → L_BRACE item* R_BRACE
286    fn parse_block(&mut self) {
287        self.builder.start_node(SyntaxKind::BLOCK.into());
288        self.bump(); // L_BRACE
289
290        self.parse_items(true);
291
292        if self.at(SyntaxKind::R_BRACE) {
293            self.bump(); // R_BRACE
294        } else {
295            self.error("expected '}'");
296        }
297        self.builder.finish_node();
298    }
299
300    /// Parse a raw block for `*_by_lua_block` directives.
301    /// All tokens between L_BRACE and matching R_BRACE are consumed as-is,
302    /// tracking brace depth.
303    fn parse_raw_block(&mut self) {
304        self.builder.start_node(SyntaxKind::BLOCK.into());
305        self.bump(); // L_BRACE
306
307        let mut depth: u32 = 1;
308        while !self.at_end() && depth > 0 {
309            match self.current() {
310                Some(SyntaxKind::L_BRACE) => {
311                    depth += 1;
312                    self.bump();
313                }
314                Some(SyntaxKind::R_BRACE) => {
315                    depth -= 1;
316                    if depth == 0 {
317                        self.bump(); // closing R_BRACE
318                    } else {
319                        self.bump(); // nested R_BRACE
320                    }
321                }
322                Some(_) => {
323                    self.bump();
324                }
325                None => break,
326            }
327        }
328
329        if depth > 0 {
330            self.error("expected '}' for lua block");
331        }
332
333        self.builder.finish_node();
334    }
335}
336
337/// Returns `true` if `kind` can appear as a directive argument.
338fn is_argument_kind(kind: SyntaxKind) -> bool {
339    matches!(
340        kind,
341        SyntaxKind::ARGUMENT
342            | SyntaxKind::IDENT
343            | SyntaxKind::VARIABLE
344            | SyntaxKind::DOUBLE_QUOTED_STRING
345            | SyntaxKind::SINGLE_QUOTED_STRING
346    )
347}
348
349/// Returns `true` if `kind` can start a directive.
350///
351/// Besides IDENT (normal directives), map/geo/split_clients block entries
352/// can start with ARGUMENT, quoted strings, or VARIABLE.
353fn is_directive_start(kind: SyntaxKind) -> bool {
354    matches!(
355        kind,
356        SyntaxKind::IDENT
357            | SyntaxKind::ARGUMENT
358            | SyntaxKind::VARIABLE
359            | SyntaxKind::DOUBLE_QUOTED_STRING
360            | SyntaxKind::SINGLE_QUOTED_STRING
361    )
362}
363
// ── Tests ───────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;
    use crate::lexer_rowan::tokenize;
    use crate::syntax_kind::SyntaxNode;

    /// Tokenize and parse `source`, returning the red tree root and the errors.
    fn parse_source(source: &str) -> (SyntaxNode, Vec<SyntaxError>) {
        let tokens = tokenize(source);
        let (green, errors) = parse(tokens);
        (SyntaxNode::new_root(green), errors)
    }

    /// The tree text must equal the original source (lossless).
    fn assert_lossless(source: &str) {
        let (root, _) = parse_source(source);
        assert_eq!(
            root.text().to_string(),
            source,
            "lossless round-trip failed"
        );
    }

    /// Assert no parse errors.
    fn assert_no_errors(source: &str) -> SyntaxNode {
        let (root, errors) = parse_source(source);
        assert!(errors.is_empty(), "unexpected errors: {:?}", errors);
        root
    }

    /// Find the first DIRECTIVE child node under root.
    fn first_directive(root: &SyntaxNode) -> SyntaxNode {
        root.children()
            .find(|n| n.kind() == SyntaxKind::DIRECTIVE)
            .expect("no DIRECTIVE node found")
    }

    /// Collect child kinds of a node.
    fn child_kinds(node: &SyntaxNode) -> Vec<SyntaxKind> {
        node.children_with_tokens()
            .map(|child| child.kind())
            .collect()
    }

    // ── Basic directive tests ───────────────────────────────────────

    #[test]
    fn simple_directive() {
        let source = "listen 80;";
        let root = assert_no_errors(source);
        assert_lossless(source);

        let dir = first_directive(&root);
        let kinds = child_kinds(&dir);
        assert_eq!(
            kinds,
            vec![
                SyntaxKind::IDENT,
                SyntaxKind::WHITESPACE,
                SyntaxKind::ARGUMENT,
                SyntaxKind::SEMICOLON
            ]
        );
    }

    /// A bare-word argument such as `on` is lexed as IDENT, not ARGUMENT.
    #[test]
    fn directive_ident_arg() {
        let source = "accept_mutex on;";
        let root = assert_no_errors(source);
        assert_lossless(source);

        let dir = first_directive(&root);
        let kinds = child_kinds(&dir);
        assert_eq!(
            kinds,
            vec![
                SyntaxKind::IDENT,
                SyntaxKind::WHITESPACE,
                SyntaxKind::IDENT,
                SyntaxKind::SEMICOLON
            ]
        );
    }

    #[test]
    fn directive_no_args() {
        let source = "ip_hash;";
        let root = assert_no_errors(source);
        assert_lossless(source);

        let dir = first_directive(&root);
        let kinds = child_kinds(&dir);
        assert_eq!(kinds, vec![SyntaxKind::IDENT, SyntaxKind::SEMICOLON]);
    }

    // ── Block directive tests ───────────────────────────────────────

    #[test]
    fn block_directive() {
        let source = "server { listen 80; }";
        let root = assert_no_errors(source);
        assert_lossless(source);

        let dir = first_directive(&root);
        let kinds = child_kinds(&dir);
        // DIRECTIVE: IDENT WHITESPACE BLOCK
        assert!(kinds.contains(&SyntaxKind::IDENT));
        assert!(kinds.contains(&SyntaxKind::BLOCK));
    }

    #[test]
    fn nested_blocks() {
        let source = "http { server { listen 80; } }";
        assert_no_errors(source);
        assert_lossless(source);
    }

    // ── Multiline with indentation ──────────────────────────────────

    #[test]
    fn multiline_config() {
        let source = "http {\n    server {\n        listen 80;\n    }\n}";
        assert_no_errors(source);
        assert_lossless(source);
    }

    // ── Comments ────────────────────────────────────────────────────

    #[test]
    fn comment_standalone() {
        let source = "# this is a comment\nlisten 80;";
        assert_no_errors(source);
        assert_lossless(source);
    }

    #[test]
    fn comment_after_directive() {
        let source = "listen 80; # port";
        let root = assert_no_errors(source);
        assert_lossless(source);

        // The comment should be inside the DIRECTIVE node
        let dir = first_directive(&root);
        let kinds = child_kinds(&dir);
        assert!(kinds.contains(&SyntaxKind::COMMENT));
    }

    // ── Quoted strings and variables ────────────────────────────────

    #[test]
    fn double_quoted_string_arg() {
        let source = r#"return 200 "hello world";"#;
        let root = assert_no_errors(source);
        assert_lossless(source);

        let dir = first_directive(&root);
        let kinds = child_kinds(&dir);
        assert!(kinds.contains(&SyntaxKind::DOUBLE_QUOTED_STRING));
    }

    #[test]
    fn single_quoted_string_arg() {
        let source = "return 200 'hello world';";
        let root = assert_no_errors(source);
        assert_lossless(source);

        let dir = first_directive(&root);
        let kinds = child_kinds(&dir);
        assert!(kinds.contains(&SyntaxKind::SINGLE_QUOTED_STRING));
    }

    #[test]
    fn variable_arg() {
        let source = "set $var value;";
        let root = assert_no_errors(source);
        assert_lossless(source);

        let dir = first_directive(&root);
        let kinds = child_kinds(&dir);
        assert!(kinds.contains(&SyntaxKind::VARIABLE));
    }

    // ── Lua block ───────────────────────────────────────────────────

    #[test]
    fn lua_block() {
        let source = "content_by_lua_block {\n    ngx.say(\"hello\")\n}";
        let root = assert_no_errors(source);
        assert_lossless(source);

        let dir = first_directive(&root);
        let kinds = child_kinds(&dir);
        assert!(kinds.contains(&SyntaxKind::BLOCK));
    }

    #[test]
    fn lua_block_nested_braces() {
        let source =
            "content_by_lua_block {\n    if true then\n        local t = {1, 2}\n    end\n}";
        assert_no_errors(source);
        assert_lossless(source);
    }

    // ── Error recovery ──────────────────────────────────────────────

    #[test]
    fn missing_semicolon() {
        // nginx treats newlines as whitespace, so `listen 80\nserver_name ...`
        // is parsed as a single directive. A true missing-semicolon case
        // requires EOF without terminator.
        let source = "listen 80";
        let (_root, errors) = parse_source(source);
        assert_lossless(source);
        assert!(!errors.is_empty(), "should report missing semicolon");
        // The error is reported right after the last consumed token.
        assert_eq!(errors[0].offset, source.len());
    }

    #[test]
    fn missing_closing_brace() {
        let source = "server { listen 80;";
        let (_root, errors) = parse_source(source);
        assert_lossless(source);
        assert!(!errors.is_empty(), "should report missing '}}'");
    }

    #[test]
    fn unexpected_closing_brace() {
        let source = "} listen 80;";
        let (root, errors) = parse_source(source);
        assert_lossless(source);
        assert!(!errors.is_empty(), "should report unexpected '}}'");
        // The stray brace is the very first token.
        assert_eq!(errors[0].offset, 0);
        // Parsing recovers and still produces the following directive.
        assert_eq!(first_directive(&root).kind(), SyntaxKind::DIRECTIVE);
    }

    // ── Lossless round-trip tests ───────────────────────────────────

    #[test]
    fn lossless_empty() {
        assert_lossless("");
    }

    #[test]
    fn lossless_whitespace_only() {
        assert_lossless("  \n  \n");
    }

    #[test]
    fn lossless_complex_config() {
        let source = r#"http {
    # Main server
    server {
        listen 80;
        server_name example.com;
        location / {
            proxy_pass http://backend;
        }
    }
}
"#;
        assert_lossless(source);
        assert_no_errors(source);
    }

    #[test]
    fn lossless_blank_lines() {
        let source = "listen 80;\n\nlisten 443;\n";
        assert_lossless(source);
        assert_no_errors(source);
    }

    #[test]
    fn lossless_utf8() {
        let source = "# これは日本語コメント\nlisten 80;\n";
        assert_lossless(source);
        assert_no_errors(source);
    }

    #[test]
    fn location_with_regex() {
        let source = "location ~ ^/api/(.*) {\n    proxy_pass http://backend;\n}";
        assert_no_errors(source);
        assert_lossless(source);
    }

    #[test]
    fn multiple_directives() {
        let source = "worker_processes auto;\nevents {\n    worker_connections 1024;\n}\n";
        assert_no_errors(source);
        assert_lossless(source);
    }
}
642}