Skip to main content

brief/minify/
sql.rs

1//! SQL minifier (Postgres-flavored, the canonical dialect per spec §4.5).
2//!
3//! Distinguishing features:
4//!
5//! - Line comments are `--`, not `//`.
6//! - Block comments `/* … */` **nest** in Postgres.
7//! - String literals are single-quoted with `''` doubled-escape; there
8//!   are no backslash escapes in standard SQL.
9//! - Identifier quoting `"…"` with `""` doubled-escape (Postgres).
10//! - Dollar-quoted strings `$tag$ … $tag$` (Postgres) — verbatim, with
11//!   arbitrary user-defined `tag` (which may be empty: `$$ … $$`).
12//!
13//! Strategy: aggressive (Strategy A). SQL statements are explicitly
14//! `;`-terminated, so newlines can be stripped freely.
15
16use super::c_common::{Token, TokenKind, emit_aggressive};
17use super::{MinifyError, MinifyOptions, MinifyOutput};
18
19pub fn minify(source: &str, opts: &MinifyOptions) -> Result<MinifyOutput, MinifyError> {
20    let toks = tokenize(source)?;
21    emit_aggressive(&toks, opts.keep_comments)
22}
23
24fn tokenize(src: &str) -> Result<Vec<Token<'_>>, MinifyError> {
25    let bytes = src.as_bytes();
26    let mut out: Vec<Token<'_>> = Vec::new();
27    let mut i = 0usize;
28    while i < bytes.len() {
29        let c = bytes[i];
30        if matches!(c, b' ' | b'\t' | b'\r') {
31            i += 1;
32            continue;
33        }
34        if c == b'\n' {
35            out.push(Token::new(TokenKind::Newline));
36            i += 1;
37            continue;
38        }
39        if c == b'-' && peek(bytes, i + 1) == Some(b'-') {
40            let start = i + 2;
41            let mut j = start;
42            while j < bytes.len() && bytes[j] != b'\n' {
43                j += 1;
44            }
45            out.push(Token::new(TokenKind::LineComment(&src[start..j])));
46            i = j;
47            continue;
48        }
49        if c == b'/' && peek(bytes, i + 1) == Some(b'*') {
50            // Postgres allows nested /* */.
51            let body_start = i + 2;
52            let mut j = body_start;
53            let mut depth = 1usize;
54            while j < bytes.len() {
55                if bytes[j] == b'/' && peek(bytes, j + 1) == Some(b'*') {
56                    depth += 1;
57                    j += 2;
58                    continue;
59                }
60                if bytes[j] == b'*' && peek(bytes, j + 1) == Some(b'/') {
61                    depth -= 1;
62                    if depth == 0 {
63                        let body = &src[body_start..j];
64                        out.push(Token::new(TokenKind::BlockComment(body)));
65                        i = j + 2;
66                        break;
67                    }
68                    j += 2;
69                    continue;
70                }
71                j += 1;
72            }
73            if depth != 0 {
74                return Err(MinifyError::new("unterminated /* */ block comment"));
75            }
76            continue;
77        }
78        if c == b'\'' {
79            let n = scan_sq_string(src, i)?;
80            out.push(Token::new(TokenKind::StrLit(&src[i..i + n])));
81            i += n;
82            continue;
83        }
84        if c == b'"' {
85            let n = scan_quoted_ident(src, i)?;
86            out.push(Token::new(TokenKind::StrLit(&src[i..i + n])));
87            i += n;
88            continue;
89        }
90        if c == b'$' {
91            // Could be a dollar-quoted string ($tag$ ... $tag$) or a
92            // positional parameter ($1, $2, ...).
93            if let Some((tag_end, body_end)) = try_scan_dollar_quoted(bytes, i) {
94                out.push(Token::new(TokenKind::StrLit(&src[i..body_end])));
95                i = body_end;
96                let _ = tag_end;
97                continue;
98            }
99            // Positional param `$1`, etc. — treat the run as a Word.
100            let mut j = i + 1;
101            while j < bytes.len() && bytes[j].is_ascii_digit() {
102                j += 1;
103            }
104            if j > i + 1 {
105                out.push(Token::new(TokenKind::Word(&src[i..j])));
106                i = j;
107                continue;
108            }
109            // Bare `$` not followed by tag/digit — emit as Punct.
110            out.push(Token::new(TokenKind::Punct(&src[i..i + 1])));
111            i += 1;
112            continue;
113        }
114        if is_word_start(src, i) {
115            let n = scan_word(src, i);
116            out.push(Token::new(TokenKind::Word(&src[i..i + n])));
117            i += n;
118            continue;
119        }
120        let n = scan_multi_punct(bytes, i);
121        out.push(Token::new(TokenKind::Punct(&src[i..i + n])));
122        i += n;
123    }
124    Ok(out)
125}
126
127fn scan_sq_string(src: &str, i: usize) -> Result<usize, MinifyError> {
128    let bytes = src.as_bytes();
129    debug_assert_eq!(bytes[i], b'\'');
130    let mut j = i + 1;
131    while j < bytes.len() {
132        if bytes[j] == b'\'' {
133            // Doubled `''` → escaped quote, keep scanning.
134            if peek(bytes, j + 1) == Some(b'\'') {
135                j += 2;
136                continue;
137            }
138            return Ok(j + 1 - i);
139        }
140        j += 1;
141    }
142    Err(MinifyError::new("unterminated string literal"))
143}
144
145fn scan_quoted_ident(src: &str, i: usize) -> Result<usize, MinifyError> {
146    let bytes = src.as_bytes();
147    debug_assert_eq!(bytes[i], b'"');
148    let mut j = i + 1;
149    while j < bytes.len() {
150        if bytes[j] == b'"' {
151            if peek(bytes, j + 1) == Some(b'"') {
152                j += 2;
153                continue;
154            }
155            return Ok(j + 1 - i);
156        }
157        j += 1;
158    }
159    Err(MinifyError::new("unterminated quoted identifier"))
160}
161
/// Tries to scan a Postgres dollar-quoted string (`$tag$ … $tag$`) whose
/// opening `$` sits at byte index `i`.
///
/// On success returns `(tag_end, body_end)` where:
/// - `tag_end` is the byte index of the second `$` of the opening `$tag$`
/// - `body_end` is the byte index just past the closing `$tag$`
///
/// Returns `None` when the text at `i` is not a dollar-quote opener, or when
/// no matching closing delimiter exists later in `bytes`.
///
/// The tag follows Postgres' rules for unquoted identifiers minus `$`: it
/// may be empty, must not start with a digit, and otherwise consists of
/// ASCII letters, digits, `_` and non-ASCII (high-bit) bytes. Rejecting a
/// leading digit keeps `$1$…` from being mistaken for a quoted string when
/// `$1` is a positional parameter.
fn try_scan_dollar_quoted(bytes: &[u8], i: usize) -> Option<(usize, usize)> {
    /// Bytes allowed as the first byte of a tag.
    fn is_tag_start(b: u8) -> bool {
        b.is_ascii_alphabetic() || b == b'_' || b >= 0x80
    }
    /// Bytes allowed after the first byte of a tag.
    fn is_tag_cont(b: u8) -> bool {
        is_tag_start(b) || b.is_ascii_digit()
    }

    debug_assert_eq!(bytes[i], b'$');
    let tag_start = i + 1;
    let after_dollar = bytes.get(tag_start..)?;
    let tag_len = after_dollar
        .iter()
        .enumerate()
        .take_while(|&(n, &b)| if n == 0 { is_tag_start(b) } else { is_tag_cont(b) })
        .count();
    // The tag must be closed by `$` right away; anything else means this is
    // a positional parameter or a stray `$`.
    if after_dollar.get(tag_len) != Some(&b'$') {
        return None;
    }
    let tag_end = tag_start + tag_len; // position of the closing `$` of the opener
    let delimiter = &bytes[i..=tag_end]; // the full `$tag$`
    let body_start = tag_end + 1;
    // The first later occurrence of the exact delimiter closes the string.
    // `delimiter` is at least two bytes long, so `windows` never sees 0.
    bytes[body_start..]
        .windows(delimiter.len())
        .position(|window| window == delimiter)
        .map(|offset| (tag_end, body_start + offset + delimiter.len()))
}
205
206fn is_word_start(src: &str, i: usize) -> bool {
207    let c = char_at(src, i);
208    c.is_alphabetic() || c == '_' || c.is_ascii_digit()
209}
210
211fn scan_word(src: &str, i: usize) -> usize {
212    let bytes = src.as_bytes();
213    let mut j = i;
214    while j < bytes.len() {
215        let c = char_at(src, j);
216        if c.is_alphanumeric() || c == '_' {
217            j += c.len_utf8();
218            continue;
219        }
220        if c == '.' {
221            // Decimal: 1.5
222            let next = peek(bytes, j + 1);
223            if matches!(next, Some(b'0'..=b'9')) && j > i {
224                j += 1;
225                continue;
226            }
227        }
228        break;
229    }
230    j - i
231}
232
/// Returns the byte length of the punctuation token starting at `i`.
///
/// The two-character operators `<=`, `>=`, `<>`, `!=`, `||` and `::` are
/// kept together as one token. Anything else is a single character, whose
/// width is read from its UTF-8 lead byte so that a non-ASCII symbol is
/// never split in the middle.
///
/// The width is computed directly from the bytes, with no `unsafe`
/// reinterpretation of `bytes` as `str`. For well-formed UTF-8 with `i` on a
/// character boundary the result equals `char::len_utf8` of that character.
/// When `i` is at the end of `bytes` the result is 1.
fn scan_multi_punct(bytes: &[u8], i: usize) -> usize {
    const TWO_BYTE_OPS: [&[u8]; 6] = [b"<=", b">=", b"<>", b"!=", b"||", b"::"];

    if let Some(pair) = bytes.get(i..i + 2) {
        if TWO_BYTE_OPS.contains(&pair) {
            return 2;
        }
    }
    // UTF-8 lead bytes: 0xxxxxxx = 1 byte, 110xxxxx = 2, 1110xxxx = 3,
    // 11110xxx = 4.
    match bytes.get(i) {
        Some(&lead) if lead >= 0xF0 => 4,
        Some(&lead) if lead >= 0xE0 => 3,
        Some(&lead) if lead >= 0xC0 => 2,
        _ => 1,
    }
}
244
/// Returns the byte at index `i`, or `None` when `i` is past the end.
fn peek(bytes: &[u8], i: usize) -> Option<u8> {
    if i < bytes.len() {
        Some(bytes[i])
    } else {
        None
    }
}
248
/// Decodes the character that starts at byte offset `i` of `src`.
///
/// Returns `'\0'` when `i` is exactly the end of the string.
///
/// # Panics
///
/// Panics if `i` is beyond the end of `src` or not on a character boundary
/// (the usual `str` slicing rules).
fn char_at(src: &str, i: usize) -> char {
    match src[i..].chars().next() {
        Some(ch) => ch,
        None => '\0',
    }
}
252
#[cfg(test)]
mod tests {
    use super::*;

    /// Minifies `s` with default options and returns the output body,
    /// panicking if minification fails.
    fn min(s: &str) -> String {
        let output = minify(s, &MinifyOptions::default()).unwrap();
        output.body
    }

    #[test]
    fn select_with_whitespace() {
        // `*` is punctuation rather than a word character, so the expected
        // output keeps no space on either side of it: `SELECT*FROM`.
        assert_eq!(
            min("SELECT  *\n  FROM users\n  WHERE id = 1;"),
            "SELECT*FROM users WHERE id=1;"
        );
    }

    #[test]
    fn line_comment_stripped() {
        assert_eq!(min("-- comment\nSELECT 1;"), "SELECT 1;");
    }

    #[test]
    fn block_comment_stripped() {
        assert_eq!(min("/* hi */ SELECT 1;"), "SELECT 1;");
    }

    #[test]
    fn nested_block_comment() {
        // The inner `*/` must not end the outer comment.
        assert_eq!(min("/* outer /* inner */ outer */ SELECT 1;"), "SELECT 1;");
    }

    #[test]
    fn doubled_quote_in_string() {
        // The `''` escape stays inside the literal, and no space is kept
        // between the keyword and the opening quote.
        assert_eq!(min("SELECT 'O''Brien';"), "SELECT'O''Brien';");
    }

    #[test]
    fn quoted_identifier() {
        // The space inside the quotes survives; the spaces around the
        // quoted identifier do not.
        assert_eq!(min("SELECT \"my col\" FROM t;"), "SELECT\"my col\"FROM t;");
    }

    #[test]
    fn dollar_quoted_string() {
        // The `$$ … $$` body must come through verbatim.
        let minified = min("DO $$ BEGIN RAISE NOTICE 'hi'; END $$;");
        assert!(
            minified.contains("$$ BEGIN RAISE NOTICE 'hi'; END $$"),
            "{}",
            minified
        );
    }

    #[test]
    fn dollar_quoted_with_tag() {
        // Same as above with a named tag; the embedded `"` is plain text.
        let minified = min("SELECT $tag$ raw \"text\" $tag$;");
        assert!(minified.contains("$tag$ raw \"text\" $tag$"));
    }

    #[test]
    fn positional_param() {
        // `$1` is kept together as a single token.
        assert_eq!(
            min("SELECT * FROM t WHERE id = $1;"),
            "SELECT*FROM t WHERE id=$1;"
        );
    }

    #[test]
    fn keep_comments_converts() {
        // With `keep_comments` on, the `--` comment comes out in block
        // form and exactly one warning is reported.
        let options = MinifyOptions {
            keep_comments: true,
        };
        let result = minify("-- hi\nSELECT 1;", &options).unwrap();
        assert!(result.body.starts_with("/* hi*/"));
        assert_eq!(result.warnings.len(), 1);
    }

    #[test]
    fn unterminated_string() {
        let result = minify("SELECT 'oops", &MinifyOptions::default());
        assert!(result.is_err());
    }

    #[test]
    fn unterminated_block_comment() {
        let result = minify("/* unterminated", &MinifyOptions::default());
        assert!(result.is_err());
    }

    #[test]
    fn case_preservation() {
        // We do not normalize keyword case (spec §4.5 — opt-in only).
        let sql = "select Foo from Bar;";
        assert_eq!(min(sql), sql);
    }

    #[test]
    fn double_dash_only_at_start_of_word() {
        // The tokenizer treats every `--` outside a literal as the start
        // of a line comment, regardless of what precedes it. `5--1` is
        // therefore `5` followed by a comment, not `5 - -1`.
        assert_eq!(min("SELECT 5--1\nFROM t;"), "SELECT 5 FROM t;");
    }
}