Skip to main content

brief/minify/
javascript.rs

//! JavaScript / TypeScript minifier.
//!
//! The JS lexer is the most complex of the v0.3 set:
//!
//! - **Template literals** `` `…${expr}…${expr}…` ``. The body is
//!   literal except inside `${…}` interpolations, which contain
//!   arbitrary JS code (and may themselves contain template literals,
//!   recursively). We track interpolation by scanning for `${`, then
//!   counting `{`/`}` until the brace balance returns to zero.
//! - **Regex literals** `/pattern/flags`. Lexically ambiguous with
//!   division. We disambiguate via the previous-significant-token
//!   heuristic: regex iff the previous non-whitespace, non-comment
//!   token was a punctuator (other than `)`/`]`/`}`/`++`/`--`) or one
//!   of the expression-position keywords (`return`, `typeof`, `in`,
//!   `of`, `delete`, `void`, `new`, `throw`, `await`, `yield`,
//!   `instanceof`, `case`, `do`, `else`).
//! - **ASI**. JavaScript can implicitly insert `;` at end of certain
//!   lines. Stripping such newlines without inserting an explicit `;`
//!   changes semantics (`return\n{x:1}` returns undefined; `return{x:1}`
//!   returns the object). Without a real parser we **preserve newlines**
//!   verbatim and trust the engine's ASI rules.
//! - TypeScript adds type syntax (`x: T`, `<T>`, `as T`) but the
//!   tokenization is unchanged from JS, so the same lexer handles both.
//!
//! Strategy: conservative (Strategy B). Newlines preserved.

27use super::c_common::{Token, TokenKind, emit_conservative};
28use super::{MinifyError, MinifyOptions, MinifyOutput};
29
30pub fn minify(source: &str, opts: &MinifyOptions) -> Result<MinifyOutput, MinifyError> {
31    let toks = tokenize(source)?;
32    emit_conservative(&toks, opts.keep_comments)
33}
34
35fn tokenize(src: &str) -> Result<Vec<Token<'_>>, MinifyError> {
36    let bytes = src.as_bytes();
37    let mut out: Vec<Token<'_>> = Vec::new();
38    let mut i = 0usize;
39    while i < bytes.len() {
40        let c = bytes[i];
41        if matches!(c, b' ' | b'\t' | b'\r') {
42            i += 1;
43            continue;
44        }
45        if c == b'\n' {
46            out.push(Token::new(TokenKind::Newline));
47            i += 1;
48            continue;
49        }
50        if c == b'/' && peek(bytes, i + 1) == Some(b'/') {
51            let start = i + 2;
52            let mut j = start;
53            while j < bytes.len() && bytes[j] != b'\n' {
54                j += 1;
55            }
56            out.push(Token::new(TokenKind::LineComment(&src[start..j])));
57            i = j;
58            continue;
59        }
60        if c == b'/' && peek(bytes, i + 1) == Some(b'*') {
61            let body_start = i + 2;
62            let mut j = body_start;
63            let mut found = false;
64            while j + 1 < bytes.len() {
65                if bytes[j] == b'*' && bytes[j + 1] == b'/' {
66                    found = true;
67                    break;
68                }
69                j += 1;
70            }
71            if !found {
72                return Err(MinifyError::new("unterminated /* */ block comment"));
73            }
74            out.push(Token::new(TokenKind::BlockComment(&src[body_start..j])));
75            i = j + 2;
76            continue;
77        }
78        // Regex disambiguation: a `/` may start a regex or be division.
79        if c == b'/' && regex_is_expected(&out) {
80            let n = scan_regex(src, i)?;
81            out.push(Token::new(TokenKind::Regex(&src[i..i + n])));
82            i += n;
83            continue;
84        }
85        if c == b'"' || c == b'\'' {
86            let n = scan_quoted_string(src, i, c)?;
87            out.push(Token::new(TokenKind::StrLit(&src[i..i + n])));
88            i += n;
89            continue;
90        }
91        if c == b'`' {
92            let n = scan_template(src, i)?;
93            out.push(Token::new(TokenKind::Template(&src[i..i + n])));
94            i += n;
95            continue;
96        }
97        if is_word_start(src, i) {
98            let n = scan_word(src, i);
99            out.push(Token::new(TokenKind::Word(&src[i..i + n])));
100            i += n;
101            continue;
102        }
103        let n = scan_multi_punct(bytes, i);
104        out.push(Token::new(TokenKind::Punct(&src[i..i + n])));
105        i += n;
106    }
107    Ok(out)
108}
109
110/// Heuristic: should a `/` here start a regex literal? Yes iff the previous
111/// "significant" token (skipping whitespace/comments/newlines) is one
112/// after which an expression is expected.
113fn regex_is_expected(prev_tokens: &[Token<'_>]) -> bool {
114    // Scan backward past comments/newlines.
115    for tok in prev_tokens.iter().rev() {
116        match &tok.kind {
117            TokenKind::LineComment(_) | TokenKind::BlockComment(_) | TokenKind::Newline => continue,
118            TokenKind::Word(s) => {
119                return matches!(
120                    *s,
121                    "return"
122                        | "typeof"
123                        | "in"
124                        | "of"
125                        | "delete"
126                        | "void"
127                        | "new"
128                        | "throw"
129                        | "await"
130                        | "yield"
131                        | "instanceof"
132                        | "case"
133                        | "do"
134                        | "else"
135                );
136            }
137            TokenKind::Punct(s) => {
138                // After `)`, `]`, `}`, `++`, `--`, an expression has
139                // ended; `/` is division. Anything else, expect regex.
140                return !matches!(*s, ")" | "]" | "}" | "++" | "--");
141            }
142            TokenKind::StrLit(_)
143            | TokenKind::Template(_)
144            | TokenKind::Regex(_)
145            | TokenKind::Preproc(_) => return false,
146        }
147    }
148    // No previous significant token: top of source. An expression at the
149    // very start could begin with a regex literal (rare but legal).
150    true
151}
152
153fn scan_regex(src: &str, i: usize) -> Result<usize, MinifyError> {
154    let bytes = src.as_bytes();
155    debug_assert_eq!(bytes[i], b'/');
156    let mut j = i + 1;
157    let mut in_class = false;
158    while j < bytes.len() {
159        match bytes[j] {
160            b'\\' => {
161                j += 2;
162                continue;
163            }
164            b'[' => {
165                in_class = true;
166                j += 1;
167            }
168            b']' if in_class => {
169                in_class = false;
170                j += 1;
171            }
172            b'/' if !in_class => {
173                // skip closing /, then flags (Latin letters)
174                j += 1;
175                while j < bytes.len() && bytes[j].is_ascii_alphabetic() {
176                    j += 1;
177                }
178                return Ok(j - i);
179            }
180            b'\n' => return Err(MinifyError::new("newline in regex literal")),
181            _ => j += 1,
182        }
183    }
184    Err(MinifyError::new("unterminated regex literal"))
185}
186
187fn scan_quoted_string(src: &str, i: usize, quote: u8) -> Result<usize, MinifyError> {
188    let bytes = src.as_bytes();
189    debug_assert_eq!(bytes[i], quote);
190    let mut j = i + 1;
191    while j < bytes.len() {
192        if bytes[j] == b'\\' {
193            // Line continuation: `\<nl>` is allowed in JS strings.
194            if peek(bytes, j + 1) == Some(b'\n') {
195                j += 2;
196                continue;
197            }
198            j += 2;
199            continue;
200        }
201        if bytes[j] == quote {
202            return Ok(j + 1 - i);
203        }
204        if bytes[j] == b'\n' {
205            return Err(MinifyError::new("newline in string literal"));
206        }
207        j += 1;
208    }
209    Err(MinifyError::new("unterminated string literal"))
210}
211
212fn scan_template(src: &str, i: usize) -> Result<usize, MinifyError> {
213    let bytes = src.as_bytes();
214    debug_assert_eq!(bytes[i], b'`');
215    let mut j = i + 1;
216    while j < bytes.len() {
217        match bytes[j] {
218            b'\\' => {
219                j += 2;
220            }
221            b'`' => return Ok(j + 1 - i),
222            b'$' if peek(bytes, j + 1) == Some(b'{') => {
223                // Skip `${`, then content until matching `}` (counting
224                // braces, accounting for nested templates/strings).
225                j += 2;
226                let mut depth = 1usize;
227                while j < bytes.len() && depth > 0 {
228                    match bytes[j] {
229                        b'{' => {
230                            depth += 1;
231                            j += 1;
232                        }
233                        b'}' => {
234                            depth -= 1;
235                            j += 1;
236                        }
237                        b'`' => {
238                            // Nested template literal — recurse.
239                            let inner = scan_template(src, j)?;
240                            j += inner;
241                        }
242                        b'"' | b'\'' => {
243                            let q = bytes[j];
244                            j += scan_quoted_string(src, j, q)?;
245                        }
246                        b'/' if peek(bytes, j + 1) == Some(b'/') => {
247                            while j < bytes.len() && bytes[j] != b'\n' {
248                                j += 1;
249                            }
250                        }
251                        b'/' if peek(bytes, j + 1) == Some(b'*') => {
252                            j += 2;
253                            while j + 1 < bytes.len() && !(bytes[j] == b'*' && bytes[j + 1] == b'/')
254                            {
255                                j += 1;
256                            }
257                            if j + 1 >= bytes.len() {
258                                return Err(MinifyError::new("unterminated /* */ inside template"));
259                            }
260                            j += 2;
261                        }
262                        b'\\' => {
263                            j += 2;
264                        }
265                        _ => j += 1,
266                    }
267                }
268                if depth != 0 {
269                    return Err(MinifyError::new("unterminated `${…}` in template"));
270                }
271            }
272            _ => j += 1,
273        }
274    }
275    Err(MinifyError::new("unterminated template literal"))
276}
277
278fn is_word_start(src: &str, i: usize) -> bool {
279    let c = char_at(src, i);
280    c.is_alphabetic() || c == '_' || c == '$' || c.is_ascii_digit()
281}
282
283fn scan_word(src: &str, i: usize) -> usize {
284    let bytes = src.as_bytes();
285    let mut j = i;
286    while j < bytes.len() {
287        let c = char_at(src, j);
288        if c.is_alphanumeric() || c == '_' || c == '$' {
289            j += c.len_utf8();
290            continue;
291        }
292        if c == '.' {
293            // Decimal: 1.5; scientific: 1e3 (handled by alnum already).
294            let next = peek(bytes, j + 1);
295            if matches!(next, Some(b'0'..=b'9')) && j > i {
296                j += 1;
297                continue;
298            }
299        }
300        break;
301    }
302    j - i
303}
304
/// Returns the byte length of the punctuator that starts at `bytes[i]`.
///
/// The longest operator from the table below that matches at `i` wins
/// (4 bytes, then 3, then 2). When none matches, the result is the width of
/// the single character at `i`, derived from its UTF-8 lead byte, so the
/// caller can always slice the source on a character boundary.
///
/// The logical-assignment operators `&&=` and `||=` are three bytes long and
/// are matched as such. The fallback is computed without `unsafe`: a stray
/// continuation byte, an invalid lead byte or an out-of-range `i` yields 1,
/// and the width never exceeds the number of bytes that remain.
fn scan_multi_punct(bytes: &[u8], i: usize) -> usize {
    // Ordered longest-first so that the first hit is the longest match.
    const OPERATORS: &[&str] = &[
        // 4 bytes
        ">>>=",
        // 3 bytes
        "===", "!==", "...", ">>>", "**=", "<<=", ">>=", "??=", "&&=", "||=",
        // 2 bytes
        "=>", "==", "!=", "<=", ">=", "&&", "||", "??", "?.", "++", "--", "<<", ">>", "**",
        "+=", "-=", "*=", "/=", "%=", "&=", "|=", "^=",
    ];

    let rest = bytes.get(i..).unwrap_or(&[]);
    if let Some(op) = OPERATORS
        .iter()
        .find(|op| rest.starts_with(op.as_bytes()))
    {
        return op.len();
    }

    // Single character: its width follows from the UTF-8 lead byte.
    let width = match rest.first().copied() {
        Some(0xC0..=0xDF) => 2,
        Some(0xE0..=0xEF) => 3,
        Some(0xF0..=0xF7) => 4,
        _ => 1,
    };
    // Never report more bytes than remain, and always make progress.
    width.min(rest.len().max(1))
}
358
/// Returns the byte at index `i`, or `None` when `i` is past the end.
fn peek(bytes: &[u8], i: usize) -> Option<u8> {
    if i < bytes.len() {
        Some(bytes[i])
    } else {
        None
    }
}
362
/// Decodes the character that starts at byte offset `i` of `src`.
///
/// Yields `'\0'` when `i` is exactly the end of the string.
///
/// # Panics
///
/// Panics when `i` is past the end of `src` or not on a character boundary,
/// because the string is sliced at `i`.
fn char_at(src: &str, i: usize) -> char {
    let tail = &src[i..];
    match tail.chars().next() {
        Some(ch) => ch,
        None => '\0',
    }
}
366
#[cfg(test)]
mod tests {
    use super::*;

    /// Minifies `src` with default options and returns the output body.
    fn minified(src: &str) -> String {
        let output = minify(src, &MinifyOptions::default());
        output.unwrap().body
    }

    /// Reports whether minifying `src` with default options fails.
    fn fails(src: &str) -> bool {
        minify(src, &MinifyOptions::default()).is_err()
    }

    // --- whitespace and comments -------------------------------------

    #[test]
    fn basic_function() {
        // Newlines preserved (ASI), horizontal ws stripped.
        assert_eq!(
            minified("function add(a, b) {\n    return a + b;\n}\n"),
            "function add(a,b){\nreturn a+b;\n}\n"
        );
    }

    #[test]
    fn strips_line_comment() {
        assert_eq!(minified("// hi\nlet x = 1;\n"), "\nlet x=1;\n");
    }

    #[test]
    fn strips_block_comment_inline() {
        assert_eq!(minified("let x = /* y */ 1;\n"), "let x=1;\n");
    }

    // --- template literals -------------------------------------------

    #[test]
    fn template_literal() {
        let out = minified("const s = `hello, ${name}!`;\n");
        assert!(out.contains("`hello, ${name}!`"), "got: {}", out);
    }

    #[test]
    fn nested_template() {
        let out = minified("const s = `a${`b${c}d`}e`;\n");
        assert!(out.contains("`a${`b${c}d`}e`"), "got: {}", out);
    }

    #[test]
    fn template_with_string_in_interpolation() {
        let out = minified("const s = `${\"hi\"}`;\n");
        assert!(out.contains("`${\"hi\"}`"), "got: {}", out);
    }

    // --- regex vs. division ------------------------------------------

    #[test]
    fn regex_literal() {
        assert_eq!(
            minified("const re = /[a-z]+/gi;\n"),
            "const re=/[a-z]+/gi;\n"
        );
    }

    #[test]
    fn regex_after_return() {
        let out = minified("function f() { return /\\d+/.test(x); }\n");
        assert!(out.contains("/\\d+/"), "got: {}", out);
    }

    #[test]
    fn division_after_value() {
        assert_eq!(minified("const x = a / b;\n"), "const x=a/b;\n");
    }

    #[test]
    fn division_after_paren() {
        assert_eq!(minified("const x = (a + b) / c;\n"), "const x=(a+b)/c;\n");
    }

    // --- ASI ---------------------------------------------------------

    #[test]
    fn return_then_object_preserves_newline() {
        // ASI hazard: `return\n{x:1}` returns undefined. Dropping the
        // newline would change behavior, so the emitter must keep it.
        let out = minified("function f() {\n    return\n    {x: 1};\n}\n");
        assert!(
            out.contains("return\n"),
            "newline preserved after return: {:?}",
            out
        );
    }

    // --- operators ---------------------------------------------------

    #[test]
    fn arrow_function() {
        assert_eq!(minified("const f = (x) => x + 1;\n"), "const f=(x)=>x+1;\n");
    }

    #[test]
    fn nullish_coalescing() {
        assert_eq!(minified("const x = a ?? b;\n"), "const x=a??b;\n");
    }

    #[test]
    fn optional_chaining() {
        assert_eq!(minified("const x = obj?.prop;\n"), "const x=obj?.prop;\n");
    }

    #[test]
    fn strict_equality() {
        assert_eq!(minified("if (a === b) {}\n"), "if(a===b){}\n");
    }

    // --- TypeScript --------------------------------------------------

    #[test]
    fn typescript_type_annotation() {
        // Newline-free single-line input → single-line output.
        assert_eq!(
            minified("function f(x: number): string { return String(x); }\n"),
            "function f(x:number):string{return String(x);}\n"
        );
    }

    #[test]
    fn typescript_generic() {
        assert_eq!(
            minified("function f<T>(x: T): T { return x; }\n"),
            "function f<T>(x:T):T{return x;}\n"
        );
    }

    // --- strings and identifiers -------------------------------------

    #[test]
    fn double_quoted_string_with_escape() {
        assert_eq!(
            minified("const s = \"a\\\"b\";\n"),
            "const s=\"a\\\"b\";\n"
        );
    }

    #[test]
    fn dollar_in_identifier() {
        assert_eq!(minified("const $foo = 1;\n"), "const $foo=1;\n");
    }

    // --- options -----------------------------------------------------

    #[test]
    fn keep_comments_converts_line() {
        let opts = MinifyOptions {
            keep_comments: true,
        };
        let result = minify("// hi\nlet x = 1;\n", &opts).unwrap();
        assert!(result.body.contains("/* hi*/"));
        assert_eq!(result.warnings.len(), 1);
    }

    // --- error paths -------------------------------------------------

    #[test]
    fn unterminated_string() {
        assert!(fails("const s = \"oops"));
    }

    #[test]
    fn unterminated_template() {
        assert!(fails("const s = `oops"));
    }

    #[test]
    fn unterminated_regex() {
        assert!(fails("const r = /oops"));
    }

    // --- regex edge cases --------------------------------------------

    #[test]
    fn regex_with_class() {
        let out = minified("const r = /[/]/g;\n");
        assert!(out.contains("/[/]/g"), "got: {}", out);
    }

    #[test]
    fn regex_at_start_of_file() {
        let out = minified("/abc/.test(s)\n");
        assert!(out.starts_with("/abc/"), "got: {}", out);
    }
}