Skip to main content

brief/minify/
rust.rs

//! Rust minifier.
//!
//! Handles the lexical exotica that distinguish Rust from "generic
//! C-family":
//!
//! - Raw strings `r"…"`, `r#"…"#`, `r##"…"##` (any `#` count)
//! - Byte strings `b"…"`, raw byte strings `br"…"`, `br#"…"#`
//! - Char literals `'a'`, `'\n'`, `'\u{1234}'`, `b'a'`
//! - Lifetimes `'static`, `'a`, `'_` — distinguished from char literals
//!   by the absence of a closing apostrophe after a single ident token
//! - Block comments that **nest**: `/* outer /* inner */ outer */`
//! - Underscored numeric literals (`1_000_000`), suffixed numbers
//!   (`1u32`), hex/oct/bin (`0xFF`, `0b10`)
//! - Doc comments `///` and `//!` are line comments to us — stripped by
//!   default per spec §4.2 (which warns this is intentional). Use
//!   `@minify-keep-comments` to retain.
//!
//! Strategy: aggressive (Strategy A) — strip newlines and all
//! non-required whitespace.
20
21use super::c_common::{Token, TokenKind, emit_aggressive};
22use super::{MinifyError, MinifyOptions, MinifyOutput};
23
24pub fn minify(source: &str, opts: &MinifyOptions) -> Result<MinifyOutput, MinifyError> {
25    let toks = tokenize(source)?;
26    emit_aggressive(&toks, opts.keep_comments)
27}
28
29fn tokenize(src: &str) -> Result<Vec<Token<'_>>, MinifyError> {
30    let bytes = src.as_bytes();
31    let mut out: Vec<Token<'_>> = Vec::new();
32    let mut i = 0usize;
33    while i < bytes.len() {
34        let c = bytes[i];
35        // Whitespace and newlines.
36        if matches!(c, b' ' | b'\t' | b'\r') {
37            i += 1;
38            continue;
39        }
40        if c == b'\n' {
41            out.push(Token::new(TokenKind::Newline));
42            i += 1;
43            continue;
44        }
45        // Comments.
46        if c == b'/' && peek(bytes, i + 1) == Some(b'/') {
47            let start = i + 2;
48            let mut j = start;
49            while j < bytes.len() && bytes[j] != b'\n' {
50                j += 1;
51            }
52            out.push(Token::new(TokenKind::LineComment(&src[start..j])));
53            i = j;
54            continue;
55        }
56        if c == b'/' && peek(bytes, i + 1) == Some(b'*') {
57            // Nested block comment.
58            let body_start = i + 2;
59            let mut j = body_start;
60            let mut depth = 1usize;
61            while j < bytes.len() {
62                if bytes[j] == b'/' && peek(bytes, j + 1) == Some(b'*') {
63                    depth += 1;
64                    j += 2;
65                    continue;
66                }
67                if bytes[j] == b'*' && peek(bytes, j + 1) == Some(b'/') {
68                    depth -= 1;
69                    if depth == 0 {
70                        let body = &src[body_start..j];
71                        out.push(Token::new(TokenKind::BlockComment(body)));
72                        i = j + 2;
73                        break;
74                    }
75                    j += 2;
76                    continue;
77                }
78                j += 1;
79            }
80            if depth != 0 {
81                return Err(MinifyError::new("unterminated /* */ block comment"));
82            }
83            continue;
84        }
85        // Raw / byte / byte-raw strings.
86        if c == b'r' || c == b'b' {
87            if let Some((tok, n)) = try_scan_special_string(src, i)? {
88                out.push(Token::new(TokenKind::StrLit(tok)));
89                i += n;
90                continue;
91            }
92        }
93        // Regular string.
94        if c == b'"' {
95            let n = scan_dq_string(src, i)?;
96            out.push(Token::new(TokenKind::StrLit(&src[i..i + n])));
97            i += n;
98            continue;
99        }
100        // Char literal vs lifetime.
101        if c == b'\'' {
102            let (kind, n) = scan_quote(src, i)?;
103            match kind {
104                QuoteKind::Char => out.push(Token::new(TokenKind::StrLit(&src[i..i + n]))),
105                QuoteKind::Lifetime => out.push(Token::new(TokenKind::Word(&src[i..i + n]))),
106            }
107            i += n;
108            continue;
109        }
110        // Word: identifier or number. Identifiers can include non-ASCII
111        // (Rust permits XID identifiers); we accept all alphanumerics here
112        // because we're not validating, just lexing.
113        if is_word_start(src, i) {
114            let n = scan_word(src, i);
115            out.push(Token::new(TokenKind::Word(&src[i..i + n])));
116            i += n;
117            continue;
118        }
119        // Punctuation. Multi-char operators must be lexed as one token,
120        // because the emitter inserts a space between any two punct chars
121        // that would form a different operator if joined (the dangerous-
122        // pair table). Without this, `->` written as two single-char Puncts
123        // would render `- >` because `-`/`>` is in the table.
124        let n = scan_multi_punct(bytes, i);
125        out.push(Token::new(TokenKind::Punct(&src[i..i + n])));
126        i += n;
127    }
128    Ok(out)
129}
130
/// Returns the byte length of the punctuation token that starts at
/// `bytes[i]`.
///
/// Three-byte operators are tried first, then two-byte operators, so the
/// longest match wins (`..=` is never split into `..` and `=`). Anything
/// else counts as a single character whose width is derived from its UTF-8
/// lead byte.
///
/// The width is computed entirely in safe code. Reinterpreting the slice as
/// `str` without validation would be undefined behavior if a caller ever
/// passed bytes that are not valid UTF-8; here a stray continuation byte or
/// invalid lead byte simply counts as one byte, and a truncated sequence is
/// clamped to the bytes that are actually present.
///
/// The result is always at least 1 so the caller always makes progress.
fn scan_multi_punct(bytes: &[u8], i: usize) -> usize {
    const THREE_BYTE_OPS: [&[u8]; 3] = [b"..=", b"<<=", b">>="];
    const TWO_BYTE_OPS: [&[u8]; 20] = [
        b"->", b"=>", b"::", b"==", b"!=", b"<=", b">=", b"&&", b"||", b"<<", b">>", b"..",
        b"+=", b"-=", b"*=", b"/=", b"%=", b"&=", b"|=", b"^=",
    ];

    let rest = bytes.get(i..).unwrap_or(&[]);
    if THREE_BYTE_OPS.iter().any(|op| rest.starts_with(*op)) {
        return 3;
    }
    if TWO_BYTE_OPS.iter().any(|op| rest.starts_with(*op)) {
        return 2;
    }

    // Single character: width from the UTF-8 lead byte.
    let width: usize = match rest.first().copied() {
        Some(0xC0..=0xDF) => 2,
        Some(0xE0..=0xEF) => 3,
        Some(0xF0..=0xF7) => 4,
        _ => 1,
    };
    width.min(rest.len()).max(1)
}
171
/// Classification of a token that begins with an apostrophe, as decided by
/// `scan_quote`.
///
/// The enum is fieldless, so it derives `Copy` and equality to make results
/// cheap to pass around and directly comparable in assertions.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum QuoteKind {
    /// A character literal such as `'a'` or `'\n'`.
    Char,
    /// A lifetime such as `'static` or `'a` (no closing apostrophe).
    Lifetime,
}
177
178fn try_scan_special_string(src: &str, i: usize) -> Result<Option<(&str, usize)>, MinifyError> {
179    let bytes = src.as_bytes();
180    let mut p = i;
181    let mut byte = false;
182    if bytes[p] == b'b' {
183        // `b"…"` / `br"…"` / `br#"…"#` / `b'…'`
184        if peek(bytes, p + 1) == Some(b'\'') {
185            // Byte char literal — handled by scan_quote later.
186            return Ok(None);
187        }
188        byte = true;
189        p += 1;
190    }
191    let mut raw = false;
192    if peek(bytes, p) == Some(b'r') && p > i {
193        // `br…` so far
194        raw = true;
195        p += 1;
196    } else if !byte && peek(bytes, p) == Some(b'r') {
197        raw = true;
198        p += 1;
199    }
200    // Count `#`s if raw.
201    let mut hashes = 0usize;
202    if raw {
203        while peek(bytes, p) == Some(b'#') {
204            hashes += 1;
205            p += 1;
206        }
207    }
208    // Must now see `"`. If not, this was not a special string — back out.
209    if peek(bytes, p) != Some(b'"') {
210        // For `b` followed by ident chars (not `"`/`'`), it's just a normal
211        // identifier. Same for `r` not followed by `"` or `#"`.
212        return Ok(None);
213    }
214    // We are committed: scan the body.
215    let body_start = p + 1;
216    if raw {
217        // Find the next `"` followed by `hashes` `#`s.
218        let mut j = body_start;
219        loop {
220            if j >= bytes.len() {
221                return Err(MinifyError::new("unterminated raw string literal"));
222            }
223            if bytes[j] == b'"' {
224                // Check for matching # count.
225                let mut k = j + 1;
226                let mut found = 0;
227                while k < bytes.len() && bytes[k] == b'#' && found < hashes {
228                    found += 1;
229                    k += 1;
230                }
231                if found == hashes {
232                    let total = k - i;
233                    return Ok(Some((&src[i..i + total], total)));
234                }
235            }
236            j += 1;
237        }
238    } else {
239        // Byte-string with escapes — same as regular string.
240        let n = scan_dq_string(src, p)?;
241        let total = (p - i) + n;
242        Ok(Some((&src[i..i + total], total)))
243    }
244}
245
246fn scan_dq_string(src: &str, i: usize) -> Result<usize, MinifyError> {
247    let bytes = src.as_bytes();
248    debug_assert_eq!(bytes[i], b'"');
249    let mut j = i + 1;
250    while j < bytes.len() {
251        match bytes[j] {
252            b'\\' => {
253                j += 2;
254            }
255            b'"' => return Ok(j + 1 - i),
256            _ => {
257                j += 1;
258            }
259        }
260    }
261    Err(MinifyError::new("unterminated string literal"))
262}
263
264fn scan_quote(src: &str, i: usize) -> Result<(QuoteKind, usize), MinifyError> {
265    let bytes = src.as_bytes();
266    debug_assert_eq!(bytes[i], b'\'');
267    // Lookahead: if `'<single_char>'` or `'\<escape>'` it's a char
268    // literal. Otherwise it's a lifetime: `'<ident>`.
269    // Heuristic: scan ident chars after `'`, then check for closing `'`.
270    let after = i + 1;
271    if after >= bytes.len() {
272        return Err(MinifyError::new("unterminated `'`"));
273    }
274    // Escape-led char literal: `'\…'`
275    if bytes[after] == b'\\' {
276        let mut j = after + 1;
277        // Common escapes: '\n','\t','\\','\'','\"','\0','\xNN','\u{…}'
278        if j >= bytes.len() {
279            return Err(MinifyError::new("unterminated char escape"));
280        }
281        let esc = bytes[j];
282        j += 1;
283        if esc == b'x' {
284            j = j.saturating_add(2).min(bytes.len()); // two hex digits
285        } else if esc == b'u' && peek(bytes, j) == Some(b'{') {
286            // skip until matching '}'
287            j += 1;
288            while j < bytes.len() && bytes[j] != b'}' {
289                j += 1;
290            }
291            if j < bytes.len() {
292                j += 1;
293            }
294        }
295        if peek(bytes, j) != Some(b'\'') {
296            return Err(MinifyError::new("malformed char literal"));
297        }
298        return Ok((QuoteKind::Char, j + 1 - i));
299    }
300    // Otherwise: read ident chars.
301    let id_start = after;
302    let mut j = id_start;
303    while j < bytes.len() && is_id_continue(char_at(src, j)) {
304        j += char_at(src, j).len_utf8();
305    }
306    // If stopped at a closing quote, it's a char literal `'X…'`.
307    // Otherwise it's a lifetime.
308    if j < bytes.len() && bytes[j] == b'\'' {
309        // Char literal — but only if exactly one char between the quotes
310        // (since multi-ident is invalid). Tokenize as char regardless: it
311        // emits verbatim.
312        return Ok((QuoteKind::Char, j + 1 - i));
313    }
314    // Special-case: an empty `''` is invalid; report.
315    if j == id_start {
316        // Single non-ident char, no closing quote → not valid Rust.
317        // Try scanning a single utf-8 char and see if next is `'`.
318        let cl = char_at(src, j).len_utf8();
319        if peek(bytes, j + cl) == Some(b'\'') {
320            return Ok((QuoteKind::Char, j + cl + 1 - i));
321        }
322        return Err(MinifyError::new("malformed `'` token"));
323    }
324    Ok((QuoteKind::Lifetime, j - i))
325}
326
327fn is_word_start(src: &str, i: usize) -> bool {
328    let c = char_at(src, i);
329    c.is_alphabetic() || c == '_' || c.is_ascii_digit()
330}
331
/// Reports whether `c` may appear inside an identifier as far as this lexer
/// is concerned: an underscore or any alphanumeric character.
fn is_id_continue(c: char) -> bool {
    matches!(c, '_') || c.is_alphanumeric()
}
335
336fn scan_word(src: &str, i: usize) -> usize {
337    let mut j = i;
338    let bytes = src.as_bytes();
339    let len = bytes.len();
340    while j < len {
341        let c = char_at(src, j);
342        if c.is_alphanumeric() || c == '_' {
343            j += c.len_utf8();
344            continue;
345        }
346        // Numeric literals can have `.` (1.5), `e`/`E` exponent (already
347        // handled via alnum), and `_` separator (handled). For our purposes
348        // we keep `1.5` as one Word — but tokenizing `.` separately is also
349        // fine because `1` then `.` then `5` reassembles via the no-space
350        // rule (1.5 has prev=`1` next=`.` no_space, then prev=`.` next=`5`
351        // no_space). However `..` (range) and `1.0` collide: `1.0..5.0`
352        // would render `1.0..5.0` either way. Safer to treat the digit run
353        // and decimal as one token.
354        if c == '.' && j > i {
355            // Only consume the `.` if followed by a digit (so `1.5` stays
356            // word, but `1..5` produces Word(1) then Punct(..) Word(5)).
357            let next = peek(bytes, j + 1);
358            if matches!(next, Some(b'0'..=b'9')) {
359                j += 1;
360                continue;
361            }
362        }
363        break;
364    }
365    j - i
366}
367
/// Returns the byte at index `i`, or `None` when `i` is past the end of
/// `bytes`.
fn peek(bytes: &[u8], i: usize) -> Option<u8> {
    if i < bytes.len() {
        Some(bytes[i])
    } else {
        None
    }
}
371
/// Returns the character that starts at byte offset `i` of `src`, or `'\0'`
/// when `i` is exactly the end of the string.
///
/// `i` must be a valid slice start for `src` (on a character boundary and
/// not past the end); otherwise the slicing panics.
fn char_at(src: &str, i: usize) -> char {
    match src[i..].chars().next() {
        Some(ch) => ch,
        None => '\0',
    }
}
375
#[cfg(test)]
mod tests {
    use super::*;

    /// Minifies `source` with default options and returns the output body.
    fn minified(source: &str) -> String {
        let output = minify(source, &MinifyOptions::default());
        output.unwrap().body
    }

    /// Minifies `source` with comment retention enabled and returns the
    /// output body.
    fn minified_with_comments(source: &str) -> String {
        let opts = MinifyOptions {
            keep_comments: true,
        };
        minify(source, &opts).unwrap().body
    }

    /// Asserts that default minification of `source` yields exactly
    /// `expected`.
    fn assert_minifies_to(source: &str, expected: &str) {
        assert_eq!(minified(source), expected);
    }

    /// Asserts that default minification of `source` reports an error.
    fn assert_rejected(source: &str) {
        assert!(minify(source, &MinifyOptions::default()).is_err());
    }

    #[test]
    fn basic_function() {
        assert_minifies_to(
            "fn add(a: i32, b: i32) -> i32 {\n    a + b\n}\n",
            "fn add(a:i32,b:i32)->i32{a+b}",
        );
    }

    #[test]
    fn strips_line_comments() {
        assert_minifies_to("fn x() {\n    // hi\n    1\n}\n", "fn x(){1}");
    }

    #[test]
    fn strips_doc_comments() {
        // Doc comments are treated as ordinary line comments and removed by
        // default.
        assert_minifies_to("/// docs go here\nfn x() {}\n", "fn x(){}");
    }

    #[test]
    fn nested_block_comment_stripped() {
        assert_minifies_to("fn x() { /* outer /* inner */ outer */ 1 }", "fn x(){1}");
    }

    #[test]
    fn keep_comments_converts_line_to_block() {
        let opts = MinifyOptions {
            keep_comments: true,
        };
        let result = minify("fn x() {\n    // hello\n    1\n}\n", &opts).unwrap();
        assert!(result.body.contains("/* hello*/"));
        assert_eq!(result.warnings.len(), 1);
    }

    #[test]
    fn keep_comments_preserves_block_comment() {
        let body = minified_with_comments("fn x() { /* hello */ 1 }");
        assert!(body.contains("/* hello */"));
    }

    #[test]
    fn raw_string_simple() {
        assert_minifies_to(r#"let s = r"hello";"#, r#"let s=r"hello";"#);
    }

    #[test]
    fn raw_string_with_hashes() {
        // The source is `let s = r##"con"tains"##;`. Raw strings have no
        // backslash escapes, so the inner `"` is an ordinary character.
        let body = minified("let s = r##\"con\"tains\"##;");
        assert!(body.contains("r##\"con\"tains\"##"), "got: {}", body);
    }

    #[test]
    fn byte_string() {
        assert_minifies_to(r#"let s = b"\xff\x00";"#, r#"let s=b"\xff\x00";"#);
    }

    #[test]
    fn raw_byte_string() {
        let body = minified(r#"let s = br"raw bytes";"#);
        assert!(body.contains(r#"br"raw bytes""#));
    }

    #[test]
    fn lifetime_preserved() {
        assert_minifies_to(
            "fn foo<'a>(x: &'a str) -> &'a str { x }",
            "fn foo<'a>(x:&'a str)->&'a str{x}",
        );
    }

    #[test]
    fn static_lifetime() {
        assert_minifies_to(
            "let s: &'static str = \"hi\";",
            "let s:&'static str=\"hi\";",
        );
    }

    #[test]
    fn char_literal() {
        let body = minified("let c = 'a'; let d = '\\n'; let e = '\\u{1F600}';");
        for literal in ["'a'", "'\\n'", "'\\u{1F600}'"] {
            assert!(body.contains(literal));
        }
    }

    #[test]
    fn byte_char() {
        assert_minifies_to("let c = b'a';", "let c=b'a';");
    }

    #[test]
    fn underscored_number() {
        assert_minifies_to("let n = 1_000_000;", "let n=1_000_000;");
    }

    #[test]
    fn hex_number_with_suffix() {
        assert_minifies_to("let n = 0xFF_u32;", "let n=0xFF_u32;");
    }

    #[test]
    fn float_literal() {
        assert_minifies_to("let f = 1.5e10;", "let f=1.5e10;");
    }

    #[test]
    fn double_colon_preserved() {
        assert_minifies_to(
            "use std::collections::HashMap;",
            "use std::collections::HashMap;",
        );
    }

    #[test]
    fn arrow_preserved() {
        assert_minifies_to("fn x() -> i32 { 0 }", "fn x()->i32{0}");
    }

    #[test]
    fn fat_arrow_preserved() {
        assert_minifies_to(
            "match x { 1 => true, _ => false }",
            "match x{1=>true,_=>false}",
        );
    }

    #[test]
    fn unicode_identifier() {
        assert_minifies_to("let π = 3.14;", "let π=3.14;");
    }

    #[test]
    fn range_operator() {
        assert_minifies_to("let r = 1..5;", "let r=1..5;");
    }

    #[test]
    fn unterminated_string_errors() {
        assert_rejected("let s = \"unterminated");
    }

    #[test]
    fn unterminated_block_comment_errors() {
        assert_rejected("fn x() { /* no end");
    }

    #[test]
    fn nested_block_comment_unbalanced_errors() {
        assert_rejected("fn x() { /* /* */ }");
    }
}