Skip to main content

brief/minify/
c_cpp.rs

1//! C and C++ minifier.
2//!
3//! Distinguishing features:
4//!
5//! - Preprocessor directives (`#include`, `#define`, …) are
6//!   line-sensitive — the C preprocessor reads input line-by-line and
7//!   directives must end at a newline (or be continued via `\<nl>`). We
8//!   tokenize each `#`-line as a single `Preproc` token and the emitter
9//!   places it on its own line.
10//! - String prefixes: `L"…"`, `u8"…"`, `u"…"`, `U"…"` (wide / UTF
11//!   literals).
12//! - C++ raw strings `R"delim(…)delim"` (and prefixed forms like
13//!   `LR"delim(…)delim"`, `u8R"…"`, etc.) — only when `is_cpp` is true.
14//! - Block comments do **not** nest.
15//!
16//! Strategy: aggressive (Strategy A) **between** preprocessor lines.
17//! Around `#`-lines we still emit a leading and trailing newline.
18//!
19//! `is_cpp` toggles the C++-only features (raw strings). For C tags
20//! (`c`, `h`) it's false; for C++ tags (`cpp`, `c++`, `cc`, `cxx`,
21//! `hpp`, `hxx`) it's true.
22
23use super::c_common::{Token, TokenKind};
24use super::{MinifyError, MinifyOptions, MinifyOutput, MinifyWarning};
25
26pub fn minify(
27    source: &str,
28    opts: &MinifyOptions,
29    is_cpp: bool,
30) -> Result<MinifyOutput, MinifyError> {
31    let toks = tokenize(source, is_cpp)?;
32    emit(&toks, opts.keep_comments)
33}
34
35fn tokenize(src: &str, is_cpp: bool) -> Result<Vec<Token<'_>>, MinifyError> {
36    let bytes = src.as_bytes();
37    let mut out: Vec<Token<'_>> = Vec::new();
38    let mut i = 0usize;
39    let mut at_line_start = true;
40    while i < bytes.len() {
41        let c = bytes[i];
42        if matches!(c, b' ' | b'\t' | b'\r') {
43            i += 1;
44            continue;
45        }
46        if c == b'\n' {
47            out.push(Token::new(TokenKind::Newline));
48            i += 1;
49            at_line_start = true;
50            continue;
51        }
52        // Preprocessor directive: `#` at start of line (after optional
53        // whitespace) — already consumed above. Captures everything up to
54        // the newline, plus any backslash-newline continuations.
55        if at_line_start && c == b'#' {
56            let start = i;
57            let mut j = i;
58            while j < bytes.len() {
59                if bytes[j] == b'\\' && peek(bytes, j + 1) == Some(b'\n') {
60                    j += 2;
61                    continue;
62                }
63                if bytes[j] == b'\\'
64                    && peek(bytes, j + 1) == Some(b'\r')
65                    && peek(bytes, j + 2) == Some(b'\n')
66                {
67                    j += 3;
68                    continue;
69                }
70                if bytes[j] == b'\n' {
71                    break;
72                }
73                j += 1;
74            }
75            out.push(Token::new(TokenKind::Preproc(&src[start..j])));
76            i = j;
77            // Don't consume the newline — let the main loop handle it so
78            // the emitter sees a Newline after the Preproc.
79            at_line_start = false;
80            continue;
81        }
82        at_line_start = false;
83        if c == b'/' && peek(bytes, i + 1) == Some(b'/') {
84            let start = i + 2;
85            let mut j = start;
86            while j < bytes.len() && bytes[j] != b'\n' {
87                j += 1;
88            }
89            out.push(Token::new(TokenKind::LineComment(&src[start..j])));
90            i = j;
91            continue;
92        }
93        if c == b'/' && peek(bytes, i + 1) == Some(b'*') {
94            let body_start = i + 2;
95            let mut j = body_start;
96            let mut found = false;
97            while j + 1 < bytes.len() {
98                if bytes[j] == b'*' && bytes[j + 1] == b'/' {
99                    found = true;
100                    break;
101                }
102                j += 1;
103            }
104            if !found {
105                return Err(MinifyError::new("unterminated /* */ block comment"));
106            }
107            out.push(Token::new(TokenKind::BlockComment(&src[body_start..j])));
108            i = j + 2;
109            continue;
110        }
111        // String / raw-string detection. C++ raw strings use the `R"d(…)d"`
112        // form; combinations include `LR`, `uR`, `UR`, `u8R`. Plain strings
113        // can be prefixed `L`, `u`, `u8`, `U`.
114        if let Some(n) = try_scan_string(src, i, is_cpp)? {
115            out.push(Token::new(TokenKind::StrLit(&src[i..i + n])));
116            i += n;
117            continue;
118        }
119        if c == b'\'' {
120            let n = scan_char_literal(src, i)?;
121            out.push(Token::new(TokenKind::StrLit(&src[i..i + n])));
122            i += n;
123            continue;
124        }
125        if is_word_start(src, i) {
126            let n = scan_word(src, i);
127            out.push(Token::new(TokenKind::Word(&src[i..i + n])));
128            i += n;
129            continue;
130        }
131        let n = scan_multi_punct(bytes, i);
132        out.push(Token::new(TokenKind::Punct(&src[i..i + n])));
133        i += n;
134    }
135    Ok(out)
136}
137
138fn emit(tokens: &[Token<'_>], keep_comments: bool) -> Result<MinifyOutput, MinifyError> {
139    let mut out = String::new();
140    let mut warnings: Vec<MinifyWarning> = Vec::new();
141    let mut prev_emit_last: Option<char> = None;
142    let mut last_was_preproc = false;
143    for tok in tokens {
144        match &tok.kind {
145            TokenKind::Newline => {
146                // Newlines are always discarded UNLESS the previous emitted
147                // token was a preprocessor line, in which case we keep one
148                // terminating newline so `#include <…>` is on its own line.
149                if last_was_preproc && !out.ends_with('\n') {
150                    out.push('\n');
151                    prev_emit_last = None;
152                    last_was_preproc = false;
153                }
154            }
155            TokenKind::LineComment(body) => {
156                if !keep_comments {
157                    continue;
158                }
159                let block = format!("/*{}*/", body);
160                push_with_space(&mut out, &mut prev_emit_last, &block);
161                warnings.push(MinifyWarning::LineCommentConverted);
162            }
163            TokenKind::BlockComment(body) => {
164                if !keep_comments {
165                    continue;
166                }
167                let block = format!("/*{}*/", body);
168                push_with_space(&mut out, &mut prev_emit_last, &block);
169            }
170            TokenKind::Word(s)
171            | TokenKind::Punct(s)
172            | TokenKind::StrLit(s)
173            | TokenKind::Template(s)
174            | TokenKind::Regex(s) => {
175                push_with_space(&mut out, &mut prev_emit_last, s);
176                last_was_preproc = false;
177            }
178            TokenKind::Preproc(s) => {
179                if !out.is_empty() && !out.ends_with('\n') {
180                    out.push('\n');
181                }
182                out.push_str(s);
183                prev_emit_last = None;
184                last_was_preproc = true;
185            }
186        }
187    }
188    if last_was_preproc && !out.ends_with('\n') {
189        out.push('\n');
190    }
191    Ok(MinifyOutput {
192        body: out,
193        warnings,
194    })
195}
196
197fn push_with_space(out: &mut String, prev_emit_last: &mut Option<char>, s: &str) {
198    if s.is_empty() {
199        return;
200    }
201    use super::c_common::needs_space;
202    if let Some(prev) = *prev_emit_last {
203        if let Some(next) = s.chars().next() {
204            if needs_space(prev, next) {
205                out.push(' ');
206            }
207        }
208    }
209    out.push_str(s);
210    *prev_emit_last = s.chars().next_back();
211}
212
213fn try_scan_string(src: &str, i: usize, is_cpp: bool) -> Result<Option<usize>, MinifyError> {
214    let bytes = src.as_bytes();
215    // Possible prefixes: `L`, `u`, `u8`, `U`. Each may also be combined
216    // with `R` for C++ raw strings (`LR`, `uR`, `u8R`, `UR`).
217    let mut p = i;
218    let mut had_prefix = false;
219    // u8 first
220    if peek(bytes, p) == Some(b'u') && peek(bytes, p + 1) == Some(b'8') {
221        // Could be u8" or u8R" — but only commit if a quote/R follows.
222        let after = p + 2;
223        if peek(bytes, after) == Some(b'"')
224            || (is_cpp && peek(bytes, after) == Some(b'R') && peek(bytes, after + 1) == Some(b'"'))
225        {
226            p = after;
227            had_prefix = true;
228        }
229    } else if matches!(peek(bytes, p), Some(b'L') | Some(b'u') | Some(b'U')) {
230        let after = p + 1;
231        if peek(bytes, after) == Some(b'"')
232            || (is_cpp && peek(bytes, after) == Some(b'R') && peek(bytes, after + 1) == Some(b'"'))
233        {
234            p = after;
235            had_prefix = true;
236        }
237    }
238    let raw = is_cpp && peek(bytes, p) == Some(b'R') && peek(bytes, p + 1) == Some(b'"');
239    if raw {
240        // R"delim(…)delim"
241        p += 1; // skip R
242        debug_assert_eq!(bytes[p], b'"');
243        let delim_start = p + 1;
244        let mut j = delim_start;
245        while j < bytes.len() && bytes[j] != b'(' {
246            j += 1;
247        }
248        if j >= bytes.len() {
249            return Err(MinifyError::new("malformed raw string"));
250        }
251        let delim = &bytes[delim_start..j];
252        let body_start = j + 1;
253        // find `)delim"`
254        let mut k = body_start;
255        loop {
256            if k >= bytes.len() {
257                return Err(MinifyError::new("unterminated raw string"));
258            }
259            if bytes[k] == b')' && k + 1 + delim.len() < bytes.len() {
260                if &bytes[k + 1..k + 1 + delim.len()] == delim
261                    && bytes.get(k + 1 + delim.len()) == Some(&b'"')
262                {
263                    let total = k + 1 + delim.len() + 1 - i;
264                    return Ok(Some(total));
265                }
266            }
267            k += 1;
268        }
269    }
270    if peek(bytes, p) == Some(b'"') {
271        let n = scan_dq_string(src, p)?;
272        return Ok(Some(p + n - i));
273    }
274    if had_prefix {
275        // We thought we had a prefix but no quote followed — back out.
276        return Ok(None);
277    }
278    Ok(None)
279}
280
281fn scan_dq_string(src: &str, i: usize) -> Result<usize, MinifyError> {
282    let bytes = src.as_bytes();
283    debug_assert_eq!(bytes[i], b'"');
284    let mut j = i + 1;
285    while j < bytes.len() {
286        match bytes[j] {
287            b'\\' => j += 2,
288            b'"' => return Ok(j + 1 - i),
289            b'\n' => return Err(MinifyError::new("newline in string literal")),
290            _ => j += 1,
291        }
292    }
293    Err(MinifyError::new("unterminated string literal"))
294}
295
296fn scan_char_literal(src: &str, i: usize) -> Result<usize, MinifyError> {
297    let bytes = src.as_bytes();
298    debug_assert_eq!(bytes[i], b'\'');
299    let mut j = i + 1;
300    while j < bytes.len() {
301        if bytes[j] == b'\\' {
302            j += 2;
303            continue;
304        }
305        if bytes[j] == b'\'' {
306            return Ok(j + 1 - i);
307        }
308        if bytes[j] == b'\n' {
309            return Err(MinifyError::new("newline in char literal"));
310        }
311        j += 1;
312    }
313    Err(MinifyError::new("unterminated char literal"))
314}
315
/// Reports whether the character at byte offset `i` can begin a `Word`
/// token: an alphabetic character (Unicode-aware), `_`, or an ASCII digit.
///
/// Returns `false` when `i` is at the end of `src`. `i` must lie on a
/// character boundary.
fn is_word_start(src: &str, i: usize) -> bool {
    match src[i..].chars().next() {
        Some(first) => first == '_' || first.is_ascii_digit() || first.is_alphabetic(),
        None => false,
    }
}
320
/// Returns the byte length of the identifier or number starting at byte
/// offset `i`.
///
/// A word is a run of alphanumeric characters (Unicode-aware) and `_`. Two
/// extra characters may appear inside it:
///
/// * `.` when it is not the first character and an ASCII digit follows, so
///   `3.14f` stays one token;
/// * `'` when the word began with an ASCII digit and an ASCII alphanumeric
///   follows — the digit separator of C++14 / C23 (`1'000'000`,
///   `0xFF'FF`). Without this the `'` would later be taken for the start of
///   a character literal and valid input would be rejected or mis-tokenized.
///
/// `i` must lie on a character boundary. Returns 0 when no word character is
/// found at `i`.
fn scan_word(src: &str, i: usize) -> usize {
    let bytes = src.as_bytes();
    // Digit separators are only meaningful inside numeric literals.
    let numeric = bytes.get(i).map_or(false, |b| b.is_ascii_digit());
    let mut j = i;
    while let Some(c) = src[j..].chars().next() {
        if c.is_alphanumeric() || c == '_' {
            j += c.len_utf8();
            continue;
        }
        let next = bytes.get(j + 1).copied();
        let continues = match c {
            '.' => j > i && next.map_or(false, |b| b.is_ascii_digit()),
            '\'' => numeric && next.map_or(false, |b| b.is_ascii_alphanumeric()),
            _ => false,
        };
        if !continues {
            break;
        }
        // Both `.` and `'` are single-byte characters.
        j += 1;
    }
    j - i
}
341
/// Returns the byte length of the punctuation token starting at `bytes[i]`.
///
/// The longest matching operator wins: the three-byte operators `<<=`, `>>=`,
/// `...` and `->*` are tried first, then the two-byte operators. Anything
/// else is a single character, whose length is derived from its UTF-8 lead
/// byte so that a multi-byte character is never split.
///
/// This is implemented entirely with safe code: the function accepts an
/// arbitrary byte slice, so it must not assume the bytes are valid UTF-8.
/// A byte that is not a valid lead byte (or `i` at/after the end) yields 1.
fn scan_multi_punct(bytes: &[u8], i: usize) -> usize {
    if let Some(three) = bytes.get(i..i + 3) {
        if matches!(three, b"<<=" | b">>=" | b"..." | b"->*") {
            return 3;
        }
    }
    if let Some(two) = bytes.get(i..i + 2) {
        if matches!(
            two,
            b"->" | b"::"
                | b"=="
                | b"!="
                | b"<="
                | b">="
                | b"&&"
                | b"||"
                | b"<<"
                | b">>"
                | b"+="
                | b"-="
                | b"*="
                | b"/="
                | b"%="
                | b"&="
                | b"|="
                | b"^="
                | b"++"
                | b"--"
                | b".*"
        ) {
            return 2;
        }
    }
    // Length of one UTF-8 encoded character, read off its lead byte.
    match bytes.get(i).copied() {
        Some(0xC0..=0xDF) => 2,
        Some(0xE0..=0xEF) => 3,
        Some(0xF0..=0xF7) => 4,
        _ => 1,
    }
}
382
/// Returns the byte at index `i`, or `None` when `i` is past the end of
/// `bytes`. Never panics, which lets callers look ahead freely.
fn peek(bytes: &[u8], i: usize) -> Option<u8> {
    if i < bytes.len() {
        Some(bytes[i])
    } else {
        None
    }
}
386
/// Decodes the character that starts at byte offset `i` of `src`.
///
/// Returns `'\0'` when `i` is exactly the end of the string. Panics (via the
/// slice operation) when `i` is beyond the end or not on a character
/// boundary.
fn char_at(src: &str, i: usize) -> char {
    let tail = &src[i..];
    match tail.chars().next() {
        Some(ch) => ch,
        None => '\0',
    }
}
390
#[cfg(test)]
mod tests {
    use super::*;

    /// Minifies `s` with default options and returns the output body,
    /// panicking if the minifier reports an error.
    fn run(s: &str, is_cpp: bool) -> String {
        minify(s, &MinifyOptions::default(), is_cpp)
            .map(|output| output.body)
            .unwrap()
    }

    /// Minifies `s` as C.
    fn min_c(s: &str) -> String {
        run(s, false)
    }

    /// Minifies `s` as C++.
    fn min_cpp(s: &str) -> String {
        run(s, true)
    }

    #[test]
    fn c_basic() {
        let minified = min_c("int main() {\n    return 0;\n}\n");
        assert_eq!(minified, "int main(){return 0;}");
    }

    #[test]
    fn c_preprocessor_kept_on_own_line() {
        let out = min_c("#include <stdio.h>\nint main() { return 0; }\n");
        assert!(
            out.starts_with("#include <stdio.h>\n"),
            "preproc on own line: {:?}",
            out
        );
        assert!(out.contains("int main(){return 0;}"));
    }

    #[test]
    fn c_multiple_preprocessor_lines() {
        let out = min_c("#include <stdio.h>\n#include <stdlib.h>\nint x;\n");
        assert_eq!(out, "#include <stdio.h>\n#include <stdlib.h>\nint x;");
    }

    #[test]
    fn c_define_with_continuation() {
        // The whole `#define` line, including its `\<nl>` continuations,
        // is one Preproc token; the next `int y = 1;` is on a new line.
        let out = min_c("#define FOO(x) \\\n    do { x; } while (0)\nint y = 1;\n");
        assert!(out.starts_with("#define FOO(x) \\\n    do { x; } while (0)\n"));
        assert!(out.ends_with("int y=1;"));
    }

    #[test]
    fn c_strips_line_comment() {
        assert_eq!(min_c("// hi\nint x;\n"), "int x;");
    }

    #[test]
    fn c_strips_block_comment() {
        assert_eq!(min_c("/* hi */ int x;\n"), "int x;");
    }

    #[test]
    fn cpp_template_double_close() {
        // C++11+ parsers correctly disambiguate `>>` in template contexts
        // from the right-shift operator, so collapsing the source `>> ` to
        // `>>` is safe. The lexer emits `>>` as one Punct because the
        // source already had them adjacent.
        assert_eq!(min_cpp("vector<vector<int>> v;"), "vector<vector<int>>v;");
    }

    #[test]
    fn cpp_template_with_space_at_close() {
        // If the source separated the closing `> >` with a space, the
        // lexer sees them as two separate Puncts; the emitter then injects
        // a space because `>` `>` is in the dangerous-pair table.
        let out = min_cpp("vector<vector<int> > v;");
        assert!(out.contains("> >"), "got: {}", out);
    }

    #[test]
    fn cpp_raw_string() {
        let out = min_cpp(r#"const char* s = R"x(hi)x";"#);
        assert!(out.contains(r#"R"x(hi)x""#), "got: {}", out);
    }

    #[test]
    fn cpp_wide_string() {
        assert!(min_cpp("const wchar_t* s = L\"hi\";").contains("L\"hi\""));
    }

    #[test]
    fn cpp_u8_string() {
        assert!(min_cpp("const char* s = u8\"hi\";").contains("u8\"hi\""));
    }

    #[test]
    fn cpp_arrow_member() {
        assert_eq!(min_cpp("p->x = 1;"), "p->x=1;");
    }

    #[test]
    fn cpp_scope_resolution() {
        assert_eq!(min_cpp("std::string s;"), "std::string s;");
    }

    #[test]
    fn c_keep_comments() {
        // A kept `//` comment is rewritten as a block comment and reported
        // through exactly one warning.
        let opts = MinifyOptions {
            keep_comments: true,
        };
        let r = minify("// hi\nint x;\n", &opts, false).unwrap();
        assert!(r.body.starts_with("/* hi*/"));
        assert_eq!(r.warnings.len(), 1);
    }

    #[test]
    fn c_unterminated_block_comment() {
        let result = minify("/* unterminated", &MinifyOptions::default(), false);
        assert!(result.is_err());
    }

    #[test]
    fn c_unterminated_string() {
        let result = minify("char* s = \"oops", &MinifyOptions::default(), false);
        assert!(result.is_err());
    }
}
529}