Skip to main content

brief/minify/
java.rs

1//! Java minifier.
2//!
3//! Distinguishing features vs the generic C-family base:
4//!
5//! - Text blocks `"""…"""` (Java 13+) — content can span multiple lines
6//!   and contain unescaped `"`. We treat the whole literal as one
7//!   StrLit, scanning until the next `"""` not preceded by `\`.
8//! - Annotations: `@Override`, `@SuppressWarnings("foo")`. The `@` is
9//!   not a comment marker — it's part of an identifier-like token. We
10//!   emit `@<ident>` as one Word.
11//! - Block comments do **not** nest (unlike Rust).
12//!
13//! Strategy: aggressive (Strategy A). Java requires explicit semicolons,
14//! so newlines can be stripped freely.
15
16use super::c_common::{Token, TokenKind, emit_aggressive};
17use super::{MinifyError, MinifyOptions, MinifyOutput};
18
19pub fn minify(source: &str, opts: &MinifyOptions) -> Result<MinifyOutput, MinifyError> {
20    let toks = tokenize(source)?;
21    emit_aggressive(&toks, opts.keep_comments)
22}
23
24fn tokenize(src: &str) -> Result<Vec<Token<'_>>, MinifyError> {
25    let bytes = src.as_bytes();
26    let mut out: Vec<Token<'_>> = Vec::new();
27    let mut i = 0usize;
28    while i < bytes.len() {
29        let c = bytes[i];
30        if matches!(c, b' ' | b'\t' | b'\r') {
31            i += 1;
32            continue;
33        }
34        if c == b'\n' {
35            out.push(Token::new(TokenKind::Newline));
36            i += 1;
37            continue;
38        }
39        if c == b'/' && peek(bytes, i + 1) == Some(b'/') {
40            let start = i + 2;
41            let mut j = start;
42            while j < bytes.len() && bytes[j] != b'\n' {
43                j += 1;
44            }
45            out.push(Token::new(TokenKind::LineComment(&src[start..j])));
46            i = j;
47            continue;
48        }
49        if c == b'/' && peek(bytes, i + 1) == Some(b'*') {
50            let body_start = i + 2;
51            let mut j = body_start;
52            while j + 1 < bytes.len() {
53                if bytes[j] == b'*' && bytes[j + 1] == b'/' {
54                    let body = &src[body_start..j];
55                    out.push(Token::new(TokenKind::BlockComment(body)));
56                    i = j + 2;
57                    break;
58                }
59                j += 1;
60            }
61            if i <= body_start {
62                return Err(MinifyError::new("unterminated /* */ block comment"));
63            }
64            continue;
65        }
66        // Text block `"""…"""`.
67        if c == b'"' && peek(bytes, i + 1) == Some(b'"') && peek(bytes, i + 2) == Some(b'"') {
68            let start = i;
69            let mut j = i + 3;
70            loop {
71                if j + 2 >= bytes.len() {
72                    return Err(MinifyError::new("unterminated text block"));
73                }
74                if bytes[j] == b'"' && bytes[j + 1] == b'"' && bytes[j + 2] == b'"' {
75                    // Unescaped triple-quote? It's the close. Java allows
76                    // `\"""` for an in-text triple-quote sequence.
77                    let escaped = j > start + 3 && bytes[j - 1] == b'\\';
78                    if !escaped {
79                        j += 3;
80                        break;
81                    }
82                }
83                j += 1;
84            }
85            out.push(Token::new(TokenKind::StrLit(&src[start..j])));
86            i = j;
87            continue;
88        }
89        if c == b'"' {
90            let n = scan_dq_string(src, i)?;
91            out.push(Token::new(TokenKind::StrLit(&src[i..i + n])));
92            i += n;
93            continue;
94        }
95        if c == b'\'' {
96            let n = scan_char_literal(src, i)?;
97            out.push(Token::new(TokenKind::StrLit(&src[i..i + n])));
98            i += n;
99            continue;
100        }
101        // Annotation `@<ident>`. The `@` is part of the token so it stays
102        // bound to the ident on emit (no risk of `@ Override` rendering).
103        if c == b'@' && peek(bytes, i + 1).map_or(false, is_ident_start_byte) {
104            let mut j = i + 1;
105            while j < bytes.len() && is_ident_continue_byte(bytes[j]) {
106                j += 1;
107            }
108            out.push(Token::new(TokenKind::Word(&src[i..j])));
109            i = j;
110            continue;
111        }
112        if is_word_start(src, i) {
113            let n = scan_word(src, i);
114            out.push(Token::new(TokenKind::Word(&src[i..i + n])));
115            i += n;
116            continue;
117        }
118        let n = scan_multi_punct(bytes, i);
119        out.push(Token::new(TokenKind::Punct(&src[i..i + n])));
120        i += n;
121    }
122    Ok(out)
123}
124
125fn scan_dq_string(src: &str, i: usize) -> Result<usize, MinifyError> {
126    let bytes = src.as_bytes();
127    debug_assert_eq!(bytes[i], b'"');
128    let mut j = i + 1;
129    while j < bytes.len() {
130        match bytes[j] {
131            b'\\' => {
132                j += 2;
133            }
134            b'"' => return Ok(j + 1 - i),
135            b'\n' => return Err(MinifyError::new("newline in string literal")),
136            _ => j += 1,
137        }
138    }
139    Err(MinifyError::new("unterminated string literal"))
140}
141
142fn scan_char_literal(src: &str, i: usize) -> Result<usize, MinifyError> {
143    let bytes = src.as_bytes();
144    debug_assert_eq!(bytes[i], b'\'');
145    let mut j = i + 1;
146    if j >= bytes.len() {
147        return Err(MinifyError::new("unterminated char literal"));
148    }
149    if bytes[j] == b'\\' {
150        j += 2;
151        // simple escapes; numeric escapes (\uXXXX, octal) handled greedily
152        while j < bytes.len() && bytes[j] != b'\'' && bytes[j] != b'\n' {
153            j += 1;
154        }
155    } else {
156        // single Unicode char
157        j += char_at(src, j).len_utf8();
158    }
159    if peek(bytes, j) != Some(b'\'') {
160        return Err(MinifyError::new("malformed char literal"));
161    }
162    Ok(j + 1 - i)
163}
164
/// Returns `true` when `b` is an ASCII letter, `_` or `$`.
fn is_ident_start_byte(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'$')
}
/// Returns `true` when `b` is an ASCII letter or digit, `_` or `$`.
fn is_ident_continue_byte(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'$')
}
171
172fn is_word_start(src: &str, i: usize) -> bool {
173    let c = char_at(src, i);
174    c.is_alphabetic() || c == '_' || c == '$' || c.is_ascii_digit()
175}
176
177fn scan_word(src: &str, i: usize) -> usize {
178    let bytes = src.as_bytes();
179    let mut j = i;
180    while j < bytes.len() {
181        let c = char_at(src, j);
182        if c.is_alphanumeric() || c == '_' || c == '$' {
183            j += c.len_utf8();
184            continue;
185        }
186        if c == '.' {
187            // 1.5, 1.5e10
188            let next = peek(bytes, j + 1);
189            if matches!(next, Some(b'0'..=b'9')) && j > i {
190                j += 1;
191                continue;
192            }
193        }
194        break;
195    }
196    j - i
197}
198
/// Returns the byte length of the punctuation token starting at `bytes[i]`.
///
/// Three-byte operators are tried first, then two-byte operators; when
/// neither matches, the result is the width of the single UTF-8 encoded
/// character at `i`, derived from its lead byte. The fallback never
/// reports more bytes than remain in `bytes` and never less than 1, so
/// the caller always makes progress — including on input that is not
/// valid UTF-8 or when `i` is at or past the end.
///
/// Operators absent from the tables (for example `>>>=` or `::`) are
/// returned as their longest listed prefix and lex as several tokens.
fn scan_multi_punct(bytes: &[u8], i: usize) -> usize {
    // NOTE(review): `->>` does not look like a Java operator; it is kept
    // so tokenization is unchanged — confirm whether it can be dropped.
    const THREE: [&[u8]; 5] = [b"<<=", b">>=", b">>>", b"...", b"->>"];
    const TWO: [&[u8]; 19] = [
        b"->", b"==", b"!=", b"<=", b">=", b"&&", b"||", b"<<", b">>", b"+=", b"-=", b"*=",
        b"/=", b"%=", b"&=", b"|=", b"^=", b"++", b"--",
    ];

    let rest = bytes.get(i..).unwrap_or(&[]);
    if THREE.iter().any(|&op| rest.starts_with(op)) {
        return 3;
    }
    if TWO.iter().any(|&op| rest.starts_with(op)) {
        return 2;
    }
    // Width of one UTF-8 scalar, read from its lead byte. ASCII, stray
    // continuation bytes and invalid lead bytes all count as one byte.
    let width: usize = match rest.first().copied() {
        Some(0xC0..=0xDF) => 2,
        Some(0xE0..=0xEF) => 3,
        Some(0xF0..=0xF7) => 4,
        _ => 1,
    };
    width.min(rest.len()).max(1)
}
237
/// Returns the byte at index `i`, or `None` when `i` is past the end.
fn peek(bytes: &[u8], i: usize) -> Option<u8> {
    if i < bytes.len() {
        Some(bytes[i])
    } else {
        None
    }
}
241
/// Returns the character that starts at byte offset `i` of `src`.
///
/// Yields `'\0'` when there is no such character: `i` is at or past the
/// end of `src`, or `i` does not fall on a UTF-8 character boundary.
/// The function never panics.
fn char_at(src: &str, i: usize) -> char {
    // `str::get` returns `None` instead of panicking on a bad range.
    src.get(i..)
        .and_then(|rest| rest.chars().next())
        .unwrap_or('\0')
}
245
#[cfg(test)]
mod tests {
    use super::*;

    /// Minifies `s` with default options and returns the output body,
    /// panicking if minification fails.
    fn min(s: &str) -> String {
        let output = minify(s, &MinifyOptions::default()).unwrap();
        output.body
    }

    #[test]
    fn class_with_method() {
        let minified = min(
            "public class Foo {\n    public int add(int a, int b) {\n        return a + b;\n    }\n}\n",
        );
        assert_eq!(
            minified,
            "public class Foo{public int add(int a,int b){return a+b;}}"
        );
    }

    #[test]
    fn strips_line_comment() {
        assert_eq!(min("// hi\nint x;\n"), "int x;");
    }

    #[test]
    fn strips_block_comment() {
        assert_eq!(min("/* hi */ int x;\n"), "int x;");
    }

    #[test]
    fn annotation_preserved() {
        assert_eq!(
            min("@Override public void f() {}"),
            "@Override public void f(){}"
        );
    }

    #[test]
    fn annotation_with_args() {
        assert_eq!(
            min("@SuppressWarnings(\"unchecked\") void f() {}"),
            "@SuppressWarnings(\"unchecked\")void f(){}"
        );
    }

    #[test]
    fn text_block_preserved() {
        // The multi-line literal must survive byte-for-byte.
        let minified = min("String s = \"\"\"\nhello\nworld\n\"\"\";\n");
        assert!(minified.contains("\"\"\"\nhello\nworld\n\"\"\""));
    }

    #[test]
    fn string_with_escape() {
        assert_eq!(min("String s = \"a\\\"b\";"), "String s=\"a\\\"b\";");
    }

    #[test]
    fn char_literal() {
        assert_eq!(min("char c = 'a';"), "char c='a';");
    }

    #[test]
    fn keep_comments_converts_line() {
        let opts = MinifyOptions {
            keep_comments: true,
        };
        let r = minify("// hi\nint x;\n", &opts).unwrap();
        assert!(r.body.starts_with("/* hi*/"));
        assert_eq!(r.warnings.len(), 1);
    }

    #[test]
    fn dollar_in_identifier() {
        assert_eq!(min("int $x = 1;"), "int $x=1;");
    }

    #[test]
    fn unterminated_string_errors() {
        let result = minify("String s = \"oops", &MinifyOptions::default());
        assert!(result.is_err());
    }

    #[test]
    fn unterminated_block_comment_errors() {
        let result = minify("/* unterminated", &MinifyOptions::default());
        assert!(result.is_err());
    }

    #[test]
    fn lambda_arrow() {
        assert_eq!(min("x -> x + 1"), "x->x+1");
    }

    #[test]
    fn diamond_operator() {
        // `>` to `x` (punct → word) needs no space; `>>=`/`<<` collisions
        // are handled at lex time so we don't emit `>>` accidentally.
        assert_eq!(
            min("List<Integer> xs = new ArrayList<>();"),
            "List<Integer>xs=new ArrayList<>();"
        );
    }
}