oxc_mangler/lib.rs
1use std::iter::{self, repeat_with};
2
3use fixedbitset::FixedBitSet;
4use itertools::Itertools;
5use keep_names::collect_name_symbols;
6use rustc_hash::FxHashSet;
7
8use base54::base54;
9use oxc_allocator::{Allocator, Vec};
10use oxc_ast::ast::{Declaration, Program, Statement};
11use oxc_data_structures::inline_string::InlineString;
12use oxc_index::Idx;
13use oxc_semantic::{AstNodes, Scoping, Semantic, SemanticBuilder, SymbolId};
14use oxc_span::Atom;
15
16pub(crate) mod base54;
17mod keep_names;
18
19pub use keep_names::MangleOptionsKeepNames;
20
21#[derive(Default, Debug, Clone, Copy)]
22pub struct MangleOptions {
23 /// Pass true to mangle names declared in the top level scope.
24 ///
25 /// Default: `false`
26 pub top_level: bool,
27
28 /// Keep function / class names
29 pub keep_names: MangleOptionsKeepNames,
30
31 /// Use more readable mangled names
32 /// (e.g. `slot_0`, `slot_1`, `slot_2`, ...) for debugging.
33 ///
34 /// Uses base54 if false.
35 pub debug: bool,
36}
37
38type Slot = usize;
39
40/// Enum to handle both owned and borrowed allocators. This is not `Cow` because that type
41/// requires `ToOwned`/`Clone`, which is not implemented for `Allocator`. Although this does
42/// incur some pointer indirection on each reference to the allocator, it allows the API to be
43/// more ergonomic by either accepting an existing allocator, or allowing an internal one to
44/// be created and used temporarily automatically.
45enum TempAllocator<'t> {
46 Owned(Allocator),
47 Borrowed(&'t Allocator),
48}
49
50impl TempAllocator<'_> {
51 /// Get a reference to the allocator, regardless of whether it's owned or borrowed
52 fn as_ref(&self) -> &Allocator {
53 match self {
54 TempAllocator::Owned(allocator) => allocator,
55 TempAllocator::Borrowed(allocator) => allocator,
56 }
57 }
58}
59
60/// # Name Mangler / Symbol Minification
61///
62/// ## Example
63///
64/// ```rust
65/// use oxc_codegen::{Codegen, CodegenOptions};
66/// use oxc_ast::ast::Program;
67/// use oxc_parser::Parser;
68/// use oxc_allocator::Allocator;
69/// use oxc_span::SourceType;
70/// use oxc_mangler::{MangleOptions, Mangler};
71///
72/// let allocator = Allocator::default();
73/// let source = "const result = 1 + 2;";
74/// let parsed = Parser::new(&allocator, source, SourceType::mjs()).parse();
75/// assert!(parsed.errors.is_empty());
76///
77/// let mangled_symbols = Mangler::new()
78/// .with_options(MangleOptions { top_level: true, debug: true })
79/// .build(&parsed.program);
80///
81/// let js = Codegen::new().with_symbol_table(mangled_symbols).build(&parsed.program);
82/// // this will be `const a = 1 + 2;` if debug = false
83/// assert_eq!(js.code, "const slot_0 = 1 + 2;\n");
84/// ```
85///
86/// ## Implementation
87///
88/// See:
89/// * [esbuild](https://github.com/evanw/esbuild/blob/v0.24.0/docs/architecture.md#symbol-minification)
90///
91/// This algorithm is based on the implementation of esbuild and additionally implements improved name reuse functionality.
92/// It targets for better gzip compression.
93///
94/// A slot is a placeholder for binding identifiers that shares the same name.
95/// Visually, it is the index position for binding identifiers:
96///
97/// ```javascript
98/// function slot0(slot1, slot2, slot3) {
99/// slot2 = 1;
100/// }
101/// function slot1(slot0) {
102/// function slot2() {
103/// slot0 = 1;
104/// }
105/// }
106/// ```
107///
108/// The slot number for a new scope starts after the maximum slot of the parent scope.
109///
110/// Occurrences of slots and their corresponding newly assigned short identifiers are:
111/// - slot2: 3 - a
112/// - slot0: 2 - b
113/// - slot1: 2 - c
114/// - slot3: 1 - d
115///
116/// After swapping out the mangled names:
117///
118/// ```javascript
119/// function b(c, a, d) {
120/// a = 1;
121/// }
122/// function c(b) {
123/// function a() {
124/// b = 1;
125/// }
126/// }
127/// ```
128///
129/// ### Name Reuse Calculation
130///
131/// This improvement was inspired by [evanw/esbuild#2614](https://github.com/evanw/esbuild/pull/2614).
132///
133/// For better compression, we shadow the variables where possible to reuse the same name.
134/// For example, the following code:
135/// ```javascript
136/// var top_level_a = 0;
137/// var top_level_b = 1;
138/// function foo() {
139/// var foo_a = 1;
140/// console.log(top_level_b, foo_a);
141/// }
142/// function bar() {
143/// var bar_a = 1;
144/// console.log(top_level_b, bar_a);
145/// }
146/// console.log(top_level_a, foo(), bar())
147/// ```
148/// `top_level_a` is declared in the root scope, but is not used in function `foo` and function `bar`.
149/// Therefore, we can reuse the same name for `top_level_a` and `foo_a` and `bar_a`.
150///
151/// To calculate whether the variable name can be reused in the descendant scopes,
152/// this mangler introduces a concept of symbol liveness and slot liveness.
153/// Symbol liveness is a subtree of the scope tree that contains the declared scope of the symbol and
154/// all the scopes that the symbol is used in. It is a subtree, so any scopes that are between the declared scope and the used scope
155/// are also included. This is to ensure that the symbol is not shadowed by a different symbol before the use in the descendant scope.
156///
157/// For the example above, the liveness of each symbols are:
158/// - `top_level_a`: root_scope
159/// - `top_level_b`: root_scope -> foo, root_scope -> bar
160/// - `foo_a`: root_scope -> foo
161/// - `bar_a`: root_scope -> bar
162/// - `foo`: root_scope
163/// - `bar`: root_scope
164///
165/// Slot liveness is the same as symbol liveness, but it is a subforest (multiple subtrees) of the scope tree that can contain
166/// multiple symbol liveness.
167///
168/// Now that we have the liveness of each symbol, we want to assign symbols to minimal number of slots.
169/// This is a graph coloring problem where the node of the graph is the symbol and the edge of the graph indicates whether
170/// the symbols has a common alive scope and the color of the node is the slot.
171/// This mangler uses a greedy algorithm to assign symbols to slots to achieve that.
172/// In other words, it assigns symbols to the first slot that does not live in the liveness of the symbol.
173/// For the example above, each symbol is assigned to the following slots:
174/// - slot 0: `top_level_a`
175/// - slot 1: `top_level_b`, `foo_a`, `bar_a`
176/// - slot 2: `foo`
177/// - slot 3: `bar`
178pub struct Mangler<'t> {
179 options: MangleOptions,
180 /// An allocator meant to be used for temporary allocations during mangling.
181 /// It can be cleared after mangling is done, to free up memory for subsequent
182 /// files or other operations.
183 temp_allocator: TempAllocator<'t>,
184}
185
186impl Default for Mangler<'_> {
187 fn default() -> Self {
188 Self {
189 options: MangleOptions::default(),
190 temp_allocator: TempAllocator::Owned(Allocator::default()),
191 }
192 }
193}
194
195impl<'t> Mangler<'t> {
196 #[must_use]
197 pub fn new() -> Self {
198 Self::default()
199 }
200
201 /// Creates a new `Mangler` using an existing temporary allocator. This is an allocator
202 /// that can be reset after mangling and is only used for temporary allocations during
203 /// the mangling process. This makes processing multiple files at once much more efficient,
204 /// because the same memory can be used for mangling each file.
205 ///
206 /// # Examples
207 ///
208 /// ```rust
209 /// use oxc_allocator::Allocator;
210 /// use oxc_mangler::Mangler;
211 /// use oxc_parser::Parser;
212 /// use oxc_span::SourceType;
213 ///
214 /// let allocator = Allocator::default();
215 /// let mut temp_allocator = Allocator::default();
216 /// let source = "function myFunction(param) { return param + 1; }";
217 ///
218 /// let parsed = Parser::new(&allocator, source, SourceType::mjs()).parse();
219 /// let mangled_symbols = Mangler::new_with_temp_allocator(&temp_allocator)
220 /// .build(&parsed.program);
221 ///
222 /// // Reset the allocator to free temporary memory
223 /// temp_allocator.reset();
224 /// ```
225 ///
226 /// Processing multiple files:
227 ///
228 /// ```rust
229 /// # use oxc_allocator::Allocator;
230 /// # use oxc_mangler::Mangler;
231 /// # use oxc_parser::Parser;
232 /// # use oxc_span::SourceType;
233 /// let allocator = Allocator::default();
234 /// let mut temp_allocator = Allocator::default();
235 /// let files = ["function foo() {}", "function bar() {}"];
236 ///
237 /// for source in files {
238 /// let parsed = Parser::new(&allocator, source, SourceType::mjs()).parse();
239 /// let mangled_symbols = Mangler::new_with_temp_allocator(&temp_allocator)
240 /// .build(&parsed.program);
241 /// temp_allocator.reset(); // Free memory between files
242 /// }
243 /// ```
244 #[must_use]
245 pub fn new_with_temp_allocator(temp_allocator: &'t Allocator) -> Self {
246 Self {
247 options: MangleOptions::default(),
248 temp_allocator: TempAllocator::Borrowed(temp_allocator),
249 }
250 }
251
252 #[must_use]
253 pub fn with_options(mut self, options: MangleOptions) -> Self {
254 self.options = options;
255 self
256 }
257
258 /// Mangles the program. The resulting SymbolTable contains the mangled symbols - `program` is not modified.
259 /// Pass the symbol table to oxc_codegen to generate the mangled code.
260 #[must_use]
261 pub fn build(self, program: &Program<'_>) -> Scoping {
262 let mut semantic =
263 SemanticBuilder::new().with_scope_tree_child_ids(true).build(program).semantic;
264 self.build_with_semantic(&mut semantic, program);
265 semantic.into_scoping()
266 }
267
268 /// # Panics
269 ///
270 /// Panics if the child_ids does not exist in scope_tree.
271 pub fn build_with_semantic(self, semantic: &mut Semantic<'_>, program: &Program<'_>) {
272 if self.options.debug {
273 self.build_with_semantic_impl(semantic, program, debug_name);
274 } else {
275 self.build_with_semantic_impl(semantic, program, base54);
276 }
277 }
278
279 fn build_with_semantic_impl<const CAPACITY: usize, G: Fn(u32) -> InlineString<CAPACITY, u8>>(
280 self,
281 semantic: &mut Semantic<'_>,
282 program: &Program<'_>,
283 generate_name: G,
284 ) {
285 let (scoping, ast_nodes) = semantic.scoping_mut_and_nodes();
286
287 assert!(scoping.has_scope_child_ids(), "child_id needs to be generated");
288
289 // TODO: implement opt-out of direct-eval in a branch of scopes.
290 if scoping.root_scope_flags().contains_direct_eval() {
291 return;
292 }
293
294 let (exported_names, exported_symbols) = if self.options.top_level {
295 Mangler::collect_exported_symbols(program)
296 } else {
297 Default::default()
298 };
299 let (keep_name_names, keep_name_symbols) =
300 Mangler::collect_keep_name_symbols(self.options.keep_names, scoping, ast_nodes);
301
302 let temp_allocator = self.temp_allocator.as_ref();
303
304 // All symbols with their assigned slots. Keyed by symbol id.
305 let mut slots = Vec::from_iter_in(iter::repeat_n(0, scoping.symbols_len()), temp_allocator);
306
307 // Stores the lived scope ids for each slot. Keyed by slot number.
308 let mut slot_liveness: std::vec::Vec<FixedBitSet> = vec![];
309 let mut tmp_bindings = Vec::with_capacity_in(100, temp_allocator);
310
311 let mut reusable_slots = Vec::new_in(temp_allocator);
312 // Walk down the scope tree and assign a slot number for each symbol.
313 // It is possible to do this in a loop over the symbol list,
314 // but walking down the scope tree seems to generate a better code.
315 for (scope_id, bindings) in scoping.iter_bindings() {
316 if bindings.is_empty() {
317 continue;
318 }
319
320 // Sort `bindings` in declaration order.
321 tmp_bindings.clear();
322 tmp_bindings.extend(
323 bindings.values().copied().filter(|binding| !keep_name_symbols.contains(binding)),
324 );
325 tmp_bindings.sort_unstable();
326 if tmp_bindings.is_empty() {
327 continue;
328 }
329
330 let mut slot = slot_liveness.len();
331
332 reusable_slots.clear();
333 reusable_slots.extend(
334 // Slots that are already assigned to other symbols, but does not live in the current scope.
335 slot_liveness
336 .iter()
337 .enumerate()
338 .filter(|(_, slot_liveness)| !slot_liveness.contains(scope_id.index()))
339 .map(|(slot, _)| slot)
340 .take(tmp_bindings.len()),
341 );
342
343 // The number of new slots that needs to be allocated.
344 let remaining_count = tmp_bindings.len() - reusable_slots.len();
345 reusable_slots.extend(slot..slot + remaining_count);
346
347 slot += remaining_count;
348 if slot_liveness.len() < slot {
349 slot_liveness
350 .resize_with(slot, || FixedBitSet::with_capacity(scoping.scopes_len()));
351 }
352
353 for (&symbol_id, assigned_slot) in
354 tmp_bindings.iter().zip(reusable_slots.iter().copied())
355 {
356 slots[symbol_id.index()] = assigned_slot;
357
358 // If the symbol is declared by `var`, then it can be hoisted to
359 // parent, so we need to include the scope where it is declared.
360 // (for cases like `function foo() { { var x; let y; } }`)
361 let declared_scope_id =
362 ast_nodes.get_node(scoping.symbol_declaration(symbol_id)).scope_id();
363
364 // Calculate the scope ids that this symbol is alive in.
365 let lived_scope_ids = scoping
366 .get_resolved_references(symbol_id)
367 .map(|reference| ast_nodes.get_node(reference.node_id()).scope_id())
368 .chain([scope_id, declared_scope_id])
369 .flat_map(|used_scope_id| {
370 scoping.scope_ancestors(used_scope_id).take_while(|s_id| *s_id != scope_id)
371 });
372
373 // Since the slot is now assigned to this symbol, it is alive in all the scopes that this symbol is alive in.
374 slot_liveness[assigned_slot].extend(lived_scope_ids.map(oxc_index::Idx::index));
375 }
376 }
377
378 let total_number_of_slots = slot_liveness.len();
379
380 let frequencies = self.tally_slot_frequencies(
381 scoping,
382 &exported_symbols,
383 &keep_name_symbols,
384 total_number_of_slots,
385 &slots,
386 );
387
388 let root_unresolved_references = scoping.root_unresolved_references();
389 let root_bindings = scoping.get_bindings(scoping.root_scope_id());
390
391 let mut reserved_names = Vec::with_capacity_in(total_number_of_slots, temp_allocator);
392
393 let mut count = 0;
394 for _ in 0..total_number_of_slots {
395 let name = loop {
396 let name = generate_name(count);
397 count += 1;
398 // Do not mangle keywords and unresolved references
399 let n = name.as_str();
400 if !is_keyword(n)
401 && !is_special_name(n)
402 && !root_unresolved_references.contains_key(n)
403 && !(root_bindings.contains_key(n)
404 && (!self.options.top_level || exported_names.contains(n)))
405 // TODO: only skip the names that are kept in the current scope
406 && !keep_name_names.contains(n)
407 {
408 break name;
409 }
410 };
411 reserved_names.push(name);
412 }
413
414 // Group similar symbols for smaller gzipped file
415 // <https://github.com/google/closure-compiler/blob/c383a3a1d2fce33b6c778ef76b5a626e07abca41/src/com/google/javascript/jscomp/RenameVars.java#L475-L483>
416 // Original Comment:
417 // 1) The most frequent vars get the shorter names.
418 // 2) If N number of vars are going to be assigned names of the same
419 // length, we assign the N names based on the order at which the vars
420 // first appear in the source. This makes the output somewhat less
421 // random, because symbols declared close together are assigned names
422 // that are quite similar. With this heuristic, the output is more
423 // compressible.
424 // For instance, the output may look like:
425 // var da = "..", ea = "..";
426 // function fa() { .. } function ga() { .. }
427
428 let mut freq_iter = frequencies.iter();
429 let mut symbols_renamed_in_this_batch = Vec::with_capacity_in(100, temp_allocator);
430 let mut slice_of_same_len_strings = Vec::with_capacity_in(100, temp_allocator);
431 // 2. "N number of vars are going to be assigned names of the same length"
432 for (_, slice_of_same_len_strings_group) in
433 &reserved_names.into_iter().chunk_by(InlineString::len)
434 {
435 // 1. "The most frequent vars get the shorter names"
436 // (freq_iter is sorted by frequency from highest to lowest,
437 // so taking means take the N most frequent symbols remaining)
438 slice_of_same_len_strings.clear();
439 slice_of_same_len_strings.extend(slice_of_same_len_strings_group);
440 symbols_renamed_in_this_batch.clear();
441 symbols_renamed_in_this_batch
442 .extend(freq_iter.by_ref().take(slice_of_same_len_strings.len()));
443
444 debug_assert_eq!(symbols_renamed_in_this_batch.len(), slice_of_same_len_strings.len());
445
446 // 2. "we assign the N names based on the order at which the vars first appear in the source."
447 // sorting by slot enables us to sort by the order at which the vars first appear in the source
448 // (this is possible because the slots are discovered currently in a DFS method which is the same order
449 // as variables appear in the source code)
450 symbols_renamed_in_this_batch.sort_unstable_by_key(|a| a.slot);
451
452 // here we just zip the iterator of symbols to rename with the iterator of new names for the next for loop
453 let symbols_to_rename_with_new_names =
454 symbols_renamed_in_this_batch.iter().zip(slice_of_same_len_strings.iter());
455
456 // rename the variables
457 for (symbol_to_rename, new_name) in symbols_to_rename_with_new_names {
458 for &symbol_id in &symbol_to_rename.symbol_ids {
459 scoping.set_symbol_name(symbol_id, new_name);
460 }
461 }
462 }
463 }
464
465 fn tally_slot_frequencies<'a>(
466 &'a self,
467 scoping: &Scoping,
468 exported_symbols: &FxHashSet<SymbolId>,
469 keep_name_symbols: &FxHashSet<SymbolId>,
470 total_number_of_slots: usize,
471 slots: &[Slot],
472 ) -> Vec<'a, SlotFrequency<'a>> {
473 let root_scope_id = scoping.root_scope_id();
474 let temp_allocator = self.temp_allocator.as_ref();
475 let mut frequencies = Vec::from_iter_in(
476 repeat_with(|| SlotFrequency::new(temp_allocator)).take(total_number_of_slots),
477 temp_allocator,
478 );
479
480 for (symbol_id, slot) in slots.iter().copied().enumerate() {
481 let symbol_id = SymbolId::from_usize(symbol_id);
482 if scoping.symbol_scope_id(symbol_id) == root_scope_id
483 && (!self.options.top_level || exported_symbols.contains(&symbol_id))
484 {
485 continue;
486 }
487 if is_special_name(scoping.symbol_name(symbol_id)) {
488 continue;
489 }
490 if keep_name_symbols.contains(&symbol_id) {
491 continue;
492 }
493 let index = slot;
494 frequencies[index].slot = slot;
495 frequencies[index].frequency += scoping.get_resolved_reference_ids(symbol_id).len();
496 frequencies[index].symbol_ids.push(symbol_id);
497 }
498 frequencies.sort_unstable_by_key(|x| std::cmp::Reverse(x.frequency));
499 frequencies
500 }
501
502 fn collect_exported_symbols<'a>(
503 program: &Program<'a>,
504 ) -> (FxHashSet<Atom<'a>>, FxHashSet<SymbolId>) {
505 program
506 .body
507 .iter()
508 .filter_map(|statement| {
509 let Statement::ExportNamedDeclaration(v) = statement else { return None };
510 v.declaration.as_ref()
511 })
512 .flat_map(|decl| {
513 if let Declaration::VariableDeclaration(decl) = decl {
514 itertools::Either::Left(
515 decl.declarations
516 .iter()
517 .filter_map(|decl| decl.id.get_binding_identifier()),
518 )
519 } else {
520 itertools::Either::Right(decl.id().into_iter())
521 }
522 })
523 .map(|id| (id.name, id.symbol_id()))
524 .collect()
525 }
526
527 fn collect_keep_name_symbols<'a>(
528 keep_names: MangleOptionsKeepNames,
529 scoping: &'a Scoping,
530 nodes: &AstNodes,
531 ) -> (FxHashSet<&'a str>, FxHashSet<SymbolId>) {
532 let ids = collect_name_symbols(keep_names, scoping, nodes);
533 (ids.iter().map(|id| scoping.symbol_name(*id)).collect(), ids)
534 }
535}
536
537fn is_special_name(name: &str) -> bool {
538 matches!(name, "exports" | "arguments")
539}
540
541#[derive(Debug)]
542struct SlotFrequency<'a> {
543 pub slot: Slot,
544 pub frequency: usize,
545 pub symbol_ids: Vec<'a, SymbolId>,
546}
547
548impl<'t> SlotFrequency<'t> {
549 fn new(temp_allocator: &'t Allocator) -> Self {
550 Self { slot: 0, frequency: 0, symbol_ids: Vec::new_in(temp_allocator) }
551 }
552}
553
554#[rustfmt::skip]
555fn is_keyword(s: &str) -> bool {
556 matches!(s, "as" | "do" | "if" | "in" | "is" | "of" | "any" | "for" | "get"
557 | "let" | "new" | "out" | "set" | "try" | "var" | "case" | "else"
558 | "enum" | "from" | "meta" | "null" | "this" | "true" | "type"
559 | "void" | "with")
560}
561
562// Maximum length of string is 15 (`slot_4294967295` for `u32::MAX`).
563fn debug_name(n: u32) -> InlineString<15, u8> {
564 // Using `format!` here allocates a string unnecessarily.
565 // But this function is not for use in production, so let's not worry about it.
566 // We shouldn't resort to unsafe code, when it's not critical for performance.
567 InlineString::from_str(&format!("slot_{n}"))
568}