oxc_mangler/
lib.rs

1use std::iter::{self, repeat_with};
2
3use fixedbitset::FixedBitSet;
4use itertools::Itertools;
5use keep_names::collect_name_symbols;
6use rustc_hash::FxHashSet;
7
8use base54::base54;
9use oxc_allocator::{Allocator, Vec};
10use oxc_ast::ast::{Declaration, Program, Statement};
11use oxc_data_structures::inline_string::InlineString;
12use oxc_index::Idx;
13use oxc_semantic::{AstNodes, Scoping, Semantic, SemanticBuilder, SymbolId};
14use oxc_span::Atom;
15
16pub(crate) mod base54;
17mod keep_names;
18
19pub use keep_names::MangleOptionsKeepNames;
20
21#[derive(Default, Debug, Clone, Copy)]
22pub struct MangleOptions {
23    /// Pass true to mangle names declared in the top level scope.
24    ///
25    /// Default: `false`
26    pub top_level: bool,
27
28    /// Keep function / class names
29    pub keep_names: MangleOptionsKeepNames,
30
31    /// Use more readable mangled names
32    /// (e.g. `slot_0`, `slot_1`, `slot_2`, ...) for debugging.
33    ///
34    /// Uses base54 if false.
35    pub debug: bool,
36}
37
38type Slot = usize;
39
40/// Enum to handle both owned and borrowed allocators. This is not `Cow` because that type
41/// requires `ToOwned`/`Clone`, which is not implemented for `Allocator`. Although this does
42/// incur some pointer indirection on each reference to the allocator, it allows the API to be
43/// more ergonomic by either accepting an existing allocator, or allowing an internal one to
44/// be created and used temporarily automatically.
45enum TempAllocator<'t> {
46    Owned(Allocator),
47    Borrowed(&'t Allocator),
48}
49
50impl TempAllocator<'_> {
51    /// Get a reference to the allocator, regardless of whether it's owned or borrowed
52    fn as_ref(&self) -> &Allocator {
53        match self {
54            TempAllocator::Owned(allocator) => allocator,
55            TempAllocator::Borrowed(allocator) => allocator,
56        }
57    }
58}
59
60/// # Name Mangler / Symbol Minification
61///
62/// ## Example
63///
64/// ```rust
65/// use oxc_codegen::{Codegen, CodegenOptions};
66/// use oxc_ast::ast::Program;
67/// use oxc_parser::Parser;
68/// use oxc_allocator::Allocator;
69/// use oxc_span::SourceType;
70/// use oxc_mangler::{MangleOptions, Mangler};
71///
72/// let allocator = Allocator::default();
73/// let source = "const result = 1 + 2;";
74/// let parsed = Parser::new(&allocator, source, SourceType::mjs()).parse();
75/// assert!(parsed.errors.is_empty());
76///
77/// let mangled_symbols = Mangler::new()
78///     .with_options(MangleOptions { top_level: true, debug: true })
79///     .build(&parsed.program);
80///
81/// let js = Codegen::new().with_symbol_table(mangled_symbols).build(&parsed.program);
82/// // this will be `const a = 1 + 2;` if debug = false
83/// assert_eq!(js.code, "const slot_0 = 1 + 2;\n");
84/// ```
85///
86/// ## Implementation
87///
88/// See:
89///   * [esbuild](https://github.com/evanw/esbuild/blob/v0.24.0/docs/architecture.md#symbol-minification)
90///
91/// This algorithm is based on the implementation of esbuild and additionally implements improved name reuse functionality.
92/// It targets for better gzip compression.
93///
94/// A slot is a placeholder for binding identifiers that shares the same name.
95/// Visually, it is the index position for binding identifiers:
96///
97/// ```javascript
98/// function slot0(slot1, slot2, slot3) {
99///     slot2 = 1;
100/// }
101/// function slot1(slot0) {
102///     function slot2() {
103///         slot0 = 1;
104///     }
105/// }
106/// ```
107///
108/// The slot number for a new scope starts after the maximum slot of the parent scope.
109///
110/// Occurrences of slots and their corresponding newly assigned short identifiers are:
111/// - slot2: 3 - a
112/// - slot0: 2 - b
113/// - slot1: 2 - c
114/// - slot3: 1 - d
115///
116/// After swapping out the mangled names:
117///
118/// ```javascript
119/// function b(c, a, d) {
120///     a = 1;
121/// }
122/// function c(b) {
123///     function a() {
124///         b = 1;
125///     }
126/// }
127/// ```
128///
129/// ### Name Reuse Calculation
130///
131/// This improvement was inspired by [evanw/esbuild#2614](https://github.com/evanw/esbuild/pull/2614).
132///
133/// For better compression, we shadow the variables where possible to reuse the same name.
134/// For example, the following code:
135/// ```javascript
136/// var top_level_a = 0;
137/// var top_level_b = 1;
138/// function foo() {
139///   var foo_a = 1;
140///   console.log(top_level_b, foo_a);
141/// }
142/// function bar() {
143///   var bar_a = 1;
144///   console.log(top_level_b, bar_a);
145/// }
146/// console.log(top_level_a, foo(), bar())
147/// ```
148/// `top_level_a` is declared in the root scope, but is not used in function `foo` and function `bar`.
149/// Therefore, we can reuse the same name for `top_level_a` and `foo_a` and `bar_a`.
150///
151/// To calculate whether the variable name can be reused in the descendant scopes,
152/// this mangler introduces a concept of symbol liveness and slot liveness.
153/// Symbol liveness is a subtree of the scope tree that contains the declared scope of the symbol and
154/// all the scopes that the symbol is used in. It is a subtree, so any scopes that are between the declared scope and the used scope
155/// are also included. This is to ensure that the symbol is not shadowed by a different symbol before the use in the descendant scope.
156///
157/// For the example above, the liveness of each symbols are:
158/// - `top_level_a`: root_scope
159/// - `top_level_b`: root_scope -> foo, root_scope -> bar
160/// - `foo_a`: root_scope -> foo
161/// - `bar_a`: root_scope -> bar
162/// - `foo`: root_scope
163/// - `bar`: root_scope
164///
165/// Slot liveness is the same as symbol liveness, but it is a subforest (multiple subtrees) of the scope tree that can contain
166/// multiple symbol liveness.
167///
168/// Now that we have the liveness of each symbol, we want to assign symbols to minimal number of slots.
169/// This is a graph coloring problem where the node of the graph is the symbol and the edge of the graph indicates whether
170/// the symbols has a common alive scope and the color of the node is the slot.
171/// This mangler uses a greedy algorithm to assign symbols to slots to achieve that.
172/// In other words, it assigns symbols to the first slot that does not live in the liveness of the symbol.
173/// For the example above, each symbol is assigned to the following slots:
174/// - slot 0: `top_level_a`
175/// - slot 1: `top_level_b`, `foo_a`, `bar_a`
176/// - slot 2: `foo`
177/// - slot 3: `bar`
178pub struct Mangler<'t> {
179    options: MangleOptions,
180    /// An allocator meant to be used for temporary allocations during mangling.
181    /// It can be cleared after mangling is done, to free up memory for subsequent
182    /// files or other operations.
183    temp_allocator: TempAllocator<'t>,
184}
185
186impl Default for Mangler<'_> {
187    fn default() -> Self {
188        Self {
189            options: MangleOptions::default(),
190            temp_allocator: TempAllocator::Owned(Allocator::default()),
191        }
192    }
193}
194
195impl<'t> Mangler<'t> {
196    #[must_use]
197    pub fn new() -> Self {
198        Self::default()
199    }
200
201    /// Creates a new `Mangler` using an existing temporary allocator. This is an allocator
202    /// that can be reset after mangling and is only used for temporary allocations during
203    /// the mangling process. This makes processing multiple files at once much more efficient,
204    /// because the same memory can be used for mangling each file.
205    ///
206    /// # Examples
207    ///
208    /// ```rust
209    /// use oxc_allocator::Allocator;
210    /// use oxc_mangler::Mangler;
211    /// use oxc_parser::Parser;
212    /// use oxc_span::SourceType;
213    ///
214    /// let allocator = Allocator::default();
215    /// let mut temp_allocator = Allocator::default();
216    /// let source = "function myFunction(param) { return param + 1; }";
217    ///
218    /// let parsed = Parser::new(&allocator, source, SourceType::mjs()).parse();
219    /// let mangled_symbols = Mangler::new_with_temp_allocator(&temp_allocator)
220    ///     .build(&parsed.program);
221    ///
222    /// // Reset the allocator to free temporary memory
223    /// temp_allocator.reset();
224    /// ```
225    ///
226    /// Processing multiple files:
227    ///
228    /// ```rust
229    /// # use oxc_allocator::Allocator;
230    /// # use oxc_mangler::Mangler;
231    /// # use oxc_parser::Parser;
232    /// # use oxc_span::SourceType;
233    /// let allocator = Allocator::default();
234    /// let mut temp_allocator = Allocator::default();
235    /// let files = ["function foo() {}", "function bar() {}"];
236    ///
237    /// for source in files {
238    ///     let parsed = Parser::new(&allocator, source, SourceType::mjs()).parse();
239    ///     let mangled_symbols = Mangler::new_with_temp_allocator(&temp_allocator)
240    ///         .build(&parsed.program);
241    ///     temp_allocator.reset(); // Free memory between files
242    /// }
243    /// ```
244    #[must_use]
245    pub fn new_with_temp_allocator(temp_allocator: &'t Allocator) -> Self {
246        Self {
247            options: MangleOptions::default(),
248            temp_allocator: TempAllocator::Borrowed(temp_allocator),
249        }
250    }
251
252    #[must_use]
253    pub fn with_options(mut self, options: MangleOptions) -> Self {
254        self.options = options;
255        self
256    }
257
258    /// Mangles the program. The resulting SymbolTable contains the mangled symbols - `program` is not modified.
259    /// Pass the symbol table to oxc_codegen to generate the mangled code.
260    #[must_use]
261    pub fn build(self, program: &Program<'_>) -> Scoping {
262        let mut semantic =
263            SemanticBuilder::new().with_scope_tree_child_ids(true).build(program).semantic;
264        self.build_with_semantic(&mut semantic, program);
265        semantic.into_scoping()
266    }
267
268    /// # Panics
269    ///
270    /// Panics if the child_ids does not exist in scope_tree.
271    pub fn build_with_semantic(self, semantic: &mut Semantic<'_>, program: &Program<'_>) {
272        if self.options.debug {
273            self.build_with_semantic_impl(semantic, program, debug_name);
274        } else {
275            self.build_with_semantic_impl(semantic, program, base54);
276        }
277    }
278
279    fn build_with_semantic_impl<const CAPACITY: usize, G: Fn(u32) -> InlineString<CAPACITY, u8>>(
280        self,
281        semantic: &mut Semantic<'_>,
282        program: &Program<'_>,
283        generate_name: G,
284    ) {
285        let (scoping, ast_nodes) = semantic.scoping_mut_and_nodes();
286
287        assert!(scoping.has_scope_child_ids(), "child_id needs to be generated");
288
289        // TODO: implement opt-out of direct-eval in a branch of scopes.
290        if scoping.root_scope_flags().contains_direct_eval() {
291            return;
292        }
293
294        let (exported_names, exported_symbols) = if self.options.top_level {
295            Mangler::collect_exported_symbols(program)
296        } else {
297            Default::default()
298        };
299        let (keep_name_names, keep_name_symbols) =
300            Mangler::collect_keep_name_symbols(self.options.keep_names, scoping, ast_nodes);
301
302        let temp_allocator = self.temp_allocator.as_ref();
303
304        // All symbols with their assigned slots. Keyed by symbol id.
305        let mut slots = Vec::from_iter_in(iter::repeat_n(0, scoping.symbols_len()), temp_allocator);
306
307        // Stores the lived scope ids for each slot. Keyed by slot number.
308        let mut slot_liveness: std::vec::Vec<FixedBitSet> = vec![];
309        let mut tmp_bindings = Vec::with_capacity_in(100, temp_allocator);
310
311        let mut reusable_slots = Vec::new_in(temp_allocator);
312        // Walk down the scope tree and assign a slot number for each symbol.
313        // It is possible to do this in a loop over the symbol list,
314        // but walking down the scope tree seems to generate a better code.
315        for (scope_id, bindings) in scoping.iter_bindings() {
316            if bindings.is_empty() {
317                continue;
318            }
319
320            // Sort `bindings` in declaration order.
321            tmp_bindings.clear();
322            tmp_bindings.extend(
323                bindings.values().copied().filter(|binding| !keep_name_symbols.contains(binding)),
324            );
325            tmp_bindings.sort_unstable();
326            if tmp_bindings.is_empty() {
327                continue;
328            }
329
330            let mut slot = slot_liveness.len();
331
332            reusable_slots.clear();
333            reusable_slots.extend(
334                // Slots that are already assigned to other symbols, but does not live in the current scope.
335                slot_liveness
336                    .iter()
337                    .enumerate()
338                    .filter(|(_, slot_liveness)| !slot_liveness.contains(scope_id.index()))
339                    .map(|(slot, _)| slot)
340                    .take(tmp_bindings.len()),
341            );
342
343            // The number of new slots that needs to be allocated.
344            let remaining_count = tmp_bindings.len() - reusable_slots.len();
345            reusable_slots.extend(slot..slot + remaining_count);
346
347            slot += remaining_count;
348            if slot_liveness.len() < slot {
349                slot_liveness
350                    .resize_with(slot, || FixedBitSet::with_capacity(scoping.scopes_len()));
351            }
352
353            for (&symbol_id, assigned_slot) in
354                tmp_bindings.iter().zip(reusable_slots.iter().copied())
355            {
356                slots[symbol_id.index()] = assigned_slot;
357
358                // If the symbol is declared by `var`, then it can be hoisted to
359                // parent, so we need to include the scope where it is declared.
360                // (for cases like `function foo() { { var x; let y; } }`)
361                let declared_scope_id =
362                    ast_nodes.get_node(scoping.symbol_declaration(symbol_id)).scope_id();
363
364                // Calculate the scope ids that this symbol is alive in.
365                let lived_scope_ids = scoping
366                    .get_resolved_references(symbol_id)
367                    .map(|reference| ast_nodes.get_node(reference.node_id()).scope_id())
368                    .chain([scope_id, declared_scope_id])
369                    .flat_map(|used_scope_id| {
370                        scoping.scope_ancestors(used_scope_id).take_while(|s_id| *s_id != scope_id)
371                    });
372
373                // Since the slot is now assigned to this symbol, it is alive in all the scopes that this symbol is alive in.
374                slot_liveness[assigned_slot].extend(lived_scope_ids.map(oxc_index::Idx::index));
375            }
376        }
377
378        let total_number_of_slots = slot_liveness.len();
379
380        let frequencies = self.tally_slot_frequencies(
381            scoping,
382            &exported_symbols,
383            &keep_name_symbols,
384            total_number_of_slots,
385            &slots,
386        );
387
388        let root_unresolved_references = scoping.root_unresolved_references();
389        let root_bindings = scoping.get_bindings(scoping.root_scope_id());
390
391        let mut reserved_names = Vec::with_capacity_in(total_number_of_slots, temp_allocator);
392
393        let mut count = 0;
394        for _ in 0..total_number_of_slots {
395            let name = loop {
396                let name = generate_name(count);
397                count += 1;
398                // Do not mangle keywords and unresolved references
399                let n = name.as_str();
400                if !is_keyword(n)
401                    && !is_special_name(n)
402                    && !root_unresolved_references.contains_key(n)
403                    && !(root_bindings.contains_key(n)
404                        && (!self.options.top_level || exported_names.contains(n)))
405                        // TODO: only skip the names that are kept in the current scope
406                        && !keep_name_names.contains(n)
407                {
408                    break name;
409                }
410            };
411            reserved_names.push(name);
412        }
413
414        // Group similar symbols for smaller gzipped file
415        // <https://github.com/google/closure-compiler/blob/c383a3a1d2fce33b6c778ef76b5a626e07abca41/src/com/google/javascript/jscomp/RenameVars.java#L475-L483>
416        // Original Comment:
417        // 1) The most frequent vars get the shorter names.
418        // 2) If N number of vars are going to be assigned names of the same
419        //    length, we assign the N names based on the order at which the vars
420        //    first appear in the source. This makes the output somewhat less
421        //    random, because symbols declared close together are assigned names
422        //    that are quite similar. With this heuristic, the output is more
423        //    compressible.
424        //    For instance, the output may look like:
425        //    var da = "..", ea = "..";
426        //    function fa() { .. } function ga() { .. }
427
428        let mut freq_iter = frequencies.iter();
429        let mut symbols_renamed_in_this_batch = Vec::with_capacity_in(100, temp_allocator);
430        let mut slice_of_same_len_strings = Vec::with_capacity_in(100, temp_allocator);
431        // 2. "N number of vars are going to be assigned names of the same length"
432        for (_, slice_of_same_len_strings_group) in
433            &reserved_names.into_iter().chunk_by(InlineString::len)
434        {
435            // 1. "The most frequent vars get the shorter names"
436            // (freq_iter is sorted by frequency from highest to lowest,
437            //  so taking means take the N most frequent symbols remaining)
438            slice_of_same_len_strings.clear();
439            slice_of_same_len_strings.extend(slice_of_same_len_strings_group);
440            symbols_renamed_in_this_batch.clear();
441            symbols_renamed_in_this_batch
442                .extend(freq_iter.by_ref().take(slice_of_same_len_strings.len()));
443
444            debug_assert_eq!(symbols_renamed_in_this_batch.len(), slice_of_same_len_strings.len());
445
446            // 2. "we assign the N names based on the order at which the vars first appear in the source."
447            // sorting by slot enables us to sort by the order at which the vars first appear in the source
448            // (this is possible because the slots are discovered currently in a DFS method which is the same order
449            //  as variables appear in the source code)
450            symbols_renamed_in_this_batch.sort_unstable_by_key(|a| a.slot);
451
452            // here we just zip the iterator of symbols to rename with the iterator of new names for the next for loop
453            let symbols_to_rename_with_new_names =
454                symbols_renamed_in_this_batch.iter().zip(slice_of_same_len_strings.iter());
455
456            // rename the variables
457            for (symbol_to_rename, new_name) in symbols_to_rename_with_new_names {
458                for &symbol_id in &symbol_to_rename.symbol_ids {
459                    scoping.set_symbol_name(symbol_id, new_name);
460                }
461            }
462        }
463    }
464
465    fn tally_slot_frequencies<'a>(
466        &'a self,
467        scoping: &Scoping,
468        exported_symbols: &FxHashSet<SymbolId>,
469        keep_name_symbols: &FxHashSet<SymbolId>,
470        total_number_of_slots: usize,
471        slots: &[Slot],
472    ) -> Vec<'a, SlotFrequency<'a>> {
473        let root_scope_id = scoping.root_scope_id();
474        let temp_allocator = self.temp_allocator.as_ref();
475        let mut frequencies = Vec::from_iter_in(
476            repeat_with(|| SlotFrequency::new(temp_allocator)).take(total_number_of_slots),
477            temp_allocator,
478        );
479
480        for (symbol_id, slot) in slots.iter().copied().enumerate() {
481            let symbol_id = SymbolId::from_usize(symbol_id);
482            if scoping.symbol_scope_id(symbol_id) == root_scope_id
483                && (!self.options.top_level || exported_symbols.contains(&symbol_id))
484            {
485                continue;
486            }
487            if is_special_name(scoping.symbol_name(symbol_id)) {
488                continue;
489            }
490            if keep_name_symbols.contains(&symbol_id) {
491                continue;
492            }
493            let index = slot;
494            frequencies[index].slot = slot;
495            frequencies[index].frequency += scoping.get_resolved_reference_ids(symbol_id).len();
496            frequencies[index].symbol_ids.push(symbol_id);
497        }
498        frequencies.sort_unstable_by_key(|x| std::cmp::Reverse(x.frequency));
499        frequencies
500    }
501
502    fn collect_exported_symbols<'a>(
503        program: &Program<'a>,
504    ) -> (FxHashSet<Atom<'a>>, FxHashSet<SymbolId>) {
505        program
506            .body
507            .iter()
508            .filter_map(|statement| {
509                let Statement::ExportNamedDeclaration(v) = statement else { return None };
510                v.declaration.as_ref()
511            })
512            .flat_map(|decl| {
513                if let Declaration::VariableDeclaration(decl) = decl {
514                    itertools::Either::Left(
515                        decl.declarations
516                            .iter()
517                            .filter_map(|decl| decl.id.get_binding_identifier()),
518                    )
519                } else {
520                    itertools::Either::Right(decl.id().into_iter())
521                }
522            })
523            .map(|id| (id.name, id.symbol_id()))
524            .collect()
525    }
526
527    fn collect_keep_name_symbols<'a>(
528        keep_names: MangleOptionsKeepNames,
529        scoping: &'a Scoping,
530        nodes: &AstNodes,
531    ) -> (FxHashSet<&'a str>, FxHashSet<SymbolId>) {
532        let ids = collect_name_symbols(keep_names, scoping, nodes);
533        (ids.iter().map(|id| scoping.symbol_name(*id)).collect(), ids)
534    }
535}
536
537fn is_special_name(name: &str) -> bool {
538    matches!(name, "exports" | "arguments")
539}
540
541#[derive(Debug)]
542struct SlotFrequency<'a> {
543    pub slot: Slot,
544    pub frequency: usize,
545    pub symbol_ids: Vec<'a, SymbolId>,
546}
547
548impl<'t> SlotFrequency<'t> {
549    fn new(temp_allocator: &'t Allocator) -> Self {
550        Self { slot: 0, frequency: 0, symbol_ids: Vec::new_in(temp_allocator) }
551    }
552}
553
554#[rustfmt::skip]
555fn is_keyword(s: &str) -> bool {
556    matches!(s, "as" | "do" | "if" | "in" | "is" | "of" | "any" | "for" | "get"
557            | "let" | "new" | "out" | "set" | "try" | "var" | "case" | "else"
558            | "enum" | "from" | "meta" | "null" | "this" | "true" | "type"
559            | "void" | "with")
560}
561
562// Maximum length of string is 15 (`slot_4294967295` for `u32::MAX`).
563fn debug_name(n: u32) -> InlineString<15, u8> {
564    // Using `format!` here allocates a string unnecessarily.
565    // But this function is not for use in production, so let's not worry about it.
566    // We shouldn't resort to unsafe code, when it's not critical for performance.
567    InlineString::from_str(&format!("slot_{n}"))
568}