Skip to main content

fallow_core/duplicates/tokenize/
mod.rs

1use std::path::Path;
2
3use oxc_allocator::Allocator;
4use oxc_ast_visit::Visit;
5use oxc_parser::Parser;
6use oxc_span::{SourceType, Span};
7
8mod lexical;
9
10pub use super::token_types::{
11    FileTokens, KeywordType, OperatorType, PunctuationType, SourceToken, TokenKind,
12};
13use super::token_visitor::TokenExtractor;
14
15/// Tokenize a source file into a sequence of normalized tokens.
16///
17/// For Vue/Svelte SFC files, extracts `<script>` blocks first and tokenizes
18/// their content, mirroring the main analysis pipeline's SFC handling.
19/// For Astro files, extracts frontmatter. For MDX files, extracts import/export statements.
20///
21/// When `strip_types` is true, TypeScript type annotations, interfaces, and type
22/// aliases are stripped from the token stream. This enables cross-language clone
23/// detection between `.ts` and `.js` files.
24///
25/// When `skip_imports` is true, module-wiring declarations are excluded from the
26/// token stream to reduce noise from import, re-export, and top-level static
27/// require binding blocks.
28#[must_use]
29pub fn tokenize_file(path: &Path, source: &str, skip_imports: bool) -> FileTokens {
30    tokenize_file_inner(path, source, false, skip_imports)
31}
32
33/// Tokenize a source file with optional type stripping for cross-language detection.
34#[must_use]
35pub fn tokenize_file_cross_language(
36    path: &Path,
37    source: &str,
38    strip_types: bool,
39    skip_imports: bool,
40) -> FileTokens {
41    tokenize_file_inner(path, source, strip_types, skip_imports)
42}
43
44fn tokenize_file_inner(
45    path: &Path,
46    source: &str,
47    strip_types: bool,
48    skip_imports: bool,
49) -> FileTokens {
50    use crate::extract::{extract_astro_frontmatter, extract_mdx_statements, is_sfc_file};
51
52    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
53
54    if is_sfc_file(path) {
55        return tokenize_sfc(source, strip_types, skip_imports);
56    }
57    if ext == "astro" {
58        return tokenize_astro(source, strip_types, skip_imports, extract_astro_frontmatter);
59    }
60    if ext == "mdx" {
61        return tokenize_mdx(source, strip_types, skip_imports, extract_mdx_statements);
62    }
63    if matches!(ext, "css" | "scss" | "sass" | "less") {
64        return tokenize_style_source(source);
65    }
66
67    tokenize_js_ts(path, source, strip_types, skip_imports)
68}
69
70/// Tokenize Vue/Svelte SFC `<script>` blocks.
71fn tokenize_sfc(source: &str, strip_types: bool, skip_imports: bool) -> FileTokens {
72    let scripts = crate::extract::extract_sfc_scripts(source);
73    let mut sections = Vec::new();
74
75    for script in &scripts {
76        let source_type = match (script.is_typescript, script.is_jsx) {
77            (true, true) => SourceType::tsx(),
78            (true, false) => SourceType::ts(),
79            (false, true) => SourceType::jsx(),
80            (false, false) => SourceType::mjs(),
81        };
82        sections.push(tokenize_js_section(
83            "js",
84            &script.body,
85            script.byte_offset,
86            source_type,
87            strip_types,
88            skip_imports,
89        ));
90    }
91
92    for region in crate::extract::extract_sfc_template_regions(source) {
93        sections.push(tokenize_lexical_section(
94            "markup",
95            &region.body,
96            region.byte_offset,
97        ));
98    }
99
100    for style in crate::extract::extract_sfc_styles(source) {
101        if style.src.is_none() {
102            sections.push(tokenize_lexical_section(
103                "style",
104                &style.body,
105                style.byte_offset,
106            ));
107        }
108    }
109
110    let (all_tokens, atomic_invocation_spans) = merge_sections(sections);
111
112    FileTokens {
113        tokens: all_tokens,
114        atomic_invocation_spans,
115        source: source.to_string(),
116        line_count: source.lines().count().max(1),
117    }
118}
119
120/// Tokenize Astro frontmatter between `---` delimiters.
121fn tokenize_astro(
122    source: &str,
123    strip_types: bool,
124    skip_imports: bool,
125    extract_fn: fn(&str) -> Option<fallow_extract::sfc::SfcScript>,
126) -> FileTokens {
127    if let Some(script) = extract_fn(source) {
128        let mut sections = vec![tokenize_js_section(
129            "js",
130            &script.body,
131            script.byte_offset,
132            SourceType::ts(),
133            strip_types,
134            skip_imports,
135        )];
136        for region in crate::extract::extract_astro_template_regions(source) {
137            sections.push(tokenize_lexical_section(
138                "markup",
139                &region.body,
140                region.byte_offset,
141            ));
142        }
143        for region in crate::extract::extract_astro_style_regions(source) {
144            sections.push(tokenize_lexical_section(
145                "style",
146                &region.body,
147                region.byte_offset,
148            ));
149        }
150        let (tokens, atomic_invocation_spans) = merge_sections(sections);
151        return FileTokens {
152            tokens,
153            atomic_invocation_spans,
154            source: source.to_string(),
155            line_count: source.lines().count().max(1),
156        };
157    }
158    let mut sections = Vec::new();
159    for region in crate::extract::extract_astro_template_regions(source) {
160        sections.push(tokenize_lexical_section(
161            "markup",
162            &region.body,
163            region.byte_offset,
164        ));
165    }
166    for region in crate::extract::extract_astro_style_regions(source) {
167        sections.push(tokenize_lexical_section(
168            "style",
169            &region.body,
170            region.byte_offset,
171        ));
172    }
173    let (tokens, atomic_invocation_spans) = merge_sections(sections);
174    FileTokens {
175        tokens,
176        atomic_invocation_spans,
177        source: source.to_string(),
178        line_count: source.lines().count().max(1),
179    }
180}
181
182/// Tokenize MDX import/export statements.
183fn tokenize_mdx(
184    source: &str,
185    strip_types: bool,
186    skip_imports: bool,
187    extract_fn: fn(&str) -> String,
188) -> FileTokens {
189    let statements = extract_fn(source);
190    if !statements.is_empty() {
191        let allocator = Allocator::default();
192        let parser_return = Parser::new(&allocator, &statements, SourceType::jsx()).parse();
193
194        let mut extractor = TokenExtractor::new(strip_types, skip_imports);
195        extractor.visit_program(&parser_return.program);
196
197        return FileTokens {
198            tokens: extractor.tokens,
199            atomic_invocation_spans: extractor.atomic_invocation_spans,
200            source: source.to_string(),
201            line_count: source.lines().count().max(1),
202        };
203    }
204    empty_tokens(source)
205}
206
207/// Return empty tokens for a source file that has no tokenized regions.
208fn empty_tokens(source: &str) -> FileTokens {
209    FileTokens {
210        tokens: Vec::new(),
211        atomic_invocation_spans: Vec::new(),
212        source: source.to_string(),
213        line_count: source.lines().count().max(1),
214    }
215}
216
217fn tokenize_style_source(source: &str) -> FileTokens {
218    let mut tokens = Vec::with_capacity(source.len().min(64));
219    tokens.push(lexical::boundary_token("style", 0));
220    tokens.extend(lexical::tokenize_lexical_region(source, 0, true));
221    FileTokens {
222        tokens,
223        atomic_invocation_spans: Vec::new(),
224        source: source.to_string(),
225        line_count: source.lines().count().max(1),
226    }
227}
228
229struct TokenSection {
230    name: &'static str,
231    start: usize,
232    tokens: Vec<SourceToken>,
233    atomic_invocation_spans: Vec<Span>,
234}
235
236fn tokenize_js_section(
237    name: &'static str,
238    source: &str,
239    byte_offset: usize,
240    source_type: SourceType,
241    strip_types: bool,
242    skip_imports: bool,
243) -> TokenSection {
244    let allocator = Allocator::default();
245    let parser_return = Parser::new(&allocator, source, source_type).parse();
246
247    let mut extractor = TokenExtractor::new(strip_types, skip_imports);
248    extractor.visit_program(&parser_return.program);
249
250    let offset = byte_offset as u32;
251    for token in &mut extractor.tokens {
252        token.span = Span::new(token.span.start + offset, token.span.end + offset);
253    }
254    for span in &mut extractor.atomic_invocation_spans {
255        *span = Span::new(span.start + offset, span.end + offset);
256    }
257
258    TokenSection {
259        name,
260        start: byte_offset,
261        tokens: extractor.tokens,
262        atomic_invocation_spans: extractor.atomic_invocation_spans,
263    }
264}
265
266fn tokenize_lexical_section(name: &'static str, source: &str, byte_offset: usize) -> TokenSection {
267    // CSS value canonicalization is scoped to the `"style"` section so markup
268    // tokens (and, transitively, JS) are provably untouched.
269    let css = name == "style";
270    TokenSection {
271        name,
272        start: byte_offset,
273        tokens: lexical::tokenize_lexical_region(source, byte_offset, css),
274        atomic_invocation_spans: Vec::new(),
275    }
276}
277
278fn merge_sections(mut sections: Vec<TokenSection>) -> (Vec<SourceToken>, Vec<Span>) {
279    sections.retain(|section| !section.tokens.is_empty());
280    sections.sort_by_key(|section| section.start);
281
282    let mut tokens = Vec::new();
283    let mut atomic_invocation_spans = Vec::new();
284
285    for section in sections {
286        tokens.push(lexical::boundary_token(section.name, section.start));
287        tokens.extend(section.tokens);
288        atomic_invocation_spans.extend(section.atomic_invocation_spans);
289    }
290
291    (tokens, atomic_invocation_spans)
292}
293
294/// Tokenize a standard JS/TS file, with JSX fallback for parse errors.
295fn tokenize_js_ts(path: &Path, source: &str, strip_types: bool, skip_imports: bool) -> FileTokens {
296    let source_type = match path.extension().and_then(|ext| ext.to_str()) {
297        Some("gts") => SourceType::ts(),
298        Some("gjs") => SourceType::mjs(),
299        _ => SourceType::from_path(path).unwrap_or_default(),
300    };
301    let stripped_glimmer_source = crate::extract::is_glimmer_file(path)
302        .then(|| crate::extract::strip_glimmer_templates(source))
303        .flatten();
304    let parser_source = stripped_glimmer_source.as_deref().unwrap_or(source);
305    let allocator = Allocator::default();
306    let parser_return = Parser::new(&allocator, parser_source, source_type).parse();
307
308    let mut extractor = TokenExtractor::new(strip_types, skip_imports);
309    extractor.visit_program(&parser_return.program);
310
311    if extractor.tokens.len() < 5 && source.len() > 100 && !source_type.is_jsx() {
312        let jsx_type = if source_type.is_typescript() {
313            SourceType::tsx()
314        } else {
315            SourceType::jsx()
316        };
317        let allocator2 = Allocator::default();
318        let retry_return = Parser::new(&allocator2, parser_source, jsx_type).parse();
319        let mut retry_extractor = TokenExtractor::new(strip_types, skip_imports);
320        retry_extractor.visit_program(&retry_return.program);
321        if retry_extractor.tokens.len() > extractor.tokens.len() {
322            extractor = retry_extractor;
323        }
324    }
325
326    FileTokens {
327        tokens: extractor.tokens,
328        atomic_invocation_spans: extractor.atomic_invocation_spans,
329        source: source.to_string(),
330        line_count: source.lines().count().max(1),
331    }
332}
333
334#[cfg(test)]
335mod tests;