Skip to main content

fallow_core/duplicates/tokenize/
mod.rs

1use std::path::Path;
2
3use oxc_allocator::Allocator;
4use oxc_ast_visit::Visit;
5use oxc_parser::Parser;
6use oxc_span::{SourceType, Span};
7
8pub use super::token_types::{
9    FileTokens, KeywordType, OperatorType, PunctuationType, SourceToken, TokenKind,
10};
11use super::token_visitor::TokenExtractor;
12
13/// Tokenize a source file into a sequence of normalized tokens.
14///
15/// For Vue/Svelte SFC files, extracts `<script>` blocks first and tokenizes
16/// their content, mirroring the main analysis pipeline's SFC handling.
17/// For Astro files, extracts frontmatter. For MDX files, extracts import/export statements.
18///
19/// When `strip_types` is true, TypeScript type annotations, interfaces, and type
20/// aliases are stripped from the token stream. This enables cross-language clone
21/// detection between `.ts` and `.js` files.
22///
23/// When `skip_imports` is true, ES `import` declarations are excluded from the
24/// token stream to reduce noise from sorted import blocks.
25#[must_use]
26pub fn tokenize_file(path: &Path, source: &str, skip_imports: bool) -> FileTokens {
27    tokenize_file_inner(path, source, false, skip_imports)
28}
29
30/// Tokenize a source file with optional type stripping for cross-language detection.
31#[must_use]
32pub fn tokenize_file_cross_language(
33    path: &Path,
34    source: &str,
35    strip_types: bool,
36    skip_imports: bool,
37) -> FileTokens {
38    tokenize_file_inner(path, source, strip_types, skip_imports)
39}
40
41fn tokenize_file_inner(
42    path: &Path,
43    source: &str,
44    strip_types: bool,
45    skip_imports: bool,
46) -> FileTokens {
47    use crate::extract::{extract_astro_frontmatter, extract_mdx_statements, is_sfc_file};
48
49    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
50
51    if is_sfc_file(path) {
52        return tokenize_sfc(source, strip_types, skip_imports);
53    }
54    if ext == "astro" {
55        return tokenize_astro(source, strip_types, skip_imports, extract_astro_frontmatter);
56    }
57    if ext == "mdx" {
58        return tokenize_mdx(source, strip_types, skip_imports, extract_mdx_statements);
59    }
60    if ext == "css" || ext == "scss" {
61        return empty_tokens(source);
62    }
63
64    tokenize_js_ts(path, source, strip_types, skip_imports)
65}
66
67/// Tokenize Vue/Svelte SFC `<script>` blocks.
68#[expect(
69    clippy::cast_possible_truncation,
70    reason = "byte offsets are bounded by source size"
71)]
72fn tokenize_sfc(source: &str, strip_types: bool, skip_imports: bool) -> FileTokens {
73    let scripts = crate::extract::extract_sfc_scripts(source);
74    let mut all_tokens = Vec::new();
75    let mut atomic_invocation_spans = Vec::new();
76
77    for script in &scripts {
78        let source_type = match (script.is_typescript, script.is_jsx) {
79            (true, true) => SourceType::tsx(),
80            (true, false) => SourceType::ts(),
81            (false, true) => SourceType::jsx(),
82            (false, false) => SourceType::mjs(),
83        };
84        let allocator = Allocator::default();
85        let parser_return = Parser::new(&allocator, &script.body, source_type).parse();
86
87        let mut extractor = TokenExtractor::new(strip_types, skip_imports);
88        extractor.visit_program(&parser_return.program);
89
90        let offset = script.byte_offset as u32;
91        for token in &mut extractor.tokens {
92            token.span = Span::new(token.span.start + offset, token.span.end + offset);
93        }
94        for span in &mut extractor.atomic_invocation_spans {
95            *span = Span::new(span.start + offset, span.end + offset);
96        }
97        all_tokens.extend(extractor.tokens);
98        atomic_invocation_spans.extend(extractor.atomic_invocation_spans);
99    }
100
101    FileTokens {
102        tokens: all_tokens,
103        atomic_invocation_spans,
104        source: source.to_string(),
105        line_count: source.lines().count().max(1),
106    }
107}
108
109/// Tokenize Astro frontmatter between `---` delimiters.
110#[expect(
111    clippy::cast_possible_truncation,
112    reason = "byte offsets are bounded by source size"
113)]
114fn tokenize_astro(
115    source: &str,
116    strip_types: bool,
117    skip_imports: bool,
118    extract_fn: fn(&str) -> Option<fallow_extract::sfc::SfcScript>,
119) -> FileTokens {
120    if let Some(script) = extract_fn(source) {
121        let allocator = Allocator::default();
122        let parser_return = Parser::new(&allocator, &script.body, SourceType::ts()).parse();
123
124        let mut extractor = TokenExtractor::new(strip_types, skip_imports);
125        extractor.visit_program(&parser_return.program);
126
127        let offset = script.byte_offset as u32;
128        for token in &mut extractor.tokens {
129            token.span = Span::new(token.span.start + offset, token.span.end + offset);
130        }
131        for span in &mut extractor.atomic_invocation_spans {
132            *span = Span::new(span.start + offset, span.end + offset);
133        }
134
135        return FileTokens {
136            tokens: extractor.tokens,
137            atomic_invocation_spans: extractor.atomic_invocation_spans,
138            source: source.to_string(),
139            line_count: source.lines().count().max(1),
140        };
141    }
142    empty_tokens(source)
143}
144
145/// Tokenize MDX import/export statements.
146fn tokenize_mdx(
147    source: &str,
148    strip_types: bool,
149    skip_imports: bool,
150    extract_fn: fn(&str) -> String,
151) -> FileTokens {
152    let statements = extract_fn(source);
153    if !statements.is_empty() {
154        let allocator = Allocator::default();
155        let parser_return = Parser::new(&allocator, &statements, SourceType::jsx()).parse();
156
157        let mut extractor = TokenExtractor::new(strip_types, skip_imports);
158        extractor.visit_program(&parser_return.program);
159
160        return FileTokens {
161            tokens: extractor.tokens,
162            atomic_invocation_spans: extractor.atomic_invocation_spans,
163            source: source.to_string(),
164            line_count: source.lines().count().max(1),
165        };
166    }
167    empty_tokens(source)
168}
169
170/// Return empty tokens for a source file (CSS, no-frontmatter Astro, empty MDX).
171fn empty_tokens(source: &str) -> FileTokens {
172    FileTokens {
173        tokens: Vec::new(),
174        atomic_invocation_spans: Vec::new(),
175        source: source.to_string(),
176        line_count: source.lines().count().max(1),
177    }
178}
179
180/// Tokenize a standard JS/TS file, with JSX fallback for parse errors.
181fn tokenize_js_ts(path: &Path, source: &str, strip_types: bool, skip_imports: bool) -> FileTokens {
182    let source_type = match path.extension().and_then(|ext| ext.to_str()) {
183        Some("gts") => SourceType::ts(),
184        Some("gjs") => SourceType::mjs(),
185        _ => SourceType::from_path(path).unwrap_or_default(),
186    };
187    let stripped_glimmer_source = crate::extract::is_glimmer_file(path)
188        .then(|| crate::extract::strip_glimmer_templates(source))
189        .flatten();
190    let parser_source = stripped_glimmer_source.as_deref().unwrap_or(source);
191    let allocator = Allocator::default();
192    let parser_return = Parser::new(&allocator, parser_source, source_type).parse();
193
194    let mut extractor = TokenExtractor::new(strip_types, skip_imports);
195    extractor.visit_program(&parser_return.program);
196
197    if extractor.tokens.len() < 5 && source.len() > 100 && !source_type.is_jsx() {
198        let jsx_type = if source_type.is_typescript() {
199            SourceType::tsx()
200        } else {
201            SourceType::jsx()
202        };
203        let allocator2 = Allocator::default();
204        let retry_return = Parser::new(&allocator2, parser_source, jsx_type).parse();
205        let mut retry_extractor = TokenExtractor::new(strip_types, skip_imports);
206        retry_extractor.visit_program(&retry_return.program);
207        if retry_extractor.tokens.len() > extractor.tokens.len() {
208            extractor = retry_extractor;
209        }
210    }
211
212    FileTokens {
213        tokens: extractor.tokens,
214        atomic_invocation_spans: extractor.atomic_invocation_spans,
215        source: source.to_string(),
216        line_count: source.lines().count().max(1),
217    }
218}
219
220#[cfg(test)]
221mod tests;