Skip to main content

fallow_core/duplicates/tokenize/
mod.rs

1use std::path::Path;
2
3use oxc_allocator::Allocator;
4use oxc_ast_visit::Visit;
5use oxc_parser::Parser;
6use oxc_span::{SourceType, Span};
7
8// Re-export all public types so existing `use ... tokenize::X` paths continue to work.
9pub use super::token_types::{
10    FileTokens, KeywordType, OperatorType, PunctuationType, SourceToken, TokenKind,
11};
12use super::token_visitor::TokenExtractor;
13
14/// Tokenize a source file into a sequence of normalized tokens.
15///
16/// For Vue/Svelte SFC files, extracts `<script>` blocks first and tokenizes
17/// their content, mirroring the main analysis pipeline's SFC handling.
18/// For Astro files, extracts frontmatter. For MDX files, extracts import/export statements.
19///
20/// When `strip_types` is true, TypeScript type annotations, interfaces, and type
21/// aliases are stripped from the token stream. This enables cross-language clone
22/// detection between `.ts` and `.js` files.
23///
24/// When `skip_imports` is true, ES `import` declarations are excluded from the
25/// token stream to reduce noise from sorted import blocks.
26#[must_use]
27pub fn tokenize_file(path: &Path, source: &str, skip_imports: bool) -> FileTokens {
28    tokenize_file_inner(path, source, false, skip_imports)
29}
30
31/// Tokenize a source file with optional type stripping for cross-language detection.
32#[must_use]
33pub fn tokenize_file_cross_language(
34    path: &Path,
35    source: &str,
36    strip_types: bool,
37    skip_imports: bool,
38) -> FileTokens {
39    tokenize_file_inner(path, source, strip_types, skip_imports)
40}
41
42fn tokenize_file_inner(
43    path: &Path,
44    source: &str,
45    strip_types: bool,
46    skip_imports: bool,
47) -> FileTokens {
48    use crate::extract::{extract_astro_frontmatter, extract_mdx_statements, is_sfc_file};
49
50    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
51
52    if is_sfc_file(path) {
53        return tokenize_sfc(source, strip_types, skip_imports);
54    }
55    if ext == "astro" {
56        return tokenize_astro(source, strip_types, skip_imports, extract_astro_frontmatter);
57    }
58    if ext == "mdx" {
59        return tokenize_mdx(source, strip_types, skip_imports, extract_mdx_statements);
60    }
61    if ext == "css" || ext == "scss" {
62        return empty_tokens(source);
63    }
64
65    tokenize_js_ts(path, source, strip_types, skip_imports)
66}
67
68/// Tokenize Vue/Svelte SFC `<script>` blocks.
69#[expect(
70    clippy::cast_possible_truncation,
71    reason = "byte offsets are bounded by source size"
72)]
73fn tokenize_sfc(source: &str, strip_types: bool, skip_imports: bool) -> FileTokens {
74    let scripts = crate::extract::extract_sfc_scripts(source);
75    let mut all_tokens = Vec::new();
76    let mut atomic_invocation_spans = Vec::new();
77
78    for script in &scripts {
79        let source_type = match (script.is_typescript, script.is_jsx) {
80            (true, true) => SourceType::tsx(),
81            (true, false) => SourceType::ts(),
82            (false, true) => SourceType::jsx(),
83            (false, false) => SourceType::mjs(),
84        };
85        let allocator = Allocator::default();
86        let parser_return = Parser::new(&allocator, &script.body, source_type).parse();
87
88        let mut extractor = TokenExtractor::new(strip_types, skip_imports);
89        extractor.visit_program(&parser_return.program);
90
91        let offset = script.byte_offset as u32;
92        for token in &mut extractor.tokens {
93            token.span = Span::new(token.span.start + offset, token.span.end + offset);
94        }
95        for span in &mut extractor.atomic_invocation_spans {
96            *span = Span::new(span.start + offset, span.end + offset);
97        }
98        all_tokens.extend(extractor.tokens);
99        atomic_invocation_spans.extend(extractor.atomic_invocation_spans);
100    }
101
102    FileTokens {
103        tokens: all_tokens,
104        atomic_invocation_spans,
105        source: source.to_string(),
106        line_count: source.lines().count().max(1),
107    }
108}
109
110/// Tokenize Astro frontmatter between `---` delimiters.
111#[expect(
112    clippy::cast_possible_truncation,
113    reason = "byte offsets are bounded by source size"
114)]
115fn tokenize_astro(
116    source: &str,
117    strip_types: bool,
118    skip_imports: bool,
119    extract_fn: fn(&str) -> Option<fallow_extract::sfc::SfcScript>,
120) -> FileTokens {
121    if let Some(script) = extract_fn(source) {
122        let allocator = Allocator::default();
123        let parser_return = Parser::new(&allocator, &script.body, SourceType::ts()).parse();
124
125        let mut extractor = TokenExtractor::new(strip_types, skip_imports);
126        extractor.visit_program(&parser_return.program);
127
128        let offset = script.byte_offset as u32;
129        for token in &mut extractor.tokens {
130            token.span = Span::new(token.span.start + offset, token.span.end + offset);
131        }
132        for span in &mut extractor.atomic_invocation_spans {
133            *span = Span::new(span.start + offset, span.end + offset);
134        }
135
136        return FileTokens {
137            tokens: extractor.tokens,
138            atomic_invocation_spans: extractor.atomic_invocation_spans,
139            source: source.to_string(),
140            line_count: source.lines().count().max(1),
141        };
142    }
143    empty_tokens(source)
144}
145
146/// Tokenize MDX import/export statements.
147fn tokenize_mdx(
148    source: &str,
149    strip_types: bool,
150    skip_imports: bool,
151    extract_fn: fn(&str) -> String,
152) -> FileTokens {
153    let statements = extract_fn(source);
154    if !statements.is_empty() {
155        let allocator = Allocator::default();
156        let parser_return = Parser::new(&allocator, &statements, SourceType::jsx()).parse();
157
158        let mut extractor = TokenExtractor::new(strip_types, skip_imports);
159        extractor.visit_program(&parser_return.program);
160
161        return FileTokens {
162            tokens: extractor.tokens,
163            atomic_invocation_spans: extractor.atomic_invocation_spans,
164            source: source.to_string(),
165            line_count: source.lines().count().max(1),
166        };
167    }
168    empty_tokens(source)
169}
170
171/// Return empty tokens for a source file (CSS, no-frontmatter Astro, empty MDX).
172fn empty_tokens(source: &str) -> FileTokens {
173    FileTokens {
174        tokens: Vec::new(),
175        atomic_invocation_spans: Vec::new(),
176        source: source.to_string(),
177        line_count: source.lines().count().max(1),
178    }
179}
180
181/// Tokenize a standard JS/TS file, with JSX fallback for parse errors.
182fn tokenize_js_ts(path: &Path, source: &str, strip_types: bool, skip_imports: bool) -> FileTokens {
183    let source_type = match path.extension().and_then(|ext| ext.to_str()) {
184        Some("gts") => SourceType::ts(),
185        Some("gjs") => SourceType::mjs(),
186        _ => SourceType::from_path(path).unwrap_or_default(),
187    };
188    let stripped_glimmer_source = crate::extract::is_glimmer_file(path)
189        .then(|| crate::extract::strip_glimmer_templates(source))
190        .flatten();
191    let parser_source = stripped_glimmer_source.as_deref().unwrap_or(source);
192    let allocator = Allocator::default();
193    let parser_return = Parser::new(&allocator, parser_source, source_type).parse();
194
195    let mut extractor = TokenExtractor::new(strip_types, skip_imports);
196    extractor.visit_program(&parser_return.program);
197
198    // If parsing produced very few tokens relative to source size (likely parse errors
199    // from Flow types or JSX in .js files), retry with JSX/TSX source type as a fallback.
200    if extractor.tokens.len() < 5 && source.len() > 100 && !source_type.is_jsx() {
201        let jsx_type = if source_type.is_typescript() {
202            SourceType::tsx()
203        } else {
204            SourceType::jsx()
205        };
206        let allocator2 = Allocator::default();
207        let retry_return = Parser::new(&allocator2, parser_source, jsx_type).parse();
208        let mut retry_extractor = TokenExtractor::new(strip_types, skip_imports);
209        retry_extractor.visit_program(&retry_return.program);
210        if retry_extractor.tokens.len() > extractor.tokens.len() {
211            extractor = retry_extractor;
212        }
213    }
214
215    FileTokens {
216        tokens: extractor.tokens,
217        atomic_invocation_spans: extractor.atomic_invocation_spans,
218        source: source.to_string(),
219        line_count: source.lines().count().max(1),
220    }
221}
222
223#[cfg(test)]
224mod tests;