// fallow_core/duplicates/tokenize/mod.rs

1use std::path::Path;
2
3use oxc_allocator::Allocator;
4use oxc_ast_visit::Visit;
5use oxc_parser::Parser;
6use oxc_span::{SourceType, Span};
7
8// Re-export all public types so existing `use ... tokenize::X` paths continue to work.
9pub use super::token_types::{
10    FileTokens, KeywordType, OperatorType, PunctuationType, SourceToken, TokenKind,
11};
12use super::token_visitor::TokenExtractor;
13
14/// Tokenize a source file into a sequence of normalized tokens.
15///
16/// For Vue/Svelte SFC files, extracts `<script>` blocks first and tokenizes
17/// their content, mirroring the main analysis pipeline's SFC handling.
18/// For Astro files, extracts frontmatter. For MDX files, extracts import/export statements.
19///
20/// When `strip_types` is true, TypeScript type annotations, interfaces, and type
21/// aliases are stripped from the token stream. This enables cross-language clone
22/// detection between `.ts` and `.js` files.
23#[must_use]
24pub fn tokenize_file(path: &Path, source: &str) -> FileTokens {
25    tokenize_file_inner(path, source, false)
26}
27
/// Tokenize a source file with optional type stripping for cross-language detection.
///
/// When `strip_types` is true, TypeScript type annotations, interfaces, and type
/// aliases are stripped from the token stream, enabling cross-language clone
/// detection between `.ts` and `.js` files. When false, behaves exactly like
/// [`tokenize_file`].
#[must_use]
pub fn tokenize_file_cross_language(path: &Path, source: &str, strip_types: bool) -> FileTokens {
    tokenize_file_inner(path, source, strip_types)
}
33
34fn tokenize_file_inner(path: &Path, source: &str, strip_types: bool) -> FileTokens {
35    use crate::extract::{extract_astro_frontmatter, extract_mdx_statements, is_sfc_file};
36
37    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
38
39    if is_sfc_file(path) {
40        return tokenize_sfc(source, strip_types);
41    }
42    if ext == "astro" {
43        return tokenize_astro(source, strip_types, extract_astro_frontmatter);
44    }
45    if ext == "mdx" {
46        return tokenize_mdx(source, strip_types, extract_mdx_statements);
47    }
48    if ext == "css" || ext == "scss" {
49        return empty_tokens(source);
50    }
51
52    tokenize_js_ts(path, source, strip_types)
53}
54
55/// Tokenize Vue/Svelte SFC `<script>` blocks.
56#[expect(
57    clippy::cast_possible_truncation,
58    reason = "byte offsets are bounded by source size"
59)]
60fn tokenize_sfc(source: &str, strip_types: bool) -> FileTokens {
61    let scripts = crate::extract::extract_sfc_scripts(source);
62    let mut all_tokens = Vec::new();
63
64    for script in &scripts {
65        let source_type = match (script.is_typescript, script.is_jsx) {
66            (true, true) => SourceType::tsx(),
67            (true, false) => SourceType::ts(),
68            (false, true) => SourceType::jsx(),
69            (false, false) => SourceType::mjs(),
70        };
71        let allocator = Allocator::default();
72        let parser_return = Parser::new(&allocator, &script.body, source_type).parse();
73
74        let mut extractor = TokenExtractor::with_strip_types(strip_types);
75        extractor.visit_program(&parser_return.program);
76
77        let offset = script.byte_offset as u32;
78        for token in &mut extractor.tokens {
79            token.span = Span::new(token.span.start + offset, token.span.end + offset);
80        }
81        all_tokens.extend(extractor.tokens);
82    }
83
84    FileTokens {
85        tokens: all_tokens,
86        source: source.to_string(),
87        line_count: source.lines().count().max(1),
88    }
89}
90
91/// Tokenize Astro frontmatter between `---` delimiters.
92#[expect(
93    clippy::cast_possible_truncation,
94    reason = "byte offsets are bounded by source size"
95)]
96fn tokenize_astro(
97    source: &str,
98    strip_types: bool,
99    extract_fn: fn(&str) -> Option<fallow_extract::sfc::SfcScript>,
100) -> FileTokens {
101    if let Some(script) = extract_fn(source) {
102        let allocator = Allocator::default();
103        let parser_return = Parser::new(&allocator, &script.body, SourceType::ts()).parse();
104
105        let mut extractor = TokenExtractor::with_strip_types(strip_types);
106        extractor.visit_program(&parser_return.program);
107
108        let offset = script.byte_offset as u32;
109        for token in &mut extractor.tokens {
110            token.span = Span::new(token.span.start + offset, token.span.end + offset);
111        }
112
113        return FileTokens {
114            tokens: extractor.tokens,
115            source: source.to_string(),
116            line_count: source.lines().count().max(1),
117        };
118    }
119    empty_tokens(source)
120}
121
122/// Tokenize MDX import/export statements.
123fn tokenize_mdx(source: &str, strip_types: bool, extract_fn: fn(&str) -> String) -> FileTokens {
124    let statements = extract_fn(source);
125    if !statements.is_empty() {
126        let allocator = Allocator::default();
127        let parser_return = Parser::new(&allocator, &statements, SourceType::jsx()).parse();
128
129        let mut extractor = TokenExtractor::with_strip_types(strip_types);
130        extractor.visit_program(&parser_return.program);
131
132        return FileTokens {
133            tokens: extractor.tokens,
134            source: source.to_string(),
135            line_count: source.lines().count().max(1),
136        };
137    }
138    empty_tokens(source)
139}
140
141/// Return empty tokens for a source file (CSS, no-frontmatter Astro, empty MDX).
142fn empty_tokens(source: &str) -> FileTokens {
143    FileTokens {
144        tokens: Vec::new(),
145        source: source.to_string(),
146        line_count: source.lines().count().max(1),
147    }
148}
149
150/// Tokenize a standard JS/TS file, with JSX fallback for parse errors.
151fn tokenize_js_ts(path: &Path, source: &str, strip_types: bool) -> FileTokens {
152    let source_type = SourceType::from_path(path).unwrap_or_default();
153    let allocator = Allocator::default();
154    let parser_return = Parser::new(&allocator, source, source_type).parse();
155
156    let mut extractor = TokenExtractor::with_strip_types(strip_types);
157    extractor.visit_program(&parser_return.program);
158
159    // If parsing produced very few tokens relative to source size (likely parse errors
160    // from Flow types or JSX in .js files), retry with JSX/TSX source type as a fallback.
161    if extractor.tokens.len() < 5 && source.len() > 100 && !source_type.is_jsx() {
162        let jsx_type = if source_type.is_typescript() {
163            SourceType::tsx()
164        } else {
165            SourceType::jsx()
166        };
167        let allocator2 = Allocator::default();
168        let retry_return = Parser::new(&allocator2, source, jsx_type).parse();
169        let mut retry_extractor = TokenExtractor::with_strip_types(strip_types);
170        retry_extractor.visit_program(&retry_return.program);
171        if retry_extractor.tokens.len() > extractor.tokens.len() {
172            extractor = retry_extractor;
173        }
174    }
175
176    FileTokens {
177        tokens: extractor.tokens,
178        source: source.to_string(),
179        line_count: source.lines().count().max(1),
180    }
181}
182
183#[cfg(test)]
184mod tests;