
fallow_core/duplicates/tokenize/mod.rs

use std::path::Path;

use oxc_allocator::Allocator;
use oxc_ast_visit::Visit;
use oxc_parser::Parser;
use oxc_span::{SourceType, Span};

// Re-export all public types so existing `use ... tokenize::X` paths continue to work.
pub use super::token_types::{
    FileTokens, KeywordType, OperatorType, PunctuationType, SourceToken, TokenKind,
};
use super::token_visitor::TokenExtractor;

/// Tokenize a source file into a sequence of normalized tokens.
///
/// For Vue/Svelte SFC files, extracts `<script>` blocks first and tokenizes
/// their content, mirroring the main analysis pipeline's SFC handling.
/// For Astro files, extracts frontmatter. For MDX files, extracts import/export statements.
///
/// This entry point never strips TypeScript types (it calls the shared tokenizer
/// with `strip_types = false`); use [`tokenize_file_cross_language`] when tokens
/// should be comparable across `.ts` and `.js` files.
///
/// When `skip_imports` is true, ES `import` declarations are excluded from the
/// token stream to reduce noise from sorted import blocks.
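///
/// A minimal usage sketch (illustrative path and source text, not taken from this
/// crate's tests; marked `ignore` so it is not compiled as a doctest):
///
/// ```ignore
/// use std::path::Path;
///
/// let source = "import { helper } from './helper';\nexport const answer = helper() + 1;\n";
/// let tokens = tokenize_file(Path::new("example.ts"), source, /* skip_imports */ true);
/// // The import declaration is skipped, so only the `export const ...` tokens remain.
/// assert_eq!(tokens.line_count, 2);
/// ```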
#[must_use]
pub fn tokenize_file(path: &Path, source: &str, skip_imports: bool) -> FileTokens {
    tokenize_file_inner(path, source, false, skip_imports)
}

/// Tokenize a source file with optional type stripping for cross-language detection.
///
/// When `strip_types` is true, TypeScript type annotations, interfaces, and type
/// aliases are stripped from the token stream. This enables cross-language clone
/// detection between `.ts` and `.js` files.
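///
/// Sketch of the intended effect (hand-written inputs, marked `ignore`; whether the
/// two token streams end up identical depends on `TokenExtractor`'s normalization,
/// so treat the assertion as an assumption rather than a guarantee):
///
/// ```ignore
/// use std::path::Path;
///
/// let ts = tokenize_file_cross_language(Path::new("a.ts"), "const n: number = 1;", true, false);
/// let js = tokenize_file_cross_language(Path::new("a.js"), "const n = 1;", true, false);
/// // With the annotation stripped, both files should reduce to the same token count.
/// assert_eq!(ts.tokens.len(), js.tokens.len());
/// ```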
#[must_use]
pub fn tokenize_file_cross_language(
    path: &Path,
    source: &str,
    strip_types: bool,
    skip_imports: bool,
) -> FileTokens {
    tokenize_file_inner(path, source, strip_types, skip_imports)
}

fn tokenize_file_inner(
    path: &Path,
    source: &str,
    strip_types: bool,
    skip_imports: bool,
) -> FileTokens {
    use crate::extract::{extract_astro_frontmatter, extract_mdx_statements, is_sfc_file};

    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");

    if is_sfc_file(path) {
        return tokenize_sfc(source, strip_types, skip_imports);
    }
    if ext == "astro" {
        return tokenize_astro(source, strip_types, skip_imports, extract_astro_frontmatter);
    }
    if ext == "mdx" {
        return tokenize_mdx(source, strip_types, skip_imports, extract_mdx_statements);
    }
    if ext == "css" || ext == "scss" {
        return empty_tokens(source);
    }

    tokenize_js_ts(path, source, strip_types, skip_imports)
}

/// Tokenize Vue/Svelte SFC `<script>` blocks.
#[expect(
    clippy::cast_possible_truncation,
    reason = "byte offsets are bounded by source size"
)]
fn tokenize_sfc(source: &str, strip_types: bool, skip_imports: bool) -> FileTokens {
    let scripts = crate::extract::extract_sfc_scripts(source);
    let mut all_tokens = Vec::new();

    for script in &scripts {
        let source_type = match (script.is_typescript, script.is_jsx) {
            (true, true) => SourceType::tsx(),
            (true, false) => SourceType::ts(),
            (false, true) => SourceType::jsx(),
            (false, false) => SourceType::mjs(),
        };
        let allocator = Allocator::default();
        let parser_return = Parser::new(&allocator, &script.body, source_type).parse();

        let mut extractor = TokenExtractor::new(strip_types, skip_imports);
        extractor.visit_program(&parser_return.program);

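        // Rebase token spans from script-local byte offsets to positions
        // in the full SFC source.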
        let offset = script.byte_offset as u32;
        for token in &mut extractor.tokens {
            token.span = Span::new(token.span.start + offset, token.span.end + offset);
        }
        all_tokens.extend(extractor.tokens);
    }

    FileTokens {
        tokens: all_tokens,
        source: source.to_string(),
        line_count: source.lines().count().max(1),
    }
}

/// Tokenize Astro frontmatter between `---` delimiters.
#[expect(
    clippy::cast_possible_truncation,
    reason = "byte offsets are bounded by source size"
)]
fn tokenize_astro(
    source: &str,
    strip_types: bool,
    skip_imports: bool,
    extract_fn: fn(&str) -> Option<fallow_extract::sfc::SfcScript>,
) -> FileTokens {
    if let Some(script) = extract_fn(source) {
        let allocator = Allocator::default();
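        // Astro frontmatter may contain TypeScript, so parse it as TS.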
        let parser_return = Parser::new(&allocator, &script.body, SourceType::ts()).parse();

        let mut extractor = TokenExtractor::new(strip_types, skip_imports);
        extractor.visit_program(&parser_return.program);

        let offset = script.byte_offset as u32;
        for token in &mut extractor.tokens {
            token.span = Span::new(token.span.start + offset, token.span.end + offset);
        }

        return FileTokens {
            tokens: extractor.tokens,
            source: source.to_string(),
            line_count: source.lines().count().max(1),
        };
    }
    empty_tokens(source)
}

/// Tokenize MDX import/export statements.
fn tokenize_mdx(
    source: &str,
    strip_types: bool,
    skip_imports: bool,
    extract_fn: fn(&str) -> String,
) -> FileTokens {
    let statements = extract_fn(source);
    if !statements.is_empty() {
        let allocator = Allocator::default();
        let parser_return = Parser::new(&allocator, &statements, SourceType::jsx()).parse();

        let mut extractor = TokenExtractor::new(strip_types, skip_imports);
        extractor.visit_program(&parser_return.program);

        return FileTokens {
            tokens: extractor.tokens,
            source: source.to_string(),
            line_count: source.lines().count().max(1),
        };
    }
    empty_tokens(source)
}

/// Return empty tokens for a source file (CSS, no-frontmatter Astro, empty MDX).
fn empty_tokens(source: &str) -> FileTokens {
    FileTokens {
        tokens: Vec::new(),
        source: source.to_string(),
        line_count: source.lines().count().max(1),
    }
}

/// Tokenize a standard JS/TS file, with JSX fallback for parse errors.
fn tokenize_js_ts(path: &Path, source: &str, strip_types: bool, skip_imports: bool) -> FileTokens {
    let source_type = SourceType::from_path(path).unwrap_or_default();
    let allocator = Allocator::default();
    let parser_return = Parser::new(&allocator, source, source_type).parse();

    let mut extractor = TokenExtractor::new(strip_types, skip_imports);
    extractor.visit_program(&parser_return.program);

    // If parsing produced very few tokens relative to source size (likely parse errors
    // from Flow types or JSX in .js files), retry with JSX/TSX source type as a fallback.
    if extractor.tokens.len() < 5 && source.len() > 100 && !source_type.is_jsx() {
        let jsx_type = if source_type.is_typescript() {
            SourceType::tsx()
        } else {
            SourceType::jsx()
        };
        let allocator2 = Allocator::default();
        let retry_return = Parser::new(&allocator2, source, jsx_type).parse();
        let mut retry_extractor = TokenExtractor::new(strip_types, skip_imports);
        retry_extractor.visit_program(&retry_return.program);
        if retry_extractor.tokens.len() > extractor.tokens.len() {
            extractor = retry_extractor;
        }
    }

    FileTokens {
        tokens: extractor.tokens,
        source: source.to_string(),
        line_count: source.lines().count().max(1),
    }
}

#[cfg(test)]
mod tests;