Skip to main content

fallow_core/duplicates/tokenize/
mod.rs

1use std::path::Path;
2
3use oxc_allocator::Allocator;
4use oxc_ast_visit::Visit;
5use oxc_parser::Parser;
6use oxc_span::{SourceType, Span};
7
8// Re-export all public types so existing `use ... tokenize::X` paths continue to work.
9pub use super::token_types::{
10    FileTokens, KeywordType, OperatorType, PunctuationType, SourceToken, TokenKind,
11};
12use super::token_visitor::TokenExtractor;
13
/// Tokenize a source file into a sequence of normalized tokens.
///
/// For Vue/Svelte SFC files, extracts `<script>` blocks first and tokenizes
/// their content, mirroring the main analysis pipeline's SFC handling.
/// For Astro files, extracts frontmatter. For MDX files, extracts import/export statements.
/// CSS/SCSS files are never tokenized and yield an empty token list.
///
/// This entry point always passes `strip_types = false` to `tokenize_file_inner`, so no
/// type stripping is requested. Use [`tokenize_file_cross_language`] with
/// `strip_types = true` to strip TypeScript type annotations, interfaces, and type
/// aliases from the token stream, which enables cross-language clone detection between
/// `.ts` and `.js` files.
pub fn tokenize_file(path: &Path, source: &str) -> FileTokens {
    tokenize_file_inner(path, source, false)
}
26
/// Tokenize a source file with optional type stripping for cross-language detection.
///
/// Behaves like [`tokenize_file`], except that `strip_types` is forwarded to the token
/// extractor instead of being fixed to `false`. When `strip_types` is true, TypeScript
/// type annotations, interfaces, and type aliases are stripped from the token stream so
/// that `.ts` and `.js` files can be compared.
pub fn tokenize_file_cross_language(path: &Path, source: &str, strip_types: bool) -> FileTokens {
    tokenize_file_inner(path, source, strip_types)
}
31
32fn tokenize_file_inner(path: &Path, source: &str, strip_types: bool) -> FileTokens {
33    use crate::extract::{
34        extract_astro_frontmatter, extract_mdx_statements, extract_sfc_scripts, is_sfc_file,
35    };
36
37    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
38
39    // For Vue/Svelte SFCs, extract and tokenize `<script>` blocks.
40    if is_sfc_file(path) {
41        let scripts = extract_sfc_scripts(source);
42        let mut all_tokens = Vec::new();
43
44        for script in &scripts {
45            let source_type = match (script.is_typescript, script.is_jsx) {
46                (true, true) => SourceType::tsx(),
47                (true, false) => SourceType::ts(),
48                (false, true) => SourceType::jsx(),
49                (false, false) => SourceType::mjs(),
50            };
51            let allocator = Allocator::default();
52            let parser_return = Parser::new(&allocator, &script.body, source_type).parse();
53
54            let mut extractor = TokenExtractor::with_strip_types(strip_types);
55            extractor.visit_program(&parser_return.program);
56
57            // Adjust token spans to reference positions in the full SFC source
58            // rather than the extracted script block.
59            let offset = script.byte_offset as u32;
60            for token in &mut extractor.tokens {
61                token.span = Span::new(token.span.start + offset, token.span.end + offset);
62            }
63            all_tokens.extend(extractor.tokens);
64        }
65
66        let line_count = source.lines().count().max(1);
67        return FileTokens {
68            tokens: all_tokens,
69            source: source.to_string(),
70            line_count,
71        };
72    }
73
74    // For Astro files, extract and tokenize frontmatter.
75    if ext == "astro" {
76        if let Some(script) = extract_astro_frontmatter(source) {
77            let allocator = Allocator::default();
78            let parser_return = Parser::new(&allocator, &script.body, SourceType::ts()).parse();
79
80            let mut extractor = TokenExtractor::with_strip_types(strip_types);
81            extractor.visit_program(&parser_return.program);
82
83            let offset = script.byte_offset as u32;
84            for token in &mut extractor.tokens {
85                token.span = Span::new(token.span.start + offset, token.span.end + offset);
86            }
87
88            let line_count = source.lines().count().max(1);
89            return FileTokens {
90                tokens: extractor.tokens,
91                source: source.to_string(),
92                line_count,
93            };
94        }
95        // No frontmatter — return empty tokens.
96        let line_count = source.lines().count().max(1);
97        return FileTokens {
98            tokens: Vec::new(),
99            source: source.to_string(),
100            line_count,
101        };
102    }
103
104    // For MDX files, extract and tokenize import/export statements.
105    if ext == "mdx" {
106        let statements = extract_mdx_statements(source);
107        if !statements.is_empty() {
108            let allocator = Allocator::default();
109            let parser_return = Parser::new(&allocator, &statements, SourceType::jsx()).parse();
110
111            let mut extractor = TokenExtractor::with_strip_types(strip_types);
112            extractor.visit_program(&parser_return.program);
113
114            let line_count = source.lines().count().max(1);
115            return FileTokens {
116                tokens: extractor.tokens,
117                source: source.to_string(),
118                line_count,
119            };
120        }
121        let line_count = source.lines().count().max(1);
122        return FileTokens {
123            tokens: Vec::new(),
124            source: source.to_string(),
125            line_count,
126        };
127    }
128
129    // CSS/SCSS files are not JS/TS — skip tokenization for duplication detection.
130    if ext == "css" || ext == "scss" {
131        let line_count = source.lines().count().max(1);
132        return FileTokens {
133            tokens: Vec::new(),
134            source: source.to_string(),
135            line_count,
136        };
137    }
138
139    let source_type = SourceType::from_path(path).unwrap_or_default();
140    let allocator = Allocator::default();
141    let parser_return = Parser::new(&allocator, source, source_type).parse();
142
143    let mut extractor = TokenExtractor::with_strip_types(strip_types);
144    extractor.visit_program(&parser_return.program);
145
146    // If parsing produced very few tokens relative to source size (likely parse errors
147    // from Flow types or JSX in .js files), retry with JSX/TSX source type as a fallback.
148    if extractor.tokens.len() < 5 && source.len() > 100 && !source_type.is_jsx() {
149        let jsx_type = if source_type.is_typescript() {
150            SourceType::tsx()
151        } else {
152            SourceType::jsx()
153        };
154        let allocator2 = Allocator::default();
155        let retry_return = Parser::new(&allocator2, source, jsx_type).parse();
156        let mut retry_extractor = TokenExtractor::with_strip_types(strip_types);
157        retry_extractor.visit_program(&retry_return.program);
158        if retry_extractor.tokens.len() > extractor.tokens.len() {
159            extractor = retry_extractor;
160        }
161    }
162
163    let line_count = source.lines().count().max(1);
164
165    FileTokens {
166        tokens: extractor.tokens,
167        source: source.to_string(),
168        line_count,
169    }
170}
171
172#[cfg(test)]
173mod tests;