// fallow_core/duplicates/tokenize/mod.rs
use std::path::Path;
2
3use oxc_allocator::Allocator;
4use oxc_ast_visit::Visit;
5use oxc_parser::Parser;
6use oxc_span::{SourceType, Span};
7
8pub use super::token_types::{
10 FileTokens, KeywordType, OperatorType, PunctuationType, SourceToken, TokenKind,
11};
12use super::token_visitor::TokenExtractor;
13
14#[must_use]
27pub fn tokenize_file(path: &Path, source: &str, skip_imports: bool) -> FileTokens {
28 tokenize_file_inner(path, source, false, skip_imports)
29}
30
31#[must_use]
33pub fn tokenize_file_cross_language(
34 path: &Path,
35 source: &str,
36 strip_types: bool,
37 skip_imports: bool,
38) -> FileTokens {
39 tokenize_file_inner(path, source, strip_types, skip_imports)
40}
41
42fn tokenize_file_inner(
43 path: &Path,
44 source: &str,
45 strip_types: bool,
46 skip_imports: bool,
47) -> FileTokens {
48 use crate::extract::{extract_astro_frontmatter, extract_mdx_statements, is_sfc_file};
49
50 let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
51
52 if is_sfc_file(path) {
53 return tokenize_sfc(source, strip_types, skip_imports);
54 }
55 if ext == "astro" {
56 return tokenize_astro(source, strip_types, skip_imports, extract_astro_frontmatter);
57 }
58 if ext == "mdx" {
59 return tokenize_mdx(source, strip_types, skip_imports, extract_mdx_statements);
60 }
61 if ext == "css" || ext == "scss" {
62 return empty_tokens(source);
63 }
64
65 tokenize_js_ts(path, source, strip_types, skip_imports)
66}
67
68#[expect(
70 clippy::cast_possible_truncation,
71 reason = "byte offsets are bounded by source size"
72)]
73fn tokenize_sfc(source: &str, strip_types: bool, skip_imports: bool) -> FileTokens {
74 let scripts = crate::extract::extract_sfc_scripts(source);
75 let mut all_tokens = Vec::new();
76
77 for script in &scripts {
78 let source_type = match (script.is_typescript, script.is_jsx) {
79 (true, true) => SourceType::tsx(),
80 (true, false) => SourceType::ts(),
81 (false, true) => SourceType::jsx(),
82 (false, false) => SourceType::mjs(),
83 };
84 let allocator = Allocator::default();
85 let parser_return = Parser::new(&allocator, &script.body, source_type).parse();
86
87 let mut extractor = TokenExtractor::new(strip_types, skip_imports);
88 extractor.visit_program(&parser_return.program);
89
90 let offset = script.byte_offset as u32;
91 for token in &mut extractor.tokens {
92 token.span = Span::new(token.span.start + offset, token.span.end + offset);
93 }
94 all_tokens.extend(extractor.tokens);
95 }
96
97 FileTokens {
98 tokens: all_tokens,
99 source: source.to_string(),
100 line_count: source.lines().count().max(1),
101 }
102}
103
104#[expect(
106 clippy::cast_possible_truncation,
107 reason = "byte offsets are bounded by source size"
108)]
109fn tokenize_astro(
110 source: &str,
111 strip_types: bool,
112 skip_imports: bool,
113 extract_fn: fn(&str) -> Option<fallow_extract::sfc::SfcScript>,
114) -> FileTokens {
115 if let Some(script) = extract_fn(source) {
116 let allocator = Allocator::default();
117 let parser_return = Parser::new(&allocator, &script.body, SourceType::ts()).parse();
118
119 let mut extractor = TokenExtractor::new(strip_types, skip_imports);
120 extractor.visit_program(&parser_return.program);
121
122 let offset = script.byte_offset as u32;
123 for token in &mut extractor.tokens {
124 token.span = Span::new(token.span.start + offset, token.span.end + offset);
125 }
126
127 return FileTokens {
128 tokens: extractor.tokens,
129 source: source.to_string(),
130 line_count: source.lines().count().max(1),
131 };
132 }
133 empty_tokens(source)
134}
135
136fn tokenize_mdx(
138 source: &str,
139 strip_types: bool,
140 skip_imports: bool,
141 extract_fn: fn(&str) -> String,
142) -> FileTokens {
143 let statements = extract_fn(source);
144 if !statements.is_empty() {
145 let allocator = Allocator::default();
146 let parser_return = Parser::new(&allocator, &statements, SourceType::jsx()).parse();
147
148 let mut extractor = TokenExtractor::new(strip_types, skip_imports);
149 extractor.visit_program(&parser_return.program);
150
151 return FileTokens {
152 tokens: extractor.tokens,
153 source: source.to_string(),
154 line_count: source.lines().count().max(1),
155 };
156 }
157 empty_tokens(source)
158}
159
160fn empty_tokens(source: &str) -> FileTokens {
162 FileTokens {
163 tokens: Vec::new(),
164 source: source.to_string(),
165 line_count: source.lines().count().max(1),
166 }
167}
168
169fn tokenize_js_ts(path: &Path, source: &str, strip_types: bool, skip_imports: bool) -> FileTokens {
171 let source_type = SourceType::from_path(path).unwrap_or_default();
172 let allocator = Allocator::default();
173 let parser_return = Parser::new(&allocator, source, source_type).parse();
174
175 let mut extractor = TokenExtractor::new(strip_types, skip_imports);
176 extractor.visit_program(&parser_return.program);
177
178 if extractor.tokens.len() < 5 && source.len() > 100 && !source_type.is_jsx() {
181 let jsx_type = if source_type.is_typescript() {
182 SourceType::tsx()
183 } else {
184 SourceType::jsx()
185 };
186 let allocator2 = Allocator::default();
187 let retry_return = Parser::new(&allocator2, source, jsx_type).parse();
188 let mut retry_extractor = TokenExtractor::new(strip_types, skip_imports);
189 retry_extractor.visit_program(&retry_return.program);
190 if retry_extractor.tokens.len() > extractor.tokens.len() {
191 extractor = retry_extractor;
192 }
193 }
194
195 FileTokens {
196 tokens: extractor.tokens,
197 source: source.to_string(),
198 line_count: source.lines().count().max(1),
199 }
200}
201
202#[cfg(test)]
203mod tests;