fallow_core/duplicates/tokenize/
mod.rs1use std::path::Path;
2
3use oxc_allocator::Allocator;
4use oxc_ast_visit::Visit;
5use oxc_parser::Parser;
6use oxc_span::{SourceType, Span};
7
8pub use super::token_types::{
10 FileTokens, KeywordType, OperatorType, PunctuationType, SourceToken, TokenKind,
11};
12use super::token_visitor::TokenExtractor;
13
14#[must_use]
27pub fn tokenize_file(path: &Path, source: &str, skip_imports: bool) -> FileTokens {
28 tokenize_file_inner(path, source, false, skip_imports)
29}
30
31#[must_use]
33pub fn tokenize_file_cross_language(
34 path: &Path,
35 source: &str,
36 strip_types: bool,
37 skip_imports: bool,
38) -> FileTokens {
39 tokenize_file_inner(path, source, strip_types, skip_imports)
40}
41
42fn tokenize_file_inner(
43 path: &Path,
44 source: &str,
45 strip_types: bool,
46 skip_imports: bool,
47) -> FileTokens {
48 use crate::extract::{extract_astro_frontmatter, extract_mdx_statements, is_sfc_file};
49
50 let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
51
52 if is_sfc_file(path) {
53 return tokenize_sfc(source, strip_types, skip_imports);
54 }
55 if ext == "astro" {
56 return tokenize_astro(source, strip_types, skip_imports, extract_astro_frontmatter);
57 }
58 if ext == "mdx" {
59 return tokenize_mdx(source, strip_types, skip_imports, extract_mdx_statements);
60 }
61 if ext == "css" || ext == "scss" {
62 return empty_tokens(source);
63 }
64
65 tokenize_js_ts(path, source, strip_types, skip_imports)
66}
67
68#[expect(
70 clippy::cast_possible_truncation,
71 reason = "byte offsets are bounded by source size"
72)]
73fn tokenize_sfc(source: &str, strip_types: bool, skip_imports: bool) -> FileTokens {
74 let scripts = crate::extract::extract_sfc_scripts(source);
75 let mut all_tokens = Vec::new();
76 let mut atomic_invocation_spans = Vec::new();
77
78 for script in &scripts {
79 let source_type = match (script.is_typescript, script.is_jsx) {
80 (true, true) => SourceType::tsx(),
81 (true, false) => SourceType::ts(),
82 (false, true) => SourceType::jsx(),
83 (false, false) => SourceType::mjs(),
84 };
85 let allocator = Allocator::default();
86 let parser_return = Parser::new(&allocator, &script.body, source_type).parse();
87
88 let mut extractor = TokenExtractor::new(strip_types, skip_imports);
89 extractor.visit_program(&parser_return.program);
90
91 let offset = script.byte_offset as u32;
92 for token in &mut extractor.tokens {
93 token.span = Span::new(token.span.start + offset, token.span.end + offset);
94 }
95 for span in &mut extractor.atomic_invocation_spans {
96 *span = Span::new(span.start + offset, span.end + offset);
97 }
98 all_tokens.extend(extractor.tokens);
99 atomic_invocation_spans.extend(extractor.atomic_invocation_spans);
100 }
101
102 FileTokens {
103 tokens: all_tokens,
104 atomic_invocation_spans,
105 source: source.to_string(),
106 line_count: source.lines().count().max(1),
107 }
108}
109
110#[expect(
112 clippy::cast_possible_truncation,
113 reason = "byte offsets are bounded by source size"
114)]
115fn tokenize_astro(
116 source: &str,
117 strip_types: bool,
118 skip_imports: bool,
119 extract_fn: fn(&str) -> Option<fallow_extract::sfc::SfcScript>,
120) -> FileTokens {
121 if let Some(script) = extract_fn(source) {
122 let allocator = Allocator::default();
123 let parser_return = Parser::new(&allocator, &script.body, SourceType::ts()).parse();
124
125 let mut extractor = TokenExtractor::new(strip_types, skip_imports);
126 extractor.visit_program(&parser_return.program);
127
128 let offset = script.byte_offset as u32;
129 for token in &mut extractor.tokens {
130 token.span = Span::new(token.span.start + offset, token.span.end + offset);
131 }
132 for span in &mut extractor.atomic_invocation_spans {
133 *span = Span::new(span.start + offset, span.end + offset);
134 }
135
136 return FileTokens {
137 tokens: extractor.tokens,
138 atomic_invocation_spans: extractor.atomic_invocation_spans,
139 source: source.to_string(),
140 line_count: source.lines().count().max(1),
141 };
142 }
143 empty_tokens(source)
144}
145
146fn tokenize_mdx(
148 source: &str,
149 strip_types: bool,
150 skip_imports: bool,
151 extract_fn: fn(&str) -> String,
152) -> FileTokens {
153 let statements = extract_fn(source);
154 if !statements.is_empty() {
155 let allocator = Allocator::default();
156 let parser_return = Parser::new(&allocator, &statements, SourceType::jsx()).parse();
157
158 let mut extractor = TokenExtractor::new(strip_types, skip_imports);
159 extractor.visit_program(&parser_return.program);
160
161 return FileTokens {
162 tokens: extractor.tokens,
163 atomic_invocation_spans: extractor.atomic_invocation_spans,
164 source: source.to_string(),
165 line_count: source.lines().count().max(1),
166 };
167 }
168 empty_tokens(source)
169}
170
171fn empty_tokens(source: &str) -> FileTokens {
173 FileTokens {
174 tokens: Vec::new(),
175 atomic_invocation_spans: Vec::new(),
176 source: source.to_string(),
177 line_count: source.lines().count().max(1),
178 }
179}
180
181fn tokenize_js_ts(path: &Path, source: &str, strip_types: bool, skip_imports: bool) -> FileTokens {
183 let source_type = match path.extension().and_then(|ext| ext.to_str()) {
184 Some("gts") => SourceType::ts(),
185 Some("gjs") => SourceType::mjs(),
186 _ => SourceType::from_path(path).unwrap_or_default(),
187 };
188 let stripped_glimmer_source = crate::extract::is_glimmer_file(path)
189 .then(|| crate::extract::strip_glimmer_templates(source))
190 .flatten();
191 let parser_source = stripped_glimmer_source.as_deref().unwrap_or(source);
192 let allocator = Allocator::default();
193 let parser_return = Parser::new(&allocator, parser_source, source_type).parse();
194
195 let mut extractor = TokenExtractor::new(strip_types, skip_imports);
196 extractor.visit_program(&parser_return.program);
197
198 if extractor.tokens.len() < 5 && source.len() > 100 && !source_type.is_jsx() {
201 let jsx_type = if source_type.is_typescript() {
202 SourceType::tsx()
203 } else {
204 SourceType::jsx()
205 };
206 let allocator2 = Allocator::default();
207 let retry_return = Parser::new(&allocator2, parser_source, jsx_type).parse();
208 let mut retry_extractor = TokenExtractor::new(strip_types, skip_imports);
209 retry_extractor.visit_program(&retry_return.program);
210 if retry_extractor.tokens.len() > extractor.tokens.len() {
211 extractor = retry_extractor;
212 }
213 }
214
215 FileTokens {
216 tokens: extractor.tokens,
217 atomic_invocation_spans: extractor.atomic_invocation_spans,
218 source: source.to_string(),
219 line_count: source.lines().count().max(1),
220 }
221}
222
223#[cfg(test)]
224mod tests;