fallow_core/duplicates/tokenize/
mod.rs1use std::path::Path;
2
3use oxc_allocator::Allocator;
4use oxc_ast_visit::Visit;
5use oxc_parser::Parser;
6use oxc_span::{SourceType, Span};
7
8pub use super::token_types::{
9 FileTokens, KeywordType, OperatorType, PunctuationType, SourceToken, TokenKind,
10};
11use super::token_visitor::TokenExtractor;
12
13#[must_use]
26pub fn tokenize_file(path: &Path, source: &str, skip_imports: bool) -> FileTokens {
27 tokenize_file_inner(path, source, false, skip_imports)
28}
29
30#[must_use]
32pub fn tokenize_file_cross_language(
33 path: &Path,
34 source: &str,
35 strip_types: bool,
36 skip_imports: bool,
37) -> FileTokens {
38 tokenize_file_inner(path, source, strip_types, skip_imports)
39}
40
41fn tokenize_file_inner(
42 path: &Path,
43 source: &str,
44 strip_types: bool,
45 skip_imports: bool,
46) -> FileTokens {
47 use crate::extract::{extract_astro_frontmatter, extract_mdx_statements, is_sfc_file};
48
49 let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
50
51 if is_sfc_file(path) {
52 return tokenize_sfc(source, strip_types, skip_imports);
53 }
54 if ext == "astro" {
55 return tokenize_astro(source, strip_types, skip_imports, extract_astro_frontmatter);
56 }
57 if ext == "mdx" {
58 return tokenize_mdx(source, strip_types, skip_imports, extract_mdx_statements);
59 }
60 if ext == "css" || ext == "scss" {
61 return empty_tokens(source);
62 }
63
64 tokenize_js_ts(path, source, strip_types, skip_imports)
65}
66
67#[expect(
69 clippy::cast_possible_truncation,
70 reason = "byte offsets are bounded by source size"
71)]
72fn tokenize_sfc(source: &str, strip_types: bool, skip_imports: bool) -> FileTokens {
73 let scripts = crate::extract::extract_sfc_scripts(source);
74 let mut all_tokens = Vec::new();
75 let mut atomic_invocation_spans = Vec::new();
76
77 for script in &scripts {
78 let source_type = match (script.is_typescript, script.is_jsx) {
79 (true, true) => SourceType::tsx(),
80 (true, false) => SourceType::ts(),
81 (false, true) => SourceType::jsx(),
82 (false, false) => SourceType::mjs(),
83 };
84 let allocator = Allocator::default();
85 let parser_return = Parser::new(&allocator, &script.body, source_type).parse();
86
87 let mut extractor = TokenExtractor::new(strip_types, skip_imports);
88 extractor.visit_program(&parser_return.program);
89
90 let offset = script.byte_offset as u32;
91 for token in &mut extractor.tokens {
92 token.span = Span::new(token.span.start + offset, token.span.end + offset);
93 }
94 for span in &mut extractor.atomic_invocation_spans {
95 *span = Span::new(span.start + offset, span.end + offset);
96 }
97 all_tokens.extend(extractor.tokens);
98 atomic_invocation_spans.extend(extractor.atomic_invocation_spans);
99 }
100
101 FileTokens {
102 tokens: all_tokens,
103 atomic_invocation_spans,
104 source: source.to_string(),
105 line_count: source.lines().count().max(1),
106 }
107}
108
109#[expect(
111 clippy::cast_possible_truncation,
112 reason = "byte offsets are bounded by source size"
113)]
114fn tokenize_astro(
115 source: &str,
116 strip_types: bool,
117 skip_imports: bool,
118 extract_fn: fn(&str) -> Option<fallow_extract::sfc::SfcScript>,
119) -> FileTokens {
120 if let Some(script) = extract_fn(source) {
121 let allocator = Allocator::default();
122 let parser_return = Parser::new(&allocator, &script.body, SourceType::ts()).parse();
123
124 let mut extractor = TokenExtractor::new(strip_types, skip_imports);
125 extractor.visit_program(&parser_return.program);
126
127 let offset = script.byte_offset as u32;
128 for token in &mut extractor.tokens {
129 token.span = Span::new(token.span.start + offset, token.span.end + offset);
130 }
131 for span in &mut extractor.atomic_invocation_spans {
132 *span = Span::new(span.start + offset, span.end + offset);
133 }
134
135 return FileTokens {
136 tokens: extractor.tokens,
137 atomic_invocation_spans: extractor.atomic_invocation_spans,
138 source: source.to_string(),
139 line_count: source.lines().count().max(1),
140 };
141 }
142 empty_tokens(source)
143}
144
145fn tokenize_mdx(
147 source: &str,
148 strip_types: bool,
149 skip_imports: bool,
150 extract_fn: fn(&str) -> String,
151) -> FileTokens {
152 let statements = extract_fn(source);
153 if !statements.is_empty() {
154 let allocator = Allocator::default();
155 let parser_return = Parser::new(&allocator, &statements, SourceType::jsx()).parse();
156
157 let mut extractor = TokenExtractor::new(strip_types, skip_imports);
158 extractor.visit_program(&parser_return.program);
159
160 return FileTokens {
161 tokens: extractor.tokens,
162 atomic_invocation_spans: extractor.atomic_invocation_spans,
163 source: source.to_string(),
164 line_count: source.lines().count().max(1),
165 };
166 }
167 empty_tokens(source)
168}
169
170fn empty_tokens(source: &str) -> FileTokens {
172 FileTokens {
173 tokens: Vec::new(),
174 atomic_invocation_spans: Vec::new(),
175 source: source.to_string(),
176 line_count: source.lines().count().max(1),
177 }
178}
179
180fn tokenize_js_ts(path: &Path, source: &str, strip_types: bool, skip_imports: bool) -> FileTokens {
182 let source_type = match path.extension().and_then(|ext| ext.to_str()) {
183 Some("gts") => SourceType::ts(),
184 Some("gjs") => SourceType::mjs(),
185 _ => SourceType::from_path(path).unwrap_or_default(),
186 };
187 let stripped_glimmer_source = crate::extract::is_glimmer_file(path)
188 .then(|| crate::extract::strip_glimmer_templates(source))
189 .flatten();
190 let parser_source = stripped_glimmer_source.as_deref().unwrap_or(source);
191 let allocator = Allocator::default();
192 let parser_return = Parser::new(&allocator, parser_source, source_type).parse();
193
194 let mut extractor = TokenExtractor::new(strip_types, skip_imports);
195 extractor.visit_program(&parser_return.program);
196
197 if extractor.tokens.len() < 5 && source.len() > 100 && !source_type.is_jsx() {
198 let jsx_type = if source_type.is_typescript() {
199 SourceType::tsx()
200 } else {
201 SourceType::jsx()
202 };
203 let allocator2 = Allocator::default();
204 let retry_return = Parser::new(&allocator2, parser_source, jsx_type).parse();
205 let mut retry_extractor = TokenExtractor::new(strip_types, skip_imports);
206 retry_extractor.visit_program(&retry_return.program);
207 if retry_extractor.tokens.len() > extractor.tokens.len() {
208 extractor = retry_extractor;
209 }
210 }
211
212 FileTokens {
213 tokens: extractor.tokens,
214 atomic_invocation_spans: extractor.atomic_invocation_spans,
215 source: source.to_string(),
216 line_count: source.lines().count().max(1),
217 }
218}
219
220#[cfg(test)]
221mod tests;