// fallow_core/duplicates/tokenize/mod.rs
use std::path::Path;
2
3use oxc_allocator::Allocator;
4use oxc_ast_visit::Visit;
5use oxc_parser::Parser;
6use oxc_span::{SourceType, Span};
7
8pub use super::token_types::{
10 FileTokens, KeywordType, OperatorType, PunctuationType, SourceToken, TokenKind,
11};
12use super::token_visitor::TokenExtractor;
13
14#[must_use]
27pub fn tokenize_file(path: &Path, source: &str, skip_imports: bool) -> FileTokens {
28 tokenize_file_inner(path, source, false, skip_imports)
29}
30
31#[must_use]
33pub fn tokenize_file_cross_language(
34 path: &Path,
35 source: &str,
36 strip_types: bool,
37 skip_imports: bool,
38) -> FileTokens {
39 tokenize_file_inner(path, source, strip_types, skip_imports)
40}
41
42fn tokenize_file_inner(
43 path: &Path,
44 source: &str,
45 strip_types: bool,
46 skip_imports: bool,
47) -> FileTokens {
48 use crate::extract::{extract_astro_frontmatter, extract_mdx_statements, is_sfc_file};
49
50 let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
51
52 if is_sfc_file(path) {
53 return tokenize_sfc(source, strip_types, skip_imports);
54 }
55 if ext == "astro" {
56 return tokenize_astro(source, strip_types, skip_imports, extract_astro_frontmatter);
57 }
58 if ext == "mdx" {
59 return tokenize_mdx(source, strip_types, skip_imports, extract_mdx_statements);
60 }
61 if ext == "css" || ext == "scss" {
62 return empty_tokens(source);
63 }
64
65 tokenize_js_ts(path, source, strip_types, skip_imports)
66}
67
68#[expect(
70 clippy::cast_possible_truncation,
71 reason = "byte offsets are bounded by source size"
72)]
73fn tokenize_sfc(source: &str, strip_types: bool, skip_imports: bool) -> FileTokens {
74 let scripts = crate::extract::extract_sfc_scripts(source);
75 let mut all_tokens = Vec::new();
76
77 for script in &scripts {
78 let source_type = match (script.is_typescript, script.is_jsx) {
79 (true, true) => SourceType::tsx(),
80 (true, false) => SourceType::ts(),
81 (false, true) => SourceType::jsx(),
82 (false, false) => SourceType::mjs(),
83 };
84 let allocator = Allocator::default();
85 let parser_return = Parser::new(&allocator, &script.body, source_type).parse();
86
87 let mut extractor = TokenExtractor::new(strip_types, skip_imports);
88 extractor.visit_program(&parser_return.program);
89
90 let offset = script.byte_offset as u32;
91 for token in &mut extractor.tokens {
92 token.span = Span::new(token.span.start + offset, token.span.end + offset);
93 }
94 all_tokens.extend(extractor.tokens);
95 }
96
97 FileTokens {
98 tokens: all_tokens,
99 source: source.to_string(),
100 line_count: source.lines().count().max(1),
101 }
102}
103
104#[expect(
106 clippy::cast_possible_truncation,
107 reason = "byte offsets are bounded by source size"
108)]
109fn tokenize_astro(
110 source: &str,
111 strip_types: bool,
112 skip_imports: bool,
113 extract_fn: fn(&str) -> Option<fallow_extract::sfc::SfcScript>,
114) -> FileTokens {
115 if let Some(script) = extract_fn(source) {
116 let allocator = Allocator::default();
117 let parser_return = Parser::new(&allocator, &script.body, SourceType::ts()).parse();
118
119 let mut extractor = TokenExtractor::new(strip_types, skip_imports);
120 extractor.visit_program(&parser_return.program);
121
122 let offset = script.byte_offset as u32;
123 for token in &mut extractor.tokens {
124 token.span = Span::new(token.span.start + offset, token.span.end + offset);
125 }
126
127 return FileTokens {
128 tokens: extractor.tokens,
129 source: source.to_string(),
130 line_count: source.lines().count().max(1),
131 };
132 }
133 empty_tokens(source)
134}
135
136fn tokenize_mdx(
138 source: &str,
139 strip_types: bool,
140 skip_imports: bool,
141 extract_fn: fn(&str) -> String,
142) -> FileTokens {
143 let statements = extract_fn(source);
144 if !statements.is_empty() {
145 let allocator = Allocator::default();
146 let parser_return = Parser::new(&allocator, &statements, SourceType::jsx()).parse();
147
148 let mut extractor = TokenExtractor::new(strip_types, skip_imports);
149 extractor.visit_program(&parser_return.program);
150
151 return FileTokens {
152 tokens: extractor.tokens,
153 source: source.to_string(),
154 line_count: source.lines().count().max(1),
155 };
156 }
157 empty_tokens(source)
158}
159
160fn empty_tokens(source: &str) -> FileTokens {
162 FileTokens {
163 tokens: Vec::new(),
164 source: source.to_string(),
165 line_count: source.lines().count().max(1),
166 }
167}
168
169fn tokenize_js_ts(path: &Path, source: &str, strip_types: bool, skip_imports: bool) -> FileTokens {
171 let source_type = match path.extension().and_then(|ext| ext.to_str()) {
172 Some("gts") => SourceType::ts(),
173 Some("gjs") => SourceType::mjs(),
174 _ => SourceType::from_path(path).unwrap_or_default(),
175 };
176 let stripped_glimmer_source = crate::extract::is_glimmer_file(path)
177 .then(|| crate::extract::strip_glimmer_templates(source))
178 .flatten();
179 let parser_source = stripped_glimmer_source.as_deref().unwrap_or(source);
180 let allocator = Allocator::default();
181 let parser_return = Parser::new(&allocator, parser_source, source_type).parse();
182
183 let mut extractor = TokenExtractor::new(strip_types, skip_imports);
184 extractor.visit_program(&parser_return.program);
185
186 if extractor.tokens.len() < 5 && source.len() > 100 && !source_type.is_jsx() {
189 let jsx_type = if source_type.is_typescript() {
190 SourceType::tsx()
191 } else {
192 SourceType::jsx()
193 };
194 let allocator2 = Allocator::default();
195 let retry_return = Parser::new(&allocator2, parser_source, jsx_type).parse();
196 let mut retry_extractor = TokenExtractor::new(strip_types, skip_imports);
197 retry_extractor.visit_program(&retry_return.program);
198 if retry_extractor.tokens.len() > extractor.tokens.len() {
199 extractor = retry_extractor;
200 }
201 }
202
203 FileTokens {
204 tokens: extractor.tokens,
205 source: source.to_string(),
206 line_count: source.lines().count().max(1),
207 }
208}
209
210#[cfg(test)]
211mod tests;