1use std::path::Path;
2
3use oxc_allocator::Allocator;
4use oxc_ast_visit::Visit;
5use oxc_parser::Parser;
6use oxc_span::{SourceType, Span};
7
8mod lexical;
9
10pub use super::token_types::{
11 FileTokens, KeywordType, OperatorType, PunctuationType, SourceToken, TokenKind,
12};
13use super::token_visitor::TokenExtractor;
14
15#[must_use]
29pub fn tokenize_file(path: &Path, source: &str, skip_imports: bool) -> FileTokens {
30 tokenize_file_inner(path, source, false, skip_imports)
31}
32
33#[must_use]
35pub fn tokenize_file_cross_language(
36 path: &Path,
37 source: &str,
38 strip_types: bool,
39 skip_imports: bool,
40) -> FileTokens {
41 tokenize_file_inner(path, source, strip_types, skip_imports)
42}
43
44fn tokenize_file_inner(
45 path: &Path,
46 source: &str,
47 strip_types: bool,
48 skip_imports: bool,
49) -> FileTokens {
50 use crate::extract::{extract_astro_frontmatter, extract_mdx_statements, is_sfc_file};
51
52 let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
53
54 if is_sfc_file(path) {
55 return tokenize_sfc(source, strip_types, skip_imports);
56 }
57 if ext == "astro" {
58 return tokenize_astro(source, strip_types, skip_imports, extract_astro_frontmatter);
59 }
60 if ext == "mdx" {
61 return tokenize_mdx(source, strip_types, skip_imports, extract_mdx_statements);
62 }
63 if matches!(ext, "css" | "scss" | "sass" | "less") {
64 return tokenize_style_source(source);
65 }
66
67 tokenize_js_ts(path, source, strip_types, skip_imports)
68}
69
70fn tokenize_sfc(source: &str, strip_types: bool, skip_imports: bool) -> FileTokens {
72 let scripts = crate::extract::extract_sfc_scripts(source);
73 let mut sections = Vec::new();
74
75 for script in &scripts {
76 let source_type = match (script.is_typescript, script.is_jsx) {
77 (true, true) => SourceType::tsx(),
78 (true, false) => SourceType::ts(),
79 (false, true) => SourceType::jsx(),
80 (false, false) => SourceType::mjs(),
81 };
82 sections.push(tokenize_js_section(
83 "js",
84 &script.body,
85 script.byte_offset,
86 source_type,
87 strip_types,
88 skip_imports,
89 ));
90 }
91
92 for region in crate::extract::extract_sfc_template_regions(source) {
93 sections.push(tokenize_lexical_section(
94 "markup",
95 ®ion.body,
96 region.byte_offset,
97 ));
98 }
99
100 for style in crate::extract::extract_sfc_styles(source) {
101 if style.src.is_none() {
102 sections.push(tokenize_lexical_section(
103 "style",
104 &style.body,
105 style.byte_offset,
106 ));
107 }
108 }
109
110 let (all_tokens, atomic_invocation_spans) = merge_sections(sections);
111
112 FileTokens {
113 tokens: all_tokens,
114 atomic_invocation_spans,
115 source: source.to_string(),
116 line_count: source.lines().count().max(1),
117 }
118}
119
120fn tokenize_astro(
122 source: &str,
123 strip_types: bool,
124 skip_imports: bool,
125 extract_fn: fn(&str) -> Option<fallow_extract::sfc::SfcScript>,
126) -> FileTokens {
127 if let Some(script) = extract_fn(source) {
128 let mut sections = vec![tokenize_js_section(
129 "js",
130 &script.body,
131 script.byte_offset,
132 SourceType::ts(),
133 strip_types,
134 skip_imports,
135 )];
136 for region in crate::extract::extract_astro_template_regions(source) {
137 sections.push(tokenize_lexical_section(
138 "markup",
139 ®ion.body,
140 region.byte_offset,
141 ));
142 }
143 for region in crate::extract::extract_astro_style_regions(source) {
144 sections.push(tokenize_lexical_section(
145 "style",
146 ®ion.body,
147 region.byte_offset,
148 ));
149 }
150 let (tokens, atomic_invocation_spans) = merge_sections(sections);
151 return FileTokens {
152 tokens,
153 atomic_invocation_spans,
154 source: source.to_string(),
155 line_count: source.lines().count().max(1),
156 };
157 }
158 let mut sections = Vec::new();
159 for region in crate::extract::extract_astro_template_regions(source) {
160 sections.push(tokenize_lexical_section(
161 "markup",
162 ®ion.body,
163 region.byte_offset,
164 ));
165 }
166 for region in crate::extract::extract_astro_style_regions(source) {
167 sections.push(tokenize_lexical_section(
168 "style",
169 ®ion.body,
170 region.byte_offset,
171 ));
172 }
173 let (tokens, atomic_invocation_spans) = merge_sections(sections);
174 FileTokens {
175 tokens,
176 atomic_invocation_spans,
177 source: source.to_string(),
178 line_count: source.lines().count().max(1),
179 }
180}
181
182fn tokenize_mdx(
184 source: &str,
185 strip_types: bool,
186 skip_imports: bool,
187 extract_fn: fn(&str) -> String,
188) -> FileTokens {
189 let statements = extract_fn(source);
190 if !statements.is_empty() {
191 let allocator = Allocator::default();
192 let parser_return = Parser::new(&allocator, &statements, SourceType::jsx()).parse();
193
194 let mut extractor = TokenExtractor::new(strip_types, skip_imports);
195 extractor.visit_program(&parser_return.program);
196
197 return FileTokens {
198 tokens: extractor.tokens,
199 atomic_invocation_spans: extractor.atomic_invocation_spans,
200 source: source.to_string(),
201 line_count: source.lines().count().max(1),
202 };
203 }
204 empty_tokens(source)
205}
206
207fn empty_tokens(source: &str) -> FileTokens {
209 FileTokens {
210 tokens: Vec::new(),
211 atomic_invocation_spans: Vec::new(),
212 source: source.to_string(),
213 line_count: source.lines().count().max(1),
214 }
215}
216
217fn tokenize_style_source(source: &str) -> FileTokens {
218 let mut tokens = Vec::with_capacity(source.len().min(64));
219 tokens.push(lexical::boundary_token("style", 0));
220 tokens.extend(lexical::tokenize_lexical_region(source, 0, true));
221 FileTokens {
222 tokens,
223 atomic_invocation_spans: Vec::new(),
224 source: source.to_string(),
225 line_count: source.lines().count().max(1),
226 }
227}
228
/// Tokens produced for one region of a multi-part file, before the regions
/// are joined by `merge_sections`.
struct TokenSection {
    /// Label handed to `lexical::boundary_token` when the section is merged
    /// (the callers in this file use "js", "markup" and "style").
    name: &'static str,
    /// Byte offset the section was created with; `merge_sections` sorts on it
    /// and uses it as the boundary token position.
    start: usize,
    /// Tokens belonging to this section.
    tokens: Vec<SourceToken>,
    /// Atomic invocation spans collected for this section; always empty for
    /// sections built by `tokenize_lexical_section`.
    atomic_invocation_spans: Vec<Span>,
}
235
236fn tokenize_js_section(
237 name: &'static str,
238 source: &str,
239 byte_offset: usize,
240 source_type: SourceType,
241 strip_types: bool,
242 skip_imports: bool,
243) -> TokenSection {
244 let allocator = Allocator::default();
245 let parser_return = Parser::new(&allocator, source, source_type).parse();
246
247 let mut extractor = TokenExtractor::new(strip_types, skip_imports);
248 extractor.visit_program(&parser_return.program);
249
250 let offset = byte_offset as u32;
251 for token in &mut extractor.tokens {
252 token.span = Span::new(token.span.start + offset, token.span.end + offset);
253 }
254 for span in &mut extractor.atomic_invocation_spans {
255 *span = Span::new(span.start + offset, span.end + offset);
256 }
257
258 TokenSection {
259 name,
260 start: byte_offset,
261 tokens: extractor.tokens,
262 atomic_invocation_spans: extractor.atomic_invocation_spans,
263 }
264}
265
266fn tokenize_lexical_section(name: &'static str, source: &str, byte_offset: usize) -> TokenSection {
267 let css = name == "style";
270 TokenSection {
271 name,
272 start: byte_offset,
273 tokens: lexical::tokenize_lexical_region(source, byte_offset, css),
274 atomic_invocation_spans: Vec::new(),
275 }
276}
277
278fn merge_sections(mut sections: Vec<TokenSection>) -> (Vec<SourceToken>, Vec<Span>) {
279 sections.retain(|section| !section.tokens.is_empty());
280 sections.sort_by_key(|section| section.start);
281
282 let mut tokens = Vec::new();
283 let mut atomic_invocation_spans = Vec::new();
284
285 for section in sections {
286 tokens.push(lexical::boundary_token(section.name, section.start));
287 tokens.extend(section.tokens);
288 atomic_invocation_spans.extend(section.atomic_invocation_spans);
289 }
290
291 (tokens, atomic_invocation_spans)
292}
293
294fn tokenize_js_ts(path: &Path, source: &str, strip_types: bool, skip_imports: bool) -> FileTokens {
296 let source_type = match path.extension().and_then(|ext| ext.to_str()) {
297 Some("gts") => SourceType::ts(),
298 Some("gjs") => SourceType::mjs(),
299 _ => SourceType::from_path(path).unwrap_or_default(),
300 };
301 let stripped_glimmer_source = crate::extract::is_glimmer_file(path)
302 .then(|| crate::extract::strip_glimmer_templates(source))
303 .flatten();
304 let parser_source = stripped_glimmer_source.as_deref().unwrap_or(source);
305 let allocator = Allocator::default();
306 let parser_return = Parser::new(&allocator, parser_source, source_type).parse();
307
308 let mut extractor = TokenExtractor::new(strip_types, skip_imports);
309 extractor.visit_program(&parser_return.program);
310
311 if extractor.tokens.len() < 5 && source.len() > 100 && !source_type.is_jsx() {
312 let jsx_type = if source_type.is_typescript() {
313 SourceType::tsx()
314 } else {
315 SourceType::jsx()
316 };
317 let allocator2 = Allocator::default();
318 let retry_return = Parser::new(&allocator2, parser_source, jsx_type).parse();
319 let mut retry_extractor = TokenExtractor::new(strip_types, skip_imports);
320 retry_extractor.visit_program(&retry_return.program);
321 if retry_extractor.tokens.len() > extractor.tokens.len() {
322 extractor = retry_extractor;
323 }
324 }
325
326 FileTokens {
327 tokens: extractor.tokens,
328 atomic_invocation_spans: extractor.atomic_invocation_spans,
329 source: source.to_string(),
330 line_count: source.lines().count().max(1),
331 }
332}
333
334#[cfg(test)]
335mod tests;