fallow_core/duplicates/tokenize/mod.rs
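
//! Token extraction for duplicate detection: dispatches on file type to
//! find the script content worth tokenizing (plain JS/TS, single-file
//! component script blocks, Astro frontmatter, MDX statements) and parses
//! it with the oxc parser.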
use std::path::Path;

use oxc_allocator::Allocator;
use oxc_ast_visit::Visit;
use oxc_parser::Parser;
use oxc_span::{SourceType, Span};

pub use super::token_types::{
    FileTokens, KeywordType, OperatorType, PunctuationType, SourceToken, TokenKind,
};
use super::token_visitor::TokenExtractor;
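
/// Tokenizes a file for same-language comparison; type annotations are
/// kept as-is.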
#[must_use]
pub fn tokenize_file(path: &Path, source: &str) -> FileTokens {
    tokenize_file_inner(path, source, false)
}
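
/// Tokenizes a file for cross-language comparison; when `strip_types` is
/// set, type-only tokens are dropped so TS and JS variants of the same
/// code produce matching token streams.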
#[must_use]
pub fn tokenize_file_cross_language(path: &Path, source: &str, strip_types: bool) -> FileTokens {
    tokenize_file_inner(path, source, strip_types)
}
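
/// Dispatches to the tokenizer that matches the file's extension.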
fn tokenize_file_inner(path: &Path, source: &str, strip_types: bool) -> FileTokens {
    use crate::extract::{extract_astro_frontmatter, extract_mdx_statements, is_sfc_file};

    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");

    if is_sfc_file(path) {
        return tokenize_sfc(source, strip_types);
    }
    if ext == "astro" {
        return tokenize_astro(source, strip_types, extract_astro_frontmatter);
    }
    if ext == "mdx" {
        return tokenize_mdx(source, strip_types, extract_mdx_statements);
    }
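    // Stylesheets contain no JS/TS to parse; report an empty token stream.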
    if ext == "css" || ext == "scss" {
        return empty_tokens(source);
    }

    tokenize_js_ts(path, source, strip_types)
}
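
/// Tokenizes every script block of a single-file component, rebasing each
/// token's span from block-local to whole-file byte offsets.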
#[expect(
    clippy::cast_possible_truncation,
    reason = "byte offsets are bounded by source size"
)]
fn tokenize_sfc(source: &str, strip_types: bool) -> FileTokens {
    let scripts = crate::extract::extract_sfc_scripts(source);
    let mut all_tokens = Vec::new();

    for script in &scripts {
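        // Pick the parser dialect from the block's lang/jsx flags; plain
        // script blocks are parsed as ES modules.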
        let source_type = match (script.is_typescript, script.is_jsx) {
            (true, true) => SourceType::tsx(),
            (true, false) => SourceType::ts(),
            (false, true) => SourceType::jsx(),
            (false, false) => SourceType::mjs(),
        };
        let allocator = Allocator::default();
        let parser_return = Parser::new(&allocator, &script.body, source_type).parse();

        let mut extractor = TokenExtractor::with_strip_types(strip_types);
        extractor.visit_program(&parser_return.program);
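
        // Token spans are relative to the extracted block; shift them so
        // they point back into the original file.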
        let offset = script.byte_offset as u32;
        for token in &mut extractor.tokens {
            token.span = Span::new(token.span.start + offset, token.span.end + offset);
        }
        all_tokens.extend(extractor.tokens);
    }

    FileTokens {
        tokens: all_tokens,
        source: source.to_string(),
        line_count: source.lines().count().max(1),
    }
}
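
/// Tokenizes Astro frontmatter (located by `extract_fn`) as TypeScript,
/// rebasing token spans to whole-file byte offsets; files without
/// frontmatter produce no tokens.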
#[expect(
    clippy::cast_possible_truncation,
    reason = "byte offsets are bounded by source size"
)]
fn tokenize_astro(
    source: &str,
    strip_types: bool,
    extract_fn: fn(&str) -> Option<fallow_extract::sfc::SfcScript>,
) -> FileTokens {
    if let Some(script) = extract_fn(source) {
        let allocator = Allocator::default();
        let parser_return = Parser::new(&allocator, &script.body, SourceType::ts()).parse();

        let mut extractor = TokenExtractor::with_strip_types(strip_types);
        extractor.visit_program(&parser_return.program);

        let offset = script.byte_offset as u32;
        for token in &mut extractor.tokens {
            token.span = Span::new(token.span.start + offset, token.span.end + offset);
        }

        return FileTokens {
            tokens: extractor.tokens,
            source: source.to_string(),
            line_count: source.lines().count().max(1),
        };
    }
    empty_tokens(source)
}
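
/// Tokenizes the statements `extract_fn` pulls out of an MDX file as JSX;
/// files with no extractable statements produce no tokens.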
fn tokenize_mdx(source: &str, strip_types: bool, extract_fn: fn(&str) -> String) -> FileTokens {
    let statements = extract_fn(source);
    if !statements.is_empty() {
        let allocator = Allocator::default();
        let parser_return = Parser::new(&allocator, &statements, SourceType::jsx()).parse();

        let mut extractor = TokenExtractor::with_strip_types(strip_types);
        extractor.visit_program(&parser_return.program);

        return FileTokens {
            tokens: extractor.tokens,
            source: source.to_string(),
            line_count: source.lines().count().max(1),
        };
    }
    empty_tokens(source)
}
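
/// A token-less `FileTokens` that still records the source text and line count.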
fn empty_tokens(source: &str) -> FileTokens {
    FileTokens {
        tokens: Vec::new(),
        source: source.to_string(),
        line_count: source.lines().count().max(1),
    }
}
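
/// Tokenizes a plain JS/TS file, inferring the dialect from its path.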
fn tokenize_js_ts(path: &Path, source: &str, strip_types: bool) -> FileTokens {
    let source_type = SourceType::from_path(path).unwrap_or_default();
    let allocator = Allocator::default();
    let parser_return = Parser::new(&allocator, source, source_type).parse();

    let mut extractor = TokenExtractor::with_strip_types(strip_types);
    extractor.visit_program(&parser_return.program);
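
    // A near-empty token stream from a non-trivial source usually means
    // JSX failed to parse under a plain JS/TS source type, so retry with
    // the JSX-enabled dialect and keep whichever pass found more tokens.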
    if extractor.tokens.len() < 5 && source.len() > 100 && !source_type.is_jsx() {
        let jsx_type = if source_type.is_typescript() {
            SourceType::tsx()
        } else {
            SourceType::jsx()
        };
        let allocator2 = Allocator::default();
        let retry_return = Parser::new(&allocator2, source, jsx_type).parse();
        let mut retry_extractor = TokenExtractor::with_strip_types(strip_types);
        retry_extractor.visit_program(&retry_return.program);
        if retry_extractor.tokens.len() > extractor.tokens.len() {
            extractor = retry_extractor;
        }
    }

    FileTokens {
        tokens: extractor.tokens,
        source: source.to_string(),
        line_count: source.lines().count().max(1),
    }
}

#[cfg(test)]
mod tests;