fallow_core/duplicates/tokenize/
mod.rs1use std::path::Path;
2
3use oxc_allocator::Allocator;
4use oxc_ast_visit::Visit;
5use oxc_parser::Parser;
6use oxc_span::{SourceType, Span};
7
8pub use super::token_types::{
10 FileTokens, KeywordType, OperatorType, PunctuationType, SourceToken, TokenKind,
11};
12use super::token_visitor::TokenExtractor;
13
14pub fn tokenize_file(path: &Path, source: &str) -> FileTokens {
24 tokenize_file_inner(path, source, false)
25}
26
27pub fn tokenize_file_cross_language(path: &Path, source: &str, strip_types: bool) -> FileTokens {
29 tokenize_file_inner(path, source, strip_types)
30}
31
32fn tokenize_file_inner(path: &Path, source: &str, strip_types: bool) -> FileTokens {
33 use crate::extract::{
34 extract_astro_frontmatter, extract_mdx_statements, extract_sfc_scripts, is_sfc_file,
35 };
36
37 let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
38
39 if is_sfc_file(path) {
41 let scripts = extract_sfc_scripts(source);
42 let mut all_tokens = Vec::new();
43
44 for script in &scripts {
45 let source_type = match (script.is_typescript, script.is_jsx) {
46 (true, true) => SourceType::tsx(),
47 (true, false) => SourceType::ts(),
48 (false, true) => SourceType::jsx(),
49 (false, false) => SourceType::mjs(),
50 };
51 let allocator = Allocator::default();
52 let parser_return = Parser::new(&allocator, &script.body, source_type).parse();
53
54 let mut extractor = TokenExtractor::with_strip_types(strip_types);
55 extractor.visit_program(&parser_return.program);
56
57 let offset = script.byte_offset as u32;
60 for token in &mut extractor.tokens {
61 token.span = Span::new(token.span.start + offset, token.span.end + offset);
62 }
63 all_tokens.extend(extractor.tokens);
64 }
65
66 let line_count = source.lines().count().max(1);
67 return FileTokens {
68 tokens: all_tokens,
69 source: source.to_string(),
70 line_count,
71 };
72 }
73
74 if ext == "astro" {
76 if let Some(script) = extract_astro_frontmatter(source) {
77 let allocator = Allocator::default();
78 let parser_return = Parser::new(&allocator, &script.body, SourceType::ts()).parse();
79
80 let mut extractor = TokenExtractor::with_strip_types(strip_types);
81 extractor.visit_program(&parser_return.program);
82
83 let offset = script.byte_offset as u32;
84 for token in &mut extractor.tokens {
85 token.span = Span::new(token.span.start + offset, token.span.end + offset);
86 }
87
88 let line_count = source.lines().count().max(1);
89 return FileTokens {
90 tokens: extractor.tokens,
91 source: source.to_string(),
92 line_count,
93 };
94 }
95 let line_count = source.lines().count().max(1);
97 return FileTokens {
98 tokens: Vec::new(),
99 source: source.to_string(),
100 line_count,
101 };
102 }
103
104 if ext == "mdx" {
106 let statements = extract_mdx_statements(source);
107 if !statements.is_empty() {
108 let allocator = Allocator::default();
109 let parser_return = Parser::new(&allocator, &statements, SourceType::jsx()).parse();
110
111 let mut extractor = TokenExtractor::with_strip_types(strip_types);
112 extractor.visit_program(&parser_return.program);
113
114 let line_count = source.lines().count().max(1);
115 return FileTokens {
116 tokens: extractor.tokens,
117 source: source.to_string(),
118 line_count,
119 };
120 }
121 let line_count = source.lines().count().max(1);
122 return FileTokens {
123 tokens: Vec::new(),
124 source: source.to_string(),
125 line_count,
126 };
127 }
128
129 if ext == "css" || ext == "scss" {
131 let line_count = source.lines().count().max(1);
132 return FileTokens {
133 tokens: Vec::new(),
134 source: source.to_string(),
135 line_count,
136 };
137 }
138
139 let source_type = SourceType::from_path(path).unwrap_or_default();
140 let allocator = Allocator::default();
141 let parser_return = Parser::new(&allocator, source, source_type).parse();
142
143 let mut extractor = TokenExtractor::with_strip_types(strip_types);
144 extractor.visit_program(&parser_return.program);
145
146 if extractor.tokens.len() < 5 && source.len() > 100 && !source_type.is_jsx() {
149 let jsx_type = if source_type.is_typescript() {
150 SourceType::tsx()
151 } else {
152 SourceType::jsx()
153 };
154 let allocator2 = Allocator::default();
155 let retry_return = Parser::new(&allocator2, source, jsx_type).parse();
156 let mut retry_extractor = TokenExtractor::with_strip_types(strip_types);
157 retry_extractor.visit_program(&retry_return.program);
158 if retry_extractor.tokens.len() > extractor.tokens.len() {
159 extractor = retry_extractor;
160 }
161 }
162
163 let line_count = source.lines().count().max(1);
164
165 FileTokens {
166 tokens: extractor.tokens,
167 source: source.to_string(),
168 line_count,
169 }
170}
171
172#[cfg(test)]
173mod tests;