1mod apex;
2mod blocks;
3mod embedded;
4mod generic;
5mod hash;
6mod ignore;
7mod line_index;
8mod markdown;
9mod markup_attrs;
10mod oxc;
11mod scan;
12mod tap;
13
14use serde::Serialize;
15
16use crate::cli::{Mode, Options};
17
18use generic::tokenize_generic;
19use hash::hash_token;
20use ignore::find_ignore_regions;
21use line_index::LineIndex;
22use oxc::{is_oxc_format, tokenize_oxc_maps};
23use scan::count_prism_whitespace_tokens;
24
25#[derive(Clone, Debug, Serialize)]
27pub struct Location {
28 pub line: usize,
30 pub column: usize,
32 pub position: usize,
34}
35
36#[derive(Clone, Debug)]
38pub struct DetectionToken {
39 pub hash: u64,
41 pub start: Location,
43 pub end: Location,
45 pub range: [usize; 2],
47}
48
49#[derive(Clone, Debug)]
54pub struct TokenMap {
55 pub format: String,
57 pub tokens: Vec<DetectionToken>,
59 positions_assigned: bool,
60}
61
62#[derive(Clone, Debug)]
64pub struct SourceTokenMap {
65 pub source_id: String,
67 pub format: String,
69 pub tokens: Vec<DetectionToken>,
71 pub lines: usize,
73}
74
75#[derive(Clone, Debug)]
80pub struct Tokenizer {
81 options: Options,
82}
83
84impl Default for Tokenizer {
85 fn default() -> Self {
86 Self::new()
87 }
88}
89
90impl Tokenizer {
91 pub fn new() -> Self {
93 Self {
94 options: Options::default(),
95 }
96 }
97
98 pub fn with_options(options: Options) -> Self {
100 Self { options }
101 }
102
103 pub fn options(&self) -> &Options {
105 &self.options
106 }
107
108 pub fn options_mut(&mut self) -> &mut Options {
110 &mut self.options
111 }
112
113 pub fn tokenize(&self, content: &str, format: &str) -> Vec<DetectionToken> {
118 self.tokenize_maps(content, format)
119 .into_iter()
120 .next()
121 .map(|map| map.tokens)
122 .unwrap_or_default()
123 }
124
125 pub fn tokenize_maps(&self, content: &str, format: &str) -> Vec<TokenMap> {
127 tokenize_maps_for_detection(content, format, &self.options)
128 }
129
130 pub fn generate_maps(
132 &self,
133 source_id: impl Into<String>,
134 content: &str,
135 format: &str,
136 ) -> Vec<SourceTokenMap> {
137 let source_id = source_id.into();
138 self.tokenize_maps(content, format)
139 .into_iter()
140 .map(|map| SourceTokenMap {
141 source_id: source_id.clone(),
142 lines: token_map_line_count(&map.tokens),
143 format: map.format,
144 tokens: map.tokens,
145 })
146 .collect()
147 }
148}
149
/// Classification of a token; it is passed to `hash_token` together with the
/// token text.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum TokenKind {
    /// A comment; `push_token` drops these in `Mode::Weak`.
    Comment,
    Constant,
    /// A run of non-newline whitespace (see `scan_whitespace_token`).
    Empty,
    Keyword,
    /// A single `\n` (see `scan_whitespace_token`).
    NewLine,
    Number,
    Operator,
    Punctuation,
    String,
    Default,
}
163
/// A `start..end` byte range into the text being tokenized.
#[derive(Clone, Copy)]
struct ByteSpan {
    /// Byte offset of the first byte in the span.
    start: usize,
    /// Byte offset one past the last byte in the span.
    end: usize,
}
169
170struct TokenContext<'a> {
171 content: &'a str,
172 options: &'a Options,
173 ignore_regions: &'a [[usize; 2]],
174}
175
176impl TokenContext<'_> {
177 fn slice(&self, span: ByteSpan) -> &str {
178 &self.content[span.start..span.end]
179 }
180
181 fn overlaps_ignore_region(&self, span: ByteSpan) -> bool {
182 self.ignore_regions
183 .iter()
184 .any(|[region_start, region_end]| span.start < *region_end && span.end > *region_start)
185 }
186}
187
/// Test-only helper: tokenizes `content` and returns the tokens of the first
/// token map, or an empty vector when no map was produced.
#[cfg(test)]
fn tokenize_for_detection(content: &str, format: &str, options: &Options) -> Vec<DetectionToken> {
    let first_map = tokenize_maps_for_detection(content, format, options)
        .into_iter()
        .next();
    match first_map {
        Some(map) => map.tokens,
        None => Vec::new(),
    }
}
196
197pub fn tokenize_maps_for_detection(
198 content: &str,
199 format: &str,
200 options: &Options,
201) -> Vec<TokenMap> {
202 let ignore_regions = find_ignore_regions(content, options);
203 let mut maps = if format == "markdown" {
204 markdown::tokenize_maps(content, options, &ignore_regions)
205 } else if format == "apex" {
206 apex::tokenize_maps(content, options, &ignore_regions)
207 } else if format == "tap" {
208 tap::tokenize_maps(content, options, &ignore_regions)
209 } else if matches!(format, "markup" | "vue" | "svelte" | "astro") {
210 blocks::tokenize_maps(content, format, options, &ignore_regions)
211 } else if is_oxc_format(format) {
212 tokenize_oxc_maps(content, format, options, &ignore_regions)
213 } else {
214 vec![TokenMap {
215 format: format.to_string(),
216 tokens: tokenize_generic(content, format, options, &ignore_regions),
217 positions_assigned: false,
218 }]
219 };
220 for map in &mut maps {
221 if !map.positions_assigned {
222 assign_token_positions(content, &map.format, options, &mut map.tokens);
223 }
224 }
225 maps
226}
227
228fn token_map_line_count(tokens: &[DetectionToken]) -> usize {
229 match (tokens.first(), tokens.last()) {
230 (Some(first), Some(last)) => last.end.line.saturating_sub(first.start.line),
231 _ => 0,
232 }
233}
234
235fn assign_token_positions(
236 content: &str,
237 format: &str,
238 options: &Options,
239 tokens: &mut [DetectionToken],
240) {
241 let needs_report_positions =
242 options.reporters.iter().any(|reporter| reporter == "json") || !options.silent;
243 if !needs_report_positions || !matches!(format, "javascript" | "typescript" | "jsx" | "tsx") {
244 for (position, token) in tokens.iter_mut().enumerate() {
245 token.start.position = position;
246 token.end.position = position;
247 }
248 return;
249 }
250
251 let mut position = 0usize;
252 let mut previous_end = 0usize;
253 for token in tokens {
254 if token.range[0] > previous_end {
255 position += count_prism_whitespace_tokens(content, previous_end, token.range[0]);
256 }
257 token.start.position = position;
258 token.end.position = position;
259 position += 1;
260 previous_end = previous_end.max(token.range[1]);
261 }
262}
263
264fn push_token(
265 tokens: &mut Vec<DetectionToken>,
266 context: &TokenContext<'_>,
267 kind: TokenKind,
268 span: ByteSpan,
269 start: Location,
270 end: Location,
271) {
272 if context.options.mode == Mode::Weak && kind == TokenKind::Comment {
273 return;
274 }
275 if context.overlaps_ignore_region(span) {
276 return;
277 }
278 tokens.push(DetectionToken {
279 hash: hash_token(kind, context.slice(span), context.options.ignore_case),
280 start,
281 end,
282 range: [span.start, span.end],
283 });
284}
285
286fn push_strict_whitespace_tokens(
287 tokens: &mut Vec<DetectionToken>,
288 context: &TokenContext<'_>,
289 span: ByteSpan,
290 line_index: &LineIndex,
291) {
292 if context.options.mode != Mode::Strict {
293 return;
294 }
295 let mut start = span.start;
296 while start < span.end {
297 let (end, kind) = scan_whitespace_token(context.content, start, span.end);
298 push_token(
299 tokens,
300 context,
301 kind,
302 ByteSpan { start, end },
303 line_index.location(start),
304 line_index.location(end),
305 );
306 start = end.max(start + 1);
307 }
308}
309
310fn scan_whitespace_token(content: &str, start: usize, limit: usize) -> (usize, TokenKind) {
311 let bytes = content.as_bytes();
312 if bytes[start] == b'\n' {
313 return (start + 1, TokenKind::NewLine);
314 }
315
316 let mut end = start;
317 while end < limit {
318 let ch = content[end..].chars().next().unwrap_or('\0');
319 if ch == '\n' || !ch.is_whitespace() {
320 break;
321 }
322 end += ch.len_utf8();
323 }
324 (end, TokenKind::Empty)
325}
326
327#[cfg(test)]
328mod tests;