1use std::collections::BTreeMap;
4
5use cpd_core::models::{DetectionToken, Token};
6
7use crate::embedded::blank_ranges_preserve_newlines;
8use crate::line_index::LineIndex;
9use crate::markdown::{offset_detection_tokens, tokens_to_detection};
10use crate::tokenizer::{Mode, TokenMap, TokenizeOptions};
11
/// One language section extracted from a single-file component for display/tokenizing.
#[derive(Debug, Clone)]
pub struct Block {
    /// Format name chosen for the section (for example `"javascript"`, `"css"` or `"html"`).
    pub block_format: String,
    /// Raw text found between the opening tag's `>` and the matching closing tag.
    pub content: String,
    /// Byte offset in the original source at which `content` begins.
    pub start_offset: usize,
    /// Number of lines in the source up to the end of the opening tag, plus one.
    pub start_line: u32,
}
19
/// Byte-offset description of one embedded block inside a single-file component.
///
/// All offsets index into the original source string.
// Some fields (e.g. `tag`) are stored but not read by the code visible in this file.
#[allow(dead_code)]
struct SfcBlock {
    /// Tag name the block was found under.
    tag: String,
    /// Format name used to tokenize the block's inner text.
    block_format: String,
    /// Offset where the whole block (opening delimiter included) starts.
    block_start: usize,
    /// Offset of the first byte of the inner content.
    inner_start: usize,
    /// Offset one past the last byte of the inner content.
    inner_end: usize,
    /// Offset one past the end of the whole block (closing delimiter included).
    block_end: usize,
}
29
30pub fn tokenize_sfc_maps(
31 source: &str,
32 file_format: &str,
33 options: &TokenizeOptions,
34) -> Vec<TokenMap> {
35 if source.is_empty() {
36 return Vec::new();
37 }
38
39 let blocks = find_sfc_blocks(source, file_format);
40 if blocks.is_empty() {
41 let tokens = crate::generic::tokenize_generic(source, "html");
42 let detection = tokens_to_detection(tokens, options);
43 return if detection.is_empty() {
44 Vec::new()
45 } else {
46 vec![TokenMap {
47 format: "html".to_string(),
48 tokens: detection,
49 }]
50 };
51 }
52
53 let blank_ranges: Vec<[usize; 2]> = blocks
54 .iter()
55 .filter_map(|b| {
56 if b.inner_start < b.inner_end {
57 Some([b.inner_start, b.inner_end])
58 } else {
59 None
60 }
61 })
62 .collect();
63
64 let sanitized = blank_ranges_preserve_newlines(source, &blank_ranges);
65 let line_index = LineIndex::new(source.as_bytes());
66
67 let mut grouped: BTreeMap<String, Vec<DetectionToken>> = BTreeMap::new();
68
69 let markup_tokens = crate::generic::tokenize_generic(&sanitized, "html");
70 let mut markup_detection = tokens_to_detection(markup_tokens, options);
71 markup_detection.retain(|t| t.range[0] < t.range[1]);
72 if !markup_detection.is_empty() {
73 grouped
74 .entry("html".to_string())
75 .or_default()
76 .extend(markup_detection);
77 }
78
79 for block in &blocks {
80 if block.inner_start >= block.inner_end {
81 continue;
82 }
83 let inner = &source[block.inner_start..block.inner_end];
84 let inner_start_loc = line_index.location(block.inner_start);
85
86 let mut inner_tokens = tokenize_sfc_block_inner(&block.block_format, inner, options);
87 offset_detection_tokens(&mut inner_tokens, block.inner_start, &inner_start_loc);
88
89 grouped
90 .entry(block.block_format.clone())
91 .or_default()
92 .extend(inner_tokens);
93 }
94
95 grouped
96 .into_iter()
97 .filter(|(_, tokens)| !tokens.is_empty())
98 .map(|(format, tokens)| TokenMap { format, tokens })
99 .collect()
100}
101
102fn tokenize_sfc_block_inner(
103 format: &str,
104 source: &str,
105 options: &TokenizeOptions,
106) -> Vec<DetectionToken> {
107 let raw = match format {
108 "javascript" | "typescript" | "jsx" | "tsx" => {
109 crate::javascript::tokenize_js(source, format)
110 }
111 "vue" | "svelte" | "astro" => crate::sfc::tokenize_sfc(source, format, options.mode),
112 "markdown" | "md" => crate::generic::tokenize_generic(source, format),
113 _ => crate::generic::tokenize_generic(source, format),
114 };
115 tokens_to_detection(raw, options)
116}
117
118fn find_sfc_blocks(source: &str, file_format: &str) -> Vec<SfcBlock> {
119 let source_lower = source.to_ascii_lowercase();
120 let tag_names: &[&str] = match file_format {
121 "svelte" | "astro" => &["script", "style"],
122 _ => &["template", "script", "style"],
123 };
124
125 let mut blocks = Vec::new();
126
127 if file_format == "astro" {
128 if let Some(fm) = astro_frontmatter_block(source) {
129 blocks.push(fm);
130 }
131 }
132
133 for tag in tag_names {
134 let mut search_from = 0usize;
135 while let Some(block) = find_sfc_tag_block(source, &source_lower, tag, search_from) {
136 search_from = block.block_end;
137 blocks.push(block);
138 }
139 }
140
141 blocks.sort_by_key(|b| b.block_start);
142 blocks
143}
144
145fn find_sfc_tag_block(
146 source: &str,
147 source_lower: &str,
148 tag: &str,
149 from: usize,
150) -> Option<SfcBlock> {
151 let open_needle = format!("<{}", tag);
152 let close_needle = format!("</{}>", tag);
153
154 let open_start = source_lower[from..].find(&open_needle)? + from;
155 let after_tag_name = open_start + 1 + tag.len();
156 if source_lower
157 .as_bytes()
158 .get(after_tag_name)
159 .is_some_and(|b| b.is_ascii_alphabetic())
160 {
161 return None;
162 }
163 let tag_end = source_lower[open_start..].find('>')? + open_start + 1;
164 let close_start = source_lower[tag_end..].find(&close_needle)? + tag_end;
165
166 let attrs = &source[open_start + 1 + tag.len()..tag_end];
167 let inner_start = tag_end;
168 let inner_end = close_start;
169 let block_end = source_lower[close_start..]
170 .find('>')
171 .map(|i| close_start + i + 1)
172 .unwrap_or(close_start + close_needle.len());
173 let block_end = block_end.min(source.len());
174
175 let block_format = detect_sfc_block_format(attrs, tag);
176
177 Some(SfcBlock {
178 tag: tag.to_string(),
179 block_format,
180 block_start: open_start,
181 inner_start,
182 inner_end: inner_end.max(inner_start),
183 block_end,
184 })
185}
186
187fn detect_sfc_block_format(attrs: &str, tag: &str) -> String {
188 let lang = extract_lang_attr_value(attrs);
189 match tag {
190 "script" => match lang.as_deref() {
191 Some("ts" | "typescript") => "typescript".to_string(),
192 Some("js" | "javascript") => "javascript".to_string(),
193 Some(other) => {
194 if crate::formats::get_format_by_extension(other).is_some()
195 || crate::formats::SUPPORTED_FORMATS
196 .iter()
197 .any(|e| e.name == other)
198 {
199 other.to_string()
200 } else {
201 "javascript".to_string()
202 }
203 }
204 None => "javascript".to_string(),
205 },
206 "style" => match lang.as_deref() {
207 Some("scss" | "sass") => "scss".to_string(),
208 Some("less") => "less".to_string(),
209 _ => "css".to_string(),
210 },
211 "template" => match lang.as_deref() {
212 Some(v) if v == "pug" || v == "jade" => "pug".to_string(),
213 _ => "html".to_string(),
214 },
215 _ => "html".to_string(),
216 }
217}
218
219fn astro_frontmatter_block(source: &str) -> Option<SfcBlock> {
220 if !(source.starts_with("---\n") || source.starts_with("---\r\n")) {
221 return None;
222 }
223 let lines = crate::markdown::line_spans(source);
224 let close_idx = lines
225 .iter()
226 .enumerate()
227 .skip(1)
228 .find(|(_, span)| source[span.start..span.end].trim() == "---")
229 .map(|(idx, _)| idx)?;
230 let inner_start = lines.get(1)?.start;
231 let inner_end = source[..lines[close_idx].start]
232 .strip_suffix('\n')
233 .map(|prefix: &str| prefix.len())
234 .unwrap_or(lines[close_idx].start);
235 let block_end = lines[close_idx].next_start.min(source.len());
236 Some(SfcBlock {
237 tag: "script".to_string(),
238 block_format: "typescript".to_string(),
239 block_start: 0,
240 inner_start,
241 inner_end: inner_end.max(inner_start),
242 block_end,
243 })
244}
245
/// Extracts the lower-cased value of the `lang` attribute from a tag's attribute text.
///
/// `attrs` is the raw text that follows the tag name (callers in this file
/// pass everything up to and including the closing `>`). Matching is ASCII
/// case-insensitive.
///
/// * `lang` must start an attribute name: it has to be at the beginning of
///   `attrs` or preceded by ASCII whitespace, so `data-lang=` and `hreflang=`
///   are ignored and scanning continues with the next occurrence.
/// * Double- and single-quoted values are supported; a missing closing quote
///   yields `None`.
/// * Unquoted values (`lang=ts`) are supported and end at ASCII whitespace or `>`.
///
/// Returns `None` when no usable `lang` attribute is present.
fn extract_lang_attr_value(attrs: &str) -> Option<String> {
    const NEEDLE: &str = "lang=";

    // ASCII lower-casing keeps byte offsets identical to `attrs`.
    let lower = attrs.to_ascii_lowercase();
    let bytes = lower.as_bytes();

    let mut search_from = 0usize;
    while let Some(rel) = lower[search_from..].find(NEEDLE) {
        let name_start = search_from + rel;
        let value_start = name_start + NEEDLE.len();
        search_from = value_start;

        // Reject hits that are only the tail of a longer attribute name.
        let at_name_boundary = name_start == 0 || bytes[name_start - 1].is_ascii_whitespace();
        if !at_name_boundary {
            continue;
        }

        let rest = &lower[value_start..];
        return match rest.chars().next() {
            Some(quote @ ('"' | '\'')) => {
                let body = &rest[1..];
                body.find(quote).map(|end| body[..end].to_string())
            }
            _ => {
                let end = rest
                    .find(|c: char| c.is_ascii_whitespace() || c == '>')
                    .unwrap_or(rest.len());
                (end > 0).then(|| rest[..end].to_string())
            }
        };
    }

    None
}
261
262pub fn extract_blocks(source: &str, file_format: &str) -> Vec<Block> {
264 let source_lower = source.to_ascii_lowercase();
265 let tag_names: &[&str] = match file_format {
266 "svelte" | "astro" => &["script", "style"],
267 _ => &["template", "script", "style"],
268 };
269
270 let mut blocks = Vec::new();
271 for tag in tag_names {
272 let mut search_from = 0;
273 while let Some((block, next_from)) =
274 find_display_block(source, &source_lower, tag, search_from)
275 {
276 search_from = next_from;
277 blocks.push(block);
278 }
279 }
280 blocks.sort_by_key(|b: &Block| b.start_offset);
281 blocks
282}
283
284fn find_display_block(
285 source: &str,
286 source_lower: &str,
287 tag: &str,
288 from: usize,
289) -> Option<(Block, usize)> {
290 let open_needle = format!("<{}", tag);
291 let close_needle = format!("</{}>", tag);
292
293 let open_start = source_lower[from..].find(&open_needle)? + from;
294 let after_tag_name = open_start + 1 + tag.len();
295 if source_lower
296 .as_bytes()
297 .get(after_tag_name)
298 .is_some_and(|b| b.is_ascii_alphabetic())
299 {
300 return None;
301 }
302 let tag_end = source_lower[open_start..].find('>')? + open_start + 1;
303 let close_start = source_lower[tag_end..].find(&close_needle)? + tag_end;
304
305 let attrs = &source[open_start + 1 + tag.len()..tag_end];
306 let content = source[tag_end..close_start].to_string();
307 let content_len = content.len();
308 let start_line = source[..tag_end].lines().count() as u32 + 1;
309 let block_format = detect_display_block_format(attrs, tag);
310
311 Some((
312 Block {
313 block_format,
314 content,
315 start_offset: tag_end,
316 start_line,
317 },
318 tag_end + content_len,
319 ))
320}
321
322fn detect_display_block_format(attrs: &str, tag: &str) -> String {
323 let lang = extract_lang_attr_value(attrs);
324 match tag {
325 "script" => match lang.as_deref() {
326 Some("ts" | "typescript") => "typescript".to_string(),
327 Some("js" | "javascript") => "javascript".to_string(),
328 _ => "javascript".to_string(),
329 },
330 "style" => match lang.as_deref() {
331 Some("scss" | "sass") => "scss".to_string(),
332 Some("less") => "less".to_string(),
333 _ => "css".to_string(),
334 },
335 "template" => "html".to_string(),
336 _ => "html".to_string(),
337 }
338}
339
340pub fn tokenize_sfc(source: &str, file_format: &str, mode: Mode) -> Vec<Token> {
341 let blocks = extract_blocks(source, file_format);
342 let mut all_tokens = Vec::new();
343
344 for block in &blocks {
345 let mut block_tokens =
346 crate::tokenizer::tokenize(&block.block_format, &block.content, mode);
347 let line_offset = block.start_line.saturating_sub(1);
348 for token in &mut block_tokens {
349 token.start.line += line_offset;
350 token.end.line += line_offset;
351 }
352 all_tokens.extend(block_tokens);
353 }
354
355 all_tokens
356}
357
358#[cfg(test)]
359mod tests {
360 use super::*;
361
362 const VUE_FILE: &str = r#"<template>
363 <div>Hello</div>
364</template>
365
366<script>
367export default { name: 'Foo' }
368</script>
369
370<style>
371.foo { color: red; }
372</style>
373"#;
374
375 const VUE_TS_FILE: &str = r#"<template>
376 <div>Hello</div>
377</template>
378
379<script lang="ts">
380const x: number = 5;
381</script>
382
383<style lang="scss">
384.foo { color: red; }
385</style>
386"#;
387
388 #[test]
389 fn vue_file_extracts_three_blocks() {
390 let blocks = extract_blocks(VUE_FILE, "vue");
391 assert_eq!(blocks.len(), 3, "must find template, script, style blocks");
392 }
393
394 #[test]
395 fn script_block_default_format_is_javascript() {
396 let blocks = extract_blocks(VUE_FILE, "vue");
397 let script = blocks.iter().find(|b| b.block_format == "javascript");
398 assert!(script.is_some(), "plain <script> must be javascript format");
399 }
400
401 #[test]
402 fn script_lang_ts_produces_typescript_format() {
403 let blocks = extract_blocks(VUE_TS_FILE, "vue");
404 let ts_block = blocks.iter().find(|b| b.block_format == "typescript");
405 assert!(
406 ts_block.is_some(),
407 "<script lang=\"ts\"> must produce typescript format"
408 );
409 }
410
411 #[test]
412 fn unknown_lang_does_not_panic() {
413 let source = "<script lang=\"unknownlang123\">\nconst x = 1;\n</script>\n";
414 let result = std::panic::catch_unwind(|| extract_blocks(source, "vue"));
415 assert!(result.is_ok(), "unknown lang must not panic");
416 }
417
418 #[test]
419 fn no_blocks_returns_empty() {
420 let source = "just plain text no tags";
421 let blocks = extract_blocks(source, "vue");
422 assert!(blocks.is_empty());
423 }
424
425 #[test]
426 fn start_offset_is_after_opening_tag() {
427 let blocks = extract_blocks(VUE_FILE, "vue");
428 for block in &blocks {
429 assert!(block.start_offset > 0);
430 }
431 }
432
433 #[test]
434 fn vue_sfc_maps_produces_multiple_formats() {
435 let options = TokenizeOptions::new(Mode::Mild);
436 let maps = tokenize_sfc_maps(VUE_FILE, "vue", &options);
437 let formats: Vec<&str> = maps.iter().map(|m| m.format.as_str()).collect();
438 assert!(formats.contains(&"javascript"), "must have javascript map");
439 assert!(formats.contains(&"css"), "must have css map");
440 assert!(formats.contains(&"html"), "must have html map");
441 }
442
443 #[test]
444 fn vue_ts_maps_produces_typescript() {
445 let options = TokenizeOptions::new(Mode::Mild);
446 let maps = tokenize_sfc_maps(VUE_TS_FILE, "vue", &options);
447 let formats: Vec<&str> = maps.iter().map(|m| m.format.as_str()).collect();
448 assert!(formats.contains(&"typescript"), "must have typescript map");
449 assert!(formats.contains(&"scss"), "must have scss map");
450 }
451
452 #[test]
453 fn empty_sfc_returns_empty() {
454 let options = TokenizeOptions::new(Mode::Mild);
455 let maps = tokenize_sfc_maps("", "vue", &options);
456 assert!(maps.is_empty());
457 }
458
459 #[test]
460 fn svelte_sfc_maps_produces_multiple_formats() {
461 let source = r#"<script>
462 let count = 0;
463</script>
464
465<style>
466 .count { color: blue; }
467</style>
468"#;
469 let options = TokenizeOptions::new(Mode::Mild);
470 let maps = tokenize_sfc_maps(source, "svelte", &options);
471 let formats: Vec<&str> = maps.iter().map(|m| m.format.as_str()).collect();
472 assert!(
473 formats.contains(&"javascript"),
474 "svelte must have javascript map"
475 );
476 assert!(formats.contains(&"css"), "svelte must have css map");
477 assert!(
478 formats.contains(&"html"),
479 "svelte must have html markup map"
480 );
481 }
482}