1use std::collections::BTreeMap;
2
3use cpd_core::models::{DetectionToken, Token};
4
5use crate::embedded::blank_ranges_preserve_newlines;
6use crate::line_index::LineIndex;
7use crate::markdown::{offset_detection_tokens, tokens_to_detection};
8use crate::tokenizer::{Mode, TokenMap, TokenizeOptions};
9
/// A language block extracted from a single-file component by [`extract_blocks`].
#[derive(Debug, Clone)]
pub struct Block {
    /// Format name chosen for the block's content (for example `"javascript"`,
    /// `"typescript"`, `"css"`, `"scss"`, `"less"` or `"html"`).
    pub block_format: String,
    /// Text between the end of the opening tag and the start of the closing tag.
    pub content: String,
    /// Byte offset in the source at which `content` begins (just past the
    /// opening tag's `>`).
    pub start_offset: usize,
    /// Line value computed by `find_display_block` as the number of lines in the
    /// source up to the end of the opening tag, plus one. `tokenize_sfc`
    /// subtracts one from it and adds the result to every token's line.
    pub start_line: u32,
}
17
/// Internal description of one block located by `find_sfc_blocks`, expressed
/// as byte offsets into the source text.
// NOTE(review): `dead_code` is presumably allowed because `tag` is stored but
// never read in the visible code — confirm before removing the attribute.
#[allow(dead_code)]
struct SfcBlock {
    /// Tag name the block was found under (`"script"` for Astro frontmatter).
    tag: String,
    /// Format name used to pick the tokenizer for the block's inner content.
    block_format: String,
    /// Offset of the block's first byte (the opening `<`, or 0 for frontmatter).
    block_start: usize,
    /// Offset of the first byte of the inner content.
    inner_start: usize,
    /// Offset one past the last byte of the inner content; never less than
    /// `inner_start`.
    inner_end: usize,
    /// Offset one past the block's closing delimiter.
    block_end: usize,
}
27
28pub fn tokenize_sfc_maps(
29 source: &str,
30 file_format: &str,
31 options: &TokenizeOptions,
32) -> Vec<TokenMap> {
33 if source.is_empty() {
34 return Vec::new();
35 }
36
37 let blocks = find_sfc_blocks(source, file_format);
38 if blocks.is_empty() {
39 let tokens = crate::generic::tokenize_generic(source, "html");
40 let detection = tokens_to_detection(tokens, options);
41 return if detection.is_empty() {
42 Vec::new()
43 } else {
44 vec![TokenMap {
45 format: "html".to_string(),
46 tokens: detection,
47 }]
48 };
49 }
50
51 let blank_ranges: Vec<[usize; 2]> = blocks
52 .iter()
53 .filter_map(|b| {
54 if b.inner_start < b.inner_end {
55 Some([b.inner_start, b.inner_end])
56 } else {
57 None
58 }
59 })
60 .collect();
61
62 let sanitized = blank_ranges_preserve_newlines(source, &blank_ranges);
63 let line_index = LineIndex::new(source.as_bytes());
64
65 let mut grouped: BTreeMap<String, Vec<DetectionToken>> = BTreeMap::new();
66
67 let markup_tokens = crate::generic::tokenize_generic(&sanitized, "html");
68 let mut markup_detection = tokens_to_detection(markup_tokens, options);
69 markup_detection.retain(|t| t.range[0] < t.range[1]);
70 if !markup_detection.is_empty() {
71 grouped
72 .entry("html".to_string())
73 .or_default()
74 .extend(markup_detection);
75 }
76
77 for block in &blocks {
78 if block.inner_start >= block.inner_end {
79 continue;
80 }
81 let inner = &source[block.inner_start..block.inner_end];
82 let inner_start_loc = line_index.location(block.inner_start);
83
84 let mut inner_tokens = tokenize_sfc_block_inner(&block.block_format, inner, options);
85 offset_detection_tokens(&mut inner_tokens, block.inner_start, &inner_start_loc);
86
87 grouped
88 .entry(block.block_format.clone())
89 .or_default()
90 .extend(inner_tokens);
91 }
92
93 grouped
94 .into_iter()
95 .filter(|(_, tokens)| !tokens.is_empty())
96 .map(|(format, tokens)| TokenMap { format, tokens })
97 .collect()
98}
99
100fn tokenize_sfc_block_inner(
101 format: &str,
102 source: &str,
103 options: &TokenizeOptions,
104) -> Vec<DetectionToken> {
105 let raw = match format {
106 "javascript" | "typescript" | "jsx" | "tsx" => {
107 crate::javascript::tokenize_js(source, format)
108 }
109 "vue" | "svelte" | "astro" => crate::sfc::tokenize_sfc(source, format, options.mode),
110 "markdown" | "md" => crate::generic::tokenize_generic(source, format),
111 _ => crate::generic::tokenize_generic(source, format),
112 };
113 tokens_to_detection(raw, options)
114}
115
116fn find_sfc_blocks(source: &str, file_format: &str) -> Vec<SfcBlock> {
117 let source_lower = source.to_ascii_lowercase();
118 let tag_names: &[&str] = match file_format {
119 "svelte" | "astro" => &["script", "style"],
120 _ => &["template", "script", "style"],
121 };
122
123 let mut blocks = Vec::new();
124
125 if file_format == "astro" {
126 if let Some(fm) = astro_frontmatter_block(source) {
127 blocks.push(fm);
128 }
129 }
130
131 for tag in tag_names {
132 let mut search_from = 0usize;
133 while let Some(block) = find_sfc_tag_block(source, &source_lower, tag, search_from) {
134 search_from = block.block_end;
135 blocks.push(block);
136 }
137 }
138
139 blocks.sort_by_key(|b| b.block_start);
140 let mut deduped = Vec::new();
141 for block in blocks {
142 let nested = deduped.iter().any(|existing: &SfcBlock| {
143 block.block_start >= existing.block_start && block.block_start < existing.block_end
144 });
145 if !nested {
146 deduped.push(block);
147 }
148 }
149 deduped
150}
151
152fn find_sfc_tag_block(
153 source: &str,
154 source_lower: &str,
155 tag: &str,
156 from: usize,
157) -> Option<SfcBlock> {
158 let open_needle = format!("<{}", tag);
159 let close_needle = format!("</{}>", tag);
160
161 let open_start = source_lower[from..].find(&open_needle)? + from;
162 let after_tag_name = open_start + 1 + tag.len();
163 if source_lower
164 .as_bytes()
165 .get(after_tag_name)
166 .is_some_and(|b| b.is_ascii_alphabetic())
167 {
168 return None;
169 }
170 let tag_end = source_lower[open_start..].find('>')? + open_start + 1;
171 let close_start = source_lower[tag_end..].find(&close_needle)? + tag_end;
172
173 let attrs = &source[open_start + 1 + tag.len()..tag_end];
174 let inner_start = tag_end;
175 let inner_end = close_start;
176 let block_end = source_lower[close_start..]
177 .find('>')
178 .map(|i| close_start + i + 1)
179 .unwrap_or(close_start + close_needle.len());
180 let block_end = block_end.min(source.len());
181
182 let block_format = detect_sfc_block_format(attrs, tag);
183
184 Some(SfcBlock {
185 tag: tag.to_string(),
186 block_format,
187 block_start: open_start,
188 inner_start,
189 inner_end: inner_end.max(inner_start),
190 block_end,
191 })
192}
193
194fn detect_sfc_block_format(attrs: &str, tag: &str) -> String {
195 let lang = extract_lang_attr_value(attrs);
196 match tag {
197 "script" => match lang.as_deref() {
198 Some("ts" | "typescript") => "typescript".to_string(),
199 Some("js" | "javascript") => "javascript".to_string(),
200 Some(other) => {
201 if crate::formats::get_format_by_extension(other).is_some()
202 || crate::formats::SUPPORTED_FORMATS
203 .iter()
204 .any(|e| e.name == other)
205 {
206 other.to_string()
207 } else {
208 "javascript".to_string()
209 }
210 }
211 None => "javascript".to_string(),
212 },
213 "style" => match lang.as_deref() {
214 Some("scss" | "sass") => "scss".to_string(),
215 Some("less") => "less".to_string(),
216 _ => "css".to_string(),
217 },
218 "template" => match lang.as_deref() {
219 Some(v) if v == "pug" || v == "jade" => "pug".to_string(),
220 _ => "html".to_string(),
221 },
222 _ => "html".to_string(),
223 }
224}
225
226fn astro_frontmatter_block(source: &str) -> Option<SfcBlock> {
227 if !(source.starts_with("---\n") || source.starts_with("---\r\n")) {
228 return None;
229 }
230 let lines = crate::markdown::line_spans(source);
231 let close_idx = lines
232 .iter()
233 .enumerate()
234 .skip(1)
235 .find(|(_, span)| source[span.start..span.end].trim() == "---")
236 .map(|(idx, _)| idx)?;
237 let inner_start = lines.get(1)?.start;
238 let inner_end = source[..lines[close_idx].start]
239 .strip_suffix('\n')
240 .map(|prefix: &str| prefix.len())
241 .unwrap_or(lines[close_idx].start);
242 let block_end = lines[close_idx].next_start.min(source.len());
243 Some(SfcBlock {
244 tag: "script".to_string(),
245 block_format: "typescript".to_string(),
246 block_start: 0,
247 inner_start,
248 inner_end: inner_end.max(inner_start),
249 block_end,
250 })
251}
252
/// Extracts the lowercased value of the `lang` attribute from the attribute
/// text of an opening tag.
///
/// `attrs` is the text that follows the tag name; a trailing `>` or `/>` is
/// tolerated. The text is scanned attribute by attribute, so only an attribute
/// whose *name* is `lang` (ASCII case-insensitive) matches. Look-alikes such
/// as `data-lang="…"`, or the text `lang=` inside another attribute's quoted
/// value, are ignored.
///
/// Double-quoted, single-quoted and unquoted values are accepted, with
/// optional whitespace around `=`.
///
/// Returns `None` when there is no `lang` attribute, when it has no value
/// (`lang` or `lang=`), or when a quoted value is never closed.
fn extract_lang_attr_value(attrs: &str) -> Option<String> {
    let bytes = attrs.as_bytes();
    let len = bytes.len();
    let mut pos = 0usize;

    while pos < len {
        // Skip separators between attributes and the tag terminator.
        while pos < len && (bytes[pos].is_ascii_whitespace() || matches!(bytes[pos], b'/' | b'>')) {
            pos += 1;
        }
        if pos >= len {
            break;
        }

        // Attribute name: runs until whitespace, `=`, `/` or `>`. All of these
        // are ASCII, so the slice boundaries are valid UTF-8 boundaries.
        let name_start = pos;
        while pos < len
            && !(bytes[pos].is_ascii_whitespace() || matches!(bytes[pos], b'=' | b'/' | b'>'))
        {
            pos += 1;
        }
        let name = &attrs[name_start..pos];

        // Optional whitespace, then `=`; without `=` the attribute has no value.
        let mut eq_pos = pos;
        while eq_pos < len && bytes[eq_pos].is_ascii_whitespace() {
            eq_pos += 1;
        }
        if eq_pos >= len || bytes[eq_pos] != b'=' {
            continue;
        }
        pos = eq_pos + 1;
        while pos < len && bytes[pos].is_ascii_whitespace() {
            pos += 1;
        }

        let value = match bytes.get(pos) {
            Some(&quote) if quote == b'"' || quote == b'\'' => {
                let value_start = pos + 1;
                // An unterminated quote leaves nothing reliable to parse.
                let value_len = attrs[value_start..].find(quote as char)?;
                pos = value_start + value_len + 1;
                &attrs[value_start..value_start + value_len]
            }
            _ => {
                let value_start = pos;
                while pos < len && !(bytes[pos].is_ascii_whitespace() || bytes[pos] == b'>') {
                    pos += 1;
                }
                if value_start == pos {
                    // `name=` with nothing after it: treat as valueless.
                    continue;
                }
                &attrs[value_start..pos]
            }
        };

        if name.eq_ignore_ascii_case("lang") {
            return Some(value.to_ascii_lowercase());
        }
    }

    None
}
268
269pub fn extract_blocks(source: &str, file_format: &str) -> Vec<Block> {
271 let source_lower = source.to_ascii_lowercase();
272 let tag_names: &[&str] = match file_format {
273 "svelte" | "astro" => &["script", "style"],
274 _ => &["template", "script", "style"],
275 };
276
277 let mut blocks = Vec::new();
278 for tag in tag_names {
279 let mut search_from = 0;
280 while let Some((block, next_from)) =
281 find_display_block(source, &source_lower, tag, search_from)
282 {
283 search_from = next_from;
284 blocks.push(block);
285 }
286 }
287 blocks.sort_by_key(|b: &Block| b.start_offset);
288 blocks
289}
290
291fn find_display_block(
292 source: &str,
293 source_lower: &str,
294 tag: &str,
295 from: usize,
296) -> Option<(Block, usize)> {
297 let open_needle = format!("<{}", tag);
298 let close_needle = format!("</{}>", tag);
299
300 let open_start = source_lower[from..].find(&open_needle)? + from;
301 let after_tag_name = open_start + 1 + tag.len();
302 if source_lower
303 .as_bytes()
304 .get(after_tag_name)
305 .is_some_and(|b| b.is_ascii_alphabetic())
306 {
307 return None;
308 }
309 let tag_end = source_lower[open_start..].find('>')? + open_start + 1;
310 let close_start = source_lower[tag_end..].find(&close_needle)? + tag_end;
311
312 let attrs = &source[open_start + 1 + tag.len()..tag_end];
313 let content = source[tag_end..close_start].to_string();
314 let content_len = content.len();
315 let start_line = source[..tag_end].lines().count() as u32 + 1;
316 let block_format = detect_display_block_format(attrs, tag);
317
318 Some((
319 Block {
320 block_format,
321 content,
322 start_offset: tag_end,
323 start_line,
324 },
325 tag_end + content_len,
326 ))
327}
328
329fn detect_display_block_format(attrs: &str, tag: &str) -> String {
330 let lang = extract_lang_attr_value(attrs);
331 match tag {
332 "script" => match lang.as_deref() {
333 Some("ts" | "typescript") => "typescript".to_string(),
334 Some("js" | "javascript") => "javascript".to_string(),
335 _ => "javascript".to_string(),
336 },
337 "style" => match lang.as_deref() {
338 Some("scss" | "sass") => "scss".to_string(),
339 Some("less") => "less".to_string(),
340 _ => "css".to_string(),
341 },
342 "template" => "html".to_string(),
343 _ => "html".to_string(),
344 }
345}
346
347pub fn tokenize_sfc(source: &str, file_format: &str, mode: Mode) -> Vec<Token> {
348 let blocks = extract_blocks(source, file_format);
349 let mut all_tokens = Vec::new();
350
351 for block in &blocks {
352 let mut block_tokens =
353 crate::tokenizer::tokenize(&block.block_format, &block.content, mode);
354 let line_offset = block.start_line.saturating_sub(1);
355 for token in &mut block_tokens {
356 token.start.line += line_offset;
357 token.end.line += line_offset;
358 }
359 all_tokens.extend(block_tokens);
360 }
361
362 all_tokens
363}
364
/// Unit tests for block extraction (`extract_blocks`) and per-format token
/// maps (`tokenize_sfc_maps`).
#[cfg(test)]
mod tests {
    use super::*;

    // Vue component with one plain block of each kind (no `lang` attributes).
    const VUE_FILE: &str = r#"<template>
 <div>Hello</div>
</template>

<script>
export default { name: 'Foo' }
</script>

<style>
.foo { color: red; }
</style>
"#;

    // Same layout, but the script and style blocks declare `lang`.
    const VUE_TS_FILE: &str = r#"<template>
 <div>Hello</div>
</template>

<script lang="ts">
const x: number = 5;
</script>

<style lang="scss">
.foo { color: red; }
</style>
"#;

    // A Vue file yields one block per top-level tag.
    #[test]
    fn vue_file_extracts_three_blocks() {
        let blocks = extract_blocks(VUE_FILE, "vue");
        assert_eq!(blocks.len(), 3, "must find template, script, style blocks");
    }

    // A `<script>` without `lang` is reported as javascript.
    #[test]
    fn script_block_default_format_is_javascript() {
        let blocks = extract_blocks(VUE_FILE, "vue");
        let script = blocks.iter().find(|b| b.block_format == "javascript");
        assert!(script.is_some(), "plain <script> must be javascript format");
    }

    // `lang="ts"` switches the script block to typescript.
    #[test]
    fn script_lang_ts_produces_typescript_format() {
        let blocks = extract_blocks(VUE_TS_FILE, "vue");
        let ts_block = blocks.iter().find(|b| b.block_format == "typescript");
        assert!(
            ts_block.is_some(),
            "<script lang=\"ts\"> must produce typescript format"
        );
    }

    // An unrecognized `lang` value must be handled without panicking.
    #[test]
    fn unknown_lang_does_not_panic() {
        let source = "<script lang=\"unknownlang123\">\nconst x = 1;\n</script>\n";
        let result = std::panic::catch_unwind(|| extract_blocks(source, "vue"));
        assert!(result.is_ok(), "unknown lang must not panic");
    }

    // Text without any recognized tag produces no blocks.
    #[test]
    fn no_blocks_returns_empty() {
        let source = "just plain text no tags";
        let blocks = extract_blocks(source, "vue");
        assert!(blocks.is_empty());
    }

    // Content always begins after an opening tag, so the offset is never 0.
    #[test]
    fn start_offset_is_after_opening_tag() {
        let blocks = extract_blocks(VUE_FILE, "vue");
        for block in &blocks {
            assert!(block.start_offset > 0);
        }
    }

    // Token maps for a plain Vue file cover script, style and markup.
    #[test]
    fn vue_sfc_maps_produces_multiple_formats() {
        let options = TokenizeOptions::new(Mode::Mild);
        let maps = tokenize_sfc_maps(VUE_FILE, "vue", &options);
        let formats: Vec<&str> = maps.iter().map(|m| m.format.as_str()).collect();
        assert!(formats.contains(&"javascript"), "must have javascript map");
        assert!(formats.contains(&"css"), "must have css map");
        assert!(formats.contains(&"html"), "must have html map");
    }

    // `lang` attributes are reflected in the map formats.
    #[test]
    fn vue_ts_maps_produces_typescript() {
        let options = TokenizeOptions::new(Mode::Mild);
        let maps = tokenize_sfc_maps(VUE_TS_FILE, "vue", &options);
        let formats: Vec<&str> = maps.iter().map(|m| m.format.as_str()).collect();
        assert!(formats.contains(&"typescript"), "must have typescript map");
        assert!(formats.contains(&"scss"), "must have scss map");
    }

    // Empty input short-circuits to no maps.
    #[test]
    fn empty_sfc_returns_empty() {
        let options = TokenizeOptions::new(Mode::Mild);
        let maps = tokenize_sfc_maps("", "vue", &options);
        assert!(maps.is_empty());
    }

    // Svelte files have no `<template>` block but still get an html map from
    // the markup pass.
    #[test]
    fn svelte_sfc_maps_produces_multiple_formats() {
        let source = r#"<script>
 let count = 0;
</script>

<style>
 .count { color: blue; }
</style>
"#;
        let options = TokenizeOptions::new(Mode::Mild);
        let maps = tokenize_sfc_maps(source, "svelte", &options);
        let formats: Vec<&str> = maps.iter().map(|m| m.format.as_str()).collect();
        assert!(
            formats.contains(&"javascript"),
            "svelte must have javascript map"
        );
        assert!(formats.contains(&"css"), "svelte must have css map");
        assert!(
            formats.contains(&"html"),
            "svelte must have html markup map"
        );
    }

    // A `<style>` that only appears as text inside a script string must not
    // break tokenization or hide the real style block.
    #[test]
    fn svelte_script_containing_style_text_no_panic() {
        let source = r#"<script>
 const x = "<style>.red{color:red}</style>";
</script>

<style>
 .blue { color: blue; }
</style>
"#;
        let result = std::panic::catch_unwind(|| {
            let options = TokenizeOptions::new(Mode::Mild);
            tokenize_sfc_maps(source, "svelte", &options)
        });
        assert!(
            result.is_ok(),
            "must not panic when <style> text appears inside <script>"
        );
        let maps = result.unwrap();
        let formats: Vec<&str> = maps.iter().map(|m| m.format.as_str()).collect();
        assert!(
            formats.contains(&"javascript"),
            "must have javascript block"
        );
        assert!(formats.contains(&"css"), "must have real css block");
        assert!(formats.contains(&"html"), "must have html markup");
    }
}