Skip to main content

cpd_tokenizer/
sfc.rs

1// Attribution: SFC block extraction for Vue/Svelte/Astro; inspired by jscpd-rs approach; rewritten independently.
2
3use std::collections::BTreeMap;
4
5use cpd_core::models::{DetectionToken, Token};
6
7use crate::embedded::blank_ranges_preserve_newlines;
8use crate::line_index::LineIndex;
9use crate::markdown::{offset_detection_tokens, tokens_to_detection};
10use crate::tokenizer::{Mode, TokenMap, TokenizeOptions};
11
/// One embedded block extracted from a single-file component for the display path.
#[derive(Clone, Debug)]
pub struct Block {
    /// Format name chosen for the block content (for example `"javascript"` or `"css"`).
    pub block_format: String,
    /// Text found between the end of the opening tag and the start of the closing tag.
    pub content: String,
    /// Byte offset in the original source of the first byte after the opening tag's `>`.
    pub start_offset: usize,
    /// Line number derived from the source text up to and including the opening tag:
    /// the number of lines in that prefix, plus one.
    pub start_line: u32,
}
19
/// Byte-offset description of one block found in a single-file component
/// (tokenization path). All offsets index into the original source string.
// `tag` is stored but never read by the code in this file, hence the lint allowance.
#[allow(dead_code)]
struct SfcBlock {
    /// Tag name the block was found under (`"template"`, `"script"` or `"style"`).
    tag: String,
    /// Format name used to tokenize the block interior.
    block_format: String,
    /// Offset of the `<` that opens the block (0 for Astro frontmatter).
    block_start: usize,
    /// Offset of the first byte of the block interior.
    inner_start: usize,
    /// Offset one past the last byte of the block interior; never below `inner_start`.
    inner_end: usize,
    /// Offset one past the end of the whole block, including its closing delimiter.
    block_end: usize,
}
29
30pub fn tokenize_sfc_maps(
31    source: &str,
32    file_format: &str,
33    options: &TokenizeOptions,
34) -> Vec<TokenMap> {
35    if source.is_empty() {
36        return Vec::new();
37    }
38
39    let blocks = find_sfc_blocks(source, file_format);
40    if blocks.is_empty() {
41        let tokens = crate::generic::tokenize_generic(source, "html");
42        let detection = tokens_to_detection(tokens, options);
43        return if detection.is_empty() {
44            Vec::new()
45        } else {
46            vec![TokenMap {
47                format: "html".to_string(),
48                tokens: detection,
49            }]
50        };
51    }
52
53    let blank_ranges: Vec<[usize; 2]> = blocks
54        .iter()
55        .filter_map(|b| {
56            if b.inner_start < b.inner_end {
57                Some([b.inner_start, b.inner_end])
58            } else {
59                None
60            }
61        })
62        .collect();
63
64    let sanitized = blank_ranges_preserve_newlines(source, &blank_ranges);
65    let line_index = LineIndex::new(source.as_bytes());
66
67    let mut grouped: BTreeMap<String, Vec<DetectionToken>> = BTreeMap::new();
68
69    let markup_tokens = crate::generic::tokenize_generic(&sanitized, "html");
70    let mut markup_detection = tokens_to_detection(markup_tokens, options);
71    markup_detection.retain(|t| t.range[0] < t.range[1]);
72    if !markup_detection.is_empty() {
73        grouped
74            .entry("html".to_string())
75            .or_default()
76            .extend(markup_detection);
77    }
78
79    for block in &blocks {
80        if block.inner_start >= block.inner_end {
81            continue;
82        }
83        let inner = &source[block.inner_start..block.inner_end];
84        let inner_start_loc = line_index.location(block.inner_start);
85
86        let mut inner_tokens = tokenize_sfc_block_inner(&block.block_format, inner, options);
87        offset_detection_tokens(&mut inner_tokens, block.inner_start, &inner_start_loc);
88
89        grouped
90            .entry(block.block_format.clone())
91            .or_default()
92            .extend(inner_tokens);
93    }
94
95    grouped
96        .into_iter()
97        .filter(|(_, tokens)| !tokens.is_empty())
98        .map(|(format, tokens)| TokenMap { format, tokens })
99        .collect()
100}
101
102fn tokenize_sfc_block_inner(
103    format: &str,
104    source: &str,
105    options: &TokenizeOptions,
106) -> Vec<DetectionToken> {
107    let raw = match format {
108        "javascript" | "typescript" | "jsx" | "tsx" => {
109            crate::javascript::tokenize_js(source, format)
110        }
111        "vue" | "svelte" | "astro" => crate::sfc::tokenize_sfc(source, format, options.mode),
112        "markdown" | "md" => crate::generic::tokenize_generic(source, format),
113        _ => crate::generic::tokenize_generic(source, format),
114    };
115    tokens_to_detection(raw, options)
116}
117
118fn find_sfc_blocks(source: &str, file_format: &str) -> Vec<SfcBlock> {
119    let source_lower = source.to_ascii_lowercase();
120    let tag_names: &[&str] = match file_format {
121        "svelte" | "astro" => &["script", "style"],
122        _ => &["template", "script", "style"],
123    };
124
125    let mut blocks = Vec::new();
126
127    if file_format == "astro" {
128        if let Some(fm) = astro_frontmatter_block(source) {
129            blocks.push(fm);
130        }
131    }
132
133    for tag in tag_names {
134        let mut search_from = 0usize;
135        while let Some(block) = find_sfc_tag_block(source, &source_lower, tag, search_from) {
136            search_from = block.block_end;
137            blocks.push(block);
138        }
139    }
140
141    blocks.sort_by_key(|b| b.block_start);
142    blocks
143}
144
145fn find_sfc_tag_block(
146    source: &str,
147    source_lower: &str,
148    tag: &str,
149    from: usize,
150) -> Option<SfcBlock> {
151    let open_needle = format!("<{}", tag);
152    let close_needle = format!("</{}>", tag);
153
154    let open_start = source_lower[from..].find(&open_needle)? + from;
155    let after_tag_name = open_start + 1 + tag.len();
156    if source_lower
157        .as_bytes()
158        .get(after_tag_name)
159        .is_some_and(|b| b.is_ascii_alphabetic())
160    {
161        return None;
162    }
163    let tag_end = source_lower[open_start..].find('>')? + open_start + 1;
164    let close_start = source_lower[tag_end..].find(&close_needle)? + tag_end;
165
166    let attrs = &source[open_start + 1 + tag.len()..tag_end];
167    let inner_start = tag_end;
168    let inner_end = close_start;
169    let block_end = source_lower[close_start..]
170        .find('>')
171        .map(|i| close_start + i + 1)
172        .unwrap_or(close_start + close_needle.len());
173    let block_end = block_end.min(source.len());
174
175    let block_format = detect_sfc_block_format(attrs, tag);
176
177    Some(SfcBlock {
178        tag: tag.to_string(),
179        block_format,
180        block_start: open_start,
181        inner_start,
182        inner_end: inner_end.max(inner_start),
183        block_end,
184    })
185}
186
187fn detect_sfc_block_format(attrs: &str, tag: &str) -> String {
188    let lang = extract_lang_attr_value(attrs);
189    match tag {
190        "script" => match lang.as_deref() {
191            Some("ts" | "typescript") => "typescript".to_string(),
192            Some("js" | "javascript") => "javascript".to_string(),
193            Some(other) => {
194                if crate::formats::get_format_by_extension(other).is_some()
195                    || crate::formats::SUPPORTED_FORMATS
196                        .iter()
197                        .any(|e| e.name == other)
198                {
199                    other.to_string()
200                } else {
201                    "javascript".to_string()
202                }
203            }
204            None => "javascript".to_string(),
205        },
206        "style" => match lang.as_deref() {
207            Some("scss" | "sass") => "scss".to_string(),
208            Some("less") => "less".to_string(),
209            _ => "css".to_string(),
210        },
211        "template" => match lang.as_deref() {
212            Some(v) if v == "pug" || v == "jade" => "pug".to_string(),
213            _ => "html".to_string(),
214        },
215        _ => "html".to_string(),
216    }
217}
218
219fn astro_frontmatter_block(source: &str) -> Option<SfcBlock> {
220    if !(source.starts_with("---\n") || source.starts_with("---\r\n")) {
221        return None;
222    }
223    let lines = crate::markdown::line_spans(source);
224    let close_idx = lines
225        .iter()
226        .enumerate()
227        .skip(1)
228        .find(|(_, span)| source[span.start..span.end].trim() == "---")
229        .map(|(idx, _)| idx)?;
230    let inner_start = lines.get(1)?.start;
231    let inner_end = source[..lines[close_idx].start]
232        .strip_suffix('\n')
233        .map(|prefix: &str| prefix.len())
234        .unwrap_or(lines[close_idx].start);
235    let block_end = lines[close_idx].next_start.min(source.len());
236    Some(SfcBlock {
237        tag: "script".to_string(),
238        block_format: "typescript".to_string(),
239        block_start: 0,
240        inner_start,
241        inner_end: inner_end.max(inner_start),
242        block_end,
243    })
244}
245
/// Returns the lowercased value of the `lang` attribute in an opening tag's
/// attribute text, or `None` when there is no usable `lang` attribute.
///
/// `attrs` is the text following the tag name, and may include the closing `>`.
/// The text is scanned attribute by attribute, so:
///
/// * only an attribute named exactly `lang` (case-insensitive) matches; names
///   such as `data-lang` and text inside other attribute values do not;
/// * double-quoted, single-quoted and unquoted values are accepted, with
///   optional whitespace around `=`;
/// * a `lang` attribute without a value, with an empty unquoted value, or any
///   attribute whose quoted value is never closed, yields `None`.
fn extract_lang_attr_value(attrs: &str) -> Option<String> {
    // ASCII lowercasing keeps byte offsets intact, so all scanning uses `lower`.
    let lower = attrs.to_ascii_lowercase();
    let bytes = lower.as_bytes();
    let byte_at = |at: usize| bytes.get(at).copied();
    let skip_whitespace = |mut at: usize| {
        while bytes.get(at).is_some_and(|b| b.is_ascii_whitespace()) {
            at += 1;
        }
        at
    };

    let mut pos = 0;
    loop {
        pos = skip_whitespace(pos);
        match byte_at(pos) {
            // End of the attribute text or of the opening tag.
            None | Some(b'>') => return None,
            // A stray `/` (self-closing syntax) separates attributes.
            Some(b'/') => {
                pos += 1;
                continue;
            }
            Some(_) => {}
        }

        // Attribute name: runs until whitespace, `=`, `>` or `/`.
        let name_start = pos;
        while byte_at(pos)
            .is_some_and(|b| !b.is_ascii_whitespace() && !matches!(b, b'=' | b'>' | b'/'))
        {
            pos += 1;
        }
        let is_lang = &bytes[name_start..pos] == b"lang";

        let after_name = skip_whitespace(pos);
        if byte_at(after_name) != Some(b'=') {
            // Attribute without a value.
            if is_lang {
                return None;
            }
            continue;
        }

        let value_start = skip_whitespace(after_name + 1);
        let (value, next) = match byte_at(value_start) {
            Some(quote @ (b'"' | b'\'')) => {
                let open = value_start + 1;
                // An unterminated quote means the rest cannot be parsed reliably.
                let close = lower[open..].find(char::from(quote))? + open;
                (Some(&lower[open..close]), close + 1)
            }
            _ => {
                // Unquoted value: runs until whitespace or the end of the tag.
                let mut end = value_start;
                while byte_at(end).is_some_and(|b| !b.is_ascii_whitespace() && b != b'>') {
                    end += 1;
                }
                ((end > value_start).then(|| &lower[value_start..end]), end)
            }
        };

        if is_lang {
            return value.map(String::from);
        }
        pos = next;
    }
}
261
262/// Extract blocks from a Vue/Svelte/Astro file (display path).
263pub fn extract_blocks(source: &str, file_format: &str) -> Vec<Block> {
264    let source_lower = source.to_ascii_lowercase();
265    let tag_names: &[&str] = match file_format {
266        "svelte" | "astro" => &["script", "style"],
267        _ => &["template", "script", "style"],
268    };
269
270    let mut blocks = Vec::new();
271    for tag in tag_names {
272        let mut search_from = 0;
273        while let Some((block, next_from)) =
274            find_display_block(source, &source_lower, tag, search_from)
275        {
276            search_from = next_from;
277            blocks.push(block);
278        }
279    }
280    blocks.sort_by_key(|b: &Block| b.start_offset);
281    blocks
282}
283
284fn find_display_block(
285    source: &str,
286    source_lower: &str,
287    tag: &str,
288    from: usize,
289) -> Option<(Block, usize)> {
290    let open_needle = format!("<{}", tag);
291    let close_needle = format!("</{}>", tag);
292
293    let open_start = source_lower[from..].find(&open_needle)? + from;
294    let after_tag_name = open_start + 1 + tag.len();
295    if source_lower
296        .as_bytes()
297        .get(after_tag_name)
298        .is_some_and(|b| b.is_ascii_alphabetic())
299    {
300        return None;
301    }
302    let tag_end = source_lower[open_start..].find('>')? + open_start + 1;
303    let close_start = source_lower[tag_end..].find(&close_needle)? + tag_end;
304
305    let attrs = &source[open_start + 1 + tag.len()..tag_end];
306    let content = source[tag_end..close_start].to_string();
307    let content_len = content.len();
308    let start_line = source[..tag_end].lines().count() as u32 + 1;
309    let block_format = detect_display_block_format(attrs, tag);
310
311    Some((
312        Block {
313            block_format,
314            content,
315            start_offset: tag_end,
316            start_line,
317        },
318        tag_end + content_len,
319    ))
320}
321
322fn detect_display_block_format(attrs: &str, tag: &str) -> String {
323    let lang = extract_lang_attr_value(attrs);
324    match tag {
325        "script" => match lang.as_deref() {
326            Some("ts" | "typescript") => "typescript".to_string(),
327            Some("js" | "javascript") => "javascript".to_string(),
328            _ => "javascript".to_string(),
329        },
330        "style" => match lang.as_deref() {
331            Some("scss" | "sass") => "scss".to_string(),
332            Some("less") => "less".to_string(),
333            _ => "css".to_string(),
334        },
335        "template" => "html".to_string(),
336        _ => "html".to_string(),
337    }
338}
339
340pub fn tokenize_sfc(source: &str, file_format: &str, mode: Mode) -> Vec<Token> {
341    let blocks = extract_blocks(source, file_format);
342    let mut all_tokens = Vec::new();
343
344    for block in &blocks {
345        let mut block_tokens =
346            crate::tokenizer::tokenize(&block.block_format, &block.content, mode);
347        let line_offset = block.start_line.saturating_sub(1);
348        for token in &mut block_tokens {
349            token.start.line += line_offset;
350            token.end.line += line_offset;
351        }
352        all_tokens.extend(block_tokens);
353    }
354
355    all_tokens
356}
357
#[cfg(test)]
mod tests {
    use super::*;

    /// Vue component with plain template, script and style blocks.
    const VUE_FILE: &str = r#"<template>
  <div>Hello</div>
</template>

<script>
export default { name: 'Foo' }
</script>

<style>
.foo { color: red; }
</style>
"#;

    /// Vue component whose script and style blocks declare a `lang` attribute.
    const VUE_TS_FILE: &str = r#"<template>
  <div>Hello</div>
</template>

<script lang="ts">
const x: number = 5;
</script>

<style lang="scss">
.foo { color: red; }
</style>
"#;

    /// Tokenizes `source` in mild mode and returns the format name of every map.
    fn map_formats(source: &str, file_format: &str) -> Vec<String> {
        let options = TokenizeOptions::new(Mode::Mild);
        tokenize_sfc_maps(source, file_format, &options)
            .into_iter()
            .map(|map| map.format)
            .collect()
    }

    /// Reports whether `formats` contains `wanted`.
    fn has_format(formats: &[String], wanted: &str) -> bool {
        formats.iter().any(|format| format == wanted)
    }

    /// Reports whether any extracted block has the given format.
    fn has_block_format(blocks: &[Block], wanted: &str) -> bool {
        blocks.iter().any(|block| block.block_format == wanted)
    }

    #[test]
    fn vue_file_extracts_three_blocks() {
        let blocks = extract_blocks(VUE_FILE, "vue");
        assert_eq!(blocks.len(), 3, "must find template, script, style blocks");
    }

    #[test]
    fn script_block_default_format_is_javascript() {
        let blocks = extract_blocks(VUE_FILE, "vue");
        assert!(
            has_block_format(&blocks, "javascript"),
            "plain <script> must be javascript format"
        );
    }

    #[test]
    fn script_lang_ts_produces_typescript_format() {
        let blocks = extract_blocks(VUE_TS_FILE, "vue");
        assert!(
            has_block_format(&blocks, "typescript"),
            "<script lang=\"ts\"> must produce typescript format"
        );
    }

    #[test]
    fn unknown_lang_does_not_panic() {
        let source = "<script lang=\"unknownlang123\">\nconst x = 1;\n</script>\n";
        let outcome = std::panic::catch_unwind(|| extract_blocks(source, "vue"));
        assert!(outcome.is_ok(), "unknown lang must not panic");
    }

    #[test]
    fn no_blocks_returns_empty() {
        let blocks = extract_blocks("just plain text no tags", "vue");
        assert!(blocks.is_empty());
    }

    #[test]
    fn start_offset_is_after_opening_tag() {
        for block in &extract_blocks(VUE_FILE, "vue") {
            assert!(block.start_offset > 0);
        }
    }

    #[test]
    fn vue_sfc_maps_produces_multiple_formats() {
        let formats = map_formats(VUE_FILE, "vue");
        assert!(has_format(&formats, "javascript"), "must have javascript map");
        assert!(has_format(&formats, "css"), "must have css map");
        assert!(has_format(&formats, "html"), "must have html map");
    }

    #[test]
    fn vue_ts_maps_produces_typescript() {
        let formats = map_formats(VUE_TS_FILE, "vue");
        assert!(has_format(&formats, "typescript"), "must have typescript map");
        assert!(has_format(&formats, "scss"), "must have scss map");
    }

    #[test]
    fn empty_sfc_returns_empty() {
        assert!(map_formats("", "vue").is_empty());
    }

    #[test]
    fn svelte_sfc_maps_produces_multiple_formats() {
        let source = r#"<script>
  let count = 0;
</script>

<style>
  .count { color: blue; }
</style>
"#;
        let formats = map_formats(source, "svelte");
        assert!(
            has_format(&formats, "javascript"),
            "svelte must have javascript map"
        );
        assert!(has_format(&formats, "css"), "svelte must have css map");
        assert!(
            has_format(&formats, "html"),
            "svelte must have html markup map"
        );
    }
}