Skip to main content

cpd_tokenizer/
sfc.rs

1use std::collections::BTreeMap;
2
3use cpd_core::models::{DetectionToken, Token};
4
5use crate::embedded::blank_ranges_preserve_newlines;
6use crate::line_index::LineIndex;
7use crate::markdown::{offset_detection_tokens, tokens_to_detection};
8use crate::tokenizer::{Mode, TokenMap, TokenizeOptions};
9
/// One extracted SFC block as produced by the display path
/// ([`extract_blocks`]).
#[derive(Clone, Debug)]
pub struct Block {
    /// Format name chosen for the block from its tag and `lang` attribute
    /// (for example `"javascript"`, `"scss"` or `"html"`).
    pub block_format: String,
    /// Text between the end of the opening tag and the start of the closing tag.
    pub content: String,
    /// Byte offset in the original source just past the opening tag's `>`.
    pub start_offset: usize,
    /// Line value derived from the source text up to the end of the opening
    /// tag; `tokenize_sfc` subtracts one from it to shift token lines.
    pub start_line: u32,
}
17
/// Byte-offset description of one block located by the detection path.
///
/// All offsets index into the original source string.
// `tag` is written by the constructors in this file but not read back by any
// code visible here, hence the lint allowance.
#[allow(dead_code)]
struct SfcBlock {
    /// Tag name that opened the block (`"script"`, `"style"`, `"template"`).
    tag: String,
    /// Format name used to tokenize and group the block's inner text.
    block_format: String,
    /// Offset of the `<` that starts the opening tag (0 for Astro frontmatter).
    block_start: usize,
    /// Offset of the first byte of the inner text.
    inner_start: usize,
    /// Offset one past the last byte of the inner text.
    inner_end: usize,
    /// Offset one past the end of the whole block, closing delimiter included.
    block_end: usize,
}
27
28pub fn tokenize_sfc_maps(
29    source: &str,
30    file_format: &str,
31    options: &TokenizeOptions,
32) -> Vec<TokenMap> {
33    if source.is_empty() {
34        return Vec::new();
35    }
36
37    let blocks = find_sfc_blocks(source, file_format);
38    if blocks.is_empty() {
39        let tokens = crate::generic::tokenize_generic(source, "html");
40        let detection = tokens_to_detection(tokens, options);
41        return if detection.is_empty() {
42            Vec::new()
43        } else {
44            vec![TokenMap {
45                format: "html".to_string(),
46                tokens: detection,
47            }]
48        };
49    }
50
51    let blank_ranges: Vec<[usize; 2]> = blocks
52        .iter()
53        .filter_map(|b| {
54            if b.inner_start < b.inner_end {
55                Some([b.inner_start, b.inner_end])
56            } else {
57                None
58            }
59        })
60        .collect();
61
62    let sanitized = blank_ranges_preserve_newlines(source, &blank_ranges);
63    let line_index = LineIndex::new(source.as_bytes());
64
65    let mut grouped: BTreeMap<String, Vec<DetectionToken>> = BTreeMap::new();
66
67    let markup_tokens = crate::generic::tokenize_generic(&sanitized, "html");
68    let mut markup_detection = tokens_to_detection(markup_tokens, options);
69    markup_detection.retain(|t| t.range[0] < t.range[1]);
70    if !markup_detection.is_empty() {
71        grouped
72            .entry("html".to_string())
73            .or_default()
74            .extend(markup_detection);
75    }
76
77    for block in &blocks {
78        if block.inner_start >= block.inner_end {
79            continue;
80        }
81        let inner = &source[block.inner_start..block.inner_end];
82        let inner_start_loc = line_index.location(block.inner_start);
83
84        let mut inner_tokens = tokenize_sfc_block_inner(&block.block_format, inner, options);
85        offset_detection_tokens(&mut inner_tokens, block.inner_start, &inner_start_loc);
86
87        grouped
88            .entry(block.block_format.clone())
89            .or_default()
90            .extend(inner_tokens);
91    }
92
93    grouped
94        .into_iter()
95        .filter(|(_, tokens)| !tokens.is_empty())
96        .map(|(format, tokens)| TokenMap { format, tokens })
97        .collect()
98}
99
100fn tokenize_sfc_block_inner(
101    format: &str,
102    source: &str,
103    options: &TokenizeOptions,
104) -> Vec<DetectionToken> {
105    let raw = match format {
106        "javascript" | "typescript" | "jsx" | "tsx" => {
107            crate::javascript::tokenize_js(source, format)
108        }
109        "vue" | "svelte" | "astro" => crate::sfc::tokenize_sfc(source, format, options.mode),
110        "markdown" | "md" => crate::generic::tokenize_generic(source, format),
111        _ => crate::generic::tokenize_generic(source, format),
112    };
113    tokens_to_detection(raw, options)
114}
115
116fn find_sfc_blocks(source: &str, file_format: &str) -> Vec<SfcBlock> {
117    let source_lower = source.to_ascii_lowercase();
118    let tag_names: &[&str] = match file_format {
119        "svelte" | "astro" => &["script", "style"],
120        _ => &["template", "script", "style"],
121    };
122
123    let mut blocks = Vec::new();
124
125    if file_format == "astro" {
126        if let Some(fm) = astro_frontmatter_block(source) {
127            blocks.push(fm);
128        }
129    }
130
131    for tag in tag_names {
132        let mut search_from = 0usize;
133        while let Some(block) = find_sfc_tag_block(source, &source_lower, tag, search_from) {
134            search_from = block.block_end;
135            blocks.push(block);
136        }
137    }
138
139    blocks.sort_by_key(|b| b.block_start);
140    let mut deduped = Vec::new();
141    for block in blocks {
142        let nested = deduped.iter().any(|existing: &SfcBlock| {
143            block.block_start >= existing.block_start && block.block_start < existing.block_end
144        });
145        if !nested {
146            deduped.push(block);
147        }
148    }
149    deduped
150}
151
152fn find_sfc_tag_block(
153    source: &str,
154    source_lower: &str,
155    tag: &str,
156    from: usize,
157) -> Option<SfcBlock> {
158    let open_needle = format!("<{}", tag);
159    let close_needle = format!("</{}>", tag);
160
161    let open_start = source_lower[from..].find(&open_needle)? + from;
162    let after_tag_name = open_start + 1 + tag.len();
163    if source_lower
164        .as_bytes()
165        .get(after_tag_name)
166        .is_some_and(|b| b.is_ascii_alphabetic())
167    {
168        return None;
169    }
170    let tag_end = source_lower[open_start..].find('>')? + open_start + 1;
171    let close_start = source_lower[tag_end..].find(&close_needle)? + tag_end;
172
173    let attrs = &source[open_start + 1 + tag.len()..tag_end];
174    let inner_start = tag_end;
175    let inner_end = close_start;
176    let block_end = source_lower[close_start..]
177        .find('>')
178        .map(|i| close_start + i + 1)
179        .unwrap_or(close_start + close_needle.len());
180    let block_end = block_end.min(source.len());
181
182    let block_format = detect_sfc_block_format(attrs, tag);
183
184    Some(SfcBlock {
185        tag: tag.to_string(),
186        block_format,
187        block_start: open_start,
188        inner_start,
189        inner_end: inner_end.max(inner_start),
190        block_end,
191    })
192}
193
194fn detect_sfc_block_format(attrs: &str, tag: &str) -> String {
195    let lang = extract_lang_attr_value(attrs);
196    match tag {
197        "script" => match lang.as_deref() {
198            Some("ts" | "typescript") => "typescript".to_string(),
199            Some("js" | "javascript") => "javascript".to_string(),
200            Some(other) => {
201                if crate::formats::get_format_by_extension(other).is_some()
202                    || crate::formats::SUPPORTED_FORMATS
203                        .iter()
204                        .any(|e| e.name == other)
205                {
206                    other.to_string()
207                } else {
208                    "javascript".to_string()
209                }
210            }
211            None => "javascript".to_string(),
212        },
213        "style" => match lang.as_deref() {
214            Some("scss" | "sass") => "scss".to_string(),
215            Some("less") => "less".to_string(),
216            _ => "css".to_string(),
217        },
218        "template" => match lang.as_deref() {
219            Some(v) if v == "pug" || v == "jade" => "pug".to_string(),
220            _ => "html".to_string(),
221        },
222        _ => "html".to_string(),
223    }
224}
225
226fn astro_frontmatter_block(source: &str) -> Option<SfcBlock> {
227    if !(source.starts_with("---\n") || source.starts_with("---\r\n")) {
228        return None;
229    }
230    let lines = crate::markdown::line_spans(source);
231    let close_idx = lines
232        .iter()
233        .enumerate()
234        .skip(1)
235        .find(|(_, span)| source[span.start..span.end].trim() == "---")
236        .map(|(idx, _)| idx)?;
237    let inner_start = lines.get(1)?.start;
238    let inner_end = source[..lines[close_idx].start]
239        .strip_suffix('\n')
240        .map(|prefix: &str| prefix.len())
241        .unwrap_or(lines[close_idx].start);
242    let block_end = lines[close_idx].next_start.min(source.len());
243    Some(SfcBlock {
244        tag: "script".to_string(),
245        block_format: "typescript".to_string(),
246        block_start: 0,
247        inner_start,
248        inner_end: inner_end.max(inner_start),
249        block_end,
250    })
251}
252
/// Extract the quoted value of the `lang` attribute from a tag's attribute
/// text, lowercased.
///
/// The attribute name is matched case-insensitively and must start the string
/// or follow ASCII whitespace, so look-alikes such as `data-lang=`, `:lang=`
/// or `xml:lang=` are skipped and the search continues behind them.
///
/// Returns `None` when there is no `lang=` attribute, when its value is not
/// wrapped in single or double quotes, or when the closing quote is missing.
fn extract_lang_attr_value(attrs: &str) -> Option<String> {
    const KEY: &str = "lang=";

    // ASCII lowercasing keeps byte offsets, so positions found in `lower`
    // are valid for slicing `attrs`.
    let lower = attrs.to_ascii_lowercase();

    let mut search_from = 0;
    let rest = loop {
        let key_pos = lower[search_from..].find(KEY)? + search_from;
        let key_end = key_pos + KEY.len();
        let starts_attribute =
            key_pos == 0 || lower.as_bytes()[key_pos - 1].is_ascii_whitespace();
        if starts_attribute {
            break &attrs[key_end..];
        }
        // Part of a longer attribute name; look for the next occurrence.
        search_from = key_end;
    };

    let quote = rest.chars().next().filter(|&c| c == '"' || c == '\'')?;
    let value = &rest[1..];
    let value_end = value.find(quote)?;
    Some(value[..value_end].to_ascii_lowercase())
}
268
269/// Extract blocks from a Vue/Svelte/Astro file (display path).
270pub fn extract_blocks(source: &str, file_format: &str) -> Vec<Block> {
271    let source_lower = source.to_ascii_lowercase();
272    let tag_names: &[&str] = match file_format {
273        "svelte" | "astro" => &["script", "style"],
274        _ => &["template", "script", "style"],
275    };
276
277    let mut blocks = Vec::new();
278    for tag in tag_names {
279        let mut search_from = 0;
280        while let Some((block, next_from)) =
281            find_display_block(source, &source_lower, tag, search_from)
282        {
283            search_from = next_from;
284            blocks.push(block);
285        }
286    }
287    blocks.sort_by_key(|b: &Block| b.start_offset);
288    blocks
289}
290
291fn find_display_block(
292    source: &str,
293    source_lower: &str,
294    tag: &str,
295    from: usize,
296) -> Option<(Block, usize)> {
297    let open_needle = format!("<{}", tag);
298    let close_needle = format!("</{}>", tag);
299
300    let open_start = source_lower[from..].find(&open_needle)? + from;
301    let after_tag_name = open_start + 1 + tag.len();
302    if source_lower
303        .as_bytes()
304        .get(after_tag_name)
305        .is_some_and(|b| b.is_ascii_alphabetic())
306    {
307        return None;
308    }
309    let tag_end = source_lower[open_start..].find('>')? + open_start + 1;
310    let close_start = source_lower[tag_end..].find(&close_needle)? + tag_end;
311
312    let attrs = &source[open_start + 1 + tag.len()..tag_end];
313    let content = source[tag_end..close_start].to_string();
314    let content_len = content.len();
315    let start_line = source[..tag_end].lines().count() as u32 + 1;
316    let block_format = detect_display_block_format(attrs, tag);
317
318    Some((
319        Block {
320            block_format,
321            content,
322            start_offset: tag_end,
323            start_line,
324        },
325        tag_end + content_len,
326    ))
327}
328
329fn detect_display_block_format(attrs: &str, tag: &str) -> String {
330    let lang = extract_lang_attr_value(attrs);
331    match tag {
332        "script" => match lang.as_deref() {
333            Some("ts" | "typescript") => "typescript".to_string(),
334            Some("js" | "javascript") => "javascript".to_string(),
335            _ => "javascript".to_string(),
336        },
337        "style" => match lang.as_deref() {
338            Some("scss" | "sass") => "scss".to_string(),
339            Some("less") => "less".to_string(),
340            _ => "css".to_string(),
341        },
342        "template" => "html".to_string(),
343        _ => "html".to_string(),
344    }
345}
346
347pub fn tokenize_sfc(source: &str, file_format: &str, mode: Mode) -> Vec<Token> {
348    let blocks = extract_blocks(source, file_format);
349    let mut all_tokens = Vec::new();
350
351    for block in &blocks {
352        let mut block_tokens =
353            crate::tokenizer::tokenize(&block.block_format, &block.content, mode);
354        let line_offset = block.start_line.saturating_sub(1);
355        for token in &mut block_tokens {
356            token.start.line += line_offset;
357            token.end.line += line_offset;
358        }
359        all_tokens.extend(block_tokens);
360    }
361
362    all_tokens
363}
364
#[cfg(test)]
mod tests {
    use super::*;

    /// Vue component with plain template, script and style blocks.
    const VUE_FILE: &str = r#"<template>
  <div>Hello</div>
</template>

<script>
export default { name: 'Foo' }
</script>

<style>
.foo { color: red; }
</style>
"#;

    /// Vue component whose script and style blocks carry a `lang` attribute.
    const VUE_TS_FILE: &str = r#"<template>
  <div>Hello</div>
</template>

<script lang="ts">
const x: number = 5;
</script>

<style lang="scss">
.foo { color: red; }
</style>
"#;

    /// Names of the formats present in a set of token maps.
    fn format_names(maps: &[TokenMap]) -> Vec<&str> {
        maps.iter().map(|m| m.format.as_str()).collect()
    }

    /// Whether the display path reports a block with the given format.
    fn has_block_format(blocks: &[Block], format: &str) -> bool {
        blocks.iter().any(|b| b.block_format == format)
    }

    #[test]
    fn vue_file_extracts_three_blocks() {
        let blocks = extract_blocks(VUE_FILE, "vue");
        assert_eq!(blocks.len(), 3, "must find template, script, style blocks");
    }

    #[test]
    fn script_block_default_format_is_javascript() {
        let blocks = extract_blocks(VUE_FILE, "vue");
        assert!(
            has_block_format(&blocks, "javascript"),
            "plain <script> must be javascript format"
        );
    }

    #[test]
    fn script_lang_ts_produces_typescript_format() {
        let blocks = extract_blocks(VUE_TS_FILE, "vue");
        assert!(
            has_block_format(&blocks, "typescript"),
            "<script lang=\"ts\"> must produce typescript format"
        );
    }

    #[test]
    fn unknown_lang_does_not_panic() {
        let source = "<script lang=\"unknownlang123\">\nconst x = 1;\n</script>\n";
        let outcome = std::panic::catch_unwind(|| extract_blocks(source, "vue"));
        assert!(outcome.is_ok(), "unknown lang must not panic");
    }

    #[test]
    fn no_blocks_returns_empty() {
        let blocks = extract_blocks("just plain text no tags", "vue");
        assert!(blocks.is_empty());
    }

    #[test]
    fn start_offset_is_after_opening_tag() {
        for block in extract_blocks(VUE_FILE, "vue") {
            assert!(block.start_offset > 0);
        }
    }

    #[test]
    fn vue_sfc_maps_produces_multiple_formats() {
        let options = TokenizeOptions::new(Mode::Mild);
        let maps = tokenize_sfc_maps(VUE_FILE, "vue", &options);
        let formats = format_names(&maps);
        assert!(formats.contains(&"javascript"), "must have javascript map");
        assert!(formats.contains(&"css"), "must have css map");
        assert!(formats.contains(&"html"), "must have html map");
    }

    #[test]
    fn vue_ts_maps_produces_typescript() {
        let options = TokenizeOptions::new(Mode::Mild);
        let maps = tokenize_sfc_maps(VUE_TS_FILE, "vue", &options);
        let formats = format_names(&maps);
        assert!(formats.contains(&"typescript"), "must have typescript map");
        assert!(formats.contains(&"scss"), "must have scss map");
    }

    #[test]
    fn empty_sfc_returns_empty() {
        let options = TokenizeOptions::new(Mode::Mild);
        assert!(tokenize_sfc_maps("", "vue", &options).is_empty());
    }

    #[test]
    fn svelte_sfc_maps_produces_multiple_formats() {
        let source = r#"<script>
  let count = 0;
</script>

<style>
  .count { color: blue; }
</style>
"#;
        let options = TokenizeOptions::new(Mode::Mild);
        let maps = tokenize_sfc_maps(source, "svelte", &options);
        let formats = format_names(&maps);
        assert!(
            formats.contains(&"javascript"),
            "svelte must have javascript map"
        );
        assert!(formats.contains(&"css"), "svelte must have css map");
        assert!(
            formats.contains(&"html"),
            "svelte must have html markup map"
        );
    }

    #[test]
    fn svelte_script_containing_style_text_no_panic() {
        let source = r#"<script>
  const x = "<style>.red{color:red}</style>";
</script>

<style>
  .blue { color: blue; }
</style>
"#;
        let outcome = std::panic::catch_unwind(|| {
            let options = TokenizeOptions::new(Mode::Mild);
            tokenize_sfc_maps(source, "svelte", &options)
        });
        assert!(
            outcome.is_ok(),
            "must not panic when <style> text appears inside <script>"
        );
        let maps = outcome.unwrap();
        let formats = format_names(&maps);
        assert!(
            formats.contains(&"javascript"),
            "must have javascript block"
        );
        assert!(formats.contains(&"css"), "must have real css block");
        assert!(formats.contains(&"html"), "must have html markup");
    }
}
517}