fob_graph/analysis/extractors/
astro.rs

1//! Astro component script extractor.
2//!
3//! This module implements efficient extraction of JavaScript/TypeScript from Astro
4//! component frontmatter and `<script>` blocks.
5
6use memchr::memmem;
7
8use super::common::{
9    ExtractedScript, Extractor, ExtractorError, MAX_FILE_SIZE, MAX_SCRIPT_TAGS, ScriptContext,
10};
11
/// Stateless extractor that pulls script sources out of Astro components.
///
/// The type carries no data; all behaviour lives in its [`Extractor`]
/// implementation, which yields the frontmatter block (if any) followed by
/// every `<script>` block found in the source.
#[derive(Clone, Copy, Debug)]
pub struct AstroExtractor;
15
16impl Extractor for AstroExtractor {
17    fn extract<'a>(&self, source: &'a str) -> Result<Vec<ExtractedScript<'a>>, ExtractorError> {
18        // Enforce file size limit
19        if source.len() > MAX_FILE_SIZE {
20            return Err(ExtractorError::FileTooLarge {
21                size: source.len(),
22                max: MAX_FILE_SIZE,
23            });
24        }
25
26        let mut sources = Vec::new();
27        let mut pointer = 0;
28
29        // First, try to extract frontmatter
30        if let Some(frontmatter) = parse_frontmatter(source, &mut pointer)? {
31            sources.push(frontmatter);
32        }
33
34        // Then extract all script blocks
35        let mut script_count = 0;
36        while let Some(script) = parse_script(source, &mut pointer)? {
37            sources.push(script);
38            script_count += 1;
39
40            // Enforce script tag count limit
41            if script_count > MAX_SCRIPT_TAGS {
42                return Err(ExtractorError::TooManyScriptTags {
43                    count: script_count,
44                    max: MAX_SCRIPT_TAGS,
45                });
46            }
47        }
48
49        Ok(sources)
50    }
51
52    fn file_extension(&self) -> &'static str {
53        ".astro"
54    }
55}
56
57/// Parses frontmatter if it exists at the start of the file.
58fn parse_frontmatter<'a>(
59    source_text: &'a str,
60    pointer: &mut usize,
61) -> Result<Option<ExtractedScript<'a>>, ExtractorError> {
62    let bytes = source_text.as_bytes();
63
64    // Skip leading whitespace
65    while *pointer < bytes.len() && matches!(bytes[*pointer], b' ' | b'\t' | b'\n' | b'\r') {
66        *pointer += 1;
67    }
68
69    // Check for opening ---
70    if *pointer + 3 > bytes.len() || &bytes[*pointer..*pointer + 3] != b"---" {
71        return Ok(None); // No frontmatter
72    }
73
74    let frontmatter_start = *pointer;
75    *pointer += 3; // Skip opening ---
76
77    // Find the newline after opening ---
78    while *pointer < bytes.len() && bytes[*pointer] != b'\n' {
79        *pointer += 1;
80    }
81    if *pointer < bytes.len() {
82        *pointer += 1; // Skip the newline
83    }
84
85    let content_start = *pointer;
86
87    // Find closing ---
88    let closing_pos = match find_frontmatter_closing(bytes, *pointer) {
89        Some(pos) => pos,
90        None => {
91            return Err(ExtractorError::UnclosedFrontmatter {
92                position: frontmatter_start,
93            });
94        }
95    };
96
97    // Extract frontmatter content
98    let source_text = &source_text[content_start..closing_pos];
99
100    // Move pointer past closing ---
101    *pointer = closing_pos + 3; // 3 = "---".len()
102
103    // Skip to end of line after closing ---
104    while *pointer < bytes.len() && bytes[*pointer] != b'\n' {
105        *pointer += 1;
106    }
107    if *pointer < bytes.len() {
108        *pointer += 1; // Skip the newline
109    }
110
111    // Frontmatter is TypeScript by default in Astro
112    Ok(Some(ExtractedScript::new(
113        source_text,
114        content_start,
115        ScriptContext::AstroFrontmatter,
116        "ts",
117    )))
118}
119
120/// Parses a single script block starting from the given position.
121fn parse_script<'a>(
122    source_text: &'a str,
123    pointer: &mut usize,
124) -> Result<Option<ExtractedScript<'a>>, ExtractorError> {
125    let bytes = source_text.as_bytes();
126
127    // Find the start of a <script tag
128    let script_start = match find_script_start(bytes, *pointer) {
129        Some(pos) => pos,
130        None => return Ok(None), // No more script tags
131    };
132
133    // Move pointer past "<script"
134    *pointer = script_start + 7; // 7 = "<script".len()
135
136    // Check if this is a script tag (not "scripts" or "scripting")
137    if *pointer < bytes.len() {
138        let next_char = bytes[*pointer];
139        if !matches!(next_char, b' ' | b'\t' | b'\n' | b'\r' | b'>' | b'/') {
140            // Not a script tag, keep searching
141            return parse_script(source_text, pointer);
142        }
143    }
144
145    // Find the end of the opening tag (the closing >)
146    let tag_end = match find_script_closing_angle(bytes, *pointer) {
147        Some(pos) => pos,
148        None => {
149            return Err(ExtractorError::UnclosedScriptTag {
150                position: script_start,
151            });
152        }
153    };
154
155    // Check for self-closing tag <script ... />
156    if tag_end > 0 && bytes[tag_end - 1] == b'/' {
157        // Self-closing tag, no content
158        *pointer = tag_end + 1;
159        return Ok(Some(ExtractedScript::new(
160            "",
161            tag_end + 1,
162            ScriptContext::AstroScript,
163            "js",
164        )));
165    }
166
167    // Move pointer past the closing >
168    *pointer = tag_end + 1;
169    let content_start = *pointer;
170
171    // Find the closing </script> tag
172    let script_end = match find_script_end(bytes, *pointer) {
173        Some(pos) => pos,
174        None => {
175            return Err(ExtractorError::UnclosedScriptTag {
176                position: script_start,
177            });
178        }
179    };
180
181    // Extract the script content
182    let source_text = &source_text[content_start..script_end];
183
184    // Move pointer past the closing </script>
185    *pointer = script_end + 9; // 9 = "</script>".len()
186
187    // Script tags in Astro are JavaScript by default
188    Ok(Some(ExtractedScript::new(
189        source_text,
190        content_start,
191        ScriptContext::AstroScript,
192        "js",
193    )))
194}
195
/// Finds the closing `---` fence of a frontmatter block.
///
/// Scans `bytes` from `start` and returns the index of the first `---` that
/// begins a line (index 0 or directly after `\n`) and is followed by the end
/// of input, a line break (`\n` / `\r`), a space or a tab. Returns `None`
/// when no such fence exists.
fn find_frontmatter_closing(bytes: &[u8], start: usize) -> Option<usize> {
    const FENCE: &[u8] = b"---";

    // Last index at which a complete fence could still fit.
    let last_candidate = bytes.len().checked_sub(FENCE.len())?;

    (start..=last_candidate).find(|&idx| {
        let begins_line = idx == 0 || bytes[idx - 1] == b'\n';
        if !begins_line || !bytes[idx..].starts_with(FENCE) {
            return false;
        }

        // The fence must stand alone: nothing after it, or only whitespace.
        match bytes.get(idx + FENCE.len()) {
            None => true,
            Some(&next) => matches!(next, b'\n' | b'\r' | b' ' | b'\t'),
        }
    })
}
219
220/// Finds the start of a `<script` tag using memchr.
221fn find_script_start(bytes: &[u8], start: usize) -> Option<usize> {
222    let search_slice = &bytes[start..];
223    memmem::find(search_slice, b"<script").map(|pos| start + pos)
224}
225
/// Finds the `>` that ends a script opening tag, ignoring any `>` that sits
/// inside a single- or double-quoted attribute value.
///
/// Scanning begins at `start`; the returned index is absolute. Returns `None`
/// when no unquoted `>` is found before the end of `bytes`.
fn find_script_closing_angle(bytes: &[u8], start: usize) -> Option<usize> {
    // `Some(q)` while inside a quoted value opened by quote byte `q`.
    let mut open_quote: Option<u8> = None;

    for (offset, &current) in bytes[start..].iter().enumerate() {
        match (open_quote, current) {
            (None, b'>') => return Some(start + offset),
            (None, b'"') | (None, b'\'') => open_quote = Some(current),
            // Only the same kind of quote that opened the value closes it.
            (Some(quote), _) if quote == current => open_quote = None,
            _ => {}
        }
    }

    None
}
248
249/// Finds the closing `</script>` tag.
250fn find_script_end(bytes: &[u8], start: usize) -> Option<usize> {
251    let search_slice = &bytes[start..];
252    memmem::find(search_slice, b"</script>").map(|pos| start + pos)
253}
254
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the Astro extractor over `input`.
    fn run(input: &str) -> Result<Vec<ExtractedScript<'_>>, ExtractorError> {
        AstroExtractor.extract(input)
    }

    /// A file with only frontmatter yields a single TypeScript block.
    #[test]
    fn test_frontmatter_only() {
        let input = r#"---
const title = 'My Page'
const data = await fetch('/api').then(r => r.json())
---
<html><head><title>{title}</title></head></html>
"#;
        let scripts = run(input).unwrap();
        assert_eq!(scripts.len(), 1);

        let frontmatter = &scripts[0];
        assert_eq!(frontmatter.context, ScriptContext::AstroFrontmatter);
        assert_eq!(frontmatter.lang, "ts");
        assert!(frontmatter.source_text.contains("const title"));
    }

    /// A file with only a `<script>` block yields a single JavaScript block.
    #[test]
    fn test_script_only() {
        let input = r#"
<html>
  <body>
    <script>
      console.log('Hello, Astro!')
    </script>
  </body>
</html>
"#;
        let scripts = run(input).unwrap();
        assert_eq!(scripts.len(), 1);

        let script = &scripts[0];
        assert_eq!(script.context, ScriptContext::AstroScript);
        assert_eq!(script.lang, "js");
        assert!(script.source_text.contains("console.log"));
    }

    /// Frontmatter comes first, followed by the scripts in document order.
    #[test]
    fn test_frontmatter_and_scripts() {
        let input = r#"---
const pageTitle = 'Home'
---
<html>
  <head><title>{pageTitle}</title></head>
  <body>
    <script>
      console.log('Script 1')
    </script>
    <script>
      console.log('Script 2')
    </script>
  </body>
</html>
"#;
        let scripts = run(input).unwrap();
        assert_eq!(scripts.len(), 3);

        assert_eq!(scripts[0].context, ScriptContext::AstroFrontmatter);
        assert_eq!(scripts[1].context, ScriptContext::AstroScript);
        assert_eq!(scripts[2].context, ScriptContext::AstroScript);

        assert!(scripts[1].source_text.contains("Script 1"));
        assert!(scripts[2].source_text.contains("Script 2"));
    }

    /// Plain markup without frontmatter or scripts produces nothing.
    #[test]
    fn test_no_frontmatter_or_scripts() {
        let scripts = run("<html><body><h1>Hello</h1></body></html>").unwrap();
        assert!(scripts.is_empty());
    }

    /// An opening fence without a closing one is reported as an error.
    #[test]
    fn test_unclosed_frontmatter() {
        let input = r#"---
const x = 1
<html></html>
"#;
        assert!(matches!(
            run(input),
            Err(ExtractorError::UnclosedFrontmatter { .. })
        ));
    }

    /// A `<script>` without `</script>` is reported as an error.
    #[test]
    fn test_unclosed_script() {
        let input = r#"<script>console.log('test')"#;
        assert!(matches!(
            run(input),
            Err(ExtractorError::UnclosedScriptTag { .. })
        ));
    }

    /// Inputs longer than `MAX_FILE_SIZE` are rejected up front.
    #[test]
    fn test_file_too_large() {
        let oversized = "x".repeat(MAX_FILE_SIZE + 1);
        assert!(matches!(
            run(&oversized),
            Err(ExtractorError::FileTooLarge { .. })
        ));
    }
}