Skip to main content

amql_engine/extractor/
runner.rs

1//! Subprocess execution and JSON parsing for extractors.
2
3use crate::error::AqlError;
4use crate::manifest::{ExtractorConfig, Manifest};
5use crate::store::Annotation;
6use crate::types::{AttrName, Binding, ProjectRoot, RelativePath, TagName};
7use rustc_hash::{FxHashMap, FxHashSet};
8use serde::{Deserialize, Serialize};
9use serde_json::Value as JsonValue;
10use std::process::Command;
11use std::time::Duration;
12
/// Maximum time an extractor subprocess may run before being killed.
///
/// `run_extractor` turns this into a wall-clock deadline measured from just
/// after the process is spawned and reports the same value (in seconds) in
/// its timeout error message.
const EXTRACTOR_TIMEOUT: Duration = Duration::from_secs(30);
15
/// Result of running a single extractor.
///
/// Produced both by subprocess extractors and by built-in extractors. When
/// `run_all_extractors` encounters a hard failure it still returns one of
/// these, with `annotations` empty and the failure message in `errors`.
#[derive(Debug, Clone, Serialize)]
pub struct ExtractorResult {
    /// Annotations discovered by the extractor.
    pub annotations: Vec<Annotation>,
    /// Non-fatal errors encountered during extraction, as human-readable
    /// messages prefixed with the extractor name.
    pub errors: Vec<String>,
}
24
/// JSON schema for a single annotation emitted by an extractor.
///
/// `file`, `bind` and `tag` must be present in the JSON; `attrs` and
/// `children` fall back to empty collections when omitted. Empty `file` or
/// `bind` strings deserialize successfully here and are rejected later by
/// `run_extractor` for top-level entries.
#[derive(Debug, Deserialize)]
struct RawExtractorAnnotation {
    /// Path of the annotated file; becomes `Annotation::file`.
    file: String,
    /// Name the annotation is bound to; becomes `Annotation::binding`.
    bind: String,
    /// Annotation tag; becomes `Annotation::tag`.
    tag: String,
    /// Arbitrary JSON attribute values keyed by attribute name.
    #[serde(default)]
    attrs: FxHashMap<String, JsonValue>,
    /// Nested annotations, converted recursively.
    #[serde(default)]
    children: Vec<RawExtractorAnnotation>,
}
36
/// Top-level JSON output from an extractor process.
///
/// The document must be an object with a required `annotations` array.
#[derive(Debug, Deserialize)]
struct RawExtractorOutput {
    /// Top-level annotations in the order the extractor emitted them.
    annotations: Vec<RawExtractorAnnotation>,
}
42
43/// Recursively convert a raw extractor annotation into an Annotation.
44fn convert_raw_annotation(raw: RawExtractorAnnotation) -> Annotation {
45    let attrs: FxHashMap<AttrName, JsonValue> = raw
46        .attrs
47        .into_iter()
48        .map(|(k, v)| (AttrName::from(k), v))
49        .collect();
50
51    let children: Vec<Annotation> = raw
52        .children
53        .into_iter()
54        .map(convert_raw_annotation)
55        .collect();
56
57    Annotation {
58        tag: TagName::from(raw.tag),
59        attrs,
60        binding: Binding::from(raw.bind),
61        file: RelativePath::from(raw.file),
62        children,
63    }
64}
65
66/// Run a single extractor as a subprocess and parse its JSON output.
67///
68/// The extractor command is executed via `sh -c` with the project root as
69/// the working directory. Stdout must be valid JSON matching the extractor
70/// output schema. Stderr is captured but ignored on success.
71pub fn run_extractor(
72    config: &ExtractorConfig,
73    project_root: &ProjectRoot,
74) -> Result<ExtractorResult, AqlError> {
75    let mut child = Command::new("sh")
76        .arg("-c")
77        .arg(&config.run)
78        .current_dir(project_root.as_ref())
79        .stdout(std::process::Stdio::piped())
80        .stderr(std::process::Stdio::piped())
81        .spawn()
82        .map_err(|e| format!("Failed to execute extractor '{}': {e}", config.name))?;
83
84    // Read stdout and stderr on separate threads to avoid pipe buffer
85    // deadlock. If the child produces more output than the OS pipe buffer
86    // (~64 KB), it will block on write. Draining concurrently prevents this.
87    let stdout_handle = child.stdout.take();
88    let stderr_handle = child.stderr.take();
89
90    let stdout_thread = std::thread::spawn(move || -> Vec<u8> {
91        use std::io::Read;
92        let mut buf = Vec::new();
93        if let Some(mut stdout) = stdout_handle {
94            let _ = stdout.read_to_end(&mut buf);
95        }
96        buf
97    });
98
99    let stderr_thread = std::thread::spawn(move || -> Vec<u8> {
100        use std::io::Read;
101        let mut buf = Vec::new();
102        if let Some(mut stderr) = stderr_handle {
103            let _ = stderr.read_to_end(&mut buf);
104        }
105        buf
106    });
107
108    let deadline = std::time::Instant::now() + EXTRACTOR_TIMEOUT;
109    loop {
110        match child.try_wait() {
111            Ok(Some(_)) => break,
112            Ok(None) => {
113                if std::time::Instant::now() >= deadline {
114                    let _ = child.kill();
115                    let _ = child.wait();
116                    return Err(format!(
117                        "Extractor '{}' timed out after {}s",
118                        config.name,
119                        EXTRACTOR_TIMEOUT.as_secs()
120                    )
121                    .into());
122                }
123                std::thread::sleep(Duration::from_millis(50));
124            }
125            Err(e) => {
126                return Err(format!("Failed to wait on extractor '{}': {e}", config.name).into());
127            }
128        }
129    }
130
131    let status = child
132        .wait()
133        .map_err(|e| format!("Failed to wait on extractor '{}': {e}", config.name))?;
134
135    let stdout_bytes = stdout_thread.join().unwrap_or_default();
136    let stderr_bytes = stderr_thread.join().unwrap_or_default();
137
138    let output = std::process::Output {
139        status,
140        stdout: stdout_bytes,
141        stderr: stderr_bytes,
142    };
143
144    if !output.status.success() {
145        let stderr = String::from_utf8_lossy(&output.stderr);
146        return Err(format!(
147            "Extractor '{}' exited with {}: {}",
148            config.name,
149            output.status,
150            stderr.trim()
151        )
152        .into());
153    }
154
155    let stdout = String::from_utf8_lossy(&output.stdout);
156    let raw: RawExtractorOutput = serde_json::from_str(&stdout)
157        .map_err(|e| format!("Extractor '{}' produced invalid JSON: {e}", config.name))?;
158
159    let mut annotations = Vec::with_capacity(raw.annotations.len());
160    let mut errors = Vec::new();
161
162    for entry in raw.annotations {
163        if entry.file.is_empty() {
164            errors.push(format!(
165                "Extractor '{}': annotation missing 'file' field",
166                config.name
167            ));
168            continue;
169        }
170        if entry.bind.is_empty() {
171            errors.push(format!(
172                "Extractor '{}': annotation for '{}' missing 'bind' field",
173                config.name, entry.file
174            ));
175            continue;
176        }
177
178        annotations.push(convert_raw_annotation(entry));
179    }
180
181    Ok(ExtractorResult {
182        annotations,
183        errors,
184    })
185}
186
187/// Run all extractors defined in the manifest and collect results.
188///
189/// For each manifest extractor: if `run` is non-empty, execute as a
190/// subprocess. If `run` is empty, look up a matching built-in by name
191/// in the registry. Extractors that fail are logged as errors in their
192/// result rather than halting the entire pipeline.
193#[must_use = "extractor results contain annotations to merge into the store"]
194pub fn run_all_extractors(
195    manifest: &Manifest,
196    project_root: &ProjectRoot,
197    registry: &super::ExtractorRegistry,
198) -> Vec<ExtractorResult> {
199    manifest
200        .extractors
201        .iter()
202        .map(|config| {
203            if config.run.is_empty() {
204                // Try built-in extractor
205                match registry.get(&config.name) {
206                    Some(builtin) => run_builtin(builtin, config, project_root),
207                    None => ExtractorResult {
208                        annotations: vec![],
209                        errors: vec![format!(
210                            "Extractor '{}': no 'run' command and no built-in found",
211                            config.name
212                        )],
213                    },
214                }
215            } else {
216                match run_extractor(config, project_root) {
217                    Ok(result) => result,
218                    Err(e) => ExtractorResult {
219                        annotations: vec![],
220                        errors: vec![e.to_string()],
221                    },
222                }
223            }
224        })
225        .collect()
226}
227
228/// Run a built-in extractor against files matching the config's globs.
229fn run_builtin(
230    builtin: &dyn super::BuiltinExtractor,
231    config: &ExtractorConfig,
232    project_root: &ProjectRoot,
233) -> ExtractorResult {
234    let extensions: FxHashSet<String> = builtin
235        .extensions()
236        .iter()
237        .map(|ext| ext.trim_start_matches('.').to_string())
238        .collect();
239
240    let mut annotations = Vec::new();
241    let mut errors = Vec::new();
242
243    let walker = ignore::WalkBuilder::new(project_root.as_ref())
244        .hidden(false)
245        .git_ignore(true)
246        .git_global(true)
247        .git_exclude(true)
248        .filter_entry(|entry| {
249            let name = entry.file_name().to_string_lossy();
250            if entry.file_type().is_some_and(|ft| ft.is_dir()) {
251                return !crate::paths::SKIP_DIRS.contains(&name.as_ref());
252            }
253            true
254        })
255        .build();
256
257    for entry in walker {
258        let entry = match entry {
259            Ok(e) => e,
260            Err(e) => {
261                errors.push(format!("Extractor '{}': walk error: {e}", config.name));
262                continue;
263            }
264        };
265
266        let path = entry.path();
267        if !path.is_file() {
268            continue;
269        }
270
271        // Check extension
272        let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
273        if !extensions.contains(ext) {
274            continue;
275        }
276
277        // Skip declaration files
278        let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
279        if name.ends_with(".d.ts") || name.ends_with(".d.mts") || name.ends_with(".d.cts") {
280            continue;
281        }
282
283        // Apply glob filter if specified
284        if !config.globs.is_empty() {
285            let relative_str = path
286                .strip_prefix(project_root.as_ref())
287                .ok()
288                .and_then(|r| r.to_str())
289                .unwrap_or("");
290            let matches_glob = config.globs.iter().any(|g| glob_matches(g, relative_str));
291            if !matches_glob {
292                continue;
293            }
294        }
295
296        let relative = match path.strip_prefix(project_root.as_ref()) {
297            Ok(r) => RelativePath::from(r.to_string_lossy().as_ref()),
298            Err(_) => continue,
299        };
300
301        let source = match std::fs::read_to_string(path) {
302            Ok(s) => s,
303            Err(e) => {
304                errors.push(format!(
305                    "Extractor '{}': failed to read {}: {e}",
306                    config.name,
307                    path.display()
308                ));
309                continue;
310            }
311        };
312
313        let file_anns = builtin.extract(&source, &relative);
314        annotations.extend(file_anns);
315    }
316
317    ExtractorResult {
318        annotations,
319        errors,
320    }
321}
322
/// Simple glob matching for project-relative paths.
///
/// Supported syntax:
///
/// - `*` and `**` match any run of characters, including `/` and the empty
///   string (this matcher does not distinguish the two).
/// - `**/` matches any number of leading directories, including none, so
///   `src/**/*.ts` matches both `src/a.ts` and `src/a/b.ts`.
/// - A pattern of the form `**/literal` (no further wildcards) matches any
///   path that *contains* `literal`; this loose behaviour is kept for
///   backward compatibility.
/// - A pattern without wildcards must equal the path exactly.
///
/// Patterns may contain any number of wildcards. Literal pieces must appear
/// in order and may not overlap, so `ab*bc` does not match `abc`.
fn glob_matches(pattern: &str, path: &str) -> bool {
    /// Match `text` against `pattern`, where each `*` stands for any
    /// (possibly empty) run of characters and everything else is literal.
    fn wildcard_match(pattern: &str, text: &str) -> bool {
        let Some((head, tail)) = pattern.rsplit_once('*') else {
            // No wildcard at all: exact comparison.
            return pattern == text;
        };

        let mut literals = head.split('*');
        // `split` always yields at least one piece; the first is anchored
        // at the start of the text.
        let prefix = literals.next().unwrap_or("");
        let Some(mut remaining) = text.strip_prefix(prefix) else {
            return false;
        };

        // Middle pieces: leftmost match, consumed in order.
        for literal in literals {
            let Some(at) = remaining.find(literal) else {
                return false;
            };
            remaining = &remaining[at + literal.len()..];
        }

        // The final piece is anchored at the end of what is left, which
        // prevents prefix and suffix from overlapping.
        remaining.ends_with(tail)
    }

    if pattern.starts_with("**") {
        let rest = pattern.trim_start_matches("**/").trim_start_matches("*/");
        if !rest.contains('*') {
            // `**/name`: historical "appears anywhere in the path" semantics.
            return path.contains(rest);
        }
    }

    // `**/` may stand for zero directories, so collapse it (slash included)
    // into a single wildcard before matching.
    let normalized = pattern.replace("**/", "*");
    wildcard_match(&normalized, path)
}
341
#[cfg(test)]
mod tests {
    //! Unit tests for extractor JSON parsing, subprocess execution and glob
    //! matching. Subprocess tests shell out through `sh` and use `/tmp` as
    //! the working directory.

    use super::*;
    use std::path::PathBuf;

    /// Project root used as the working directory for subprocess tests.
    fn test_project_root() -> ProjectRoot {
        ProjectRoot::from(PathBuf::from("/tmp"))
    }

    /// Build a subprocess extractor config with no glob filter.
    fn shell_extractor(name: &str, run: &str) -> ExtractorConfig {
        ExtractorConfig {
            name: name.to_string(),
            run: run.to_string(),
            globs: vec![],
        }
    }

    /// Build a manifest that contains only the given extractors.
    fn manifest_with(extractors: Vec<ExtractorConfig>) -> Manifest {
        Manifest {
            version: "1.0".to_string(),
            tags: FxHashMap::default(),
            audiences: FxHashMap::default(),
            visibilities: FxHashMap::default(),
            extractors,
        }
    }

    /// Deserialize extractor JSON and convert every top-level annotation.
    fn parse_annotations(json: &str) -> Vec<Annotation> {
        let raw: RawExtractorOutput = serde_json::from_str(json).unwrap();
        raw.annotations
            .into_iter()
            .map(convert_raw_annotation)
            .collect()
    }

    #[test]
    fn parses_valid_extractor_json() {
        // Arrange
        let json = r#"{"annotations":[
            {"file":"src/api.ts","bind":"createUser","tag":"controller","attrs":{"method":"POST","path":"/api/users"}},
            {"file":"src/api.ts","bind":"getUser","tag":"controller","attrs":{"method":"GET","path":"/api/users/:id"}}
        ]}"#;

        // Act
        let annotations = parse_annotations(json);

        // Assert
        assert_eq!(annotations.len(), 2, "should parse two annotations");
        assert_eq!(
            annotations[0].tag, "controller",
            "first tag should be controller"
        );
        assert_eq!(
            annotations[0].binding, "createUser",
            "first binding should be createUser"
        );
        assert_eq!(
            annotations[0].attrs.get("method"),
            Some(&JsonValue::String("POST".to_string())),
            "first method should be POST"
        );
        assert_eq!(
            annotations[1].binding, "getUser",
            "second binding should be getUser"
        );
    }

    #[test]
    fn parses_nested_children() {
        // Arrange
        let json = r#"{"annotations":[
            {"file":"src/app.test.ts","bind":"AppTests","tag":"describe","attrs":{},
             "children":[
                {"file":"src/app.test.ts","bind":"renders correctly","tag":"test","attrs":{}},
                {"file":"src/app.test.ts","bind":"nested suite","tag":"describe","attrs":{},
                 "children":[
                    {"file":"src/app.test.ts","bind":"handles edge case","tag":"test","attrs":{}}
                 ]}
             ]}
        ]}"#;

        // Act
        let annotations = parse_annotations(json);

        // Assert
        assert_eq!(
            annotations.len(),
            1,
            "should parse one top-level annotation"
        );
        let suite = &annotations[0];
        assert_eq!(suite.tag, "describe", "top-level should be describe");
        assert_eq!(suite.children.len(), 2, "describe should have 2 children");
        assert_eq!(suite.children[0].tag, "test", "first child should be test");
        assert_eq!(
            suite.children[1].tag, "describe",
            "second child should be describe"
        );
        assert_eq!(
            suite.children[1].children.len(),
            1,
            "nested describe should have 1 child"
        );
        assert_eq!(
            suite.children[1].children[0].tag, "test",
            "deeply nested should be test"
        );
    }

    #[test]
    fn skips_entries_with_missing_fields() {
        // Arrange: go through `run_extractor` so the real validation logic is
        // exercised rather than a copy of it.
        let config = shell_extractor(
            "test",
            r#"echo '{"annotations":[{"file":"","bind":"x","tag":"t","attrs":{}},{"file":"f.ts","bind":"","tag":"t","attrs":{}},{"file":"f.ts","bind":"ok","tag":"t","attrs":{}}]}'"#,
        );

        // Act
        let result = run_extractor(&config, &test_project_root()).unwrap();

        // Assert
        assert_eq!(
            result.annotations.len(),
            1,
            "should keep only valid annotation"
        );
        assert_eq!(
            result.annotations[0].binding, "ok",
            "surviving annotation should be the valid one"
        );
        assert_eq!(result.errors.len(), 2, "should report two errors");
        assert!(
            result.errors[0].contains("'file'"),
            "first error should mention the missing file field"
        );
        assert!(
            result.errors[1].contains("'bind'"),
            "second error should mention the missing bind field"
        );
    }

    #[test]
    fn run_extractor_with_echo() {
        // Arrange
        let config = ExtractorConfig {
            name: "echo-test".to_string(),
            run: r#"echo '{"annotations":[{"file":"src/app.ts","bind":"handler","tag":"endpoint","attrs":{"method":"GET"}}]}'"#.to_string(),
            globs: vec!["src/**/*.ts".to_string()],
        };

        // Act
        let result = run_extractor(&config, &test_project_root()).unwrap();

        // Assert
        assert_eq!(result.annotations.len(), 1, "should produce one annotation");
        assert_eq!(result.errors.len(), 0, "should produce no errors");
        assert_eq!(
            result.annotations[0].tag, "endpoint",
            "tag should be endpoint"
        );
        assert_eq!(
            result.annotations[0].binding, "handler",
            "binding should be handler"
        );
    }

    #[test]
    fn run_extractor_handles_failure() {
        // Arrange
        let config = shell_extractor("fail-test", "exit 1");

        // Act
        let result = run_extractor(&config, &test_project_root());

        // Assert
        assert!(result.is_err(), "should return error for failed extractor");
        assert!(
            result.unwrap_err().to_string().contains("fail-test"),
            "error should mention extractor name"
        );
    }

    #[test]
    fn run_extractor_handles_invalid_json() {
        // Arrange
        let config = shell_extractor("bad-json", "echo 'not json'");

        // Act
        let result = run_extractor(&config, &test_project_root());

        // Assert
        assert!(result.is_err(), "should return error for invalid JSON");
        assert!(
            result.unwrap_err().to_string().contains("invalid JSON"),
            "error should mention invalid JSON"
        );
    }

    #[test]
    fn run_all_extractors_continues_on_failure() {
        // Arrange
        let manifest = manifest_with(vec![
            shell_extractor("failing", "exit 1"),
            shell_extractor(
                "passing",
                r#"echo '{"annotations":[{"file":"a.ts","bind":"b","tag":"t","attrs":{}}]}'"#,
            ),
        ]);

        // Act
        let registry = crate::extractor::ExtractorRegistry::new();
        let results = run_all_extractors(&manifest, &test_project_root(), &registry);

        // Assert
        assert_eq!(results.len(), 2, "should return result for each extractor");
        assert!(!results[0].errors.is_empty(), "first should have errors");
        assert_eq!(
            results[1].annotations.len(),
            1,
            "second should have annotations"
        );
    }

    #[test]
    fn run_all_extractors_reports_missing_builtin() {
        // Arrange: no `run` command and a name no registry is expected to know.
        let manifest = manifest_with(vec![shell_extractor(
            "definitely-not-a-registered-builtin",
            "",
        )]);

        // Act
        let registry = crate::extractor::ExtractorRegistry::new();
        let results = run_all_extractors(&manifest, &test_project_root(), &registry);

        // Assert
        assert_eq!(results.len(), 1, "should return one result");
        assert!(
            results[0].annotations.is_empty(),
            "should produce no annotations"
        );
        assert_eq!(results[0].errors.len(), 1, "should report one error");
        assert!(
            results[0].errors[0].contains("no built-in found"),
            "error should explain that no built-in was found"
        );
    }

    #[test]
    fn glob_matches_extension_anywhere() {
        assert!(
            glob_matches("**/*.ts", "src/a/b.ts"),
            "**/*.ts should match nested .ts files"
        );
        assert!(
            !glob_matches("**/*.ts", "src/a/b.rs"),
            "**/*.ts should not match other extensions"
        );
    }

    #[test]
    fn glob_matches_single_star_and_literals() {
        assert!(
            glob_matches("src/*.ts", "src/a.ts"),
            "single star should match within prefix and suffix"
        );
        assert!(
            !glob_matches("src/*.ts", "lib/a.ts"),
            "prefix must match"
        );
        assert!(
            glob_matches("src/a.ts", "src/a.ts"),
            "literal pattern should match identical path"
        );
        assert!(
            !glob_matches("src/a.ts", "src/b.ts"),
            "literal pattern should not match a different path"
        );
        assert!(
            glob_matches("**/foo", "a/foo/b.ts"),
            "**/literal should match when the literal appears in the path"
        );
    }
}
581}