Skip to main content

skill_veil_core/analyzer/
document.rs

1use crate::analyzer::assessment::assess_artifact;
2use crate::analyzer::binary_magic::detect_binary_disguise_kind;
3use crate::analyzer::references::extract_references;
4use crate::analyzer::types::{AnalyzerError, CodeBlock, Section, SkillDocument};
5use crate::ports::{FileSystemError, FileSystemProvider, MarkdownParser};
6use std::path::{Path, PathBuf};
7
8impl SkillDocument {
9    pub fn from_file_with_provider<P: MarkdownParser, F: FileSystemProvider>(
10        path: impl AsRef<Path>,
11        parser: &P,
12        fs_provider: &F,
13    ) -> Result<Self, AnalyzerError> {
14        let path = path.as_ref();
15        let bytes = fs_provider
16            .read_file_bytes(path)
17            .map_err(file_system_error_to_io_error)?
18            .as_bytes()
19            .to_vec();
20        // Probe magic bytes BEFORE the lossy decode so a ZIP/PE/ELF
21        // disguised as markdown is recorded on the document. Downstream
22        // (`scanner_execution::collect_raw_findings`) turns the kind
23        // into a Critical/Block finding.
24        let binary_disguise_kind = detect_binary_disguise_kind(path, &bytes).map(str::to_owned);
25        let decode_warning = std::str::from_utf8(&bytes).is_err();
26        let content = String::from_utf8_lossy(&bytes).into_owned();
27        Self::parse_with_parser(path.to_path_buf(), content, parser).map(|mut doc| {
28            doc.decode_warning = decode_warning;
29            doc.binary_disguise_kind = binary_disguise_kind;
30            doc
31        })
32    }
33
34    pub fn parse_with_parser<P: MarkdownParser + ?Sized>(
35        path: PathBuf,
36        content: String,
37        parser: &P,
38    ) -> Result<Self, AnalyzerError> {
39        let name = path
40            .file_stem()
41            .and_then(|s| s.to_str())
42            .unwrap_or("unknown")
43            .to_string();
44
45        let (sections, parse_warning) = match parser.parse_sections(&content) {
46            Ok(sections) => (sections, false),
47            Err(error) => {
48                tracing::warn!(
49                    "Failed to parse markdown sections in {}: {error}; continuing with empty sections",
50                    path.display()
51                );
52                (Vec::new(), true)
53            }
54        };
55        let referenced_files = extract_references(&content, &path);
56        let assessment = assess_artifact(path.as_path(), &content, &sections, &referenced_files);
57
58        Ok(Self {
59            path,
60            name,
61            extension_kind: assessment.extension_kind,
62            identity_source: assessment.identity_source,
63            structural_validity: assessment.structural_validity,
64            classification: assessment.classification,
65            structural_signals: assessment.structural_signals,
66            decode_warning: false,
67            parse_warning,
68            binary_disguise_kind: None,
69            sections,
70            raw_content: content,
71            referenced_files,
72        })
73    }
74
75    pub fn get_section(&self, name: &str) -> Option<&Section> {
76        let name_lower = name.to_lowercase();
77        self.sections.iter().find(|s| s.name == name_lower)
78    }
79
80    pub fn all_code_blocks(&self) -> Vec<&CodeBlock> {
81        self.sections
82            .iter()
83            .flat_map(|s| s.code_blocks.iter())
84            .collect()
85    }
86
87    /// Whether any code block in this document declares a fence language
88    /// equal to `lang`.
89    ///
90    /// Comparison is exact-match on lowercase strings. The parser
91    /// normalizes fence languages to ASCII lowercase at parse time (see
92    /// `adapters/pulldown_parser.rs` — the `Fenced(lang)` branch in the
93    /// code-block handler), so callers MUST pass `lang` in lowercase.
94    /// `kind: code_language` rule conditions in YAML rule packs use
95    /// lowercase tokens by convention, matching this contract.
96    pub fn has_code_language(&self, lang: &str) -> bool {
97        self.all_code_blocks()
98            .iter()
99            .any(|cb| cb.language.as_deref() == Some(lang))
100    }
101}
102
103fn file_system_error_to_io_error(error: FileSystemError) -> std::io::Error {
104    match error {
105        FileSystemError::IoError(error) => error,
106        FileSystemError::PathNotFound(path) => {
107            std::io::Error::new(std::io::ErrorKind::NotFound, path.display().to_string())
108        }
109    }
110}
111
#[cfg(test)]
mod tests {
    use crate::adapters::PulldownMarkdownParser;
    use crate::analyzer::types::SkillDocument;
    use std::path::PathBuf;

    /// Parses `content` with the pulldown-based parser as though it were the
    /// file `/tmp/skill.md`, panicking if parsing returns an error.
    fn parse_doc(content: &str) -> SkillDocument {
        let fixture_path = PathBuf::from("/tmp/skill.md");
        let markdown_parser = PulldownMarkdownParser::new();
        let parsed =
            SkillDocument::parse_with_parser(fixture_path, content.to_string(), &markdown_parser);
        parsed.expect("parse_with_parser must succeed for the inline fixture")
    }

    /// A lowercase query must match fences written as `Python` or `PYTHON`,
    /// because the parser lowercases fence languages when it parses. This
    /// pins the case-insensitivity contract of `has_code_language` with an
    /// explicit test instead of leaving it implicit.
    #[test]
    fn has_code_language_matches_uppercase_fence_when_caller_uses_lowercase() {
        let title_case = parse_doc("## Setup\n```Python\nprint('hi')\n```\n");
        assert!(title_case.has_code_language("python"));

        let upper_case = parse_doc("## Setup\n```PYTHON\nprint('hi')\n```\n");
        assert!(upper_case.has_code_language("python"));
    }

    /// A language that no code block declares must not match. This is the
    /// negative counterpart of the test above: it guards against the lookup
    /// degenerating into "matches anything".
    #[test]
    fn has_code_language_returns_false_for_unknown_lang() {
        let python_only = parse_doc("## Setup\n```python\nprint('hi')\n```\n");
        let found_ruby = python_only.has_code_language("ruby");
        assert!(!found_ruby);
    }
}