use crate::lazy_pattern;
use crate::path_safety::path_stays_within_base;
use crate::patterns::compile_patterns;
use crate::ports::CompiledPattern;
use std::path::{Path, PathBuf};
/// `|`-separated script file extensions (without the leading dot). `extract_references`
/// interpolates this into the regex that matches `source|run|execute|include <file>` commands.
const SCRIPT_EXT_PATTERN: &str = "sh|py|ps1|js|ts|rb|pl";
/// The script extensions above plus `exe`, `bin` and `dll`. `extract_references` interpolates
/// this into the regex that matches markdown link targets.
const ALL_EXT_PATTERN: &str = "sh|py|ps1|js|ts|rb|pl|exe|bin|dll";
// Matches either `chmod +x <target>` or a `./<target>` occurrence. Capture group 1 is the run of
// non-whitespace characters that follows the `chmod +x ` or `./` prefix (so for the `./`
// alternative the captured text does not include the `./` itself).
// NOTE(review): the capture is bounded only by whitespace, so punctuation written directly after
// a path (a closing `)` or a backtick, for example) becomes part of the captured text — confirm
// that this is intended.
lazy_pattern!(EXEC_REFERENCE_PATTERN, r"(?:chmod\s+\+x\s+|\./)([^\s]+)");
/// URL schemes that are written without a `//` authority part (for example `mailto:someone`),
/// so they have to be recognised by prefix instead of by the `scheme://` shape.
/// Entries are lowercase; `has_url_scheme` compares against them ignoring ASCII case.
const KNOWN_BARE_URL_PREFIXES: &[&str] = &["mailto:", "data:", "javascript:", "tel:"];

/// Returns `true` when `candidate` looks like a URL rather than a file path.
///
/// Two shapes are accepted:
///
/// * one of [`KNOWN_BARE_URL_PREFIXES`] (matched ASCII-case-insensitively) followed by at least
///   one more byte, e.g. `mailto:a@b` or `DATA:text/plain,x`;
/// * `scheme://rest`, where `scheme` starts with an ASCII letter, continues with ASCII letters,
///   digits, `+`, `.` or `-`, and `rest` is non-empty.
///
/// Everything else — including `C:foo.txt`, `a:`, `scheme`, `:` and the empty string — is
/// treated as "not a URL".
fn has_url_scheme(candidate: &str) -> bool {
    let bytes = candidate.as_bytes();

    // URL schemes are case-insensitive, so `MAILTO:x` must be classified like `mailto:x`.
    // The comparison is done on bytes so that a multi-byte character straddling the prefix
    // length can never cause a `str` slicing panic.
    let is_bare_url = KNOWN_BARE_URL_PREFIXES.iter().any(|prefix| {
        bytes.len() > prefix.len()
            && bytes[..prefix.len()].eq_ignore_ascii_case(prefix.as_bytes())
    });
    if is_bare_url {
        return true;
    }

    // A scheme must start with an ASCII letter.
    let Some(&first) = bytes.first() else {
        return false;
    };
    if !first.is_ascii_alphabetic() {
        return false;
    }

    // Locate the first byte that cannot be part of a scheme name. If there is none, the whole
    // input is scheme-like characters with no `:` and therefore not a URL.
    let is_scheme_byte = |b: u8| b.is_ascii_alphanumeric() || matches!(b, b'+' | b'.' | b'-');
    let Some(offset) = bytes[1..].iter().position(|&b| !is_scheme_byte(b)) else {
        return false;
    };
    let scheme_end = offset + 1;

    // The scheme must be terminated by `://` and followed by at least one more byte.
    bytes[scheme_end..].starts_with(b"://") && bytes.len() > scheme_end + 3
}
/// Collects the file paths referenced by `content`, resolved against the directory that
/// contains `base_path`.
///
/// Three reference shapes are searched for, in this order: markdown links whose target ends in
/// one of the `ALL_EXT_PATTERN` extensions, `source`/`run`/`execute`/`include` commands naming a
/// file with one of the `SCRIPT_EXT_PATTERN` extensions, and matches of
/// `EXEC_REFERENCE_PATTERN`. For every match, capture group 1 is the candidate path.
///
/// A candidate is skipped (with a debug log) when `has_url_scheme` classifies it as a URL, when
/// it is an absolute path, or when `path_stays_within_base` rejects the joined path. Surviving
/// paths are returned in first-seen order without duplicates.
///
/// When `base_path` has no parent, candidates are joined onto `.` instead.
pub(super) fn extract_references(content: &str, base_path: &Path) -> Vec<PathBuf> {
    let base_dir = match base_path.parent() {
        Some(dir) => dir,
        None => {
            tracing::debug!(
                "extract_references: `{}` has no parent; resolving references relative to CWD",
                base_path.display()
            );
            Path::new(".")
        }
    };

    // Turns one raw capture into a resolved path, or `None` when the capture must be ignored.
    let resolve = |raw: &str| -> Option<PathBuf> {
        if has_url_scheme(raw) {
            tracing::debug!(
                "extract_references: skipping URL reference in {}: {}",
                base_path.display(),
                raw
            );
            return None;
        }
        if Path::new(raw).is_absolute() {
            tracing::debug!(
                "extract_references: skipping absolute path in {}: {}",
                base_path.display(),
                raw
            );
            return None;
        }
        let joined = base_dir.join(raw);
        if path_stays_within_base(&joined, base_dir) {
            Some(joined)
        } else {
            tracing::debug!(
                "extract_references: skipping path that escapes base_dir {}: {}",
                base_dir.display(),
                raw
            );
            None
        }
    };

    // The two extension-dependent patterns are built here; the exec pattern is a shared static.
    let markdown_link = format!(r#"\[.*?\]\((\.?/?[^\)]+\.({}))\)"#, ALL_EXT_PATTERN);
    let command_word = format!(
        r#"(?:source|run|execute|include)\s+[\"']?([^\s\"']+\.({}))"#,
        SCRIPT_EXT_PATTERN
    );
    let compiled = compile_patterns(&[markdown_link.as_str(), command_word.as_str()]);
    let exec_pattern: &CompiledPattern = &EXEC_REFERENCE_PATTERN;

    let mut found: Vec<PathBuf> = Vec::new();
    for pattern in compiled.iter().chain(std::iter::once(exec_pattern)) {
        for captures in pattern.captures_iter(content) {
            let Some(group) = captures.get(1) else { continue };
            let Some(path) = resolve(group.matched_text.as_str()) else {
                continue;
            };
            // Linear de-duplication keeps the first-seen ordering of the results.
            if !found.contains(&path) {
                found.push(path);
            }
        }
    }
    found
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Location of the document under test; references resolve against its parent directory.
    const SKILL_FILE: &str = "/tmp/pkg/SKILL.md";

    /// Runs `extract_references` on `content` as though it were the body of `SKILL_FILE`.
    fn refs_in(content: &str) -> Vec<PathBuf> {
        extract_references(content, Path::new(SKILL_FILE))
    }

    /// An absolute link target must never yield a path under `/etc`.
    #[test]
    fn extract_references_rejects_absolute_link_targets() {
        let refs = refs_in("See [the script](/etc/shadow.sh) for details.");
        let leaked = refs.iter().any(|p| p.starts_with("/etc"));
        assert!(
            !leaked,
            "Absolute /etc/shadow.sh must NOT escape base_dir; got {refs:?}"
        );
    }

    /// A `../..` target must not produce any path mentioning `etc/passwd`.
    #[test]
    fn extract_references_rejects_parent_traversal() {
        let refs = refs_in("Run `[evil](../../etc/passwd.sh)`.");
        let leaked = refs
            .iter()
            .any(|p| p.to_string_lossy().contains("etc/passwd"));
        assert!(!leaked, "Parent-traversal must be rejected; got {refs:?}");
    }

    /// Ordinary relative targets, with or without a leading `./`, are kept.
    #[test]
    fn extract_references_accepts_legitimate_relative_paths() {
        let refs = refs_in("[install](./scripts/install.sh) and [helper](helpers/util.py)");
        for expected in ["scripts/install.sh", "helpers/util.py"] {
            assert!(refs.iter().any(|p| p.ends_with(expected)));
        }
    }

    /// Link targets carrying a `scheme://` prefix are not resolved as files.
    #[test]
    fn extract_references_rejects_url_link_targets() {
        let samples = [
            "[install](https://example.com/install.sh)",
            "[install](http://example.com/install.sh)",
            "[install](ftp://example.com/install.sh)",
            "[install](file:///etc/install.sh)",
        ];
        for sample in samples {
            let refs = refs_in(sample);
            assert!(
                refs.is_empty(),
                "URL target must not be resolved: {sample:?} -> {refs:?}"
            );
        }
    }

    /// Table-driven check of `has_url_scheme` on representative URLs and non-URLs.
    #[test]
    fn has_url_scheme_classifies_canonical_inputs() {
        let urls = [
            "https://example.com/x.sh",
            "http://example.com",
            "ftp://example.com",
            "file:///etc/passwd",
            "git+ssh://example.com/repo.git",
            "data:text/plain,hello",
        ];
        let non_urls = [
            "scripts/install.sh",
            "./scripts/install.sh",
            "../helpers/util.py",
            "C:foo.txt",
            "scheme",
            "a:",
            "",
            ":",
        ];
        for url in urls {
            assert!(has_url_scheme(url), "must classify as URL: {url:?}");
        }
        for non_url in non_urls {
            assert!(
                !has_url_scheme(non_url),
                "must NOT classify as URL: {non_url:?}"
            );
        }
    }
}