use std::collections::BTreeMap;
use serde::{Deserialize, Serialize};
use super::TrustLevel;
/// Prefix that `FileProvenanceLedger::classify` puts in front of a recorded
/// origin, producing labels of the form `file:<origin>`.
pub const FILE_ORIGIN_PREFIX: &str = "file";
/// Origin label that can be passed to `FileProvenanceLedger::record`.
// NOTE(review): presumably used for files written while the context was
// already tainted; the call sites are outside this file, so confirm there.
pub const TAINTED_CONTEXT_ORIGIN: &str = "tainted-context";
/// Argument object keys whose string values `path_arguments` treats as paths.
const PATH_KEYS: &[&str] = &["path", "file_path", "file", "filename", "target_file"];
/// Reduces a path spelling to the canonical key used by the ledger.
///
/// The normalization is purely lexical (the file system is never consulted):
///
/// * surrounding whitespace is trimmed;
/// * empty components (`a//b`, trailing `/`) and `.` components (`./a`,
///   `a/./b`, `././a`) are dropped, since they never change which file a
///   path names;
/// * a leading `/` is preserved so absolute and relative paths stay distinct;
/// * `..` components are kept as-is, because resolving them lexically can be
///   wrong in the presence of symlinks.
///
/// Returns an empty string when nothing meaningful is left (blank input,
/// `"."`, `"./"`, or `"/"`); callers treat an empty key as "no path".
fn normalize_path(path: &str) -> String {
    let trimmed = path.trim();
    let is_absolute = trimmed.starts_with('/');
    let mut normalized = String::with_capacity(trimmed.len());
    let components = trimmed
        .split('/')
        .filter(|component| !component.is_empty() && *component != ".");
    for component in components {
        // Separator goes before every component except the first one of a
        // relative path; for an absolute path it also restores the root.
        if is_absolute || !normalized.is_empty() {
            normalized.push('/');
        }
        normalized.push_str(component);
    }
    normalized
}
/// Ledger of file paths that have been recorded as tainted, each mapped to
/// the origin label it was first recorded with.
#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct FileProvenanceLedger {
// Key: path as returned by `normalize_path`; value: origin label passed to
// `record`.
tainted: BTreeMap<String, String>,
}
impl FileProvenanceLedger {
    /// Marks `path` as tainted with the given `origin` label.
    ///
    /// The path is normalized before it is stored. Paths that normalize to an
    /// empty string are ignored, and a path that is already in the ledger
    /// keeps the origin it was first recorded with.
    pub fn record(&mut self, path: &str, origin: &str) {
        let normalized = normalize_path(path);
        if !normalized.is_empty() {
            self.tainted
                .entry(normalized)
                .or_insert_with(|| origin.to_owned());
        }
    }

    /// Looks up `path` (after normalization) in the ledger.
    ///
    /// Returns `None` when the path was never recorded; otherwise returns
    /// `TrustLevel::Untrusted` together with a label of the form
    /// `file:<origin>`.
    pub fn classify(&self, path: &str) -> Option<(TrustLevel, String)> {
        let origin = self.tainted.get(&normalize_path(path))?;
        let label = format!("{FILE_ORIGIN_PREFIX}:{origin}");
        Some((TrustLevel::Untrusted, label))
    }

    /// Scans a command line for a token that names a recorded path.
    ///
    /// The command is cut at every character that `is_path_char` rejects, and
    /// each non-empty piece is classified in order; the first hit is returned.
    /// An empty ledger short-circuits to `None` without scanning.
    pub fn references_tainted_path(&self, command: &str) -> Option<(TrustLevel, String)> {
        if self.is_empty() {
            return None;
        }
        for candidate in command.split(|ch: char| !is_path_char(ch)) {
            if candidate.is_empty() {
                continue;
            }
            if let Some(verdict) = self.classify(candidate) {
                return Some(verdict);
            }
        }
        None
    }

    /// Returns `true` when no path has been recorded.
    pub fn is_empty(&self) -> bool {
        self.tainted.is_empty()
    }

    /// Returns the number of recorded paths.
    pub fn len(&self) -> usize {
        self.tainted.len()
    }
}
/// Reports whether `c` may appear inside a path token when a command line is
/// scanned: ASCII letters and digits, plus `/`, `.`, `-`, `_` and `~`.
fn is_path_char(c: char) -> bool {
    matches!(
        c,
        'a'..='z' | 'A'..='Z' | '0'..='9' | '/' | '.' | '-' | '_' | '~'
    )
}
/// Argument object keys whose string values `command_string` treats as
/// command text.
const COMMAND_KEYS: &[&str] = &["command", "cmd", "script"];
/// Assembles the command text carried by a JSON argument object.
///
/// String values under the `COMMAND_KEYS` keys come first (in key order),
/// followed by the string elements of the `args` and then `argv` arrays.
/// Blank strings and non-string values are skipped. The collected pieces are
/// joined with single spaces.
///
/// Returns `None` when `arguments` is not an object or nothing was collected.
pub fn command_string(arguments: &serde_json::Value) -> Option<String> {
    let fields = arguments.as_object()?;

    let scalar_parts = COMMAND_KEYS
        .iter()
        .filter_map(|key| fields.get(*key))
        .filter_map(serde_json::Value::as_str);

    let list_parts = ["args", "argv"]
        .iter()
        .filter_map(|key| fields.get(*key))
        .filter_map(serde_json::Value::as_array)
        .flatten()
        .filter_map(serde_json::Value::as_str);

    let pieces: Vec<&str> = scalar_parts
        .chain(list_parts)
        .filter(|piece| !piece.trim().is_empty())
        .collect();

    match pieces.as_slice() {
        [] => None,
        _ => Some(pieces.join(" ")),
    }
}
/// Collects the path-like values carried by a JSON argument object.
///
/// String values under the `PATH_KEYS` keys come first (in key order),
/// followed by the string elements of the `paths` array. Blank strings and
/// non-string values are skipped.
///
/// Returns an empty vector when `arguments` is not an object.
pub fn path_arguments(arguments: &serde_json::Value) -> Vec<String> {
    let fields = match arguments.as_object() {
        Some(fields) => fields,
        None => return Vec::new(),
    };

    let single_values = PATH_KEYS
        .iter()
        .filter_map(|key| fields.get(*key))
        .filter_map(serde_json::Value::as_str);

    let listed_values = fields
        .get("paths")
        .and_then(serde_json::Value::as_array)
        .into_iter()
        .flatten()
        .filter_map(serde_json::Value::as_str);

    single_values
        .chain(listed_values)
        .filter(|candidate| !candidate.trim().is_empty())
        .map(str::to_owned)
        .collect()
}
// Unit tests for the ledger and for the JSON argument helpers in this file.
#[cfg(test)]
mod tests {
use super::*;
use serde_json::json;
// A default ledger is empty and classifies nothing.
#[test]
fn a_read_of_an_untainted_path_is_trusted() {
let ledger = FileProvenanceLedger::default();
assert!(ledger.is_empty());
assert!(ledger.classify("src/main.rs").is_none());
}
// A recorded path classifies as untrusted, with the origin prefixed by
// `file:`.
#[test]
fn a_read_of_a_recorded_path_is_untrusted_with_a_chained_origin() {
let mut ledger = FileProvenanceLedger::default();
ledger.record("vendor/dep/README.md", "fetch:web_fetch");
assert_eq!(ledger.len(), 1);
assert_eq!(
ledger.classify("vendor/dep/README.md"),
Some((TrustLevel::Untrusted, "file:fetch:web_fetch".to_string()))
);
}
// Recording with a leading `./` and looking up without it hit the same
// entry.
#[test]
fn normalization_matches_write_and_read_spellings() {
let mut ledger = FileProvenanceLedger::default();
ledger.record("./notes/summary.md", TAINTED_CONTEXT_ORIGIN);
assert_eq!(
ledger.classify("notes/summary.md"),
Some((
TrustLevel::Untrusted,
format!("{FILE_ORIGIN_PREFIX}:{TAINTED_CONTEXT_ORIGIN}")
))
);
}
// Recording the same path twice keeps the origin from the first call.
#[test]
fn the_first_origin_sticks() {
let mut ledger = FileProvenanceLedger::default();
ledger.record("a.txt", "fetch:web_fetch");
ledger.record("a.txt", TAINTED_CONTEXT_ORIGIN);
assert_eq!(
ledger.classify("a.txt"),
Some((TrustLevel::Untrusted, "file:fetch:web_fetch".to_string()))
);
}
// Blank and empty paths never create a ledger entry.
#[test]
fn empty_paths_are_ignored() {
let mut ledger = FileProvenanceLedger::default();
ledger.record(" ", "fetch:web_fetch");
ledger.record("", "fetch:web_fetch");
assert!(ledger.is_empty());
}
// `path_arguments` picks up single-path keys and the `paths` array, and
// ignores unrelated keys such as `content` and `note`.
#[test]
fn path_arguments_reads_the_common_vocabulary() {
assert_eq!(
path_arguments(&json!({"path": "src/a.rs"})),
vec!["src/a.rs".to_string()]
);
assert_eq!(
path_arguments(&json!({"file_path": "src/b.rs", "content": "..."})),
vec!["src/b.rs".to_string()]
);
assert_eq!(
path_arguments(&json!({"paths": ["x.md", "y.md"], "note": "z"})),
vec!["x.md".to_string(), "y.md".to_string()]
);
}
// Unknown keys, blank values, and non-object arguments all yield no paths.
#[test]
fn path_arguments_ignores_non_path_and_blank_values() {
assert!(path_arguments(&json!({"query": "ripgrep this"})).is_empty());
assert!(path_arguments(&json!({"path": " "})).is_empty());
assert!(path_arguments(&json!("not an object")).is_empty());
}
// A tainted path is found inside a shell command, whether it is written
// with a leading `./` or wrapped in double quotes.
#[test]
fn references_tainted_path_catches_a_laundered_command_read() {
let mut ledger = FileProvenanceLedger::default();
ledger.record("vendor/dep/README.md", "fetch:clone");
assert_eq!(
ledger.references_tainted_path("cat ./vendor/dep/README.md | head -n 40"),
Some((TrustLevel::Untrusted, "file:fetch:clone".to_string()))
);
assert_eq!(
ledger.references_tainted_path("grep -R foo \"vendor/dep/README.md\""),
Some((TrustLevel::Untrusted, "file:fetch:clone".to_string()))
);
}
// Only whole-token matches count: `data.md` does not match the recorded
// `a.md`, and an empty ledger never matches anything.
#[test]
fn references_tainted_path_is_precise_and_fail_open() {
let mut ledger = FileProvenanceLedger::default();
ledger.record("a.md", "fetch:clone");
assert!(ledger
.references_tainted_path("cat data.md && echo done")
.is_none());
assert!(ledger.references_tainted_path("ls -la src/").is_none());
assert!(FileProvenanceLedger::default()
.references_tainted_path("cat anything")
.is_none());
}
// `command_string` joins the command key with the `args` entries using
// single spaces, and returns `None` when there is no command text.
#[test]
fn command_string_joins_command_and_argv() {
assert_eq!(
command_string(&json!({"command": "cat vendor/dep/README.md"})),
Some("cat vendor/dep/README.md".to_string())
);
assert_eq!(
command_string(&json!({"cmd": "grep", "args": ["-R", "foo", "vendor/dep/README.md"]})),
Some("grep -R foo vendor/dep/README.md".to_string())
);
assert!(command_string(&json!({"path": "src/a.rs"})).is_none());
assert!(command_string(&json!({"command": " "})).is_none());
}
}