Skip to main content

aicx_parser/
sanitize.rs

1//! Path and input sanitization for ai-contexters.
2//!
3//! Follows the established pattern:
4//! traversal check → canonicalize → allowlist validation.
5//!
6//! Prevents path traversal and command injection from user-supplied inputs
7//! (CLI arguments, project names, agent names).
8//!
9//! Vibecrafted with AI Agents by VetCoders (c)2026 VetCoders
10
11use anyhow::{Result, anyhow};
12use std::path::{Component, Path, PathBuf};
13
/// Known safe extractor agent names.
///
/// This is the allowlist consulted by [`safe_agent_name`]: a name is accepted
/// only if it equals one of these entries exactly (the comparison is
/// case-sensitive). The entries are also listed, in this order, in the error
/// message produced for an unknown agent.
pub const ALLOWED_AGENTS: &[&str] = &[
    "claude",
    "codex",
    "gemini",
    "junie",
    "codescribe",
    "operator-md",
];
23
24// ============================================================================
25// Core helpers (mirroring rmcp-memex pattern)
26// ============================================================================
27
/// Report whether `path` is unsafe to use as (part of) a filesystem path.
///
/// Two things make a path unsafe here:
///
/// * it contains a NUL, line-feed or carriage-return character, or
/// * one of its components is exactly `..` (e.g. `../x`, `a/../b`).
///
/// The `..` test is done per path component rather than by substring search,
/// so names that merely contain dots — `...`, `foo..bar`, `a..b/c` — are not
/// flagged. A literal three-dot folder once ended up in the store, and a
/// substring check broke iteration over the whole corpus because of it.
fn contains_traversal(path: &str) -> bool {
    let has_forbidden_char = path.chars().any(|ch| matches!(ch, '\0' | '\n' | '\r'));

    // Evaluated lazily so the component walk is skipped when a forbidden
    // character has already decided the answer.
    let has_parent_component = || {
        Path::new(path)
            .components()
            .any(|part| part == Component::ParentDir)
    };

    has_forbidden_char || has_parent_component()
}
44
45/// Get the user's home directory.
46fn home_dir() -> Result<PathBuf> {
47    std::env::var("HOME")
48        .map(PathBuf::from)
49        .map_err(|_| anyhow!("Cannot determine home directory from $HOME"))
50}
51
52/// Canonicalize a path, returning error if it doesn't exist.
53fn canonicalize_existing(path: &Path) -> Result<PathBuf> {
54    path.canonicalize()
55        .map_err(|e| anyhow!("Cannot canonicalize path '{}': {}", path.display(), e))
56}
57
58/// Validate that a path is under an allowed base directory.
59fn is_under_allowed_base(path: &Path) -> Result<bool> {
60    let home = home_dir()?;
61
62    if path.starts_with(&home) {
63        return Ok(true);
64    }
65
66    #[cfg(target_os = "macos")]
67    if path.starts_with("/Users") {
68        let components: Vec<_> = path.components().collect();
69        if components.len() >= 3 {
70            return Ok(true);
71        }
72    }
73
74    // Temporary directories (tests)
75    if path.starts_with("/tmp")
76        || path.starts_with("/var/folders")
77        || path.starts_with("/private/tmp")
78        || path.starts_with("/private/var/folders")
79    {
80        return Ok(true);
81    }
82
83    Ok(false)
84}
85
86// ============================================================================
87// Public API: path validation
88// ============================================================================
89
90/// Sanitize and validate a path that must exist (for reading).
91///
92/// Traversal check → canonicalize → allowlist.
93pub fn validate_read_path(path: &Path) -> Result<PathBuf> {
94    let path_str = path.to_string_lossy();
95    if contains_traversal(&path_str) {
96        return Err(anyhow!(
97            "Path contains invalid traversal sequence: {}",
98            path_str
99        ));
100    }
101
102    if !path.exists() {
103        return Err(anyhow!("Path does not exist: {}", path.display()));
104    }
105
106    let canonical = canonicalize_existing(path)?;
107
108    if !is_under_allowed_base(&canonical)? {
109        return Err(anyhow!(
110            "Cannot read from path outside allowed directories: {}",
111            canonical.display()
112        ));
113    }
114
115    Ok(canonical)
116}
117
118/// Sanitize and validate a path for writing (may not exist yet).
119///
120/// Traversal check → validate parent → allowlist.
121pub fn validate_write_path(path: &Path) -> Result<PathBuf> {
122    let path_str = path.to_string_lossy();
123    if contains_traversal(&path_str) {
124        return Err(anyhow!(
125            "Path contains invalid traversal sequence: {}",
126            path_str
127        ));
128    }
129
130    if path.exists() {
131        let canonical = canonicalize_existing(path)?;
132        if !is_under_allowed_base(&canonical)? {
133            return Err(anyhow!(
134                "Cannot write to path outside allowed directories: {}",
135                canonical.display()
136            ));
137        }
138        return Ok(canonical);
139    }
140
141    // New path — walk ancestors until we find an existing base directory and validate it.
142    let candidate = if path.is_absolute() {
143        path.to_path_buf()
144    } else {
145        std::env::current_dir()
146            .map_err(|e| anyhow!("Cannot determine current directory: {}", e))?
147            .join(path)
148    };
149
150    let mut ancestor = Some(candidate.as_path());
151    let mut existing_ancestor = None;
152    while let Some(current) = ancestor {
153        if current.exists() {
154            existing_ancestor = Some(canonicalize_existing(current)?);
155            break;
156        }
157        ancestor = current.parent();
158    }
159
160    let canonical_base = existing_ancestor.ok_or_else(|| {
161        anyhow!(
162            "Cannot validate write path '{}': no existing ancestor found",
163            path.display()
164        )
165    })?;
166
167    if !is_under_allowed_base(&canonical_base)? {
168        return Err(anyhow!(
169            "Path '{}' would be created outside allowed directories",
170            path.display()
171        ));
172    }
173
174    Ok(path.to_path_buf())
175}
176
177/// Sanitize a directory path used for reading (e.g., chunks_dir, contexts_dir).
178///
179/// Traversal check → canonicalize → allowlist. Must be a directory.
180pub fn validate_dir_path(path: &Path) -> Result<PathBuf> {
181    let validated = validate_read_path(path)?;
182    if !validated.is_dir() {
183        return Err(anyhow!("Path is not a directory: {}", validated.display()));
184    }
185    Ok(validated)
186}
187
188/// Open a file for reading only after validating the path.
189pub fn open_file_validated(path: &Path) -> Result<std::fs::File> {
190    let validated = validate_read_path(path)?;
191    // nosemgrep: rust.actix.path-traversal.tainted-path.tainted-path
192    std::fs::File::open(&validated)
193        .map_err(|e| anyhow!("Failed to open '{}': {}", validated.display(), e))
194}
195
196/// Create or truncate a file only after validating the write path.
197pub fn create_file_validated(path: &Path) -> Result<std::fs::File> {
198    let validated = validate_write_path(path)?;
199    // nosemgrep: rust.actix.path-traversal.tainted-path.tainted-path
200    std::fs::File::create(&validated)
201        .map_err(|e| anyhow!("Failed to create '{}': {}", validated.display(), e))
202}
203
204/// Read a UTF-8 text file only after validating the path.
205pub fn read_to_string_validated(path: &Path) -> Result<String> {
206    let validated = validate_read_path(path)?;
207    // nosemgrep: rust.actix.path-traversal.tainted-path.tainted-path
208    std::fs::read_to_string(&validated)
209        .map_err(|e| anyhow!("Failed to read '{}': {}", validated.display(), e))
210}
211
212/// Read a directory only after validating it as an allowed directory path.
213pub fn read_dir_validated(path: &Path) -> Result<std::fs::ReadDir> {
214    let validated = validate_dir_path(path)?;
215    // nosemgrep: rust.actix.path-traversal.tainted-path.tainted-path
216    std::fs::read_dir(&validated)
217        .map_err(|e| anyhow!("Failed to read dir '{}': {}", validated.display(), e))
218}
219
220// ============================================================================
221// Public API: input validation
222// ============================================================================
223
224/// Validate an agent name against the allowlist.
225///
226/// Prevents command injection by ensuring only known agent binaries
227/// are passed to `Command::new()`.
228pub fn safe_agent_name(name: &str) -> Result<&str> {
229    if ALLOWED_AGENTS.contains(&name) {
230        Ok(name)
231    } else {
232        Err(anyhow!(
233            "Unknown agent: {:?}. Allowed: {}",
234            name,
235            ALLOWED_AGENTS.join(", ")
236        ))
237    }
238}
239
240/// Sanitize a project name used in filesystem paths.
241///
242/// Rejects names containing path separators, traversal sequences,
243/// or control characters.
244pub fn safe_project_name(name: &str) -> Result<&str> {
245    if name.is_empty() {
246        return Err(anyhow!("Project name cannot be empty"));
247    }
248    if contains_traversal(name) || name.contains('/') || name.contains('\\') {
249        return Err(anyhow!("Invalid project name: {:?}", name));
250    }
251    Ok(name)
252}
253
254// ============================================================================
255// Tests
256// ============================================================================
257
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;

    /// Create an empty scratch directory under the system temp dir,
    /// discarding whatever an earlier run may have left behind.
    fn fresh_scratch_dir(name: &str) -> PathBuf {
        let dir = std::env::temp_dir().join(name);
        let _ = fs::remove_dir_all(&dir);
        fs::create_dir_all(&dir).unwrap();
        dir
    }

    /// Create a scratch directory holding a single `test.txt` with `contents`.
    /// Returns `(directory, file)`.
    fn scratch_file(dir_name: &str, contents: &str) -> (PathBuf, PathBuf) {
        let dir = fresh_scratch_dir(dir_name);
        let file = dir.join("test.txt");
        fs::write(&file, contents).unwrap();
        (dir, file)
    }

    #[test]
    fn test_contains_traversal() {
        for flagged in [
            "../etc/passwd",
            "foo/../bar",
            "path\0with\0nulls",
            "line\nbreak",
        ] {
            assert!(contains_traversal(flagged));
        }
        for clean in ["/normal/path", "simple_name", "./relative/path"] {
            assert!(!contains_traversal(clean));
        }
    }

    #[test]
    fn test_contains_traversal_does_not_flag_three_dot_folder() {
        // Regression guard: a directory literally named `...` is not traversal.
        // A broken ingest once wrote `~/.aicx/store/...`, and flagging it took
        // down iteration over the entire corpus.
        for clean in ["...", "/Users/foo/.aicx/store/...", "foo/.../bar"] {
            assert!(!contains_traversal(clean));
        }
    }

    #[test]
    fn test_contains_traversal_does_not_flag_dot_dot_inside_name() {
        // Two dots embedded in an ordinary component are harmless; only a
        // component that is exactly `..` counts as traversal.
        for clean in ["foo..bar", "a..b/c", "normal..text", "/srv/a..b/c"] {
            assert!(!contains_traversal(clean));
        }
    }

    #[test]
    fn test_contains_traversal_carriage_return() {
        let with_cr = "path\rwith\rcr";
        assert!(contains_traversal(with_cr));
    }

    #[test]
    fn test_validate_read_path_existing() {
        let (dir, file) = scratch_file("ai-ctx-san-test-read", "test");

        let outcome = validate_read_path(&file);
        assert!(outcome.is_ok(), "Failed: {:?}", outcome);

        let _ = fs::remove_dir_all(&dir);
    }

    #[test]
    fn test_validate_read_path_traversal() {
        assert!(validate_read_path(Path::new("/tmp/../../../etc/passwd")).is_err());
    }

    #[test]
    fn test_validate_read_path_nonexistent() {
        assert!(validate_read_path(Path::new("/tmp/ai-ctx-nonexistent-12345")).is_err());
    }

    #[test]
    fn test_validate_write_path_new() {
        let dir = std::env::temp_dir().join("ai-ctx-san-test-write");
        let _ = fs::create_dir_all(&dir);

        let outcome = validate_write_path(&dir.join("new.txt"));
        assert!(outcome.is_ok(), "Failed: {:?}", outcome);

        let _ = fs::remove_dir_all(&dir);
    }

    #[test]
    fn test_validate_write_path_traversal() {
        assert!(validate_write_path(Path::new("/tmp/../../../etc/evil.txt")).is_err());
    }

    #[test]
    fn test_validate_write_path_rejects_non_allowed_ancestor() {
        let outside = Path::new("/etc/ai-contexters-test/nope/file.txt");
        assert!(validate_write_path(outside).is_err());
    }

    #[test]
    fn test_validate_write_path_relative_with_missing_parents() {
        let relative = Path::new("target/ai-ctx-sanitize-new/subdir/new.txt");
        assert!(validate_write_path(relative).is_ok());
    }

    #[test]
    fn test_validate_dir_path() {
        assert!(validate_dir_path(&std::env::temp_dir()).is_ok());
    }

    #[test]
    fn test_open_file_validated() {
        use std::io::Read as _;

        let (dir, file) = scratch_file("ai-ctx-san-open-file", "hello");

        let mut handle = open_file_validated(&file).unwrap();
        let mut text = String::new();
        handle.read_to_string(&mut text).unwrap();
        assert_eq!(text, "hello");

        let _ = fs::remove_dir_all(&dir);
    }

    #[test]
    fn test_read_to_string_validated() {
        let (dir, file) = scratch_file("ai-ctx-san-read-string", "hello");

        assert_eq!(read_to_string_validated(&file).unwrap(), "hello");

        let _ = fs::remove_dir_all(&dir);
    }

    #[test]
    fn test_create_file_validated() {
        use std::io::Write as _;

        let dir = fresh_scratch_dir("ai-ctx-san-create-file");
        let target = dir.join("test.txt");

        {
            let mut handle = create_file_validated(&target).unwrap();
            handle.write_all(b"hello").unwrap();
        } // `handle` is dropped here, before the file is read back.

        assert_eq!(fs::read_to_string(&target).unwrap(), "hello");

        let _ = fs::remove_dir_all(&dir);
    }

    #[test]
    fn test_read_dir_validated() {
        let dir = fresh_scratch_dir("ai-ctx-san-read-dir");
        fs::write(dir.join("a.txt"), "a").unwrap();

        let listing = read_dir_validated(&dir).unwrap();
        let readable_entries = listing.filter(|entry| entry.is_ok()).count();
        assert_eq!(readable_entries, 1);

        let _ = fs::remove_dir_all(&dir);
    }

    #[test]
    fn test_safe_agent_name_valid() {
        for agent in [
            "claude",
            "codex",
            "gemini",
            "junie",
            "codescribe",
            "operator-md",
        ] {
            assert_eq!(safe_agent_name(agent).unwrap(), agent);
        }
    }

    #[test]
    fn test_safe_agent_name_rejects_unknown() {
        for bogus in ["rm", "bash", "claude; rm -rf /"] {
            assert!(safe_agent_name(bogus).is_err());
        }
    }

    #[test]
    fn test_safe_project_name_valid() {
        for project in ["my-project", "lbrx-services", "CodeScribe"] {
            assert!(safe_project_name(project).is_ok());
        }
    }
}
448
449// ============================================================================
450// Query normalization (PL/EN diacritics + case folding)
451// ============================================================================
452
/// Fold text into a form suitable for fuzzy matching.
///
/// Polish letters with diacritics (either case) are replaced by their plain
/// ASCII base letter — ą→a, ć→c, ę→e, ł→l, ń→n, ó→o, ś→s, ź→z, ż→z — and the
/// result is then lowercased as a whole string. All other characters pass
/// through the first step untouched.
///
/// This lets "wdrozenie" match "wdrożenie" and "zrodlo" match "źródło".
pub fn normalize_query(text: &str) -> String {
    /// Replace one Polish diacritic letter with its ASCII base letter.
    fn fold_polish(ch: char) -> char {
        match ch {
            'Ą' | 'ą' => 'a',
            'Ć' | 'ć' => 'c',
            'Ę' | 'ę' => 'e',
            'Ł' | 'ł' => 'l',
            'Ń' | 'ń' => 'n',
            'Ó' | 'ó' => 'o',
            'Ś' | 'ś' => 's',
            'Ź' | 'ź' | 'Ż' | 'ż' => 'z',
            other => other,
        }
    }

    let mut folded = String::with_capacity(text.len());
    for ch in text.chars() {
        folded.push(fold_polish(ch));
    }
    // Lowercase the whole string at once (not per character) so that
    // context-sensitive rules in `str::to_lowercase` still apply.
    folded.to_lowercase()
}
473
474// ============================================================================
475// Self-echo filtering (prevents feedback loops)
476// ============================================================================
477
/// Patterns in messages that indicate aicx's own operational traffic.
/// These create feedback loops: search → log → extract → search matches own query.
/// Retired MCP tool names stay here so historical traces remain filterable.
///
/// Matching against these patterns is case-insensitive substring matching
/// (see [`is_self_echo`]).
const SELF_ECHO_PATTERNS: &[&str] = &[
    // MCP tool calls
    "aicx_search",
    "aicx_rank",
    "aicx_refs",
    "aicx_store",
    // Dashboard API calls
    "/api/search/fuzzy",
    "/api/search/semantic",
    "/api/search/cross",
    "/api/health",
    "/api/regenerate",
    "/api/status",
    // MCP JSON-RPC
    "\"method\":\"tools/call\"",
    "\"method\":\"tools/list\"",
    "\"method\":\"initialize\"",
    // CLI self-invocations
    "aicx all -H",
    "aicx all --hours",
    "aicx claude -H",
    "aicx claude --hours",
    "aicx codex -H",
    "aicx codex --hours",
    "aicx gemini -H",
    "aicx gemini --hours",
    "aicx junie -H",
    "aicx junie --hours",
    "aicx store -H",
    "aicx store --hours",
    "aicx rank -p",
    "aicx refs -H",
    "aicx refs --hours",
    "aicx serve",
    "aicx dashboard --generate-html",
    "aicx dashboard --serve",
    "aicx dashboard-serve",
    "aicx reports",
    "aicx reports-extractor",
];

/// Sentinel brackets for aicx read blocks injected by vc-init / vc-agents.
/// Content between these markers is recycled context, not original signal.
const AICX_READ_BEGIN: &str = "【aicx:read】";
const AICX_READ_END: &str = "【/aicx:read】";

/// Returns true if a message is aicx operational self-echo that should be
/// filtered from extraction to prevent feedback loops.
///
/// The message is split into trimmed, non-empty lines. A line counts as echo
/// when it
///
/// * contains the `【aicx:read】` or `【/aicx:read】` marker,
/// * lies between an opening marker and its closing marker, or
/// * contains one of [`SELF_ECHO_PATTERNS`] (case-insensitive).
///
/// The message is self-echo when strictly more than half of its non-empty
/// lines are echo lines. A message with no non-empty lines is never echo.
///
/// A read block that opens and closes on the same line is treated as closed,
/// so it does not cause the lines after it to be counted as block content.
pub fn is_self_echo(message: &str) -> bool {
    // Lowercase the patterns once per call rather than once per line/pattern pair.
    let patterns: Vec<String> = SELF_ECHO_PATTERNS
        .iter()
        .map(|pat| pat.to_lowercase())
        .collect();

    let mut total_lines = 0usize;
    let mut echo_lines = 0usize;
    let mut inside_aicx_block = false;

    for line in message.lines().map(str::trim).filter(|l| !l.is_empty()) {
        total_lines += 1;

        if let Some(begin) = line.rfind(AICX_READ_BEGIN) {
            // The last opening marker on the line decides the state: the block
            // stays open only if no closing marker follows it on this line.
            let after_begin = &line[begin + AICX_READ_BEGIN.len()..];
            inside_aicx_block = !after_begin.contains(AICX_READ_END);
            echo_lines += 1;
            continue;
        }
        if line.contains(AICX_READ_END) {
            inside_aicx_block = false;
            echo_lines += 1;
            continue;
        }
        if inside_aicx_block {
            echo_lines += 1;
            continue;
        }

        let lower = line.to_lowercase();
        if patterns.iter().any(|pat| lower.contains(pat.as_str())) {
            echo_lines += 1;
        }
    }

    // Strict majority (>50%). This is also false for an empty message,
    // where both counters are zero.
    echo_lines * 2 > total_lines
}
574
575/// Filter a vec of timeline entries, removing self-echo messages.
576pub fn filter_self_echo<T>(entries: Vec<T>, get_message: impl Fn(&T) -> &str) -> Vec<T> {
577    entries
578        .into_iter()
579        .filter(|e| !is_self_echo(get_message(e)))
580        .collect()
581}
582
#[cfg(test)]
mod echo_tests {
    use super::*;

    #[test]
    fn test_normal_message_not_echo() {
        for ordinary in [
            "Fix the login regression in auth middleware",
            "Decision: use per-chunk scoring",
            "TODO: add tests for edge cases",
        ] {
            assert!(!is_self_echo(ordinary));
        }
    }

    #[test]
    fn test_search_call_is_echo() {
        let rpc_call = r#"{"jsonrpc":"2.0","id":3,"method":"tools/call","params":{"name":"aicx_search","arguments":{"query":"deploy vistacare"}}}"#;
        assert!(is_self_echo(rpc_call));
    }

    #[test]
    fn test_api_call_is_echo() {
        let curl_call =
            r#"curl -s "http://127.0.0.1:8033/api/search/fuzzy?q=deploy+vistacare&limit=3""#;
        assert!(is_self_echo(curl_call));
    }

    #[test]
    fn test_cli_self_invocation_is_echo() {
        for invocation in [
            "aicx all -H 24 --emit none",
            "aicx store -H 24 --full-rescan",
            "aicx store --hours 24",
            "aicx rank -p ai-contexters -H 72 --strict",
            "aicx dashboard --generate-html -p ai-contexters -H 24",
            "aicx reports --repo ai-contexters --workflow marbles",
        ] {
            assert!(is_self_echo(invocation));
        }
    }

    #[test]
    fn test_mention_in_larger_message_not_echo() {
        // Naming an aicx tool once in the middle of a real discussion must
        // not get the whole message filtered.
        let discussion = [
            "We should add aicx_search to the MCP server.",
            "The architecture looks clean.",
            "Let's proceed with implementation.",
            "Decision: expose 4 tools via rmcp.",
        ]
        .join("\n");
        assert!(!is_self_echo(&discussion));
    }
}
632
#[cfg(test)]
mod normalize_tests {
    use super::*;

    #[test]
    fn test_normalize_query_strips_diacritics() {
        // (input, expected normalized form)
        let cases = [
            ("wdrożenie", "wdrozenie"),
            ("źródło ŁĄCZNOŚCI", "zrodlo lacznosci"),
            ("Deploy Vista", "deploy vista"),
            ("ąćęłńóśźż", "acelnoszz"),
        ];
        for (input, expected) in cases {
            assert_eq!(normalize_query(input), expected);
        }
    }

    #[test]
    fn test_safe_project_name_rejects_bad() {
        for rejected in ["../etc", "foo/bar", "", "foo\0bar"] {
            assert!(safe_project_name(rejected).is_err());
        }
    }
}