Skip to main content

aicx_parser/
sanitize.rs

1//! Path and input sanitization for ai-contexters.
2//!
3//! Follows the established pattern:
4//! traversal check → canonicalize → allowlist validation.
5//!
6//! Prevents path traversal and command injection from user-supplied inputs
7//! (CLI arguments, project names, agent names).
8//!
9//! Vibecrafted with AI Agents by VetCoders (c)2026 VetCoders
10
11use anyhow::{Result, anyhow};
12use std::path::{Component, Path, PathBuf};
13
/// Known safe extractor agent names.
///
/// This is the allowlist consulted by `safe_agent_name`: a name is accepted
/// only if it equals one of these entries exactly (case-sensitive).
const ALLOWED_AGENTS: &[&str] = &["claude", "codex", "gemini", "junie"];
16
17// ============================================================================
18// Core helpers (mirroring rmcp-memex pattern)
19// ============================================================================
20
/// Report whether `path` holds a genuine traversal or an embedded control byte.
///
/// Only a standalone `..` component (as in `../x` or `a/../b`) counts as
/// traversal. Names that merely contain dots — `...`, `foo..bar`, `a..b/c` —
/// are ordinary components and are accepted; a plain substring check on `..`
/// wrongly rejected them and broke corpus iteration over a literal three-dot
/// folder. NUL, line feed and carriage return are always rejected.
fn contains_traversal(path: &str) -> bool {
    let has_control_byte = path.chars().any(|ch| matches!(ch, '\0' | '\n' | '\r'));
    if has_control_byte {
        return true;
    }

    // Walk the parsed components so that only a real `..` segment is flagged.
    for part in Path::new(path).components() {
        if part == Component::ParentDir {
            return true;
        }
    }
    false
}
37
38/// Get the user's home directory.
39fn home_dir() -> Result<PathBuf> {
40    std::env::var("HOME")
41        .map(PathBuf::from)
42        .map_err(|_| anyhow!("Cannot determine home directory from $HOME"))
43}
44
45/// Canonicalize a path, returning error if it doesn't exist.
46fn canonicalize_existing(path: &Path) -> Result<PathBuf> {
47    path.canonicalize()
48        .map_err(|e| anyhow!("Cannot canonicalize path '{}': {}", path.display(), e))
49}
50
51/// Validate that a path is under an allowed base directory.
52fn is_under_allowed_base(path: &Path) -> Result<bool> {
53    let home = home_dir()?;
54
55    if path.starts_with(&home) {
56        return Ok(true);
57    }
58
59    #[cfg(target_os = "macos")]
60    if path.starts_with("/Users") {
61        let components: Vec<_> = path.components().collect();
62        if components.len() >= 3 {
63            return Ok(true);
64        }
65    }
66
67    // Temporary directories (tests)
68    if path.starts_with("/tmp")
69        || path.starts_with("/var/folders")
70        || path.starts_with("/private/tmp")
71        || path.starts_with("/private/var/folders")
72    {
73        return Ok(true);
74    }
75
76    Ok(false)
77}
78
79// ============================================================================
80// Public API: path validation
81// ============================================================================
82
83/// Sanitize and validate a path that must exist (for reading).
84///
85/// Traversal check → canonicalize → allowlist.
86pub fn validate_read_path(path: &Path) -> Result<PathBuf> {
87    let path_str = path.to_string_lossy();
88    if contains_traversal(&path_str) {
89        return Err(anyhow!(
90            "Path contains invalid traversal sequence: {}",
91            path_str
92        ));
93    }
94
95    if !path.exists() {
96        return Err(anyhow!("Path does not exist: {}", path.display()));
97    }
98
99    let canonical = canonicalize_existing(path)?;
100
101    if !is_under_allowed_base(&canonical)? {
102        return Err(anyhow!(
103            "Cannot read from path outside allowed directories: {}",
104            canonical.display()
105        ));
106    }
107
108    Ok(canonical)
109}
110
111/// Sanitize and validate a path for writing (may not exist yet).
112///
113/// Traversal check → validate parent → allowlist.
114pub fn validate_write_path(path: &Path) -> Result<PathBuf> {
115    let path_str = path.to_string_lossy();
116    if contains_traversal(&path_str) {
117        return Err(anyhow!(
118            "Path contains invalid traversal sequence: {}",
119            path_str
120        ));
121    }
122
123    if path.exists() {
124        let canonical = canonicalize_existing(path)?;
125        if !is_under_allowed_base(&canonical)? {
126            return Err(anyhow!(
127                "Cannot write to path outside allowed directories: {}",
128                canonical.display()
129            ));
130        }
131        return Ok(canonical);
132    }
133
134    // New path — walk ancestors until we find an existing base directory and validate it.
135    let candidate = if path.is_absolute() {
136        path.to_path_buf()
137    } else {
138        std::env::current_dir()
139            .map_err(|e| anyhow!("Cannot determine current directory: {}", e))?
140            .join(path)
141    };
142
143    let mut ancestor = Some(candidate.as_path());
144    let mut existing_ancestor = None;
145    while let Some(current) = ancestor {
146        if current.exists() {
147            existing_ancestor = Some(canonicalize_existing(current)?);
148            break;
149        }
150        ancestor = current.parent();
151    }
152
153    let canonical_base = existing_ancestor.ok_or_else(|| {
154        anyhow!(
155            "Cannot validate write path '{}': no existing ancestor found",
156            path.display()
157        )
158    })?;
159
160    if !is_under_allowed_base(&canonical_base)? {
161        return Err(anyhow!(
162            "Path '{}' would be created outside allowed directories",
163            path.display()
164        ));
165    }
166
167    Ok(path.to_path_buf())
168}
169
170/// Sanitize a directory path used for reading (e.g., chunks_dir, contexts_dir).
171///
172/// Traversal check → canonicalize → allowlist. Must be a directory.
173pub fn validate_dir_path(path: &Path) -> Result<PathBuf> {
174    let validated = validate_read_path(path)?;
175    if !validated.is_dir() {
176        return Err(anyhow!("Path is not a directory: {}", validated.display()));
177    }
178    Ok(validated)
179}
180
181/// Open a file for reading only after validating the path.
182pub fn open_file_validated(path: &Path) -> Result<std::fs::File> {
183    let validated = validate_read_path(path)?;
184    // nosemgrep: rust.actix.path-traversal.tainted-path.tainted-path
185    std::fs::File::open(&validated)
186        .map_err(|e| anyhow!("Failed to open '{}': {}", validated.display(), e))
187}
188
189/// Create or truncate a file only after validating the write path.
190pub fn create_file_validated(path: &Path) -> Result<std::fs::File> {
191    let validated = validate_write_path(path)?;
192    // nosemgrep: rust.actix.path-traversal.tainted-path.tainted-path
193    std::fs::File::create(&validated)
194        .map_err(|e| anyhow!("Failed to create '{}': {}", validated.display(), e))
195}
196
197/// Read a UTF-8 text file only after validating the path.
198pub fn read_to_string_validated(path: &Path) -> Result<String> {
199    let validated = validate_read_path(path)?;
200    // nosemgrep: rust.actix.path-traversal.tainted-path.tainted-path
201    std::fs::read_to_string(&validated)
202        .map_err(|e| anyhow!("Failed to read '{}': {}", validated.display(), e))
203}
204
205/// Read a directory only after validating it as an allowed directory path.
206pub fn read_dir_validated(path: &Path) -> Result<std::fs::ReadDir> {
207    let validated = validate_dir_path(path)?;
208    // nosemgrep: rust.actix.path-traversal.tainted-path.tainted-path
209    std::fs::read_dir(&validated)
210        .map_err(|e| anyhow!("Failed to read dir '{}': {}", validated.display(), e))
211}
212
213// ============================================================================
214// Public API: input validation
215// ============================================================================
216
217/// Validate an agent name against the allowlist.
218///
219/// Prevents command injection by ensuring only known agent binaries
220/// are passed to `Command::new()`.
221pub fn safe_agent_name(name: &str) -> Result<&str> {
222    if ALLOWED_AGENTS.contains(&name) {
223        Ok(name)
224    } else {
225        Err(anyhow!(
226            "Unknown agent: {:?}. Allowed: {}",
227            name,
228            ALLOWED_AGENTS.join(", ")
229        ))
230    }
231}
232
233/// Sanitize a project name used in filesystem paths.
234///
235/// Rejects names containing path separators, traversal sequences,
236/// or control characters.
237pub fn safe_project_name(name: &str) -> Result<&str> {
238    if name.is_empty() {
239        return Err(anyhow!("Project name cannot be empty"));
240    }
241    if contains_traversal(name) || name.contains('/') || name.contains('\\') {
242        return Err(anyhow!("Invalid project name: {:?}", name));
243    }
244    Ok(name)
245}
246
247// ============================================================================
248// Tests
249// ============================================================================
250
// Unit tests for the traversal check, the path validators, the validated
// filesystem wrappers and the agent/project name checks. The filesystem tests
// create fixed-name scratch directories under `std::env::temp_dir()` and
// remove them again on the way out.
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;

    // Baseline: `..` components and NUL / newline are flagged; plain absolute,
    // bare and `./`-relative paths are not.
    #[test]
    fn test_contains_traversal() {
        assert!(contains_traversal("../etc/passwd"));
        assert!(contains_traversal("foo/../bar"));
        assert!(contains_traversal("path\0with\0nulls"));
        assert!(contains_traversal("line\nbreak"));
        assert!(!contains_traversal("/normal/path"));
        assert!(!contains_traversal("simple_name"));
        assert!(!contains_traversal("./relative/path"));
    }

    #[test]
    fn test_contains_traversal_does_not_flag_three_dot_folder() {
        // Regression: a literal `...` directory name (yes, it happens — we had
        // a broken ingest that wrote `~/.aicx/store/...`) is NOT path traversal
        // and must not nuke the entire corpus iteration.
        assert!(!contains_traversal("..."));
        assert!(!contains_traversal("/Users/foo/.aicx/store/..."));
        assert!(!contains_traversal("foo/.../bar"));
    }

    #[test]
    fn test_contains_traversal_does_not_flag_dot_dot_inside_name() {
        // `..` as a substring inside a normal component is fine; only a
        // standalone `..` component is genuine traversal.
        assert!(!contains_traversal("foo..bar"));
        assert!(!contains_traversal("a..b/c"));
        assert!(!contains_traversal("normal..text"));
        assert!(!contains_traversal("/srv/a..b/c"));
    }

    // Carriage return is rejected just like NUL and line feed.
    #[test]
    fn test_contains_traversal_carriage_return() {
        assert!(contains_traversal("path\rwith\rcr"));
    }

    // Happy path: an existing file under the temp directory validates.
    // Assumes the platform temp directory resolves under one of the allowed
    // bases — TODO confirm for non-default TMPDIR settings.
    #[test]
    fn test_validate_read_path_existing() {
        let tmp = std::env::temp_dir().join("ai-ctx-san-test-read");
        let _ = fs::remove_dir_all(&tmp);
        fs::create_dir_all(&tmp).unwrap();
        let test_file = tmp.join("test.txt");
        fs::write(&test_file, "test").unwrap();

        let result = validate_read_path(&test_file);
        assert!(result.is_ok(), "Failed: {:?}", result);

        let _ = fs::remove_dir_all(&tmp);
    }

    // A path with `..` components is refused by the traversal check.
    #[test]
    fn test_validate_read_path_traversal() {
        let bad = Path::new("/tmp/../../../etc/passwd");
        assert!(validate_read_path(bad).is_err());
    }

    // Reading requires the path to exist.
    #[test]
    fn test_validate_read_path_nonexistent() {
        let missing = Path::new("/tmp/ai-ctx-nonexistent-12345");
        assert!(validate_read_path(missing).is_err());
    }

    // Writing a not-yet-existing file inside an existing allowed directory.
    #[test]
    fn test_validate_write_path_new() {
        let tmp = std::env::temp_dir().join("ai-ctx-san-test-write");
        let _ = fs::create_dir_all(&tmp);
        let new_file = tmp.join("new.txt");
        let result = validate_write_path(&new_file);
        assert!(result.is_ok(), "Failed: {:?}", result);
        let _ = fs::remove_dir_all(&tmp);
    }

    // A path with `..` components is refused for writing as well.
    #[test]
    fn test_validate_write_path_traversal() {
        let bad = Path::new("/tmp/../../../etc/evil.txt");
        assert!(validate_write_path(bad).is_err());
    }

    // Presumably none of these directories exist, so the nearest existing
    // ancestor is `/etc` (or `/`), which is outside the allowlist.
    #[test]
    fn test_validate_write_path_rejects_non_allowed_ancestor() {
        let bad = Path::new("/etc/ai-contexters-test/nope/file.txt");
        assert!(validate_write_path(bad).is_err());
    }

    // Relative path whose parents are missing: validation walks up from the
    // current directory. Assumes the test runs from a working directory under
    // an allowed base (e.g. a checkout inside $HOME) — TODO confirm for CI.
    #[test]
    fn test_validate_write_path_relative_with_missing_parents() {
        let nested = Path::new("target/ai-ctx-sanitize-new/subdir/new.txt");
        assert!(validate_write_path(nested).is_ok());
    }

    // The temp directory itself is an acceptable directory path.
    #[test]
    fn test_validate_dir_path() {
        let tmp = std::env::temp_dir();
        assert!(validate_dir_path(&tmp).is_ok());
    }

    // Round trip: write with std, read back through the validated opener.
    #[test]
    fn test_open_file_validated() {
        let tmp = std::env::temp_dir().join("ai-ctx-san-open-file");
        let _ = fs::remove_dir_all(&tmp);
        fs::create_dir_all(&tmp).unwrap();
        let test_file = tmp.join("test.txt");
        fs::write(&test_file, "hello").unwrap();

        let mut opened = open_file_validated(&test_file).unwrap();
        let mut content = String::new();
        use std::io::Read as _;
        opened.read_to_string(&mut content).unwrap();
        assert_eq!(content, "hello");

        let _ = fs::remove_dir_all(&tmp);
    }

    // Round trip through the validated whole-file reader.
    #[test]
    fn test_read_to_string_validated() {
        let tmp = std::env::temp_dir().join("ai-ctx-san-read-string");
        let _ = fs::remove_dir_all(&tmp);
        fs::create_dir_all(&tmp).unwrap();
        let test_file = tmp.join("test.txt");
        fs::write(&test_file, "hello").unwrap();

        let content = read_to_string_validated(&test_file).unwrap();
        assert_eq!(content, "hello");

        let _ = fs::remove_dir_all(&tmp);
    }

    // Round trip: create through the validated creator, read back with std.
    #[test]
    fn test_create_file_validated() {
        let tmp = std::env::temp_dir().join("ai-ctx-san-create-file");
        let _ = fs::remove_dir_all(&tmp);
        fs::create_dir_all(&tmp).unwrap();
        let test_file = tmp.join("test.txt");

        let mut created = create_file_validated(&test_file).unwrap();
        use std::io::Write as _;
        created.write_all(b"hello").unwrap();
        // Close the handle before reading the file back.
        drop(created);

        let content = fs::read_to_string(&test_file).unwrap();
        assert_eq!(content, "hello");

        let _ = fs::remove_dir_all(&tmp);
    }

    // A directory holding exactly one file lists exactly one entry.
    #[test]
    fn test_read_dir_validated() {
        let tmp = std::env::temp_dir().join("ai-ctx-san-read-dir");
        let _ = fs::remove_dir_all(&tmp);
        fs::create_dir_all(&tmp).unwrap();
        fs::write(tmp.join("a.txt"), "a").unwrap();

        let entries = read_dir_validated(&tmp)
            .unwrap()
            .filter_map(|entry| entry.ok())
            .count();
        assert_eq!(entries, 1);

        let _ = fs::remove_dir_all(&tmp);
    }

    // Every allowlisted agent name is returned unchanged.
    #[test]
    fn test_safe_agent_name_valid() {
        assert_eq!(safe_agent_name("claude").unwrap(), "claude");
        assert_eq!(safe_agent_name("codex").unwrap(), "codex");
        assert_eq!(safe_agent_name("gemini").unwrap(), "gemini");
        assert_eq!(safe_agent_name("junie").unwrap(), "junie");
    }

    // Anything off the allowlist is refused, including an allowed name with a
    // shell payload appended.
    #[test]
    fn test_safe_agent_name_rejects_unknown() {
        assert!(safe_agent_name("rm").is_err());
        assert!(safe_agent_name("bash").is_err());
        assert!(safe_agent_name("claude; rm -rf /").is_err());
    }

    // Ordinary project names pass.
    #[test]
    fn test_safe_project_name_valid() {
        assert!(safe_project_name("my-project").is_ok());
        assert!(safe_project_name("lbrx-services").is_ok());
        assert!(safe_project_name("CodeScribe").is_ok());
    }
}
439
440// ============================================================================
441// Query normalization (PL/EN diacritics + case folding)
442// ============================================================================
443
/// Fold text for fuzzy matching: Polish diacritics become plain ASCII letters
/// and the result is lowercased.
///
/// Mapping (both cases): ą→a, ć→c, ę→e, ł→l, ń→n, ó→o, ś→s, ź→z, ż→z.
/// This lets "wdrozenie" match "wdrożenie" and "zrodlo" match "źródło".
/// Characters outside this table are only lowercased.
pub fn normalize_query(text: &str) -> String {
    // Replace one Polish diacritic (either case) with its base ASCII letter;
    // every other character passes through untouched.
    fn strip_polish(ch: char) -> char {
        match ch {
            'ą' | 'Ą' => 'a',
            'ć' | 'Ć' => 'c',
            'ę' | 'Ę' => 'e',
            'ł' | 'Ł' => 'l',
            'ń' | 'Ń' => 'n',
            'ó' | 'Ó' => 'o',
            'ś' | 'Ś' => 's',
            'ź' | 'Ź' | 'ż' | 'Ż' => 'z',
            other => other,
        }
    }

    let mut stripped = String::with_capacity(text.len());
    for ch in text.chars() {
        stripped.push(strip_polish(ch));
    }
    stripped.to_lowercase()
}
464
465// ============================================================================
466// Self-echo filtering (prevents feedback loops)
467// ============================================================================
468
/// Patterns in messages that indicate aicx's own operational traffic.
/// These create feedback loops: search → log → extract → search matches own query.
/// Retired MCP tool names stay here so historical traces remain filterable.
/// Matching against these is case-insensitive (see `is_self_echo`).
const SELF_ECHO_PATTERNS: &[&str] = &[
    // MCP tool calls
    "aicx_search",
    "aicx_rank",
    "aicx_refs",
    "aicx_store",
    // Dashboard API calls
    "/api/search/fuzzy",
    "/api/search/semantic",
    "/api/search/cross",
    "/api/health",
    "/api/regenerate",
    "/api/status",
    // MCP JSON-RPC
    "\"method\":\"tools/call\"",
    "\"method\":\"tools/list\"",
    "\"method\":\"initialize\"",
    // CLI self-invocations
    "aicx all -H",
    "aicx all --hours",
    "aicx claude -H",
    "aicx claude --hours",
    "aicx codex -H",
    "aicx codex --hours",
    "aicx gemini -H",
    "aicx gemini --hours",
    "aicx junie -H",
    "aicx junie --hours",
    "aicx store -H",
    "aicx store --hours",
    "aicx rank -p",
    "aicx refs -H",
    "aicx refs --hours",
    "aicx serve",
    "aicx dashboard --generate-html",
    "aicx dashboard --serve",
    "aicx dashboard-serve",
    "aicx reports",
    "aicx reports-extractor",
];

/// Sentinel brackets for aicx read blocks injected by vc-init / vc-agents.
/// Content between these markers is recycled context, not original signal.
const AICX_READ_BEGIN: &str = "【aicx:read】";
const AICX_READ_END: &str = "【/aicx:read】";

/// Returns true if a message is aicx operational self-echo that should be
/// filtered from extraction to prevent feedback loops.
///
/// Only non-empty lines (after trimming) are considered. A line counts as
/// echo when it
/// * carries a 【aicx:read】 or 【/aicx:read】 marker,
/// * lies inside an open 【aicx:read】...【/aicx:read】 block, or
/// * contains one of `SELF_ECHO_PATTERNS`, compared case-insensitively.
///
/// The message is self-echo when there is at least one echo line and echo
/// lines make up **at least half** of the non-empty lines. An empty message
/// is never self-echo.
///
/// A block that opens and closes on the same line is treated as closed, so
/// the lines following it are judged on their own content.
pub fn is_self_echo(message: &str) -> bool {
    // Lowercase the pattern table once per call rather than once per
    // (line, pattern) pair.
    let patterns: Vec<String> = SELF_ECHO_PATTERNS
        .iter()
        .map(|pat| pat.to_lowercase())
        .collect();

    let mut total_lines = 0usize;
    let mut echo_lines = 0usize;
    let mut inside_aicx_block = false;

    for line in message.lines().map(str::trim).filter(|l| !l.is_empty()) {
        total_lines += 1;

        let last_begin = line.rfind(AICX_READ_BEGIN);
        let last_end = line.rfind(AICX_READ_END);
        if last_begin.is_some() || last_end.is_some() {
            // Whichever marker appears last on the line decides the state:
            // `None < Some(_)`, so begin-only opens the block, end-only closes
            // it, and "begin … end" on one line leaves it closed.
            inside_aicx_block = last_begin > last_end;
            echo_lines += 1;
            continue;
        }

        if inside_aicx_block {
            echo_lines += 1;
            continue;
        }

        let lower = line.to_lowercase();
        if patterns.iter().any(|pat| lower.contains(pat.as_str())) {
            echo_lines += 1;
        }
    }

    // Self-echo when echo lines are at least half of all non-empty lines.
    echo_lines > 0 && echo_lines * 2 >= total_lines
}
565
566/// Filter a vec of timeline entries, removing self-echo messages.
567pub fn filter_self_echo<T>(entries: Vec<T>, get_message: impl Fn(&T) -> &str) -> Vec<T> {
568    entries
569        .into_iter()
570        .filter(|e| !is_self_echo(get_message(e)))
571        .collect()
572}
573
// Unit tests for `is_self_echo`: ordinary prose must pass through, while MCP
// JSON-RPC calls, dashboard API calls and aicx CLI invocations are flagged.
#[cfg(test)]
mod echo_tests {
    use super::*;

    // Single-line messages with no pattern match are never echo.
    #[test]
    fn test_normal_message_not_echo() {
        assert!(!is_self_echo("Fix the login regression in auth middleware"));
        assert!(!is_self_echo("Decision: use per-chunk scoring"));
        assert!(!is_self_echo("TODO: add tests for edge cases"));
    }

    // A JSON-RPC `tools/call` naming `aicx_search` matches the pattern list.
    #[test]
    fn test_search_call_is_echo() {
        assert!(is_self_echo(
            r#"{"jsonrpc":"2.0","id":3,"method":"tools/call","params":{"name":"aicx_search","arguments":{"query":"deploy vistacare"}}}"#
        ));
    }

    // A curl against the dashboard's fuzzy-search endpoint matches too.
    #[test]
    fn test_api_call_is_echo() {
        assert!(is_self_echo(
            r#"curl -s "http://127.0.0.1:8033/api/search/fuzzy?q=deploy+vistacare&limit=3""#
        ));
    }

    // Each of these one-line CLI invocations contains a listed pattern.
    #[test]
    fn test_cli_self_invocation_is_echo() {
        assert!(is_self_echo("aicx all -H 24 --emit none"));
        assert!(is_self_echo("aicx store -H 24 --full-rescan"));
        assert!(is_self_echo("aicx store --hours 24"));
        assert!(is_self_echo("aicx rank -p ai-contexters -H 72 --strict"));
        assert!(is_self_echo(
            "aicx dashboard --generate-html -p ai-contexters -H 24"
        ));
        assert!(is_self_echo(
            "aicx reports --repo ai-contexters --workflow marbles"
        ));
    }

    // One matching line out of four stays below the filtering threshold.
    #[test]
    fn test_mention_in_larger_message_not_echo() {
        // Mere mention of aicx in a discussion should NOT be filtered
        let msg = "We should add aicx_search to the MCP server.\n\
                   The architecture looks clean.\n\
                   Let's proceed with implementation.\n\
                   Decision: expose 4 tools via rmcp.";
        assert!(!is_self_echo(msg));
    }
}
623
// Unit tests for `normalize_query`, plus one `safe_project_name` rejection
// test that lives in this module.
#[cfg(test)]
mod normalize_tests {
    use super::*;

    // Polish diacritics fold to ASCII and everything is lowercased.
    #[test]
    fn test_normalize_query_strips_diacritics() {
        assert_eq!(normalize_query("wdrożenie"), "wdrozenie");
        assert_eq!(normalize_query("źródło ŁĄCZNOŚCI"), "zrodlo lacznosci");
        assert_eq!(normalize_query("Deploy Vista"), "deploy vista");
        assert_eq!(normalize_query("ąćęłńóśźż"), "acelnoszz");
    }

    // NOTE(review): this exercises `safe_project_name`, not query
    // normalization; it would sit more naturally next to
    // `test_safe_project_name_valid` in the main `tests` module.
    #[test]
    fn test_safe_project_name_rejects_bad() {
        assert!(safe_project_name("../etc").is_err());
        assert!(safe_project_name("foo/bar").is_err());
        assert!(safe_project_name("").is_err());
        assert!(safe_project_name("foo\0bar").is_err());
    }
}
643}