Skip to main content

zeph_core/
instructions.rs

1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4use std::collections::HashSet;
5use std::io::Read as _;
6use std::path::{Path, PathBuf};
7use std::time::Duration;
8
9use notify_debouncer_mini::{DebouncedEventKind, new_debouncer};
10use tokio::sync::mpsc;
11
12use crate::config::ProviderKind;
13
/// Notification emitted by the instruction watcher when instruction files may need
/// to be reloaded.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum InstructionEvent {
    /// A debounced batch of filesystem events contained at least one `.md` path.
    Changed,
}
17
/// Handle returned by `InstructionWatcher::start` for a running instruction-file watcher.
///
/// The only state is the join handle of the forwarding task spawned in `start`.
///
/// NOTE(review): no `Drop` impl is visible in this file, and dropping a tokio
/// `JoinHandle` detaches the task rather than cancelling it — confirm whether dropping
/// this value is expected to stop the watcher.
pub struct InstructionWatcher {
    /// Join handle of the spawned forwarding task; held but never read.
    _handle: tokio::task::JoinHandle<()>,
}
21
22impl InstructionWatcher {
23    /// Start watching directories for instruction file (.md) changes.
24    ///
25    /// Sends `InstructionEvent::Changed` on any `.md` filesystem change (debounced 500ms).
26    ///
27    /// # Errors
28    ///
29    /// Returns an error if the filesystem watcher cannot be initialized.
30    pub fn start(
31        paths: &[PathBuf],
32        tx: mpsc::Sender<InstructionEvent>,
33    ) -> Result<Self, notify::Error> {
34        let (notify_tx, mut notify_rx) = mpsc::channel(16);
35
36        let mut debouncer = new_debouncer(
37            Duration::from_millis(500),
38            move |events: Result<Vec<notify_debouncer_mini::DebouncedEvent>, notify::Error>| {
39                let events = match events {
40                    Ok(events) => events,
41                    Err(e) => {
42                        tracing::warn!("instruction watcher error: {e}");
43                        return;
44                    }
45                };
46
47                let has_md_change = events.iter().any(|e| {
48                    e.kind == DebouncedEventKind::Any
49                        && e.path.extension().is_some_and(|ext| ext == "md")
50                });
51
52                if has_md_change {
53                    let _ = notify_tx.try_send(());
54                }
55            },
56        )?;
57
58        for path in paths {
59            if path.exists()
60                && let Err(e) = debouncer
61                    .watcher()
62                    .watch(path, notify::RecursiveMode::NonRecursive)
63            {
64                tracing::warn!(path = %path.display(), error = %e, "failed to watch instruction path");
65            }
66        }
67
68        tracing::debug!(paths = paths.len(), "starting instruction watcher");
69        let handle = tokio::spawn(async move {
70            let _debouncer = debouncer;
71            while notify_rx.recv().await.is_some() {
72                tracing::debug!("instruction file change detected, signaling reload");
73                if tx.send(InstructionEvent::Changed).await.is_err() {
74                    break;
75                }
76            }
77        });
78
79        Ok(Self { _handle: handle })
80    }
81}
82
/// Parameters needed to re-run `load_instructions()` on hot-reload.
///
/// Each field has the same name and type as the corresponding `load_instructions`
/// argument, stored in owned form.
pub struct InstructionReloadState {
    /// Project root used as `base_dir` for candidate discovery.
    pub base_dir: PathBuf,
    /// Provider kinds whose detection paths are considered when `auto_detect` is set.
    pub provider_kinds: Vec<ProviderKind>,
    /// Additional instruction files listed explicitly (absolute or relative to `base_dir`).
    pub explicit_files: Vec<PathBuf>,
    /// Whether provider-specific files are auto-detected.
    pub auto_detect: bool,
}
90
/// Maximum size, in bytes, of a single instruction file. Files exceeding this limit are skipped.
///
/// `load_instructions` compares this against the length reported by the file's metadata.
const MAX_FILE_SIZE: u64 = 256 * 1024; // 256 KiB
93
/// A loaded instruction block from a single file.
#[derive(Debug, Clone)]
pub struct InstructionBlock {
    /// Path of the source file as it was discovered by `load_instructions` (the
    /// candidate path, not the canonicalized one). It is absolute whenever the
    /// `base_dir` or explicit path it was built from is absolute.
    pub source: PathBuf,
    /// UTF-8 text content of the file. `load_instructions` never stores an empty
    /// string or one containing NUL bytes here.
    pub content: String,
}
102
103/// Load instruction blocks from provider-specific and explicit files.
104///
105/// `base_dir` is resolved as the process working directory at startup via
106/// `std::env::current_dir()`. This matches the directory from which the user
107/// launches `zeph` and is therefore the most natural project root for file
108/// discovery. Non-git projects are fully supported; git root is not used.
109///
110/// Candidate paths are collected in this order:
111/// 1. Always: `base_dir/zeph.md` and `base_dir/.zeph/zeph.md`.
112/// 2. If `auto_detect`, per-provider paths from `detection_paths()` for each kind.
113/// 3. `explicit_files` as provided (trusted — user controls config.toml).
114///
115/// Deduplication uses `fs::canonicalize`. Paths that do not exist are silently
116/// skipped; canonicalize fails on nonexistent paths, so they cannot be deduped
117/// via symlinks against existing paths — this is an acceptable edge case documented here.
118pub fn load_instructions(
119    base_dir: &Path,
120    provider_kinds: &[ProviderKind],
121    explicit_files: &[PathBuf],
122    auto_detect: bool,
123) -> Vec<InstructionBlock> {
124    let canonical_base = match std::fs::canonicalize(base_dir) {
125        Ok(c) => c,
126        Err(e) => {
127            tracing::warn!(path = %base_dir.display(), error = %e, "failed to canonicalize base_dir, skipping all instruction files");
128            return Vec::new();
129        }
130    };
131
132    let mut candidates: Vec<PathBuf> = Vec::new();
133
134    // zeph.md is always checked regardless of provider or auto_detect setting.
135    candidates.push(base_dir.join("zeph.md"));
136    candidates.push(base_dir.join(".zeph").join("zeph.md"));
137
138    if auto_detect {
139        for &kind in provider_kinds {
140            candidates.extend(detection_paths(kind, base_dir));
141        }
142    }
143
144    // Explicit files are trusted (user controls config). Resolve relative to base_dir.
145    for p in explicit_files {
146        if p.is_absolute() {
147            candidates.push(p.clone());
148        } else {
149            candidates.push(base_dir.join(p));
150        }
151    }
152
153    // Deduplicate by canonical path. Only existing paths can be canonicalized.
154    let mut seen: HashSet<PathBuf> = HashSet::new();
155    let mut result: Vec<InstructionBlock> = Vec::new();
156
157    for path in candidates {
158        // Canonicalize first to resolve symlinks before opening — eliminates TOCTOU race.
159        // Nonexistent or unreadable paths are silently skipped.
160        let Ok(canonical) = std::fs::canonicalize(&path) else {
161            continue;
162        };
163
164        if !canonical.starts_with(&canonical_base) {
165            tracing::warn!(path = %canonical.display(), "instruction file escapes project root, skipping");
166            continue;
167        }
168
169        if !seen.insert(canonical.clone()) {
170            // Already loaded this path via a different candidate or symlink.
171            continue;
172        }
173
174        // Open the canonical path after boundary check — no TOCTOU window for symlink swap.
175        let Ok(file) = std::fs::File::open(&canonical) else {
176            continue;
177        };
178
179        let meta = match file.metadata() {
180            Ok(m) => m,
181            Err(e) => {
182                tracing::warn!(path = %path.display(), error = %e, "failed to read instruction file metadata, skipping");
183                continue;
184            }
185        };
186
187        if !meta.is_file() {
188            continue;
189        }
190
191        if meta.len() > MAX_FILE_SIZE {
192            tracing::warn!(
193                path = %path.display(),
194                size = meta.len(),
195                limit = MAX_FILE_SIZE,
196                "instruction file exceeds 256 KiB size limit, skipping"
197            );
198            continue;
199        }
200
201        let mut content = String::new();
202        match std::io::BufReader::new(file).read_to_string(&mut content) {
203            Ok(_) => {}
204            Err(e) => {
205                tracing::warn!(path = %path.display(), error = %e, "failed to read instruction file, skipping");
206                continue;
207            }
208        }
209
210        if content.contains('\0') {
211            tracing::warn!(path = %path.display(), "instruction file contains null bytes, skipping");
212            continue;
213        }
214
215        if content.is_empty() {
216            tracing::debug!(path = %path.display(), "instruction file is empty, skipping");
217            continue;
218        }
219
220        tracing::debug!(path = %path.display(), bytes = content.len(), "loaded instruction file");
221        result.push(InstructionBlock {
222            source: path,
223            content,
224        });
225    }
226
227    result
228}
229
230/// Returns candidate file paths for a given provider.
231///
232/// Uses an exhaustive match — adding a new `ProviderKind` variant will cause
233/// a compile error here, forcing the developer to update the detection table.
234fn detection_paths(kind: ProviderKind, base: &Path) -> Vec<PathBuf> {
235    match kind {
236        ProviderKind::Claude => {
237            let mut paths = vec![
238                base.join("CLAUDE.md"),
239                base.join(".claude").join("CLAUDE.md"),
240            ];
241            // Collect .claude/rules/*.md sorted by name for deterministic order.
242            let rules_dir = base.join(".claude").join("rules");
243            if let Ok(entries) = std::fs::read_dir(&rules_dir) {
244                let mut rule_files: Vec<PathBuf> = entries
245                    .filter_map(std::result::Result::ok)
246                    .map(|e| e.path())
247                    .filter(|p| p.extension().is_some_and(|ext| ext == "md"))
248                    .collect();
249                rule_files.sort();
250                paths.extend(rule_files);
251            }
252            paths
253        }
254        ProviderKind::OpenAi => {
255            vec![base.join("AGENTS.override.md"), base.join("AGENTS.md")]
256        }
257        ProviderKind::Compatible
258        | ProviderKind::Ollama
259        | ProviderKind::Candle
260        | ProviderKind::Gemini
261        | ProviderKind::Gonka
262        | ProviderKind::Cocoon => {
263            vec![base.join("AGENTS.md")]
264        }
265        _ => vec![base.join("AGENTS.md")],
266    }
267}
268
269/// Async wrapper around [`load_instructions`] that offloads filesystem I/O to the tokio
270/// blocking thread pool.
271///
272/// Returns an empty `Vec` and logs an error if the blocking task panics.
273pub async fn load_instructions_async(
274    base_dir: PathBuf,
275    provider_kinds: Vec<ProviderKind>,
276    explicit_files: Vec<PathBuf>,
277    auto_detect: bool,
278) -> Vec<InstructionBlock> {
279    tokio::task::spawn_blocking(move || {
280        load_instructions(&base_dir, &provider_kinds, &explicit_files, auto_detect)
281    })
282    .await
283    .unwrap_or_else(|e| {
284        tracing::error!(
285            error = %e,
286            "load_instructions_async: blocking task panicked, returning empty blocks"
287        );
288        Vec::new()
289    })
290}
291
/// Tests for `InstructionWatcher`. The change/deletion tests depend on real filesystem
/// notifications and the 500 ms debounce window, hence the sleeps and generous timeouts.
#[cfg(test)]
mod watcher_tests {
    use super::*;
    use tokio::sync::mpsc;

    /// Starting a watcher on an existing directory succeeds.
    #[tokio::test]
    async fn start_with_valid_directory() {
        let dir = tempfile::tempdir().unwrap();
        let (tx, _rx) = mpsc::channel(16);
        let result = InstructionWatcher::start(&[dir.path().to_path_buf()], tx);
        assert!(result.is_ok());
    }

    /// Starting a watcher with nothing to watch is not an error.
    #[tokio::test]
    async fn start_with_empty_paths() {
        let (tx, _rx) = mpsc::channel(16);
        let result = InstructionWatcher::start(&[], tx);
        assert!(result.is_ok());
    }

    /// Writing to a `.md` file in a watched directory produces a `Changed` event.
    #[tokio::test]
    async fn detects_md_file_change() {
        let dir = tempfile::tempdir().unwrap();
        let (tx, mut rx) = mpsc::channel(16);
        let _watcher = InstructionWatcher::start(&[dir.path().to_path_buf()], tx).unwrap();

        let md_path = dir.path().join("zeph.md");
        std::fs::write(&md_path, "initial").unwrap();

        tokio::time::sleep(std::time::Duration::from_millis(100)).await;
        std::fs::write(&md_path, "updated").unwrap();

        let result = tokio::time::timeout(std::time::Duration::from_secs(3), rx.recv()).await;
        // `Ok(None)` would mean the channel closed without an event, so require the
        // actual event rather than just "no timeout".
        assert!(
            matches!(result, Ok(Some(InstructionEvent::Changed))),
            "expected InstructionEvent::Changed within timeout"
        );
    }

    /// Changes to files without the `md` extension are not forwarded.
    #[tokio::test]
    async fn ignores_non_md_file_change() {
        let dir = tempfile::tempdir().unwrap();
        let (tx, mut rx) = mpsc::channel(16);
        let _watcher = InstructionWatcher::start(&[dir.path().to_path_buf()], tx).unwrap();

        let other_path = dir.path().join("notes.txt");
        std::fs::write(&other_path, "content").unwrap();

        // The timeout elapsing (`Err`) is the success case: nothing was received.
        let result = tokio::time::timeout(std::time::Duration::from_millis(1500), rx.recv()).await;
        assert!(result.is_err(), "should not receive event for non-.md file");
    }

    /// Deleting a `.md` file in a watched directory produces a `Changed` event.
    #[tokio::test]
    async fn detects_md_file_deletion() {
        let dir = tempfile::tempdir().unwrap();
        let md_path = dir.path().join("zeph.md");
        std::fs::write(&md_path, "content").unwrap();

        let (tx, mut rx) = mpsc::channel(16);
        let _watcher = InstructionWatcher::start(&[dir.path().to_path_buf()], tx).unwrap();

        tokio::time::sleep(std::time::Duration::from_millis(100)).await;
        std::fs::remove_file(&md_path).unwrap();

        let result = tokio::time::timeout(std::time::Duration::from_secs(3), rx.recv()).await;
        // As above: a closed channel must not count as a detected deletion.
        assert!(
            matches!(result, Ok(Some(InstructionEvent::Changed))),
            "expected InstructionEvent::Changed on .md deletion"
        );
    }
}
363
/// Tests that calling `load_instructions` again reflects the current state on disk.
#[cfg(test)]
mod reload_tests {
    use super::*;

    /// Rewriting `zeph.md` between two loads yields the new content on the second load.
    #[test]
    fn reload_returns_updated_blocks_when_file_changes() {
        let project = tempfile::tempdir().unwrap();
        let instructions = project.path().join("zeph.md");

        // First pass loads the initial text, second pass the rewritten text.
        for expected in ["initial content", "updated content"] {
            std::fs::write(&instructions, expected).unwrap();

            let loaded = load_instructions(project.path(), &[], &[], false);
            assert_eq!(loaded.len(), 1);
            assert_eq!(loaded[0].content, expected);
        }
    }

    /// Removing `zeph.md` between two loads yields no blocks on the second load.
    #[test]
    fn reload_returns_empty_when_file_deleted() {
        let project = tempfile::tempdir().unwrap();
        let instructions = project.path().join("zeph.md");
        std::fs::write(&instructions, "content").unwrap();

        let before_delete = load_instructions(project.path(), &[], &[], false);
        assert_eq!(before_delete.len(), 1);

        std::fs::remove_file(&instructions).unwrap();

        let after_delete = load_instructions(project.path(), &[], &[], false);
        assert!(
            after_delete.is_empty(),
            "deleted file should not be loaded on reload"
        );
    }
}
401
/// Tests for `load_instructions` and `detection_paths`: discovery, deduplication,
/// size/content filters, and the project-root boundary.
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Write `content` to `dir/name`, creating parent directories as needed.
    fn make_file(dir: &Path, name: &str, content: &str) -> PathBuf {
        let path = dir.join(name);
        if let Some(parent) = path.parent() {
            fs::create_dir_all(parent).unwrap();
        }
        fs::write(&path, content).unwrap();
        path
    }

    #[test]
    fn zeph_md_loaded_even_when_auto_detect_disabled() {
        let dir = TempDir::new().unwrap();
        make_file(dir.path(), "zeph.md", "some content");
        let blocks = load_instructions(dir.path(), &[], &[], false);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].content, "some content");
    }

    #[test]
    fn empty_when_no_auto_detect_and_no_explicit_and_no_zeph_md() {
        let dir = TempDir::new().unwrap();
        let blocks = load_instructions(dir.path(), &[], &[], false);
        assert!(blocks.is_empty());
    }

    #[test]
    fn finds_zeph_md_in_base_dir() {
        let dir = TempDir::new().unwrap();
        make_file(dir.path(), "zeph.md", "zeph instructions");
        let blocks = load_instructions(dir.path(), &[], &[], true);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].content, "zeph instructions");
    }

    #[test]
    fn finds_dot_zeph_zeph_md() {
        let dir = TempDir::new().unwrap();
        make_file(dir.path(), ".zeph/zeph.md", "nested zeph instructions");
        let blocks = load_instructions(dir.path(), &[], &[], true);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].content, "nested zeph instructions");
    }

    #[test]
    fn detection_paths_claude() {
        let dir = TempDir::new().unwrap();
        make_file(dir.path(), "CLAUDE.md", "# Claude");
        make_file(dir.path(), ".claude/CLAUDE.md", "# Dot Claude");
        make_file(dir.path(), ".claude/rules/a.md", "rule a");
        make_file(dir.path(), ".claude/rules/b.md", "rule b");

        let blocks = load_instructions(dir.path(), &[ProviderKind::Claude], &[], true);
        let sources: Vec<_> = blocks
            .iter()
            .map(|b| b.source.file_name().unwrap().to_str().unwrap())
            .collect();
        assert!(sources.contains(&"CLAUDE.md"));
        assert!(sources.contains(&"a.md"));
        assert!(sources.contains(&"b.md"));
    }

    #[test]
    fn detection_paths_openai() {
        let dir = TempDir::new().unwrap();
        make_file(dir.path(), "AGENTS.md", "# Agents");

        let paths = detection_paths(ProviderKind::OpenAi, dir.path());
        assert!(paths.iter().any(|p| p.file_name().unwrap() == "AGENTS.md"));
        assert!(
            paths
                .iter()
                .any(|p| p.file_name().unwrap() == "AGENTS.override.md")
        );
    }

    #[test]
    fn detection_paths_ollama_and_compatible_and_candle() {
        let dir = TempDir::new().unwrap();
        for kind in [
            ProviderKind::Ollama,
            ProviderKind::Compatible,
            ProviderKind::Candle,
        ] {
            let paths = detection_paths(kind, dir.path());
            assert_eq!(paths.len(), 1);
            assert_eq!(paths[0].file_name().unwrap(), "AGENTS.md");
        }
    }

    #[test]
    fn deduplication_by_canonical_path() {
        let dir = TempDir::new().unwrap();
        make_file(dir.path(), "AGENTS.md", "content");

        // Both Ollama and Compatible resolve to AGENTS.md — should appear once.
        let blocks = load_instructions(
            dir.path(),
            &[ProviderKind::Ollama, ProviderKind::Compatible],
            &[],
            true,
        );
        let agents_count = blocks
            .iter()
            .filter(|b| b.source.file_name().unwrap() == "AGENTS.md")
            .count();
        assert_eq!(agents_count, 1);
    }

    #[test]
    fn skips_files_exceeding_size_limit() {
        let dir = TempDir::new().unwrap();
        let path = dir.path().join("big.md");
        // One byte over the limit, so the test pins the actual boundary instead of
        // an arbitrary larger size.
        let limit = usize::try_from(MAX_FILE_SIZE).unwrap();
        let big = vec![b'x'; limit + 1];
        fs::write(&path, &big).unwrap();
        let blocks = load_instructions(dir.path(), &[], &[path], false);
        assert!(blocks.is_empty());
    }

    #[test]
    fn loads_file_exactly_at_size_limit() {
        let dir = TempDir::new().unwrap();
        let path = dir.path().join("limit.md");
        // The limit is inclusive: only files strictly larger than it are skipped.
        let limit = usize::try_from(MAX_FILE_SIZE).unwrap();
        fs::write(&path, vec![b'x'; limit]).unwrap();
        let blocks = load_instructions(dir.path(), &[], &[path], false);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].content.len(), limit);
    }

    #[test]
    fn skips_empty_files() {
        let dir = TempDir::new().unwrap();
        make_file(dir.path(), "zeph.md", "");
        let blocks = load_instructions(dir.path(), &[], &[], true);
        assert!(blocks.is_empty());
    }

    #[test]
    fn nonexistent_paths_are_silently_skipped() {
        let dir = TempDir::new().unwrap();
        let nonexistent = dir.path().join("does_not_exist.md");
        let blocks = load_instructions(dir.path(), &[], &[nonexistent], false);
        assert!(blocks.is_empty());
    }

    #[test]
    fn explicit_relative_path_resolved_against_base_dir() {
        let dir = TempDir::new().unwrap();
        make_file(dir.path(), "custom.md", "custom content");
        let blocks = load_instructions(dir.path(), &[], &[PathBuf::from("custom.md")], false);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].content, "custom content");
    }

    #[test]
    fn invalid_utf8_file_is_skipped() {
        let dir = TempDir::new().unwrap();
        let path = dir.path().join("bad.md");
        // Write bytes that are not valid UTF-8.
        fs::write(&path, b"\xff\xfe invalid utf8 \x80\x81").unwrap();
        let blocks = load_instructions(dir.path(), &[], &[path], false);
        assert!(blocks.is_empty());
    }

    #[test]
    fn multiple_providers_union_without_overlap() {
        let dir = TempDir::new().unwrap();
        make_file(dir.path(), "CLAUDE.md", "claude content");
        make_file(dir.path(), "AGENTS.md", "agents content");

        let blocks = load_instructions(
            dir.path(),
            &[ProviderKind::Claude, ProviderKind::OpenAi],
            &[],
            true,
        );
        let names: Vec<_> = blocks
            .iter()
            .map(|b| b.source.file_name().unwrap().to_str().unwrap())
            .collect();
        assert!(names.contains(&"CLAUDE.md"), "Claude file missing");
        assert!(names.contains(&"AGENTS.md"), "OpenAI file missing");
    }

    #[test]
    fn zeph_md_always_loaded_with_provider_auto_detect() {
        let dir = TempDir::new().unwrap();
        make_file(dir.path(), "zeph.md", "zeph rules");
        // OpenAI provider has no AGENTS.md present, only zeph.md.
        let blocks = load_instructions(dir.path(), &[ProviderKind::OpenAi], &[], true);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].content, "zeph rules");
    }

    #[cfg(unix)]
    #[test]
    fn symlink_deduplication() {
        use std::os::unix::fs::symlink;
        let dir = TempDir::new().unwrap();
        make_file(dir.path(), "CLAUDE.md", "claude content");
        symlink(
            dir.path().join("CLAUDE.md"),
            dir.path().join("CLAUDE_link.md"),
        )
        .unwrap();

        // Load the original and the symlink — should appear only once after dedup.
        let blocks = load_instructions(
            dir.path(),
            &[ProviderKind::Claude],
            &[PathBuf::from("CLAUDE_link.md")],
            true,
        );
        let claude_count = blocks
            .iter()
            .filter(|b| b.content == "claude content")
            .count();
        assert_eq!(claude_count, 1, "symlink should be deduped with original");
    }

    #[cfg(unix)]
    #[test]
    fn symlink_escaping_project_root_is_rejected() {
        use std::os::unix::fs::symlink;
        let outside = TempDir::new().unwrap();
        let inside = TempDir::new().unwrap();
        make_file(outside.path(), "secret.md", "secret content");

        // Create a symlink inside the project dir pointing outside.
        let link = inside.path().join("evil.md");
        symlink(outside.path().join("secret.md"), &link).unwrap();

        let blocks = load_instructions(inside.path(), &[], &[link], false);
        assert!(
            blocks.is_empty(),
            "file escaping project root must be rejected"
        );
    }

    #[test]
    fn file_with_null_bytes_is_skipped() {
        let dir = TempDir::new().unwrap();
        let path = dir.path().join("null.md");
        fs::write(&path, b"content\x00more").unwrap();
        let blocks = load_instructions(dir.path(), &[], &[path], false);
        assert!(blocks.is_empty());
    }
}