// File: zeph_core/instructions.rs

1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4use std::collections::HashSet;
5use std::io::Read as _;
6use std::path::{Path, PathBuf};
7use std::time::Duration;
8
9use notify_debouncer_mini::{DebouncedEventKind, new_debouncer};
10use tokio::sync::mpsc;
11
12use crate::config::ProviderKind;
13
/// Notification emitted by [`InstructionWatcher`] towards its consumer.
///
/// Marked `#[non_exhaustive]` so further variants can be added later without breaking
/// downstream `match` expressions.
#[non_exhaustive]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum InstructionEvent {
    /// At least one watched `.md` file was created, modified, or removed.
    Changed,
}
18
19pub struct InstructionWatcher {
20    _handle: tokio::task::JoinHandle<()>,
21}
22
23impl InstructionWatcher {
24    /// Start watching directories for instruction file (.md) changes.
25    ///
26    /// Sends `InstructionEvent::Changed` on any `.md` filesystem change (debounced 500ms).
27    ///
28    /// # Errors
29    ///
30    /// Returns an error if the filesystem watcher cannot be initialized.
31    pub fn start(
32        paths: &[PathBuf],
33        tx: mpsc::Sender<InstructionEvent>,
34    ) -> Result<Self, notify::Error> {
35        let (notify_tx, mut notify_rx) = mpsc::channel(16);
36
37        let mut debouncer = new_debouncer(
38            Duration::from_millis(500),
39            move |events: Result<Vec<notify_debouncer_mini::DebouncedEvent>, notify::Error>| {
40                let events = match events {
41                    Ok(events) => events,
42                    Err(e) => {
43                        tracing::warn!("instruction watcher error: {e}");
44                        return;
45                    }
46                };
47
48                let has_md_change = events.iter().any(|e| {
49                    e.kind == DebouncedEventKind::Any
50                        && e.path.extension().is_some_and(|ext| ext == "md")
51                });
52
53                if has_md_change {
54                    let _ = notify_tx.try_send(());
55                }
56            },
57        )?;
58
59        for path in paths {
60            if path.exists()
61                && let Err(e) = debouncer
62                    .watcher()
63                    .watch(path, notify::RecursiveMode::NonRecursive)
64            {
65                tracing::warn!(path = %path.display(), error = %e, "failed to watch instruction path");
66            }
67        }
68
69        tracing::debug!(paths = paths.len(), "starting instruction watcher");
70        let handle = tokio::spawn(async move {
71            let _debouncer = debouncer;
72            while notify_rx.recv().await.is_some() {
73                tracing::debug!("instruction file change detected, signaling reload");
74                if tx.send(InstructionEvent::Changed).await.is_err() {
75                    break;
76                }
77            }
78        });
79
80        Ok(Self { _handle: handle })
81    }
82}
83
84/// Parameters needed to re-run `load_instructions()` on hot-reload.
85pub struct InstructionReloadState {
86    pub base_dir: PathBuf,
87    pub provider_kinds: Vec<ProviderKind>,
88    pub explicit_files: Vec<PathBuf>,
89    pub auto_detect: bool,
90}
91
/// Upper bound, in bytes, for a single instruction file (256 KiB).
///
/// Files larger than this are skipped rather than truncated.
const MAX_FILE_SIZE: u64 = 256 * 1024;
94
/// A loaded instruction block from a single file.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct InstructionBlock {
    /// Path the file was discovered under.
    ///
    /// This is the candidate path as built by `load_instructions` (joined onto its
    /// `base_dir`), not the canonicalized path. It is therefore absolute only when
    /// `base_dir` or the explicit file path was absolute, and it may name a symlink.
    pub source: PathBuf,
    /// UTF-8 text content of the file.
    pub content: String,
}
103
104/// Load instruction blocks from provider-specific and explicit files.
105///
106/// `base_dir` is resolved as the process working directory at startup via
107/// `std::env::current_dir()`. This matches the directory from which the user
108/// launches `zeph` and is therefore the most natural project root for file
109/// discovery. Non-git projects are fully supported; git root is not used.
110///
111/// Candidate paths are collected in this order:
112/// 1. Always: `base_dir/zeph.md` and `base_dir/.zeph/zeph.md`.
113/// 2. If `auto_detect`, per-provider paths from `detection_paths()` for each kind.
114/// 3. `explicit_files` as provided (trusted — user controls config.toml).
115///
116/// Deduplication uses `fs::canonicalize`. Paths that do not exist are silently
117/// skipped; canonicalize fails on nonexistent paths, so they cannot be deduped
118/// via symlinks against existing paths — this is an acceptable edge case documented here.
119pub fn load_instructions(
120    base_dir: &Path,
121    provider_kinds: &[ProviderKind],
122    explicit_files: &[PathBuf],
123    auto_detect: bool,
124) -> Vec<InstructionBlock> {
125    let canonical_base = match std::fs::canonicalize(base_dir) {
126        Ok(c) => c,
127        Err(e) => {
128            tracing::warn!(path = %base_dir.display(), error = %e, "failed to canonicalize base_dir, skipping all instruction files");
129            return Vec::new();
130        }
131    };
132
133    let mut candidates: Vec<PathBuf> = Vec::new();
134
135    // zeph.md is always checked regardless of provider or auto_detect setting.
136    candidates.push(base_dir.join("zeph.md"));
137    candidates.push(base_dir.join(".zeph").join("zeph.md"));
138
139    if auto_detect {
140        for &kind in provider_kinds {
141            candidates.extend(detection_paths(kind, base_dir));
142        }
143    }
144
145    // Explicit files are trusted (user controls config). Resolve relative to base_dir.
146    for p in explicit_files {
147        if p.is_absolute() {
148            candidates.push(p.clone());
149        } else {
150            candidates.push(base_dir.join(p));
151        }
152    }
153
154    // Deduplicate by canonical path. Only existing paths can be canonicalized.
155    let mut seen: HashSet<PathBuf> = HashSet::new();
156    let mut result: Vec<InstructionBlock> = Vec::new();
157
158    for path in candidates {
159        // Canonicalize first to resolve symlinks before opening — eliminates TOCTOU race.
160        // Nonexistent or unreadable paths are silently skipped.
161        let Ok(canonical) = std::fs::canonicalize(&path) else {
162            continue;
163        };
164
165        if !canonical.starts_with(&canonical_base) {
166            tracing::warn!(path = %canonical.display(), "instruction file escapes project root, skipping");
167            continue;
168        }
169
170        if !seen.insert(canonical.clone()) {
171            // Already loaded this path via a different candidate or symlink.
172            continue;
173        }
174
175        // Open the canonical path after boundary check — no TOCTOU window for symlink swap.
176        let Ok(file) = std::fs::File::open(&canonical) else {
177            continue;
178        };
179
180        let meta = match file.metadata() {
181            Ok(m) => m,
182            Err(e) => {
183                tracing::warn!(path = %path.display(), error = %e, "failed to read instruction file metadata, skipping");
184                continue;
185            }
186        };
187
188        if !meta.is_file() {
189            continue;
190        }
191
192        if meta.len() > MAX_FILE_SIZE {
193            tracing::warn!(
194                path = %path.display(),
195                size = meta.len(),
196                limit = MAX_FILE_SIZE,
197                "instruction file exceeds 256 KiB size limit, skipping"
198            );
199            continue;
200        }
201
202        let mut content = String::new();
203        match std::io::BufReader::new(file).read_to_string(&mut content) {
204            Ok(_) => {}
205            Err(e) => {
206                tracing::warn!(path = %path.display(), error = %e, "failed to read instruction file, skipping");
207                continue;
208            }
209        }
210
211        if content.contains('\0') {
212            tracing::warn!(path = %path.display(), "instruction file contains null bytes, skipping");
213            continue;
214        }
215
216        if content.is_empty() {
217            tracing::debug!(path = %path.display(), "instruction file is empty, skipping");
218            continue;
219        }
220
221        tracing::debug!(path = %path.display(), bytes = content.len(), "loaded instruction file");
222        result.push(InstructionBlock {
223            source: path,
224            content,
225        });
226    }
227
228    result
229}
230
231/// Returns candidate file paths for a given provider.
232///
233/// Uses an exhaustive match — adding a new `ProviderKind` variant will cause
234/// a compile error here, forcing the developer to update the detection table.
235fn detection_paths(kind: ProviderKind, base: &Path) -> Vec<PathBuf> {
236    match kind {
237        ProviderKind::Claude => {
238            let mut paths = vec![
239                base.join("CLAUDE.md"),
240                base.join(".claude").join("CLAUDE.md"),
241            ];
242            // Collect .claude/rules/*.md sorted by name for deterministic order.
243            let rules_dir = base.join(".claude").join("rules");
244            if let Ok(entries) = std::fs::read_dir(&rules_dir) {
245                let mut rule_files: Vec<PathBuf> = entries
246                    .filter_map(std::result::Result::ok)
247                    .map(|e| e.path())
248                    .filter(|p| p.extension().is_some_and(|ext| ext == "md"))
249                    .collect();
250                rule_files.sort();
251                paths.extend(rule_files);
252            }
253            paths
254        }
255        ProviderKind::OpenAi => {
256            vec![base.join("AGENTS.override.md"), base.join("AGENTS.md")]
257        }
258        ProviderKind::Compatible
259        | ProviderKind::Ollama
260        | ProviderKind::Candle
261        | ProviderKind::Gemini
262        | ProviderKind::Gonka
263        | ProviderKind::Cocoon => {
264            vec![base.join("AGENTS.md")]
265        }
266        _ => vec![base.join("AGENTS.md")],
267    }
268}
269
270/// Async wrapper around [`load_instructions`] that offloads filesystem I/O to the tokio
271/// blocking thread pool.
272///
273/// Returns an empty `Vec` and logs an error if the blocking task panics.
274pub async fn load_instructions_async(
275    base_dir: PathBuf,
276    provider_kinds: Vec<ProviderKind>,
277    explicit_files: Vec<PathBuf>,
278    auto_detect: bool,
279) -> Vec<InstructionBlock> {
280    tokio::task::spawn_blocking(move || {
281        load_instructions(&base_dir, &provider_kinds, &explicit_files, auto_detect)
282    })
283    .await
284    .unwrap_or_else(|e| {
285        tracing::error!(
286            error = %e,
287            "load_instructions_async: blocking task panicked, returning empty blocks"
288        );
289        Vec::new()
290    })
291}
292
#[cfg(test)]
mod watcher_tests {
    use super::*;
    use tokio::sync::mpsc;

    /// Starting a watcher on an existing directory succeeds.
    #[tokio::test]
    async fn start_with_valid_directory() {
        let dir = tempfile::tempdir().unwrap();
        let (tx, _rx) = mpsc::channel(16);
        let result = InstructionWatcher::start(&[dir.path().to_path_buf()], tx);
        assert!(result.is_ok());
    }

    /// Starting a watcher with nothing to watch is not an error.
    #[tokio::test]
    async fn start_with_empty_paths() {
        let (tx, _rx) = mpsc::channel(16);
        let result = InstructionWatcher::start(&[], tx);
        assert!(result.is_ok());
    }

    #[tokio::test]
    async fn detects_md_file_change() {
        let dir = tempfile::tempdir().unwrap();
        let (tx, mut rx) = mpsc::channel(16);
        let _watcher = InstructionWatcher::start(&[dir.path().to_path_buf()], tx).unwrap();

        let md_path = dir.path().join("zeph.md");
        std::fs::write(&md_path, "initial").unwrap();

        tokio::time::sleep(std::time::Duration::from_millis(100)).await;
        std::fs::write(&md_path, "updated").unwrap();

        let result = tokio::time::timeout(std::time::Duration::from_secs(3), rx.recv()).await;
        // `Ok(None)` would mean the channel closed (watcher task gone) rather than an
        // event arriving, so require the event itself instead of just "no timeout".
        assert!(
            matches!(result, Ok(Some(InstructionEvent::Changed))),
            "expected InstructionEvent::Changed within timeout"
        );
    }

    #[tokio::test]
    async fn ignores_non_md_file_change() {
        let dir = tempfile::tempdir().unwrap();
        let (tx, mut rx) = mpsc::channel(16);
        let _watcher = InstructionWatcher::start(&[dir.path().to_path_buf()], tx).unwrap();

        let other_path = dir.path().join("notes.txt");
        std::fs::write(&other_path, "content").unwrap();

        // Wait well past the 500 ms debounce window; only a timeout proves silence.
        let result = tokio::time::timeout(std::time::Duration::from_millis(1500), rx.recv()).await;
        assert!(result.is_err(), "should not receive event for non-.md file");
    }

    #[tokio::test]
    async fn detects_md_file_deletion() {
        let dir = tempfile::tempdir().unwrap();
        let md_path = dir.path().join("zeph.md");
        std::fs::write(&md_path, "content").unwrap();

        let (tx, mut rx) = mpsc::channel(16);
        let _watcher = InstructionWatcher::start(&[dir.path().to_path_buf()], tx).unwrap();

        tokio::time::sleep(std::time::Duration::from_millis(100)).await;
        std::fs::remove_file(&md_path).unwrap();

        let result = tokio::time::timeout(std::time::Duration::from_secs(3), rx.recv()).await;
        assert!(
            matches!(result, Ok(Some(InstructionEvent::Changed))),
            "expected InstructionEvent::Changed on .md deletion"
        );
    }
}
364
#[cfg(test)]
mod reload_tests {
    use super::*;

    /// Calling `load_instructions` again picks up new file content.
    #[test]
    fn reload_returns_updated_blocks_when_file_changes() {
        let dir = tempfile::tempdir().unwrap();
        let md_path = dir.path().join("zeph.md");
        let reload = || load_instructions(dir.path(), &[], &[], false);

        std::fs::write(&md_path, "initial content").unwrap();
        let before = reload();
        assert_eq!(before.len(), 1);
        assert_eq!(before[0].content, "initial content");

        std::fs::write(&md_path, "updated content").unwrap();
        let after = reload();
        assert_eq!(after.len(), 1);
        assert_eq!(after[0].content, "updated content");
    }

    /// A file removed between two loads is absent from the second result.
    #[test]
    fn reload_returns_empty_when_file_deleted() {
        let dir = tempfile::tempdir().unwrap();
        let md_path = dir.path().join("zeph.md");
        let reload = || load_instructions(dir.path(), &[], &[], false);

        std::fs::write(&md_path, "content").unwrap();
        assert_eq!(reload().len(), 1);

        std::fs::remove_file(&md_path).unwrap();
        assert!(
            reload().is_empty(),
            "deleted file should not be loaded on reload"
        );
    }
}
402
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Writes `content` to `dir/name`, creating parent directories as needed.
    fn make_file(dir: &Path, name: &str, content: &str) -> PathBuf {
        let path = dir.join(name);
        if let Some(parent) = path.parent() {
            fs::create_dir_all(parent).unwrap();
        }
        fs::write(&path, content).unwrap();
        path
    }

    /// `MAX_FILE_SIZE` as a `usize`, for building test payloads.
    fn size_limit() -> usize {
        usize::try_from(MAX_FILE_SIZE).unwrap()
    }

    #[test]
    fn zeph_md_loaded_even_when_auto_detect_disabled() {
        let dir = TempDir::new().unwrap();
        make_file(dir.path(), "zeph.md", "some content");
        let blocks = load_instructions(dir.path(), &[], &[], false);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].content, "some content");
    }

    #[test]
    fn empty_when_no_auto_detect_and_no_explicit_and_no_zeph_md() {
        let dir = TempDir::new().unwrap();
        let blocks = load_instructions(dir.path(), &[], &[], false);
        assert!(blocks.is_empty());
    }

    #[test]
    fn finds_zeph_md_in_base_dir() {
        let dir = TempDir::new().unwrap();
        make_file(dir.path(), "zeph.md", "zeph instructions");
        let blocks = load_instructions(dir.path(), &[], &[], true);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].content, "zeph instructions");
    }

    #[test]
    fn finds_dot_zeph_zeph_md() {
        let dir = TempDir::new().unwrap();
        make_file(dir.path(), ".zeph/zeph.md", "nested zeph instructions");
        let blocks = load_instructions(dir.path(), &[], &[], true);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].content, "nested zeph instructions");
    }

    #[test]
    fn detection_paths_claude() {
        let dir = TempDir::new().unwrap();
        make_file(dir.path(), "CLAUDE.md", "# Claude");
        make_file(dir.path(), ".claude/CLAUDE.md", "# Dot Claude");
        make_file(dir.path(), ".claude/rules/a.md", "rule a");
        make_file(dir.path(), ".claude/rules/b.md", "rule b");

        let blocks = load_instructions(dir.path(), &[ProviderKind::Claude], &[], true);
        let sources: Vec<_> = blocks
            .iter()
            .map(|b| b.source.file_name().unwrap().to_str().unwrap())
            .collect();
        assert!(sources.contains(&"CLAUDE.md"));
        assert!(sources.contains(&"a.md"));
        assert!(sources.contains(&"b.md"));
    }

    #[test]
    fn detection_paths_openai() {
        let dir = TempDir::new().unwrap();
        make_file(dir.path(), "AGENTS.md", "# Agents");

        let paths = detection_paths(ProviderKind::OpenAi, dir.path());
        assert!(paths.iter().any(|p| p.file_name().unwrap() == "AGENTS.md"));
        assert!(
            paths
                .iter()
                .any(|p| p.file_name().unwrap() == "AGENTS.override.md")
        );
    }

    #[test]
    fn detection_paths_ollama_and_compatible_and_candle() {
        let dir = TempDir::new().unwrap();
        for kind in [
            ProviderKind::Ollama,
            ProviderKind::Compatible,
            ProviderKind::Candle,
        ] {
            let paths = detection_paths(kind, dir.path());
            assert_eq!(paths.len(), 1);
            assert_eq!(paths[0].file_name().unwrap(), "AGENTS.md");
        }
    }

    #[test]
    fn deduplication_by_canonical_path() {
        let dir = TempDir::new().unwrap();
        make_file(dir.path(), "AGENTS.md", "content");

        // Both Ollama and Compatible resolve to AGENTS.md — should appear once.
        let blocks = load_instructions(
            dir.path(),
            &[ProviderKind::Ollama, ProviderKind::Compatible],
            &[],
            true,
        );
        let agents_count = blocks
            .iter()
            .filter(|b| b.source.file_name().unwrap() == "AGENTS.md")
            .count();
        assert_eq!(agents_count, 1);
    }

    #[test]
    fn skips_files_exceeding_size_limit() {
        let dir = TempDir::new().unwrap();
        let path = dir.path().join("big.md");
        // One byte over MAX_FILE_SIZE: the smallest file that must be rejected.
        fs::write(&path, vec![b'x'; size_limit() + 1]).unwrap();
        let blocks = load_instructions(dir.path(), &[], &[path], false);
        assert!(blocks.is_empty());
    }

    #[test]
    fn loads_file_exactly_at_size_limit() {
        let dir = TempDir::new().unwrap();
        let path = dir.path().join("limit.md");
        // Exactly MAX_FILE_SIZE bytes: the limit is inclusive, so this must load.
        fs::write(&path, vec![b'x'; size_limit()]).unwrap();
        let blocks = load_instructions(dir.path(), &[], &[path], false);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].content.len(), size_limit());
    }

    #[test]
    fn skips_empty_files() {
        let dir = TempDir::new().unwrap();
        make_file(dir.path(), "zeph.md", "");
        let blocks = load_instructions(dir.path(), &[], &[], true);
        assert!(blocks.is_empty());
    }

    #[test]
    fn nonexistent_paths_are_silently_skipped() {
        let dir = TempDir::new().unwrap();
        let nonexistent = dir.path().join("does_not_exist.md");
        let blocks = load_instructions(dir.path(), &[], &[nonexistent], false);
        assert!(blocks.is_empty());
    }

    #[test]
    fn explicit_relative_path_resolved_against_base_dir() {
        let dir = TempDir::new().unwrap();
        make_file(dir.path(), "custom.md", "custom content");
        let blocks = load_instructions(dir.path(), &[], &[PathBuf::from("custom.md")], false);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].content, "custom content");
    }

    #[test]
    fn explicit_absolute_path_outside_base_dir_is_rejected() {
        let outside = TempDir::new().unwrap();
        let inside = TempDir::new().unwrap();
        let stray = make_file(outside.path(), "stray.md", "stray content");

        // The project-root check applies to explicit files too, not only to symlinks.
        let blocks = load_instructions(inside.path(), &[], &[stray], false);
        assert!(
            blocks.is_empty(),
            "explicit file outside project root must be rejected"
        );
    }

    #[test]
    fn invalid_utf8_file_is_skipped() {
        let dir = TempDir::new().unwrap();
        let path = dir.path().join("bad.md");
        // Write bytes that are not valid UTF-8.
        fs::write(&path, b"\xff\xfe invalid utf8 \x80\x81").unwrap();
        let blocks = load_instructions(dir.path(), &[], &[path], false);
        assert!(blocks.is_empty());
    }

    #[test]
    fn multiple_providers_union_without_overlap() {
        let dir = TempDir::new().unwrap();
        make_file(dir.path(), "CLAUDE.md", "claude content");
        make_file(dir.path(), "AGENTS.md", "agents content");

        let blocks = load_instructions(
            dir.path(),
            &[ProviderKind::Claude, ProviderKind::OpenAi],
            &[],
            true,
        );
        let names: Vec<_> = blocks
            .iter()
            .map(|b| b.source.file_name().unwrap().to_str().unwrap())
            .collect();
        assert!(names.contains(&"CLAUDE.md"), "Claude file missing");
        assert!(names.contains(&"AGENTS.md"), "OpenAI file missing");
    }

    #[test]
    fn zeph_md_always_loaded_with_provider_auto_detect() {
        let dir = TempDir::new().unwrap();
        make_file(dir.path(), "zeph.md", "zeph rules");
        // OpenAI provider has no AGENTS.md present, only zeph.md.
        let blocks = load_instructions(dir.path(), &[ProviderKind::OpenAi], &[], true);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].content, "zeph rules");
    }

    #[cfg(unix)]
    #[test]
    fn symlink_deduplication() {
        use std::os::unix::fs::symlink;
        let dir = TempDir::new().unwrap();
        make_file(dir.path(), "CLAUDE.md", "claude content");
        symlink(
            dir.path().join("CLAUDE.md"),
            dir.path().join("CLAUDE_link.md"),
        )
        .unwrap();

        // Load the original and the symlink — should appear only once after dedup.
        let blocks = load_instructions(
            dir.path(),
            &[ProviderKind::Claude],
            &[PathBuf::from("CLAUDE_link.md")],
            true,
        );
        let claude_count = blocks
            .iter()
            .filter(|b| b.content == "claude content")
            .count();
        assert_eq!(claude_count, 1, "symlink should be deduped with original");
    }

    #[cfg(unix)]
    #[test]
    fn symlink_escaping_project_root_is_rejected() {
        use std::os::unix::fs::symlink;
        let outside = TempDir::new().unwrap();
        let inside = TempDir::new().unwrap();
        make_file(outside.path(), "secret.md", "secret content");

        // Create a symlink inside the project dir pointing outside.
        let link = inside.path().join("evil.md");
        symlink(outside.path().join("secret.md"), &link).unwrap();

        let blocks = load_instructions(inside.path(), &[], &[link], false);
        assert!(
            blocks.is_empty(),
            "file escaping project root must be rejected"
        );
    }

    #[test]
    fn file_with_null_bytes_is_skipped() {
        let dir = TempDir::new().unwrap();
        let path = dir.path().join("null.md");
        fs::write(&path, b"content\x00more").unwrap();
        let blocks = load_instructions(dir.path(), &[], &[path], false);
        assert!(blocks.is_empty());
    }
}
646}