# zeph-core 0.18.0
#
# Core agent loop, configuration, context builder, metrics, and vault for Zeph.
# Example configuration file listing all available settings; commented-out keys
# show their default values.
[agent]
# Agent display name
name = "Zeph"
# Maximum tool execution iterations per user message (doom-loop protection)
max_tool_iterations = 10
# Check for new Zeph releases on startup
# auto_update_check = true
# Automatically detect provider-specific instruction files (CLAUDE.md, AGENTS.md, etc.)
# instruction_auto_detect = true
# Additional instruction files to always inject into the system prompt
# instruction_files = ["custom-instructions.md"]

[llm]
# LLM provider: "ollama", "claude", "openai", "candle", "orchestrator", "compatible", "router"
provider = "ollama"
# Base URL for Ollama server
base_url = "http://localhost:11434"
# Primary model for chat completions
model = "qwen3:8b"
# Model for generating embeddings (semantic memory)
embedding_model = "qwen3-embedding"
# Provider-specific instruction file to inject into the system prompt (optional)
# instruction_file = "llm-instructions.md"

# Dedicated provider for tool-pair summarization and context compaction (optional).
# Falls back to the primary provider when unset.
# String shorthand (summary_model) or structured table ([llm.summary_provider]) — pick one.
# summary_model = "ollama/qwen3:1.7b"   # ollama/<model> | claude[/<model>] | openai[/<model>] | compatible/<name> | candle
# [llm.summary_provider]
# type = "claude"                          # claude, openai, compatible, ollama, candle
# model = "claude-haiku-4-5-20251001"     # model override (or [[llm.compatible]] entry name for compatible)

[llm.cloud]
# Claude API model (used when provider = "claude")
model = "claude-sonnet-4-5-20250929"
# Maximum tokens for Claude responses
max_tokens = 4096
# Enable Claude server-side context compaction (compact-2026-01-12 beta).
# When enabled, the API automatically summarizes long conversations; client-side compaction is skipped.
# server_compaction = false
# Enable 1M token extended context window (Opus 4.6 / Sonnet 4.6 only).
# Tokens above 200K use long-context pricing — see https://www.anthropic.com/pricing
# enable_extended_context = false

# OpenAI-compatible API (GPT-5.2, Together, Groq, Fireworks, etc.)
# [llm.openai]
# base_url = "https://api.openai.com/v1"
# model = "gpt-5.2"
# max_tokens = 4096
# embedding_model = "text-embedding-3-small"
# reasoning_effort = "medium"  # low, medium, high (for reasoning models)

# LLM response cache (SQLite-backed, blake3 key hashing)
# response_cache_enabled = false
# response_cache_ttl_secs = 3600

# Speech-to-text provider (Whisper API)
# [llm.stt]
# provider = "whisper"
# model = "whisper-1"

# Vision model for image understanding
# vision_model = "llava:7b"

# Candle local inference (feature-gated: --features candle)
# [llm.candle]
# source = "huggingface"  # "local" or "huggingface"
# filename = "mistral-7b-instruct-v0.2.Q4_K_M.gguf"
# local_path = ""          # used when source = "local"
# chat_template = "chatml"  # llama3, chatml, mistral, phi3, raw
# device = "cpu"           # auto, cpu, metal, cuda
# embedding_repo = "sentence-transformers/all-MiniLM-L6-v2"
# [llm.candle.generation]
# temperature = 0.7
# top_p = 0.9
# top_k = 40
# max_tokens = 2048
# seed = 42
# repeat_penalty = 1.1
# repeat_last_n = 64

# Model orchestrator (enabled by default)
# Routes tasks to different providers with fallback chains
# [llm.orchestrator]
# default = "ollama"
# embed = "ollama"
# [llm.orchestrator.providers.ollama]
# type = "ollama"
# model = "qwen3:8b"          # optional: override model
# base_url = "http://localhost:11434"  # optional: override base URL
# embedding_model = "qwen3-embedding"  # optional: override embedding model
# [llm.orchestrator.providers.claude]
# type = "claude"
# model = "claude-sonnet-4-5-20250929"  # optional: override [llm.cloud].model
# [llm.orchestrator.providers.candle]
# type = "candle"
# filename = "model.gguf"       # optional: GGUF filename
# device = "metal"              # optional: cpu, metal, cuda
# [llm.orchestrator.routes]
# coding = ["claude", "ollama"]
# creative = ["claude", "ollama"]
# general = ["ollama"]

# OpenAI-compatible providers (Groq, Together, Mistral, local vLLM, etc.)
# Set provider = "<name>" and ZEPH_COMPATIBLE_<NAME>_API_KEY env var to use.
# [[llm.compatible]]
# name = "groq"
# base_url = "https://api.groq.com/openai/v1"
# model = "llama-3.3-70b-versatile"
# max_tokens = 4096
# # embedding_model = "..."  # optional

# Router provider: sequential fallback across multiple providers.
# Set provider = "router" to use.
# [llm.router]
# chain = ["groq", "openai", "ollama"]

# --- Full orchestrator example with cloud + local + STT ---
# The orchestrator routes tasks to different providers with fallback chains.
# Sub-providers inherit from parent sections:
#   - "ollama" sub-provider uses [llm].base_url, [llm].model, [llm].embedding_model as defaults
#   - "claude" sub-provider requires [llm.cloud] section + ZEPH_CLAUDE_API_KEY
#   - "openai" sub-provider requires [llm.openai] section + ZEPH_OPENAI_API_KEY
#   - "candle" sub-provider requires [llm.candle] section
# Per-provider `model` overrides the inherited default.
#
# [llm]
# provider = "orchestrator"
# base_url = "http://localhost:11434"      # used by ollama sub-provider
# model = "qwen3:8b"                     # fallback model for sub-providers without explicit model
# embedding_model = "qwen3-embedding"      # used by ollama sub-provider for embeddings
#
# [llm.cloud]
# model = "claude-sonnet-4-5-20250929"
# max_tokens = 4096
#
# [llm.stt]
# provider = "whisper"
# model = "whisper-1"
#
# [llm.orchestrator]
# default = "ollama/qwen3:8b"            # "provider_name/model" or just "provider_name"
# embed = "qwen3-embedding"                # embedding model name or "provider_name/model"
#
# [llm.orchestrator.providers.ollama]
# type = "ollama"
# model = "qwen3:8b"                     # optional: overrides [llm].model
# base_url = "http://localhost:11434"      # optional: overrides [llm].base_url
# embedding_model = "qwen3-embedding"      # optional: overrides [llm].embedding_model
#
# [llm.orchestrator.providers.claude]
# type = "claude"
# model = "claude-sonnet-4-5-20250929"     # optional: overrides [llm.cloud].model
#
# [llm.orchestrator.routes]
# chat = ["ollama", "claude"]              # fallback chain: try ollama first, then claude
# embed = ["ollama/qwen3-embedding"]       # "provider_name/model" format

[skills]
# Directories to scan for SKILL.md files
# Defaults to the user config dir (for example ~/.config/zeph/skills on Linux,
# ~/Library/Application Support/Zeph/skills on macOS,
# %APPDATA%\zeph\skills on Windows).
# paths = ["/absolute/path/to/skills"]
# Maximum number of skills to inject into context per query (embedding-based selection)
max_active_skills = 5
# Prompt mode: "full" (inject full SKILL.md), "compact" (name+description only), "auto" (compact if budget < 8192)
# prompt_mode = "auto"
# Minimum score delta for skill disambiguation (0.0-1.0)
# disambiguation_threshold = 0.05

[skills.learning]
# Enable self-learning skill improvement (feature enabled by default, runtime toggle)
enabled = false
# Automatically activate improved versions (false = require manual approval)
auto_activate = false
# Minimum failures before generating improvement
min_failures = 3
# Success rate threshold below which improvement is triggered (0.0-1.0)
improve_threshold = 0.7
# Success rate below which automatic rollback occurs (0.0-1.0)
rollback_threshold = 0.5
# Minimum evaluations before rollback decision
min_evaluations = 5
# Maximum auto-generated versions per skill
max_versions = 10
# Cooldown between improvements for same skill (minutes)
cooldown_minutes = 60

[skills.trust]
# Default trust level for newly discovered skills: trusted, verified, quarantined, blocked
default_level = "quarantined"
# Trust level assigned to local (built-in) skills
local_level = "trusted"
# Trust level after blake3 hash mismatch on hot-reload
hash_mismatch_level = "quarantined"

[memory]
# SQLite database path for conversation history
# Defaults to the user data dir (for example ~/.local/share/zeph/data/zeph.db on Linux,
# ~/Library/Application Support/Zeph/data/zeph.db on macOS,
# %LOCALAPPDATA%\Zeph\data\zeph.db on Windows).
# sqlite_path = "/absolute/path/to/zeph.db"
# Maximum number of recent messages to load into context
history_limit = 50
# Qdrant vector database URL for semantic memory
qdrant_url = "http://localhost:6334"
# Number of messages before triggering summarization (0 = disabled)
summarization_threshold = 50
# Total token budget for context window (0 = auto-detect from model)
context_budget_tokens = 0
# Auto-detect context budget from model's context window size
auto_budget = true
# Soft compaction threshold (0.0-1.0): prune tool outputs + apply deferred summaries (no LLM).
soft_compaction_threshold = 0.70
# Hard compaction threshold (0.0-1.0): full LLM summarization when context usage exceeds this.
hard_compaction_threshold = 0.90
# Number of recent messages to preserve during compaction
compaction_preserve_tail = 6
# Turns to skip after a successful compaction (cooldown guard).
# Prevents immediate re-compaction when the summary itself consumes many tokens.
compaction_cooldown_turns = 2
# Token budget protected from tool output pruning (recent context zone)
prune_protect_tokens = 40000
# Minimum relevance score for cross-session memory results (0.0-1.0)
cross_session_score_threshold = 0.35
# Vector backend: "qdrant" (external) or "sqlite" (embedded, zero-dependency)
# vector_backend = "qdrant"
# Token safety margin multiplier for compaction budget (must be > 0)
# token_safety_margin = 1.0
# Redact credentials from LLM context before sending
# redact_credentials = true
# Auto-save assistant responses to semantic memory
# autosave_assistant = false
# Minimum character length for autosave (shorter responses skip embedding)
# autosave_min_length = 20
# Use structured anchored summaries for context compaction (experimental, off by default)
# structured_summaries = false

[memory.sessions]
# Maximum number of sessions returned by list operations (0 = unlimited)
max_history = 100
# Maximum characters for auto-generated session titles
title_max_chars = 60

[memory.documents]
# Qdrant collection for ingested documents
collection = "zeph_documents"
# Text chunk size in characters
chunk_size = 1000
# Overlap between consecutive chunks in characters
chunk_overlap = 100
# Number of document chunks to inject into agent context per turn
top_k = 3
# Enable RAG: inject relevant document chunks into agent context
rag_enabled = false

[memory.semantic]
# Enable semantic memory with vector search
enabled = true
# Maximum number of semantically relevant messages to recall
recall_limit = 5
# Hybrid search weights (vector + FTS5 keyword). Must sum to 1.0.
vector_weight = 0.7
keyword_weight = 0.3
# Temporal decay: penalize older memories by age
# temporal_decay_enabled = false
# temporal_decay_half_life_days = 30
# MMR re-ranking: diversify recall results
# mmr_enabled = false
# mmr_lambda = 0.7

# Code RAG: AST-based code indexing and hybrid retrieval
# Requires Qdrant for semantic retrieval; tree-sitter grammars are always available
[index]
# Enable code indexing and retrieval (requires Qdrant)
enabled = false
# Watch for file changes and reindex incrementally
watch = true
# Maximum code chunks to retrieve per query
max_chunks = 12
# Minimum cosine similarity score to accept
score_threshold = 0.25
# Fraction of code_context budget used by retriever (0.0-1.0)
budget_ratio = 0.40
# Token budget for repo structural map in system prompt (0 = disabled)
repo_map_tokens = 500
# Cache TTL for repo map in seconds (avoids regeneration on every message)
repo_map_ttl_secs = 300

# [discord]
# token = ""                    # or set ZEPH_DISCORD_TOKEN
# application_id = ""           # for slash command registration
# allowed_user_ids = []         # Discord user IDs (empty = allow all)
# allowed_role_ids = []         # Discord role IDs
# allowed_channel_ids = []      # restrict to specific channels

# [slack]
# bot_token = ""                # or set ZEPH_SLACK_BOT_TOKEN
# signing_secret = ""           # or set ZEPH_SLACK_SIGNING_SECRET
# port = 3000                   # Events API webhook port
# webhook_host = "127.0.0.1"   # bind address for Events API webhook
# allowed_user_ids = []         # Slack user IDs (empty = allow all)
# allowed_channel_ids = []      # restrict to specific channels

[mcp]
# Allowlist of permitted commands for /mcp add (empty = allow all)
allowed_commands = ["npx", "uvx", "node", "python", "python3"]
# Maximum number of dynamically added MCP servers
max_dynamic_servers = 10

# Stdio transport (spawn child process):
# [[mcp.servers]]
# id = "filesystem"
# command = "npx"
# args = ["-y", "@modelcontextprotocol/server-filesystem", "/tmp"]
# env = {}                      # environment variables for the child process
# timeout = 30

# HTTP transport (remote MCP server, e.g. Docker container):
# [[mcp.servers]]
# id = "remote-tools"
# url = "http://localhost:3001/mcp"
# timeout = 30

# LSP code intelligence via mcpls (https://github.com/bug-ops/mcpls)
# Install: cargo install mcpls
# mcpls auto-detects language servers from project files (Cargo.toml → rust-analyzer, etc.)
# [[mcp.servers]]
# id = "mcpls"
# command = "mcpls"
# args = ["--workspace-root", "."]
# timeout = 60                     # LSP servers need warmup time; 60s recommended

[cost]
# Track LLM API costs and enforce daily budget
enabled = false
# Maximum daily spend in cents (0 = unlimited)
max_daily_cents = 500

[observability]
# Tracing exporter: "" (disabled) or "otlp" (requires otel feature)
exporter = ""
# OTLP collector endpoint
endpoint = "http://localhost:4317"

[vault]
# Secret retrieval backend: "env" reads from environment variables
backend = "env"

[a2a]
# Enable A2A server for agent-to-agent communication
enabled = false
# Bind address
host = "0.0.0.0"
# HTTP port
port = 8080
# Public URL advertised in AgentCard (auto-generated if empty)
public_url = ""
# Bearer token for authentication (from vault ZEPH_A2A_AUTH_TOKEN)
# auth_token = ""
# Rate limit: max requests per minute per IP (0 = unlimited)
rate_limit = 60
# Require TLS for outbound A2A connections
require_tls = true
# Block requests to private/loopback IPs
ssrf_protection = true
# Maximum request body size in bytes (1MB)
max_body_size = 1048576

[tools]
# Enable tool execution (bash commands)
enabled = true
# Summarize long tool output via LLM instead of head+tail truncation
summarize_output = true

[tools.shell]
# Command timeout in seconds
timeout = 30
# Additional commands to block (case-insensitive, supports wildcards)
blocked_commands = []
# Commands to remove from the default blocklist (e.g., ["curl", "wget"])
allowed_commands = []
# Restrict file access to these paths (empty = current directory only)
allowed_paths = []
# Allow network commands (curl, wget, nc)
allow_network = true
# Commands that require user confirmation before execution
confirm_patterns = ["rm ", "git push -f", "git push --force", "drop table", "drop database", "truncate "]

[tools.scrape]
# HTTP request timeout in seconds
timeout = 15
# Maximum response body size in bytes (1MB)
max_body_bytes = 1048576

[tools.filters]
# Enable smart output filtering for tool results
enabled = true
# [tools.filters.test]
# enabled = true
# max_failures = 10
# truncate_stack_trace = 50
# [tools.filters.git]
# enabled = true
# max_log_entries = 20
# max_diff_lines = 500
# [tools.filters.clippy]
# enabled = true
# [tools.filters.cargo_build]
# enabled = true
# [tools.filters.dir_listing]
# enabled = true
# [tools.filters.log_dedup]
# enabled = true
# [tools.filters.security]
# enabled = true
# extra_patterns = []

# Per-tool permission rules (glob patterns with allow/ask/deny actions)
# [tools.permissions]
# shell = [{ pattern = "/tmp/*", action = "allow" }, { pattern = "/etc/*", action = "deny" }]

[tools.overflow]
# Offload large tool responses to SQLite instead of truncating in-memory.
# Characters threshold above which output is stored in the overflow table (default: 50000)
threshold = 50000
# Days to retain overflow entries before age-based cleanup on next startup (default: 7)
retention_days = 7
# Maximum bytes per overflow entry; 0 means unlimited (default: 10485760 = 10 MiB)
max_overflow_bytes = 10485760

[tools.audit]
# Enable audit logging for tool executions
enabled = false
# Audit destination: "stdout" or file path (e.g., "./data/audit.jsonl")
destination = "stdout"

[tools.policy]
# Enable declarative policy compiler for tool call authorization (requires policy-enforcer feature)
enabled = false
# Fallback effect when no rule matches: "allow" or "deny"
default_effect = "deny"
# Optional external policy rules file (TOML). Overrides inline rules when set.
# policy_file = "policy.toml"

[tools.anomaly]
# Enable sliding-window anomaly detection for tool execution errors
enabled = false
# Number of recent tool calls to track in the window
window_size = 10
# Error ratio threshold for warning alerts (0.0-1.0)
error_threshold = 0.5
# Error ratio threshold for critical alerts (0.0-1.0)
critical_threshold = 0.8

[gateway]
# Enable HTTP gateway for webhook ingestion (feature-gated: --features gateway)
enabled = false
# Bind address (127.0.0.1 = localhost only, 0.0.0.0 = all interfaces)
bind = "127.0.0.1"
# HTTP port
port = 8090
# auth_token = "secret"  # optional, from vault ZEPH_GATEWAY_TOKEN
# Rate limit: max requests per minute per IP
rate_limit = 120
# Maximum request body size in bytes (1MB)
max_body_size = 1048576

[daemon]
# Enable daemon supervisor
enabled = false
# PID file location
pid_file = "~/.zeph/zeph.pid"
# Health check interval in seconds
health_interval_secs = 30
# Maximum restart backoff in seconds
max_restart_backoff_secs = 60

[scheduler]
# Enable cron scheduler (included in default features)
enabled = true
# Example task definitions:
# [[scheduler.tasks]]
# name = "memory_cleanup"
# cron = "0 0 0 * * *"
# kind = "memory_cleanup"
# config = { max_age_days = 90 }
#
# [[scheduler.tasks]]
# name = "health_check"
# cron = "0 */5 * * * *"
# kind = "health_check"

[security]
# Redact secrets (API keys, tokens) from LLM responses before display
redact_secrets = true
# Tool access level: "readonly" (observe only), "supervised" (default, with confirmations), "full" (all tools, no confirmations)
autonomy_level = "supervised"

[security.content_isolation]
# Enable the 4-step sanitization pipeline for untrusted content (default: true)
enabled = true
# Maximum byte length of untrusted content before truncation (default: 65536)
max_content_size = 65536
# Flag detected injection patterns in the spotlighting wrapper (default: true)
flag_injection_patterns = true
# Wrap untrusted content in spotlighting XML delimiters (default: true)
spotlight_untrusted = true

[security.content_isolation.quarantine]
# Route high-risk content through an isolated LLM for fact extraction (default: false)
enabled = false
# Source kinds to route through quarantine (default: web_scrape, a2a_message)
sources = ["web_scrape", "a2a_message"]
# Provider to use for quarantine LLM calls — must be a recognized provider name
# (e.g. "claude", "ollama", "openai", or a compatible entry name)
model = "claude"

[security.pii_filter]
# Scrub PII from tool outputs before they enter LLM context and debug dumps (default: false)
enabled = false
# Scrub email addresses (default: true)
filter_email = true
# Scrub US phone numbers (default: true)
filter_phone = true
# Scrub US Social Security Numbers (default: true)
filter_ssn = true
# Scrub credit card numbers (16-digit patterns) (default: true)
filter_credit_card = true
# Custom regex patterns (optional, in addition to built-ins)
# [[security.pii_filter.custom_patterns]]
# name = "employee_id"
# pattern = "EMP-\\d{6}"
# replacement = "[PII:employee_id]"

[security.memory_validation]
# Validate content before memory_save writes and graph extraction (default: true)
enabled = true
# Maximum byte length of content passed to memory_save (default: 4096)
max_content_bytes = 4096
# Maximum byte length of a single entity name in graph extraction (default: 256)
max_entity_name_bytes = 256
# Maximum byte length of an edge fact string in graph extraction (default: 1024)
max_fact_bytes = 1024
# Maximum number of entities allowed per graph extraction result (default: 50)
max_entities_per_extraction = 50
# Maximum number of edges allowed per graph extraction result (default: 100)
max_edges_per_extraction = 100
# Forbidden substring patterns — content containing any is rejected (default: empty)
# forbidden_content_patterns = ["<script", "javascript:"]

[security.rate_limit]
# Per-category sliding-window rate limiter for tool calls (default: false)
enabled = false
# Maximum shell tool calls per 60-second window (default: 30)
shell_calls_per_minute = 30
# Maximum web scrape tool calls per 60-second window (default: 20)
web_calls_per_minute = 20
# Maximum memory tool calls per 60-second window (default: 60)
memory_calls_per_minute = 60
# Maximum MCP tool calls per 60-second window (default: 40)
mcp_calls_per_minute = 40
# Maximum other tool calls per 60-second window (default: 60)
other_calls_per_minute = 60
# Circuit breaker cooldown in seconds after limit is exceeded (default: 30)
circuit_breaker_cooldown_secs = 30

[security.guardrail]
# Enable LLM-based prompt injection pre-screener (default: false)
enabled = false
# Provider for guardrail classification — must be a leaf provider (ollama, claude, openai, compatible)
# provider = "ollama"
# Safety model to use (e.g. llama-guard-3:1b for Ollama)
# model = "llama-guard-3:1b"
# Timeout for each classification call in milliseconds (default: 500)
timeout_ms = 500
# Action when input is flagged: "block" (default) or "warn"
action = "block"
# Behavior on timeout or LLM error: "closed" = block (default), "open" = allow
fail_strategy = "closed"
# Also scan tool outputs before they enter message history (default: false; opt-in)
scan_tool_output = false
# Maximum characters sent to the guard model — input is truncated (default: 4096)
max_input_chars = 4096

# [telegram]
# token = "your-bot-token"
# Allowed usernames (empty = allow all except for /start command)
# allowed_users = ["username1", "username2"]

[timeouts]
# LLM chat completion timeout in seconds
llm_seconds = 120
# Embedding generation timeout in seconds
embedding_seconds = 30
# A2A remote call timeout in seconds
a2a_seconds = 30
# Maximum number of tool calls to execute in parallel
max_parallel_tools = 8

[debug]
# Enable debug dump: write every LLM request/response pair to timestamped files.
# CLI flag --debug-dump takes priority over this setting.
# Use /debug-dump in TUI/CLI to toggle at runtime.
enabled = false
# Directory where per-session subdirectories are created
# Defaults to the user data dir (for example ~/.local/share/zeph/debug on Linux,
# ~/Library/Application Support/Zeph/debug on macOS,
# %LOCALAPPDATA%\Zeph\debug on Windows).
# output_dir = "/absolute/path/to/debug"
# Output format for LLM request files:
#   "json"  — internal zeph-llm representation (default)
#   "raw"   — actual API payload (system extracted, content blocks, mirrors what is sent to the provider)
#   "trace" — OpenTelemetry-compatible OTLP JSON spans written to trace.json at session end
#             Use --dump-format trace on the CLI to override at runtime.
format = "json"

[debug.traces]
# OTLP gRPC endpoint for trace export (only used when format = "trace" and otel feature enabled).
# Falls back to observability.endpoint if unset.
otlp_endpoint = "http://localhost:4317"
# Service name reported to the OTel collector.
service_name = "zeph"
# Redact secrets and sensitive paths from span attributes (recommended).
redact = true

[tui]
# Show role prefix labels ([user], [zeph], etc.) in chat messages
show_source_labels = false

[acp]
# Auto-start ACP server on plain `zeph` startup using the configured transport (CLI flags override)
enabled = false
# Agent name advertised to IDE clients
agent_name = "zeph"
# Transport mode: "stdio" (default, for IDE embedding), "http", or "both"
transport = "stdio"
# Bind address for the HTTP transport
http_bind = "127.0.0.1:9800"
# Maximum number of concurrent ACP sessions (LRU eviction when exceeded)
max_sessions = 4
# Session idle timeout in seconds before eviction
session_idle_timeout_secs = 1800
# Reload/config broadcast backlog per ACP session fan-out
broadcast_capacity = 256
# Whether to serve the /.well-known/acp.json agent discovery manifest (HTTP/both only)
discovery_enabled = true
# LLM models advertised to the IDE for model switching: ["claude:claude-sonnet-4-5", "ollama:llama3"]
available_models = []

[acp.lsp]
# Enable LSP code intelligence extension when IDE advertises lsp capability
enabled = true
# Fetch diagnostics automatically when lsp/didSave notification is received
auto_diagnostics_on_save = true
# Maximum diagnostics to accept per file
max_diagnostics_per_file = 20
# Maximum files in diagnostics cache (LRU eviction)
max_diagnostic_files = 5
# Maximum reference locations returned
max_references = 100
# Maximum workspace symbol search results
max_workspace_symbols = 50
# Timeout in seconds for LSP extension method calls
request_timeout_secs = 10

[agents]
# Enable sub-agent spawning (required for /agent commands and multi-agent workflows)
enabled = false
# Maximum number of sub-agents that can run concurrently
max_concurrent = 1
# Allow sub-agents to use bypass_permissions mode (enable only in trusted environments)
allow_bypass_permissions = false
# Enable writing JSONL transcripts for sub-agent sessions (required for /agent resume)
transcript_enabled = true
# Maximum number of transcript files to retain (0 = unlimited)
transcript_max_files = 50

[orchestration]
# Enable the orchestration subsystem (/plan commands and task graph execution)
enabled = false
# Maximum number of tasks in a single plan graph
max_tasks = 20
# Maximum number of tasks that can run in parallel
max_parallel = 4
# Default failure strategy: "abort", "retry", "skip", or "ask"
default_failure_strategy = "abort"
# Default number of retries for the retry failure strategy
default_max_retries = 3
# Task execution timeout in seconds (0 = no timeout)
task_timeout_secs = 300
# Maximum tokens budget for planner LLM responses
planner_max_tokens = 4096
# Total character budget for cross-task dependency context injection
dependency_context_budget = 16384
# Show a confirmation prompt before executing a plan
confirm_before_execute = true
# Maximum tokens budget for aggregation LLM calls
aggregator_max_tokens = 4096
# Backoff in ms before retrying deferred tasks
deferral_backoff_ms = 250

[experiments]
# Enable the autonomous self-experimentation engine
enabled = false
# Maximum number of experiments to run in a single session
max_experiments = 20
# Maximum wall-clock time per experiment session in seconds
max_wall_time_secs = 3600
# Minimum relative improvement (%) required to keep an experiment result
min_improvement = 0.5
# Token budget for evaluation LLM calls
eval_budget_tokens = 100000
# Automatically apply improvements without confirmation
auto_apply = false

[experiments.schedule]
# Enable scheduled automatic experiment runs
enabled = false
# Cron expression for scheduled runs (default: 3am daily)
# NOTE(review): the [[scheduler.tasks]] examples above use 6-field seconds-first
# cron ("0 0 0 * * *"); if this subsystem shares that parser, "0 3 * * *" would be
# rejected or misread — confirm whether "0 0 3 * * *" is intended here.
cron = "0 3 * * *"
# Maximum experiments per scheduled run
max_experiments_per_run = 20
# Wall-time cap for a single scheduled session in seconds
max_wall_time_secs = 1800

[logging]
# Log file path (empty string disables file logging)
# Defaults to the user data dir (for example ~/.local/share/zeph/logs/zeph.log on Linux,
# ~/Library/Application Support/Zeph/logs/zeph.log on macOS,
# %LOCALAPPDATA%\Zeph\logs\zeph.log on Windows).
# file = "/absolute/path/to/zeph.log"
# Log level for the file sink: "trace", "debug", "info", "warn", "error"
level = "info"
# Rotation strategy: "daily", "hourly", or "never"
rotation = "daily"
# Maximum number of rotated log files to retain
max_files = 7

# Knowledge graph memory
# Extracts entities and relations from conversations into a persistent graph.
# WARNING: entity names and facts are stored verbatim without PII redaction.
# Do not enable when processing conversations with sensitive personal data.
# [memory.graph]
# enabled = false
# # LLM model used for entity/relation extraction (required when enabled)
# extract_model = "claude-sonnet-4-5-20250929"
# # Maximum entities extracted per message
# max_entities_per_message = 10
# # Maximum edges (relations) extracted per message
# max_edges_per_message = 15
# # Messages between community detection runs
# community_refresh_interval = 100
# # Cosine similarity threshold for entity deduplication (0.0-1.0)
# entity_similarity_threshold = 0.85
# # Use embedding-based entity resolution instead of name matching
# use_embedding_resolution = false
# # Ambiguity threshold for embedding resolution (0.0-1.0)
# entity_ambiguous_threshold = 0.70
# # Timeout in seconds for extraction LLM calls
# extraction_timeout_secs = 15
# # Maximum graph traversal depth for recall queries
# max_hops = 2
# # Maximum entities to return per recall query
# recall_limit = 10
# # Days to retain expired edges before deletion
# expired_edge_retention_days = 90
# # Maximum total entities in the graph (0 = unlimited)
# max_entities = 0
# # Temporal recency decay rate for graph recall scoring (1/day). 0.0 = disabled.
# temporal_decay_rate = 0.0

# ACON failure-driven compression guidelines
# Learns compression rules from detected context-loss events after hard compaction.
# Requires the `compression-guidelines` feature flag to be enabled at compile time.
# [memory.compression_guidelines]
# enabled = false
# # Minimum unused failure pairs before triggering a guidelines update
# update_threshold = 5
# # Maximum token budget for the guidelines document
# max_guidelines_tokens = 500
# # Maximum failure pairs consumed per update cycle
# max_pairs_per_update = 10
# # Number of turns after hard compaction to watch for context loss
# detection_window_turns = 10
# # Interval in seconds between background updater checks
# update_interval_secs = 300
# # Maximum unused failure pairs to retain (cleanup policy)
# max_stored_pairs = 100

# Compaction probe: validates summary quality before committing it (#1609).
# Generates factual questions from compacted messages, answers them from the summary,
# and scores accuracy. HardFail blocks compaction; SoftFail logs a warning.
# [memory.compression.probe]
# # Enable compaction probe validation
# enabled = false
# # Model for probe LLM calls (empty = same as summary provider)
# model = ""
# # Minimum score to pass (scores in [hard_fail_threshold, threshold) = SoftFail)
# threshold = 0.6
# # Score below this blocks compaction (HardFail)
# hard_fail_threshold = 0.35
# # Maximum number of probe questions to generate
# max_questions = 3
# # Timeout for the entire probe (both LLM calls) in seconds
# timeout_secs = 15