# Vectorless Configuration Example
# Copy this file to config.toml and fill in your API keys
#
# All configuration is loaded from this file only.
# No environment variables are used - this ensures explicit, traceable configuration.
#
# NOTE(review): section and key names below were reconstructed from the
# surrounding comments after the original identifiers were lost; verify each
# name against the application's config loader before relying on this file.
[segmentation]
# Word count threshold for splitting sections into subsections
word_count_threshold = 300
# Maximum tokens to send in a single segmentation request
max_request_tokens = 3000
# Maximum tokens for each summary
max_summary_tokens = 200
# Minimum content tokens required to generate a summary
min_content_tokens = 20
[summary]
# API key - get from your provider
# api_key = "sk-..."
# API endpoint
# OpenAI: https://api.openai.com/v1
# ZAI General: https://api.z.ai/api/paas/v4
# ZAI Coding: https://api.z.ai/api/coding/paas/v4
base_url = "https://api.openai.com/v1"
# Model for summarization (use cheaper models for indexing)
model = "gpt-4o-mini"
# Maximum tokens for summary generation
max_tokens = 200
# Temperature for summary generation
temperature = 0.0
[retrieval]
# API key (optional, defaults to summary.api_key)
# api_key = "sk-..."
# API endpoint for retrieval
base_url = "https://api.openai.com/v1"
# Model for retrieval navigation (use smarter models for better results)
model = "gpt-4o"
# Number of top results to return
top_k = 3
# Maximum tokens for retrieval context
max_context_tokens = 1000
# Temperature for retrieval
temperature = 0.0
# Search algorithm configuration
[search]
# Number of top-k results to return
top_k = 5
# Beam width for multi-path search
beam_width = 3
# Maximum iterations for search algorithms
max_iterations = 10
# Minimum score to include a path
min_path_score = 0.1
# Sufficiency checker configuration
[sufficiency]
# Minimum tokens for sufficiency
min_tokens = 500
# Target tokens for full sufficiency
target_tokens = 2000
# Maximum tokens before stopping
max_tokens = 4000
# Minimum content length (characters)
min_content_length = 200
# Confidence threshold for LLM judge
confidence_threshold = 0.7
# Cache configuration
[cache]
# Maximum number of cache entries
max_entries = 1000
# Time-to-live for cache entries (seconds)
ttl_seconds = 3600
# Strategy-specific configuration
[strategy]
# MCTS exploration weight (sqrt(2) ≈ 1.414)
mcts_exploration_weight = 1.414
# Semantic similarity threshold
similarity_threshold = 0.5
# High similarity threshold for "answer" decision
high_similarity_threshold = 0.8
# Low similarity threshold for "explore" decision
low_similarity_threshold = 0.3
[storage]
# Workspace directory for persisted documents
#
# Structure:
#   workspace/
#   ├── _meta.json        # Lightweight index
#   ├── {doc_id_1}.json   # Document 1
#   └── {doc_id_2}.json   # Document 2
workspace_dir = "./workspace"
[concurrency]
# Maximum concurrent LLM API calls
# This limits how many requests can be in-flight at the same time
max_concurrent_requests = 10
# Rate limit: requests per minute
# This is a soft limit using token bucket algorithm
requests_per_minute = 500
# Enable rate limiting (token bucket)
enable_rate_limit = true
# Enable semaphore-based concurrency limiting
enable_semaphore = true
[fallback]
# Enable graceful degradation when LLM calls fail
enabled = true
# Fallback models in priority order
# When primary model fails, system tries these in order
models = ["gpt-4o-mini", "glm-4-flash"]
# Fallback endpoints (optional)
# When primary endpoint fails, system tries these in order
# endpoints = [
#     "https://api.openai.com/v1",
#     "https://api.z.ai/api/paas/v4"
# ]
# Behavior on rate limit error (429)
# Options: retry, fallback, retry_then_fallback, fail
on_rate_limit = "retry_then_fallback"
# Behavior on timeout error
# Options: retry, fallback, retry_then_fallback, fail
on_timeout = "retry_then_fallback"
# Behavior when all attempts fail
# Options: return_error, return_cache
on_failure = "return_error"