# Vectorless Configuration Example
# Copy this file to config.toml and fill in your API keys
#
# All configuration is loaded from this file only.
# No environment variables are used - this ensures explicit, traceable configuration.

# NOTE(review): key names below were reconstructed from the comments after the
# originals were lost in transit — confirm against the consuming application.
[indexing]
# Word count threshold for splitting sections into subsections
split_word_threshold = 300
# Maximum tokens to send in a single segmentation request
max_segmentation_tokens = 3000
# Maximum tokens for each summary
max_summary_tokens = 200
# Minimum content tokens required to generate a summary
min_summary_tokens = 20

[summary]
# API key - get from your provider
# api_key = "sk-..."
# API endpoint
# OpenAI: https://api.openai.com/v1
# ZAI General: https://api.z.ai/api/paas/v4
# ZAI Coding: https://api.z.ai/api/coding/paas/v4
base_url = "https://api.openai.com/v1"
# Model for summarization (use cheaper models for indexing)
model = "gpt-4o-mini"
# Maximum tokens for summary generation
max_tokens = 200
# Temperature for summary generation
temperature = 0.0

[retrieval]
# API key (optional, defaults to summary.api_key)
# api_key = "sk-..."
# API endpoint for retrieval
base_url = "https://api.openai.com/v1"
# Model for retrieval navigation (use smarter models for better results)
model = "gpt-4o"
# Number of top results to return
top_k = 3
# Maximum tokens for retrieval context
max_tokens = 1000
# Temperature for retrieval
temperature = 0.0

# Search algorithm configuration
[search]
# Number of top-k results to return
top_k = 5
# Beam width for multi-path search
beam_width = 3
# Maximum iterations for search algorithms
max_iterations = 10
# Minimum score to include a path
min_path_score = 0.1

# Sufficiency checker configuration
[sufficiency]
# Minimum tokens for sufficiency
min_tokens = 500
# Target tokens for full sufficiency
target_tokens = 2000
# Maximum tokens before stopping
max_tokens = 4000
# Minimum content length (characters)
min_content_length = 200
# Confidence threshold for LLM judge
confidence_threshold = 0.7

# Cache configuration
[cache]
# Maximum number of cache entries
max_entries = 1000
# Time-to-live for cache entries (seconds)
ttl_seconds = 3600

# Strategy-specific configuration
[strategy]
# MCTS exploration weight (sqrt(2) ≈ 1.414)
mcts_exploration_weight = 1.414
# Semantic similarity threshold
similarity_threshold = 0.5
# High similarity threshold for "answer" decision
answer_threshold = 0.8
# Low similarity threshold for "explore" decision
explore_threshold = 0.3

# Content aggregator configuration
# Controls how retrieved content is aggregated and returned
[aggregator]
# Enable/disable content aggregator
# When disabled, uses simple content collection (legacy behavior)
enabled = true
# Maximum tokens for aggregated content
max_tokens = 4000
# Minimum relevance score threshold (0.0 - 1.0)
# Content below this threshold will be filtered out
min_relevance_score = 0.2
# Scoring strategy: "keyword_only" | "keyword_bm25" | "hybrid"
# - keyword_only: Fast keyword matching (no BM25)
# - keyword_bm25: Keyword + BM25 scoring (recommended)
# - hybrid: Keyword + LLM reranking (most accurate, slower)
scoring_strategy = "keyword_bm25"
# Output format: "markdown" | "json" | "tree" | "flat"
# - markdown: Structured markdown with headers (default)
# - json: JSON format for programmatic use
# - tree: Tree structure preserving hierarchy
# - flat: Flat text format
output_format = "markdown"
# Include relevance scores in output (useful for debugging)
include_scores = false
# Minimum budget allocation per depth level (0.0 - 1.0)
# Ensures each tree level gets representation
min_depth_budget = 0.1
# Enable content deduplication
deduplicate = true
# Similarity threshold for deduplication (0.0 - 1.0)
# Higher = more aggressive deduplication
dedup_threshold = 0.9

[workspace]
# Workspace directory for persisted documents
#
# Structure:
# workspace/
# ├── _meta.json # Lightweight index
# ├── {doc_id_1}.json # Document 1
# └── {doc_id_2}.json # Document 2
path = "./workspace"

[concurrency]
# Maximum concurrent LLM API calls
# This limits how many requests can be in-flight at the same time
max_concurrent_requests = 10
# Rate limit: requests per minute
# This is a soft limit using token bucket algorithm
requests_per_minute = 500
# Enable rate limiting (token bucket)
enable_rate_limit = true
# Enable semaphore-based concurrency limiting
enable_semaphore = true

[fallback]
# Enable graceful degradation when LLM calls fail
enabled = true
# Fallback models in priority order
# When primary model fails, system tries these in order
models = ["gpt-4o-mini", "glm-4-flash"]
# Fallback endpoints (optional)
# When primary endpoint fails, system tries these in order
# endpoints = [
#     "https://api.openai.com/v1",
#     "https://api.z.ai/api/paas/v4"
# ]
# Behavior on rate limit error (429)
# Options: retry, fallback, retry_then_fallback, fail
on_rate_limit = "retry_then_fallback"
# Behavior on timeout error
# Options: retry, fallback, retry_then_fallback, fail
on_timeout = "retry_then_fallback"
# Behavior when all attempts fail
# Options: return_error, return_cache
on_failure = "return_error"