# Vectorless Configuration Example
# Copy this file to config.toml and fill in your API keys
#
# All configuration is loaded from this file only.
# No environment variables are used - this ensures explicit, traceable configuration.

# NOTE(review): key names below were reconstructed from the comments after the
# originals were lost in transit — confirm against the consuming application.
[indexing]
# Word count threshold for splitting sections into subsections
split_word_threshold = 300
# Maximum tokens to send in a single segmentation request
max_segmentation_tokens = 3000
# Maximum tokens for each summary
max_summary_tokens = 200
# Minimum content tokens required to generate a summary
min_summary_tokens = 20

[summary]
# API key - get from your provider
# api_key = "sk-..."
# API endpoint
# OpenAI: https://api.openai.com/v1
# ZAI General: https://api.z.ai/api/paas/v4
# ZAI Coding: https://api.z.ai/api/coding/paas/v4
base_url = "https://api.openai.com/v1"
# Model for summarization (use cheaper models for indexing)
model = "gpt-4o-mini"
# Maximum tokens for summary generation
max_tokens = 200
# Temperature for summary generation
temperature = 0.0

[retrieval]
# API key (optional, defaults to summary.api_key)
# api_key = "sk-..."
# API endpoint for retrieval
base_url = "https://api.openai.com/v1"
# Model for retrieval navigation (use smarter models for better results)
model = "gpt-4o"
# Number of top results to return
top_k = 3
# Maximum tokens for retrieval context
max_tokens = 1000
# Temperature for retrieval
temperature = 0.0

# Search algorithm configuration
[search]
# Number of top-k results to return
top_k = 5
# Beam width for multi-path search
beam_width = 3
# Maximum iterations for search algorithms
max_iterations = 10
# Minimum score to include a path
min_path_score = 0.1

# Sufficiency checker configuration
[sufficiency]
# Minimum tokens for sufficiency
min_tokens = 500
# Target tokens for full sufficiency
target_tokens = 2000
# Maximum tokens before stopping
max_tokens = 4000
# Minimum content length (characters)
min_content_length = 200
# Confidence threshold for LLM judge
confidence_threshold = 0.7

# Cache configuration
[cache]
# Maximum number of cache entries
max_entries = 1000
# Time-to-live for cache entries (seconds)
ttl_seconds = 3600

# Strategy-specific configuration
[strategy]
# MCTS exploration weight (sqrt(2) ≈ 1.414)
mcts_exploration_weight = 1.414
# Semantic similarity threshold
similarity_threshold = 0.5
# High similarity threshold for "answer" decision
answer_threshold = 0.8
# Low similarity threshold for "explore" decision
explore_threshold = 0.3

# Content aggregator configuration
# Controls how retrieved content is aggregated and returned
[aggregator]
# Enable/disable content aggregator
# When disabled, uses simple content collection (legacy behavior)
enabled = true
# Maximum tokens for aggregated content
max_tokens = 4000
# Minimum relevance score threshold (0.0 - 1.0)
# Content below this threshold will be filtered out
min_relevance_score = 0.2
# Scoring strategy: "keyword_only" | "keyword_bm25" | "hybrid"
# - keyword_only: Fast keyword matching (no BM25)
# - keyword_bm25: Keyword + BM25 scoring (recommended)
# - hybrid: Keyword + LLM reranking (most accurate, slower)
scoring_strategy = "keyword_bm25"
# Output format: "markdown" | "json" | "tree" | "flat"
# - markdown: Structured markdown with headers (default)
# - json: JSON format for programmatic use
# - tree: Tree structure preserving hierarchy
# - flat: Flat text format
output_format = "markdown"
# Include relevance scores in output (useful for debugging)
include_scores = false
# Minimum budget allocation per depth level (0.0 - 1.0)
# Ensures each tree level gets representation
min_depth_budget = 0.1
# Enable content deduplication
deduplicate = true
# Similarity threshold for deduplication (0.0 - 1.0)
# Higher = more aggressive deduplication
dedup_threshold = 0.9

[workspace]
# Workspace directory for persisted documents
#
# Structure:
# workspace/
# ├── _meta.json # Lightweight index
# ├── {doc_id_1}.json # Document 1
# └── {doc_id_2}.json # Document 2
path = "./workspace"

[concurrency]
# Maximum concurrent LLM API calls
# This limits how many requests can be in-flight at the same time
max_concurrent_requests = 10
# Rate limit: requests per minute
# This is a soft limit using token bucket algorithm
requests_per_minute = 500
# Enable rate limiting (token bucket)
enable_rate_limit = true
# Enable semaphore-based concurrency limiting
enable_semaphore = true

[fallback]
# Enable graceful degradation when LLM calls fail
enabled = true
# Fallback models in priority order
# When primary model fails, system tries these in order
models = ["gpt-4o-mini", "glm-4-flash"]
# Fallback endpoints (optional)
# When primary endpoint fails, system tries these in order
# endpoints = [
#     "https://api.openai.com/v1",
#     "https://api.z.ai/api/paas/v4"
# ]
# Behavior on rate limit error (429)
# Options: retry, fallback, retry_then_fallback, fail
on_rate_limit = "retry_then_fallback"
# Behavior on timeout error
# Options: retry, fallback, retry_then_fallback, fail
on_timeout = "retry_then_fallback"
# Behavior when all attempts fail
# Options: return_error, return_cache
on_failure = "return_error"