# vectorless 0.1.16
#
# Hierarchical, reasoning-native document intelligence engine
# Documentation
# Vectorless Configuration Example
# Copy this file to vectorless.toml and fill in your API keys
#
# All configuration is loaded from this file only.
# No environment variables are used — this ensures explicit, traceable configuration.

# ============================================================================
# LLM Configuration (Unified)
# ============================================================================
#
# The LLM pool allows configuring different models for different purposes:
# - summary: Used for generating document summaries during indexing
# - retrieval: Used for retrieval decisions and content evaluation
# - pilot: Used for intelligent navigation guidance
#
# Each client can have its own model, endpoint, and settings.

[llm]
# Default API key (used by all clients unless overridden per-client)
api_key = "sk-your-api-key-here"

# Summary client - generates document summaries during indexing
# Use a fast, cheap model for bulk processing
[llm.summary]
model = "gpt-4o-mini"
endpoint = "https://api.openai.com/v1"
max_tokens = 200
temperature = 0.0
# api_key = "sk-specific-key-for-summary"  # Optional: override default

# Retrieval client - used for retrieval decisions and content evaluation
# Can use a more capable model for better decisions
[llm.retrieval]
model = "gpt-4o"
endpoint = "https://api.openai.com/v1"
max_tokens = 100
temperature = 0.0
# api_key = "sk-specific-key-for-retrieval"  # Optional: override default

# Pilot client - used for intelligent navigation guidance
# Use a fast model for quick navigation decisions
[llm.pilot]
model = "gpt-4o-mini"
endpoint = "https://api.openai.com/v1"
max_tokens = 300
temperature = 0.0
# api_key = "sk-specific-key-for-pilot"  # Optional: override default

# Retry configuration (applies to all LLM calls)
[llm.retry]
max_attempts = 3
initial_delay_ms = 500
max_delay_ms = 30000
multiplier = 2.0
retry_on_rate_limit = true

# Throttle/rate limiting configuration (applies to all LLM calls)
[llm.throttle]
max_concurrent_requests = 10
requests_per_minute = 500
enabled = true
semaphore_enabled = true

# Fallback configuration (applies to all LLM calls)
[llm.fallback]
enabled = true
models = ["gpt-4o-mini", "glm-4-flash"]
# Alternative endpoints for fallback
# endpoints = [
#     "https://api.openai.com/v1",
#     "https://api.z.ai/api/paas/v4"
# ]
on_rate_limit = "retry_then_fallback"
on_timeout = "retry_then_fallback"
on_all_failed = "return_error"

# ============================================================================
# Metrics Configuration (Unified)
# ============================================================================

[metrics]
enabled = true
storage_path = "./workspace/metrics"
retention_days = 30

[metrics.llm]
track_tokens = true
track_latency = true
track_cost = true
cost_per_1k_input_tokens = 0.00015   # gpt-4o-mini pricing
cost_per_1k_output_tokens = 0.0006

[metrics.pilot]
track_decisions = true
track_accuracy = true
track_feedback = true

[metrics.retrieval]
track_paths = true
track_scores = true
track_iterations = true
track_cache = true

# ============================================================================
# Pilot Configuration
# ============================================================================

[pilot]
mode = "Balanced"  # Aggressive | Balanced | Conservative | AlgorithmOnly
guide_at_start = true
guide_at_backtrack = true

[pilot.budget]
max_tokens_per_query = 2000
max_tokens_per_call = 500
max_calls_per_query = 5
max_calls_per_level = 2
hard_limit = true

[pilot.intervention]
fork_threshold = 3
score_gap_threshold = 0.15
low_score_threshold = 0.3
max_interventions_per_level = 2

[pilot.feedback]
enabled = true
storage_path = "./workspace/feedback"
learning_rate = 0.1
min_samples_for_learning = 10

# ============================================================================
# Retrieval Configuration
# ============================================================================

[retrieval]
model = "gpt-4o"
endpoint = "https://api.openai.com/v1"
top_k = 3
max_tokens = 1000
temperature = 0.0

[retrieval.search]
top_k = 5
beam_width = 3
max_iterations = 10
min_score = 0.1

[retrieval.sufficiency]
min_tokens = 500
target_tokens = 2000
max_tokens = 4000
min_content_length = 200
confidence_threshold = 0.7

[retrieval.cache]
max_entries = 1000
ttl_secs = 3600

[retrieval.strategy]
exploration_weight = 1.414
similarity_threshold = 0.5
high_similarity_threshold = 0.8
low_similarity_threshold = 0.3

[retrieval.content]
enabled = true
token_budget = 4000
min_relevance_score = 0.2
scoring_strategy = "hybrid"  # keyword | bm25 | hybrid
output_format = "markdown"
include_scores = false
hierarchical_min_per_level = 0.1
deduplicate = true
dedup_threshold = 0.9

# ============================================================================
# Multi-turn Retrieval Configuration
# ============================================================================

[retrieval.multiturn]
enabled = true
max_sub_queries = 3
decomposition_model = "gpt-4o-mini"
aggregation_strategy = "merge"  # merge | rank | synthesize

# ============================================================================
# Reference Following Configuration
# ============================================================================

[retrieval.reference]
enabled = true
max_depth = 3
max_references = 10
follow_pages = true
follow_tables_figures = true
min_confidence = 0.5

# ============================================================================
# Storage Configuration
# ============================================================================

[storage]
workspace_dir = "./workspace"
cache_size = 100
atomic_writes = true
file_lock = true
checksum_enabled = true

[storage.compression]
enabled = false
algorithm = "gzip"
level = 6

# ============================================================================
# Indexer Configuration
# ============================================================================

[indexer]
subsection_threshold = 300
max_segment_tokens = 3000
max_summary_tokens = 200
min_summary_tokens = 20