# Octoroute Configuration
#
# This file configures the HTTP server, model endpoints, routing strategy,
# and observability settings for Octoroute.
[server]
host = "0.0.0.0"
port = 3000
# Used as the default timeout for all tiers unless overridden below.
# NOTE(review): table/key names reconstructed from the
# "server.request_timeout_seconds" reference in this file's trailing
# comments — verify against Octoroute's config schema.
request_timeout_seconds = 30
# Fast model tier - for simple tasks, casual chat, quick Q&A
# Multi-model support: Multiple endpoints for load balancing and failover
# Phase 1: Simple round-robin selection
# Phase 2: Weighted load balancing with health checks
# NOTE(review): table and key names reconstructed from surrounding comments
# (tier names "fast"/"balanced"/"deep" come from the comments below) —
# verify field names against Octoroute's config schema.
[[models.fast]]
name = "qwen/qwen3-vl-8b"
base_url = "http://192.168.1.67:1234/v1"
max_tokens = 4096
temperature = 0.7
weight = 1.0 # Load balancing weight (Phase 2)
priority = 1 # Higher priority = tried first (Phase 2)

[[models.fast]]
name = "qwen/qwen3-vl-8b"
base_url = "http://192.168.1.72:1234/v1"
max_tokens = 4096
temperature = 0.7
weight = 1.0
priority = 1

# Balanced model tier - for coding, analysis, explanations
[[models.balanced]]
name = "qwen/qwen3-30b-a3b-2507"
base_url = "http://192.168.1.61:1234/v1"
max_tokens = 8192
temperature = 0.7
weight = 1.0
priority = 1

# Deep model tier - for complex reasoning, creative writing, research
[[models.deep]]
name = "/home/steve/dev/llama.cpp/models/gpt-oss-120b-mxfp4.gguf"
base_url = "https://strix-ai.localbrandonfamily.com/v1"
max_tokens = 16384
temperature = 0.7
weight = 1.0
priority = 1
[routing]
# Strategy options: "rule", "llm", "hybrid"
# - rule: Fast pattern-based routing (no LLM overhead)
# - llm: Intelligent routing using router tier model
# - hybrid: Rule-based first, LLM fallback (recommended)
strategy = "hybrid"
# Default importance level if not specified in request
default_importance = "normal"
# Which model tier to use for LLM-based routing decisions
# NOTE(review): key name reconstructed — the value "balanced" refers to a
# model tier; confirm the exact key against Octoroute's config schema.
router_tier = "balanced"
[observability]
# Log level: "trace", "debug", "info", "warn", "error"
# NOTE(review): key name reconstructed from the comment above — verify
# against Octoroute's config schema.
log_level = "info"
# Note: Prometheus metrics are always enabled at /metrics on the main server port
# See README.md for security recommendations (nginx reverse proxy, firewall rules)
# Per-tier timeout overrides (optional), in seconds
# If not specified, server.request_timeout_seconds is used
# NOTE(review): table name reconstructed; the three keys mirror the model
# tier names above — verify against Octoroute's config schema.
[tier_timeouts]
fast = 15 # 8B models respond quickly
balanced = 30 # 30B models moderate speed
deep = 60 # 120B models need more time