# opensourcellmrouter 0.2.4
#
# An async LLM proxy that routes requests across multiple providers via a
# configurable pipeline.
#
# opensourcellmrouter configuration
#
# Two local providers:
#   local-llama — llama.cpp server on :8080 (OpenAI-compatible)
#   ollama      — Ollama on :11434 (native Ollama API)
#
# Pipeline: classifiers tag the request → routers pick a provider → logging

# Listener for the proxy. "0.0.0.0" accepts connections on every network
# interface, not only loopback.
# NOTE(review): with `dashboard = true` this may expose the dashboard to the
# local network — confirm that is intended, or bind to "127.0.0.1".
[server]
host = "0.0.0.0"
port = 8090
dashboard = true

# Request log, the last stage of the pipeline. The .jsonl extension suggests
# one JSON record per line; the relative path is presumably resolved against
# the process working directory — confirm.
[logging]
enabled = true
path = "logs/requests.jsonl"

# ── providers ─────────────────────────────────────────────────────────────────

# llama.cpp server on :8080, addressed through its OpenAI-compatible API
# (hence the /v1 suffix on base_url).
# NOTE(review): the cost/quality/latency/throughput figures are presumably the
# inputs to provider scoring (e.g. the fallback router's quality_bias) — confirm.
[[providers]]
name = "local-llama"
format = "openai"
base_url = "http://localhost:8080/v1"
cost_per_1m_tokens = 0.0
quality = 60
latency_ms = 900
throughput_tokens_per_sec = 20

# Ollama on :11434 through its native API, so base_url carries no /v1 suffix.
# At startup the "discover" router rule queries this provider's /api/tags
# endpoint to learn which models are pulled (llama3.1:8b, deepseek-r1:latest,
# gemma3:latest, etc.).
[[providers]]
name = "ollama"
format = "ollama"
base_url = "http://localhost:11434"
cost_per_1m_tokens = 0.0
quality = 75
latency_ms = 600
throughput_tokens_per_sec = 30

# ── classifiers ───────────────────────────────────────────────────────────────

# Keyword classifier. Each key under [classifiers.keyword.tags] is a tag name
# and its array lists the keywords that trigger it; the `type = "tag"` routers
# match on these tag names.
# NOTE(review): the matching rules (case sensitivity, substring vs. whole word)
# are not visible here. The trailing spaces in "def " and "fn " suggest
# substring matching — confirm, and keep those spaces intact.
[classifiers.keyword]
enabled = true

[classifiers.keyword.tags]
# Mentions of images/photos; handled by the `tag = "vision"` router.
vision = [
    "image",
    "photo",
    "picture",
    "screenshot",
    "visual",
    "diagram",
    "chart",
]
# References to video content; handled by the `tag = "video"` router.
video = [
    "video",
    "clip",
    "footage",
    "frame",
    "timestamp",
]
# Code-heavy requests; handled by the `tag = "code"` router (deepseek-r1,
# strong at reasoning/code).
code = [
    "function",
    "class",
    "import",
    "def ",
    "fn ",
    "bug",
    "error",
    "stack trace",
    "compile",
    "runtime",
    "algorithm",
    "refactor",
    "debug",
]
# Adult/explicit content; kept on local-llama (private, no content policy).
nsfw = [
    "nsfw",
    "adult",
    "explicit",
    "erotic",
    "nude",
    "naked",
    "sexual",
    "xxx",
    "porn",
    "hentai",
    "fetish",
    "lewd",
]

# ── routers (first match wins) ────────────────────────────────────────────────

# "local/..." always goes to llama.cpp regardless of content.
[[routers]]
type = "prefix"
model_prefix = "local/"
provider = "local-llama"
# NOTE(review): presumably the name of the model llama.cpp has loaded — confirm.
rewrite_model = "llama3.2-3b"

# Requests tagged "nsfw" stay on local-llama (kept off cloud providers, no
# content policy). Point rewrite_model at whichever GGUF is loaded for this
# purpose.
[[routers]]
type = "tag"
tag = "nsfw"
provider = "local-llama"
rewrite_model = "llama3.2-3b"

# Requests tagged "code" → deepseek-r1 on Ollama (strong at step-by-step
# reasoning and code).
[[routers]]
type = "tag"
tag = "code"
provider = "ollama"
rewrite_model = "deepseek-r1:latest"

# Vision → gemma3:latest. The keyword classifier promises a vision-capable
# model for this tag, but llama3.1:8b is text-only in Ollama and cannot accept
# image input. gemma3 (4b and larger; `latest` resolves to 4b in the Ollama
# library at the time of writing) is multimodal.
# Requires the model to be pulled on this host (`ollama pull gemma3`).
# The "video" rule that follows still targets llama3.1:8b.
[[routers]]
type = "tag"
tag = "vision"
provider = "ollama"
rewrite_model = "gemma3:latest"

# Requests tagged "video" → llama3.1:8b on Ollama.
# NOTE(review): llama3.1:8b is presumably text-only, so this rule can discuss
# video but not ingest it — confirm that is the intent.
[[routers]]
type = "tag"
tag = "video"
provider = "ollama"
rewrite_model = "llama3.1:8b"

# Requests whose model name Ollama actually has pulled are routed there
# directly. This covers llama3.1:8b, deepseek-r1:latest, gemma3:latest,
# gpt-oss:latest, gpt-oss:20b, phi3:mini, carter-hire:latest, etc.
# Because the first matching rule wins, the prefix and tag rules listed before
# this one are presumably consulted first, so a tagged request is rewritten by
# its tag rule even when it names a pulled model — confirm.
[[routers]]
type = "discover"
provider = "ollama"

# Catch-all for model names no earlier rule recognised (e.g. "gpt-4",
# "claude-3"): the model is rewritten to llama3.1:8b and the request goes to
# the highest-quality local provider (ollama).
# NOTE(review): "llama3.1:8b" is an Ollama tag, while the other local-llama
# rules rewrite to "llama3.2-3b" — check what happens if local-llama is ever
# selected from `providers`. The exact meaning of quality_bias is not visible
# here; presumably it weights quality against the other provider metrics.
[[routers]]
type = "fallback"
providers = ["local-llama", "ollama"]
quality_bias = 0.7
rewrite_model = "llama3.1:8b"

# ── plugins ───────────────────────────────────────────────────────────────────

[plugins.response-healing]
enabled = true

# Currently switched off (`enabled = false`); default_tier and the tier table
# presumably only take effect once it is enabled — confirm.
[plugins.pareto-router]
enabled = false
default_tier = "medium"

# Each tier lists provider names as declared under [[providers]].
[plugins.pareto-router.tiers]
low = ["local-llama"]
medium = ["ollama"]
high = ["ollama"]