# CLI Engineer Configuration File
#
# API keys are stored in environment variables:
# - OPENROUTER_API_KEY for OpenRouter
# - OPENAI_API_KEY for OpenAI
# - ANTHROPIC_API_KEY for Anthropic
# - Ollama runs locally and doesn't require an API key
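#
# For example, set the key in your shell before launching (placeholder value, substitute your real key):
#   export OPENROUTER_API_KEY="your-key-here"
#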
# First, set enabled = true on the provider you want to use.
# Then, uncomment the model you want to use.

# OPENAI MODELS (require OPENAI_API_KEY):
[openai]
enabled = false
temperature = 1
= "gpt-4.1" # Flagship GPT model for coding tasks
= 2.00
= 8.00
= 128000
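#
# For reference, at these rates a request with 10,000 input and 2,000 output
# tokens costs about (10,000 / 1M) * $2.00 + (2,000 / 1M) * $8.00 ≈ $0.036.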
# model = "gpt-o4-mini" # Faster, more affordable reasoning model
# cost_per_1m_input_tokens = 1.10
# cost_per_1m_output_tokens = 4.40
# max_tokens = 128000
# model = "gpt-o3" # Most powerful reasoning model
# cost_per_1m_input_tokens = 10.00
# cost_per_1m_output_tokens = 40.00
# max_tokens = 128000

# CLAUDE MODELS (require ANTHROPIC_API_KEY):
[anthropic]
enabled = false
temperature = 0.7
= "claude-sonnet-4-0" # Highly capable model
= 3.00
= 15.00
= 200000
# model = "claude-opus-4-0" # Most capable model
# cost_per_1m_input_tokens = 15.00
# cost_per_1m_output_tokens = 75.00
# max_tokens = 200000

# COST-EFFECTIVE CLOUD OPTIONS (require OPENROUTER_API_KEY):
[openrouter]
enabled = true
temperature = 0.2
# deepseek/deepseek-r1-0528-qwen3-8b - Advanced reasoning in a small, affordable model
# model = "deepseek/deepseek-r1-0528-qwen3-8b"
# cost_per_1m_input_tokens = 0.06
# cost_per_1m_output_tokens = 0.09
# max_tokens = 65536
# model = "inception/mercury-coder-small-beta" # - Lightning-fast, diffusion coding model
# cost_per_1m_input_tokens = 0.25
# cost_per_1m_output_tokens = 1.00
# max_tokens = 32768
= "qwen/qwen3-235b-a22b" # - Powerful, affordable reasoning model
= 0.13
= 0.85
= 41000
# model = "microsoft/phi-4-reasoning-plus" # Efficient general purpose
# cost_per_1m_input_tokens = 0.07
# cost_per_1m_output_tokens = 0.35
# max_tokens = 33000
# google/gemini-2.5-pro-preview - Ranked first on the LMArena leaderboard
# model = "google/gemini-2.5-pro-preview"
# cost_per_1m_input_tokens = 1.25
# cost_per_1m_output_tokens = 10.00
# max_tokens = 8192

# Ollama - Local LLM inference (no API key required)
# Install: curl -fsSL https://ollama.ai/install.sh | sh
# Pull model: ollama pull <model_name>
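#   e.g. for the default model configured below: ollama pull deepseek-r1:8b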
#
# CONSUMER GPU RECOMMENDATIONS (4B-14B parameters):
#
# For 8GB VRAM (GTX 1070, RTX 3060):
# qwen3:4b, gemma3:4b
#
# For 12GB VRAM (RTX 3060 Ti, RTX 4060 Ti):
# qwen3:8b, deepseek-r1:8b
#
# For 16GB+ VRAM (RTX 3080, RTX 4070 Ti):
# qwen3:14b, gemma3:12b, phi4:14b

[ollama]
enabled = false
temperature = 0.7
= "http://localhost:11434"
# RECOMMENDED MODELS:
= "deepseek-r1:8b" # Updated R1 reasoning and Qwen 3 model: DeepSeek-R1-0528-Qwen3-8B
= 128000
# qwen3:4b - Excellent general performance, low VRAM
# model = "qwen3:4b"
# max_tokens = 40000
# qwen3:14b - High performance, requires more VRAM
# model = "qwen3:14b"
# max_tokens = 40000
# deepseek-r1:7b - Compact reasoning (older variant)
# model = "deepseek-r1:7b"
# max_tokens = 128000
# model = "phi4-14b" - Microsoft's open source reasoning model
# max_tokens = 16384
# gemma3:4b - Google's compact model
# model = "gemma3:4b"
# max_tokens = 128000
# gemma3:12b - Stronger performance
# model = "gemma3:12b"
# max_tokens = 128000

[agent]
# Maximum iterations for the agentic loop
max_iterations = 3
# Enable parallel task execution
parallel_execution = true
# Artifact directory
artifact_dir = "./artifacts"
# Isolated execution environment
sandbox = false
# Cleanup artifacts on exit
cleanup_on_exit = false
# Disable automatic git repository initialization unless explicitly requested
disable_git_init = true

[ui]
# Enable colorful terminal output
color = true
# Show progress bars
progress_bars = true
# Show real-time metrics
show_metrics = true
# Output format: "terminal", "json", or "plain"
format = "terminal"

[context]
# Fallback maximum tokens (the actual model context size is used when available)
max_tokens = 65536
# Compression threshold (0.0 to 1.0)
compression_threshold = 0.6
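# For example, assuming the threshold is applied as a fraction of max_tokens,
# 0.6 * 65536 ≈ 39,000 tokens accumulate before compression kicks in.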
# Enable context caching
cache_enabled = true