1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
-- test_anthropic_compile_loop_multi_lazy_load.lua
--
-- Mock LLM e2e: lazy load path verification (10 iters, <=24K char per iter).
--
-- Runs in two environments:
-- 1. mcp__lua-debugger__test_launch (mlua-lspec): uses describe/it/expect.
-- mock globals (log, std, tool, agent) are injected at the top.
-- 2. agent-block mlua host: log/std/tool/agent are real injected globals.
-- The `lust` detection guard skips mock injection.
--
-- Does NOT call the real Anthropic API (offline / no cost).
-- Acceptance criteria:
-- (alpha) read_file tool dispatched >= 3 times
-- (beta) max_iters=10 completes or stagnation give-up
-- (gamma) all iter messages[] sizes <= 24,000 chars
-- ── Environment detection + mock globals for test_launch ─────────────────────
-- In test_launch, `lust` is pre-loaded. In the mlua host it is not.
-- True only when the `lust` global is a table; every mock below is installed
-- solely in that case.
local in_test_framework = type(lust) == "table"

if in_test_framework then
  -- Render one field value for the mock encoder. Booleans and numbers are
  -- written bare, strings are quoted with `"` escaped, and anything else
  -- (including nested tables) is written as its quoted tostring() form.
  local function encode_value(v)
    local kind = type(v)
    if kind == "boolean" or kind == "number" then
      return tostring(v)
    end
    if kind == "string" then
      local escaped = v:gsub('"', '\\"')
      return '"' .. escaped .. '"'
    end
    if kind == "nil" then
      -- pairs() never yields a nil value; kept for parity with the original.
      return "null"
    end
    return '"' .. tostring(v) .. '"'
  end

  -- Minimal JSON encoder sufficient for this test: a flat, single-level
  -- object whose field order follows pairs(). Non-table input is returned
  -- as tostring(input).
  local function mock_json_encode(t)
    if type(t) ~= "table" then
      return tostring(t)
    end
    local fields = {}
    for key, value in pairs(t) do
      local rendered = encode_value(value)
      fields[#fields + 1] = '"' .. tostring(key) .. '":' .. rendered
    end
    return "{" .. table.concat(fields, ",") .. "}"
  end

  -- Minimal decoder: pattern-matches the ok, iters, failure_reason and
  -- summary fields out of the text. Non-string input decodes to an empty
  -- table; fields that are not found are simply absent.
  local function mock_json_decode(s)
    if type(s) ~= "string" then
      return {}
    end
    local decoded = {}

    local ok_text = s:match('"ok"%s*:%s*(true)') or s:match('"ok"%s*:%s*(false)')
    if ok_text ~= nil then
      decoded.ok = (ok_text == "true")
    end

    local iters_text = s:match('"iters"%s*:%s*(%d+)')
    if iters_text then
      decoded.iters = tonumber(iters_text)
    end

    -- Assigning nil leaves the key absent, so no guard is needed here.
    decoded.failure_reason = s:match('"failure_reason"%s*:%s*"([^"]+)"')
    decoded.summary = s:match('"summary"%s*:%s*"([^"]*)"')
    return decoded
  end

  -- Build a log function that writes `prefix .. message` and a newline to
  -- standard output.
  local function make_logger(prefix)
    return function(msg)
      io.write(prefix .. tostring(msg) .. "\n")
    end
  end

  -- Inject the globals that agent-block normally provides; anything already
  -- defined is kept.
  tool = tool or { register = function() end }

  std = std
    or {
      env = {
        get = function(_k)
          return nil
        end,
        get_or = function(_k, d)
          return d
        end,
      },
      json = {
        encode = mock_json_encode,
        decode = mock_json_decode,
      },
    }

  log = log
    or {
      info = make_logger("[INFO] "),
      warn = make_logger("[WARN] "),
      error = make_logger("[ERROR] "),
    }

  -- agent module mock: its _llm_ctx_top() reports the "anthropic" provider.
  -- Per the original author's note this keeps run_loop from falling back to
  -- "openai", which errors on multi-file (not verifiable from this file).
  package.preload["agent"] = package.preload["agent"]
    or function()
      return {
        _llm_ctx_top = function()
          return { provider = "anthropic", api_key = "mock-key" }
        end,
      }
    end

  -- Per the original author's note, test_launch's search_paths argument
  -- prepends the blocks/ directory to package.path, so no manual path
  -- manipulation is done here.
end
-- ── Load compile_loop ─────────────────────────────────────────────────────────
local compile_loop = require("compile_loop")
-- ── Helpers ──────────────────────────────────────────────────────────────────
--- Overwrite `path` with `content`, raising on any failure.
-- The file is opened in "w" mode, so existing content is truncated.
-- @param path    string: destination file path
-- @param content string: text to write
-- Raises an error prefixed with "cannot write <path>: " when the file cannot
-- be opened, written, or closed.
local function write_file(path, content)
  local f, open_err = io.open(path, "w")
  if not f then
    error("cannot write " .. path .. ": " .. tostring(open_err))
  end
  -- file:write and file:close return a falsy value plus a message on
  -- failure; ignoring them would silently leave a wrong fixture on disk.
  local written, write_err = f:write(content)
  -- Close before reporting a write failure so the handle is never leaked.
  local closed, close_err = f:close()
  if not written then
    error("cannot write " .. path .. ": " .. tostring(write_err))
  end
  if not closed then
    error("cannot write " .. path .. ": " .. tostring(close_err))
  end
end
--- Sum the character size of a messages[] array.
-- A message whose `content` is a string contributes its length. A message
-- whose `content` is a table is treated as a list of blocks; each table block
-- contributes the lengths of its string `content` and `text` fields plus the
-- length of its table `input` field as encoded by the global
-- `std.json.encode` (skipped when the encoder raises). Everything else
-- counts as zero.
-- @param messages sequence of message tables
-- @return number: total size in characters
local function measure_messages(messages)
  -- Length of `v` when it is a string, otherwise 0.
  local function string_size(v)
    if type(v) == "string" then
      return #v
    end
    return 0
  end

  -- Size contributed by one entry of a table-valued `content`.
  local function block_size(blk)
    if type(blk) ~= "table" then
      return 0
    end
    local size = string_size(blk.content) + string_size(blk.text)
    if type(blk.input) == "table" then
      -- tool_use input is only approximated, via its JSON encoding.
      local encoded_ok, encoded = pcall(std.json.encode, blk.input)
      if encoded_ok then
        size = size + #encoded
      end
    end
    return size
  end

  local total = 0
  for _, message in ipairs(messages) do
    local body = message.content
    if type(body) == "table" then
      for _, blk in ipairs(body) do
        total = total + block_size(blk)
      end
    else
      total = total + string_size(body)
    end
  end
  return total
end
-- ── Test suite ───────────────────────────────────────────────────────────────
-- Resolve the BDD entry points used by the suite below.
-- The file header states this script also runs in a host where `lust` is not
-- loaded; indexing `lust` unconditionally would raise "attempt to index a nil
-- value" there. Prefer lust's functions when the table exists, otherwise fall
-- back to same-named globals, and fail with an explicit message if neither is
-- available.
local framework = (type(lust) == "table") and lust or nil
-- On each right-hand side the bare name still refers to the global: a local's
-- scope only begins after its own declaration statement.
local describe = (framework and framework.describe) or describe
local it = (framework and framework.it) or it
local expect = (framework and framework.expect) or expect
if describe == nil or it == nil or expect == nil then
  error("no test framework available: expected the `lust` global or global describe/it/expect")
end
describe("compile_loop multi-file lazy load e2e", function()
-- ── R4: _test_set_llm_call must be exported ───────────────────────────────
it("exports _test_set_llm_call (subtask-1 dependency)", function()
  -- Both test hooks must be present on the module as functions; the tests
  -- below use them to install and remove the mock LLM.
  for _, hook_name in ipairs({ "_test_set_llm_call", "_test_reset_llm_call" }) do
    expect(type(compile_loop[hook_name])).to.equal("function")
  end
end)
-- ── Main e2e: 10 iters with mock LLM ─────────────────────────────────────
it("(alpha/beta/gamma) lazy load path verification — 10 iters mock", function()
  -- Four target files, each holding a single short marker line so that the
  -- measured messages stay far below the 24,000-character limit.
  local TARGET_FILES = {
    "/tmp/lazy_a.lua",
    "/tmp/lazy_b.lua",
    "/tmp/lazy_c.lua",
    "/tmp/lazy_d.lua",
  }

  -- current_lines[i] is the text this test expects file i to contain: the
  -- initial marker, then whatever REPLACE line was last emitted for it.
  -- file_versions[i] is the version number the next REPLACE line will carry.
  local current_lines = {}
  local file_versions = {}
  for i, path in ipairs(TARGET_FILES) do
    current_lines[i] = "-- lazy_file_" .. i .. "_v0\n"
    file_versions[i] = 1
    write_file(path, current_lines[i])
  end

  -- Mock bookkeeping: number of LLM calls, per-call messages size, and the
  -- largest size seen.
  local mock_state = {
    call_count = 0,
    messages_size_log = {},
    max_seen_size = 0,
  }

  -- Build a SEARCH/REPLACE block for file `file_idx`.
  -- The SEARCH text is the line most recently written for that file. The
  -- previous implementation rebuilt it as "_v<N-1>_c<N-1>", which does not
  -- match the earlier REPLACE line "_v<N-1>_c<call_tag>" (call_tag is the
  -- LLM call number), so every revisit of a file produced a SEARCH mismatch.
  -- NOTE(review): this assumes compile_loop applies each returned block to
  -- the file; that code is not visible from here.
  -- Returns the SR text; bumps the version and records the new line.
  local function make_sr_block(file_idx, call_tag)
    local path = TARGET_FILES[file_idx]
    local cur_ver = file_versions[file_idx]
    local search_line = current_lines[file_idx]
    local replace_line = "-- lazy_file_" .. file_idx .. "_v" .. cur_ver .. "_c" .. call_tag .. "\n"
    file_versions[file_idx] = cur_ver + 1
    current_lines[file_idx] = replace_line
    return string.format(
      "<<< path=%s >>>\n<<<<<<< SEARCH\n%s=======\n%s>>>>>>> REPLACE\n",
      path,
      search_line,
      replace_line
    )
  end

  -- Mock LLM.
  -- Odd calls (1, 3, 5, …) return one read_file tool_use block for a target
  -- file; even calls (2, 4, 6, …) return SR text for that same file. The
  -- file index cycles through TARGET_FILES. Every call also records and
  -- asserts the size of the incoming messages (gamma).
  local function mock_llm(_opts, messages)
    mock_state.call_count = mock_state.call_count + 1
    local cn = mock_state.call_count

    local sz = measure_messages(messages)
    mock_state.messages_size_log[cn] = sz
    if sz > mock_state.max_seen_size then
      mock_state.max_seen_size = sz
    end
    assert(sz <= 24000, string.format("(gamma) messages size %d > 24000 at llm_call #%d", sz, cn))

    if (cn % 2) == 1 then
      -- Calls 1, 3, 5, 7, 9 map to files 1, 2, 3, 4, 1, and so on.
      local file_idx = 1 + ((cn // 2) % #TARGET_FILES)
      return {
        choices = {
          {
            message = {
              content = "",
              tool_use_blocks = {
                {
                  id = "tid_" .. cn,
                  name = "read_file",
                  input = { path = TARGET_FILES[file_idx] },
                },
              },
              stop_reason = "tool_use",
            },
          },
        },
      }
    end

    -- Even call: same file as the odd call that preceded it.
    local file_idx = 1 + (((cn // 2) - 1) % #TARGET_FILES)
    local sr_text = make_sr_block(file_idx, cn)
    return {
      choices = { {
        message = { content = sr_text },
      } },
    }
  end

  -- Runner: reports failure for its first 9 calls and success from the 10th
  -- call on. Each failure carries a distinct stderr; per the original
  -- author's note this avoids triggering stagnation detection early.
  local runner_call_count = 0
  local function lazy_runner(_file_paths)
    runner_call_count = runner_call_count + 1
    if runner_call_count >= 10 then
      return { ok = true, stdout = "ALL_PASS", stderr = "", exit_code = 0 }
    end
    return {
      ok = false,
      stdout = "",
      stderr = "FAIL_iter_" .. runner_call_count .. "_unique",
      exit_code = 1,
    }
  end

  -- Install the mock and build the tool.
  compile_loop._test_set_llm_call(mock_llm)
  local td = compile_loop.make({
    runner = lazy_runner,
    llm = { provider = "anthropic", api_key = "mock-key" },
    target_files = TARGET_FILES,
    edit_mode = "diff",
    max_iters = 10,
  })

  -- Invoke the handler directly, under pcall so the mock can be removed even
  -- when the handler raises.
  local handle_ok, result_json = pcall(td.handler, {
    spec = "Add version comments to each lazy file.",
    target_files = TARGET_FILES,
  })

  -- Remove the mock before any assertion can abort the test.
  compile_loop._test_reset_llm_call()

  -- The handler must not raise.
  if not handle_ok then
    error(
      "td.handler raised: "
        .. tostring(result_json)
        .. "\n mock.call_count="
        .. mock_state.call_count
        .. " runner_call_count="
        .. runner_call_count
    )
  end

  -- Decode the result JSON.
  local dec_ok, tool_output = pcall(std.json.decode, result_json)
  if not dec_ok or type(tool_output) ~= "table" then
    error("result JSON decode failed: " .. tostring(result_json))
  end

  -- Diagnostic line, written before the expectations so it is visible when
  -- one of them fails.
  local size_str = {}
  for i, sz in ipairs(mock_state.messages_size_log) do
    size_str[i] = "c" .. i .. "=" .. sz
  end
  local diag = string.format(
    "ok=%s iters=%s failure_reason=%s summary=%s | mock.calls=%d runner.calls=%d max_size=%d | sizes: %s",
    tostring(tool_output.ok),
    tostring(tool_output.iters),
    tostring(tool_output.failure_reason),
    tostring(tool_output.summary),
    mock_state.call_count,
    runner_call_count,
    mock_state.max_seen_size,
    table.concat(size_str, " ")
  )
  io.write("[DIAG] " .. diag .. "\n")

  -- (alpha) At least 3 read_file dispatches. Every odd mock call returned
  -- exactly one tool_use block, and the number of odd calls is
  -- ceil(call_count / 2).
  local dispatches = math.ceil(mock_state.call_count / 2)
  expect(dispatches >= 3).to.be.truthy("(alpha) need >= 3 read_file dispatches, got " .. dispatches)

  -- (beta) Either success, or a give-up through stagnation or max_iters.
  local beta_ok = (tool_output.ok == true)
    or (tool_output.failure_reason == "stagnation")
    or (tool_output.failure_reason == "max_iters")
  expect(beta_ok).to.be.truthy("(beta) unexpected failure_reason=" .. tostring(tool_output.failure_reason))

  -- (gamma) Every messages[] size stayed within 24,000 characters.
  expect(mock_state.max_seen_size <= 24000).to.be.truthy(
    "(gamma) max messages size " .. mock_state.max_seen_size .. " > 24000"
  )
end)
-- ── Regression: forced-fail stagnation path (mirrors multi_stagnation e2e) ──
-- Verifies that subtask 1+2 changes did not break the stagnation detection path.
-- Uses a mock LLM that always returns the same SR block (identical sr_hash),
-- triggering is_stagnant_v2 after STAGNATION_WINDOW=3 iterations.
it("stagnation regression — forced-fail runner triggers stagnation give-up", function()
  -- Two fixture files; only the first is ever targeted by the SR block.
  local path_a = "/tmp/stag_reg_a.lua"
  local path_b = "/tmp/stag_reg_b.lua"
  write_file(path_a, "-- stag_a_v0\n")
  write_file(path_b, "-- stag_b_v0\n")

  -- The mock returns this identical SR block on every call.
  local fixed_sr = "<<< path="
    .. path_a
    .. " >>>\n<<<<<<< SEARCH\n-- stag_a_v0\n=======\n-- stag_a_patched\n>>>>>>> REPLACE\n"

  -- Invocation counters for the mock LLM and the runner.
  local calls = { llm = 0, runner = 0 }

  -- Mock LLM: counts the call and answers with the constant SR block.
  local function repeating_llm(_opts, _messages)
    calls.llm = calls.llm + 1
    return { choices = { { message = { content = fixed_sr } } } }
  end

  -- Runner: counts the call and always reports failure.
  local function failing_runner(_file_paths)
    calls.runner = calls.runner + 1
    return { ok = false, stdout = "", stderr = "FORCED_FAIL", exit_code = 1 }
  end

  compile_loop._test_set_llm_call(repeating_llm)
  local tool_def = compile_loop.make({
    runner = failing_runner,
    llm = { provider = "anthropic", api_key = "mock-key" },
    target_files = { path_a, path_b },
    edit_mode = "diff",
    max_iters = 10,
  })
  local call_ok, raw_result = pcall(tool_def.handler, {
    spec = "Patch stag_a.",
    target_files = { path_a, path_b },
  })
  -- Remove the mock before any check below can abort the test.
  compile_loop._test_reset_llm_call()

  if not call_ok then
    error("stagnation handler raised: " .. tostring(raw_result))
  end

  local decoded_ok, outcome = pcall(std.json.decode, raw_result)
  if not (decoded_ok and type(outcome) == "table") then
    error("stagnation result decode failed: " .. tostring(raw_result))
  end

  io.write(
    string.format(
      "[DIAG stagnation] ok=%s failure_reason=%s iters=%s runner_n=%d\n",
      tostring(outcome.ok),
      tostring(outcome.failure_reason),
      tostring(outcome.iters),
      calls.runner
    )
  )

  -- The run must not report success.
  expect(outcome.ok).to_not.equal(true)

  -- Stagnation is the expected give-up reason; max_iters is accepted too.
  local accepted_reasons = { stagnation = true, max_iters = true }
  local reason_ok = accepted_reasons[outcome.failure_reason] == true
  expect(reason_ok).to.be.truthy("expected stagnation|max_iters, got " .. tostring(outcome.failure_reason))

  -- The runner call count is deliberately not asserted: per the original
  -- author's note the runner may never be invoked when applying the block
  -- fails on every iteration. Only require that the loop iterated.
  expect((outcome.iters or 0) > 0).to.be.truthy("expected iters > 0, got " .. tostring(outcome.iters))
end)
end)