-- agent-block 0.20.0
-- Lua-first Agent Runtime built on AgentMesh
--
-- test_openai_compile_loop_multi_lazy_load.lua
--
-- Mock LLM e2e: lazy load path verification (10 iters, <=24K char per iter).
-- provider: openai
--
-- Runs in two environments:
--   1. mcp__lua-debugger__test_launch (mlua-lspec): uses describe/it/expect.
--      mock globals (log, std, tool, agent) are injected at the top.
--   2. agent-block mlua host: log/std/tool/agent are real injected globals.
--      The `lust` detection guard skips mock injection.
--
-- Does NOT call the real OpenAI API (offline / no cost).
-- Acceptance criteria:
--   (alpha) read_file tool dispatched >= 3 times
--   (beta)  max_iters=10 completes or stagnation give-up
--   (gamma) all iter messages[] sizes <= 24,000 chars

-- ── Environment detection + mock globals for test_launch ─────────────────────
-- In test_launch, `lust` is pre-loaded. In the mlua host it is not.
-- `lust` being a table is the signal that this file is running under
-- test_launch; otherwise the host is expected to supply the globals itself.
local in_test_framework = type(lust) == "table"

if in_test_framework then
    -- Stand-ins for the globals agent-block normally provides. Each one is
    -- installed only when the corresponding name is not already set.

    -- Render a single field value for the minimal encoder below.
    -- Booleans and numbers are emitted bare, strings are quoted with `"`
    -- escaped, and anything else is quoted via tostring().
    local function encode_value(v)
        local kind = type(v)
        if kind == "boolean" or kind == "number" then
            return tostring(v)
        elseif kind == "string" then
            return '"' .. v:gsub('"', '\\"') .. '"'
        elseif kind == "nil" then
            return "null"
        end
        return '"' .. tostring(v) .. '"'
    end

    -- Minimal JSON encoder sufficient for this test: a table becomes one flat
    -- object (key order follows pairs()); a non-table is returned via tostring().
    local function mock_encode(t)
        if type(t) ~= "table" then return tostring(t) end
        local fields = {}
        for key, value in pairs(t) do
            local rendered = encode_value(value)
            fields[#fields + 1] = '"' .. tostring(key) .. '":' .. rendered
        end
        return "{" .. table.concat(fields, ",") .. "}"
    end

    -- Minimal decoder: pulls the ok, iters, failure_reason and summary fields
    -- out of the result JSON by pattern matching. Non-string input yields {}.
    local function mock_decode(s)
        if type(s) ~= "string" then return {} end
        local out = {}

        local ok_text = s:match('"ok"%s*:%s*(true)') or s:match('"ok"%s*:%s*(false)')
        if ok_text == "true" then
            out.ok = true
        elseif ok_text == "false" then
            out.ok = false
        end

        local digits = s:match('"iters"%s*:%s*(%d+)')
        if digits then out.iters = tonumber(digits) end

        -- A missing match leaves the field absent (assigning nil is a no-op).
        out.failure_reason = s:match('"failure_reason"%s*:%s*"([^"]+)"')
        out.summary        = s:match('"summary"%s*:%s*"([^"]*)"')
        return out
    end

    -- Build a logger that writes `prefix .. message` plus a newline to stdout.
    local function make_logger(prefix)
        return function(msg) io.write(prefix .. tostring(msg) .. "\n") end
    end

    if not tool then
        tool = { register = function() end }
    end

    if not std then
        std = {
            env = {
                get    = function(_k) return nil end,
                get_or = function(_k, d) return d end,
            },
            json = {
                encode = mock_encode,
                decode = mock_decode,
            },
        }
    end

    if not log then
        log = {
            info  = make_logger("[INFO]  "),
            warn  = make_logger("[WARN]  "),
            error = make_logger("[ERROR] "),
        }
    end

    -- agent module mock: its _llm_ctx_top() reports an "openai" LLM context so
    -- that run_loop uses the openai provider path.
    if not package.preload["agent"] then
        package.preload["agent"] = function()
            return {
                _llm_ctx_top = function()
                    return { provider = "openai", api_key = "mock-key" }
                end,
            }
        end
    end

    -- search_paths is provided via test_launch's search_paths argument,
    -- which prepends the blocks/ directory to package.path automatically.
    -- No manual path manipulation needed here.
end

-- ── Load compile_loop ─────────────────────────────────────────────────────────
local compile_loop = require("compile_loop")

-- ── Helpers ──────────────────────────────────────────────────────────────────

--- Write `content` to `path`, replacing any existing file.
-- Raises an error prefixed with "cannot write <path>: " when the file cannot
-- be opened, when the write itself fails, or when closing fails. Buffered
-- output errors such as a full device are typically reported at close time,
-- so the close result is checked as well.
-- @param path    destination file path
-- @param content string to write
local function write_file(path, content)
    local f, open_err = io.open(path, "w")
    if not f then
        error("cannot write " .. path .. ": " .. tostring(open_err))
    end
    local wrote, write_err = f:write(content)
    -- Always close before reporting, so the handle is not left open on failure.
    local closed, close_err = f:close()
    if not wrote then
        error("cannot write " .. path .. ": " .. tostring(write_err))
    end
    if not closed then
        error("cannot write " .. path .. ": " .. tostring(close_err))
    end
end

-- Measure total character size of a messages[] array.
-- Handles both string content and table content (tool_result/tool_use blocks).
--- Sum the character size of a messages[] array.
-- A string `content` counts its length. A table `content` is walked with
-- ipairs, and each table entry contributes the lengths of its string
-- `content` and `text` fields plus the length of `std.json.encode(input)`
-- when `input` is a table. The encoded size is skipped if encoding raises.
-- @param messages array of message tables
-- @return total size in characters
local function measure_messages(messages)
    -- Size contributed by one content block; non-table entries count as 0.
    local function block_size(blk)
        if type(blk) ~= "table" then
            return 0
        end
        local n = 0
        if type(blk.content) == "string" then
            n = n + #blk.content
        end
        if type(blk.text) == "string" then
            n = n + #blk.text
        end
        if type(blk.input) == "table" then
            -- tool_use input: approximate via json encode
            local encoded_ok, encoded = pcall(std.json.encode, blk.input)
            if encoded_ok then
                n = n + #encoded
            end
        end
        return n
    end

    local sum = 0
    for _, msg in ipairs(messages) do
        local body = msg.content
        local body_kind = type(body)
        if body_kind == "string" then
            sum = sum + #body
        elseif body_kind == "table" then
            for _, blk in ipairs(body) do
                sum = sum + block_size(blk)
            end
        end
    end
    return sum
end

-- ── Test suite ───────────────────────────────────────────────────────────────

-- Resolve the test DSL. Under test_launch the functions come from the
-- pre-loaded `lust` table. The header says this file also runs in a host where
-- `lust` is not loaded, so fall back to same-named globals instead of indexing
-- a nil `lust`.
-- NOTE(review): whether the mlua host exposes describe/it/expect as globals is
-- not visible from this file — confirm against the host.
local framework = (type(lust) == "table") and lust or {}
local describe = framework.describe or rawget(_G, "describe")
local it       = framework.it       or rawget(_G, "it")
local expect   = framework.expect   or rawget(_G, "expect")
if describe == nil or it == nil or expect == nil then
    error("no test framework available: expected global `lust` table "
        .. "or global describe/it/expect")
end

describe("compile_loop multi-file lazy load e2e (openai)", function()

    -- ── R4: _test_set_llm_call must be exported ───────────────────────────────
    it("exports _test_set_llm_call (subtask-1 dependency)", function()
        -- Both test hooks must be present on the module as functions; the
        -- setter is checked first, then the resetter.
        local required_hooks = { "_test_set_llm_call", "_test_reset_llm_call" }
        for _, hook_name in ipairs(required_hooks) do
            expect(type(compile_loop[hook_name])).to.equal("function")
        end
    end)

    -- ── Main e2e: 10 iters with mock LLM ─────────────────────────────────────
    it("(alpha/beta/gamma) lazy load path verification — 10 iters mock", function()
        -- Drives the compile_loop tool handler with a scripted mock LLM and a
        -- scripted runner, then checks the three acceptance criteria:
        --   (alpha) at least 3 read_file tool_use responses were issued,
        --   (beta)  the run ended with ok=true, "stagnation" or "max_iters",
        --   (gamma) no messages[] handed to the mock LLM exceeded 24,000 chars.

        -- TARGET_FILES: 4 files, each with a single-line marker.
        -- Content is tiny (~20 chars) so messages stay well under 24K.
        -- oai_ prefix avoids /tmp/ collision with anthropic version on concurrent runs.
        local TARGET_FILES = {
            "/tmp/oai_lazy_a.lua",
            "/tmp/oai_lazy_b.lua",
            "/tmp/oai_lazy_c.lua",
            "/tmp/oai_lazy_d.lua",
        }

        -- Per-file bookkeeping for the mock:
        --   file_versions[i]  next version number to stamp into file i,
        --   current_lines[i]  exact line the mock last put into file i.
        -- Write initial content (exact-match SEARCH for Stage-1 hit in apply_blocks).
        local file_versions = {}
        local current_lines = {}
        for i, path in ipairs(TARGET_FILES) do
            file_versions[i] = 1
            current_lines[i] = "-- oai_lazy_file_" .. i .. "_v0\n"
            write_file(path, current_lines[i])
        end

        -- Mock state.
        local mock_state = {
            call_count         = 0,
            messages_size_log  = {},
            max_seen_size      = 0,
        }

        -- Build an SR block whose SEARCH text is the line last written for
        -- this file and whose REPLACE text carries the next version plus the
        -- call tag. The previous REPLACE line is remembered verbatim rather
        -- than re-derived, because it embeds the call tag of the earlier call.
        -- Returns the SR text and bumps the version counter.
        local function make_sr_block(file_idx, call_tag)
            local cur_ver      = file_versions[file_idx]
            local search_line  = current_lines[file_idx]
            local replace_line = "-- oai_lazy_file_" .. file_idx
                               .. "_v" .. cur_ver
                               .. "_c" .. call_tag .. "\n"
            file_versions[file_idx] = cur_ver + 1
            current_lines[file_idx] = replace_line

            return string.format(
                "<<< path=%s >>>\n<<<<<<< SEARCH\n%s=======\n%s>>>>>>> REPLACE\n",
                TARGET_FILES[file_idx], search_line, replace_line
            )
        end

        -- Mock LLM.
        -- Odd calls (1, 3, 5, …): return tool_use (read_file for one target file).
        -- Even calls (2, 4, 6, …): return SR text for the matching file.
        -- Pattern: each iter consumes 2 calls → up to 20 calls for 10 iters.
        -- NOTE: _test_set_llm_call replaces llm_call entirely, so mock returns
        -- the internal shape (tool_use_blocks), NOT raw OpenAI wire format (Crux #1 C5).
        local function mock_llm(_opts, messages)
            mock_state.call_count = mock_state.call_count + 1
            local cn = mock_state.call_count

            -- (gamma) measure and assert messages size.
            local sz = measure_messages(messages)
            mock_state.messages_size_log[cn] = sz
            if sz > mock_state.max_seen_size then
                mock_state.max_seen_size = sz
            end
            assert(sz <= 24000,
                string.format("(gamma) messages size %d > 24000 at llm_call #%d", sz, cn))

            if (cn % 2) == 1 then
                -- tool_use response: read_file for file indexed by cn
                -- (calls 1, 3, 5, 7, 9 → files 1, 2, 3, 4, 1, cycling).
                local file_idx = 1 + ((cn // 2) % #TARGET_FILES)
                return {
                    choices = { {
                        message = {
                            content         = "",
                            tool_use_blocks = { {
                                id    = "tid_" .. cn,
                                name  = "read_file",
                                input = { path = TARGET_FILES[file_idx] },
                            } },
                            stop_reason     = "tool_use",
                        },
                    } },
                }
            else
                -- SR text response for the same file the preceding odd call read.
                local file_idx = 1 + (((cn // 2) - 1) % #TARGET_FILES)
                local sr_text  = make_sr_block(file_idx, cn)
                return {
                    choices = { {
                        message = { content = sr_text },
                    } },
                }
            end
        end

        -- Runner: forced-fail for iters 1–9, ok=true on iter 10.
        -- Unique stderr per call ensures stagnation is not triggered early
        -- (is_stagnant_v2 needs >= 2 repeated sr_hash in the last 3 entries).
        local runner_call_count = 0
        local function lazy_runner(_file_paths)
            runner_call_count = runner_call_count + 1
            if runner_call_count >= 10 then
                return { ok = true, stdout = "ALL_PASS", stderr = "", exit_code = 0 }
            end
            return {
                ok        = false,
                stdout    = "",
                stderr    = "FAIL_iter_" .. runner_call_count .. "_unique",
                exit_code = 1,
            }
        end

        -- Install mock and build tool.
        compile_loop._test_set_llm_call(mock_llm)

        local td = compile_loop.make({
            runner       = lazy_runner,
            llm          = { provider = "openai", api_key = "mock-key" },
            target_files = TARGET_FILES,
            edit_mode    = "diff",
            max_iters    = 10,
        })

        -- Invoke the handler directly (no real agent.run, no API key needed).
        local handle_ok, result_json = pcall(td.handler, {
            spec         = "Add version comments to each oai lazy file.",
            target_files = TARGET_FILES,
        })

        -- Restore production llm_call whether or not the handler raised.
        compile_loop._test_reset_llm_call()

        -- handler must not raise.
        if not handle_ok then
            error("td.handler raised: " .. tostring(result_json)
                .. "\n  mock.call_count=" .. mock_state.call_count
                .. " runner_call_count=" .. runner_call_count)
        end

        -- Decode result JSON.
        local dec_ok, tool_output = pcall(std.json.decode, result_json)
        if not dec_ok or type(tool_output) ~= "table" then
            error("result JSON decode failed: " .. tostring(result_json))
        end

        -- Diagnostic log (visible on failure).
        local size_str = {}
        for i, sz in ipairs(mock_state.messages_size_log) do
            size_str[i] = "c" .. i .. "=" .. sz
        end
        local diag = string.format(
            "ok=%s iters=%s failure_reason=%s summary=%s | mock.calls=%d runner.calls=%d max_size=%d | sizes: %s",
            tostring(tool_output.ok),
            tostring(tool_output.iters),
            tostring(tool_output.failure_reason),
            tostring(tool_output.summary),
            mock_state.call_count,
            runner_call_count,
            mock_state.max_seen_size,
            table.concat(size_str, " ")
        )
        io.write("[DIAG] " .. diag .. "\n")

        -- (alpha) At least 3 read_file dispatches.
        -- Each odd llm_call returns 1 tool_use block → 1 dispatch in run_loop.
        -- Odd calls = ceil(call_count / 2).
        local dispatches = math.ceil(mock_state.call_count / 2)
        expect(dispatches >= 3).to.be.truthy(
            "(alpha) need >= 3 read_file dispatches, got " .. dispatches)

        -- (beta) Completed successfully or via stagnation/max_iters give-up.
        local beta_ok = (tool_output.ok == true)
            or (tool_output.failure_reason == "stagnation")
            or (tool_output.failure_reason == "max_iters")
        expect(beta_ok).to.be.truthy(
            "(beta) unexpected failure_reason=" .. tostring(tool_output.failure_reason))

        -- (gamma) All messages[] sizes within 24K chars.
        expect(mock_state.max_seen_size <= 24000).to.be.truthy(
            "(gamma) max messages size " .. mock_state.max_seen_size .. " > 24000")
    end)

    -- ── Regression: forced-fail stagnation path (mirrors multi_stagnation e2e) ──
    -- Verifies that subtask 1+2 changes did not break the stagnation detection path.
    -- Uses a mock LLM that always returns the same SR block (identical sr_hash),
    -- triggering is_stagnant_v2 after STAGNATION_WINDOW=3 iterations.
    -- oai_ prefix avoids /tmp/ collision with anthropic version on concurrent runs.
    it("stagnation regression — forced-fail runner triggers stagnation give-up", function()
        -- Two fixture files; only the first is ever named in the SR block.
        local path_a = "/tmp/oai_stag_reg_a.lua"
        local path_b = "/tmp/oai_stag_reg_b.lua"
        write_file(path_a, "-- oai_stag_a_v0\n")
        write_file(path_b, "-- oai_stag_b_v0\n")

        -- The same SR block is returned on every LLM call, so every iteration
        -- proposes identical edits (same sr_hash → triggers stagnation).
        local fixed_sr = table.concat({
            "<<< path=" .. path_a .. " >>>",
            "<<<<<<< SEARCH",
            "-- oai_stag_a_v0",
            "=======",
            "-- oai_stag_a_patched",
            ">>>>>>> REPLACE",
            "",
        }, "\n")

        -- Mock LLM: counts its calls and always answers with the fixed SR block.
        local llm_calls = 0
        local function constant_llm(_opts, _messages)
            llm_calls = llm_calls + 1
            return { choices = { { message = { content = fixed_sr } } } }
        end

        -- Runner that fails on every invocation
        -- (mirrors test_anthropic_compile_loop_multi_stagnation.lua).
        local runner_calls = 0
        local function failing_runner(_file_paths)
            runner_calls = runner_calls + 1
            return { ok = false, stdout = "", stderr = "FORCED_FAIL", exit_code = 1 }
        end

        compile_loop._test_set_llm_call(constant_llm)

        local tool_def = compile_loop.make({
            runner       = failing_runner,
            llm          = { provider = "openai", api_key = "mock-key" },
            target_files = { path_a, path_b },
            edit_mode    = "diff",
            max_iters    = 10,
        })

        local call_ok, raw_result = pcall(tool_def.handler, {
            spec         = "Patch oai_stag_a.",
            target_files = { path_a, path_b },
        })

        -- Put the production llm_call back before any assertion can raise.
        compile_loop._test_reset_llm_call()

        if not call_ok then
            error("stagnation handler raised: " .. tostring(raw_result))
        end

        local decoded_ok, outcome = pcall(std.json.decode, raw_result)
        if not (decoded_ok and type(outcome) == "table") then
            error("stagnation result decode failed: " .. tostring(raw_result))
        end

        local reason = outcome.failure_reason

        io.write(string.format("[DIAG stagnation] ok=%s failure_reason=%s iters=%s runner_n=%d\n",
            tostring(outcome.ok), tostring(reason),
            tostring(outcome.iters), runner_calls))

        -- The run must not report success.
        expect(outcome.ok).to_not.equal(true)

        -- Stagnation is the expected give-up reason; max_iters is also
        -- acceptable per subtask-3 spec.
        local reason_accepted = (reason == "stagnation") or (reason == "max_iters")
        expect(reason_accepted).to.be.truthy(
            "expected stagnation|max_iters, got " .. tostring(reason))

        -- Note: runner may not be called at all when apply_blocks fails every iter
        -- (SEARCH text mismatch after first patch → all_failed path).
        -- is_stagnant_v2 fires on repeated sr_hash regardless of runner invocation.
        -- So only require that the loop ran at least one iteration.
        local iterations = outcome.iters or 0
        expect(iterations > 0).to.be.truthy(
            "expected iters > 0, got " .. tostring(outcome.iters))
    end)

end)