synaps 0.3.4

Terminal-native AI agent runtime — parallel orchestration, reactive subagents, MCP, autonomous supervision
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
"""
21 tool-heavy benchmark questions — v2 (realistic load).
"""

import json
import os


def _read(p, name):
    """Return the text of ``name`` under directory ``p``, or None if unreadable.

    Args:
        p: Base directory the relative path is resolved against.
        name: Path relative to ``p``; may include subdirectories.

    Returns:
        The file contents as a string, or None when the path cannot be
        opened as a file (missing, is a directory, a parent component is
        not a directory, permission denied, ...).
    """
    try:
        # Explicit UTF-8 keeps results independent of the host locale, and
        # errors="replace" stops a stray undecodable byte from aborting a check.
        with open(os.path.join(p, name), encoding="utf-8", errors="replace") as f:
            return f.read()
    except OSError:
        # FileNotFoundError, IsADirectoryError, NotADirectoryError and
        # PermissionError are all OSError subclasses; for verification
        # purposes each of them just means "no usable file here".
        return None


def _exists(p, name):
    """Report whether the path ``name``, resolved against ``p``, exists."""
    target = os.path.join(p, name)
    return os.path.exists(target)


def _lines(p, name):
    """Count the lines of file ``name`` under ``p``.

    Leading and trailing whitespace (including blank lines) is stripped
    before counting. Returns 0 when ``_read`` yields None or an empty string.
    """
    text = _read(p, name)
    if not text:
        return 0
    return len(text.strip().splitlines())


QUESTIONS = [
    # ── Phase 1: Scaffold a FastAPI project (heavy writes) ──────────

    {
        "id": 1,
        "prompt": (
            "Create a FastAPI project with the following structure. "
            "First create app/__init__.py (empty). "
            "Then create app/models.py containing Pydantic models: "
            "UserCreate(name: str, email: str, age: int), "
            "UserResponse(id: int, name: str, email: str, age: int, created_at: str), "
            "ItemCreate(title: str, description: str, price: float, owner_id: int), "
            "ItemResponse(id: int, title: str, description: str, price: float, owner_id: int, created_at: str), "
            "ErrorResponse(detail: str, code: int), and "
            "PaginatedResponse(items: list, total: int, page: int, per_page: int). "
            "Include proper type hints and Field validators: name must be 2-50 chars, "
            "email must contain @, age must be 18-120, price must be > 0. "
            "After creating the file, read it back and count the total lines."
        ),
        "verify": lambda p: (
            (s := _read(p, "app/models.py")) is not None
            and all(c in s for c in ("UserCreate", "UserResponse", "ItemCreate",
                                      "ItemResponse", "ErrorResponse", "PaginatedResponse"))
            and "Field(" in s
            and _lines(p, "app/models.py") >= 35
        ),
        "expects": "app/models.py with 6 Pydantic models + validators (35+ lines)",
    },

    {
        "id": 2,
        "prompt": (
            "Create app/database.py that implements an in-memory database using "
            "Python dicts. It should have: a Database class with __init__ that creates "
            "empty dicts for users and items plus auto-increment counters; "
            "create_user(data) that stores and returns a user dict with id and "
            "created_at (ISO format); get_user(user_id) returning user or None; "
            "list_users(page, per_page) with pagination returning (users_list, total); "
            "create_item(data) similar to create_user; get_item(item_id); "
            "list_items(owner_id=None, min_price=None, max_price=None, page=1, per_page=10) "
            "with filtering and pagination; update_item(item_id, data) that merges fields; "
            "delete_item(item_id) returning bool. "
            "Add a module-level instance: db = Database(). "
            "After creating, read the file back and verify it has all 8 methods by "
            "running: python3 -c \"from app.database import db; print([m for m in dir(db) if not m.startswith('_')])\""
        ),
        "verify": lambda p: (
            (s := _read(p, "app/database.py")) is not None
            and all(m in s for m in ("create_user", "get_user", "list_users",
                                      "create_item", "get_item", "list_items",
                                      "update_item", "delete_item"))
            and "class Database" in s
            and _lines(p, "app/database.py") >= 50
        ),
        "expects": "app/database.py: Database class with 8 methods, 50+ lines",
    },

    {
        "id": 3,
        "prompt": (
            "Create app/routes.py with FastAPI router endpoints. Import the models "
            "and database. Create a router = APIRouter(). Implement these endpoints: "
            "POST /users -> create user, GET /users/{id} -> get user (404 if missing), "
            "GET /users -> list users with page/per_page query params (defaults 1, 10), "
            "POST /items -> create item, GET /items/{id} -> get item (404 if missing), "
            "GET /items -> list items with optional owner_id, min_price, max_price filters "
            "plus pagination, PUT /items/{id} -> update item (404 if missing), "
            "DELETE /items/{id} -> delete item (404 if missing). "
            "Use proper HTTP status codes (201 for creates, 204 for delete, 404 for not found). "
            "Use the ErrorResponse model for error responses. "
            "After creating, read it back, then grep for 'def ' to count the endpoint functions."
        ),
        "verify": lambda p: (
            (s := _read(p, "app/routes.py")) is not None
            and all(ep in s for ep in ("POST", "GET", "PUT", "DELETE",
                                        "/users", "/items"))
            and s.count("def ") >= 8
            and _lines(p, "app/routes.py") >= 60
        ),
        "expects": "app/routes.py: 8 endpoints, proper status codes, 60+ lines",
    },

    {
        "id": 4,
        "prompt": (
            "Create app/main.py that imports FastAPI, the router from routes, and "
            "sets up the app with title='Benchmark API', version='1.0.0', "
            "description='A benchmark FastAPI project'. Include the router, add "
            "a root endpoint GET / returning {status: 'ok', version: '1.0.0'}, "
            "add a GET /health endpoint returning {healthy: true, checks: {database: 'ok', "
            "uptime: <float seconds since start>}} (track start time with a module-level "
            "datetime.now()). Add CORS middleware allowing all origins. "
            "Then create a requirements.txt with fastapi, uvicorn, pydantic, pytest, httpx. "
            "After creating both files, run: python3 -c \"from app.main import app; "
            "print(app.title, app.version)\" to verify it imports cleanly."
        ),
        "verify": lambda p: (
            (s := _read(p, "app/main.py")) is not None
            and "FastAPI" in s and "1.0.0" in s and "CORS" in s.upper()
            and (r := _read(p, "requirements.txt")) is not None
            and "fastapi" in r and "pytest" in r
        ),
        "expects": "app/main.py (CORS, health, root) + requirements.txt",
    },

    {
        "id": 5,
        "prompt": (
            "Create a comprehensive seed script at scripts/seed.py that: "
            "1) imports the database, 2) creates 10 users with realistic names/emails "
            "(alice@example.com through jane@example.com, ages 22-55), "
            "3) creates 20 items across those users with varied titles, descriptions "
            "(at least 30 chars each), and prices ranging from $9.99 to $299.99, "
            "4) prints a summary: 'Seeded X users and Y items', "
            "5) prints the most expensive item's title and price, "
            "6) prints the user with the most items. "
            "After creating it, run it with bash and report the full output. "
            "Then list all files in the project recursively with 'find . -type f | sort'."
        ),
        "verify": lambda p: (
            (s := _read(p, "scripts/seed.py")) is not None
            and "create_user" in s and "create_item" in s
            and _lines(p, "scripts/seed.py") >= 40
        ),
        "expects": "scripts/seed.py: 10 users, 20 items, runs and reports stats",
    },

    # ── Phase 2: Deep inspection chains (read → compute → read → answer) ──

    {
        "id": 6,
        "prompt": (
            "I need a full code review of the database module. Read app/database.py, "
            "then read app/models.py for context on the data shapes. "
            "Then run: python3 -c \"from app.database import db; "
            "db.create_user({'name':'test','email':'t@t.com','age':25}); "
            "db.create_user({'name':'test2','email':'t2@t.com','age':30}); "
            "print('users:', db.list_users(1,10)); "
            "db.create_item({'title':'Widget','description':'A test widget for benchmarking','price':19.99,'owner_id':1}); "
            "print('items:', db.list_items())\" "
            "Report the exact output, then list all issues you see in database.py "
            "(missing validation, edge cases, etc). Write your findings to REVIEW.md."
        ),
        "verify": lambda p: (
            (s := _read(p, "REVIEW.md")) is not None
            and len(s) >= 200
        ),
        "expects": "REVIEW.md with code review findings (200+ chars)",
    },

    {
        "id": 7,
        "prompt": (
            "Run the seed script (python3 scripts/seed.py), then write a query "
            "script at scripts/query.py that imports the database, runs the seed, "
            "and then: 1) finds all items priced over $100 and prints count + titles, "
            "2) groups items by owner_id and prints owner_id: count for each, "
            "3) computes the average price across all items (1 decimal), "
            "4) finds the owner with highest total item value and prints their user info. "
            "Run scripts/query.py and report the full output."
        ),
        "verify": lambda p: (
            (s := _read(p, "scripts/query.py")) is not None
            and "list_items" in s
        ),
        "expects": "scripts/query.py with 4 queries, executed with output",
    },

    {
        "id": 8,
        "prompt": (
            "Read every Python file in the app/ directory. Count: "
            "1) total lines across all files, 2) total number of function definitions "
            "(lines containing 'def '), 3) total number of class definitions, "
            "4) every import statement. Run this bash one-liner to cross-check: "
            "find app -name '*.py' -exec cat {} + | wc -l && "
            "grep -r 'def ' app/ | wc -l && grep -r 'class ' app/ | wc -l. "
            "Then create a file PROJECT_STATS.md with a markdown table of these metrics."
        ),
        "verify": lambda p: (
            (s := _read(p, "PROJECT_STATS.md")) is not None
            and "|" in s
            and "def" in s.lower()
        ),
        "expects": "PROJECT_STATS.md with metrics table from reading all app/ files",
    },

    {
        "id": 9,
        "prompt": (
            "Read app/routes.py, then read app/database.py, then read app/models.py. "
            "Trace the full request flow for POST /items: which route handler, which "
            "model validates input, which database method stores it. "
            "Then do the same for GET /items with filters. "
            "Create a file ARCHITECTURE.md documenting both flows as step-by-step "
            "numbered lists. Include the exact function signatures involved at each step. "
            "After writing it, read it back to verify it's correct."
        ),
        "verify": lambda p: (
            (s := _read(p, "ARCHITECTURE.md")) is not None
            and "POST" in s and "GET" in s
            and "create_item" in s
            and len(s) >= 400
        ),
        "expects": "ARCHITECTURE.md with traced request flows (400+ chars)",
    },

    {
        "id": 10,
        "prompt": (
            "Write a comprehensive bash script at scripts/analyze.sh that: "
            "1) prints '=== Project Structure ===' then runs find . -type f | sort, "
            "2) prints '=== Line Counts ===' then wc -l on every .py file, "
            "3) prints '=== TODO/FIXME ===' then greps for TODO or FIXME across all files, "
            "4) prints '=== Import Graph ===' then for each .py file in app/, "
            "   extracts import lines and prints 'file: imports module', "
            "5) prints '=== Validation Rules ===' then greps for Field( in models.py "
            "   and prints each match. "
            "Make it executable, run it with bash, capture the full output. "
            "The output will be long — that's intentional."
        ),
        "verify": lambda p: (
            (s := _read(p, "scripts/analyze.sh")) is not None
            and "find" in s and "wc" in s and "grep" in s
        ),
        "expects": "scripts/analyze.sh runs 5 analysis sections with output",
    },

    # ── Phase 3: Multi-file refactors (read → plan → edit → verify chains) ──

    {
        "id": 11,
        "prompt": (
            "Add comprehensive input validation to app/database.py. "
            "First read app/models.py to understand the validation rules, "
            "then read app/database.py. Add validation in create_user: "
            "raise ValueError if name is empty or >50 chars, if email doesn't "
            "contain @, if age is not 18-120. Add validation in create_item: "
            "raise ValueError if title is empty, if price <= 0, if owner_id "
            "doesn't exist in the users dict. Use surgical edits, not rewrites. "
            "After editing, run: python3 -c \"from app.database import db; "
            "try: db.create_user({'name':'','email':'bad','age':10}); "
            "except ValueError as e: print('caught:', e)\" "
            "to verify the validation works."
        ),
        "verify": lambda p: (
            (s := _read(p, "app/database.py")) is not None
            and "ValueError" in s
            and s.count("raise ValueError") >= 3
        ),
        "expects": "database.py: 3+ ValueError raises in create methods",
    },

    {
        "id": 12,
        "prompt": (
            "The routes need proper error handling. Read app/routes.py, then "
            "read app/database.py to see what exceptions it can raise. "
            "Edit app/routes.py to wrap every create/update endpoint in try/except "
            "that catches ValueError and returns a 422 with ErrorResponse. "
            "Also add input length logging: before each create, print the request "
            "body size to stderr. After editing, read the file back and count "
            "the number of try/except blocks to confirm. Then run: "
            "python3 -c \"from app.routes import router; print(len(router.routes), 'routes registered')\""
        ),
        "verify": lambda p: (
            (s := _read(p, "app/routes.py")) is not None
            and s.count("except") >= 3
            and "422" in s
        ),
        "expects": "routes.py: 3+ try/except blocks with 422 responses",
    },

    {
        "id": 13,
        "prompt": (
            "Extract the validation logic from database.py into a new module "
            "app/validators.py. Read database.py first to identify all validation "
            "code. Create validators.py with functions: validate_user(data) and "
            "validate_item(data, existing_user_ids) that raise ValueError with "
            "descriptive messages. Then edit database.py to import and use these "
            "validators instead of inline checks. After refactoring, run the "
            "validation test again: python3 -c \"from app.database import db; "
            "try: db.create_user({'name':'','email':'bad','age':10}); "
            "except ValueError as e: print('still works:', e)\" "
            "Then read both files back to verify the refactor is clean."
        ),
        "verify": lambda p: (
            (v := _read(p, "app/validators.py")) is not None
            and "validate_user" in v and "validate_item" in v
            and (d := _read(p, "app/database.py")) is not None
            and "from app.validators import" in d.replace("from .validators import", "from app.validators import")
        ),
        "expects": "validators.py extracted, database.py imports it",
    },

    {
        "id": 14,
        "prompt": (
            "Add a logging system. Create app/logger.py with a configure_logging() "
            "function that sets up Python's logging module with format "
            "'%(asctime)s [%(levelname)s] %(name)s: %(message)s'. "
            "Then edit app/database.py to add logging: log every create/update/delete "
            "operation at INFO level with the entity type and id, and log validation "
            "failures at WARNING level. Read database.py to plan the edits first, "
            "then make the edits, then run the seed script and capture the log output. "
            "After that, read database.py back to verify all log calls are present."
        ),
        "verify": lambda p: (
            (l := _read(p, "app/logger.py")) is not None
            and "configure_logging" in l
            and (d := _read(p, "app/database.py")) is not None
            and d.count("log") >= 5
        ),
        "expects": "logger.py created, database.py has 5+ log calls",
    },

    {
        "id": 15,
        "prompt": (
            "Add search functionality. Read app/database.py and app/routes.py first. "
            "Then edit database.py to add a search_items(query, fields=None) method "
            "that does case-insensitive substring matching across title and description "
            "(or specified fields). Also add search_users(query) matching name and email. "
            "Then edit routes.py to add GET /search?q=<query>&type=users|items|all "
            "that returns combined results with the type labeled. "
            "After all edits, run: python3 -c \"from app.database import db; "
            "db.create_user({'name':'Alice Test','email':'alice@test.com','age':25}); "
            "db.create_item({'title':'Test Widget','description':'A widget for testing purposes','price':9.99,'owner_id':1}); "
            "print(db.search_items('widget')); print(db.search_users('alice'))\" "
            "to verify it works."
        ),
        "verify": lambda p: (
            (d := _read(p, "app/database.py")) is not None
            and "search_items" in d and "search_users" in d
            and (r := _read(p, "app/routes.py")) is not None
            and "/search" in r
        ),
        "expects": "search methods in database.py, /search endpoint in routes.py",
    },

    # ── Phase 4: Complex cross-cutting tasks ────────────────────────

    {
        "id": 16,
        "prompt": (
            "Create a full test suite at tests/test_database.py. "
            "Read app/database.py and app/validators.py first to understand "
            "what to test. Write tests covering: "
            "- create_user happy path + all validation errors (empty name, bad email, bad age) "
            "- get_user found + not found "
            "- list_users pagination (create 15 users, verify page 1 has 10, page 2 has 5) "
            "- create_item happy path + validation errors (bad price, missing owner) "
            "- list_items filtering (by owner_id, by price range) "
            "- update_item happy path + not found "
            "- delete_item happy path + not found "
            "- search_items and search_users "
            "Each test must have a descriptive name. Use pytest fixtures for a fresh "
            "Database instance. After creating, run: python3 -m pytest tests/test_database.py -v "
            "and report the full output."
        ),
        "verify": lambda p: (
            (s := _read(p, "tests/test_database.py")) is not None
            and s.count("def test_") >= 12
            and "fixture" in s.lower() or "@pytest" in s
        ),
        "expects": "tests/test_database.py: 12+ tests, run with pytest output",
    },

    {
        "id": 17,
        "prompt": (
            "Create tests/test_routes.py that tests the API endpoints using "
            "httpx and FastAPI's TestClient. Read app/routes.py and app/main.py first. "
            "Write tests for: POST /users (valid + invalid), GET /users/{id} (found + 404), "
            "POST /items (valid + invalid + nonexistent owner), GET /items with filters, "
            "PUT /items/{id} (valid + 404), DELETE /items/{id} (valid + 404), "
            "GET /search with various queries, GET / and GET /health. "
            "Use a fixture that creates the TestClient. After creating, "
            "run pytest tests/test_routes.py -v and report full output. "
            "If any tests fail, read the error, fix the test or the code, and re-run."
        ),
        "verify": lambda p: (
            (s := _read(p, "tests/test_routes.py")) is not None
            and s.count("def test_") >= 10
            and "TestClient" in s
        ),
        "expects": "tests/test_routes.py: 10+ integration tests with TestClient",
    },

    {
        "id": 18,
        "prompt": (
            "Create a data migration script at scripts/migrate.py that: "
            "1) reads the current database state (run seed first), "
            "2) adds a 'tags' field (list of strings) to every item — derive tags "
            "   from the title/description using keyword extraction (split words, "
            "   lowercase, filter out words < 4 chars), "
            "3) adds an 'is_premium' boolean to every user (True if they own any item > $100), "
            "4) creates a migration log file at data/migration_log.json recording: "
            "   timestamp, items_updated count, users_updated count, sample of first "
            "   3 items with their new tags, "
            "5) prints a summary of all changes. "
            "Run the script and verify the migration log was created by reading it back."
        ),
        "verify": lambda p: (
            (s := _read(p, "scripts/migrate.py")) is not None
            and "tags" in s and "is_premium" in s
        ),
        "expects": "scripts/migrate.py: adds tags+is_premium, creates migration log",
    },

    {
        "id": 19,
        "prompt": (
            "Perform a full security audit. Read every file in app/ one by one. "
            "Then create SECURITY_AUDIT.md documenting: "
            "1) Input validation gaps (any endpoint that doesn't validate, any "
            "   field that could be exploited — SQL injection doesn't apply since "
            "   we use dicts, but look for XSS in stored fields, missing length limits), "
            "2) Authentication gaps (there's no auth at all — document what's needed), "
            "3) Rate limiting gaps (none exists — recommend approach), "
            "4) Data exposure risks (are any internal fields leaking?), "
            "5) Specific code fixes with file:line references. "
            "The audit should be at least 50 lines long. After writing it, "
            "read it back and verify it references all 4 app/ source files."
        ),
        "verify": lambda p: (
            (s := _read(p, "SECURITY_AUDIT.md")) is not None
            and _lines(p, "SECURITY_AUDIT.md") >= 40
            and all(f in s for f in ("models.py", "database.py", "routes.py"))
        ),
        "expects": "SECURITY_AUDIT.md: 40+ lines referencing all source files",
    },

    {
        "id": 20,
        "prompt": (
            "There are bugs being introduced now. Run these bash commands: "
            "echo '    def broken_method(self):' >> app/database.py && "
            "echo '        return self.nonexistent_attr' >> app/database.py && "
            "sed -i 's/def get_user/def getuser/' app/database.py && "
            "echo 'import nonexistent_module' >> app/routes.py. "
            "Now: 1) try to import the app and observe the errors, "
            "2) read the affected files to understand what broke, "
            "3) fix all three bugs (remove broken_method, restore get_user name, "
            "   remove bad import) using surgical edits, "
            "4) verify the fix by importing the app again, "
            "5) run the database tests to confirm nothing else broke."
        ),
        "verify": lambda p: (
            (d := _read(p, "app/database.py")) is not None
            and "def get_user(" in d
            and "nonexistent_attr" not in d
            and (r := _read(p, "app/routes.py")) is not None
            and "import nonexistent_module" not in r
        ),
        "expects": "3 injected bugs found and fixed, tests pass",
    },

    {
        "id": 21,
        "prompt": (
            "Final comprehensive audit. "
            "1) Read every .py file in app/ and count total lines, functions, classes. "
            "2) Run the full test suite: python3 -m pytest tests/ -v "
            "3) Run the analyze script: bash scripts/analyze.sh "
            "4) Read config if any exists. "
            "5) Create FINAL_REPORT.md with: "
            "   - Project structure (list all files with line counts), "
            "   - Test results summary (X passed, Y failed), "
            "   - Code metrics (functions, classes, total lines), "
            "   - Architecture summary (how the modules connect), "
            "   - Top 3 recommended improvements. "
            "The report should be thorough — at least 60 lines. "
            "After creating it, read it back to verify completeness."
        ),
        "verify": lambda p: (
            (s := _read(p, "FINAL_REPORT.md")) is not None
            and _lines(p, "FINAL_REPORT.md") >= 50
            and "test" in s.lower()
            and "routes" in s.lower()
        ),
        "expects": "FINAL_REPORT.md: 50+ lines, tests + metrics + recommendations",
    },
]


def question_count():
    """Return how many entries the QUESTIONS table currently holds."""
    total = len(QUESTIONS)
    return total


if __name__ == "__main__":
    # When run as a script, print the total followed by one summary line
    # per question (id right-aligned to width 2, then its "expects" text).
    print(f"{question_count()} questions defined")
    for entry in QUESTIONS:
        qid, summary = entry["id"], entry["expects"]
        print(f"  Q{qid:2d}: {summary}")