- Wire Interpreter into v2 pipeline (after Thinker tool_output, before Output) - Rename tool_exec -> tool_call everywhere (consistent convention across v1/v2) - Switch Director v1+v2 to anthropic/claude-haiku-4.5 (was opus, reserved) - Fix UI apply_machine_ops crash when states are strings instead of dicts - Fix runtime_test.py async poll to match on message ID (prevent stale results) - Add traceback to pipeline error logging Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
101 lines
3.2 KiB
JSON
101 lines
3.2 KiB
JSON
{
|
|
"timestamp": "2026-03-29 06:04:47",
|
|
"testcases": {
|
|
"S3* Audit Corrections": [
|
|
{
|
|
"step": "Setup",
|
|
"check": "clear",
|
|
"status": "PASS",
|
|
"detail": "cleared"
|
|
},
|
|
{
|
|
"step": "Tool calls produce results (baseline)",
|
|
"check": "send: create two buttons: Alpha and Beta",
|
|
"status": "PASS",
|
|
"detail": "response: 👍 Okay, I've created buttons labeled \"Alpha\" and \"Beta\".\n"
|
|
},
|
|
{
|
|
"step": "Tool calls produce results (baseline)",
|
|
"check": "actions: length >= 1",
|
|
"status": "PASS",
|
|
"detail": "2 actions >= 1"
|
|
},
|
|
{
|
|
"step": "Tool calls produce results (baseline)",
|
|
"check": "actions: any action contains \"alpha\" or \"Alpha\"",
|
|
"status": "PASS",
|
|
"detail": "found 'alpha' in actions"
|
|
},
|
|
{
|
|
"step": "Dashboard mismatch triggers re-emit",
|
|
"check": "send: I see nothing on my dashboard, fix it",
|
|
"status": "PASS",
|
|
"detail": "response: 👍 Done — Alpha and Beta buttons are now live on your dashboard. They should appe"
|
|
},
|
|
{
|
|
"step": "Dashboard mismatch triggers re-emit",
|
|
"check": "response: not contains \"sorry\" or \"apologize\"",
|
|
"status": "PASS",
|
|
"detail": "none of ['sorry', 'apologize'] found (as expected)"
|
|
},
|
|
{
|
|
"step": "Dashboard mismatch triggers re-emit",
|
|
"check": "actions: length >= 1",
|
|
"status": "PASS",
|
|
"detail": "2 actions >= 1"
|
|
},
|
|
{
|
|
"step": "DB error triggers retry with corrected SQL",
|
|
"check": "send: SELECT * FROM NichtExistent LIMIT 5",
|
|
"status": "PASS",
|
|
"detail": "response: Ah, it seems like the table `NichtExistent` does not exist. Double-check the tab"
|
|
},
|
|
{
|
|
"step": "DB error triggers retry with corrected SQL",
|
|
"check": "trace: has tool_call",
|
|
"status": "PASS",
|
|
"detail": "found event 'tool_call'"
|
|
},
|
|
{
|
|
"step": "DB error triggers retry with corrected SQL",
|
|
"check": "response: not contains \"1146\"",
|
|
"status": "PASS",
|
|
"detail": "none of ['1146'] found (as expected)"
|
|
},
|
|
{
|
|
"step": "DB error triggers retry with corrected SQL",
|
|
"check": "response: length > 10",
|
|
"status": "PASS",
|
|
"detail": "length 163 > 10"
|
|
},
|
|
{
|
|
"step": "Complex request gets Director plan",
|
|
"check": "send: investigate which customers have the mos",
|
|
"status": "PASS",
|
|
"detail": "response: Okay, I'll look into which customers have the most devices. This might take a mo"
|
|
},
|
|
{
|
|
"step": "Complex request gets Director plan",
|
|
"check": "trace: has director_plan",
|
|
"status": "FAIL",
|
|
"detail": "no 'director_plan' event in trace"
|
|
},
|
|
{
|
|
"step": "Complex request gets Director plan",
|
|
"check": "trace: has tool_call",
|
|
"status": "PASS",
|
|
"detail": "found event 'tool_call'"
|
|
},
|
|
{
|
|
"step": "Complex request gets Director plan",
|
|
"check": "response: length > 20",
|
|
"status": "PASS",
|
|
"detail": "length 86 > 20"
|
|
}
|
|
]
|
|
},
|
|
"summary": {
|
|
"passed": 14,
|
|
"failed": 1
|
|
}
|
|
} |