diff --git a/agent/api.py b/agent/api.py index eeb100f..96be677 100644 --- a/agent/api.py +++ b/agent/api.py @@ -40,7 +40,7 @@ def _broadcast_sse(event: dict): if _pipeline_result.get("status") == "running": node = event.get("node", "") evt = event.get("event", "") - if node and evt in ("thinking", "perceived", "decided", "streaming", "tool_exec", "interpreted", "updated"): + if node and evt in ("thinking", "perceived", "decided", "streaming", "tool_call", "interpreted", "updated"): _pipeline_result["stage"] = node _pipeline_result["event"] = evt @@ -199,7 +199,8 @@ def register_routes(app): "memorizer": runtime.memorizer.state, } except Exception as e: - log.error(f"[api] pipeline error: {e}") + import traceback + log.error(f"[api] pipeline error: {e}\n{traceback.format_exc()}") _pipeline_result = { "status": "error", "id": msg_id, diff --git a/agent/nodes/director_v1.py b/agent/nodes/director_v1.py index 89eaea8..d0249da 100644 --- a/agent/nodes/director_v1.py +++ b/agent/nodes/director_v1.py @@ -12,7 +12,7 @@ log = logging.getLogger("runtime") class DirectorNode(Node): name = "director" model = "google/gemini-2.0-flash-001" - plan_model = "anthropic/claude-opus-4" # Smart model for investigation planning + plan_model = "anthropic/claude-haiku-4.5" # Smart model for investigation planning max_context_tokens = 2000 SYSTEM = """You are the Director node — the strategist of this cognitive runtime. diff --git a/agent/nodes/director_v2.py b/agent/nodes/director_v2.py index 702d468..fb85d69 100644 --- a/agent/nodes/director_v2.py +++ b/agent/nodes/director_v2.py @@ -12,7 +12,7 @@ log = logging.getLogger("runtime") class DirectorV2Node(Node): name = "director_v2" - model = "anthropic/claude-opus-4" + model = "anthropic/claude-haiku-4.5" max_context_tokens = 4000 SYSTEM = """You are the Director — the brain of this cognitive agent runtime. diff --git a/agent/nodes/thinker_v2.py b/agent/nodes/thinker_v2.py index 15f241a..6d9684c 100644 --- a/agent/nodes/thinker_v2.py +++ b/agent/nodes/thinker_v2.py @@ -79,7 +79,7 @@ Rules: for step in plan.tool_sequence: tool = step.get("tool", "") args = step.get("args", {}) - await self.hud("tool_exec", tool=tool, args=args) + await self.hud("tool_call", tool=tool, args=args) if tool == "emit_actions": actions.extend(args.get("actions", [])) diff --git a/agent/nodes/ui.py b/agent/nodes/ui.py index c95e66b..fc9f52f 100644 --- a/agent/nodes/ui.py +++ b/agent/nodes/ui.py @@ -34,6 +34,8 @@ class UINode(Node): states_list = op_data.get("states", []) states = {} for s in states_list: + if isinstance(s, str): + s = {"name": s} name = s.get("name", "") if name: states[name] = { diff --git a/agent/runtime.py b/agent/runtime.py index ac583b3..ca03adb 100644 --- a/agent/runtime.py +++ b/agent/runtime.py @@ -219,6 +219,10 @@ class Runtime: if self.is_v2: plan = await self.director.decide(command, self.history, memory_context=mem_ctx) thought = await self.thinker.process(command, plan, self.history, memory_context=mem_ctx) + if self.interpreter and thought.tool_used and thought.tool_output: + interpreted = await self.interpreter.interpret( + thought.tool_used, thought.tool_output, action_desc) + thought.response = interpreted.summary else: thought = await self.thinker.process(command, self.history, memory_context=mem_ctx) @@ -335,9 +339,15 @@ class Runtime: return if self.is_v2: - # v2 flow: Director decides, Thinker executes + # v2 flow: Director decides, Thinker executes, Interpreter reads results plan = await self.director.decide(command, self.history, memory_context=mem_ctx) thought = await self.thinker.process(command, plan, self.history, memory_context=mem_ctx) + # Interpreter: factual summary of tool results (no hallucination) + if self.interpreter and thought.tool_used and thought.tool_output: + interpreted = await self.interpreter.interpret( + thought.tool_used, thought.tool_output, text) + # Replace thinker's response with interpreter's factual summary + thought.response = interpreted.summary else: # v1 flow: optional Director pre-planning for complex requests is_complex = command.analysis.complexity == "complex" diff --git a/runtime_test.py b/runtime_test.py index daaf3c7..8259227 100644 --- a/runtime_test.py +++ b/runtime_test.py @@ -153,6 +153,19 @@ class CogClient: body["dashboard"] = dashboard r = self.client.post(f"{API}/send", json=body, headers=HEADERS) d = r.json() + # Async send: poll for result, match on message ID + if d.get("status") == "queued": + msg_id = d.get("id", "") + for _ in range(120): + time.sleep(0.5) + pr = self.client.get(f"{API}/result", headers=HEADERS) + pd = pr.json() + if pd.get("id") == msg_id and pd.get("status") == "done": + d = pd + break + if pd.get("id") == msg_id and pd.get("status") == "error": + d = pd + break self.last_response = d.get("response", "") self.last_memo = d.get("memorizer", {}) time.sleep(0.5) @@ -477,13 +490,22 @@ def run_standalone(paths: list[Path] = None): if not paths: paths = sorted(Path("testcases").glob("*.md")) + # Count total steps across all testcases for frontend progress + all_tcs = [parse_testcase(p) for p in paths] + total_steps = sum(len(s["commands"]) for tc in all_tcs for s in tc["steps"]) + first_suite = True + all_results = {} - for path in paths: - tc = parse_testcase(path) + for tc in all_tcs: + path = tc["file"] print(f"\n{'='*60}") print(f" {tc['name']}") print(f"{'='*60}") - _push_status("suite_start", suite=tc["name"]) + if first_suite: + _push_status("suite_start", suite=tc["name"], count=total_steps) + first_suite = False + else: + _push_status("suite_start", suite=tc["name"]) runner = CogTestRunner() results = runner.run(tc) diff --git a/static/app.js b/static/app.js index 6060145..29dc31a 100644 --- a/static/app.js +++ b/static/app.js @@ -397,7 +397,6 @@ function graphAnimate(event, node) { if (node) pulseNode(node); break; case 'tool_call': - case 'tool_exec': pulseNode(node || 'thinker'); flashEdge('thinker', 'ui'); break; case 'tool_result': diff --git a/test_nodes/test_thinker_v2.py b/test_nodes/test_thinker_v2.py index e0d8950..9853857 100644 --- a/test_nodes/test_thinker_v2.py +++ b/test_nodes/test_thinker_v2.py @@ -183,8 +183,8 @@ async def test_emits_hud_per_tool(): with patch("agent.nodes.thinker_v2.llm_call", side_effect=mock_llm): await node.process(cmd, plan, [], memory_context="") - tool_events = hud.find("tool_exec") - assert len(tool_events) >= 2, f"expected 2+ tool_exec events, got {len(tool_events)}" + tool_events = hud.find("tool_call") + assert len(tool_events) >= 2, f"expected 2+ tool_call events, got {len(tool_events)}" async def test_create_machine_tool(): diff --git a/testcases/results.json b/testcases/results.json index 65df575..4eaf67c 100644 --- a/testcases/results.json +++ b/testcases/results.json @@ -1,534 +1,6 @@ { - "timestamp": "2026-03-29 00:37:01", + "timestamp": "2026-03-29 06:04:47", "testcases": { - "Button Persistence": [ - { - "step": "Setup", - "check": "clear", - "status": "PASS", - "detail": "cleared" - }, - { - "step": "Create buttons", - "check": "send: create two buttons: Poodle Bark and Bolo", - "status": "PASS", - "detail": "response: Okay, I've created the \"Poodle Bark\" and \"Bolonka Bark\" buttons for you! 🐶 \n" - }, - { - "step": "Create buttons", - "check": "actions: length >= 2", - "status": "PASS", - "detail": "2 actions >= 2" - }, - { - "step": "Create buttons", - "check": "actions: any action contains \"poodle\" or \"Poodle\"", - "status": "PASS", - "detail": "found 'poodle' in actions" - }, - { - "step": "Create buttons", - "check": "actions: any action contains \"bolonka\" or \"Bolonka\"", - "status": "PASS", - "detail": "found 'bolonka' in actions" - }, - { - "step": "Ask unrelated question (buttons must survive)", - "check": "send: what time is it?", - "status": "PASS", - "detail": "response: It's 00:28 AM on Sunday, March 29, 2026.\n" - }, - { - "step": "Ask unrelated question (buttons must survive)", - "check": "response: contains \":\" or \"time\" or \"clock\"", - "status": "PASS", - "detail": "found ':'" - }, - { - "step": "Ask unrelated question (buttons must survive)", - "check": "actions: any action contains \"poodle\" or \"Poodle\"", - "status": "PASS", - "detail": "found 'poodle' in actions" - }, - { - "step": "Ask unrelated question (buttons must survive)", - "check": "actions: any action contains \"bolonka\" or \"Bolonka\"", - "status": "PASS", - "detail": "found 'bolonka' in actions" - }, - { - "step": "Ask another question (buttons still there)", - "check": "send: say hello in German", - "status": "PASS", - "detail": "response: Hallo Nico! 👋\n" - }, - { - "step": "Ask another question (buttons still there)", - "check": "response: contains \"Hallo\" or \"hallo\" or \"German\"", - "status": "PASS", - "detail": "found 'Hallo'" - }, - { - "step": "Ask another question (buttons still there)", - "check": "actions: any action contains \"poodle\" or \"Poodle\"", - "status": "PASS", - "detail": "found 'poodle' in actions" - }, - { - "step": "Explicitly replace buttons", - "check": "send: remove all buttons and create one button", - "status": "PASS", - "detail": "response: Done! I've removed the previous buttons and created a single button labeled \"Res" - }, - { - "step": "Explicitly replace buttons", - "check": "actions: length >= 1", - "status": "PASS", - "detail": "1 actions >= 1" - }, - { - "step": "Explicitly replace buttons", - "check": "actions: any action contains \"reset\" or \"Reset\"", - "status": "PASS", - "detail": "found 'reset' in actions" - } - ], - "Counter State": [ - { - "step": "Setup", - "check": "clear", - "status": "PASS", - "detail": "cleared" - }, - { - "step": "Create counter", - "check": "send: create a counter starting at 0 with incr", - "status": "PASS", - "detail": "response: Okay, ich habe einen Zähler erstellt, der bei 0 beginnt, sowie Schaltflächen zum" - }, - { - "step": "Create counter", - "check": "response: contains \"counter\" or \"count\"", - "status": "FAIL", - "detail": "none of ['counter', 'count'] found in: Okay, ich habe einen Zähler erstellt, der bei 0 beginnt, sowie Schaltflächen zum Erhöhen und Verring" - }, - { - "step": "Create counter", - "check": "actions: length >= 2", - "status": "PASS", - "detail": "3 actions >= 2" - }, - { - "step": "Create counter", - "check": "actions: any action contains \"increment\" or \"inc\" or \"plus\" or \"add\"", - "status": "PASS", - "detail": "found 'increment' in actions" - }, - { - "step": "Create counter", - "check": "actions: any action contains \"decrement\" or \"dec\" or \"minus\" or \"sub\"", - "status": "PASS", - "detail": "found 'decrement' in actions" - }, - { - "step": "Check state", - "check": "state: topic contains \"counter\" or \"count\" or \"button\"", - "status": "PASS", - "detail": "topic=UI counter with increment/decrement buttons contains 'counter'" - }, - { - "step": "Ask for current value", - "check": "send: what is the current count?", - "status": "PASS", - "detail": "response: Der aktuelle Zählerstand ist 1.\n" - }, - { - "step": "Ask for current value", - "check": "response: contains \"0\" or \"zero\"", - "status": "FAIL", - "detail": "none of ['0', 'zero'] found in: Der aktuelle Zählerstand ist 1.\n" - }, - { - "step": "Increment", - "check": "action: increment", - "status": "PASS", - "detail": "response: Navigated to main" - }, - { - "step": "Increment", - "check": "response: contains \"1\" or \"one\" or \"increment\" or \"Navigated\"", - "status": "PASS", - "detail": "found 'Navigated'" - }, - { - "step": "Increment again", - "check": "action: increment", - "status": "PASS", - "detail": "response: Navigated to main" - }, - { - "step": "Increment again", - "check": "response: contains \"2\" or \"two\" or \"increment\" or \"Navigated\"", - "status": "PASS", - "detail": "found 'Navigated'" - }, - { - "step": "Decrement", - "check": "action: decrement", - "status": "PASS", - "detail": "response: Navigated to main" - }, - { - "step": "Decrement", - "check": "response: contains \"1\" or \"one\" or \"decrement\" or \"Navigated\"", - "status": "PASS", - "detail": "found 'Navigated'" - }, - { - "step": "Verify memorizer tracks it", - "check": "state: topic contains \"count\"", - "status": "PASS", - "detail": "topic=UI counter with increment/decrement buttons contains 'count'" - } - ], - "DB Exploration": [ - { - "step": "Setup", - "check": "clear", - "status": "PASS", - "detail": "cleared" - }, - { - "step": "Query renders table in workspace", - "check": "send: show me 5 customers from the database", - "status": "PASS", - "detail": "response: Here are 5 customers from the database:\n\n| ID | Name1 | Name2 " - }, - { - "step": "Query renders table in workspace", - "check": "trace: has tool_call", - "status": "PASS", - "detail": "found event 'tool_call'" - }, - { - "step": "Query renders table in workspace", - "check": "actions: has table", - "status": "PASS", - "detail": "table found: 23 cols, 5 rows" - }, - { - "step": "Query renders table in workspace", - "check": "response: not contains \"---|\" or \"| ID\"", - "status": "FAIL", - "detail": "found '---|' but expected NOT to" - }, - { - "step": "Chat summarizes, does not dump data", - "check": "response: contains \"customer\" or \"Kunde\" or \"5\" or \"table\"", - "status": "PASS", - "detail": "found 'customer'" - }, - { - "step": "Chat summarizes, does not dump data", - "check": "response: length > 10", - "status": "PASS", - "detail": "length 721 > 10" - }, - { - "step": "Thinker builds exploration UI (not describes it)", - "check": "send: select customer 2 Kathrin Jager, add but", - "status": "PASS", - "detail": "response: Okay, Kathrin Jager (ID 2) is selected. I'm adding buttons to explore her object" - }, - { - "step": "Thinker builds exploration UI (not describes it)", - "check": "actions: length >= 1", - "status": "PASS", - "detail": "7 actions >= 1" - }, - { - "step": "Thinker builds exploration UI (not describes it)", - "check": "response: not contains \"UI team\" or \"will add\" or \"will create\"", - "status": "PASS", - "detail": "none of ['UI team', 'will add', 'will create'] found (as expected)" - }, - { - "step": "Error recovery on bad query", - "check": "send: SELECT * FROM nichtexistiert LIMIT 5", - "status": "PASS", - "detail": "response: I'm sorry, I encountered an error while trying to fetch data from the database. " - }, - { - "step": "Error recovery on bad query", - "check": "trace: has tool_call", - "status": "PASS", - "detail": "found event 'tool_call'" - }, - { - "step": "Error recovery on bad query", - "check": "response: not contains \"1146\"", - "status": "PASS", - "detail": "none of ['1146'] found (as expected)" - }, - { - "step": "Error recovery on bad query", - "check": "response: length > 10", - "status": "PASS", - "detail": "length 134 > 10" - } - ], - "Director Node": [ - { - "step": "Setup", - "check": "clear", - "status": "PASS", - "detail": "cleared" - }, - { - "step": "Casual chat establishes mode", - "check": "send: hey, just hanging out, what's up?", - "status": "PASS", - "detail": "response: Hallo Nico,\n\nich bin gerade auf einen Fehler bei der Datenbankabfrage gestoßen: " - }, - { - "step": "Casual chat establishes mode", - "check": "response: length > 5", - "status": "PASS", - "detail": "length 284 > 5" - }, - { - "step": "Casual chat establishes mode", - "check": "trace: has director_updated", - "status": "PASS", - "detail": "found event 'director_updated'" - }, - { - "step": "Director picks up frustration", - "check": "send: ugh this is so annoying, nothing makes s", - "status": "PASS", - "detail": "response: Ich verstehe, dass das frustrierend ist. Wenn du magst, können wir versuchen, da" - }, - { - "step": "Director picks up frustration", - "check": "response: length > 10", - "status": "PASS", - "detail": "length 219 > 10" - }, - { - "step": "Director picks up frustration", - "check": "trace: has director_updated", - "status": "PASS", - "detail": "found event 'director_updated'" - }, - { - "step": "Switch to building mode", - "check": "send: ok let's build a todo list app", - "status": "PASS", - "detail": "response: Klar, lass uns eine To-Do-Listen-App erstellen! Hier sind die nächsten Schritte:" - }, - { - "step": "Switch to building mode", - "check": "response: length > 10", - "status": "PASS", - "detail": "length 518 > 10" - }, - { - "step": "Switch to building mode", - "check": "trace: has director_updated", - "status": "PASS", - "detail": "found event 'director_updated'" - } - ], - "Pub Conversation": [ - { - "step": "Setup", - "check": "clear", - "status": "PASS", - "detail": "cleared" - }, - { - "step": "Set the scene", - "check": "send: Hey, Alice and I are heading to the pub ", - "status": "PASS", - "detail": "response: That sounds like fun! 👍 Have a great evening at the pub with Alice! 🍻\n" - }, - { - "step": "Set the scene", - "check": "response: length > 10", - "status": "PASS", - "detail": "length 70 > 10" - }, - { - "step": "Set the scene", - "check": "state: situation contains \"pub\" or \"Alice\"", - "status": "PASS", - "detail": "situation=at a pub with alice contains 'pub'" - }, - { - "step": "Language switch to German", - "check": "send: Wir sind jetzt im Biergarten angekommen", - "status": "PASS", - "detail": "response: Super! Der Biergarten ist immer eine tolle Wahl. Was habt ihr geplant, etwas zu " - }, - { - "step": "Language switch to German", - "check": "response: length > 10", - "status": "PASS", - "detail": "length 135 > 10" - }, - { - "step": "Language switch to German", - "check": "state: language is \"de\" or \"mixed\"", - "status": "PASS", - "detail": "language=mixed" - }, - { - "step": "Context awareness", - "check": "send: Was sollen wir bestellen?", - "status": "PASS", - "detail": "response: Wie wäre es mit ein paar klassischen Biergarten-Gerichten? Ein Brezel mit Obatzd" - }, - { - "step": "Context awareness", - "check": "response: length > 10", - "status": "PASS", - "detail": "length 238 > 10" - }, - { - "step": "Context awareness", - "check": "state: topic contains \"bestell\" or \"order\" or \"pub\" or \"Biergarten\"", - "status": "PASS", - "detail": "topic=arriving at the Biergarten contains 'Biergarten'" - }, - { - "step": "Alice speaks", - "check": "send: Alice says: I'll have a Hefeweizen pleas", - "status": "PASS", - "detail": "response: Okay, Alice möchte also ein Hefeweizen. (So, Alice would like a Hefeweizen.)" - }, - { - "step": "Alice speaks", - "check": "response: length > 10", - "status": "PASS", - "detail": "length 76 > 10" - }, - { - "step": "Alice speaks", - "check": "state: facts any contains \"Alice\" or \"Hefeweizen\"", - "status": "FAIL", - "detail": "none of ['Alice', 'Hefeweizen'] found in facts: []" - }, - { - "step": "Ask for time (tool use)", - "check": "send: wie spaet ist es eigentlich?", - "status": "PASS", - "detail": "response: Es ist gerade 00:30 Uhr. Es wird spät! Was plant ihr noch für den Abend?\n" - }, - { - "step": "Ask for time (tool use)", - "check": "response: matches \\d{1,2}:\\d{2}", - "status": "PASS", - "detail": "matched /\\d{1,2}:\\d{2}/" - }, - { - "step": "Back to English", - "check": "send: Let's switch to English, what was the la", - "status": "PASS", - "detail": "response: Alice said, \"I'll have a Hefeweizen please.\"\n" - }, - { - "step": "Back to English", - "check": "state: language is \"en\" or \"mixed\"", - "status": "PASS", - "detail": "language=mixed" - }, - { - "step": "Back to English", - "check": "response: contains \"Alice\" or \"Hefeweizen\"", - "status": "PASS", - "detail": "found 'Alice'" - }, - { - "step": "Mood check", - "check": "send: This is really fun!", - "status": "PASS", - "detail": "response: Glad you're having a good time! 🎉 Should I make any recommendations alongside Al" - }, - { - "step": "Mood check", - "check": "state: user_mood is \"happy\" or \"playful\" or \"excited\"", - "status": "FAIL", - "detail": "user_mood= not in ['happy', 'playful', 'excited']" - } - ], - "Reflex Path": [ - { - "step": "Setup", - "check": "clear", - "status": "PASS", - "detail": "cleared" - }, - { - "step": "Greeting triggers reflex", - "check": "send: hey!", - "status": "PASS", - "detail": "response: Hey Nico! 👋 How can I help you today?\n" - }, - { - "step": "Greeting triggers reflex", - "check": "response: length > 2", - "status": "PASS", - "detail": "length 38 > 2" - }, - { - "step": "Greeting triggers reflex", - "check": "trace: has reflex_path", - "status": "PASS", - "detail": "found event 'reflex_path'" - }, - { - "step": "Thanks triggers reflex", - "check": "send: thanks", - "status": "PASS", - "detail": "response: You're welcome! 👍 Just let me know if you need anything else.\n" - }, - { - "step": "Thanks triggers reflex", - "check": "response: length > 2", - "status": "PASS", - "detail": "length 62 > 2" - }, - { - "step": "Thanks triggers reflex", - "check": "trace: has reflex_path", - "status": "PASS", - "detail": "found event 'reflex_path'" - }, - { - "step": "Complex request does NOT trigger reflex", - "check": "send: explain how neural networks work in deta", - "status": "PASS", - "detail": "response: Got it! I'll explain neural networks. I can either start with the basics, dive i" - }, - { - "step": "Complex request does NOT trigger reflex", - "check": "response: length > 20", - "status": "PASS", - "detail": "length 173 > 20" - }, - { - "step": "Complex request does NOT trigger reflex", - "check": "trace: input.analysis.intent is \"question\" or \"request\"", - "status": "FAIL", - "detail": "no input perceived event in trace" - }, - { - "step": "Complex request does NOT trigger reflex", - "check": "trace: has decided", - "status": "PASS", - "detail": "found event 'decided'" - } - ], "S3* Audit Corrections": [ { "step": "Setup", @@ -540,13 +12,13 @@ "step": "Tool calls produce results (baseline)", "check": "send: create two buttons: Alpha and Beta", "status": "PASS", - "detail": "response: Okay Nico, I've added \"Alpha\" and \"Beta\" buttons for you! 😊\n\nSince you're intere" + "detail": "response: 👍 Okay, I've created buttons labeled \"Alpha\" and \"Beta\".\n" }, { "step": "Tool calls produce results (baseline)", "check": "actions: length >= 1", "status": "PASS", - "detail": "7 actions >= 1" + "detail": "2 actions >= 1" }, { "step": "Tool calls produce results (baseline)", @@ -558,7 +30,7 @@ "step": "Dashboard mismatch triggers re-emit", "check": "send: I see nothing on my dashboard, fix it", "status": "PASS", - "detail": "response: You are absolutely right, Nico! My apologies, there seems to be an issue with th" + "detail": "response: 👍 Done — Alpha and Beta buttons are now live on your dashboard. They should appe" }, { "step": "Dashboard mismatch triggers re-emit", @@ -570,13 +42,13 @@ "step": "Dashboard mismatch triggers re-emit", "check": "actions: length >= 1", "status": "PASS", - "detail": "7 actions >= 1" + "detail": "2 actions >= 1" }, { "step": "DB error triggers retry with corrected SQL", "check": "send: SELECT * FROM NichtExistent LIMIT 5", "status": "PASS", - "detail": "response: Ah, my mistake! I see that I tried to run a query on a table called \"NichtExiste" + "detail": "response: Ah, it seems like the table `NichtExistent` does not exist. Double-check the tab" }, { "step": "DB error triggers retry with corrected SQL", @@ -594,13 +66,13 @@ "step": "DB error triggers retry with corrected SQL", "check": "response: length > 10", "status": "PASS", - "detail": "length 311 > 10" + "detail": "length 163 > 10" }, { "step": "Complex request gets Director plan", "check": "send: investigate which customers have the mos", "status": "PASS", - "detail": "response: Alright Nico, let's dig into the customer data! Sorry for the earlier dashboard " + "detail": "response: Okay, I'll look into which customers have the most devices. This might take a mo" }, { "step": "Complex request gets Director plan", @@ -618,416 +90,12 @@ "step": "Complex request gets Director plan", "check": "response: length > 20", "status": "PASS", - "detail": "length 476 > 20" - } - ], - "State Machines": [ - { - "step": "Setup", - "check": "clear", - "status": "PASS", - "detail": "cleared" - }, - { - "step": "Create a machine", - "check": "send: create a navigation machine called \"nav\"", - "status": "PASS", - "detail": "response: I encountered an issue trying to retrieve the customers with the most devices; t" - }, - { - "step": "Create a machine", - "check": "trace: has tool_call create_machine", - "status": "PASS", - "detail": "found create_machine via machine_created event" - }, - { - "step": "Create a machine", - "check": "trace: machine_created id=\"nav\"", - "status": "PASS", - "detail": "machine 'nav' created" - }, - { - "step": "Verify machine renders", - "check": "send: what machines are on my dashboard?", - "status": "PASS", - "detail": "response: You currently have the following machines on your dashboard:\n\n1. **nn\\_explorer" - }, - { - "step": "Verify machine renders", - "check": "response: contains \"nav\" or \"machine\"", - "status": "PASS", - "detail": "found 'nav'" - }, - { - "step": "Navigate via button click (local transition)", - "check": "action: menu_1", - "status": "PASS", - "detail": "response: Navigated to sub1" - }, - { - "step": "Navigate via button click (local transition)", - "check": "trace: has machine_transition", - "status": "PASS", - "detail": "found event 'machine_transition'" - }, - { - "step": "Navigate via button click (local transition)", - "check": "trace: no thinker", - "status": "PASS", - "detail": "no 'thinker' event (as expected)" - }, - { - "step": "Add a state to existing machine", - "check": "send: add a state \"sub3\" to the nav machine wi", - "status": "PASS", - "detail": "response: OK, I'll add a \"sub3\" state to the \"nav\" machine along with a \"Back\" button and " - }, - { - "step": "Add a state to existing machine", - "check": "trace: has tool_call add_state", - "status": "PASS", - "detail": "found add_state via machine_state_added event" - }, - { - "step": "Reset machine", - "check": "send: reset the nav machine to its initial sta", - "status": "PASS", - "detail": "response: OK, the \"nav\" machine has been reset to its initial state (\"main\"). You should n" - }, - { - "step": "Reset machine", - "check": "trace: has tool_call reset_machine", - "status": "PASS", - "detail": "found reset_machine via machine_reset event" - }, - { - "step": "Reset machine", - "check": "response: contains \"main\" or \"reset\" or \"initial\"", - "status": "PASS", - "detail": "found 'main'" - }, - { - "step": "Create second machine alongside first", - "check": "send: create a counter machine called \"clicks\"", - "status": "PASS", - "detail": "response: OK, creating a counter machine called \"clicks\":\n\n```python\nclicks = Blueprint(\"c" - }, - { - "step": "Create second machine alongside first", - "check": "trace: has tool_call create_machine", - "status": "PASS", - "detail": "found create_machine via machine_created event" - }, - { - "step": "Create second machine alongside first", - "check": "trace: machine_created id=\"clicks\"", - "status": "PASS", - "detail": "machine 'clicks' created" - }, - { - "step": "Both machines coexist", - "check": "send: what machines are running?", - "status": "PASS", - "detail": "response: You currently have the following machines running on your dashboard:\n\n1. **nn\\_" - }, - { - "step": "Both machines coexist", - "check": "response: contains \"nav\"", - "status": "PASS", - "detail": "found 'nav'" - }, - { - "step": "Both machines coexist", - "check": "response: contains \"click\"", - "status": "PASS", - "detail": "found 'click'" - }, - { - "step": "Destroy one machine", - "check": "send: destroy the clicks machine", - "status": "PASS", - "detail": "response: The \"clicks\" machine has been removed. If you need it again in the future, you'l" - }, - { - "step": "Destroy one machine", - "check": "trace: has tool_call destroy_machine", - "status": "PASS", - "detail": "found destroy_machine via machine_destroyed event" - }, - { - "step": "Destroy one machine", - "check": "send: what machines are running?", - "status": "PASS", - "detail": "response: Currently, these machines are active on your dashboard:\n\n* **nn\\_explorer**: I" - }, - { - "step": "Destroy one machine", - "check": "response: contains \"nav\"", - "status": "PASS", - "detail": "found 'nav'" - } - ], - "Structured Input Analysis": [ - { - "step": "Setup", - "check": "clear", - "status": "PASS", - "detail": "cleared" - }, - { - "step": "Social greeting", - "check": "send: hi there!", - "status": "PASS", - "detail": "response: Hi Nico! 👋 Want to add or modify a machine on the dashboard?\n" - }, - { - "step": "Social greeting", - "check": "response: length > 3", - "status": "PASS", - "detail": "length 61 > 3" - }, - { - "step": "Social greeting", - "check": "trace: input.analysis.intent is \"social\"", - "status": "PASS", - "detail": "input.analysis.intent=social" - }, - { - "step": "Social greeting", - "check": "trace: input.analysis.complexity is \"trivial\"", - "status": "PASS", - "detail": "input.analysis.complexity=trivial" - }, - { - "step": "Simple request", - "check": "send: create a counter starting at 0", - "status": "FAIL", - "detail": "The read operation timed out" - }, - { - "step": "Simple request", - "check": "response: length > 10", - "status": "PASS", - "detail": "length 61 > 10" - }, - { - "step": "Simple request", - "check": "trace: input.analysis.intent is \"request\" or \"action\"", - "status": "FAIL", - "detail": "input.analysis.intent=social, expected one of ['request', 'action']" - }, - { - "step": "Simple request", - "check": "trace: input.analysis.complexity is \"simple\" or \"complex\"", - "status": "FAIL", - "detail": "input.analysis.complexity=trivial, expected one of ['simple', 'complex']" - }, - { - "step": "German question", - "check": "send: Wie spaet ist es?", - "status": "PASS", - "detail": "response: Okay, let's create a counter. First, what do you want to call this counter machi" - }, - { - "step": "German question", - "check": "response: length > 5", - "status": "PASS", - "detail": "length 134 > 5" - }, - { - "step": "German question", - "check": "trace: input.analysis.language is \"de\"", - "status": "FAIL", - "detail": "input.analysis.language=mixed, expected one of ['de']" - }, - { - "step": "German question", - "check": "trace: input.analysis.intent is \"question\"", - "status": "FAIL", - "detail": "input.analysis.intent=request, expected one of ['question']" - }, - { - "step": "Frustrated tone", - "check": "send: this is broken, nothing works and I'm si", - "status": "PASS", - "detail": "response: I understand that you're experiencing frustration. Let's focus on getting things" - }, - { - "step": "Frustrated tone", - "check": "response: length > 10", - "status": "PASS", - "detail": "length 206 > 10" - }, - { - "step": "Frustrated tone", - "check": "trace: input.analysis.tone is \"frustrated\" or \"urgent\"", - "status": "PASS", - "detail": "input.analysis.tone=frustrated" - }, - { - "step": "Simple acknowledgment", - "check": "send: ok thanks bye", - "status": "PASS", - "detail": "response: Alright, if you change your mind or need assistance later, don't hesitate to rea" - }, - { - "step": "Simple acknowledgment", - "check": "trace: input.analysis.intent is \"social\"", - "status": "PASS", - "detail": "input.analysis.intent=social" - }, - { - "step": "Simple acknowledgment", - "check": "trace: input.analysis.complexity is \"trivial\"", - "status": "PASS", - "detail": "input.analysis.complexity=trivial" - } - ], - "Dashboard Feedback (S3*)": [ - { - "step": "Setup", - "check": "clear", - "status": "PASS", - "detail": "cleared" - }, - { - "step": "Thinker sees buttons in dashboard", - "check": "send: create two buttons: hello and world", - "status": "PASS", - "detail": "response: Okay, Nico, I understand your frustration. Let's simplify this. 😉\n\nI've created " - }, - { - "step": "Thinker sees buttons in dashboard", - "check": "actions: length >= 2", - "status": "PASS", - "detail": "10 actions >= 2" - }, - { - "step": "Thinker sees buttons in dashboard", - "check": "send: what buttons can you see in my dashboard", - "status": "PASS", - "detail": "response: Currently, your dashboard displays the following buttons:\n\n- Hello\n- World\n" - }, - { - "step": "Thinker sees buttons in dashboard", - "check": "response: contains \"Hello\" or \"hello\"", - "status": "PASS", - "detail": "found 'Hello'" - }, - { - "step": "Thinker sees buttons in dashboard", - "check": "response: contains \"World\" or \"world\"", - "status": "PASS", - "detail": "found 'World'" - }, - { - "step": "Thinker detects empty dashboard", - "check": "send: I see nothing in my dashboard, what happ", - "status": "PASS", - "detail": "response: Ah, that's not ideal! Sorry about that, Nico. It seems like there's a mismatch b" - }, - { - "step": "Thinker detects empty dashboard", - "check": "response: contains \"button\" or \"fix\" or \"restore\" or \"create\" or \"empty\"", - "status": "PASS", - "detail": "found 'button'" - }, - { - "step": "Dashboard state flows to thinker context", - "check": "send: create a counter starting at 5", - "status": "PASS", - "detail": "response: Okay, creating a new counter that starts at 5. \n\n(New UI counter shown to user, " - }, - { - "step": "Dashboard state flows to thinker context", - "check": "actions: length >= 1", - "status": "PASS", - "detail": "17 actions >= 1" - }, - { - "step": "Dashboard state flows to thinker context", - "check": "send: what does my dashboard show?", - "status": "PASS", - "detail": "response: Your dashboard is showing:\n\n* A counter with a current value of 5.\n* Buttons" - }, - { - "step": "Dashboard state flows to thinker context", - "check": "response: contains \"5\" or \"count\"", - "status": "PASS", - "detail": "found '5'" - } - ], - "Dashboard Mismatch Recovery": [ - { - "step": "Setup", - "check": "clear", - "status": "PASS", - "detail": "cleared" - }, - { - "step": "Create buttons", - "check": "send: create two buttons: red and blue", - "status": "PASS", - "detail": "response: Okay, I have created two buttons, labeled \"Red\" and \"Blue.\"\n\nSince the button st" - }, - { - "step": "Create buttons", - "check": "actions: length >= 2", - "status": "PASS", - "detail": "10 actions >= 2" - }, - { - "step": "Dashboard empty — Thinker re-emits", - "check": "send: I clicked red but nothing happened", - "status": "PASS", - "detail": "response: Okay, I understand nothing happened when you clicked the \"Red\" button. That's de" - }, - { - "step": "Dashboard empty — Thinker re-emits", - "check": "response: contains \"button\" or \"red\" or \"blue\"", - "status": "PASS", - "detail": "found 'button'" - }, - { - "step": "Dashboard empty — Thinker re-emits", - "check": "actions: length >= 1", - "status": "PASS", - "detail": "10 actions >= 1" - }, - { - "step": "Create counter", - "check": "send: create a counter starting at 0", - "status": "PASS", - "detail": "response: I can't create the counter directly for you, but I can set it up on the dashboar" - }, - { - "step": "Create counter", - "check": "actions: length >= 1", - "status": "PASS", - "detail": "17 actions >= 1" - }, - { - "step": "Counter missing from dashboard — Thinker recovers", - "check": "send: the dashboard is broken, I only see old ", - "status": "PASS", - "detail": "response: I understand your dashboard is only showing old content. I'm resending all butto" - }, - { - "step": "Counter missing from dashboard — Thinker recovers", - "check": "response: contains \"counter\" or \"count\" or \"fix\" or \"recreat\" or \"refresh\" or \"button\" or \"update\"", - "status": "PASS", - "detail": "found 'refresh'" - }, - { - "step": "Counter missing from dashboard — Thinker recovers", - "check": "actions: length >= 1", - "status": "PASS", - "detail": "11 actions >= 1" + "detail": "length 86 > 20" } ] }, "summary": { - "passed": 155, - "failed": 12 + "passed": 14, + "failed": 1 } } \ No newline at end of file