- Memorizer tracks user_expectation (conversational/delegated/waiting_input/observing) - Output node adjusts phrasing per expectation - PA retry loop: reformulates job on expert failure (all retries exhausted or tool skip) - Machine state in PA context: get_machine_summary includes current state, buttons, stored data - Expert writes to machine state via update_machine + transition_machine - Expanded baked schema coverage - Awareness panel shows color-coded expectation state - Dashboard and workspace component updates Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1033 lines
32 KiB
JSON
1033 lines
32 KiB
JSON
{
|
|
"timestamp": "2026-03-29 06:11:18",
|
|
"testcases": {
|
|
"Button Persistence": [
|
|
{
|
|
"step": "Setup",
|
|
"check": "clear",
|
|
"status": "PASS",
|
|
"detail": "cleared"
|
|
},
|
|
{
|
|
"step": "Create buttons",
|
|
"check": "send: create two buttons: Poodle Bark and Bolo",
|
|
"status": "PASS",
|
|
"detail": "response: Okay, I've created two buttons for you: \"Poodle Bark\" and \"Bolonka Bark\". 🐶 \n"
|
|
},
|
|
{
|
|
"step": "Create buttons",
|
|
"check": "actions: length >= 2",
|
|
"status": "PASS",
|
|
"detail": "2 actions >= 2"
|
|
},
|
|
{
|
|
"step": "Create buttons",
|
|
"check": "actions: any action contains \"poodle\" or \"Poodle\"",
|
|
"status": "PASS",
|
|
"detail": "found 'poodle' in actions"
|
|
},
|
|
{
|
|
"step": "Create buttons",
|
|
"check": "actions: any action contains \"bolonka\" or \"Bolonka\"",
|
|
"status": "PASS",
|
|
"detail": "found 'bolonka' in actions"
|
|
},
|
|
{
|
|
"step": "Ask unrelated question (buttons must survive)",
|
|
"check": "send: what time is it?",
|
|
"status": "PASS",
|
|
"detail": "response: The current time is 6:08 AM. ⏰\n"
|
|
},
|
|
{
|
|
"step": "Ask unrelated question (buttons must survive)",
|
|
"check": "response: contains \":\" or \"time\" or \"clock\"",
|
|
"status": "PASS",
|
|
"detail": "found ':'"
|
|
},
|
|
{
|
|
"step": "Ask unrelated question (buttons must survive)",
|
|
"check": "actions: any action contains \"poodle\" or \"Poodle\"",
|
|
"status": "PASS",
|
|
"detail": "found 'poodle' in actions"
|
|
},
|
|
{
|
|
"step": "Ask unrelated question (buttons must survive)",
|
|
"check": "actions: any action contains \"bolonka\" or \"Bolonka\"",
|
|
"status": "PASS",
|
|
"detail": "found 'bolonka' in actions"
|
|
},
|
|
{
|
|
"step": "Ask another question (buttons still there)",
|
|
"check": "send: say hello in German",
|
|
"status": "PASS",
|
|
"detail": "response: Hallo! 👋\n"
|
|
},
|
|
{
|
|
"step": "Ask another question (buttons still there)",
|
|
"check": "response: contains \"Hallo\" or \"hallo\" or \"German\"",
|
|
"status": "PASS",
|
|
"detail": "found 'Hallo'"
|
|
},
|
|
{
|
|
"step": "Ask another question (buttons still there)",
|
|
"check": "actions: any action contains \"poodle\" or \"Poodle\"",
|
|
"status": "PASS",
|
|
"detail": "found 'poodle' in actions"
|
|
},
|
|
{
|
|
"step": "Explicitly replace buttons",
|
|
"check": "send: remove all buttons and create one button",
|
|
"status": "PASS",
|
|
"detail": "response: You got it! I've removed the existing buttons and created a single button labele"
|
|
},
|
|
{
|
|
"step": "Explicitly replace buttons",
|
|
"check": "actions: length >= 1",
|
|
"status": "PASS",
|
|
"detail": "1 actions >= 1"
|
|
},
|
|
{
|
|
"step": "Explicitly replace buttons",
|
|
"check": "actions: any action contains \"reset\" or \"Reset\"",
|
|
"status": "PASS",
|
|
"detail": "found 'reset' in actions"
|
|
}
|
|
],
|
|
"Counter State": [
|
|
{
|
|
"step": "Setup",
|
|
"check": "clear",
|
|
"status": "PASS",
|
|
"detail": "cleared"
|
|
},
|
|
{
|
|
"step": "Create counter",
|
|
"check": "send: create a counter starting at 0 with incr",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "Create counter",
|
|
"check": "response: contains \"counter\" or \"count\"",
|
|
"status": "FAIL",
|
|
"detail": "none of ['counter', 'count'] found in: "
|
|
},
|
|
{
|
|
"step": "Create counter",
|
|
"check": "actions: length >= 2",
|
|
"status": "FAIL",
|
|
"detail": "1 actions < 2"
|
|
},
|
|
{
|
|
"step": "Create counter",
|
|
"check": "actions: any action contains \"increment\" or \"inc\" or \"plus\" or \"add\"",
|
|
"status": "FAIL",
|
|
"detail": "none of ['increment', 'inc', 'plus', 'add'] found in 1 buttons"
|
|
},
|
|
{
|
|
"step": "Create counter",
|
|
"check": "actions: any action contains \"decrement\" or \"dec\" or \"minus\" or \"sub\"",
|
|
"status": "FAIL",
|
|
"detail": "none of ['decrement', 'dec', 'minus', 'sub'] found in 1 buttons"
|
|
},
|
|
{
|
|
"step": "Check state",
|
|
"check": "state: topic contains \"counter\" or \"count\" or \"button\"",
|
|
"status": "FAIL",
|
|
"detail": "topic=UI creation doesn't contain any of ['counter', 'count', 'button']"
|
|
},
|
|
{
|
|
"step": "Ask for current value",
|
|
"check": "send: what is the current count?",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "Ask for current value",
|
|
"check": "response: contains \"0\" or \"zero\"",
|
|
"status": "FAIL",
|
|
"detail": "none of ['0', 'zero'] found in: "
|
|
},
|
|
{
|
|
"step": "Increment",
|
|
"check": "action matching 'inc'",
|
|
"status": "FAIL",
|
|
"detail": "no action matching 'inc' in ['reset']"
|
|
},
|
|
{
|
|
"step": "Increment",
|
|
"check": "response: contains \"1\" or \"one\" or \"increment\" or \"Navigated\"",
|
|
"status": "FAIL",
|
|
"detail": "none of ['1', 'one', 'increment', 'Navigated'] found in: "
|
|
},
|
|
{
|
|
"step": "Increment again",
|
|
"check": "action matching 'inc'",
|
|
"status": "FAIL",
|
|
"detail": "no action matching 'inc' in ['reset']"
|
|
},
|
|
{
|
|
"step": "Increment again",
|
|
"check": "response: contains \"2\" or \"two\" or \"increment\" or \"Navigated\"",
|
|
"status": "FAIL",
|
|
"detail": "none of ['2', 'two', 'increment', 'Navigated'] found in: "
|
|
},
|
|
{
|
|
"step": "Decrement",
|
|
"check": "action matching 'dec'",
|
|
"status": "FAIL",
|
|
"detail": "no action matching 'dec' in ['reset']"
|
|
},
|
|
{
|
|
"step": "Decrement",
|
|
"check": "response: contains \"1\" or \"one\" or \"decrement\" or \"Navigated\"",
|
|
"status": "FAIL",
|
|
"detail": "none of ['1', 'one', 'decrement', 'Navigated'] found in: "
|
|
},
|
|
{
|
|
"step": "Verify memorizer tracks it",
|
|
"check": "state: topic contains \"count\"",
|
|
"status": "FAIL",
|
|
"detail": "topic=UI creation doesn't contain any of ['count']"
|
|
}
|
|
],
|
|
"DB Exploration": [
|
|
{
|
|
"step": "Setup",
|
|
"check": "clear",
|
|
"status": "PASS",
|
|
"detail": "cleared"
|
|
},
|
|
{
|
|
"step": "Query renders table in workspace",
|
|
"check": "send: show me 5 customers from the database",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "Query renders table in workspace",
|
|
"check": "trace: has tool_call",
|
|
"status": "FAIL",
|
|
"detail": "no 'tool_call' event in trace"
|
|
},
|
|
{
|
|
"step": "Query renders table in workspace",
|
|
"check": "actions: has table",
|
|
"status": "FAIL",
|
|
"detail": "no table in 1 controls"
|
|
},
|
|
{
|
|
"step": "Query renders table in workspace",
|
|
"check": "response: not contains \"---|\" or \"| ID\"",
|
|
"status": "PASS",
|
|
"detail": "none of ['---|', '| ID'] found (as expected)"
|
|
},
|
|
{
|
|
"step": "Chat summarizes, does not dump data",
|
|
"check": "response: contains \"customer\" or \"Kunde\" or \"5\" or \"table\"",
|
|
"status": "FAIL",
|
|
"detail": "none of ['customer', 'Kunde', '5', 'table'] found in: "
|
|
},
|
|
{
|
|
"step": "Chat summarizes, does not dump data",
|
|
"check": "response: length > 10",
|
|
"status": "FAIL",
|
|
"detail": "length 0 <= 10"
|
|
},
|
|
{
|
|
"step": "Thinker builds exploration UI (not describes it)",
|
|
"check": "send: select customer 2 Kathrin Jager, add but",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "Thinker builds exploration UI (not describes it)",
|
|
"check": "actions: length >= 1",
|
|
"status": "PASS",
|
|
"detail": "1 actions >= 1"
|
|
},
|
|
{
|
|
"step": "Thinker builds exploration UI (not describes it)",
|
|
"check": "response: not contains \"UI team\" or \"will add\" or \"will create\"",
|
|
"status": "PASS",
|
|
"detail": "none of ['UI team', 'will add', 'will create'] found (as expected)"
|
|
},
|
|
{
|
|
"step": "Error recovery on bad query",
|
|
"check": "send: SELECT * FROM nichtexistiert LIMIT 5",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "Error recovery on bad query",
|
|
"check": "trace: has tool_call",
|
|
"status": "FAIL",
|
|
"detail": "no 'tool_call' event in trace"
|
|
},
|
|
{
|
|
"step": "Error recovery on bad query",
|
|
"check": "response: not contains \"1146\"",
|
|
"status": "PASS",
|
|
"detail": "none of ['1146'] found (as expected)"
|
|
},
|
|
{
|
|
"step": "Error recovery on bad query",
|
|
"check": "response: length > 10",
|
|
"status": "FAIL",
|
|
"detail": "length 0 <= 10"
|
|
}
|
|
],
|
|
"Director Node": [
|
|
{
|
|
"step": "Setup",
|
|
"check": "clear",
|
|
"status": "PASS",
|
|
"detail": "cleared"
|
|
},
|
|
{
|
|
"step": "Casual chat establishes mode",
|
|
"check": "send: hey, just hanging out, what's up?",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "Casual chat establishes mode",
|
|
"check": "response: length > 5",
|
|
"status": "FAIL",
|
|
"detail": "length 0 <= 5"
|
|
},
|
|
{
|
|
"step": "Casual chat establishes mode",
|
|
"check": "trace: has director_updated",
|
|
"status": "PASS",
|
|
"detail": "found event 'director_updated'"
|
|
},
|
|
{
|
|
"step": "Director picks up frustration",
|
|
"check": "send: ugh this is so annoying, nothing makes s",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "Director picks up frustration",
|
|
"check": "response: length > 10",
|
|
"status": "FAIL",
|
|
"detail": "length 0 <= 10"
|
|
},
|
|
{
|
|
"step": "Director picks up frustration",
|
|
"check": "trace: has director_updated",
|
|
"status": "PASS",
|
|
"detail": "found event 'director_updated'"
|
|
},
|
|
{
|
|
"step": "Switch to building mode",
|
|
"check": "send: ok let's build a todo list app",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "Switch to building mode",
|
|
"check": "response: length > 10",
|
|
"status": "FAIL",
|
|
"detail": "length 0 <= 10"
|
|
},
|
|
{
|
|
"step": "Switch to building mode",
|
|
"check": "trace: has director_updated",
|
|
"status": "PASS",
|
|
"detail": "found event 'director_updated'"
|
|
}
|
|
],
|
|
"Pub Conversation": [
|
|
{
|
|
"step": "Setup",
|
|
"check": "clear",
|
|
"status": "PASS",
|
|
"detail": "cleared"
|
|
},
|
|
{
|
|
"step": "Set the scene",
|
|
"check": "send: Hey, Alice and I are heading to the pub ",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "Set the scene",
|
|
"check": "response: length > 10",
|
|
"status": "FAIL",
|
|
"detail": "length 0 <= 10"
|
|
},
|
|
{
|
|
"step": "Set the scene",
|
|
"check": "state: situation contains \"pub\" or \"Alice\"",
|
|
"status": "FAIL",
|
|
"detail": "situation=local session doesn't contain any of ['pub', 'Alice']"
|
|
},
|
|
{
|
|
"step": "Language switch to German",
|
|
"check": "send: Wir sind jetzt im Biergarten angekommen",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "Language switch to German",
|
|
"check": "response: length > 10",
|
|
"status": "FAIL",
|
|
"detail": "length 0 <= 10"
|
|
},
|
|
{
|
|
"step": "Language switch to German",
|
|
"check": "state: language is \"de\" or \"mixed\"",
|
|
"status": "PASS",
|
|
"detail": "language=mixed"
|
|
},
|
|
{
|
|
"step": "Context awareness",
|
|
"check": "send: Was sollen wir bestellen?",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "Context awareness",
|
|
"check": "response: length > 10",
|
|
"status": "FAIL",
|
|
"detail": "length 0 <= 10"
|
|
},
|
|
{
|
|
"step": "Context awareness",
|
|
"check": "state: topic contains \"bestell\" or \"order\" or \"pub\" or \"Biergarten\"",
|
|
"status": "FAIL",
|
|
"detail": "topic=UI creation doesn't contain any of ['bestell', 'order', 'pub', 'Biergarten']"
|
|
},
|
|
{
|
|
"step": "Alice speaks",
|
|
"check": "send: Alice says: I'll have a Hefeweizen pleas",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "Alice speaks",
|
|
"check": "response: length > 10",
|
|
"status": "FAIL",
|
|
"detail": "length 0 <= 10"
|
|
},
|
|
{
|
|
"step": "Alice speaks",
|
|
"check": "state: facts any contains \"Alice\" or \"Hefeweizen\"",
|
|
"status": "FAIL",
|
|
"detail": "none of ['Alice', 'Hefeweizen'] found in facts: []"
|
|
},
|
|
{
|
|
"step": "Ask for time (tool use)",
|
|
"check": "send: wie spaet ist es eigentlich?",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "Ask for time (tool use)",
|
|
"check": "response: matches \\d{1,2}:\\d{2}",
|
|
"status": "FAIL",
|
|
"detail": "/\\d{1,2}:\\d{2}/ not found in: "
|
|
},
|
|
{
|
|
"step": "Back to English",
|
|
"check": "send: Let's switch to English, what was the la",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "Back to English",
|
|
"check": "state: language is \"en\" or \"mixed\"",
|
|
"status": "PASS",
|
|
"detail": "language=mixed"
|
|
},
|
|
{
|
|
"step": "Back to English",
|
|
"check": "response: contains \"Alice\" or \"Hefeweizen\"",
|
|
"status": "FAIL",
|
|
"detail": "none of ['Alice', 'Hefeweizen'] found in: "
|
|
},
|
|
{
|
|
"step": "Mood check",
|
|
"check": "send: This is really fun!",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "Mood check",
|
|
"check": "state: user_mood is \"happy\" or \"playful\" or \"excited\"",
|
|
"status": "FAIL",
|
|
"detail": "user_mood=neutral not in ['happy', 'playful', 'excited']"
|
|
}
|
|
],
|
|
"Reflex Path": [
|
|
{
|
|
"step": "Setup",
|
|
"check": "clear",
|
|
"status": "PASS",
|
|
"detail": "cleared"
|
|
},
|
|
{
|
|
"step": "Greeting triggers reflex",
|
|
"check": "send: hey!",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "Greeting triggers reflex",
|
|
"check": "response: length > 2",
|
|
"status": "FAIL",
|
|
"detail": "length 0 <= 2"
|
|
},
|
|
{
|
|
"step": "Greeting triggers reflex",
|
|
"check": "trace: has reflex_path",
|
|
"status": "FAIL",
|
|
"detail": "no 'reflex_path' event in trace"
|
|
},
|
|
{
|
|
"step": "Thanks triggers reflex",
|
|
"check": "send: thanks",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "Thanks triggers reflex",
|
|
"check": "response: length > 2",
|
|
"status": "FAIL",
|
|
"detail": "length 0 <= 2"
|
|
},
|
|
{
|
|
"step": "Thanks triggers reflex",
|
|
"check": "trace: has reflex_path",
|
|
"status": "FAIL",
|
|
"detail": "no 'reflex_path' event in trace"
|
|
},
|
|
{
|
|
"step": "Complex request does NOT trigger reflex",
|
|
"check": "send: explain how neural networks work in deta",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "Complex request does NOT trigger reflex",
|
|
"check": "response: length > 20",
|
|
"status": "FAIL",
|
|
"detail": "length 0 <= 20"
|
|
},
|
|
{
|
|
"step": "Complex request does NOT trigger reflex",
|
|
"check": "trace: input.analysis.intent is \"question\" or \"request\"",
|
|
"status": "PASS",
|
|
"detail": "input.analysis.intent=request"
|
|
},
|
|
{
|
|
"step": "Complex request does NOT trigger reflex",
|
|
"check": "trace: has decided",
|
|
"status": "FAIL",
|
|
"detail": "no 'decided' event in trace"
|
|
}
|
|
],
|
|
"S3* Audit Corrections": [
|
|
{
|
|
"step": "Setup",
|
|
"check": "clear",
|
|
"status": "PASS",
|
|
"detail": "cleared"
|
|
},
|
|
{
|
|
"step": "Tool calls produce results (baseline)",
|
|
"check": "send: create two buttons: Alpha and Beta",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "Tool calls produce results (baseline)",
|
|
"check": "actions: length >= 1",
|
|
"status": "PASS",
|
|
"detail": "1 actions >= 1"
|
|
},
|
|
{
|
|
"step": "Tool calls produce results (baseline)",
|
|
"check": "actions: any action contains \"alpha\" or \"Alpha\"",
|
|
"status": "FAIL",
|
|
"detail": "none of ['alpha', 'Alpha'] found in 1 buttons"
|
|
},
|
|
{
|
|
"step": "Dashboard mismatch triggers re-emit",
|
|
"check": "send: I see nothing on my dashboard, fix it",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "Dashboard mismatch triggers re-emit",
|
|
"check": "response: not contains \"sorry\" or \"apologize\"",
|
|
"status": "PASS",
|
|
"detail": "none of ['sorry', 'apologize'] found (as expected)"
|
|
},
|
|
{
|
|
"step": "Dashboard mismatch triggers re-emit",
|
|
"check": "actions: length >= 1",
|
|
"status": "PASS",
|
|
"detail": "1 actions >= 1"
|
|
},
|
|
{
|
|
"step": "DB error triggers retry with corrected SQL",
|
|
"check": "send: SELECT * FROM NichtExistent LIMIT 5",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "DB error triggers retry with corrected SQL",
|
|
"check": "trace: has tool_call",
|
|
"status": "FAIL",
|
|
"detail": "no 'tool_call' event in trace"
|
|
},
|
|
{
|
|
"step": "DB error triggers retry with corrected SQL",
|
|
"check": "response: not contains \"1146\"",
|
|
"status": "PASS",
|
|
"detail": "none of ['1146'] found (as expected)"
|
|
},
|
|
{
|
|
"step": "DB error triggers retry with corrected SQL",
|
|
"check": "response: length > 10",
|
|
"status": "FAIL",
|
|
"detail": "length 0 <= 10"
|
|
},
|
|
{
|
|
"step": "Complex request gets Director plan",
|
|
"check": "send: investigate which customers have the mos",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "Complex request gets Director plan",
|
|
"check": "trace: has director_plan",
|
|
"status": "FAIL",
|
|
"detail": "no 'director_plan' event in trace"
|
|
},
|
|
{
|
|
"step": "Complex request gets Director plan",
|
|
"check": "trace: has tool_call",
|
|
"status": "FAIL",
|
|
"detail": "no 'tool_call' event in trace"
|
|
},
|
|
{
|
|
"step": "Complex request gets Director plan",
|
|
"check": "response: length > 20",
|
|
"status": "FAIL",
|
|
"detail": "length 0 <= 20"
|
|
}
|
|
],
|
|
"State Machines": [
|
|
{
|
|
"step": "Setup",
|
|
"check": "clear",
|
|
"status": "PASS",
|
|
"detail": "cleared"
|
|
},
|
|
{
|
|
"step": "Create a machine",
|
|
"check": "send: create a navigation machine called \"nav\"",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "Create a machine",
|
|
"check": "trace: has tool_call create_machine",
|
|
"status": "FAIL",
|
|
"detail": "no tool_call 'create_machine' in trace"
|
|
},
|
|
{
|
|
"step": "Create a machine",
|
|
"check": "trace: machine_created id=\"nav\"",
|
|
"status": "FAIL",
|
|
"detail": "no machine_created event with id='nav'"
|
|
},
|
|
{
|
|
"step": "Verify machine renders",
|
|
"check": "send: what machines are on my dashboard?",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "Verify machine renders",
|
|
"check": "response: contains \"nav\" or \"machine\"",
|
|
"status": "FAIL",
|
|
"detail": "none of ['nav', 'machine'] found in: "
|
|
},
|
|
{
|
|
"step": "Navigate via button click (local transition)",
|
|
"check": "action matching 'menu_1'",
|
|
"status": "FAIL",
|
|
"detail": "no action matching 'menu_1' in ['reset']"
|
|
},
|
|
{
|
|
"step": "Navigate via button click (local transition)",
|
|
"check": "trace: has machine_transition",
|
|
"status": "FAIL",
|
|
"detail": "no 'machine_transition' event in trace"
|
|
},
|
|
{
|
|
"step": "Navigate via button click (local transition)",
|
|
"check": "trace: no thinker",
|
|
"status": "PASS",
|
|
"detail": "no 'thinker' event (as expected)"
|
|
},
|
|
{
|
|
"step": "Add a state to existing machine",
|
|
"check": "send: add a state \"sub3\" to the nav machine wi",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "Add a state to existing machine",
|
|
"check": "trace: has tool_call add_state",
|
|
"status": "FAIL",
|
|
"detail": "no tool_call 'add_state' in trace"
|
|
},
|
|
{
|
|
"step": "Reset machine",
|
|
"check": "send: reset the nav machine to its initial sta",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "Reset machine",
|
|
"check": "trace: has tool_call reset_machine",
|
|
"status": "FAIL",
|
|
"detail": "no tool_call 'reset_machine' in trace"
|
|
},
|
|
{
|
|
"step": "Reset machine",
|
|
"check": "response: contains \"main\" or \"reset\" or \"initial\"",
|
|
"status": "FAIL",
|
|
"detail": "none of ['main', 'reset', 'initial'] found in: "
|
|
},
|
|
{
|
|
"step": "Create second machine alongside first",
|
|
"check": "send: create a counter machine called \"clicks\"",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "Create second machine alongside first",
|
|
"check": "trace: has tool_call create_machine",
|
|
"status": "FAIL",
|
|
"detail": "no tool_call 'create_machine' in trace"
|
|
},
|
|
{
|
|
"step": "Create second machine alongside first",
|
|
"check": "trace: machine_created id=\"clicks\"",
|
|
"status": "FAIL",
|
|
"detail": "no machine_created event with id='clicks'"
|
|
},
|
|
{
|
|
"step": "Both machines coexist",
|
|
"check": "send: what machines are running?",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "Both machines coexist",
|
|
"check": "response: contains \"nav\"",
|
|
"status": "FAIL",
|
|
"detail": "none of ['nav'] found in: "
|
|
},
|
|
{
|
|
"step": "Both machines coexist",
|
|
"check": "response: contains \"click\"",
|
|
"status": "FAIL",
|
|
"detail": "none of ['click'] found in: "
|
|
},
|
|
{
|
|
"step": "Destroy one machine",
|
|
"check": "send: destroy the clicks machine",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "Destroy one machine",
|
|
"check": "trace: has tool_call destroy_machine",
|
|
"status": "FAIL",
|
|
"detail": "no tool_call 'destroy_machine' in trace"
|
|
},
|
|
{
|
|
"step": "Destroy one machine",
|
|
"check": "send: what machines are running?",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "Destroy one machine",
|
|
"check": "response: contains \"nav\"",
|
|
"status": "FAIL",
|
|
"detail": "none of ['nav'] found in: "
|
|
}
|
|
],
|
|
"Structured Input Analysis": [
|
|
{
|
|
"step": "Setup",
|
|
"check": "clear",
|
|
"status": "PASS",
|
|
"detail": "cleared"
|
|
},
|
|
{
|
|
"step": "Social greeting",
|
|
"check": "send: hi there!",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "Social greeting",
|
|
"check": "response: length > 3",
|
|
"status": "FAIL",
|
|
"detail": "length 0 <= 3"
|
|
},
|
|
{
|
|
"step": "Social greeting",
|
|
"check": "trace: input.analysis.intent is \"social\"",
|
|
"status": "FAIL",
|
|
"detail": "input.analysis.intent=request, expected one of ['social']"
|
|
},
|
|
{
|
|
"step": "Social greeting",
|
|
"check": "trace: input.analysis.complexity is \"trivial\"",
|
|
"status": "FAIL",
|
|
"detail": "input.analysis.complexity=simple, expected one of ['trivial']"
|
|
},
|
|
{
|
|
"step": "Simple request",
|
|
"check": "send: create a counter starting at 0",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "Simple request",
|
|
"check": "response: length > 10",
|
|
"status": "FAIL",
|
|
"detail": "length 0 <= 10"
|
|
},
|
|
{
|
|
"step": "Simple request",
|
|
"check": "trace: input.analysis.intent is \"request\" or \"action\"",
|
|
"status": "PASS",
|
|
"detail": "input.analysis.intent=request"
|
|
},
|
|
{
|
|
"step": "Simple request",
|
|
"check": "trace: input.analysis.complexity is \"simple\" or \"complex\"",
|
|
"status": "PASS",
|
|
"detail": "input.analysis.complexity=simple"
|
|
},
|
|
{
|
|
"step": "German question",
|
|
"check": "send: Wie spaet ist es?",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "German question",
|
|
"check": "response: length > 5",
|
|
"status": "FAIL",
|
|
"detail": "length 0 <= 5"
|
|
},
|
|
{
|
|
"step": "German question",
|
|
"check": "trace: input.analysis.language is \"de\"",
|
|
"status": "FAIL",
|
|
"detail": "input.analysis.language=en, expected one of ['de']"
|
|
},
|
|
{
|
|
"step": "German question",
|
|
"check": "trace: input.analysis.intent is \"question\"",
|
|
"status": "FAIL",
|
|
"detail": "input.analysis.intent=request, expected one of ['question']"
|
|
},
|
|
{
|
|
"step": "Frustrated tone",
|
|
"check": "send: this is broken, nothing works and I'm si",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "Frustrated tone",
|
|
"check": "response: length > 10",
|
|
"status": "FAIL",
|
|
"detail": "length 0 <= 10"
|
|
},
|
|
{
|
|
"step": "Frustrated tone",
|
|
"check": "trace: input.analysis.tone is \"frustrated\" or \"urgent\"",
|
|
"status": "FAIL",
|
|
"detail": "input.analysis.tone=casual, expected one of ['frustrated', 'urgent']"
|
|
},
|
|
{
|
|
"step": "Simple acknowledgment",
|
|
"check": "send: ok thanks bye",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "Simple acknowledgment",
|
|
"check": "trace: input.analysis.intent is \"social\"",
|
|
"status": "FAIL",
|
|
"detail": "input.analysis.intent=request, expected one of ['social']"
|
|
},
|
|
{
|
|
"step": "Simple acknowledgment",
|
|
"check": "trace: input.analysis.complexity is \"trivial\"",
|
|
"status": "FAIL",
|
|
"detail": "input.analysis.complexity=simple, expected one of ['trivial']"
|
|
}
|
|
],
|
|
"Dashboard Feedback (S3*)": [
|
|
{
|
|
"step": "Setup",
|
|
"check": "clear",
|
|
"status": "PASS",
|
|
"detail": "cleared"
|
|
},
|
|
{
|
|
"step": "Thinker sees buttons in dashboard",
|
|
"check": "send: create two buttons: hello and world",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "Thinker sees buttons in dashboard",
|
|
"check": "actions: length >= 2",
|
|
"status": "FAIL",
|
|
"detail": "1 actions < 2"
|
|
},
|
|
{
|
|
"step": "Thinker sees buttons in dashboard",
|
|
"check": "send: what buttons can you see in my dashboard",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "Thinker sees buttons in dashboard",
|
|
"check": "response: contains \"Hello\" or \"hello\"",
|
|
"status": "FAIL",
|
|
"detail": "none of ['Hello', 'hello'] found in: "
|
|
},
|
|
{
|
|
"step": "Thinker sees buttons in dashboard",
|
|
"check": "response: contains \"World\" or \"world\"",
|
|
"status": "FAIL",
|
|
"detail": "none of ['World', 'world'] found in: "
|
|
},
|
|
{
|
|
"step": "Thinker detects empty dashboard",
|
|
"check": "send: I see nothing in my dashboard, what happ",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "Thinker detects empty dashboard",
|
|
"check": "response: contains \"button\" or \"fix\" or \"restore\" or \"create\" or \"empty\"",
|
|
"status": "FAIL",
|
|
"detail": "none of ['button', 'fix', 'restore', 'create', 'empty'] found in: "
|
|
},
|
|
{
|
|
"step": "Dashboard state flows to thinker context",
|
|
"check": "send: create a counter starting at 5",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "Dashboard state flows to thinker context",
|
|
"check": "actions: length >= 1",
|
|
"status": "PASS",
|
|
"detail": "1 actions >= 1"
|
|
},
|
|
{
|
|
"step": "Dashboard state flows to thinker context",
|
|
"check": "send: what does my dashboard show?",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "Dashboard state flows to thinker context",
|
|
"check": "response: contains \"5\" or \"count\"",
|
|
"status": "FAIL",
|
|
"detail": "none of ['5', 'count'] found in: "
|
|
}
|
|
],
|
|
"Dashboard Mismatch Recovery": [
|
|
{
|
|
"step": "Setup",
|
|
"check": "clear",
|
|
"status": "PASS",
|
|
"detail": "cleared"
|
|
},
|
|
{
|
|
"step": "Create buttons",
|
|
"check": "send: create two buttons: red and blue",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "Create buttons",
|
|
"check": "actions: length >= 2",
|
|
"status": "FAIL",
|
|
"detail": "1 actions < 2"
|
|
},
|
|
{
|
|
"step": "Dashboard empty — Thinker re-emits",
|
|
"check": "send: I clicked red but nothing happened",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "Dashboard empty — Thinker re-emits",
|
|
"check": "response: contains \"button\" or \"red\" or \"blue\"",
|
|
"status": "FAIL",
|
|
"detail": "none of ['button', 'red', 'blue'] found in: "
|
|
},
|
|
{
|
|
"step": "Dashboard empty — Thinker re-emits",
|
|
"check": "actions: length >= 1",
|
|
"status": "PASS",
|
|
"detail": "1 actions >= 1"
|
|
},
|
|
{
|
|
"step": "Create counter",
|
|
"check": "send: create a counter starting at 0",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "Create counter",
|
|
"check": "actions: length >= 1",
|
|
"status": "PASS",
|
|
"detail": "1 actions >= 1"
|
|
},
|
|
{
|
|
"step": "Counter missing from dashboard — Thinker recovers",
|
|
"check": "send: the dashboard is broken, I only see old ",
|
|
"status": "PASS",
|
|
"detail": "response: "
|
|
},
|
|
{
|
|
"step": "Counter missing from dashboard — Thinker recovers",
|
|
"check": "response: contains \"counter\" or \"count\" or \"fix\" or \"recreat\" or \"refresh\" or \"button\" or \"update\"",
|
|
"status": "FAIL",
|
|
"detail": "none of ['counter', 'count', 'fix', 'recreat', 'refresh', 'button', 'update'] found in: "
|
|
},
|
|
{
|
|
"step": "Counter missing from dashboard — Thinker recovers",
|
|
"check": "actions: length >= 1",
|
|
"status": "PASS",
|
|
"detail": "1 actions >= 1"
|
|
}
|
|
]
|
|
},
|
|
"summary": {
|
|
"passed": 90,
|
|
"failed": 77
|
|
}
|
|
} |