agent-runtime/testcases/results.json
Nico 925fff731f v0.17.0: User expectation tracking, PA retry loop, machine state in PA context
- Memorizer tracks user_expectation (conversational/delegated/waiting_input/observing)
- Output node adjusts phrasing per expectation
- PA retry loop: reformulates job on expert failure (all retries exhausted or tool skip)
- Machine state in PA context: get_machine_summary includes current state, buttons, stored data
- Expert writes to machine state via update_machine + transition_machine
- Expanded baked schema coverage
- Awareness panel shows color-coded expectation state
- Dashboard and workspace component updates

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-30 19:03:07 +02:00

363 lines
11 KiB
JSON

{
"timestamp": "2026-03-30 00:02:55",
"testcases": {
"Artifact System": [
{
"step": "Setup",
"check": "clear",
"status": "PASS",
"detail": "cleared"
},
{
"step": "Query produces data_table artifact",
"check": "send: show me 3 customers in a table",
"status": "PASS",
"detail": "response: The database contains information for three customers: Kathrin Jager, Leon Schre"
},
{
"step": "Query produces data_table artifact",
"check": "trace: has tool_call",
"status": "PASS",
"detail": "found event 'tool_call'"
},
{
"step": "Query produces data_table artifact",
"check": "response: length > 10",
"status": "PASS",
"detail": "length 138 > 10"
},
{
"step": "Entity detail via card",
"check": "send: show me details for customer 1",
"status": "PASS",
"detail": "response: ```tool_code\nquery_db({\"query\":\"SELECT * FROM customers WHERE customer_id = 1\"})"
},
{
"step": "Entity detail via card",
"check": "trace: has tool_call",
"status": "PASS",
"detail": "found event 'tool_call'"
},
{
"step": "Entity detail via card",
"check": "response: length > 10",
"status": "PASS",
"detail": "length 84 > 10"
},
{
"step": "Action bar via buttons",
"check": "send: create two buttons on my dashboard: Refr",
"status": "PASS",
"detail": "response: I have added the 'Refresh' and 'Export' buttons to your dashboard. These buttons"
},
{
"step": "Action bar via buttons",
"check": "actions: length >= 2",
"status": "PASS",
"detail": "2 actions >= 2"
},
{
"step": "Action bar via buttons",
"check": "actions: any action contains \"refresh\" or \"Refresh\"",
"status": "PASS",
"detail": "found 'refresh' in actions"
},
{
"step": "Machine artifact",
"check": "send: create a machine called \"flow\" with init",
"status": "PASS",
"detail": "response: OK, I've created a new interactive machine called 'flow' with the initial state "
},
{
"step": "Machine artifact",
"check": "trace: has machine_created",
"status": "PASS",
"detail": "found event 'machine_created'"
},
{
"step": "Query after buttons survive",
"check": "send: how many customers are there?",
"status": "PASS",
"detail": "response: There are 693 customers in the database.\n"
},
{
"step": "Query after buttons survive",
"check": "response: length > 5",
"status": "PASS",
"detail": "length 41 > 5"
},
{
"step": "Query after buttons survive",
"check": "actions: any action contains \"refresh\" or \"Refresh\"",
"status": "PASS",
"detail": "found 'refresh' in actions"
}
],
"Fast v4": [
{
"step": "Setup",
"check": "clear",
"status": "PASS",
"detail": "cleared"
},
{
"step": "Reflex",
"check": "send: hi!",
"status": "PASS",
"detail": "response: Hey Nico! 👋 How can I help you today?\n"
},
{
"step": "Reflex",
"check": "response: length > 2",
"status": "PASS",
"detail": "length 38 > 2"
},
{
"step": "PA routes to expert",
"check": "send: show me 3 customers",
"status": "PASS",
"detail": "response: Alright, I've fetched 3 customer records for you. You can see the ID, Name detai"
},
{
"step": "PA routes to expert",
"check": "trace: has routed",
"status": "PASS",
"detail": "found event 'routed'"
},
{
"step": "PA routes to expert",
"check": "trace: has tool_call",
"status": "PASS",
"detail": "found event 'tool_call'"
},
{
"step": "PA routes to expert",
"check": "response: length > 10",
"status": "PASS",
"detail": "length 181 > 10"
},
{
"step": "German query",
"check": "send: Zeig mir alle Tabellen in der Datenbank",
"status": "PASS",
"detail": "response: Okay, ich habe eine Liste aller Tabellen in der \"eras2_production\" Datenbank abg"
},
{
"step": "German query",
"check": "trace: has tool_call",
"status": "PASS",
"detail": "found event 'tool_call'"
},
{
"step": "German query",
"check": "response: length > 10",
"status": "PASS",
"detail": "length 303 > 10"
},
{
"step": "Schema discovery",
"check": "send: describe the kunden table",
"status": "PASS",
"detail": "response: The `kunden` table stores customer information, including names, customer number"
},
{
"step": "Schema discovery",
"check": "trace: has tool_call",
"status": "PASS",
"detail": "found event 'tool_call'"
},
{
"step": "Schema discovery",
"check": "response: length > 10",
"status": "PASS",
"detail": "length 391 > 10"
},
{
"step": "Count query (cached schema)",
"check": "send: how many customers are there?",
"status": "PASS",
"detail": "response: There are 693 customers in the database.\n"
},
{
"step": "Count query (cached schema)",
"check": "trace: has tool_call",
"status": "PASS",
"detail": "found event 'tool_call'"
},
{
"step": "Count query (cached schema)",
"check": "response: length > 5",
"status": "PASS",
"detail": "length 41 > 5"
},
{
"step": "Complex query",
"check": "send: which customers have the most devices?",
"status": "PASS",
"detail": "response: The query results list the top 10 customers with the most devices. Anne Bürger h"
},
{
"step": "Complex query",
"check": "trace: has tool_call",
"status": "PASS",
"detail": "found event 'tool_call'"
},
{
"step": "Complex query",
"check": "response: length > 20",
"status": "PASS",
"detail": "length 166 > 20"
},
{
"step": "Error recovery",
"check": "send: SELECT * FROM nichtexistiert",
"status": "PASS",
"detail": "response: Oops! It seems like the table `nichtexistiert` doesn't exist in the database. 🤔 "
},
{
"step": "Error recovery",
"check": "trace: has tool_call",
"status": "PASS",
"detail": "found event 'tool_call'"
},
{
"step": "Error recovery",
"check": "response: length > 10",
"status": "PASS",
"detail": "length 396 > 10"
},
{
"step": "Memorizer",
"check": "send: my name is Nico",
"status": "PASS",
"detail": "response: Thanks, Nico! I'll remember that. Do you have any other questions? 😊\n"
},
{
"step": "Memorizer",
"check": "state: facts any contains \"Nico\"",
"status": "PASS",
"detail": "found 'Nico' in facts"
},
{
"step": "Language switch",
"check": "send: Hallo wie gehts?",
"status": "PASS",
"detail": "response: Mir geht es gut, danke der Nachfrage, Nico! Und selbst? Gibt es etwas, bei dem i"
},
{
"step": "Language switch",
"check": "state: language is \"de\" or \"mixed\"",
"status": "PASS",
"detail": "language=mixed"
},
{
"step": "Bye",
"check": "send: ok bye",
"status": "PASS",
"detail": "response: Bye Nico! 👋 If you need anything else, just let me know. 😊\n"
},
{
"step": "Bye",
"check": "response: length > 2",
"status": "PASS",
"detail": "length 59 > 2"
}
],
"Dashboard Integration": [
{
"step": "Setup",
"check": "clear",
"status": "PASS",
"detail": "cleared"
},
{
"step": "Expert creates buttons",
"check": "send: create two buttons on my dashboard: Repo",
"status": "PASS",
"detail": "response: I have added 'Report' and 'Export' buttons to your dashboard.\n\n(UI buttons shown"
},
{
"step": "Expert creates buttons",
"check": "actions: length >= 2",
"status": "PASS",
"detail": "3 actions >= 2"
},
{
"step": "Expert creates buttons",
"check": "actions: any action contains \"report\" or \"Report\"",
"status": "PASS",
"detail": "found 'report' in actions"
},
{
"step": "Buttons survive a query",
"check": "send: how many customers are there?",
"status": "PASS",
"detail": "response: I'm running a query to count all customer IDs. One moment...\n"
},
{
"step": "Buttons survive a query",
"check": "response: length > 5",
"status": "PASS",
"detail": "length 61 > 5"
},
{
"step": "Buttons survive a query",
"check": "actions: any action contains \"report\" or \"Report\"",
"status": "PASS",
"detail": "found 'report' in actions"
},
{
"step": "Expert creates a machine",
"check": "send: create a navigation machine called \"work",
"status": "PASS",
"detail": "response: I've created the 'workflow' machine with 'start' and 'step2' states. The 'start'"
},
{
"step": "Expert creates a machine",
"check": "trace: has tool_call create_machine",
"status": "PASS",
"detail": "found create_machine via machine_created event"
},
{
"step": "Expert shows data table",
"check": "send: show me 5 customers in a table",
"status": "PASS",
"detail": "response: Here are five customer entries with their IDs, names, object count, and status:\n"
},
{
"step": "Expert shows data table",
"check": "trace: has tool_call",
"status": "PASS",
"detail": "found event 'tool_call'"
},
{
"step": "Expert shows data table",
"check": "response: length > 10",
"status": "PASS",
"detail": "length 118 > 10"
},
{
"step": "Expert replaces buttons",
"check": "send: remove all buttons and create one button",
"status": "PASS",
"detail": "response: I have removed the existing 'Report' and 'Export' buttons from the dashboard and"
},
{
"step": "Expert replaces buttons",
"check": "actions: length >= 1",
"status": "PASS",
"detail": "2 actions >= 1"
},
{
"step": "Expert replaces buttons",
"check": "actions: any action contains \"reset\" or \"Reset\"",
"status": "PASS",
"detail": "found 'reset' in actions"
}
]
},
"summary": {
"passed": 58,
"failed": 0
}
}