- Memorizer tracks user_expectation (conversational/delegated/waiting_input/observing) - Output node adjusts phrasing per expectation - PA retry loop: reformulates job on expert failure (all retries exhausted or tool skip) - Machine state in PA context: get_machine_summary includes current state, buttons, stored data - Expert writes to machine state via update_machine + transition_machine - Expanded baked schema coverage - Awareness panel shows color-coded expectation state - Dashboard and workspace component updates Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
363 lines
11 KiB
JSON
363 lines
11 KiB
JSON
{
|
|
"timestamp": "2026-03-30 00:02:55",
|
|
"testcases": {
|
|
"Artifact System": [
|
|
{
|
|
"step": "Setup",
|
|
"check": "clear",
|
|
"status": "PASS",
|
|
"detail": "cleared"
|
|
},
|
|
{
|
|
"step": "Query produces data_table artifact",
|
|
"check": "send: show me 3 customers in a table",
|
|
"status": "PASS",
|
|
"detail": "response: The database contains information for three customers: Kathrin Jager, Leon Schre"
|
|
},
|
|
{
|
|
"step": "Query produces data_table artifact",
|
|
"check": "trace: has tool_call",
|
|
"status": "PASS",
|
|
"detail": "found event 'tool_call'"
|
|
},
|
|
{
|
|
"step": "Query produces data_table artifact",
|
|
"check": "response: length > 10",
|
|
"status": "PASS",
|
|
"detail": "length 138 > 10"
|
|
},
|
|
{
|
|
"step": "Entity detail via card",
|
|
"check": "send: show me details for customer 1",
|
|
"status": "PASS",
|
|
"detail": "response: ```tool_code\nquery_db({\"query\":\"SELECT * FROM customers WHERE customer_id = 1\"})"
|
|
},
|
|
{
|
|
"step": "Entity detail via card",
|
|
"check": "trace: has tool_call",
|
|
"status": "PASS",
|
|
"detail": "found event 'tool_call'"
|
|
},
|
|
{
|
|
"step": "Entity detail via card",
|
|
"check": "response: length > 10",
|
|
"status": "PASS",
|
|
"detail": "length 84 > 10"
|
|
},
|
|
{
|
|
"step": "Action bar via buttons",
|
|
"check": "send: create two buttons on my dashboard: Refr",
|
|
"status": "PASS",
|
|
"detail": "response: I have added the 'Refresh' and 'Export' buttons to your dashboard. These buttons"
|
|
},
|
|
{
|
|
"step": "Action bar via buttons",
|
|
"check": "actions: length >= 2",
|
|
"status": "PASS",
|
|
"detail": "2 actions >= 2"
|
|
},
|
|
{
|
|
"step": "Action bar via buttons",
|
|
"check": "actions: any action contains \"refresh\" or \"Refresh\"",
|
|
"status": "PASS",
|
|
"detail": "found 'refresh' in actions"
|
|
},
|
|
{
|
|
"step": "Machine artifact",
|
|
"check": "send: create a machine called \"flow\" with init",
|
|
"status": "PASS",
|
|
"detail": "response: OK, I've created a new interactive machine called 'flow' with the initial state "
|
|
},
|
|
{
|
|
"step": "Machine artifact",
|
|
"check": "trace: has machine_created",
|
|
"status": "PASS",
|
|
"detail": "found event 'machine_created'"
|
|
},
|
|
{
|
|
"step": "Query after buttons survive",
|
|
"check": "send: how many customers are there?",
|
|
"status": "PASS",
|
|
"detail": "response: There are 693 customers in the database.\n"
|
|
},
|
|
{
|
|
"step": "Query after buttons survive",
|
|
"check": "response: length > 5",
|
|
"status": "PASS",
|
|
"detail": "length 41 > 5"
|
|
},
|
|
{
|
|
"step": "Query after buttons survive",
|
|
"check": "actions: any action contains \"refresh\" or \"Refresh\"",
|
|
"status": "PASS",
|
|
"detail": "found 'refresh' in actions"
|
|
}
|
|
],
|
|
"Fast v4": [
|
|
{
|
|
"step": "Setup",
|
|
"check": "clear",
|
|
"status": "PASS",
|
|
"detail": "cleared"
|
|
},
|
|
{
|
|
"step": "Reflex",
|
|
"check": "send: hi!",
|
|
"status": "PASS",
|
|
"detail": "response: Hey Nico! 👋 How can I help you today?\n"
|
|
},
|
|
{
|
|
"step": "Reflex",
|
|
"check": "response: length > 2",
|
|
"status": "PASS",
|
|
"detail": "length 38 > 2"
|
|
},
|
|
{
|
|
"step": "PA routes to expert",
|
|
"check": "send: show me 3 customers",
|
|
"status": "PASS",
|
|
"detail": "response: Alright, I've fetched 3 customer records for you. You can see the ID, Name detai"
|
|
},
|
|
{
|
|
"step": "PA routes to expert",
|
|
"check": "trace: has routed",
|
|
"status": "PASS",
|
|
"detail": "found event 'routed'"
|
|
},
|
|
{
|
|
"step": "PA routes to expert",
|
|
"check": "trace: has tool_call",
|
|
"status": "PASS",
|
|
"detail": "found event 'tool_call'"
|
|
},
|
|
{
|
|
"step": "PA routes to expert",
|
|
"check": "response: length > 10",
|
|
"status": "PASS",
|
|
"detail": "length 181 > 10"
|
|
},
|
|
{
|
|
"step": "German query",
|
|
"check": "send: Zeig mir alle Tabellen in der Datenbank",
|
|
"status": "PASS",
|
|
"detail": "response: Okay, ich habe eine Liste aller Tabellen in der \"eras2_production\" Datenbank abg"
|
|
},
|
|
{
|
|
"step": "German query",
|
|
"check": "trace: has tool_call",
|
|
"status": "PASS",
|
|
"detail": "found event 'tool_call'"
|
|
},
|
|
{
|
|
"step": "German query",
|
|
"check": "response: length > 10",
|
|
"status": "PASS",
|
|
"detail": "length 303 > 10"
|
|
},
|
|
{
|
|
"step": "Schema discovery",
|
|
"check": "send: describe the kunden table",
|
|
"status": "PASS",
|
|
"detail": "response: The `kunden` table stores customer information, including names, customer number"
|
|
},
|
|
{
|
|
"step": "Schema discovery",
|
|
"check": "trace: has tool_call",
|
|
"status": "PASS",
|
|
"detail": "found event 'tool_call'"
|
|
},
|
|
{
|
|
"step": "Schema discovery",
|
|
"check": "response: length > 10",
|
|
"status": "PASS",
|
|
"detail": "length 391 > 10"
|
|
},
|
|
{
|
|
"step": "Count query (cached schema)",
|
|
"check": "send: how many customers are there?",
|
|
"status": "PASS",
|
|
"detail": "response: There are 693 customers in the database.\n"
|
|
},
|
|
{
|
|
"step": "Count query (cached schema)",
|
|
"check": "trace: has tool_call",
|
|
"status": "PASS",
|
|
"detail": "found event 'tool_call'"
|
|
},
|
|
{
|
|
"step": "Count query (cached schema)",
|
|
"check": "response: length > 5",
|
|
"status": "PASS",
|
|
"detail": "length 41 > 5"
|
|
},
|
|
{
|
|
"step": "Complex query",
|
|
"check": "send: which customers have the most devices?",
|
|
"status": "PASS",
|
|
"detail": "response: The query results list the top 10 customers with the most devices. Anne Bürger h"
|
|
},
|
|
{
|
|
"step": "Complex query",
|
|
"check": "trace: has tool_call",
|
|
"status": "PASS",
|
|
"detail": "found event 'tool_call'"
|
|
},
|
|
{
|
|
"step": "Complex query",
|
|
"check": "response: length > 20",
|
|
"status": "PASS",
|
|
"detail": "length 166 > 20"
|
|
},
|
|
{
|
|
"step": "Error recovery",
|
|
"check": "send: SELECT * FROM nichtexistiert",
|
|
"status": "PASS",
|
|
"detail": "response: Oops! It seems like the table `nichtexistiert` doesn't exist in the database. 🤔 "
|
|
},
|
|
{
|
|
"step": "Error recovery",
|
|
"check": "trace: has tool_call",
|
|
"status": "PASS",
|
|
"detail": "found event 'tool_call'"
|
|
},
|
|
{
|
|
"step": "Error recovery",
|
|
"check": "response: length > 10",
|
|
"status": "PASS",
|
|
"detail": "length 396 > 10"
|
|
},
|
|
{
|
|
"step": "Memorizer",
|
|
"check": "send: my name is Nico",
|
|
"status": "PASS",
|
|
"detail": "response: Thanks, Nico! I'll remember that. Do you have any other questions? 😊\n"
|
|
},
|
|
{
|
|
"step": "Memorizer",
|
|
"check": "state: facts any contains \"Nico\"",
|
|
"status": "PASS",
|
|
"detail": "found 'Nico' in facts"
|
|
},
|
|
{
|
|
"step": "Language switch",
|
|
"check": "send: Hallo wie gehts?",
|
|
"status": "PASS",
|
|
"detail": "response: Mir geht es gut, danke der Nachfrage, Nico! Und selbst? Gibt es etwas, bei dem i"
|
|
},
|
|
{
|
|
"step": "Language switch",
|
|
"check": "state: language is \"de\" or \"mixed\"",
|
|
"status": "PASS",
|
|
"detail": "language=mixed"
|
|
},
|
|
{
|
|
"step": "Bye",
|
|
"check": "send: ok bye",
|
|
"status": "PASS",
|
|
"detail": "response: Bye Nico! 👋 If you need anything else, just let me know. 😊\n"
|
|
},
|
|
{
|
|
"step": "Bye",
|
|
"check": "response: length > 2",
|
|
"status": "PASS",
|
|
"detail": "length 59 > 2"
|
|
}
|
|
],
|
|
"Dashboard Integration": [
|
|
{
|
|
"step": "Setup",
|
|
"check": "clear",
|
|
"status": "PASS",
|
|
"detail": "cleared"
|
|
},
|
|
{
|
|
"step": "Expert creates buttons",
|
|
"check": "send: create two buttons on my dashboard: Repo",
|
|
"status": "PASS",
|
|
"detail": "response: I have added 'Report' and 'Export' buttons to your dashboard.\n\n(UI buttons shown"
|
|
},
|
|
{
|
|
"step": "Expert creates buttons",
|
|
"check": "actions: length >= 2",
|
|
"status": "PASS",
|
|
"detail": "3 actions >= 2"
|
|
},
|
|
{
|
|
"step": "Expert creates buttons",
|
|
"check": "actions: any action contains \"report\" or \"Report\"",
|
|
"status": "PASS",
|
|
"detail": "found 'report' in actions"
|
|
},
|
|
{
|
|
"step": "Buttons survive a query",
|
|
"check": "send: how many customers are there?",
|
|
"status": "PASS",
|
|
"detail": "response: I'm running a query to count all customer IDs. One moment...\n"
|
|
},
|
|
{
|
|
"step": "Buttons survive a query",
|
|
"check": "response: length > 5",
|
|
"status": "PASS",
|
|
"detail": "length 61 > 5"
|
|
},
|
|
{
|
|
"step": "Buttons survive a query",
|
|
"check": "actions: any action contains \"report\" or \"Report\"",
|
|
"status": "PASS",
|
|
"detail": "found 'report' in actions"
|
|
},
|
|
{
|
|
"step": "Expert creates a machine",
|
|
"check": "send: create a navigation machine called \"work",
|
|
"status": "PASS",
|
|
"detail": "response: I've created the 'workflow' machine with 'start' and 'step2' states. The 'start'"
|
|
},
|
|
{
|
|
"step": "Expert creates a machine",
|
|
"check": "trace: has tool_call create_machine",
|
|
"status": "PASS",
|
|
"detail": "found create_machine via machine_created event"
|
|
},
|
|
{
|
|
"step": "Expert shows data table",
|
|
"check": "send: show me 5 customers in a table",
|
|
"status": "PASS",
|
|
"detail": "response: Here are five customer entries with their IDs, names, object count, and status:\n"
|
|
},
|
|
{
|
|
"step": "Expert shows data table",
|
|
"check": "trace: has tool_call",
|
|
"status": "PASS",
|
|
"detail": "found event 'tool_call'"
|
|
},
|
|
{
|
|
"step": "Expert shows data table",
|
|
"check": "response: length > 10",
|
|
"status": "PASS",
|
|
"detail": "length 118 > 10"
|
|
},
|
|
{
|
|
"step": "Expert replaces buttons",
|
|
"check": "send: remove all buttons and create one button",
|
|
"status": "PASS",
|
|
"detail": "response: I have removed the existing 'Report' and 'Export' buttons from the dashboard and"
|
|
},
|
|
{
|
|
"step": "Expert replaces buttons",
|
|
"check": "actions: length >= 1",
|
|
"status": "PASS",
|
|
"detail": "2 actions >= 1"
|
|
},
|
|
{
|
|
"step": "Expert replaces buttons",
|
|
"check": "actions: any action contains \"reset\" or \"Reset\"",
|
|
"status": "PASS",
|
|
"detail": "found 'reset' in actions"
|
|
}
|
|
]
|
|
},
|
|
"summary": {
|
|
"passed": 58,
|
|
"failed": 0
|
|
}
|
|
} |