{ "timestamp": "2026-03-30 00:02:55", "testcases": { "Artifact System": [ { "step": "Setup", "check": "clear", "status": "PASS", "detail": "cleared" }, { "step": "Query produces data_table artifact", "check": "send: show me 3 customers in a table", "status": "PASS", "detail": "response: The database contains information for three customers: Kathrin Jager, Leon Schre" }, { "step": "Query produces data_table artifact", "check": "trace: has tool_call", "status": "PASS", "detail": "found event 'tool_call'" }, { "step": "Query produces data_table artifact", "check": "response: length > 10", "status": "PASS", "detail": "length 138 > 10" }, { "step": "Entity detail via card", "check": "send: show me details for customer 1", "status": "PASS", "detail": "response: ```tool_code\nquery_db({\"query\":\"SELECT * FROM customers WHERE customer_id = 1\"})" }, { "step": "Entity detail via card", "check": "trace: has tool_call", "status": "PASS", "detail": "found event 'tool_call'" }, { "step": "Entity detail via card", "check": "response: length > 10", "status": "PASS", "detail": "length 84 > 10" }, { "step": "Action bar via buttons", "check": "send: create two buttons on my dashboard: Refr", "status": "PASS", "detail": "response: I have added the 'Refresh' and 'Export' buttons to your dashboard. These buttons" }, { "step": "Action bar via buttons", "check": "actions: length >= 2", "status": "PASS", "detail": "2 actions >= 2" }, { "step": "Action bar via buttons", "check": "actions: any action contains \"refresh\" or \"Refresh\"", "status": "PASS", "detail": "found 'refresh' in actions" }, { "step": "Machine artifact", "check": "send: create a machine called \"flow\" with init", "status": "PASS", "detail": "response: OK, I've created a new interactive machine called 'flow' with the initial state " }, { "step": "Machine artifact", "check": "trace: has machine_created", "status": "PASS", "detail": "found event 'machine_created'" }, { "step": "Query after buttons survive", "check": "send: how many customers are there?", "status": "PASS", "detail": "response: There are 693 customers in the database.\n" }, { "step": "Query after buttons survive", "check": "response: length > 5", "status": "PASS", "detail": "length 41 > 5" }, { "step": "Query after buttons survive", "check": "actions: any action contains \"refresh\" or \"Refresh\"", "status": "PASS", "detail": "found 'refresh' in actions" } ], "Fast v4": [ { "step": "Setup", "check": "clear", "status": "PASS", "detail": "cleared" }, { "step": "Reflex", "check": "send: hi!", "status": "PASS", "detail": "response: Hey Nico! πŸ‘‹ How can I help you today?\n" }, { "step": "Reflex", "check": "response: length > 2", "status": "PASS", "detail": "length 38 > 2" }, { "step": "PA routes to expert", "check": "send: show me 3 customers", "status": "PASS", "detail": "response: Alright, I've fetched 3 customer records for you. You can see the ID, Name detai" }, { "step": "PA routes to expert", "check": "trace: has routed", "status": "PASS", "detail": "found event 'routed'" }, { "step": "PA routes to expert", "check": "trace: has tool_call", "status": "PASS", "detail": "found event 'tool_call'" }, { "step": "PA routes to expert", "check": "response: length > 10", "status": "PASS", "detail": "length 181 > 10" }, { "step": "German query", "check": "send: Zeig mir alle Tabellen in der Datenbank", "status": "PASS", "detail": "response: Okay, ich habe eine Liste aller Tabellen in der \"eras2_production\" Datenbank abg" }, { "step": "German query", "check": "trace: has tool_call", "status": "PASS", "detail": "found event 'tool_call'" }, { "step": "German query", "check": "response: length > 10", "status": "PASS", "detail": "length 303 > 10" }, { "step": "Schema discovery", "check": "send: describe the kunden table", "status": "PASS", "detail": "response: The `kunden` table stores customer information, including names, customer number" }, { "step": "Schema discovery", "check": "trace: has tool_call", "status": "PASS", "detail": "found event 'tool_call'" }, { "step": "Schema discovery", "check": "response: length > 10", "status": "PASS", "detail": "length 391 > 10" }, { "step": "Count query (cached schema)", "check": "send: how many customers are there?", "status": "PASS", "detail": "response: There are 693 customers in the database.\n" }, { "step": "Count query (cached schema)", "check": "trace: has tool_call", "status": "PASS", "detail": "found event 'tool_call'" }, { "step": "Count query (cached schema)", "check": "response: length > 5", "status": "PASS", "detail": "length 41 > 5" }, { "step": "Complex query", "check": "send: which customers have the most devices?", "status": "PASS", "detail": "response: The query results list the top 10 customers with the most devices. Anne BΓΌrger h" }, { "step": "Complex query", "check": "trace: has tool_call", "status": "PASS", "detail": "found event 'tool_call'" }, { "step": "Complex query", "check": "response: length > 20", "status": "PASS", "detail": "length 166 > 20" }, { "step": "Error recovery", "check": "send: SELECT * FROM nichtexistiert", "status": "PASS", "detail": "response: Oops! It seems like the table `nichtexistiert` doesn't exist in the database. πŸ€” " }, { "step": "Error recovery", "check": "trace: has tool_call", "status": "PASS", "detail": "found event 'tool_call'" }, { "step": "Error recovery", "check": "response: length > 10", "status": "PASS", "detail": "length 396 > 10" }, { "step": "Memorizer", "check": "send: my name is Nico", "status": "PASS", "detail": "response: Thanks, Nico! I'll remember that. Do you have any other questions? 😊\n" }, { "step": "Memorizer", "check": "state: facts any contains \"Nico\"", "status": "PASS", "detail": "found 'Nico' in facts" }, { "step": "Language switch", "check": "send: Hallo wie gehts?", "status": "PASS", "detail": "response: Mir geht es gut, danke der Nachfrage, Nico! Und selbst? Gibt es etwas, bei dem i" }, { "step": "Language switch", "check": "state: language is \"de\" or \"mixed\"", "status": "PASS", "detail": "language=mixed" }, { "step": "Bye", "check": "send: ok bye", "status": "PASS", "detail": "response: Bye Nico! πŸ‘‹ If you need anything else, just let me know. 😊\n" }, { "step": "Bye", "check": "response: length > 2", "status": "PASS", "detail": "length 59 > 2" } ], "Dashboard Integration": [ { "step": "Setup", "check": "clear", "status": "PASS", "detail": "cleared" }, { "step": "Expert creates buttons", "check": "send: create two buttons on my dashboard: Repo", "status": "PASS", "detail": "response: I have added 'Report' and 'Export' buttons to your dashboard.\n\n(UI buttons shown" }, { "step": "Expert creates buttons", "check": "actions: length >= 2", "status": "PASS", "detail": "3 actions >= 2" }, { "step": "Expert creates buttons", "check": "actions: any action contains \"report\" or \"Report\"", "status": "PASS", "detail": "found 'report' in actions" }, { "step": "Buttons survive a query", "check": "send: how many customers are there?", "status": "PASS", "detail": "response: I'm running a query to count all customer IDs. One moment...\n" }, { "step": "Buttons survive a query", "check": "response: length > 5", "status": "PASS", "detail": "length 61 > 5" }, { "step": "Buttons survive a query", "check": "actions: any action contains \"report\" or \"Report\"", "status": "PASS", "detail": "found 'report' in actions" }, { "step": "Expert creates a machine", "check": "send: create a navigation machine called \"work", "status": "PASS", "detail": "response: I've created the 'workflow' machine with 'start' and 'step2' states. The 'start'" }, { "step": "Expert creates a machine", "check": "trace: has tool_call create_machine", "status": "PASS", "detail": "found create_machine via machine_created event" }, { "step": "Expert shows data table", "check": "send: show me 5 customers in a table", "status": "PASS", "detail": "response: Here are five customer entries with their IDs, names, object count, and status:\n" }, { "step": "Expert shows data table", "check": "trace: has tool_call", "status": "PASS", "detail": "found event 'tool_call'" }, { "step": "Expert shows data table", "check": "response: length > 10", "status": "PASS", "detail": "length 118 > 10" }, { "step": "Expert replaces buttons", "check": "send: remove all buttons and create one button", "status": "PASS", "detail": "response: I have removed the existing 'Report' and 'Export' buttons from the dashboard and" }, { "step": "Expert replaces buttons", "check": "actions: length >= 1", "status": "PASS", "detail": "2 actions >= 1" }, { "step": "Expert replaces buttons", "check": "actions: any action contains \"reset\" or \"Reset\"", "status": "PASS", "detail": "found 'reset' in actions" } ] }, "summary": { "passed": 58, "failed": 0 } }