agent-runtime/testcases/results.json
Nico 3f8886cbd2 v0.10.4: stateful UI engine — TDD counter test green (36/36)
RED->GREEN->REFACTOR cycle:
- UI node has state store (key-value), action bindings (op/var), and
  local action handlers (inc/dec/set/toggle — no LLM round-trip)
- Thinker self-model: knows its environment, that ACTIONS create real
  buttons, that UI handles state locally. Emits var/op payload for
  stateful actions.
- Thinker's context includes UI state so it can report current values
- /api/clear resets UI state, bindings, and controls
- Test runner: action_match for fuzzy action names, persistent actions
  across steps, _stream_text restored
- Counter test: 16/16 passed (create, read, inc, inc, dec, verify)
- Pub test: 20/20 passed (conversation, language switch, tool use, mood)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-28 15:50:37 +01:00

229 lines
6.7 KiB
JSON

{
"timestamp": "2026-03-28 15:50:12",
"testcases": {
"Counter State": [
{
"step": "Setup",
"check": "clear",
"status": "PASS",
"detail": "cleared"
},
{
"step": "Create counter",
"check": "send: create a counter starting at 0 with incr",
"status": "PASS",
"detail": "response: Sure, here is a counter starting at 0. You can increment or decrement it using t"
},
{
"step": "Create counter",
"check": "response: contains \"counter\" or \"count\"",
"status": "PASS",
"detail": "found 'counter'"
},
{
"step": "Create counter",
"check": "actions: length >= 2",
"status": "PASS",
"detail": "2 actions >= 2"
},
{
"step": "Create counter",
"check": "actions: any action contains \"increment\" or \"inc\"",
"status": "PASS",
"detail": "found 'increment' in actions"
},
{
"step": "Create counter",
"check": "actions: any action contains \"decrement\" or \"dec\"",
"status": "PASS",
"detail": "found 'decrement' in actions"
},
{
"step": "Check state",
"check": "state: topic contains \"counter\" or \"count\" or \"button\"",
"status": "PASS",
"detail": "topic=javascript counter contains 'counter'"
},
{
"step": "Ask for current value",
"check": "send: what is the current count?",
"status": "PASS",
"detail": "response: The current count is 0.\n"
},
{
"step": "Ask for current value",
"check": "response: contains \"0\"",
"status": "PASS",
"detail": "found '0'"
},
{
"step": "Increment",
"check": "action: increment",
"status": "PASS",
"detail": "response: count is now 1"
},
{
"step": "Increment",
"check": "response: contains \"1\"",
"status": "PASS",
"detail": "found '1'"
},
{
"step": "Increment again",
"check": "action: increment",
"status": "PASS",
"detail": "response: count is now 2"
},
{
"step": "Increment again",
"check": "response: contains \"2\"",
"status": "PASS",
"detail": "found '2'"
},
{
"step": "Decrement",
"check": "action: decrement",
"status": "PASS",
"detail": "response: count is now 1"
},
{
"step": "Decrement",
"check": "response: contains \"1\"",
"status": "PASS",
"detail": "found '1'"
},
{
"step": "Verify memorizer tracks it",
"check": "state: topic contains \"count\"",
"status": "PASS",
"detail": "topic=javascript counter contains 'count'"
}
],
"Pub Conversation": [
{
"step": "Setup",
"check": "clear",
"status": "PASS",
"detail": "cleared"
},
{
"step": "Set the scene",
"check": "send: Hey, Tina and I are heading to the pub t",
"status": "PASS",
"detail": "response: Sounds fun! Enjoy your night at the pub with Tina! What are your plans for the e"
},
{
"step": "Set the scene",
"check": "response: length > 10",
"status": "PASS",
"detail": "length 88 > 10"
},
{
"step": "Set the scene",
"check": "state: situation contains \"pub\" or \"Tina\"",
"status": "PASS",
"detail": "situation=at a pub with Tina contains 'pub'"
},
{
"step": "Language switch to German",
"check": "send: Wir sind jetzt im Biergarten angekommen",
"status": "PASS",
"detail": "response: Super! Habt eine schöne Zeit im Biergarten!\n"
},
{
"step": "Language switch to German",
"check": "response: length > 10",
"status": "PASS",
"detail": "length 44 > 10"
},
{
"step": "Language switch to German",
"check": "state: language is \"de\" or \"mixed\"",
"status": "PASS",
"detail": "language=mixed"
},
{
"step": "Context awareness",
"check": "send: Was sollen wir bestellen?",
"status": "PASS",
"detail": "response: Hmm, bei dem schönen Wetter würde doch ein kühles Bier oder eine erfrischende Sc"
},
{
"step": "Context awareness",
"check": "response: length > 10",
"status": "PASS",
"detail": "length 121 > 10"
},
{
"step": "Context awareness",
"check": "state: topic contains \"bestell\" or \"order\" or \"pub\" or \"Biergarten\"",
"status": "PASS",
"detail": "topic=being at the Biergarten contains 'Biergarten'"
},
{
"step": "Tina speaks",
"check": "send: Tina says: I'll have a Hefeweizen please",
"status": "PASS",
"detail": "response: Tina möchte also ein Hefeweizen. Was möchtest du bestellen, Nico?\n"
},
{
"step": "Tina speaks",
"check": "response: length > 10",
"status": "PASS",
"detail": "length 66 > 10"
},
{
"step": "Tina speaks",
"check": "state: facts any contains \"Tina\" or \"Hefeweizen\"",
"status": "PASS",
"detail": "found 'Tina' in facts"
},
{
"step": "Ask for time (tool use)",
"check": "send: wie spaet ist es eigentlich?",
"status": "PASS",
"detail": "response: Es ist 15:49 Uhr.\n"
},
{
"step": "Ask for time (tool use)",
"check": "response: matches \\d{1,2}:\\d{2}",
"status": "PASS",
"detail": "matched /\\d{1,2}:\\d{2}/"
},
{
"step": "Back to English",
"check": "send: Let's switch to English, what was the la",
"status": "PASS",
"detail": "response: Tina said she wants a Hefeweizen.\n"
},
{
"step": "Back to English",
"check": "state: language is \"en\" or \"mixed\"",
"status": "PASS",
"detail": "language=mixed"
},
{
"step": "Back to English",
"check": "response: contains \"Tina\" or \"Hefeweizen\"",
"status": "PASS",
"detail": "found 'Tina'"
},
{
"step": "Mood check",
"check": "send: This is really fun!",
"status": "PASS",
"detail": "response: I'm glad you're enjoying our conversation, Nico! It's fun for me too. What other"
},
{
"step": "Mood check",
"check": "state: user_mood is \"happy\" or \"playful\" or \"excited\"",
"status": "PASS",
"detail": "user_mood=happy"
}
]
},
"summary": {
"passed": 36,
"failed": 0
}
}