diff --git a/agent/api.py b/agent/api.py
index 6daa686..6dc8b54 100644
--- a/agent/api.py
+++ b/agent/api.py
@@ -155,7 +155,16 @@ def register_routes(app):
                 rt = _active_runtime or runtime
                 try:
                     if msg.get("type") == "action":
-                        await rt.handle_action(msg.get("action", "unknown"), msg.get("data"))
+                        action = msg.get("action", "unknown")
+                        data_payload = msg.get("data")
+                        if hasattr(rt, 'use_frames') and rt.use_frames:
+                            # Frame engine handles actions as ACTION: prefix messages
+                            action_text = f"ACTION:{action}"
+                            if data_payload:
+                                action_text += f"|data:{json.dumps(data_payload)}"
+                            await rt.handle_message(action_text)
+                        else:
+                            await rt.handle_action(action, data_payload)
                     elif msg.get("type") == "cancel_process":
                         rt.process_manager.cancel(msg.get("pid", 0))
                     else:
@@ -355,6 +364,7 @@ def register_routes(app):
             "language": "en",
             "style_hint": "casual, technical",
             "facts": [],
+            "user_expectation": "conversational",
         }
         _pipeline_result = {"status": "idle", "id": "", "stage": "cleared"}
         # Notify frontend via WS
diff --git a/agent/frame_engine.py b/agent/frame_engine.py
index 90cf6b0..8cf2c46 100644
--- a/agent/frame_engine.py
+++ b/agent/frame_engine.py
@@ -302,9 +302,59 @@ class FrameEngine:
             expert.send_hud = original_hud
 
         thought_summary = (f"response[{len(thought.response)}] tool={thought.tool_used or 'none'} "
-                           f"actions={len(thought.actions)}")
+                           f"actions={len(thought.actions)} errors={len(thought.errors)}")
         has_tool = bool(thought.tool_used and thought.tool_output)
 
+        # PA retry: if expert failed OR skipped tools when data was needed
+        expectation = self.memorizer.state.get("user_expectation", "conversational")
+        # Detect hallucination: expert returned no tool output for a data job
+        job_needs_data = any(k in (routing.job or "").lower()
+                             for k in ["query", "select", "tabelle", "table", "daten", "data",
+                                       "cost", "kosten", "count", "anzahl", "average", "schnitt",
+                                       "find", "finde", "show", "zeig", "list", "beschreib"])
+        expert_skipped_tools = not has_tool and not thought.errors and job_needs_data
+        if (thought.errors or expert_skipped_tools) and not has_tool and expectation in ("delegated", "waiting_input", "conversational"):
+            retry_reason = f"{len(thought.errors)} errors" if thought.errors else "no tool calls for data job"
+            self._end_frame(rec, output_summary=thought_summary,
+                            route="pa_retry", condition=f"expert_failed ({retry_reason}), expectation={expectation}")
+            await self._send_hud({"node": "runtime", "event": "pa_retry",
+                                  "detail": f"expert failed: {retry_reason}, retrying via PA"})
+
+            # Stream retry notice to user
+            retry_msg = "Anderer Ansatz..." if routing.language == "de" else "Trying a different approach..."
+            await self.sink.send_delta(retry_msg + "\n")
+
+            # PA reformulates with error context
+            retry_errors = thought.errors if thought.errors else [
+                {"query": "(none)", "error": "Expert produced no database queries. The job requires data lookup but the expert answered without querying. Reformulate with explicit query instructions."}
+            ]
+            error_summary = "; ".join(e.get("error", "")[:80] for e in retry_errors[-2:])
+            rec = self._begin_frame(self.frame + 1, "pa_retry",
+                                    input_summary=f"errors: {error_summary[:100]}")
+            routing2 = await self.nodes["pa"].route_retry(
+                command, self.history, memory_context=mem_ctx,
+                identity=self.identity, channel=self.channel,
+                original_job=routing.job, errors=retry_errors)
+            self._end_frame(rec, output_summary=f"retry_job: {(routing2.job or '')[:60]}",
+                            route=f"expert_{routing2.expert}" if routing2.expert != "none" else "output")
+
+            if routing2.expert != "none":
+                expert2 = self._experts.get(routing2.expert, expert)
+                rec = self._begin_frame(self.frame + 1, f"expert_{routing2.expert}_retry",
+                                        input_summary=f"retry job: {(routing2.job or '')[:80]}")
+                original_hud2 = expert2.send_hud
+                expert2.send_hud = self._make_progress_wrapper(original_hud2, routing2.language)
+                try:
+                    thought = await expert2.execute(routing2.job, routing2.language)
+                finally:
+                    expert2.send_hud = original_hud2
+                thought_summary = (f"response[{len(thought.response)}] tool={thought.tool_used or 'none'} "
+                                   f"errors={len(thought.errors)}")
+                has_tool = bool(thought.tool_used and thought.tool_output)
+                self._end_frame(rec, output_summary=thought_summary,
+                                route="interpreter" if has_tool else "output+ui")
+                routing = routing2  # use retry routing for rest of pipeline
+
         # Interpreter (conditional)
         if self.has_interpreter and has_tool:
             self._end_frame(rec, output_summary=thought_summary,
@@ -607,6 +657,10 @@ class FrameEngine:
         response, controls = await asyncio.gather(output_task, ui_task)
         if controls:
             await self.sink.send_controls(controls)
+        # Send artifacts (new system) alongside controls
+        artifacts = self.ui_node.get_artifacts()
+        if artifacts:
+            await self.sink.send_artifacts(artifacts)
         return response
 
     def _check_condition(self, name: str, command: Command = None,
@@ -624,6 +678,7 @@ class FrameEngine:
         return {
             "response": response,
             "controls": self.ui_node.current_controls,
+            "artifacts": self.ui_node.get_artifacts(),
             "memorizer": self.memorizer.state,
             "frames": self.frame,
             "trace": self.last_trace.to_dict(),
diff --git a/agent/nodes/eras_expert.py b/agent/nodes/eras_expert.py
index 5fe49ec..d35e661 100644
--- a/agent/nodes/eras_expert.py
+++ b/agent/nodes/eras_expert.py
@@ -118,6 +118,89 @@ IstRekonstruiert (bool), Herkunft (int)
 ManuellerWert (double), Rohablesung (double)
 Anmerkung, Fehler, Ampullenfarbe (longtext)
 
+=== auftraege (2960 rows) — billing work orders ===
+PK: ID (int)
+AuftragNummer, Bezeichnung (longtext)
+ErstellDatum, Abgeschlossen (datetime)
+ZugeordneteAbrechnungsinformationID (FK → abrechnungsinformationen.ID)
+ErstellMitarbeiterID (FK), AuftragsTyp (int), Status (int)
+Anmerkung, ObererText, UntererText (longtext)
+
+=== auftragspositionen (5094 rows) — line items per work order ===
+PK: ID (int)
+AuftragID (FK → auftraege.ID)
+ArtikelID (FK → artikel.ID)
+SollMenge, IstMenge (int)
+ZugeordneterGeraeteArtikelID (FK), ZugeordneteVertragPositionID (FK)
+
+=== artikelposition (70164 rows) — billing line items with prices ===
+PK: ID (int)
+ZugewiesenerArtikelID (FK → artikel.ID)
+ZugewieseneAbrechnungID (FK → abrechnungsinformationen.ID)
+RechnungID (FK → rechnung.ID)
+MengeVorgabe, Menge (decimal), NettoVorgabe, Netto (decimal), MWST (decimal)
+Rechnungsart (int), VorschussBerechnung (bool), ARechnung (bool)
+VerstecktInNebenkostenID (FK), ZugeordneteVertragPositionID (FK)
+
+=== artikel (1078 rows) — service/product catalog ===
+PK: ID (int)
+Artikelnummer, Bezeichnung (longtext)
+Netto (decimal), MWST (decimal)
+BerechnungsZiel (int), UmlageIn (int)
+ZugeordnetePreislisteID (FK)
+IstStandard, ARechnung, AppZusatz, IstEigenKostenpos (bool)
+
+=== rechnung (7356 rows) — invoices ===
+PK: ID (int)
+Rechnungsnummer (longtext), Rechnungsart (int)
+BezahltAm (datetime), BezahlterBetrag (decimal)
+Druckdatum, Erstelldatum, Exportdatum (datetime)
+AbrechnungsinformationID (FK → abrechnungsinformationen.ID)
+AbschlagSummeSonder, AbschlagSummeStandard (decimal)
+Bankeinzug (bool)
+
+=== abrechnungsinformationen (4261 rows) — billing periods/settings ===
+PK: ID (int)
+Von, Bis (datetime) — billing period
+AbrechnungHeizung, AbrechnungWarmwasser, AbrechnungNebenkosten, AbrechnungKaltwasser (bool)
+Tarifabrechnung, BHKW, HeizsaldoInNebenkosten, AbrechnungLegionellen, AbrechnungRauchmelder (bool)
+
+=== nebenkosten (42209 rows) — ancillary cost items ===
+PK: ID (int)
+Von, Bis (datetime)
+Bezeichnung (longtext), Mwst (decimal), Brutto (decimal)
+EinheitDerKostenart (longtext), Umlage (int), UmlageZiel (int)
+ZugeordnetesObjektID (FK → objekte.ID)
+NurEigentuemer, NurNutzer (bool)
+
+=== vorauszahlungen (83932 rows) — advance payments per tenant ===
+PK: ID (int)
+ZugeordneterNutzerID (FK → nutzer.ID)
+BetragNebenkosten, BetragHeizkosten, BetragWarmwasser (decimal)
+Von, Bis (datetime), IstNetto (bool)
+
+=== heizbetriebskosten (22557 rows) — heating operation costs ===
+PK: ID (int)
+Von, Bis (datetime), Bezeichnung (longtext)
+Mwst (decimal), Brutto (decimal), Art (int)
+ZugeordnetesObjektID (FK → objekte.ID)
+ZugeordneteVerbrauchsgruppeID (FK)
+
+=== brennstofflieferungen (6477 rows) — fuel deliveries ===
+PK: ID (int)
+GeliefertAm (datetime), Menge (decimal), Betrag (decimal)
+Mwst (decimal), Heizwert (decimal)
+Anfangsstand, Endstand (decimal)
+ZugeordneterEnergieVerwerterID (FK), BrennstoffMediumID (FK)
+ZugeordneteAbrechnungsinformationID (FK → abrechnungsinformationen.ID)
+
+=== vertragpositionen (4395 rows) — contract line items ===
+PK: ID (int)
+LaufzeitVon, LaufzeitBis (datetime)
+Menge (decimal), Gesamtpreis (decimal), PreisProEinheit (decimal), Mwst (decimal)
+ArtikelID (FK → artikel.ID), VertragNummer (longtext)
+Art (int), Umlage (int)
+
 JOIN PATTERNS (use exactly):
 Kunde → Objekte:     JOIN objektkunde ok ON ok.KundeID = k.ID JOIN objekte o ON o.ID = ok.ObjektID
 Objekt → Adresse:    JOIN objektadressen oa ON oa.ObjektID = o.ID JOIN adressen a ON a.ID = oa.AdresseID
@@ -126,6 +209,13 @@ Objekt → NE:         JOIN nutzeinheit ne ON ne.ObjektID = o.ID
 NE → Nutzer:         JOIN nutzer nu ON nu.NutzeinheitID = ne.ID
 NE → Geraete:        JOIN geraete g ON g.NutzeinheitID = ne.ID
 Geraet → Verbrauch:  JOIN geraeteverbraeuche gv ON gv.GeraetID = g.ID
+Auftrag → Positionen: JOIN auftragspositionen ap ON ap.AuftragID = a.ID
+Auftrag → Abrechnung: JOIN abrechnungsinformationen ai ON ai.ID = a.ZugeordneteAbrechnungsinformationID
+Artikelpos → Artikel: JOIN artikel art ON art.ID = ap.ZugewiesenerArtikelID
+Artikelpos → Rechnung: JOIN rechnung r ON r.ID = ap.RechnungID
+Artikelpos → Abrechnung: JOIN abrechnungsinformationen ai ON ai.ID = ap.ZugewieseneAbrechnungID
+Nebenkosten → Objekt: JOIN objekte o ON o.ID = nk.ZugeordnetesObjektID
+Vorauszahlung → Nutzer: JOIN nutzer nu ON nu.ID = vz.ZugeordneterNutzerID
 
 RULES:
 - For tables listed above: use ONLY the listed column names. Never guess.
diff --git a/agent/nodes/expert_base.py b/agent/nodes/expert_base.py
index 36ef7cd..9b5bcdc 100644
--- a/agent/nodes/expert_base.py
+++ b/agent/nodes/expert_base.py
@@ -38,28 +38,38 @@ Given a job description, produce a JSON tool sequence to accomplish it.
 
 Available tools:
 - query_db(query, database) — SQL SELECT/DESCRIBE/SHOW only
-- emit_actions(actions) — show buttons [{{label, action, payload?}}]
+- emit_actions(actions) — show buttons [{label, action, payload?}]
 - set_state(key, value) — persistent key-value
 - create_machine(id, initial, states) — interactive UI navigation
 - add_state / reset_machine / destroy_machine — machine lifecycle
+- update_machine(id, data) — update wizard data fields (e.g. {"bundesland": "Bayern"})
+- transition_machine(id, target) — move machine to a specific state
+- emit_artifact(type, data, actions?, meta?) — emit a typed workspace artifact:
+  type="entity_detail": data={title, subtitle?, fields:[{label,value}]}, actions=[{label,action}]
+  type="data_table": data={title?, columns:[str], rows:[{col:val}]}
+  type="document_page": data={title, sections:[{heading,content}]}
+  type="action_bar": actions=[{label, action, payload?}]
+  type="status": data={label, value?, display_type:"progress"|"info"|"text"}
 
-NOTE: Cards are generated automatically in the response step from query results.
-Do NOT plan emit_card or emit_list — just query the data and the system handles display.
+PREFERRED: Use emit_artifact for all display output. Legacy emit_card/emit_display still work but emit_artifact is cleaner.
+Cards are also generated automatically in the response step from query results.
 
 Output ONLY valid JSON:
-{{
+{
   "tool_sequence": [
-    {{"tool": "query_db", "args": {{"query": "SELECT ...", "database": "{database}"}}}}
+    {"tool": "query_db", "args": {"query": "SELECT ...", "database": "{database}"}}
   ],
   "response_hint": "How to phrase the result"
-}}
+}
 
 Rules:
 - NEVER guess column names. Use ONLY columns from the schema.
 - Max 5 tools. Keep it focused.
 - For entity details: query all relevant fields, the response step creates the card.
 - For lists: query multiple rows, the table renders automatically.
-- The job is self-contained."""
+- The job is self-contained.
+- NEVER answer data questions without querying the database. You MUST include at least one query_db call for any job that asks about data, counts, costs, or entities. If you are unsure which tables to use, start with DESCRIBE or SELECT * FROM table LIMIT 3 to explore.
+- An EMPTY tool_sequence is ONLY acceptable if the job explicitly asks for a UI-only action (buttons, machine, display) with no data lookup."""
 
     RESPONSE_SYSTEM = """You are a domain expert summarizing results for the user.
 
@@ -70,22 +80,22 @@ Job: {job}
 
 Output a JSON object with "text" (response to user) and optionally "card" (structured display):
 
-{{
+{
   "text": "Concise natural response, 1-3 sentences. Reference data. Match language: {language}.",
-  "card": {{
+  "card": {
     "title": "Entity Name or ID",
     "subtitle": "Type or category",
-    "fields": [{{"label": "Field", "value": "actual value from results"}}],
-    "actions": [{{"label": "Next action", "action": "action_id"}}]
-  }}
-}}
+    "fields": [{"label": "Field", "value": "actual value from results"}],
+    "actions": [{"label": "Next action", "action": "action_id"}]
+  }
+}
 
 Rules:
 - "text" is REQUIRED. Keep it short.
 - "card" is OPTIONAL. Include it for single-entity details (Kunde, Objekt, Auftrag).
 - Card fields must use ACTUAL values from the query results, never templates/placeholders.
 - For lists of multiple entities, use multiple fields or skip the card.
-- If no card makes sense, just return {{"text": "..."}}.
+- If no card makes sense, just return {"text": "..."}.
 - Output ONLY valid JSON."""
 
     def __init__(self, send_hud, process_manager=None):
@@ -113,10 +123,12 @@ Rules:
                         plan_prompt += f"  DESCRIBE result: {err['describe'][:300]}\n"
                 plan_prompt += "\nFix the query. If a column was unknown, use the DESCRIBE result above or try SELECT * LIMIT 3 to see actual columns."
 
+            plan_system = self.PLAN_SYSTEM
+            plan_system = plan_system.replace("{domain}", self.DOMAIN_SYSTEM)
+            plan_system = plan_system.replace("{schema}", self.SCHEMA)
+            plan_system = plan_system.replace("{database}", self.default_database)
             plan_messages = [
-                {"role": "system", "content": self.PLAN_SYSTEM.format(
-                    domain=self.DOMAIN_SYSTEM, schema=self.SCHEMA,
-                    database=self.default_database)},
+                {"role": "system", "content": plan_system},
                 {"role": "user", "content": plan_prompt},
             ]
             plan_raw = await llm_call(self.model, plan_messages)
@@ -129,6 +141,7 @@ Rules:
             state_updates = {}
             display_items = []
             machine_ops = []
+            artifacts = []
             tool_used = ""
             tool_output = ""
             had_error = False
@@ -162,6 +175,20 @@ Rules:
                     machine_ops.append({"op": "reset", **args})
                 elif tool == "destroy_machine":
                     machine_ops.append({"op": "destroy", **args})
+                elif tool == "update_machine":
+                    machine_ops.append({"op": "update_data", **args})
+                elif tool == "transition_machine":
+                    machine_ops.append({"op": "transition", **args})
+                elif tool == "emit_artifact":
+                    import uuid
+                    artifact = {
+                        "id": args.get("id", str(uuid.uuid4())[:8]),
+                        "type": args.get("type", "status"),
+                        "data": args.get("data", {}),
+                        "actions": args.get("actions", []),
+                        "meta": args.get("meta", {}),
+                    }
+                    artifacts.append(artifact)
                 elif tool == "query_db":
                     query = args.get("query", "")
                     database = args.get("database", self.default_database)
@@ -213,9 +240,13 @@ Rules:
             for err in errors_so_far[-2:]:
                 results_text += f"  {err['error'][:100]}\n"
 
+        resp_system = self.RESPONSE_SYSTEM
+        resp_system = resp_system.replace("{domain}", self.DOMAIN_SYSTEM)
+        resp_system = resp_system.replace("{job}", job)
+        resp_system = resp_system.replace("{results}", results_text)
+        resp_system = resp_system.replace("{language}", language)
         resp_messages = [
-            {"role": "system", "content": self.RESPONSE_SYSTEM.format(
-                domain=self.DOMAIN_SYSTEM, job=job, results=results_text, language=language)},
+            {"role": "system", "content": resp_system},
             {"role": "user", "content": job},
         ]
         raw_response = await llm_call(self.model, resp_messages)
@@ -231,7 +262,14 @@ Rules:
                 text = text.strip()
             resp_data = json.loads(text)
             response = resp_data.get("text", raw_response)
-            if resp_data.get("card"):
+            if resp_data.get("artifact"):
+                # New: artifact in response JSON
+                art = resp_data["artifact"]
+                import uuid
+                if "id" not in art:
+                    art["id"] = str(uuid.uuid4())[:8]
+                artifacts.append(art)
+            elif resp_data.get("card"):
                 card = resp_data["card"]
                 card["type"] = "card"
                 display_items.append(card)
@@ -248,6 +286,8 @@ Rules:
             state_updates=state_updates,
             display_items=display_items,
             machine_ops=machine_ops,
+            errors=errors_so_far,
+            artifacts=artifacts,
         )
 
     def _parse_plan(self, raw: str) -> tuple[list, str]:
diff --git a/agent/nodes/input_v1.py b/agent/nodes/input_v1.py
index e3b37ed..0db3a52 100644
--- a/agent/nodes/input_v1.py
+++ b/agent/nodes/input_v1.py
@@ -22,7 +22,7 @@ Listener: {identity} on {channel}
 Return ONLY valid JSON. No markdown, no explanation.
 
 Schema:
-{{
+{
   "who": "name or unknown",
   "language": "en | de | mixed",
   "intent": "question | request | social | action | feedback",
@@ -30,7 +30,7 @@ Schema:
   "tone": "casual | frustrated | playful | urgent",
   "complexity": "trivial | simple | complex",
   "context": "brief note or empty"
-}}
+}
 
 Rules:
 - Classify the CURRENT message only. Previous messages are context, not the target.
@@ -53,11 +53,11 @@ Rules:
   casual = neutral
 
 Examples:
-"hi there!" -> {{"language":"en","intent":"social","tone":"casual","complexity":"trivial"}}
-"Wie spaet ist es?" -> {{"language":"de","intent":"question","tone":"casual","complexity":"simple"}}
-"this is broken, nothing works" -> {{"language":"en","intent":"feedback","tone":"frustrated","complexity":"simple"}}
-"create two buttons" -> {{"language":"en","intent":"request","tone":"casual","complexity":"simple"}}
-"ok thanks bye" -> {{"language":"en","intent":"social","tone":"casual","complexity":"trivial"}}
+"hi there!" -> {"language":"en","intent":"social","tone":"casual","complexity":"trivial"}
+"Wie spaet ist es?" -> {"language":"de","intent":"question","tone":"casual","complexity":"simple"}
+"this is broken, nothing works" -> {"language":"en","intent":"feedback","tone":"frustrated","complexity":"simple"}
+"create two buttons" -> {"language":"en","intent":"request","tone":"casual","complexity":"simple"}
+"ok thanks bye" -> {"language":"en","intent":"social","tone":"casual","complexity":"trivial"}
 
 {memory_context}"""
 
@@ -78,8 +78,9 @@ Examples:
             history_summary = "Recent conversation:\n" + "\n".join(lines)
 
         messages = [
-            {"role": "system", "content": self.SYSTEM.format(
-                memory_context=memory_context, identity=identity, channel=channel)},
+            {"role": "system", "content": self.SYSTEM.replace(
+                "{memory_context}", memory_context).replace(
+                "{identity}", identity).replace("{channel}", channel)},
         ]
         if history_summary:
             messages.append({"role": "user", "content": history_summary})
diff --git a/agent/nodes/memorizer_v1.py b/agent/nodes/memorizer_v1.py
index a90f2b2..fe2c7e5 100644
--- a/agent/nodes/memorizer_v1.py
+++ b/agent/nodes/memorizer_v1.py
@@ -26,6 +26,19 @@ Given the conversation so far, output a JSON object with these fields:
 - language: string — primary language being used (en, de, mixed)
 - style_hint: string — how Output should talk (casual, formal, technical, poetic, etc.)
 - facts: list of strings — important facts learned about the user. NEVER drop facts from the existing list unless they are proven wrong. Always include all existing facts plus any new ones.
+- user_expectation: string — what the user expects the agent to do next. One of:
+  "conversational" — default. User is chatting, asking questions, browsing. Normal back-and-forth.
+  "delegated" — user gave an imperative task ("build X", "do Y", "create Z"). They expect autonomous progress, not clarifying questions.
+  "waiting_input" — agent asked a question or presented choices. User's next message is likely an answer.
+  "observing" — user returned after being idle, or is reviewing a large output. Brief responses, wait for explicit engagement.
+  Cues:
+  - Imperative verbs + task scope ("build", "create", "do", "find") → delegated
+  - Agent ended with "Moment..." / thinking message but user hasn't seen full results yet → delegated (task still in progress)
+  - Short follow-ups like "und?", "ja?", "weiter?", "and?", "so?", "result?", "ergebnis?" → waiting_input (user is waiting for the agent to deliver)
+  - Agent ended with a question ("Sollen wir...?", "Gibt es...?") → waiting_input
+  - User said "ok/thanks/bye/danke" after output → observing
+  - Everything else → conversational
+  IMPORTANT: If the agent just delivered partial results or said "Moment..." and the user sends a short nudge, that is ALWAYS waiting_input, never conversational.
 
 Output ONLY valid JSON. No explanation, no markdown fences."""
 
@@ -40,6 +53,7 @@ Output ONLY valid JSON. No explanation, no markdown fences."""
             "language": "en",
             "style_hint": "casual, technical",
             "facts": [],
+            "user_expectation": "conversational",
         }
 
     def get_context_block(self, sensor_lines: list[str] = None, ui_state: dict = None) -> str:
diff --git a/agent/nodes/output_v1.py b/agent/nodes/output_v1.py
index b89874b..4601e91 100644
--- a/agent/nodes/output_v1.py
+++ b/agent/nodes/output_v1.py
@@ -34,6 +34,12 @@ YOUR JOB: Transform the Thinker's reasoning into a natural, human-readable text
 - Keep the user's language — if they wrote German, respond in German.
 - Be concise. Don't describe data that the UI node will show as a table.
 
+PHRASING by user_expectation (from memorizer):
+- "delegated": progress-report style. State what was done and what's next. No questions unless blocked.
+- "waiting_input": acknowledge the user's answer and continue the flow naturally.
+- "observing": keep it brief. No unsolicited follow-up questions or suggestions.
+- "conversational": natural, warm dialogue. Follow-ups are fine.
+
 {memory_context}"""
 
     async def process(self, thought: ThoughtResult, history: list[dict],
@@ -42,7 +48,7 @@ YOUR JOB: Transform the Thinker's reasoning into a natural, human-readable text
         await self.hud("streaming")
 
         messages = [
-            {"role": "system", "content": self.SYSTEM.format(memory_context=memory_context)},
+            {"role": "system", "content": self.SYSTEM.replace("{memory_context}", memory_context)},
         ]
         for msg in history[-20:]:
             messages.append(msg)
diff --git a/agent/nodes/pa_v1.py b/agent/nodes/pa_v1.py
index 5b3cb7f..30d31e7 100644
--- a/agent/nodes/pa_v1.py
+++ b/agent/nodes/pa_v1.py
@@ -27,6 +27,8 @@ Experts have these tools:
 - query_db — SQL queries on their domain database
 - emit_actions — create buttons on the dashboard
 - create_machine / add_state / reset_machine / destroy_machine — interactive UI components
+- update_machine(id, data) — update wizard data fields on existing machine
+- transition_machine(id, target) — move machine to a specific state
 - set_state — persistent key-value store
 - emit_display — formatted data display
 
@@ -36,13 +38,13 @@ YOUR JOB:
 3. Only respond directly for social chat (greetings, thanks, bye, small talk)
 
 Output ONLY valid JSON:
-{{
+{
   "expert": "{expert_names} | none",
   "job": "Self-contained task. Include ALL context — the expert has NO conversation history. Describe what to query, what UI to build, what the user expects to see.",
   "thinking_message": "Short message for user while expert works, in their language",
   "response_hint": "If expert=none, your direct response to the user.",
   "language": "de | en | mixed"
-}}
+}
 
 Rules:
 - expert=none ONLY for social chat (hi, thanks, bye, how are you)
@@ -53,6 +55,16 @@ Rules:
 - thinking_message: natural, in user's language. e.g. "Moment, ich schaue nach..."
 - If the user mentions data, tables, customers, devices, buttons, counters → expert
 - When unsure which expert: pick the one whose domain matches best
+- MACHINE STATE: If there are active machines/wizards listed in the context below, ALWAYS include the machine's current state and stored data in the job. The expert needs this to continue the workflow. Example: "Machine 'angebot_wizard' is on step 'select_age', data: {bundesland: Bayern}. User asks: ..."
+- If the user asks about their wizard/workflow progress and the info is already visible in the context, respond directly (expert=none) using the machine state from context. Only route to expert if the user needs data queried or tools called.
+- For update_machine / transition_machine requests: route to expert with the machine ID and operation details in the job.
+
+USER EXPECTATION (from memorizer):
+- If user_expectation is "delegated": formulate comprehensive, autonomous jobs. Do NOT include clarifying questions in the job. Tell the expert to proceed and report results.
+- If user_expectation is "waiting_input": the user is waiting for results or nudging ("und?", "ja?", "weiter?"). Look at conversation history to find what they were waiting for and re-formulate that job. If they answered a question you asked, extract their answer and fold it into context.
+- If user_expectation is "observing": only route to expert if the user explicitly asks for something. Otherwise respond directly with brief acknowledgment.
+- If user_expectation is "conversational": normal routing behavior.
+- CONTINUATION: When user sends a very short message (1-3 words like "und?", "weiter", "ja") after partial/incomplete results, treat it as "continue the previous task". Include the original question and any partial results in the job.
 
 {memory_context}"""
 
@@ -89,10 +101,15 @@ Rules:
             expert_lines.append("- (no experts available — handle everything directly)")
 
         expert_names = " | ".join(self._available_experts) if self._available_experts else "none"
+        # Manual substitution to avoid .format() breaking on curly braces in memory_context
+        system_content = self.SYSTEM
+        system_content = system_content.replace("{memory_context}", memory_context)
+        system_content = system_content.replace("{identity}", identity)
+        system_content = system_content.replace("{channel}", channel)
+        system_content = system_content.replace("{experts}", "\n".join(expert_lines))
+        system_content = system_content.replace("{expert_names}", expert_names)
         messages = [
-            {"role": "system", "content": self.SYSTEM.format(
-                memory_context=memory_context, identity=identity, channel=channel,
-                experts="\n".join(expert_lines), expert_names=expert_names)},
+            {"role": "system", "content": system_content},
         ]
 
         # Summarize recent history (PA sees full context)
@@ -118,7 +135,7 @@ Rules:
         log.info(f"[pa] raw: {raw[:300]}")
 
         routing = self._parse_routing(raw, command)
-        await self.hud("routed", expert=routing.expert, job=routing.job[:100],
+        await self.hud("routed", expert=routing.expert, job=(routing.job or "")[:100],
                        direct=routing.expert == "none")
 
         # Update directive style based on tone
@@ -131,6 +148,72 @@ Rules:
 
         return routing
 
+    async def route_retry(self, command: Command, history: list[dict],
+                          memory_context: str = "", identity: str = "unknown",
+                          channel: str = "unknown", original_job: str = "",
+                          errors: list = None) -> PARouting:
+        """Re-route after expert failure. PA reformulates with error context."""
+        await self.hud("thinking", detail="reformulating after expert failure")
+
+        error_lines = []
+        for err in (errors or [])[-3:]:
+            error_lines.append(f"- Query: {err.get('query', '?')[:100]}")
+            error_lines.append(f"  Error: {err.get('error', '?')[:100]}")
+            if err.get("describe"):
+                error_lines.append(f"  Schema: {err['describe'][:200]}")
+
+        retry_prompt = f"""The expert FAILED the previous job. You must reformulate.
+
+ORIGINAL JOB: {original_job}
+
+ERRORS:
+{chr(10).join(error_lines)}
+
+REFORMULATE the job with a DIFFERENT approach:
+- If the query was too complex (JOINs, window functions), break it into simpler steps
+- If columns were wrong, use the DESCRIBE info above to fix them
+- If the table structure is unclear, tell the expert to first explore with SELECT * LIMIT 5
+- Think about what data the user actually needs and find a simpler path to it
+
+Output the same JSON format as before. The job MUST be different from the original."""
+
+        expert_lines = []
+        for name in self._available_experts:
+            desc = self.EXPERT_DESCRIPTIONS.get(name, f"{name} — domain expert")
+            expert_lines.append(f"- {desc}")
+        expert_names = " | ".join(self._available_experts) if self._available_experts else "none"
+
+        system_content = self.SYSTEM
+        system_content = system_content.replace("{memory_context}", memory_context)
+        system_content = system_content.replace("{identity}", identity)
+        system_content = system_content.replace("{channel}", channel)
+        system_content = system_content.replace("{experts}", "\n".join(expert_lines))
+        system_content = system_content.replace("{expert_names}", expert_names)
+
+        messages = [
+            {"role": "system", "content": system_content},
+        ]
+        recent = history[-8:]
+        if recent:
+            lines = []
+            for msg in recent:
+                role = msg.get("role", "?")
+                content = msg.get("content", "")[:200]
+                lines.append(f"  {role}: {content}")
+            messages.append({"role": "user", "content": "Recent conversation:\n" + "\n".join(lines)})
+            messages.append({"role": "assistant", "content": "OK, I have the context."})
+
+        messages.append({"role": "user", "content": retry_prompt})
+        messages = self.trim_context(messages)
+
+        raw = await llm_call(self.model, messages)
+        log.info(f"[pa] retry raw: {raw[:300]}")
+
+        routing = self._parse_routing(raw, command)
+        await self.hud("routed", expert=routing.expert, job=(routing.job or "")[:100],
+                       direct=routing.expert == "none", retry=True)
+        return routing
+
     def _parse_routing(self, raw: str, command: Command) -> PARouting:
         """Parse LLM JSON into PARouting with fallback."""
         text = raw.strip()
@@ -149,10 +232,10 @@ Rules:
                 expert = "none"
             return PARouting(
                 expert=expert,
-                job=data.get("job", ""),
-                thinking_message=data.get("thinking_message", ""),
-                response_hint=data.get("response_hint", ""),
-                language=data.get("language", command.analysis.language),
+                job=data.get("job") or "",
+                thinking_message=data.get("thinking_message") or "",
+                response_hint=data.get("response_hint") or "",
+                language=data.get("language") or command.analysis.language,
             )
         except (json.JSONDecodeError, Exception) as e:
             log.error(f"[pa] parse failed: {e}, raw: {text[:200]}")
diff --git a/agent/nodes/thinker_v1.py b/agent/nodes/thinker_v1.py
index b23f64f..985a1cf 100644
--- a/agent/nodes/thinker_v1.py
+++ b/agent/nodes/thinker_v1.py
@@ -236,7 +236,7 @@ You are one node in a pipeline: Input (perceives) -> You (reason) -> Output (spe
 
 1. emit_actions() — show buttons. Button clicks come back as "ACTION: action_name".
    Stateful buttons: include var/op in payload (inc/dec/set/toggle). UI handles locally.
-   Example: label:"+1", action:"increment", payload:{{"var":"count","op":"inc","initial":0}}
+   Example: label:"+1", action:"increment", payload:{"var":"count","op":"inc","initial":0}
 
 2. set_state(key, value) — persistent key-value store shown as live labels.
    Survives across turns. Use for tracking mode, progress, flags.
@@ -253,9 +253,9 @@ You are one node in a pipeline: Input (perceives) -> You (reason) -> Output (spe
    destroy_machine(id) — remove machine from dashboard.
    Example — navigation menu:
      create_machine(id="nav", initial="main", states=[
-       {{"name":"main","buttons":[{{"label":"Menu 1","action":"menu_1","go":"sub1"}},{{"label":"Menu 2","action":"menu_2","go":"sub2"}}],"content":["Welcome"]}},
-       {{"name":"sub1","buttons":[{{"label":"Back","action":"back","go":"main"}}],"content":["Sub 1 details"]}},
-       {{"name":"sub2","buttons":[{{"label":"Back","action":"back","go":"main"}}],"content":["Sub 2 details"]}}
+       {"name":"main","buttons":[{"label":"Menu 1","action":"menu_1","go":"sub1"},{"label":"Menu 2","action":"menu_2","go":"sub2"}],"content":["Welcome"]},
+       {"name":"sub1","buttons":[{"label":"Back","action":"back","go":"main"}],"content":["Sub 1 details"]},
+       {"name":"sub2","buttons":[{"label":"Back","action":"back","go":"main"}],"content":["Sub 2 details"]}
      ])
    PREFER machines over emit_actions for anything with navigation or multiple views.
    ALWAYS include states when creating a machine. Never write code — use the tool.
@@ -350,10 +350,10 @@ conn.commit()
 cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
 tables = cursor.fetchall()
 for t in tables:
-    cursor.execute(f"SELECT * FROM {{t[0]}}")
+    cursor.execute(f"SELECT * FROM {t[0]}")
     rows = cursor.fetchall()
     cols = [d[0] for d in cursor.description]
-    print(f"Table: {{t[0]}}")
+    print(f"Table: {t[0]}")
     print(" | ".join(cols))
     for row in rows:
         print(" | ".join(str(c) for c in row))
@@ -446,7 +446,7 @@ conn.close()'''
         await self.hud("thinking", detail="reasoning about response")
 
         messages = [
-            {"role": "system", "content": self.SYSTEM.format(memory_context=memory_context)},
+            {"role": "system", "content": self.SYSTEM.replace("{memory_context}", memory_context)},
         ]
         for msg in history[-12:]:
             messages.append(msg)
diff --git a/agent/nodes/thinker_v2.py b/agent/nodes/thinker_v2.py
index f3c41af..08f5822 100644
--- a/agent/nodes/thinker_v2.py
+++ b/agent/nodes/thinker_v2.py
@@ -88,7 +88,7 @@ Rules:
             hint += f"\nTool result:\n{tool_output[:500]}"
 
         messages = [
-            {"role": "system", "content": self.RESPONSE_SYSTEM.format(hint=hint)},
+            {"role": "system", "content": self.RESPONSE_SYSTEM.replace("{hint}", hint)},
         ]
         for msg in history[-8:]:
             messages.append(msg)
diff --git a/agent/nodes/ui.py b/agent/nodes/ui.py
index cd2e4fb..2c2b1b3 100644
--- a/agent/nodes/ui.py
+++ b/agent/nodes/ui.py
@@ -2,9 +2,10 @@
 
 import json
 import logging
+import uuid
 
 from .base import Node
-from ..types import ThoughtResult
+from ..types import ThoughtResult, Artifact
 
 log = logging.getLogger("runtime")
 
@@ -16,6 +17,7 @@ class UINode(Node):
     def __init__(self, send_hud):
         super().__init__(send_hud)
         self.thinker_controls: list[dict] = []  # buttons, labels, tables from Thinker
+        self.artifacts: list[dict] = []  # typed workspace artifacts
         self.state: dict = {}  # {"count": 0, "theme": "dark", ...}
         self.bindings: dict = {}  # {"increment": {"op": "inc", "var": "count"}, ...}
         self.machines: dict = {}  # {"nav": {initial, states, current}, ...}
@@ -79,6 +81,7 @@ class UINode(Node):
                     "initial": initial,
                     "current": initial,
                     "states": states,
+                    "data": {},  # wizard field storage (e.g. {"bundesland": "Bayern"})
                 }
                 log.info(f"[ui] machine created: {mid} (initial={initial}, {len(states)} states)")
                 await self.hud("machine_created", id=mid, initial=initial, state_count=len(states))
@@ -104,6 +107,28 @@ class UINode(Node):
                 log.info(f"[ui] machine reset: {mid} -> {initial}")
                 await self.hud("machine_reset", id=mid, state=initial)
 
+            elif op == "update_data":
+                if mid not in self.machines:
+                    log.warning(f"[ui] update_data: machine '{mid}' not found")
+                    continue
+                data_update = op_data.get("data", {})
+                self.machines[mid]["data"].update(data_update)
+                log.info(f"[ui] machine data updated: {mid} += {data_update}")
+                await self.hud("machine_data_updated", id=mid, data=data_update)
+
+            elif op == "transition":
+                if mid not in self.machines:
+                    log.warning(f"[ui] transition: machine '{mid}' not found")
+                    continue
+                target = op_data.get("target", "")
+                if target in self.machines[mid]["states"]:
+                    old = self.machines[mid]["current"]
+                    self.machines[mid]["current"] = target
+                    log.info(f"[ui] machine transition (expert): {mid} {old} -> {target}")
+                    await self.hud("machine_transitioned", id=mid, old=old, target=target)
+                else:
+                    log.warning(f"[ui] transition target '{target}' not found in {mid}")
+
             elif op == "destroy":
                 if mid in self.machines:
                     del self.machines[mid]
@@ -157,15 +182,31 @@ class UINode(Node):
         return controls
 
     def get_machine_summary(self) -> str:
-        """Summary for Thinker context — shape only, not full data."""
+        """Rich summary for PA/Thinker context — includes current state details and stored data."""
         if not self.machines:
             return ""
         parts = []
         for mid, m in self.machines.items():
             current = m["current"]
             state_names = list(m["states"].keys())
-            parts.append(f"  machine '{mid}': state={current}, states={state_names}")
-        return "Machines:\n" + "\n".join(parts)
+            state_def = m["states"].get(current, {})
+            line = f"  machine '{mid}': state={current}, states={state_names}"
+            # Current state content
+            content = state_def.get("content", [])
+            if content:
+                line += f", content={content}"
+            # Current state buttons
+            buttons = state_def.get("buttons", [])
+            if buttons:
+                btn_labels = [b.get("label", b.get("action", "?")) for b in buttons if isinstance(b, dict)]
+                if btn_labels:
+                    line += f", buttons={btn_labels}"
+            # Stored wizard data
+            data = m.get("data", {})
+            if data:
+                line += f", data={data}"
+            parts.append(line)
+        return "Active machines (interactive wizard/workflow state):\n" + "\n".join(parts)
 
     # --- State operations ---
 
@@ -343,21 +384,155 @@ class UINode(Node):
 
         return controls
 
+    def _build_artifacts(self, thought: ThoughtResult) -> list[dict]:
+        """Convert ThoughtResult into typed artifacts."""
+        arts = []
+
+        # 1. Direct artifacts from expert's emit_artifact calls
+        if thought.artifacts:
+            for a in thought.artifacts:
+                if not a.get("id"):
+                    a["id"] = str(uuid.uuid4())[:8]
+                arts.append(a)
+
+        # 2. Convert display_items (cards, lists) → entity_detail artifacts
+        if thought.display_items:
+            for item in thought.display_items:
+                item_type = item.get("type", "text")
+                if item_type == "card":
+                    arts.append({
+                        "id": str(uuid.uuid4())[:8],
+                        "type": "entity_detail",
+                        "data": {
+                            "title": item.get("title", ""),
+                            "subtitle": item.get("subtitle", ""),
+                            "fields": item.get("fields", []),
+                        },
+                        "actions": item.get("actions", []),
+                        "meta": {},
+                    })
+                elif item_type == "list":
+                    arts.append({
+                        "id": str(uuid.uuid4())[:8],
+                        "type": "entity_detail",
+                        "data": {
+                            "title": item.get("title", ""),
+                            "items": item.get("items", []),
+                        },
+                        "actions": [],
+                        "meta": {"list": True},
+                    })
+                else:
+                    arts.append({
+                        "id": str(uuid.uuid4())[:8],
+                        "type": "status",
+                        "data": {
+                            "display_type": item_type,
+                            "label": item.get("label", ""),
+                            "value": item.get("value", ""),
+                            "style": item.get("style", ""),
+                        },
+                        "actions": [],
+                        "meta": {},
+                    })
+
+        # 3. Convert actions → action_bar artifact
+        if thought.actions:
+            btns = self._parse_thinker_actions(thought.actions)
+            arts.append({
+                "id": "action_bar",
+                "type": "action_bar",
+                "data": {},
+                "actions": [{"label": b["label"], "action": b["action"],
+                             "payload": b.get("payload", {})} for b in btns],
+                "meta": {},
+            })
+        elif self.thinker_controls:
+            # Preserve existing buttons as action_bar
+            existing_btns = [c for c in self.thinker_controls if c.get("type") == "button"]
+            if existing_btns:
+                arts.append({
+                    "id": "action_bar",
+                    "type": "action_bar",
+                    "data": {},
+                    "actions": [{"label": b["label"], "action": b["action"],
+                                 "payload": b.get("payload", {})} for b in existing_btns],
+                    "meta": {},
+                })
+
+        # 4. Convert tool_output table → data_table artifact
+        if thought.tool_output:
+            table = self._extract_table(thought.tool_output)
+            if table:
+                arts.append({
+                    "id": str(uuid.uuid4())[:8],
+                    "type": "data_table",
+                    "data": {
+                        "columns": table["columns"],
+                        "rows": table["data"],
+                    },
+                    "actions": [],
+                    "meta": {"source": thought.tool_used or "query_db"},
+                })
+
+        # 5. State variables → status artifacts
+        if thought.state_updates:
+            for key, value in thought.state_updates.items():
+                self.set_var(key, value)
+        for var, value in self.state.items():
+            arts.append({
+                "id": f"state_{var}",
+                "type": "status",
+                "data": {"label": var, "value": str(value), "display_type": "text"},
+                "actions": [],
+                "meta": {"state_var": True},
+            })
+
+        # 6. Machines → machine artifacts
+        for mid, machine in self.machines.items():
+            current = machine["current"]
+            state_def = machine["states"].get(current, {})
+            arts.append({
+                "id": f"machine_{mid}",
+                "type": "machine",
+                "data": {
+                    "machine_id": mid,
+                    "current": current,
+                    "states": list(machine["states"].keys()),
+                    "content": state_def.get("content", []),
+                    "stored_data": machine.get("data", {}),
+                },
+                "actions": [{"label": b.get("label", ""), "action": b.get("action", ""),
+                             "go": b.get("go", "")}
+                            for b in state_def.get("buttons", []) if isinstance(b, dict)],
+                "meta": {"live": True},
+            })
+
+        return arts
+
+    def get_artifacts(self) -> list[dict]:
+        """Return current artifact list."""
+        return self.artifacts
+
     async def process(self, thought: ThoughtResult, history: list[dict],
                       memory_context: str = "") -> list[dict]:
         # Apply machine ops first (create/add_state/reset/destroy)
         if thought.machine_ops:
             await self.apply_machine_ops(thought.machine_ops)
 
+        # Build artifacts (new system)
+        self.artifacts = self._build_artifacts(thought)
+
+        # Build legacy controls (backward compat)
         thinker_ctrls = self._build_controls(thought)
 
         if thinker_ctrls:
             self.thinker_controls = thinker_ctrls
         # Always emit the merged view (thinker + machine)
         merged = self.current_controls
-        if merged:
+        if merged or self.artifacts:
             await self.hud("controls", controls=merged)
-            log.info(f"[ui] emitting {len(merged)} controls ({len(self.thinker_controls)} thinker + {len(self.get_machine_controls())} machine)")
+            log.info(f"[ui] emitting {len(merged)} controls + {len(self.artifacts)} artifacts")
         else:
             await self.hud("decided", instruction="no new controls")
 
diff --git a/agent/runtime.py b/agent/runtime.py
index 1a22046..f014b3c 100644
--- a/agent/runtime.py
+++ b/agent/runtime.py
@@ -56,6 +56,13 @@ class OutputSink:
             except Exception:
                 pass
 
+    async def send_artifacts(self, artifacts: list):
+        if self.ws:
+            try:
+                await self.ws.send_text(json.dumps({"type": "artifacts", "artifacts": artifacts}))
+            except Exception:
+                pass
+
     async def send_hud(self, data: dict):
         if self.ws:
             try:
@@ -221,9 +228,10 @@ class Runtime:
         self.history.append({"role": "user", "content": action_desc})
 
         sensor_lines = self.sensor.get_context_lines()
-        director_line = self.director.get_context_line()
+        director_line = self.director.get_context_line() if self.director else ""
         mem_ctx = self.memorizer.get_context_block(sensor_lines=sensor_lines, ui_state=self.ui_node.state)
-        mem_ctx += f"\n\n{director_line}"
+        if director_line:
+            mem_ctx += f"\n\n{director_line}"
 
         command = Command(
             analysis=InputAnalysis(intent="action", topic=action, complexity="simple"),
@@ -242,7 +250,7 @@ class Runtime:
         self.history.append({"role": "assistant", "content": response})
 
         await self.memorizer.update(self.history)
-        if not self.is_v2:
+        if not self.is_v2 and self.director:
             await self.director.update(self.history, self.memorizer.state)
 
         if len(self.history) > self.MAX_HISTORY:
@@ -319,9 +327,10 @@ class Runtime:
         # Check Sensor flags (idle return, workspace mismatch)
         sensor_flags = self.sensor.consume_flags()
         sensor_lines = self.sensor.get_context_lines()
-        director_line = self.director.get_context_line()
+        director_line = self.director.get_context_line() if self.director else ""
         mem_ctx = self.memorizer.get_context_block(sensor_lines=sensor_lines, ui_state=self.ui_node.state)
-        mem_ctx += f"\n\n{director_line}"
+        if director_line:
+            mem_ctx += f"\n\n{director_line}"
         machine_summary = self.ui_node.get_machine_summary()
         if machine_summary:
             mem_ctx += f"\n\n{machine_summary}"
diff --git a/agent/types.py b/agent/types.py
index 4e69e1e..98d58ba 100644
--- a/agent/types.py
+++ b/agent/types.py
@@ -76,6 +76,19 @@ class PARouting:
     language: str = "de"              # Response language
 
 
+@dataclass
+class Artifact:
+    """A typed workspace item. The unit of workspace content."""
+    id: str                                       # unique ID
+    type: str                                     # entity_detail | data_table | document_page | action_bar | status
+    data: dict = field(default_factory=dict)      # type-specific payload
+    actions: list = field(default_factory=list)   # [{label, action, payload?}]
+    meta: dict = field(default_factory=dict)      # {entity?, related?, source_query?}
+
+    def to_dict(self) -> dict:
+        return asdict(self)
+
+
 @dataclass
 class ThoughtResult:
     """Thinker node's output — either a direct answer or tool results."""
@@ -86,3 +99,5 @@ class ThoughtResult:
     state_updates: dict = field(default_factory=dict)  # {key: value} from set_state
     display_items: list = field(default_factory=list)  # [{type, label, value?, style?}] from emit_display
     machine_ops: list = field(default_factory=list)  # [{op, id, ...}] from machine tools
+    errors: list = field(default_factory=list)  # [{query, error, describe?}] from failed retries
+    artifacts: list = field(default_factory=list)  # [Artifact] from emit_artifact
diff --git a/runtime_test.py b/runtime_test.py
index b0b6c76..44ed61e 100644
--- a/runtime_test.py
+++ b/runtime_test.py
@@ -382,6 +382,12 @@ def check_trace(trace: list, check: str) -> tuple[bool, str]:
                 return True, f"found reset_machine via machine_reset event"
             if t.get("event") == "machine_destroyed" and tool_name == "destroy_machine":
                 return True, f"found destroy_machine via machine_destroyed event"
+            if t.get("event") == "machine_data_updated" and tool_name == "update_machine":
+                return True, f"found update_machine via machine_data_updated event"
+            if t.get("event") == "machine_transitioned" and tool_name == "transition_machine":
+                return True, f"found transition_machine via machine_transitioned event"
+            if t.get("event") == "pa_retry" and tool_name == "pa_retry":
+                return True, f"found pa_retry event"
         return False, f"no tool_call '{tool_name}' in trace"
 
     # machine_created id="NAV" — checks for specific machine creation
diff --git a/static/js/awareness.js b/static/js/awareness.js
index 9dc3114..6eba5c4 100644
--- a/static/js/awareness.js
+++ b/static/js/awareness.js
@@ -162,9 +162,17 @@ export function updateMeter(node, tokens, maxTokens, fillPct) {
 export function updateAwarenessState(state) {
   const body = document.getElementById('aw-state-body');
   if (!body) return;
+  const expectation = state.user_expectation || 'conversational';
+  const expClass = {
+    conversational: 'aw-exp-conv',
+    delegated: 'aw-exp-deleg',
+    waiting_input: 'aw-exp-wait',
+    observing: 'aw-exp-obs',
+  }[expectation] || '';
   const display = [
     ['user', state.user_name],
     ['mood', state.user_mood],
+    ['expectation', expectation, expClass],
     ['topic', state.topic],
     ['lang', state.language],
     ['style', state.style_hint],
@@ -173,8 +181,8 @@ export function updateAwarenessState(state) {
   const facts = state.facts || [];
   const history = state.topic_history || [];
 
-  let html = display.map(([k, v]) =>
-    `<div class="aw-row"><span class="aw-key">${esc(k)}</span><span class="aw-val">${esc(v || 'null')}</span></div>`
+  let html = display.map(([k, v, cls]) =>
+    `<div class="aw-row"><span class="aw-key">${esc(k)}</span><span class="aw-val ${cls || ''}">${esc(v || 'null')}</span></div>`
   ).join('');
 
   if (facts.length) {
diff --git a/static/js/dashboard.js b/static/js/dashboard.js
index 225240e..1cb169b 100644
--- a/static/js/dashboard.js
+++ b/static/js/dashboard.js
@@ -1,6 +1,9 @@
-/** Dashboard: workspace controls rendering (buttons, tables, labels, displays, machines). */
+/** Dashboard: workspace artifact + control rendering.
+ *  Artifact system: typed artifacts (entity_detail, data_table, document_page, action_bar, status, machine).
+ *  Legacy: dockControls() still works as fallback for old control format.
+ */
 
-import { esc } from './util.js';
+import { esc, renderMarkdown } from './util.js';
 import { addTrace } from './trace.js';
 import { setDashboard } from './chat.js';
 
@@ -8,8 +11,233 @@ let _ws = null;
 
 export function setWs(ws) { _ws = ws; }
 
+function _sendAction(action, data) {
+  if (_ws && _ws.readyState === 1) {
+    _ws.send(JSON.stringify({ type: 'action', action, data: data || {} }));
+    addTrace('runtime', 'action', action);
+  }
+}
+
+// --- Artifact system ---
+
+export function dockArtifacts(artifacts) {
+  const body = document.getElementById('workspace-body');
+  if (!body) return;
+  body.innerHTML = '';
+  const container = document.createElement('div');
+  container.className = 'artifacts-container';
+
+  for (const art of artifacts) {
+    const wrapper = document.createElement('div');
+    wrapper.className = 'ws-artifact ws-artifact-' + (art.type || 'unknown');
+    wrapper.dataset.artifactId = art.id || '';
+
+    const renderer = RENDERERS[art.type];
+    if (renderer) {
+      renderer(wrapper, art);
+    } else {
+      wrapper.innerHTML = '<div class="ws-artifact-fallback">' + esc(JSON.stringify(art.data || {})) + '</div>';
+    }
+
+    container.appendChild(wrapper);
+  }
+  body.appendChild(container);
+  // Also set dashboard for S3* audit (flatten actions from artifacts)
+  const flatControls = artifacts.flatMap(a => (a.actions || []).map(act => ({type: 'button', ...act})));
+  setDashboard(flatControls);
+}
+
+// --- Artifact renderers ---
+
+const RENDERERS = {
+  entity_detail: renderEntityDetail,
+  data_table: renderDataTable,
+  document_page: renderDocumentPage,
+  action_bar: renderActionBar,
+  status: renderStatus,
+  machine: renderMachine,
+};
+
+function renderEntityDetail(el, art) {
+  const d = art.data || {};
+  let html = '';
+  if (d.title) html += '<div class="ws-card-title">' + esc(d.title) + '</div>';
+  if (d.subtitle) html += '<div class="ws-card-subtitle">' + esc(d.subtitle) + '</div>';
+
+  // List mode (multiple items)
+  if (d.items && d.items.length) {
+    html += '<div class="ws-list">';
+    for (const item of d.items) {
+      html += '<div class="ws-card ws-card-nested">';
+      if (item.title) html += '<div class="ws-card-title">' + esc(item.title) + '</div>';
+      if (item.fields) {
+        html += '<div class="ws-card-fields">';
+        for (const f of item.fields) {
+          html += '<div class="ws-card-field"><span class="ws-card-key">' + esc(f.label || '') + '</span><span class="ws-card-val">' + esc(String(f.value ?? '')) + '</span></div>';
+        }
+        html += '</div>';
+      }
+      html += '</div>';
+    }
+    html += '</div>';
+  }
+
+  // Single entity fields
+  if (d.fields && d.fields.length) {
+    html += '<div class="ws-card-fields">';
+    for (const f of d.fields) {
+      const val = f.action
+        ? '<span class="ws-card-link" data-action="' + esc(f.action) + '">' + esc(String(f.value ?? '')) + '</span>'
+        : '<span class="ws-card-val">' + esc(String(f.value ?? '')) + '</span>';
+      html += '<div class="ws-card-field"><span class="ws-card-key">' + esc(f.label || '') + '</span>' + val + '</div>';
+    }
+    html += '</div>';
+  }
+
+  // Actions
+  if (art.actions && art.actions.length) {
+    html += '<div class="ws-card-actions">';
+    for (const a of art.actions) {
+      html += '<button class="control-btn ws-card-btn" data-action="' + esc(a.action || '') + '">' + esc(a.label || '') + '</button>';
+    }
+    html += '</div>';
+  }
+
+  el.innerHTML = html;
+  _wireActions(el);
+}
+
+function renderDataTable(el, art) {
+  const d = art.data || {};
+  if (d.title) {
+    const title = document.createElement('div');
+    title.className = 'ws-artifact-header';
+    title.textContent = d.title;
+    el.appendChild(title);
+  }
+  const table = document.createElement('table');
+  table.className = 'control-table';
+  const cols = d.columns || (d.rows && d.rows.length ? Object.keys(d.rows[0]) : []);
+  if (cols.length) {
+    const thead = document.createElement('tr');
+    for (const col of cols) {
+      const th = document.createElement('th');
+      th.textContent = col;
+      thead.appendChild(th);
+    }
+    table.appendChild(thead);
+  }
+  for (const row of (d.rows || d.data || [])) {
+    const tr = document.createElement('tr');
+    if (Array.isArray(row)) {
+      for (const cell of row) {
+        const td = document.createElement('td'); td.textContent = cell; tr.appendChild(td);
+      }
+    } else if (typeof row === 'object') {
+      for (const col of cols) {
+        const td = document.createElement('td'); td.textContent = row[col] ?? ''; tr.appendChild(td);
+      }
+    }
+    table.appendChild(tr);
+  }
+  el.appendChild(table);
+}
+
+function renderDocumentPage(el, art) {
+  const d = art.data || {};
+  let html = '';
+  if (d.title) html += '<div class="ws-doc-title">' + esc(d.title) + '</div>';
+  for (const section of (d.sections || [])) {
+    html += '<div class="ws-doc-section">';
+    if (section.heading) html += '<div class="ws-doc-heading">' + esc(section.heading) + '</div>';
+    if (section.content) html += '<div class="ws-doc-content">' + renderMarkdown(section.content) + '</div>';
+    html += '</div>';
+  }
+  // Actions (e.g. PDF export)
+  if (art.actions && art.actions.length) {
+    html += '<div class="ws-card-actions">';
+    for (const a of art.actions) {
+      html += '<button class="control-btn ws-card-btn" data-action="' + esc(a.action || '') + '">' + esc(a.label || '') + '</button>';
+    }
+    html += '</div>';
+  }
+  el.innerHTML = html;
+  _wireActions(el);
+}
+
+function renderActionBar(el, art) {
+  for (const a of (art.actions || [])) {
+    const btn = document.createElement('button');
+    btn.className = 'control-btn';
+    btn.textContent = a.label || '';
+    btn.onclick = () => _sendAction(a.action, a.payload || {});
+    el.appendChild(btn);
+  }
+}
+
+function renderStatus(el, art) {
+  const d = art.data || {};
+  const dt = d.display_type || 'text';
+  el.classList.add('display-' + dt);
+  if (dt === 'progress') {
+    const pct = Math.min(100, Math.max(0, Number(d.value) || 0));
+    el.innerHTML = '<span class="cd-label">' + esc(d.label) + '</span>'
+      + '<div class="cd-bar"><div class="cd-fill" style="width:' + pct + '%"></div></div>'
+      + '<span class="cd-pct">' + pct + '%</span>';
+  } else if (dt === 'info') {
+    el.innerHTML = '<span class="cd-icon">\u2139</span><span class="cd-label">' + esc(d.label) + '</span>';
+  } else {
+    el.innerHTML = '<span class="cd-label">' + esc(d.label || '') + '</span>'
+      + (d.value ? '<span class="cd-value">' + esc(String(d.value)) + '</span>' : '');
+  }
+}
+
+function renderMachine(el, art) {
+  const d = art.data || {};
+  const mid = d.machine_id || '';
+  // Header
+  let html = '<div class="ws-machine-header"><span class="ws-machine-name">' + esc(mid) + '</span>'
+    + '<span class="ws-machine-state">' + esc(d.current || '') + '</span></div>';
+  // Content
+  for (const text of (d.content || [])) {
+    html += '<div class="ws-machine-content">' + esc(text) + '</div>';
+  }
+  // Stored data
+  const stored = d.stored_data || {};
+  if (Object.keys(stored).length) {
+    html += '<div class="ws-machine-data">';
+    for (const [k, v] of Object.entries(stored)) {
+      html += '<span class="ws-machine-datum">' + esc(k) + '=' + esc(String(v)) + '</span>';
+    }
+    html += '</div>';
+  }
+  // Buttons
+  if (art.actions && art.actions.length) {
+    html += '<div class="ws-card-actions">';
+    for (const a of art.actions) {
+      html += '<button class="control-btn ws-card-btn" data-action="' + esc(a.action || '') + '">' + esc(a.label || '') + '</button>';
+    }
+    html += '</div>';
+  }
+  el.innerHTML = html;
+  _wireActions(el);
+}
+
+// --- Helpers ---
+
+function _wireActions(el) {
+  el.querySelectorAll('.ws-card-link').forEach(link => {
+    link.onclick = (e) => { e.stopPropagation(); _sendAction(link.dataset.action, {}); };
+  });
+  el.querySelectorAll('.ws-card-btn').forEach(btn => {
+    btn.onclick = (e) => { e.stopPropagation(); _sendAction(btn.dataset.action, {}); };
+  });
+}
+
+// --- Legacy control rendering (backward compat) ---
+
 export function dockControls(controls) {
-  setDashboard(controls);  // S3*: remember what's rendered
+  setDashboard(controls);
   const body = document.getElementById('workspace-body');
   if (!body) return;
   body.innerHTML = '';
@@ -21,12 +249,7 @@ export function dockControls(controls) {
       const btn = document.createElement('button');
       btn.className = 'control-btn';
       btn.textContent = ctrl.label;
-      btn.onclick = () => {
-        if (_ws && _ws.readyState === 1) {
-          _ws.send(JSON.stringify({ type: 'action', action: ctrl.action, data: ctrl.payload || ctrl.data || {} }));
-          addTrace('runtime', 'action', ctrl.action);
-        }
-      };
+      btn.onclick = () => _sendAction(ctrl.action, ctrl.payload || ctrl.data || {});
       container.appendChild(btn);
     } else if (ctrl.type === 'table') {
       const table = document.createElement('table');
@@ -34,22 +257,16 @@ export function dockControls(controls) {
       if (ctrl.columns) {
         const thead = document.createElement('tr');
         for (const col of ctrl.columns) {
-          const th = document.createElement('th');
-          th.textContent = col;
-          thead.appendChild(th);
+          const th = document.createElement('th'); th.textContent = col; thead.appendChild(th);
         }
         table.appendChild(thead);
       }
       for (const row of (ctrl.data || [])) {
         const tr = document.createElement('tr');
         if (Array.isArray(row)) {
-          for (const cell of row) {
-            const td = document.createElement('td'); td.textContent = cell; tr.appendChild(td);
-          }
+          for (const cell of row) { const td = document.createElement('td'); td.textContent = cell; tr.appendChild(td); }
         } else if (typeof row === 'object') {
-          for (const col of (ctrl.columns || Object.keys(row))) {
-            const td = document.createElement('td'); td.textContent = row[col] ?? ''; tr.appendChild(td);
-          }
+          for (const col of (ctrl.columns || Object.keys(row))) { const td = document.createElement('td'); td.textContent = row[col] ?? ''; tr.appendChild(td); }
         }
         table.appendChild(tr);
       }
@@ -62,105 +279,42 @@ export function dockControls(controls) {
     } else if (ctrl.type === 'display') {
       const disp = document.createElement('div');
       const dt = ctrl.display_type || 'text';
-      const style = ctrl.style ? ' display-' + ctrl.style : '';
-      disp.className = 'control-display display-' + dt + style;
+      disp.className = 'control-display display-' + dt;
       if (dt === 'progress') {
         const pct = Math.min(100, Math.max(0, Number(ctrl.value) || 0));
-        disp.innerHTML = '<span class="cd-label">' + esc(ctrl.label) + '</span>'
-          + '<div class="cd-bar"><div class="cd-fill" style="width:' + pct + '%"></div></div>'
-          + '<span class="cd-pct">' + pct + '%</span>';
-      } else if (dt === 'status') {
-        disp.innerHTML = '<span class="cd-icon">' + (ctrl.style === 'success' ? '\u2713' : ctrl.style === 'error' ? '\u2717' : '\u2139') + '</span>'
-          + '<span class="cd-label">' + esc(ctrl.label) + '</span>';
+        disp.innerHTML = '<span class="cd-label">' + esc(ctrl.label) + '</span><div class="cd-bar"><div class="cd-fill" style="width:' + pct + '%"></div></div><span class="cd-pct">' + pct + '%</span>';
       } else {
-        disp.innerHTML = '<span class="cd-label">' + esc(ctrl.label) + '</span>'
-          + (ctrl.value ? '<span class="cd-value">' + esc(String(ctrl.value)) + '</span>' : '');
+        disp.innerHTML = '<span class="cd-label">' + esc(ctrl.label) + '</span>' + (ctrl.value ? '<span class="cd-value">' + esc(String(ctrl.value)) + '</span>' : '');
       }
       container.appendChild(disp);
     } else if (ctrl.type === 'card') {
-      container.appendChild(renderCard(ctrl));
-    } else if (ctrl.type === 'list') {
-      const listEl = document.createElement('div');
-      listEl.className = 'ws-list';
-      if (ctrl.title) {
-        const h = document.createElement('div');
-        h.className = 'ws-list-title';
-        h.textContent = ctrl.title;
-        listEl.appendChild(h);
+      const card = document.createElement('div');
+      card.className = 'ws-card';
+      let html = '';
+      if (ctrl.title) html += '<div class="ws-card-title">' + esc(ctrl.title) + '</div>';
+      if (ctrl.subtitle) html += '<div class="ws-card-subtitle">' + esc(ctrl.subtitle) + '</div>';
+      if (ctrl.fields && ctrl.fields.length) {
+        html += '<div class="ws-card-fields">';
+        for (const f of ctrl.fields) {
+          html += '<div class="ws-card-field"><span class="ws-card-key">' + esc(f.label || '') + '</span><span class="ws-card-val">' + esc(String(f.value ?? '')) + '</span></div>';
+        }
+        html += '</div>';
       }
-      for (const item of (ctrl.items || [])) {
-        item.type = item.type || 'card';
-        listEl.appendChild(renderCard(item));
+      if (ctrl.actions && ctrl.actions.length) {
+        html += '<div class="ws-card-actions">';
+        for (const a of ctrl.actions) {
+          html += '<button class="control-btn ws-card-btn" data-action="' + esc(a.action || '') + '">' + esc(a.label || '') + '</button>';
+        }
+        html += '</div>';
       }
-      container.appendChild(listEl);
+      card.innerHTML = html;
+      _wireActions(card);
+      container.appendChild(card);
     }
   }
   body.appendChild(container);
 }
 
-function renderCard(card) {
-  const el = document.createElement('div');
-  el.className = 'ws-card';
-  if (card.action) {
-    el.classList.add('ws-card-clickable');
-    el.onclick = () => {
-      if (_ws && _ws.readyState === 1) {
-        _ws.send(JSON.stringify({ type: 'action', action: card.action, data: card.payload || {} }));
-        addTrace('runtime', 'action', card.action);
-      }
-    };
-  }
-
-  let html = '';
-  if (card.title) html += '<div class="ws-card-title">' + esc(card.title) + '</div>';
-  if (card.subtitle) html += '<div class="ws-card-subtitle">' + esc(card.subtitle) + '</div>';
-
-  if (card.fields && card.fields.length) {
-    html += '<div class="ws-card-fields">';
-    for (const f of card.fields) {
-      const val = f.action
-        ? '<span class="ws-card-link" data-action="' + esc(f.action) + '">' + esc(String(f.value ?? '')) + '</span>'
-        : '<span class="ws-card-val">' + esc(String(f.value ?? '')) + '</span>';
-      html += '<div class="ws-card-field"><span class="ws-card-key">' + esc(f.label || '') + '</span>' + val + '</div>';
-    }
-    html += '</div>';
-  }
-
-  if (card.actions && card.actions.length) {
-    html += '<div class="ws-card-actions">';
-    for (const a of card.actions) {
-      html += '<button class="control-btn ws-card-btn" data-action="' + esc(a.action || '') + '">' + esc(a.label || '') + '</button>';
-    }
-    html += '</div>';
-  }
-
-  el.innerHTML = html;
-
-  // Wire up field links and action buttons
-  el.querySelectorAll('.ws-card-link').forEach(link => {
-    link.onclick = (e) => {
-      e.stopPropagation();
-      const action = link.dataset.action;
-      if (_ws && _ws.readyState === 1) {
-        _ws.send(JSON.stringify({ type: 'action', action, data: {} }));
-        addTrace('runtime', 'action', action);
-      }
-    };
-  });
-  el.querySelectorAll('.ws-card-btn').forEach(btn => {
-    btn.onclick = (e) => {
-      e.stopPropagation();
-      const action = btn.dataset.action;
-      if (_ws && _ws.readyState === 1) {
-        _ws.send(JSON.stringify({ type: 'action', action, data: {} }));
-        addTrace('runtime', 'action', action);
-      }
-    };
-  });
-
-  return el;
-}
-
 export function clearDashboard() {
   const body = document.getElementById('workspace-body');
   if (body) body.innerHTML = '';
diff --git a/static/js/ws.js b/static/js/ws.js
index b2cfd06..6f8825f 100644
--- a/static/js/ws.js
+++ b/static/js/ws.js
@@ -3,7 +3,7 @@
 import { authToken, isAuthFailed, setAuthFailed, showLogin } from './auth.js';
 import { addTrace } from './trace.js';
 import { addMsg, handleDelta, handleDone, setWs as setChatWs } from './chat.js';
-import { dockControls, setWs as setDashWs } from './dashboard.js';
+import { dockControls, dockArtifacts, setWs as setDashWs } from './dashboard.js';
 import { graphAnimate } from './graph.js';
 import { updateMeter, updateNodeFromHud, updateAwarenessState, updateAwarenessSensors } from './awareness.js';
 import { updateTestStatus } from './tests.js';
@@ -61,6 +61,8 @@ export function connect() {
       handleDelta(data.content);
     } else if (data.type === 'done') {
       handleDone();
+    } else if (data.type === 'artifacts') {
+      dockArtifacts(data.artifacts);
     } else if (data.type === 'controls') {
       dockControls(data.controls);
     } else if (data.type === 'cleared') {
diff --git a/static/style.css b/static/style.css
index 7466be3..0d0205a 100644
--- a/static/style.css
+++ b/static/style.css
@@ -127,6 +127,10 @@ button:hover { background: #1d4ed8; }
 .aw-row { display: flex; justify-content: space-between; padding: 0.08rem 0; }
 .aw-key { color: #888; font-size: 0.65rem; }
 .aw-val { color: #e0e0e0; font-size: 0.7rem; font-weight: 500; }
+.aw-exp-conv { color: #4caf50; }
+.aw-exp-deleg { color: #ff9800; }
+.aw-exp-wait { color: #42a5f5; }
+.aw-exp-obs { color: #9e9e9e; }
 
 /* UI Controls (workspace) */
 .controls-container { padding: 0.3rem 0; display: flex; flex-wrap: wrap; gap: 0.3rem; align-items: flex-start; }
@@ -159,6 +163,34 @@ button:hover { background: #1d4ed8; }
 .ws-card-btn { font-size: 0.7rem; padding: 0.2rem 0.5rem; }
 .ws-list { display: flex; flex-direction: column; gap: 0.3rem; width: 100%; }
 .ws-list-title { font-size: 0.75rem; font-weight: 700; color: #888; text-transform: uppercase; letter-spacing: 0.03em; margin-bottom: 0.2rem; }
+.ws-card-nested { margin: 0; border-color: #1a1a2e; }
+
+/* Artifact system */
+.artifacts-container { padding: 0.3rem 0; display: flex; flex-direction: column; gap: 0.4rem; }
+.ws-artifact { width: 100%; }
+.ws-artifact-entity { background: #111; border: 1px solid #222; border-radius: 0.4rem; padding: 0.5rem 0.6rem; }
+.ws-artifact-data_table { }
+.ws-artifact-action_bar { display: flex; flex-wrap: wrap; gap: 0.3rem; }
+.ws-artifact-status { padding: 0.25rem 0.4rem; font-size: 0.75rem; display: flex; align-items: center; gap: 0.4rem; }
+.ws-artifact-header { font-size: 0.75rem; font-weight: 600; color: #888; margin-bottom: 0.2rem; }
+.ws-artifact-fallback { font-size: 0.7rem; color: #666; font-family: monospace; white-space: pre-wrap; }
+
+/* Document page artifact */
+.ws-artifact-document_page { background: #111; border: 1px solid #222; border-radius: 0.4rem; padding: 0.8rem 1rem; }
+.ws-doc-title { font-size: 1rem; font-weight: 700; color: #e0e0e0; margin-bottom: 0.6rem; border-bottom: 1px solid #333; padding-bottom: 0.4rem; }
+.ws-doc-section { margin-bottom: 0.5rem; }
+.ws-doc-heading { font-size: 0.8rem; font-weight: 700; color: #a78bfa; margin-bottom: 0.2rem; }
+.ws-doc-content { font-size: 0.75rem; color: #ccc; line-height: 1.5; }
+.ws-doc-content ul, .ws-doc-content ol { margin: 0.2rem 0; padding-left: 1.2rem; }
+
+/* Machine artifact */
+.ws-artifact-machine { background: #111; border: 1px solid #2563eb33; border-radius: 0.4rem; padding: 0.5rem 0.6rem; }
+.ws-machine-header { display: flex; justify-content: space-between; align-items: center; margin-bottom: 0.3rem; }
+.ws-machine-name { font-size: 0.75rem; font-weight: 600; color: #a78bfa; }
+.ws-machine-state { font-size: 0.7rem; color: #60a5fa; background: #1e3a5f; padding: 0.1rem 0.4rem; border-radius: 0.2rem; }
+.ws-machine-content { font-size: 0.75rem; color: #ccc; padding: 0.1rem 0; }
+.ws-machine-data { display: flex; flex-wrap: wrap; gap: 0.3rem; margin-top: 0.2rem; }
+.ws-machine-datum { font-size: 0.65rem; color: #888; background: #1a1a2e; padding: 0.1rem 0.3rem; border-radius: 0.2rem; }
 
 /* Login overlay */
 #login-overlay { position: fixed; inset: 0; background: rgba(0,0,0,0.85); display: flex; align-items: center; justify-content: center; z-index: 1000; }
diff --git a/testcases/artifact_system.md b/testcases/artifact_system.md
new file mode 100644
index 0000000..c30ec23
--- /dev/null
+++ b/testcases/artifact_system.md
@@ -0,0 +1,33 @@
+# Artifact System
+
+Tests that the artifact rendering pipeline works end-to-end.
+Expert produces data → UINode converts to artifacts → frontend renders.
+
+## Setup
+- clear history
+
+## Steps
+
+### 1. Query produces data_table artifact
+- send: show me 3 customers in a table
+- expect_trace: has tool_call
+- expect_response: length > 10
+
+### 2. Entity detail via card
+- send: show me details for customer 1
+- expect_trace: has tool_call
+- expect_response: length > 10
+
+### 3. Action bar via buttons
+- send: create two buttons on my dashboard: Refresh and Export
+- expect_actions: length >= 2
+- expect_actions: any action contains "refresh" or "Refresh"
+
+### 4. Machine artifact
+- send: create a machine called "flow" with initial state "ready" and a state called "done"
+- expect_trace: has machine_created
+
+### 5. Query after buttons survive
+- send: how many customers are there?
+- expect_response: length > 5
+- expect_actions: any action contains "refresh" or "Refresh"
diff --git a/testcases/expectation_tracking.md b/testcases/expectation_tracking.md
new file mode 100644
index 0000000..6120aaa
--- /dev/null
+++ b/testcases/expectation_tracking.md
@@ -0,0 +1,50 @@
+# Expectation Tracking
+
+Tests that memorizer tracks user_expectation and it influences PA/Output behavior.
+Exercises machine features (update_machine, transition_machine) alongside expectation transitions.
+
+## Setup
+- clear history
+
+## Steps
+
+### 1. Greeting sets conversational
+- send: hi there!
+- expect_response: length > 2
+- expect_state: user_expectation is "conversational"
+
+### 2. Create a wizard machine
+- send: create a machine called "project" with states: planning (initial) and executing
+- expect_trace: has machine_created
+
+### 3. Delegate a task
+- send: build me a summary report of the top 5 customers by device count
+- expect_response: length > 20
+- expect_state: user_expectation is "delegated" or "observing"
+
+### 4. Ask about wizard (status check stays in flow)
+- send: what state is my project machine in?
+- expect_response: contains "planning" or "project"
+- expect_state: user_expectation is "conversational" or "delegated"
+
+### 5. Store data on machine
+- send: use update_machine to store status=in_progress on the project machine
+- expect_response: length > 5
+
+### 6. Transition machine
+- send: use transition_machine to move project to executing state
+- expect_response: length > 5
+
+### 7. Verify machine state and data
+- send: what is the current state and data of the project machine?
+- expect_response: contains "executing" or "in_progress"
+
+### 8. Short nudge triggers waiting_input
+- send: und?
+- expect_response: length > 5
+- expect_state: user_expectation is "waiting_input" or "conversational"
+
+### 9. Quick thanks (observing)
+- send: ok danke
+- expect_response: length > 0
+- expect_state: user_expectation is "observing" or "observational" or "conversational"
diff --git a/testcases/machine_pa_context.md b/testcases/machine_pa_context.md
new file mode 100644
index 0000000..192091a
--- /dev/null
+++ b/testcases/machine_pa_context.md
@@ -0,0 +1,41 @@
+# Machine State → PA Context
+
+Tests that PA reads machine state when routing, and experts can write back to machines.
+Validates: enriched machine summary, update_machine, transition_machine.
+
+## Setup
+- clear history
+
+## Steps
+
+### 1. Create a machine
+- send: create a navigation machine called "wizard" with initial state "start" and a second state called "details"
+- expect_trace: has machine_created
+
+### 2. PA sees machine in context
+- send: what machines are active on my dashboard?
+- expect_response: contains "wizard" or "start"
+
+### 3. Expert stores data on machine
+- send: use update_machine to store region=Bayern on the wizard machine
+- expect_response: contains "Bayern" or "region" or "stored" or "updated"
+
+### 4. PA sees stored data
+- send: what data is stored in my wizard machine?
+- expect_response: contains "Bayern" or "region"
+
+### 5. Expert transitions machine to details
+- send: use transition_machine to move wizard to details state
+- expect_response: length > 5
+
+### 6. PA sees updated state
+- send: what state is the wizard in now?
+- expect_response: contains "details"
+
+### 7. Expert transitions back
+- send: use transition_machine to move wizard back to start
+- expect_response: length > 5
+
+### 8. Final state check
+- send: tell me the current wizard state and stored data
+- expect_response: contains "start"
diff --git a/testcases/pa_retry.md b/testcases/pa_retry.md
new file mode 100644
index 0000000..f97dbd9
--- /dev/null
+++ b/testcases/pa_retry.md
@@ -0,0 +1,19 @@
+# PA Retry on Expert Failure
+
+Tests that when expert fails, PA reformulates and retries with a different approach.
+
+## Setup
+- clear history
+
+## Steps
+
+### 1. Complex analytical query that may need retry
+- send: Finde KWZ-Geraete mit verdaechtigen Verbrauchsspruengen - also wo der Verbrauch zwischen zwei Ablesungen stark ansteigt
+- expect_response: length > 20
+
+### 2. Verify results contain device data
+- expect_response: contains "Gera" or "gera" or "KWZ" or "kwz" or "Verbrauch" or "device"
+
+### 3. Follow up with details
+- send: zeig mir die Verbraeuche von einem dieser Geraete
+- expect_response: length > 10
diff --git a/testcases/results.json b/testcases/results.json
index 4eaf67c..fe7aa86 100644
--- a/testcases/results.json
+++ b/testcases/results.json
@@ -1,7 +1,7 @@
 {
-  "timestamp": "2026-03-29 06:04:47",
+  "timestamp": "2026-03-30 00:02:55",
   "testcases": {
-    "S3* Audit Corrections": [
+    "Artifact System": [
       {
         "step": "Setup",
         "check": "clear",
@@ -9,93 +9,355 @@
         "detail": "cleared"
       },
       {
-        "step": "Tool calls produce results (baseline)",
-        "check": "send: create two buttons: Alpha and Beta",
+        "step": "Query produces data_table artifact",
+        "check": "send: show me 3 customers in a table",
         "status": "PASS",
-        "detail": "response: 👍 Okay, I've created buttons labeled \"Alpha\" and \"Beta\".\n"
+        "detail": "response: The database contains information for three customers: Kathrin Jager, Leon Schre"
       },
       {
-        "step": "Tool calls produce results (baseline)",
-        "check": "actions: length >= 1",
-        "status": "PASS",
-        "detail": "2 actions >= 1"
-      },
-      {
-        "step": "Tool calls produce results (baseline)",
-        "check": "actions: any action contains \"alpha\" or \"Alpha\"",
-        "status": "PASS",
-        "detail": "found 'alpha' in actions"
-      },
-      {
-        "step": "Dashboard mismatch triggers re-emit",
-        "check": "send: I see nothing on my dashboard, fix it",
-        "status": "PASS",
-        "detail": "response: 👍 Done — Alpha and Beta buttons are now live on your dashboard. They should appe"
-      },
-      {
-        "step": "Dashboard mismatch triggers re-emit",
-        "check": "response: not contains \"sorry\" or \"apologize\"",
-        "status": "PASS",
-        "detail": "none of ['sorry', 'apologize'] found (as expected)"
-      },
-      {
-        "step": "Dashboard mismatch triggers re-emit",
-        "check": "actions: length >= 1",
-        "status": "PASS",
-        "detail": "2 actions >= 1"
-      },
-      {
-        "step": "DB error triggers retry with corrected SQL",
-        "check": "send: SELECT * FROM NichtExistent LIMIT 5",
-        "status": "PASS",
-        "detail": "response: Ah, it seems like the table `NichtExistent` does not exist. Double-check the tab"
-      },
-      {
-        "step": "DB error triggers retry with corrected SQL",
+        "step": "Query produces data_table artifact",
         "check": "trace: has tool_call",
         "status": "PASS",
         "detail": "found event 'tool_call'"
       },
       {
-        "step": "DB error triggers retry with corrected SQL",
-        "check": "response: not contains \"1146\"",
-        "status": "PASS",
-        "detail": "none of ['1146'] found (as expected)"
-      },
-      {
-        "step": "DB error triggers retry with corrected SQL",
+        "step": "Query produces data_table artifact",
         "check": "response: length > 10",
         "status": "PASS",
-        "detail": "length 163 > 10"
+        "detail": "length 138 > 10"
       },
       {
-        "step": "Complex request gets Director plan",
-        "check": "send: investigate which customers have the mos",
+        "step": "Entity detail via card",
+        "check": "send: show me details for customer 1",
         "status": "PASS",
-        "detail": "response: Okay, I'll look into which customers have the most devices. This might take a mo"
+        "detail": "response: ```tool_code\nquery_db({\"query\":\"SELECT * FROM customers WHERE customer_id = 1\"})"
       },
       {
-        "step": "Complex request gets Director plan",
-        "check": "trace: has director_plan",
-        "status": "FAIL",
-        "detail": "no 'director_plan' event in trace"
-      },
-      {
-        "step": "Complex request gets Director plan",
+        "step": "Entity detail via card",
         "check": "trace: has tool_call",
         "status": "PASS",
         "detail": "found event 'tool_call'"
       },
       {
-        "step": "Complex request gets Director plan",
+        "step": "Entity detail via card",
+        "check": "response: length > 10",
+        "status": "PASS",
+        "detail": "length 84 > 10"
+      },
+      {
+        "step": "Action bar via buttons",
+        "check": "send: create two buttons on my dashboard: Refr",
+        "status": "PASS",
+        "detail": "response: I have added the 'Refresh' and 'Export' buttons to your dashboard. These buttons"
+      },
+      {
+        "step": "Action bar via buttons",
+        "check": "actions: length >= 2",
+        "status": "PASS",
+        "detail": "2 actions >= 2"
+      },
+      {
+        "step": "Action bar via buttons",
+        "check": "actions: any action contains \"refresh\" or \"Refresh\"",
+        "status": "PASS",
+        "detail": "found 'refresh' in actions"
+      },
+      {
+        "step": "Machine artifact",
+        "check": "send: create a machine called \"flow\" with init",
+        "status": "PASS",
+        "detail": "response: OK, I've created a new interactive machine called 'flow' with the initial state "
+      },
+      {
+        "step": "Machine artifact",
+        "check": "trace: has machine_created",
+        "status": "PASS",
+        "detail": "found event 'machine_created'"
+      },
+      {
+        "step": "Query after buttons survive",
+        "check": "send: how many customers are there?",
+        "status": "PASS",
+        "detail": "response: There are 693 customers in the database.\n"
+      },
+      {
+        "step": "Query after buttons survive",
+        "check": "response: length > 5",
+        "status": "PASS",
+        "detail": "length 41 > 5"
+      },
+      {
+        "step": "Query after buttons survive",
+        "check": "actions: any action contains \"refresh\" or \"Refresh\"",
+        "status": "PASS",
+        "detail": "found 'refresh' in actions"
+      }
+    ],
+    "Fast v4": [
+      {
+        "step": "Setup",
+        "check": "clear",
+        "status": "PASS",
+        "detail": "cleared"
+      },
+      {
+        "step": "Reflex",
+        "check": "send: hi!",
+        "status": "PASS",
+        "detail": "response: Hey Nico! 👋 How can I help you today?\n"
+      },
+      {
+        "step": "Reflex",
+        "check": "response: length > 2",
+        "status": "PASS",
+        "detail": "length 38 > 2"
+      },
+      {
+        "step": "PA routes to expert",
+        "check": "send: show me 3 customers",
+        "status": "PASS",
+        "detail": "response: Alright, I've fetched 3 customer records for you. You can see the ID, Name detai"
+      },
+      {
+        "step": "PA routes to expert",
+        "check": "trace: has routed",
+        "status": "PASS",
+        "detail": "found event 'routed'"
+      },
+      {
+        "step": "PA routes to expert",
+        "check": "trace: has tool_call",
+        "status": "PASS",
+        "detail": "found event 'tool_call'"
+      },
+      {
+        "step": "PA routes to expert",
+        "check": "response: length > 10",
+        "status": "PASS",
+        "detail": "length 181 > 10"
+      },
+      {
+        "step": "German query",
+        "check": "send: Zeig mir alle Tabellen in der Datenbank",
+        "status": "PASS",
+        "detail": "response: Okay, ich habe eine Liste aller Tabellen in der \"eras2_production\" Datenbank abg"
+      },
+      {
+        "step": "German query",
+        "check": "trace: has tool_call",
+        "status": "PASS",
+        "detail": "found event 'tool_call'"
+      },
+      {
+        "step": "German query",
+        "check": "response: length > 10",
+        "status": "PASS",
+        "detail": "length 303 > 10"
+      },
+      {
+        "step": "Schema discovery",
+        "check": "send: describe the kunden table",
+        "status": "PASS",
+        "detail": "response: The `kunden` table stores customer information, including names, customer number"
+      },
+      {
+        "step": "Schema discovery",
+        "check": "trace: has tool_call",
+        "status": "PASS",
+        "detail": "found event 'tool_call'"
+      },
+      {
+        "step": "Schema discovery",
+        "check": "response: length > 10",
+        "status": "PASS",
+        "detail": "length 391 > 10"
+      },
+      {
+        "step": "Count query (cached schema)",
+        "check": "send: how many customers are there?",
+        "status": "PASS",
+        "detail": "response: There are 693 customers in the database.\n"
+      },
+      {
+        "step": "Count query (cached schema)",
+        "check": "trace: has tool_call",
+        "status": "PASS",
+        "detail": "found event 'tool_call'"
+      },
+      {
+        "step": "Count query (cached schema)",
+        "check": "response: length > 5",
+        "status": "PASS",
+        "detail": "length 41 > 5"
+      },
+      {
+        "step": "Complex query",
+        "check": "send: which customers have the most devices?",
+        "status": "PASS",
+        "detail": "response: The query results list the top 10 customers with the most devices. Anne Bürger h"
+      },
+      {
+        "step": "Complex query",
+        "check": "trace: has tool_call",
+        "status": "PASS",
+        "detail": "found event 'tool_call'"
+      },
+      {
+        "step": "Complex query",
         "check": "response: length > 20",
         "status": "PASS",
-        "detail": "length 86 > 20"
+        "detail": "length 166 > 20"
+      },
+      {
+        "step": "Error recovery",
+        "check": "send: SELECT * FROM nichtexistiert",
+        "status": "PASS",
+        "detail": "response: Oops! It seems like the table `nichtexistiert` doesn't exist in the database. 🤔 "
+      },
+      {
+        "step": "Error recovery",
+        "check": "trace: has tool_call",
+        "status": "PASS",
+        "detail": "found event 'tool_call'"
+      },
+      {
+        "step": "Error recovery",
+        "check": "response: length > 10",
+        "status": "PASS",
+        "detail": "length 396 > 10"
+      },
+      {
+        "step": "Memorizer",
+        "check": "send: my name is Nico",
+        "status": "PASS",
+        "detail": "response: Thanks, Nico! I'll remember that. Do you have any other questions? 😊\n"
+      },
+      {
+        "step": "Memorizer",
+        "check": "state: facts any contains \"Nico\"",
+        "status": "PASS",
+        "detail": "found 'Nico' in facts"
+      },
+      {
+        "step": "Language switch",
+        "check": "send: Hallo wie gehts?",
+        "status": "PASS",
+        "detail": "response: Mir geht es gut, danke der Nachfrage, Nico! Und selbst? Gibt es etwas, bei dem i"
+      },
+      {
+        "step": "Language switch",
+        "check": "state: language is \"de\" or \"mixed\"",
+        "status": "PASS",
+        "detail": "language=mixed"
+      },
+      {
+        "step": "Bye",
+        "check": "send: ok bye",
+        "status": "PASS",
+        "detail": "response: Bye Nico! 👋 If you need anything else, just let me know. 😊\n"
+      },
+      {
+        "step": "Bye",
+        "check": "response: length > 2",
+        "status": "PASS",
+        "detail": "length 59 > 2"
+      }
+    ],
+    "Dashboard Integration": [
+      {
+        "step": "Setup",
+        "check": "clear",
+        "status": "PASS",
+        "detail": "cleared"
+      },
+      {
+        "step": "Expert creates buttons",
+        "check": "send: create two buttons on my dashboard: Repo",
+        "status": "PASS",
+        "detail": "response: I have added 'Report' and 'Export' buttons to your dashboard.\n\n(UI buttons shown"
+      },
+      {
+        "step": "Expert creates buttons",
+        "check": "actions: length >= 2",
+        "status": "PASS",
+        "detail": "3 actions >= 2"
+      },
+      {
+        "step": "Expert creates buttons",
+        "check": "actions: any action contains \"report\" or \"Report\"",
+        "status": "PASS",
+        "detail": "found 'report' in actions"
+      },
+      {
+        "step": "Buttons survive a query",
+        "check": "send: how many customers are there?",
+        "status": "PASS",
+        "detail": "response: I'm running a query to count all customer IDs. One moment...\n"
+      },
+      {
+        "step": "Buttons survive a query",
+        "check": "response: length > 5",
+        "status": "PASS",
+        "detail": "length 61 > 5"
+      },
+      {
+        "step": "Buttons survive a query",
+        "check": "actions: any action contains \"report\" or \"Report\"",
+        "status": "PASS",
+        "detail": "found 'report' in actions"
+      },
+      {
+        "step": "Expert creates a machine",
+        "check": "send: create a navigation machine called \"work",
+        "status": "PASS",
+        "detail": "response: I've created the 'workflow' machine with 'start' and 'step2' states. The 'start'"
+      },
+      {
+        "step": "Expert creates a machine",
+        "check": "trace: has tool_call create_machine",
+        "status": "PASS",
+        "detail": "found create_machine via machine_created event"
+      },
+      {
+        "step": "Expert shows data table",
+        "check": "send: show me 5 customers in a table",
+        "status": "PASS",
+        "detail": "response: Here are five customer entries with their IDs, names, object count, and status:\n"
+      },
+      {
+        "step": "Expert shows data table",
+        "check": "trace: has tool_call",
+        "status": "PASS",
+        "detail": "found event 'tool_call'"
+      },
+      {
+        "step": "Expert shows data table",
+        "check": "response: length > 10",
+        "status": "PASS",
+        "detail": "length 118 > 10"
+      },
+      {
+        "step": "Expert replaces buttons",
+        "check": "send: remove all buttons and create one button",
+        "status": "PASS",
+        "detail": "response: I have removed the existing 'Report' and 'Export' buttons from the dashboard and"
+      },
+      {
+        "step": "Expert replaces buttons",
+        "check": "actions: length >= 1",
+        "status": "PASS",
+        "detail": "2 actions >= 1"
+      },
+      {
+        "step": "Expert replaces buttons",
+        "check": "actions: any action contains \"reset\" or \"Reset\"",
+        "status": "PASS",
+        "detail": "found 'reset' in actions"
       }
     ]
   },
   "summary": {
-    "passed": 14,
-    "failed": 1
+    "passed": 58,
+    "failed": 0
   }
 }
\ No newline at end of file
diff --git a/testcases/results_v1.json b/testcases/results_v1.json
new file mode 100644
index 0000000..d7b94fa
--- /dev/null
+++ b/testcases/results_v1.json
@@ -0,0 +1,1033 @@
+{
+  "timestamp": "2026-03-29 06:11:18",
+  "testcases": {
+    "Button Persistence": [
+      {
+        "step": "Setup",
+        "check": "clear",
+        "status": "PASS",
+        "detail": "cleared"
+      },
+      {
+        "step": "Create buttons",
+        "check": "send: create two buttons: Poodle Bark and Bolo",
+        "status": "PASS",
+        "detail": "response: Okay, I've created two buttons for you: \"Poodle Bark\" and \"Bolonka Bark\". 🐶 \n"
+      },
+      {
+        "step": "Create buttons",
+        "check": "actions: length >= 2",
+        "status": "PASS",
+        "detail": "2 actions >= 2"
+      },
+      {
+        "step": "Create buttons",
+        "check": "actions: any action contains \"poodle\" or \"Poodle\"",
+        "status": "PASS",
+        "detail": "found 'poodle' in actions"
+      },
+      {
+        "step": "Create buttons",
+        "check": "actions: any action contains \"bolonka\" or \"Bolonka\"",
+        "status": "PASS",
+        "detail": "found 'bolonka' in actions"
+      },
+      {
+        "step": "Ask unrelated question (buttons must survive)",
+        "check": "send: what time is it?",
+        "status": "PASS",
+        "detail": "response: The current time is 6:08 AM. ⏰\n"
+      },
+      {
+        "step": "Ask unrelated question (buttons must survive)",
+        "check": "response: contains \":\" or \"time\" or \"clock\"",
+        "status": "PASS",
+        "detail": "found ':'"
+      },
+      {
+        "step": "Ask unrelated question (buttons must survive)",
+        "check": "actions: any action contains \"poodle\" or \"Poodle\"",
+        "status": "PASS",
+        "detail": "found 'poodle' in actions"
+      },
+      {
+        "step": "Ask unrelated question (buttons must survive)",
+        "check": "actions: any action contains \"bolonka\" or \"Bolonka\"",
+        "status": "PASS",
+        "detail": "found 'bolonka' in actions"
+      },
+      {
+        "step": "Ask another question (buttons still there)",
+        "check": "send: say hello in German",
+        "status": "PASS",
+        "detail": "response: Hallo! 👋\n"
+      },
+      {
+        "step": "Ask another question (buttons still there)",
+        "check": "response: contains \"Hallo\" or \"hallo\" or \"German\"",
+        "status": "PASS",
+        "detail": "found 'Hallo'"
+      },
+      {
+        "step": "Ask another question (buttons still there)",
+        "check": "actions: any action contains \"poodle\" or \"Poodle\"",
+        "status": "PASS",
+        "detail": "found 'poodle' in actions"
+      },
+      {
+        "step": "Explicitly replace buttons",
+        "check": "send: remove all buttons and create one button",
+        "status": "PASS",
+        "detail": "response: You got it! I've removed the existing buttons and created a single button labele"
+      },
+      {
+        "step": "Explicitly replace buttons",
+        "check": "actions: length >= 1",
+        "status": "PASS",
+        "detail": "1 actions >= 1"
+      },
+      {
+        "step": "Explicitly replace buttons",
+        "check": "actions: any action contains \"reset\" or \"Reset\"",
+        "status": "PASS",
+        "detail": "found 'reset' in actions"
+      }
+    ],
+    "Counter State": [
+      {
+        "step": "Setup",
+        "check": "clear",
+        "status": "PASS",
+        "detail": "cleared"
+      },
+      {
+        "step": "Create counter",
+        "check": "send: create a counter starting at 0 with incr",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Create counter",
+        "check": "response: contains \"counter\" or \"count\"",
+        "status": "FAIL",
+        "detail": "none of ['counter', 'count'] found in: "
+      },
+      {
+        "step": "Create counter",
+        "check": "actions: length >= 2",
+        "status": "FAIL",
+        "detail": "1 actions < 2"
+      },
+      {
+        "step": "Create counter",
+        "check": "actions: any action contains \"increment\" or \"inc\" or \"plus\" or \"add\"",
+        "status": "FAIL",
+        "detail": "none of ['increment', 'inc', 'plus', 'add'] found in 1 buttons"
+      },
+      {
+        "step": "Create counter",
+        "check": "actions: any action contains \"decrement\" or \"dec\" or \"minus\" or \"sub\"",
+        "status": "FAIL",
+        "detail": "none of ['decrement', 'dec', 'minus', 'sub'] found in 1 buttons"
+      },
+      {
+        "step": "Check state",
+        "check": "state: topic contains \"counter\" or \"count\" or \"button\"",
+        "status": "FAIL",
+        "detail": "topic=UI creation doesn't contain any of ['counter', 'count', 'button']"
+      },
+      {
+        "step": "Ask for current value",
+        "check": "send: what is the current count?",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Ask for current value",
+        "check": "response: contains \"0\" or \"zero\"",
+        "status": "FAIL",
+        "detail": "none of ['0', 'zero'] found in: "
+      },
+      {
+        "step": "Increment",
+        "check": "action matching 'inc'",
+        "status": "FAIL",
+        "detail": "no action matching 'inc' in ['reset']"
+      },
+      {
+        "step": "Increment",
+        "check": "response: contains \"1\" or \"one\" or \"increment\" or \"Navigated\"",
+        "status": "FAIL",
+        "detail": "none of ['1', 'one', 'increment', 'Navigated'] found in: "
+      },
+      {
+        "step": "Increment again",
+        "check": "action matching 'inc'",
+        "status": "FAIL",
+        "detail": "no action matching 'inc' in ['reset']"
+      },
+      {
+        "step": "Increment again",
+        "check": "response: contains \"2\" or \"two\" or \"increment\" or \"Navigated\"",
+        "status": "FAIL",
+        "detail": "none of ['2', 'two', 'increment', 'Navigated'] found in: "
+      },
+      {
+        "step": "Decrement",
+        "check": "action matching 'dec'",
+        "status": "FAIL",
+        "detail": "no action matching 'dec' in ['reset']"
+      },
+      {
+        "step": "Decrement",
+        "check": "response: contains \"1\" or \"one\" or \"decrement\" or \"Navigated\"",
+        "status": "FAIL",
+        "detail": "none of ['1', 'one', 'decrement', 'Navigated'] found in: "
+      },
+      {
+        "step": "Verify memorizer tracks it",
+        "check": "state: topic contains \"count\"",
+        "status": "FAIL",
+        "detail": "topic=UI creation doesn't contain any of ['count']"
+      }
+    ],
+    "DB Exploration": [
+      {
+        "step": "Setup",
+        "check": "clear",
+        "status": "PASS",
+        "detail": "cleared"
+      },
+      {
+        "step": "Query renders table in workspace",
+        "check": "send: show me 5 customers from the database",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Query renders table in workspace",
+        "check": "trace: has tool_call",
+        "status": "FAIL",
+        "detail": "no 'tool_call' event in trace"
+      },
+      {
+        "step": "Query renders table in workspace",
+        "check": "actions: has table",
+        "status": "FAIL",
+        "detail": "no table in 1 controls"
+      },
+      {
+        "step": "Query renders table in workspace",
+        "check": "response: not contains \"---|\" or \"| ID\"",
+        "status": "PASS",
+        "detail": "none of ['---|', '| ID'] found (as expected)"
+      },
+      {
+        "step": "Chat summarizes, does not dump data",
+        "check": "response: contains \"customer\" or \"Kunde\" or \"5\" or \"table\"",
+        "status": "FAIL",
+        "detail": "none of ['customer', 'Kunde', '5', 'table'] found in: "
+      },
+      {
+        "step": "Chat summarizes, does not dump data",
+        "check": "response: length > 10",
+        "status": "FAIL",
+        "detail": "length 0 <= 10"
+      },
+      {
+        "step": "Thinker builds exploration UI (not describes it)",
+        "check": "send: select customer 2 Kathrin Jager, add but",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Thinker builds exploration UI (not describes it)",
+        "check": "actions: length >= 1",
+        "status": "PASS",
+        "detail": "1 actions >= 1"
+      },
+      {
+        "step": "Thinker builds exploration UI (not describes it)",
+        "check": "response: not contains \"UI team\" or \"will add\" or \"will create\"",
+        "status": "PASS",
+        "detail": "none of ['UI team', 'will add', 'will create'] found (as expected)"
+      },
+      {
+        "step": "Error recovery on bad query",
+        "check": "send: SELECT * FROM nichtexistiert LIMIT 5",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Error recovery on bad query",
+        "check": "trace: has tool_call",
+        "status": "FAIL",
+        "detail": "no 'tool_call' event in trace"
+      },
+      {
+        "step": "Error recovery on bad query",
+        "check": "response: not contains \"1146\"",
+        "status": "PASS",
+        "detail": "none of ['1146'] found (as expected)"
+      },
+      {
+        "step": "Error recovery on bad query",
+        "check": "response: length > 10",
+        "status": "FAIL",
+        "detail": "length 0 <= 10"
+      }
+    ],
+    "Director Node": [
+      {
+        "step": "Setup",
+        "check": "clear",
+        "status": "PASS",
+        "detail": "cleared"
+      },
+      {
+        "step": "Casual chat establishes mode",
+        "check": "send: hey, just hanging out, what's up?",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Casual chat establishes mode",
+        "check": "response: length > 5",
+        "status": "FAIL",
+        "detail": "length 0 <= 5"
+      },
+      {
+        "step": "Casual chat establishes mode",
+        "check": "trace: has director_updated",
+        "status": "PASS",
+        "detail": "found event 'director_updated'"
+      },
+      {
+        "step": "Director picks up frustration",
+        "check": "send: ugh this is so annoying, nothing makes s",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Director picks up frustration",
+        "check": "response: length > 10",
+        "status": "FAIL",
+        "detail": "length 0 <= 10"
+      },
+      {
+        "step": "Director picks up frustration",
+        "check": "trace: has director_updated",
+        "status": "PASS",
+        "detail": "found event 'director_updated'"
+      },
+      {
+        "step": "Switch to building mode",
+        "check": "send: ok let's build a todo list app",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Switch to building mode",
+        "check": "response: length > 10",
+        "status": "FAIL",
+        "detail": "length 0 <= 10"
+      },
+      {
+        "step": "Switch to building mode",
+        "check": "trace: has director_updated",
+        "status": "PASS",
+        "detail": "found event 'director_updated'"
+      }
+    ],
+    "Pub Conversation": [
+      {
+        "step": "Setup",
+        "check": "clear",
+        "status": "PASS",
+        "detail": "cleared"
+      },
+      {
+        "step": "Set the scene",
+        "check": "send: Hey, Alice and I are heading to the pub ",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Set the scene",
+        "check": "response: length > 10",
+        "status": "FAIL",
+        "detail": "length 0 <= 10"
+      },
+      {
+        "step": "Set the scene",
+        "check": "state: situation contains \"pub\" or \"Alice\"",
+        "status": "FAIL",
+        "detail": "situation=local session doesn't contain any of ['pub', 'Alice']"
+      },
+      {
+        "step": "Language switch to German",
+        "check": "send: Wir sind jetzt im Biergarten angekommen",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Language switch to German",
+        "check": "response: length > 10",
+        "status": "FAIL",
+        "detail": "length 0 <= 10"
+      },
+      {
+        "step": "Language switch to German",
+        "check": "state: language is \"de\" or \"mixed\"",
+        "status": "PASS",
+        "detail": "language=mixed"
+      },
+      {
+        "step": "Context awareness",
+        "check": "send: Was sollen wir bestellen?",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Context awareness",
+        "check": "response: length > 10",
+        "status": "FAIL",
+        "detail": "length 0 <= 10"
+      },
+      {
+        "step": "Context awareness",
+        "check": "state: topic contains \"bestell\" or \"order\" or \"pub\" or \"Biergarten\"",
+        "status": "FAIL",
+        "detail": "topic=UI creation doesn't contain any of ['bestell', 'order', 'pub', 'Biergarten']"
+      },
+      {
+        "step": "Alice speaks",
+        "check": "send: Alice says: I'll have a Hefeweizen pleas",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Alice speaks",
+        "check": "response: length > 10",
+        "status": "FAIL",
+        "detail": "length 0 <= 10"
+      },
+      {
+        "step": "Alice speaks",
+        "check": "state: facts any contains \"Alice\" or \"Hefeweizen\"",
+        "status": "FAIL",
+        "detail": "none of ['Alice', 'Hefeweizen'] found in facts: []"
+      },
+      {
+        "step": "Ask for time (tool use)",
+        "check": "send: wie spaet ist es eigentlich?",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Ask for time (tool use)",
+        "check": "response: matches \\d{1,2}:\\d{2}",
+        "status": "FAIL",
+        "detail": "/\\d{1,2}:\\d{2}/ not found in: "
+      },
+      {
+        "step": "Back to English",
+        "check": "send: Let's switch to English, what was the la",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Back to English",
+        "check": "state: language is \"en\" or \"mixed\"",
+        "status": "PASS",
+        "detail": "language=mixed"
+      },
+      {
+        "step": "Back to English",
+        "check": "response: contains \"Alice\" or \"Hefeweizen\"",
+        "status": "FAIL",
+        "detail": "none of ['Alice', 'Hefeweizen'] found in: "
+      },
+      {
+        "step": "Mood check",
+        "check": "send: This is really fun!",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Mood check",
+        "check": "state: user_mood is \"happy\" or \"playful\" or \"excited\"",
+        "status": "FAIL",
+        "detail": "user_mood=neutral not in ['happy', 'playful', 'excited']"
+      }
+    ],
+    "Reflex Path": [
+      {
+        "step": "Setup",
+        "check": "clear",
+        "status": "PASS",
+        "detail": "cleared"
+      },
+      {
+        "step": "Greeting triggers reflex",
+        "check": "send: hey!",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Greeting triggers reflex",
+        "check": "response: length > 2",
+        "status": "FAIL",
+        "detail": "length 0 <= 2"
+      },
+      {
+        "step": "Greeting triggers reflex",
+        "check": "trace: has reflex_path",
+        "status": "FAIL",
+        "detail": "no 'reflex_path' event in trace"
+      },
+      {
+        "step": "Thanks triggers reflex",
+        "check": "send: thanks",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Thanks triggers reflex",
+        "check": "response: length > 2",
+        "status": "FAIL",
+        "detail": "length 0 <= 2"
+      },
+      {
+        "step": "Thanks triggers reflex",
+        "check": "trace: has reflex_path",
+        "status": "FAIL",
+        "detail": "no 'reflex_path' event in trace"
+      },
+      {
+        "step": "Complex request does NOT trigger reflex",
+        "check": "send: explain how neural networks work in deta",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Complex request does NOT trigger reflex",
+        "check": "response: length > 20",
+        "status": "FAIL",
+        "detail": "length 0 <= 20"
+      },
+      {
+        "step": "Complex request does NOT trigger reflex",
+        "check": "trace: input.analysis.intent is \"question\" or \"request\"",
+        "status": "PASS",
+        "detail": "input.analysis.intent=request"
+      },
+      {
+        "step": "Complex request does NOT trigger reflex",
+        "check": "trace: has decided",
+        "status": "FAIL",
+        "detail": "no 'decided' event in trace"
+      }
+    ],
+    "S3* Audit Corrections": [
+      {
+        "step": "Setup",
+        "check": "clear",
+        "status": "PASS",
+        "detail": "cleared"
+      },
+      {
+        "step": "Tool calls produce results (baseline)",
+        "check": "send: create two buttons: Alpha and Beta",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Tool calls produce results (baseline)",
+        "check": "actions: length >= 1",
+        "status": "PASS",
+        "detail": "1 actions >= 1"
+      },
+      {
+        "step": "Tool calls produce results (baseline)",
+        "check": "actions: any action contains \"alpha\" or \"Alpha\"",
+        "status": "FAIL",
+        "detail": "none of ['alpha', 'Alpha'] found in 1 buttons"
+      },
+      {
+        "step": "Dashboard mismatch triggers re-emit",
+        "check": "send: I see nothing on my dashboard, fix it",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Dashboard mismatch triggers re-emit",
+        "check": "response: not contains \"sorry\" or \"apologize\"",
+        "status": "PASS",
+        "detail": "none of ['sorry', 'apologize'] found (as expected)"
+      },
+      {
+        "step": "Dashboard mismatch triggers re-emit",
+        "check": "actions: length >= 1",
+        "status": "PASS",
+        "detail": "1 actions >= 1"
+      },
+      {
+        "step": "DB error triggers retry with corrected SQL",
+        "check": "send: SELECT * FROM NichtExistent LIMIT 5",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "DB error triggers retry with corrected SQL",
+        "check": "trace: has tool_call",
+        "status": "FAIL",
+        "detail": "no 'tool_call' event in trace"
+      },
+      {
+        "step": "DB error triggers retry with corrected SQL",
+        "check": "response: not contains \"1146\"",
+        "status": "PASS",
+        "detail": "none of ['1146'] found (as expected)"
+      },
+      {
+        "step": "DB error triggers retry with corrected SQL",
+        "check": "response: length > 10",
+        "status": "FAIL",
+        "detail": "length 0 <= 10"
+      },
+      {
+        "step": "Complex request gets Director plan",
+        "check": "send: investigate which customers have the mos",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Complex request gets Director plan",
+        "check": "trace: has director_plan",
+        "status": "FAIL",
+        "detail": "no 'director_plan' event in trace"
+      },
+      {
+        "step": "Complex request gets Director plan",
+        "check": "trace: has tool_call",
+        "status": "FAIL",
+        "detail": "no 'tool_call' event in trace"
+      },
+      {
+        "step": "Complex request gets Director plan",
+        "check": "response: length > 20",
+        "status": "FAIL",
+        "detail": "length 0 <= 20"
+      }
+    ],
+    "State Machines": [
+      {
+        "step": "Setup",
+        "check": "clear",
+        "status": "PASS",
+        "detail": "cleared"
+      },
+      {
+        "step": "Create a machine",
+        "check": "send: create a navigation machine called \"nav\"",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Create a machine",
+        "check": "trace: has tool_call create_machine",
+        "status": "FAIL",
+        "detail": "no tool_call 'create_machine' in trace"
+      },
+      {
+        "step": "Create a machine",
+        "check": "trace: machine_created id=\"nav\"",
+        "status": "FAIL",
+        "detail": "no machine_created event with id='nav'"
+      },
+      {
+        "step": "Verify machine renders",
+        "check": "send: what machines are on my dashboard?",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Verify machine renders",
+        "check": "response: contains \"nav\" or \"machine\"",
+        "status": "FAIL",
+        "detail": "none of ['nav', 'machine'] found in: "
+      },
+      {
+        "step": "Navigate via button click (local transition)",
+        "check": "action matching 'menu_1'",
+        "status": "FAIL",
+        "detail": "no action matching 'menu_1' in ['reset']"
+      },
+      {
+        "step": "Navigate via button click (local transition)",
+        "check": "trace: has machine_transition",
+        "status": "FAIL",
+        "detail": "no 'machine_transition' event in trace"
+      },
+      {
+        "step": "Navigate via button click (local transition)",
+        "check": "trace: no thinker",
+        "status": "PASS",
+        "detail": "no 'thinker' event (as expected)"
+      },
+      {
+        "step": "Add a state to existing machine",
+        "check": "send: add a state \"sub3\" to the nav machine wi",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Add a state to existing machine",
+        "check": "trace: has tool_call add_state",
+        "status": "FAIL",
+        "detail": "no tool_call 'add_state' in trace"
+      },
+      {
+        "step": "Reset machine",
+        "check": "send: reset the nav machine to its initial sta",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Reset machine",
+        "check": "trace: has tool_call reset_machine",
+        "status": "FAIL",
+        "detail": "no tool_call 'reset_machine' in trace"
+      },
+      {
+        "step": "Reset machine",
+        "check": "response: contains \"main\" or \"reset\" or \"initial\"",
+        "status": "FAIL",
+        "detail": "none of ['main', 'reset', 'initial'] found in: "
+      },
+      {
+        "step": "Create second machine alongside first",
+        "check": "send: create a counter machine called \"clicks\"",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Create second machine alongside first",
+        "check": "trace: has tool_call create_machine",
+        "status": "FAIL",
+        "detail": "no tool_call 'create_machine' in trace"
+      },
+      {
+        "step": "Create second machine alongside first",
+        "check": "trace: machine_created id=\"clicks\"",
+        "status": "FAIL",
+        "detail": "no machine_created event with id='clicks'"
+      },
+      {
+        "step": "Both machines coexist",
+        "check": "send: what machines are running?",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Both machines coexist",
+        "check": "response: contains \"nav\"",
+        "status": "FAIL",
+        "detail": "none of ['nav'] found in: "
+      },
+      {
+        "step": "Both machines coexist",
+        "check": "response: contains \"click\"",
+        "status": "FAIL",
+        "detail": "none of ['click'] found in: "
+      },
+      {
+        "step": "Destroy one machine",
+        "check": "send: destroy the clicks machine",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Destroy one machine",
+        "check": "trace: has tool_call destroy_machine",
+        "status": "FAIL",
+        "detail": "no tool_call 'destroy_machine' in trace"
+      },
+      {
+        "step": "Destroy one machine",
+        "check": "send: what machines are running?",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Destroy one machine",
+        "check": "response: contains \"nav\"",
+        "status": "FAIL",
+        "detail": "none of ['nav'] found in: "
+      }
+    ],
+    "Structured Input Analysis": [
+      {
+        "step": "Setup",
+        "check": "clear",
+        "status": "PASS",
+        "detail": "cleared"
+      },
+      {
+        "step": "Social greeting",
+        "check": "send: hi there!",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Social greeting",
+        "check": "response: length > 3",
+        "status": "FAIL",
+        "detail": "length 0 <= 3"
+      },
+      {
+        "step": "Social greeting",
+        "check": "trace: input.analysis.intent is \"social\"",
+        "status": "FAIL",
+        "detail": "input.analysis.intent=request, expected one of ['social']"
+      },
+      {
+        "step": "Social greeting",
+        "check": "trace: input.analysis.complexity is \"trivial\"",
+        "status": "FAIL",
+        "detail": "input.analysis.complexity=simple, expected one of ['trivial']"
+      },
+      {
+        "step": "Simple request",
+        "check": "send: create a counter starting at 0",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Simple request",
+        "check": "response: length > 10",
+        "status": "FAIL",
+        "detail": "length 0 <= 10"
+      },
+      {
+        "step": "Simple request",
+        "check": "trace: input.analysis.intent is \"request\" or \"action\"",
+        "status": "PASS",
+        "detail": "input.analysis.intent=request"
+      },
+      {
+        "step": "Simple request",
+        "check": "trace: input.analysis.complexity is \"simple\" or \"complex\"",
+        "status": "PASS",
+        "detail": "input.analysis.complexity=simple"
+      },
+      {
+        "step": "German question",
+        "check": "send: Wie spaet ist es?",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "German question",
+        "check": "response: length > 5",
+        "status": "FAIL",
+        "detail": "length 0 <= 5"
+      },
+      {
+        "step": "German question",
+        "check": "trace: input.analysis.language is \"de\"",
+        "status": "FAIL",
+        "detail": "input.analysis.language=en, expected one of ['de']"
+      },
+      {
+        "step": "German question",
+        "check": "trace: input.analysis.intent is \"question\"",
+        "status": "FAIL",
+        "detail": "input.analysis.intent=request, expected one of ['question']"
+      },
+      {
+        "step": "Frustrated tone",
+        "check": "send: this is broken, nothing works and I'm si",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Frustrated tone",
+        "check": "response: length > 10",
+        "status": "FAIL",
+        "detail": "length 0 <= 10"
+      },
+      {
+        "step": "Frustrated tone",
+        "check": "trace: input.analysis.tone is \"frustrated\" or \"urgent\"",
+        "status": "FAIL",
+        "detail": "input.analysis.tone=casual, expected one of ['frustrated', 'urgent']"
+      },
+      {
+        "step": "Simple acknowledgment",
+        "check": "send: ok thanks bye",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Simple acknowledgment",
+        "check": "trace: input.analysis.intent is \"social\"",
+        "status": "FAIL",
+        "detail": "input.analysis.intent=request, expected one of ['social']"
+      },
+      {
+        "step": "Simple acknowledgment",
+        "check": "trace: input.analysis.complexity is \"trivial\"",
+        "status": "FAIL",
+        "detail": "input.analysis.complexity=simple, expected one of ['trivial']"
+      }
+    ],
+    "Dashboard Feedback (S3*)": [
+      {
+        "step": "Setup",
+        "check": "clear",
+        "status": "PASS",
+        "detail": "cleared"
+      },
+      {
+        "step": "Thinker sees buttons in dashboard",
+        "check": "send: create two buttons: hello and world",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Thinker sees buttons in dashboard",
+        "check": "actions: length >= 2",
+        "status": "FAIL",
+        "detail": "1 actions < 2"
+      },
+      {
+        "step": "Thinker sees buttons in dashboard",
+        "check": "send: what buttons can you see in my dashboard",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Thinker sees buttons in dashboard",
+        "check": "response: contains \"Hello\" or \"hello\"",
+        "status": "FAIL",
+        "detail": "none of ['Hello', 'hello'] found in: "
+      },
+      {
+        "step": "Thinker sees buttons in dashboard",
+        "check": "response: contains \"World\" or \"world\"",
+        "status": "FAIL",
+        "detail": "none of ['World', 'world'] found in: "
+      },
+      {
+        "step": "Thinker detects empty dashboard",
+        "check": "send: I see nothing in my dashboard, what happ",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Thinker detects empty dashboard",
+        "check": "response: contains \"button\" or \"fix\" or \"restore\" or \"create\" or \"empty\"",
+        "status": "FAIL",
+        "detail": "none of ['button', 'fix', 'restore', 'create', 'empty'] found in: "
+      },
+      {
+        "step": "Dashboard state flows to thinker context",
+        "check": "send: create a counter starting at 5",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Dashboard state flows to thinker context",
+        "check": "actions: length >= 1",
+        "status": "PASS",
+        "detail": "1 actions >= 1"
+      },
+      {
+        "step": "Dashboard state flows to thinker context",
+        "check": "send: what does my dashboard show?",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Dashboard state flows to thinker context",
+        "check": "response: contains \"5\" or \"count\"",
+        "status": "FAIL",
+        "detail": "none of ['5', 'count'] found in: "
+      }
+    ],
+    "Dashboard Mismatch Recovery": [
+      {
+        "step": "Setup",
+        "check": "clear",
+        "status": "PASS",
+        "detail": "cleared"
+      },
+      {
+        "step": "Create buttons",
+        "check": "send: create two buttons: red and blue",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Create buttons",
+        "check": "actions: length >= 2",
+        "status": "FAIL",
+        "detail": "1 actions < 2"
+      },
+      {
+        "step": "Dashboard empty — Thinker re-emits",
+        "check": "send: I clicked red but nothing happened",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Dashboard empty — Thinker re-emits",
+        "check": "response: contains \"button\" or \"red\" or \"blue\"",
+        "status": "FAIL",
+        "detail": "none of ['button', 'red', 'blue'] found in: "
+      },
+      {
+        "step": "Dashboard empty — Thinker re-emits",
+        "check": "actions: length >= 1",
+        "status": "PASS",
+        "detail": "1 actions >= 1"
+      },
+      {
+        "step": "Create counter",
+        "check": "send: create a counter starting at 0",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Create counter",
+        "check": "actions: length >= 1",
+        "status": "PASS",
+        "detail": "1 actions >= 1"
+      },
+      {
+        "step": "Counter missing from dashboard — Thinker recovers",
+        "check": "send: the dashboard is broken, I only see old ",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Counter missing from dashboard — Thinker recovers",
+        "check": "response: contains \"counter\" or \"count\" or \"fix\" or \"recreat\" or \"refresh\" or \"button\" or \"update\"",
+        "status": "FAIL",
+        "detail": "none of ['counter', 'count', 'fix', 'recreat', 'refresh', 'button', 'update'] found in: "
+      },
+      {
+        "step": "Counter missing from dashboard — Thinker recovers",
+        "check": "actions: length >= 1",
+        "status": "PASS",
+        "detail": "1 actions >= 1"
+      }
+    ]
+  },
+  "summary": {
+    "passed": 90,
+    "failed": 77
+  }
+}
\ No newline at end of file
diff --git a/testcases/results_v2.json b/testcases/results_v2.json
new file mode 100644
index 0000000..57121f6
--- /dev/null
+++ b/testcases/results_v2.json
@@ -0,0 +1,1033 @@
+{
+  "timestamp": "2026-03-29 12:22:42",
+  "testcases": {
+    "Button Persistence": [
+      {
+        "step": "Setup",
+        "check": "clear",
+        "status": "PASS",
+        "detail": "cleared"
+      },
+      {
+        "step": "Create buttons",
+        "check": "send: create two buttons: Poodle Bark and Bolo",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Create buttons",
+        "check": "actions: length >= 2",
+        "status": "FAIL",
+        "detail": "0 actions < 2"
+      },
+      {
+        "step": "Create buttons",
+        "check": "actions: any action contains \"poodle\" or \"Poodle\"",
+        "status": "FAIL",
+        "detail": "none of ['poodle', 'Poodle'] found in 0 buttons"
+      },
+      {
+        "step": "Create buttons",
+        "check": "actions: any action contains \"bolonka\" or \"Bolonka\"",
+        "status": "FAIL",
+        "detail": "none of ['bolonka', 'Bolonka'] found in 0 buttons"
+      },
+      {
+        "step": "Ask unrelated question (buttons must survive)",
+        "check": "send: what time is it?",
+        "status": "PASS",
+        "detail": "response: Alright, I've created two bark buttons for you: Poodle Bark and Bolonka Bark. 🐶 "
+      },
+      {
+        "step": "Ask unrelated question (buttons must survive)",
+        "check": "response: contains \":\" or \"time\" or \"clock\"",
+        "status": "PASS",
+        "detail": "found ':'"
+      },
+      {
+        "step": "Ask unrelated question (buttons must survive)",
+        "check": "actions: any action contains \"poodle\" or \"Poodle\"",
+        "status": "PASS",
+        "detail": "found 'poodle' in actions"
+      },
+      {
+        "step": "Ask unrelated question (buttons must survive)",
+        "check": "actions: any action contains \"bolonka\" or \"Bolonka\"",
+        "status": "PASS",
+        "detail": "found 'bolonka' in actions"
+      },
+      {
+        "step": "Ask another question (buttons still there)",
+        "check": "send: say hello in German",
+        "status": "PASS",
+        "detail": "response: Hallo! Or, if you prefer something more formal, Guten Tag!\n"
+      },
+      {
+        "step": "Ask another question (buttons still there)",
+        "check": "response: contains \"Hallo\" or \"hallo\" or \"German\"",
+        "status": "PASS",
+        "detail": "found 'Hallo'"
+      },
+      {
+        "step": "Ask another question (buttons still there)",
+        "check": "actions: any action contains \"poodle\" or \"Poodle\"",
+        "status": "PASS",
+        "detail": "found 'poodle' in actions"
+      },
+      {
+        "step": "Explicitly replace buttons",
+        "check": "send: remove all buttons and create one button",
+        "status": "PASS",
+        "detail": "response: All buttons have been removed, and a new button called \"Reset\" has been created."
+      },
+      {
+        "step": "Explicitly replace buttons",
+        "check": "actions: length >= 1",
+        "status": "PASS",
+        "detail": "1 actions >= 1"
+      },
+      {
+        "step": "Explicitly replace buttons",
+        "check": "actions: any action contains \"reset\" or \"Reset\"",
+        "status": "PASS",
+        "detail": "found 'reset' in actions"
+      }
+    ],
+    "Counter State": [
+      {
+        "step": "Setup",
+        "check": "clear",
+        "status": "PASS",
+        "detail": "cleared"
+      },
+      {
+        "step": "Create counter",
+        "check": "send: create a counter starting at 0 with incr",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Create counter",
+        "check": "response: contains \"counter\" or \"count\"",
+        "status": "FAIL",
+        "detail": "none of ['counter', 'count'] found in: "
+      },
+      {
+        "step": "Create counter",
+        "check": "actions: length >= 2",
+        "status": "FAIL",
+        "detail": "1 actions < 2"
+      },
+      {
+        "step": "Create counter",
+        "check": "actions: any action contains \"increment\" or \"inc\" or \"plus\" or \"add\"",
+        "status": "FAIL",
+        "detail": "none of ['increment', 'inc', 'plus', 'add'] found in 1 buttons"
+      },
+      {
+        "step": "Create counter",
+        "check": "actions: any action contains \"decrement\" or \"dec\" or \"minus\" or \"sub\"",
+        "status": "FAIL",
+        "detail": "none of ['decrement', 'dec', 'minus', 'sub'] found in 1 buttons"
+      },
+      {
+        "step": "Check state",
+        "check": "state: topic contains \"counter\" or \"count\" or \"button\"",
+        "status": "PASS",
+        "detail": "topic=creating reset button contains 'button'"
+      },
+      {
+        "step": "Ask for current value",
+        "check": "send: what is the current count?",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Ask for current value",
+        "check": "response: contains \"0\" or \"zero\"",
+        "status": "FAIL",
+        "detail": "none of ['0', 'zero'] found in: "
+      },
+      {
+        "step": "Increment",
+        "check": "action matching 'inc'",
+        "status": "FAIL",
+        "detail": "no action matching 'inc' in ['reset']"
+      },
+      {
+        "step": "Increment",
+        "check": "response: contains \"1\" or \"one\" or \"increment\" or \"Navigated\"",
+        "status": "FAIL",
+        "detail": "none of ['1', 'one', 'increment', 'Navigated'] found in: "
+      },
+      {
+        "step": "Increment again",
+        "check": "action matching 'inc'",
+        "status": "FAIL",
+        "detail": "no action matching 'inc' in ['reset']"
+      },
+      {
+        "step": "Increment again",
+        "check": "response: contains \"2\" or \"two\" or \"increment\" or \"Navigated\"",
+        "status": "FAIL",
+        "detail": "none of ['2', 'two', 'increment', 'Navigated'] found in: "
+      },
+      {
+        "step": "Decrement",
+        "check": "action matching 'dec'",
+        "status": "FAIL",
+        "detail": "no action matching 'dec' in ['reset']"
+      },
+      {
+        "step": "Decrement",
+        "check": "response: contains \"1\" or \"one\" or \"decrement\" or \"Navigated\"",
+        "status": "FAIL",
+        "detail": "none of ['1', 'one', 'decrement', 'Navigated'] found in: "
+      },
+      {
+        "step": "Verify memorizer tracks it",
+        "check": "state: topic contains \"count\"",
+        "status": "FAIL",
+        "detail": "topic=creating reset button doesn't contain any of ['count']"
+      }
+    ],
+    "DB Exploration": [
+      {
+        "step": "Setup",
+        "check": "clear",
+        "status": "PASS",
+        "detail": "cleared"
+      },
+      {
+        "step": "Query renders table in workspace",
+        "check": "send: show me 5 customers from the database",
+        "status": "PASS",
+        "detail": "response: Here are 5 customers from the database. The detailed data is shown in the table."
+      },
+      {
+        "step": "Query renders table in workspace",
+        "check": "trace: has tool_call",
+        "status": "PASS",
+        "detail": "found event 'tool_call'"
+      },
+      {
+        "step": "Query renders table in workspace",
+        "check": "actions: has table",
+        "status": "PASS",
+        "detail": "table found: 23 cols, 5 rows"
+      },
+      {
+        "step": "Query renders table in workspace",
+        "check": "response: not contains \"---|\" or \"| ID\"",
+        "status": "PASS",
+        "detail": "none of ['---|', '| ID'] found (as expected)"
+      },
+      {
+        "step": "Chat summarizes, does not dump data",
+        "check": "response: contains \"customer\" or \"Kunde\" or \"5\" or \"table\"",
+        "status": "PASS",
+        "detail": "found 'customer'"
+      },
+      {
+        "step": "Chat summarizes, does not dump data",
+        "check": "response: length > 10",
+        "status": "PASS",
+        "detail": "length 81 > 10"
+      },
+      {
+        "step": "Thinker builds exploration UI (not describes it)",
+        "check": "send: select customer 2 Kathrin Jager, add but",
+        "status": "PASS",
+        "detail": "response: Okay, I've selected customer Kathrin Jager. You can now explore her objects and "
+      },
+      {
+        "step": "Thinker builds exploration UI (not describes it)",
+        "check": "actions: length >= 1",
+        "status": "PASS",
+        "detail": "5 actions >= 1"
+      },
+      {
+        "step": "Thinker builds exploration UI (not describes it)",
+        "check": "response: not contains \"UI team\" or \"will add\" or \"will create\"",
+        "status": "PASS",
+        "detail": "none of ['UI team', 'will add', 'will create'] found (as expected)"
+      },
+      {
+        "step": "Error recovery on bad query",
+        "check": "send: SELECT * FROM nichtexistiert LIMIT 5",
+        "status": "PASS",
+        "detail": "response: I encountered an error. The table `eras2_production.nichtexistiert` does not exi"
+      },
+      {
+        "step": "Error recovery on bad query",
+        "check": "trace: has tool_call",
+        "status": "PASS",
+        "detail": "found event 'tool_call'"
+      },
+      {
+        "step": "Error recovery on bad query",
+        "check": "response: not contains \"1146\"",
+        "status": "PASS",
+        "detail": "none of ['1146'] found (as expected)"
+      },
+      {
+        "step": "Error recovery on bad query",
+        "check": "response: length > 10",
+        "status": "PASS",
+        "detail": "length 277 > 10"
+      }
+    ],
+    "Director Node": [
+      {
+        "step": "Setup",
+        "check": "clear",
+        "status": "PASS",
+        "detail": "cleared"
+      },
+      {
+        "step": "Casual chat establishes mode",
+        "check": "send: hey, just hanging out, what's up?",
+        "status": "PASS",
+        "detail": "response: Alright! Since we're exploring Kathrin Jager's profile, would you like to see he"
+      },
+      {
+        "step": "Casual chat establishes mode",
+        "check": "response: length > 5",
+        "status": "PASS",
+        "detail": "length 107 > 5"
+      },
+      {
+        "step": "Casual chat establishes mode",
+        "check": "trace: has director_updated",
+        "status": "FAIL",
+        "detail": "no 'director_updated' event in trace"
+      },
+      {
+        "step": "Director picks up frustration",
+        "check": "send: ugh this is so annoying, nothing makes s",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Director picks up frustration",
+        "check": "response: length > 10",
+        "status": "FAIL",
+        "detail": "length 0 <= 10"
+      },
+      {
+        "step": "Director picks up frustration",
+        "check": "trace: has director_updated",
+        "status": "FAIL",
+        "detail": "no 'director_updated' event in trace"
+      },
+      {
+        "step": "Switch to building mode",
+        "check": "send: ok let's build a todo list app",
+        "status": "PASS",
+        "detail": "response: Frustration noted! Let's ditch that and build a fun todo app instead. Start addi"
+      },
+      {
+        "step": "Switch to building mode",
+        "check": "response: length > 10",
+        "status": "PASS",
+        "detail": "length 90 > 10"
+      },
+      {
+        "step": "Switch to building mode",
+        "check": "trace: has director_updated",
+        "status": "FAIL",
+        "detail": "no 'director_updated' event in trace"
+      }
+    ],
+    "Pub Conversation": [
+      {
+        "step": "Setup",
+        "check": "clear",
+        "status": "PASS",
+        "detail": "cleared"
+      },
+      {
+        "step": "Set the scene",
+        "check": "send: Hey, Alice and I are heading to the pub ",
+        "status": "PASS",
+        "detail": "response: Nice! Have a great time at the pub tonight. Let me know if you need anything whe"
+      },
+      {
+        "step": "Set the scene",
+        "check": "response: length > 10",
+        "status": "PASS",
+        "detail": "length 147 > 10"
+      },
+      {
+        "step": "Set the scene",
+        "check": "state: situation contains \"pub\" or \"Alice\"",
+        "status": "PASS",
+        "detail": "situation=at a pub with Alice later tonight contains 'pub'"
+      },
+      {
+        "step": "Language switch to German",
+        "check": "send: Wir sind jetzt im Biergarten angekommen",
+        "status": "PASS",
+        "detail": "response: Super! Viel Spaß im Biergarten mit Alice! Genießt die Zeit. 🍺\n"
+      },
+      {
+        "step": "Language switch to German",
+        "check": "response: length > 10",
+        "status": "PASS",
+        "detail": "length 62 > 10"
+      },
+      {
+        "step": "Language switch to German",
+        "check": "state: language is \"de\" or \"mixed\"",
+        "status": "PASS",
+        "detail": "language=mixed"
+      },
+      {
+        "step": "Context awareness",
+        "check": "send: Was sollen wir bestellen?",
+        "status": "PASS",
+        "detail": "response: Perfekt fürs Biergarten-Feeling: Probiert ein kühles Weizenbier oder ein Helles "
+      },
+      {
+        "step": "Context awareness",
+        "check": "response: length > 10",
+        "status": "PASS",
+        "detail": "length 168 > 10"
+      },
+      {
+        "step": "Context awareness",
+        "check": "state: topic contains \"bestell\" or \"order\" or \"pub\" or \"Biergarten\"",
+        "status": "PASS",
+        "detail": "topic=ordering drinks at the Biergarten contains 'order'"
+      },
+      {
+        "step": "Alice speaks",
+        "check": "send: Alice says: I'll have a Hefeweizen pleas",
+        "status": "PASS",
+        "detail": "response: Super Wahl! Hefeweizen passt perfekt zu einem Biergarten. Prost! 🍺\n"
+      },
+      {
+        "step": "Alice speaks",
+        "check": "response: length > 10",
+        "status": "PASS",
+        "detail": "length 67 > 10"
+      },
+      {
+        "step": "Alice speaks",
+        "check": "state: facts any contains \"Alice\" or \"Hefeweizen\"",
+        "status": "PASS",
+        "detail": "found 'Alice' in facts"
+      },
+      {
+        "step": "Ask for time (tool use)",
+        "check": "send: wie spaet ist es eigentlich?",
+        "status": "PASS",
+        "detail": "response: Es ist jetzt 12:17 Uhr. Zeit fürs Mittagessen im Biergarten! ☀️\n"
+      },
+      {
+        "step": "Ask for time (tool use)",
+        "check": "response: matches \\d{1,2}:\\d{2}",
+        "status": "PASS",
+        "detail": "matched /\\d{1,2}:\\d{2}/"
+      },
+      {
+        "step": "Back to English",
+        "check": "send: Let's switch to English, what was the la",
+        "status": "PASS",
+        "detail": "response: Switching to English!\n\nAlice said she'd have a Hefeweizen. \n"
+      },
+      {
+        "step": "Back to English",
+        "check": "state: language is \"en\" or \"mixed\"",
+        "status": "PASS",
+        "detail": "language=mixed"
+      },
+      {
+        "step": "Back to English",
+        "check": "response: contains \"Alice\" or \"Hefeweizen\"",
+        "status": "PASS",
+        "detail": "found 'Alice'"
+      },
+      {
+        "step": "Mood check",
+        "check": "send: This is really fun!",
+        "status": "PASS",
+        "detail": "response: Glad you're enjoying it! 🎉\n"
+      },
+      {
+        "step": "Mood check",
+        "check": "state: user_mood is \"happy\" or \"playful\" or \"excited\"",
+        "status": "PASS",
+        "detail": "user_mood=happy"
+      }
+    ],
+    "Reflex Path": [
+      {
+        "step": "Setup",
+        "check": "clear",
+        "status": "PASS",
+        "detail": "cleared"
+      },
+      {
+        "step": "Greeting triggers reflex",
+        "check": "send: hey!",
+        "status": "PASS",
+        "detail": "response: Hey! 😊 What's up?\n"
+      },
+      {
+        "step": "Greeting triggers reflex",
+        "check": "response: length > 2",
+        "status": "PASS",
+        "detail": "length 18 > 2"
+      },
+      {
+        "step": "Greeting triggers reflex",
+        "check": "trace: has reflex_path",
+        "status": "PASS",
+        "detail": "found event 'reflex_path'"
+      },
+      {
+        "step": "Thanks triggers reflex",
+        "check": "send: thanks",
+        "status": "PASS",
+        "detail": "response: You're welcome! How can I help you further?\n"
+      },
+      {
+        "step": "Thanks triggers reflex",
+        "check": "response: length > 2",
+        "status": "PASS",
+        "detail": "length 44 > 2"
+      },
+      {
+        "step": "Thanks triggers reflex",
+        "check": "trace: has reflex_path",
+        "status": "PASS",
+        "detail": "found event 'reflex_path'"
+      },
+      {
+        "step": "Complex request does NOT trigger reflex",
+        "check": "send: explain how neural networks work in deta",
+        "status": "PASS",
+        "detail": "response: Alright, let's break down neural networks!\n\n**(1) The Basic Unit: The Neuron**\n\n"
+      },
+      {
+        "step": "Complex request does NOT trigger reflex",
+        "check": "response: length > 20",
+        "status": "PASS",
+        "detail": "length 1624 > 20"
+      },
+      {
+        "step": "Complex request does NOT trigger reflex",
+        "check": "trace: input.analysis.intent is \"question\" or \"request\"",
+        "status": "PASS",
+        "detail": "input.analysis.intent=request"
+      },
+      {
+        "step": "Complex request does NOT trigger reflex",
+        "check": "trace: has decided",
+        "status": "PASS",
+        "detail": "found event 'decided'"
+      }
+    ],
+    "S3* Audit Corrections": [
+      {
+        "step": "Setup",
+        "check": "clear",
+        "status": "PASS",
+        "detail": "cleared"
+      },
+      {
+        "step": "Tool calls produce results (baseline)",
+        "check": "send: create two buttons: Alpha and Beta",
+        "status": "PASS",
+        "detail": "response: OK. I've created the Alpha and Beta buttons.\n"
+      },
+      {
+        "step": "Tool calls produce results (baseline)",
+        "check": "actions: length >= 1",
+        "status": "PASS",
+        "detail": "2 actions >= 1"
+      },
+      {
+        "step": "Tool calls produce results (baseline)",
+        "check": "actions: any action contains \"alpha\" or \"Alpha\"",
+        "status": "PASS",
+        "detail": "found 'alpha' in actions"
+      },
+      {
+        "step": "Dashboard mismatch triggers re-emit",
+        "check": "send: I see nothing on my dashboard, fix it",
+        "status": "PASS",
+        "detail": "response: My apologies! I'm re-creating the Alpha and Beta buttons. You should see them no"
+      },
+      {
+        "step": "Dashboard mismatch triggers re-emit",
+        "check": "response: not contains \"sorry\" or \"apologize\"",
+        "status": "PASS",
+        "detail": "none of ['sorry', 'apologize'] found (as expected)"
+      },
+      {
+        "step": "Dashboard mismatch triggers re-emit",
+        "check": "actions: length >= 1",
+        "status": "PASS",
+        "detail": "2 actions >= 1"
+      },
+      {
+        "step": "DB error triggers retry with corrected SQL",
+        "check": "send: SELECT * FROM NichtExistent LIMIT 5",
+        "status": "PASS",
+        "detail": "response: That table doesn't exist.\n\nCould you clarify which table you'd like to query? Pe"
+      },
+      {
+        "step": "DB error triggers retry with corrected SQL",
+        "check": "trace: has tool_call",
+        "status": "PASS",
+        "detail": "found event 'tool_call'"
+      },
+      {
+        "step": "DB error triggers retry with corrected SQL",
+        "check": "response: not contains \"1146\"",
+        "status": "PASS",
+        "detail": "none of ['1146'] found (as expected)"
+      },
+      {
+        "step": "DB error triggers retry with corrected SQL",
+        "check": "response: length > 10",
+        "status": "PASS",
+        "detail": "length 111 > 10"
+      },
+      {
+        "step": "Complex request gets Director plan",
+        "check": "send: investigate which customers have the mos",
+        "status": "PASS",
+        "detail": "response: Sorry, I ran into an error when trying to determine which customers have the mos"
+      },
+      {
+        "step": "Complex request gets Director plan",
+        "check": "trace: has director_plan",
+        "status": "FAIL",
+        "detail": "no 'director_plan' event in trace"
+      },
+      {
+        "step": "Complex request gets Director plan",
+        "check": "trace: has tool_call",
+        "status": "PASS",
+        "detail": "found event 'tool_call'"
+      },
+      {
+        "step": "Complex request gets Director plan",
+        "check": "response: length > 20",
+        "status": "PASS",
+        "detail": "length 209 > 20"
+      }
+    ],
+    "State Machines": [
+      {
+        "step": "Setup",
+        "check": "clear",
+        "status": "PASS",
+        "detail": "cleared"
+      },
+      {
+        "step": "Create a machine",
+        "check": "send: create a navigation machine called \"nav\"",
+        "status": "PASS",
+        "detail": "response: Okay, a navigation machine named `nav` has been created. It starts in the \"main\""
+      },
+      {
+        "step": "Create a machine",
+        "check": "trace: has tool_call create_machine",
+        "status": "PASS",
+        "detail": "found create_machine via machine_created event"
+      },
+      {
+        "step": "Create a machine",
+        "check": "trace: machine_created id=\"nav\"",
+        "status": "PASS",
+        "detail": "machine 'nav' created"
+      },
+      {
+        "step": "Verify machine renders",
+        "check": "send: what machines are on my dashboard?",
+        "status": "PASS",
+        "detail": "response: The machine `nav` is currently on the dashboard.\n"
+      },
+      {
+        "step": "Verify machine renders",
+        "check": "response: contains \"nav\" or \"machine\"",
+        "status": "PASS",
+        "detail": "found 'nav'"
+      },
+      {
+        "step": "Navigate via button click (local transition)",
+        "check": "action matching 'menu_1'",
+        "status": "FAIL",
+        "detail": "no action matching 'menu_1' in []"
+      },
+      {
+        "step": "Navigate via button click (local transition)",
+        "check": "trace: has machine_transition",
+        "status": "FAIL",
+        "detail": "no 'machine_transition' event in trace"
+      },
+      {
+        "step": "Navigate via button click (local transition)",
+        "check": "trace: no thinker",
+        "status": "PASS",
+        "detail": "no 'thinker' event (as expected)"
+      },
+      {
+        "step": "Add a state to existing machine",
+        "check": "send: add a state \"sub3\" to the nav machine wi",
+        "status": "PASS",
+        "detail": "response: The \"nav\" machine now has a \"sub3\" state. It includes a Back button and displays"
+      },
+      {
+        "step": "Add a state to existing machine",
+        "check": "trace: has tool_call add_state",
+        "status": "FAIL",
+        "detail": "no tool_call 'add_state' in trace"
+      },
+      {
+        "step": "Reset machine",
+        "check": "send: reset the nav machine to its initial sta",
+        "status": "PASS",
+        "detail": "response: The \"nav\" machine has been reset to its initial state.\n"
+      },
+      {
+        "step": "Reset machine",
+        "check": "trace: has tool_call reset_machine",
+        "status": "PASS",
+        "detail": "found reset_machine via machine_reset event"
+      },
+      {
+        "step": "Reset machine",
+        "check": "response: contains \"main\" or \"reset\" or \"initial\"",
+        "status": "PASS",
+        "detail": "found 'reset'"
+      },
+      {
+        "step": "Create second machine alongside first",
+        "check": "send: create a counter machine called \"clicks\"",
+        "status": "PASS",
+        "detail": "response: A counter machine called \"clicks\" has been created, starting at \"zero\" with a \"C"
+      },
+      {
+        "step": "Create second machine alongside first",
+        "check": "trace: has tool_call create_machine",
+        "status": "PASS",
+        "detail": "found create_machine via machine_created event"
+      },
+      {
+        "step": "Create second machine alongside first",
+        "check": "trace: machine_created id=\"clicks\"",
+        "status": "PASS",
+        "detail": "machine 'clicks' created"
+      },
+      {
+        "step": "Both machines coexist",
+        "check": "send: what machines are running?",
+        "status": "PASS",
+        "detail": "response: The machines 'nav' and 'clicks' are running. nav is in the main state, and click"
+      },
+      {
+        "step": "Both machines coexist",
+        "check": "response: contains \"nav\"",
+        "status": "PASS",
+        "detail": "found 'nav'"
+      },
+      {
+        "step": "Both machines coexist",
+        "check": "response: contains \"click\"",
+        "status": "PASS",
+        "detail": "found 'click'"
+      },
+      {
+        "step": "Destroy one machine",
+        "check": "send: destroy the clicks machine",
+        "status": "PASS",
+        "detail": "response: The `clicks` machine has been destroyed.\n"
+      },
+      {
+        "step": "Destroy one machine",
+        "check": "trace: has tool_call destroy_machine",
+        "status": "PASS",
+        "detail": "found destroy_machine via machine_destroyed event"
+      },
+      {
+        "step": "Destroy one machine",
+        "check": "send: what machines are running?",
+        "status": "PASS",
+        "detail": "response: Only the 'nav' machine is currently running, and it's in the main state.\n"
+      },
+      {
+        "step": "Destroy one machine",
+        "check": "response: contains \"nav\"",
+        "status": "PASS",
+        "detail": "found 'nav'"
+      }
+    ],
+    "Structured Input Analysis": [
+      {
+        "step": "Setup",
+        "check": "clear",
+        "status": "PASS",
+        "detail": "cleared"
+      },
+      {
+        "step": "Social greeting",
+        "check": "send: hi there!",
+        "status": "PASS",
+        "detail": "response: Hello! 👋\n"
+      },
+      {
+        "step": "Social greeting",
+        "check": "response: length > 3",
+        "status": "PASS",
+        "detail": "length 9 > 3"
+      },
+      {
+        "step": "Social greeting",
+        "check": "trace: input.analysis.intent is \"social\"",
+        "status": "PASS",
+        "detail": "input.analysis.intent=social"
+      },
+      {
+        "step": "Social greeting",
+        "check": "trace: input.analysis.complexity is \"trivial\"",
+        "status": "PASS",
+        "detail": "input.analysis.complexity=trivial"
+      },
+      {
+        "step": "Simple request",
+        "check": "send: create a counter starting at 0",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Simple request",
+        "check": "response: length > 10",
+        "status": "FAIL",
+        "detail": "length 0 <= 10"
+      },
+      {
+        "step": "Simple request",
+        "check": "trace: input.analysis.intent is \"request\" or \"action\"",
+        "status": "PASS",
+        "detail": "input.analysis.intent=request"
+      },
+      {
+        "step": "Simple request",
+        "check": "trace: input.analysis.complexity is \"simple\" or \"complex\"",
+        "status": "PASS",
+        "detail": "input.analysis.complexity=simple"
+      },
+      {
+        "step": "German question",
+        "check": "send: Wie spaet ist es?",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "German question",
+        "check": "response: length > 5",
+        "status": "FAIL",
+        "detail": "length 0 <= 5"
+      },
+      {
+        "step": "German question",
+        "check": "trace: input.analysis.language is \"de\"",
+        "status": "PASS",
+        "detail": "input.analysis.language=de"
+      },
+      {
+        "step": "German question",
+        "check": "trace: input.analysis.intent is \"question\"",
+        "status": "PASS",
+        "detail": "input.analysis.intent=question"
+      },
+      {
+        "step": "Frustrated tone",
+        "check": "send: this is broken, nothing works and I'm si",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Frustrated tone",
+        "check": "response: length > 10",
+        "status": "FAIL",
+        "detail": "length 0 <= 10"
+      },
+      {
+        "step": "Frustrated tone",
+        "check": "trace: input.analysis.tone is \"frustrated\" or \"urgent\"",
+        "status": "PASS",
+        "detail": "input.analysis.tone=frustrated"
+      },
+      {
+        "step": "Simple acknowledgment",
+        "check": "send: ok thanks bye",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Simple acknowledgment",
+        "check": "trace: input.analysis.intent is \"social\"",
+        "status": "FAIL",
+        "detail": "input.analysis.intent=feedback, expected one of ['social']"
+      },
+      {
+        "step": "Simple acknowledgment",
+        "check": "trace: input.analysis.complexity is \"trivial\"",
+        "status": "FAIL",
+        "detail": "input.analysis.complexity=simple, expected one of ['trivial']"
+      }
+    ],
+    "Dashboard Feedback (S3*)": [
+      {
+        "step": "Setup",
+        "check": "clear",
+        "status": "PASS",
+        "detail": "cleared"
+      },
+      {
+        "step": "Thinker sees buttons in dashboard",
+        "check": "send: create two buttons: hello and world",
+        "status": "PASS",
+        "detail": "response: Two buttons \"hello\" and \"world\" have been created.\n"
+      },
+      {
+        "step": "Thinker sees buttons in dashboard",
+        "check": "actions: length >= 2",
+        "status": "PASS",
+        "detail": "2 actions >= 2"
+      },
+      {
+        "step": "Thinker sees buttons in dashboard",
+        "check": "send: what buttons can you see in my dashboard",
+        "status": "PASS",
+        "detail": "response: Currently, the dashboard shows two buttons: \"Hello\" and \"World\".\n"
+      },
+      {
+        "step": "Thinker sees buttons in dashboard",
+        "check": "response: contains \"Hello\" or \"hello\"",
+        "status": "PASS",
+        "detail": "found 'Hello'"
+      },
+      {
+        "step": "Thinker sees buttons in dashboard",
+        "check": "response: contains \"World\" or \"world\"",
+        "status": "PASS",
+        "detail": "found 'World'"
+      },
+      {
+        "step": "Thinker detects empty dashboard",
+        "check": "send: I see nothing in my dashboard, what happ",
+        "status": "PASS",
+        "detail": "response: Sorry about that! The buttons didn't render properly the first time. I've just r"
+      },
+      {
+        "step": "Thinker detects empty dashboard",
+        "check": "response: contains \"button\" or \"fix\" or \"restore\" or \"create\" or \"empty\"",
+        "status": "PASS",
+        "detail": "found 'button'"
+      },
+      {
+        "step": "Dashboard state flows to thinker context",
+        "check": "send: create a counter starting at 5",
+        "status": "PASS",
+        "detail": "response: A counter starting at 5 has been created. You can use the + and - buttons to adj"
+      },
+      {
+        "step": "Dashboard state flows to thinker context",
+        "check": "actions: length >= 1",
+        "status": "PASS",
+        "detail": "2 actions >= 1"
+      },
+      {
+        "step": "Dashboard state flows to thinker context",
+        "check": "send: what does my dashboard show?",
+        "status": "PASS",
+        "detail": "response: The dashboard shows a counter at 5, with +1 and -1 buttons to adjust it and a Re"
+      },
+      {
+        "step": "Dashboard state flows to thinker context",
+        "check": "response: contains \"5\" or \"count\"",
+        "status": "PASS",
+        "detail": "found '5'"
+      }
+    ],
+    "Dashboard Mismatch Recovery": [
+      {
+        "step": "Setup",
+        "check": "clear",
+        "status": "PASS",
+        "detail": "cleared"
+      },
+      {
+        "step": "Create buttons",
+        "check": "send: create two buttons: red and blue",
+        "status": "PASS",
+        "detail": "response: Done! There are now two buttons, one red and one blue. Please click one and tell"
+      },
+      {
+        "step": "Create buttons",
+        "check": "actions: length >= 2",
+        "status": "PASS",
+        "detail": "2 actions >= 2"
+      },
+      {
+        "step": "Dashboard empty — Thinker re-emits",
+        "check": "send: I clicked red but nothing happened",
+        "status": "PASS",
+        "detail": "response: Thanks for letting us know. There was a glitch, but it should be fixed now! Plea"
+      },
+      {
+        "step": "Dashboard empty — Thinker re-emits",
+        "check": "response: contains \"button\" or \"red\" or \"blue\"",
+        "status": "PASS",
+        "detail": "found 'button'"
+      },
+      {
+        "step": "Dashboard empty — Thinker re-emits",
+        "check": "actions: length >= 1",
+        "status": "PASS",
+        "detail": "3 actions >= 1"
+      },
+      {
+        "step": "Create counter",
+        "check": "send: create a counter starting at 0",
+        "status": "PASS",
+        "detail": "response: Okay, a counter has been initialized to 0.\n"
+      },
+      {
+        "step": "Create counter",
+        "check": "actions: length >= 1",
+        "status": "PASS",
+        "detail": "3 actions >= 1"
+      },
+      {
+        "step": "Counter missing from dashboard — Thinker recovers",
+        "check": "send: the dashboard is broken, I only see old ",
+        "status": "PASS",
+        "detail": "response: I'm on it! Force-reloading the interface. Should be fixed in a moment.\n"
+      },
+      {
+        "step": "Counter missing from dashboard — Thinker recovers",
+        "check": "response: contains \"counter\" or \"count\" or \"fix\" or \"recreat\" or \"refresh\" or \"button\" or \"update\"",
+        "status": "PASS",
+        "detail": "found 'fix'"
+      },
+      {
+        "step": "Counter missing from dashboard — Thinker recovers",
+        "check": "actions: length >= 1",
+        "status": "PASS",
+        "detail": "3 actions >= 1"
+      }
+    ]
+  },
+  "summary": {
+    "passed": 139,
+    "failed": 28
+  }
+}
\ No newline at end of file
diff --git a/testcases/results_v2_fixed.json b/testcases/results_v2_fixed.json
new file mode 100644
index 0000000..b2e20a8
--- /dev/null
+++ b/testcases/results_v2_fixed.json
@@ -0,0 +1,1033 @@
+{
+  "timestamp": "2026-03-29 12:46:11",
+  "testcases": {
+    "Button Persistence": [
+      {
+        "step": "Setup",
+        "check": "clear",
+        "status": "PASS",
+        "detail": "cleared"
+      },
+      {
+        "step": "Create buttons",
+        "check": "send: create two buttons: Poodle Bark and Bolo",
+        "status": "PASS",
+        "detail": "response: Okay, two buttons have been created: Poodle Bark and Bolonka Bark. Click either "
+      },
+      {
+        "step": "Create buttons",
+        "check": "actions: length >= 2",
+        "status": "PASS",
+        "detail": "2 actions >= 2"
+      },
+      {
+        "step": "Create buttons",
+        "check": "actions: any action contains \"poodle\" or \"Poodle\"",
+        "status": "PASS",
+        "detail": "found 'poodle' in actions"
+      },
+      {
+        "step": "Create buttons",
+        "check": "actions: any action contains \"bolonka\" or \"Bolonka\"",
+        "status": "PASS",
+        "detail": "found 'bolonka' in actions"
+      },
+      {
+        "step": "Ask unrelated question (buttons must survive)",
+        "check": "send: what time is it?",
+        "status": "PASS",
+        "detail": "response: It is 12:37 PM.\n"
+      },
+      {
+        "step": "Ask unrelated question (buttons must survive)",
+        "check": "response: contains \":\" or \"time\" or \"clock\"",
+        "status": "PASS",
+        "detail": "found ':'"
+      },
+      {
+        "step": "Ask unrelated question (buttons must survive)",
+        "check": "actions: any action contains \"poodle\" or \"Poodle\"",
+        "status": "PASS",
+        "detail": "found 'poodle' in actions"
+      },
+      {
+        "step": "Ask unrelated question (buttons must survive)",
+        "check": "actions: any action contains \"bolonka\" or \"Bolonka\"",
+        "status": "PASS",
+        "detail": "found 'bolonka' in actions"
+      },
+      {
+        "step": "Ask another question (buttons still there)",
+        "check": "send: say hello in German",
+        "status": "PASS",
+        "detail": "response: Hallo!\n"
+      },
+      {
+        "step": "Ask another question (buttons still there)",
+        "check": "response: contains \"Hallo\" or \"hallo\" or \"German\"",
+        "status": "PASS",
+        "detail": "found 'Hallo'"
+      },
+      {
+        "step": "Ask another question (buttons still there)",
+        "check": "actions: any action contains \"poodle\" or \"Poodle\"",
+        "status": "PASS",
+        "detail": "found 'poodle' in actions"
+      },
+      {
+        "step": "Explicitly replace buttons",
+        "check": "send: remove all buttons and create one button",
+        "status": "PASS",
+        "detail": "response: All buttons have been removed and a Reset button has been created.\n"
+      },
+      {
+        "step": "Explicitly replace buttons",
+        "check": "actions: length >= 1",
+        "status": "PASS",
+        "detail": "1 actions >= 1"
+      },
+      {
+        "step": "Explicitly replace buttons",
+        "check": "actions: any action contains \"reset\" or \"Reset\"",
+        "status": "PASS",
+        "detail": "found 'reset' in actions"
+      }
+    ],
+    "Counter State": [
+      {
+        "step": "Setup",
+        "check": "clear",
+        "status": "PASS",
+        "detail": "cleared"
+      },
+      {
+        "step": "Create counter",
+        "check": "send: create a counter starting at 0 with incr",
+        "status": "PASS",
+        "detail": "response: Counter is up and running! You've got buttons to increment (+1), decrement (-1),"
+      },
+      {
+        "step": "Create counter",
+        "check": "response: contains \"counter\" or \"count\"",
+        "status": "PASS",
+        "detail": "found 'counter'"
+      },
+      {
+        "step": "Create counter",
+        "check": "actions: length >= 2",
+        "status": "PASS",
+        "detail": "3 actions >= 2"
+      },
+      {
+        "step": "Create counter",
+        "check": "actions: any action contains \"increment\" or \"inc\" or \"plus\" or \"add\"",
+        "status": "PASS",
+        "detail": "found 'increment' in actions"
+      },
+      {
+        "step": "Create counter",
+        "check": "actions: any action contains \"decrement\" or \"dec\" or \"minus\" or \"sub\"",
+        "status": "PASS",
+        "detail": "found 'decrement' in actions"
+      },
+      {
+        "step": "Check state",
+        "check": "state: topic contains \"counter\" or \"count\" or \"button\"",
+        "status": "PASS",
+        "detail": "topic=Navigation Machine: creating a counter and buttons contains 'counter'"
+      },
+      {
+        "step": "Ask for current value",
+        "check": "send: what is the current count?",
+        "status": "PASS",
+        "detail": "response: The current count is 0.\n"
+      },
+      {
+        "step": "Ask for current value",
+        "check": "response: contains \"0\" or \"zero\"",
+        "status": "PASS",
+        "detail": "found '0'"
+      },
+      {
+        "step": "Increment",
+        "check": "action: machine_action",
+        "status": "PASS",
+        "detail": "response: The counter is now at 2.\n"
+      },
+      {
+        "step": "Increment",
+        "check": "response: contains \"1\" or \"one\" or \"increment\" or \"Navigated\"",
+        "status": "FAIL",
+        "detail": "none of ['1', 'one', 'increment', 'Navigated'] found in: The counter is now at 2.\n"
+      },
+      {
+        "step": "Increment again",
+        "check": "action: machine_action",
+        "status": "PASS",
+        "detail": "response: The counter is now at 3.\n"
+      },
+      {
+        "step": "Increment again",
+        "check": "response: contains \"2\" or \"two\" or \"increment\" or \"Navigated\"",
+        "status": "FAIL",
+        "detail": "none of ['2', 'two', 'increment', 'Navigated'] found in: The counter is now at 3.\n"
+      },
+      {
+        "step": "Decrement",
+        "check": "action: machine_action",
+        "status": "PASS",
+        "detail": "response: The counter is now at 5.\n"
+      },
+      {
+        "step": "Decrement",
+        "check": "response: contains \"1\" or \"one\" or \"decrement\" or \"Navigated\"",
+        "status": "FAIL",
+        "detail": "none of ['1', 'one', 'decrement', 'Navigated'] found in: The counter is now at 5.\n"
+      },
+      {
+        "step": "Verify memorizer tracks it",
+        "check": "state: topic contains \"count\"",
+        "status": "PASS",
+        "detail": "topic=Navigation Machine: reading the counter contains 'count'"
+      }
+    ],
+    "DB Exploration": [
+      {
+        "step": "Setup",
+        "check": "clear",
+        "status": "PASS",
+        "detail": "cleared"
+      },
+      {
+        "step": "Query renders table in workspace",
+        "check": "send: show me 5 customers from the database",
+        "status": "PASS",
+        "detail": "response: OK, I found 5 customers in the database: Kathrin Jager, Leon Schreiner, Ralf Rit"
+      },
+      {
+        "step": "Query renders table in workspace",
+        "check": "trace: has tool_call",
+        "status": "PASS",
+        "detail": "found event 'tool_call'"
+      },
+      {
+        "step": "Query renders table in workspace",
+        "check": "actions: has table",
+        "status": "PASS",
+        "detail": "table found: 23 cols, 5 rows"
+      },
+      {
+        "step": "Query renders table in workspace",
+        "check": "response: not contains \"---|\" or \"| ID\"",
+        "status": "PASS",
+        "detail": "none of ['---|', '| ID'] found (as expected)"
+      },
+      {
+        "step": "Chat summarizes, does not dump data",
+        "check": "response: contains \"customer\" or \"Kunde\" or \"5\" or \"table\"",
+        "status": "PASS",
+        "detail": "found 'customer'"
+      },
+      {
+        "step": "Chat summarizes, does not dump data",
+        "check": "response: length > 10",
+        "status": "PASS",
+        "detail": "length 128 > 10"
+      },
+      {
+        "step": "Thinker builds exploration UI (not describes it)",
+        "check": "send: select customer 2 Kathrin Jager, add but",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Thinker builds exploration UI (not describes it)",
+        "check": "actions: length >= 1",
+        "status": "PASS",
+        "detail": "2 actions >= 1"
+      },
+      {
+        "step": "Thinker builds exploration UI (not describes it)",
+        "check": "response: not contains \"UI team\" or \"will add\" or \"will create\"",
+        "status": "PASS",
+        "detail": "none of ['UI team', 'will add', 'will create'] found (as expected)"
+      },
+      {
+        "step": "Error recovery on bad query",
+        "check": "send: SELECT * FROM nichtexistiert LIMIT 5",
+        "status": "PASS",
+        "detail": "response: I can't explore the objects and devices for Kathrin Jager right now. I tried to "
+      },
+      {
+        "step": "Error recovery on bad query",
+        "check": "trace: has tool_call",
+        "status": "PASS",
+        "detail": "found event 'tool_call'"
+      },
+      {
+        "step": "Error recovery on bad query",
+        "check": "response: not contains \"1146\"",
+        "status": "PASS",
+        "detail": "none of ['1146'] found (as expected)"
+      },
+      {
+        "step": "Error recovery on bad query",
+        "check": "response: length > 10",
+        "status": "PASS",
+        "detail": "length 226 > 10"
+      }
+    ],
+    "Director Node": [
+      {
+        "step": "Setup",
+        "check": "clear",
+        "status": "PASS",
+        "detail": "cleared"
+      },
+      {
+        "step": "Casual chat establishes mode",
+        "check": "send: hey, just hanging out, what's up?",
+        "status": "PASS",
+        "detail": "response: Just checking in! It looks like you're having a good time. Want to dive into the"
+      },
+      {
+        "step": "Casual chat establishes mode",
+        "check": "response: length > 5",
+        "status": "PASS",
+        "detail": "length 128 > 5"
+      },
+      {
+        "step": "Casual chat establishes mode",
+        "check": "trace: has director_updated",
+        "status": "FAIL",
+        "detail": "no 'director_updated' event in trace"
+      },
+      {
+        "step": "Director picks up frustration",
+        "check": "send: ugh this is so annoying, nothing makes s",
+        "status": "PASS",
+        "detail": "response: I hear you. Database schemas are messy. Let's slow down: what exactly are you tr"
+      },
+      {
+        "step": "Director picks up frustration",
+        "check": "response: length > 10",
+        "status": "PASS",
+        "detail": "length 200 > 10"
+      },
+      {
+        "step": "Director picks up frustration",
+        "check": "trace: has director_updated",
+        "status": "FAIL",
+        "detail": "no 'director_updated' event in trace"
+      },
+      {
+        "step": "Switch to building mode",
+        "check": "send: ok let's build a todo list app",
+        "status": "PASS",
+        "detail": "response: "
+      },
+      {
+        "step": "Switch to building mode",
+        "check": "response: length > 10",
+        "status": "FAIL",
+        "detail": "length 0 <= 10"
+      },
+      {
+        "step": "Switch to building mode",
+        "check": "trace: has director_updated",
+        "status": "FAIL",
+        "detail": "no 'director_updated' event in trace"
+      }
+    ],
+    "Pub Conversation": [
+      {
+        "step": "Setup",
+        "check": "clear",
+        "status": "PASS",
+        "detail": "cleared"
+      },
+      {
+        "step": "Set the scene",
+        "check": "send: Hey, Alice and I are heading to the pub ",
+        "status": "PASS",
+        "detail": "response: That sounds great! Have fun tonight with Alice at the pub. 🍺 Let me know if anyt"
+      },
+      {
+        "step": "Set the scene",
+        "check": "response: length > 10",
+        "status": "PASS",
+        "detail": "length 147 > 10"
+      },
+      {
+        "step": "Set the scene",
+        "check": "state: situation contains \"pub\" or \"Alice\"",
+        "status": "FAIL",
+        "detail": "situation=authenticated on https://cog.loop42.de doesn't contain any of ['pub', 'Alice']"
+      },
+      {
+        "step": "Language switch to German",
+        "check": "send: Wir sind jetzt im Biergarten angekommen",
+        "status": "PASS",
+        "detail": "response: Ah, wunderbar that you've arrived safe and sound! Hope you enjoy the Biergarten "
+      },
+      {
+        "step": "Language switch to German",
+        "check": "response: length > 10",
+        "status": "PASS",
+        "detail": "length 101 > 10"
+      },
+      {
+        "step": "Language switch to German",
+        "check": "state: language is \"de\" or \"mixed\"",
+        "status": "PASS",
+        "detail": "language=mixed"
+      },
+      {
+        "step": "Context awareness",
+        "check": "send: Was sollen wir bestellen?",
+        "status": "PASS",
+        "detail": "response: Da Alice schon ein Hefeweizen bestellt hat, wie wäre es mit einem erfrischenden "
+      },
+      {
+        "step": "Context awareness",
+        "check": "response: length > 10",
+        "status": "PASS",
+        "detail": "length 211 > 10"
+      },
+      {
+        "step": "Context awareness",
+        "check": "state: topic contains \"bestell\" or \"order\" or \"pub\" or \"Biergarten\"",
+        "status": "PASS",
+        "detail": "topic=Biergarten visit contains 'Biergarten'"
+      },
+      {
+        "step": "Alice speaks",
+        "check": "send: Alice says: I'll have a Hefeweizen pleas",
+        "status": "PASS",
+        "detail": "response: Okay, one Hefeweizen coming right up for Alice! 🍻\n"
+      },
+      {
+        "step": "Alice speaks",
+        "check": "response: length > 10",
+        "status": "PASS",
+        "detail": "length 50 > 10"
+      },
+      {
+        "step": "Alice speaks",
+        "check": "state: facts any contains \"Alice\" or \"Hefeweizen\"",
+        "status": "PASS",
+        "detail": "found 'Alice' in facts"
+      },
+      {
+        "step": "Ask for time (tool use)",
+        "check": "send: wie spaet ist es eigentlich?",
+        "status": "PASS",
+        "detail": "response: Aktuell ist es 12:44 Uhr. Die Zeit fliegt im Biergarten! 😉\n"
+      },
+      {
+        "step": "Ask for time (tool use)",
+        "check": "response: matches \\d{1,2}:\\d{2}",
+        "status": "PASS",
+        "detail": "matched /\\d{1,2}:\\d{2}/"
+      },
+      {
+        "step": "Back to English",
+        "check": "send: Let's switch to English, what was the la",
+        "status": "PASS",
+        "detail": "response: Alice said, \"I'll have a Hefeweizen please.\"\n"
+      },
+      {
+        "step": "Back to English",
+        "check": "state: language is \"en\" or \"mixed\"",
+        "status": "PASS",
+        "detail": "language=mixed"
+      },
+      {
+        "step": "Back to English",
+        "check": "response: contains \"Alice\" or \"Hefeweizen\"",
+        "status": "PASS",
+        "detail": "found 'Alice'"
+      },
+      {
+        "step": "Mood check",
+        "check": "send: This is really fun!",
+        "status": "PASS",
+        "detail": "response: Glad you're having a blast! Keep the fun going with Alice—sounds like a perfect "
+      },
+      {
+        "step": "Mood check",
+        "check": "state: user_mood is \"happy\" or \"playful\" or \"excited\"",
+        "status": "PASS",
+        "detail": "user_mood=happy"
+      }
+    ],
+    "Reflex Path": [
+      {
+        "step": "Setup",
+        "check": "clear",
+        "status": "PASS",
+        "detail": "cleared"
+      },
+      {
+        "step": "Greeting triggers reflex",
+        "check": "send: hey!",
+        "status": "PASS",
+        "detail": "response: Servus Nico! 👋 Was geht? 🍻\n"
+      },
+      {
+        "step": "Greeting triggers reflex",
+        "check": "response: length > 2",
+        "status": "PASS",
+        "detail": "length 27 > 2"
+      },
+      {
+        "step": "Greeting triggers reflex",
+        "check": "trace: has reflex_path",
+        "status": "PASS",
+        "detail": "found event 'reflex_path'"
+      },
+      {
+        "step": "Thanks triggers reflex",
+        "check": "send: thanks",
+        "status": "PASS",
+        "detail": "response: Kein Problem! 😊\n"
+      },
+      {
+        "step": "Thanks triggers reflex",
+        "check": "response: length > 2",
+        "status": "PASS",
+        "detail": "length 16 > 2"
+      },
+      {
+        "step": "Thanks triggers reflex",
+        "check": "trace: has reflex_path",
+        "status": "PASS",
+        "detail": "found event 'reflex_path'"
+      },
+      {
+        "step": "Complex request does NOT trigger reflex",
+        "check": "send: explain how neural networks work in deta",
+        "status": "PASS",
+        "detail": "response: Think of your brain: neurons firing, passing signals! Neural networks mimic this"
+      },
+      {
+        "step": "Complex request does NOT trigger reflex",
+        "check": "response: length > 20",
+        "status": "PASS",
+        "detail": "length 759 > 20"
+      },
+      {
+        "step": "Complex request does NOT trigger reflex",
+        "check": "trace: input.analysis.intent is \"question\" or \"request\"",
+        "status": "PASS",
+        "detail": "input.analysis.intent=question"
+      },
+      {
+        "step": "Complex request does NOT trigger reflex",
+        "check": "trace: has decided",
+        "status": "PASS",
+        "detail": "found event 'decided'"
+      }
+    ],
+    "S3* Audit Corrections": [
+      {
+        "step": "Setup",
+        "check": "clear",
+        "status": "PASS",
+        "detail": "cleared"
+      },
+      {
+        "step": "Tool calls produce results (baseline)",
+        "check": "send: create two buttons: Alpha and Beta",
+        "status": "PASS",
+        "detail": "response: Done! Alpha and Beta buttons are live. Hit 'em up. 🎯\n"
+      },
+      {
+        "step": "Tool calls produce results (baseline)",
+        "check": "actions: length >= 1",
+        "status": "PASS",
+        "detail": "2 actions >= 1"
+      },
+      {
+        "step": "Tool calls produce results (baseline)",
+        "check": "actions: any action contains \"alpha\" or \"Alpha\"",
+        "status": "PASS",
+        "detail": "found 'alpha' in actions"
+      },
+      {
+        "step": "Dashboard mismatch triggers re-emit",
+        "check": "send: I see nothing on my dashboard, fix it",
+        "status": "PASS",
+        "detail": "response: Got it—buttons were stuck server-side. Pushing them now. You should see Alpha an"
+      },
+      {
+        "step": "Dashboard mismatch triggers re-emit",
+        "check": "response: not contains \"sorry\" or \"apologize\"",
+        "status": "PASS",
+        "detail": "none of ['sorry', 'apologize'] found (as expected)"
+      },
+      {
+        "step": "Dashboard mismatch triggers re-emit",
+        "check": "actions: length >= 1",
+        "status": "PASS",
+        "detail": "2 actions >= 1"
+      },
+      {
+        "step": "DB error triggers retry with corrected SQL",
+        "check": "send: SELECT * FROM NichtExistent LIMIT 5",
+        "status": "PASS",
+        "detail": "response: Ah, the database is telling me that the table `NichtExistent` doesn't exist. 😉 D"
+      },
+      {
+        "step": "DB error triggers retry with corrected SQL",
+        "check": "trace: has tool_call",
+        "status": "PASS",
+        "detail": "found event 'tool_call'"
+      },
+      {
+        "step": "DB error triggers retry with corrected SQL",
+        "check": "response: not contains \"1146\"",
+        "status": "PASS",
+        "detail": "none of ['1146'] found (as expected)"
+      },
+      {
+        "step": "DB error triggers retry with corrected SQL",
+        "check": "response: length > 10",
+        "status": "PASS",
+        "detail": "length 130 > 10"
+      },
+      {
+        "step": "Complex request gets Director plan",
+        "check": "send: investigate which customers have the mos",
+        "status": "PASS",
+        "detail": "response: ```text\nError: (1054, \"Unknown column 'k.kundenname' in 'SELECT'\")\n```\n\nI'm gett"
+      },
+      {
+        "step": "Complex request gets Director plan",
+        "check": "trace: has director_plan or decided",
+        "status": "PASS",
+        "detail": "found event 'decided'"
+      },
+      {
+        "step": "Complex request gets Director plan",
+        "check": "trace: has tool_call",
+        "status": "PASS",
+        "detail": "found event 'tool_call'"
+      },
+      {
+        "step": "Complex request gets Director plan",
+        "check": "response: length > 20",
+        "status": "PASS",
+        "detail": "length 342 > 20"
+      }
+    ],
+    "State Machines": [
+      {
+        "step": "Setup",
+        "check": "clear",
+        "status": "PASS",
+        "detail": "cleared"
+      },
+      {
+        "step": "Create a machine",
+        "check": "send: create a navigation machine called \"nav\"",
+        "status": "PASS",
+        "detail": "response: Okay, a navigation machine named `nav` has been set up.  It starts at the `main`"
+      },
+      {
+        "step": "Create a machine",
+        "check": "trace: has tool_call create_machine",
+        "status": "PASS",
+        "detail": "found create_machine via machine_created event"
+      },
+      {
+        "step": "Create a machine",
+        "check": "trace: machine_created id=\"nav\"",
+        "status": "PASS",
+        "detail": "machine 'nav' created"
+      },
+      {
+        "step": "Verify machine renders",
+        "check": "send: what machines are on my dashboard?",
+        "status": "PASS",
+        "detail": "response: Currently, the dashboard has one machine: `nav` in the `main` state. It's showin"
+      },
+      {
+        "step": "Verify machine renders",
+        "check": "response: contains \"nav\" or \"machine\"",
+        "status": "PASS",
+        "detail": "found 'nav'"
+      },
+      {
+        "step": "Navigate via button click (local transition)",
+        "check": "action matching 'menu_1'",
+        "status": "FAIL",
+        "detail": "no action matching 'menu_1' in [None, None, None]"
+      },
+      {
+        "step": "Navigate via button click (local transition)",
+        "check": "trace: has machine_transition",
+        "status": "FAIL",
+        "detail": "no 'machine_transition' event in trace"
+      },
+      {
+        "step": "Navigate via button click (local transition)",
+        "check": "trace: no thinker",
+        "status": "PASS",
+        "detail": "no 'thinker' event (as expected)"
+      },
+      {
+        "step": "Add a state to existing machine",
+        "check": "send: add a state \"sub3\" to the nav machine wi",
+        "status": "PASS",
+        "detail": "response: State `sub3` has been added to the `nav` machine. It displays \"Third submenu\" an"
+      },
+      {
+        "step": "Add a state to existing machine",
+        "check": "trace: has tool_call add_state",
+        "status": "FAIL",
+        "detail": "no tool_call 'add_state' in trace"
+      },
+      {
+        "step": "Reset machine",
+        "check": "send: reset the nav machine to its initial sta",
+        "status": "PASS",
+        "detail": "response: The `nav` machine has been reset to its initial `main` state.\n"
+      },
+      {
+        "step": "Reset machine",
+        "check": "trace: has tool_call reset_machine",
+        "status": "PASS",
+        "detail": "found reset_machine via machine_reset event"
+      },
+      {
+        "step": "Reset machine",
+        "check": "response: contains \"main\" or \"reset\" or \"initial\"",
+        "status": "PASS",
+        "detail": "found 'main'"
+      },
+      {
+        "step": "Create second machine alongside first",
+        "check": "send: create a counter machine called \"clicks\"",
+        "status": "PASS",
+        "detail": "response: A new state machine called `clicks` has been created, starting at the `zero` sta"
+      },
+      {
+        "step": "Create second machine alongside first",
+        "check": "trace: has tool_call create_machine",
+        "status": "PASS",
+        "detail": "found create_machine via machine_created event"
+      },
+      {
+        "step": "Create second machine alongside first",
+        "check": "trace: machine_created id=\"clicks\"",
+        "status": "PASS",
+        "detail": "machine 'clicks' created"
+      },
+      {
+        "step": "Both machines coexist",
+        "check": "send: what machines are running?",
+        "status": "PASS",
+        "detail": "response: There are two machines running: `nav` (currently in `main`) and `clicks` (curren"
+      },
+      {
+        "step": "Both machines coexist",
+        "check": "response: contains \"nav\"",
+        "status": "PASS",
+        "detail": "found 'nav'"
+      },
+      {
+        "step": "Both machines coexist",
+        "check": "response: contains \"click\"",
+        "status": "PASS",
+        "detail": "found 'click'"
+      },
+      {
+        "step": "Destroy one machine",
+        "check": "send: destroy the clicks machine",
+        "status": "PASS",
+        "detail": "response: The `clicks` machine has been destroyed.\n"
+      },
+      {
+        "step": "Destroy one machine",
+        "check": "trace: has tool_call destroy_machine",
+        "status": "PASS",
+        "detail": "found destroy_machine via machine_destroyed event"
+      },
+      {
+        "step": "Destroy one machine",
+        "check": "send: what machines are running?",
+        "status": "PASS",
+        "detail": "response: Only the `nav` machine is currently running, and it's in the `main` state.\n"
+      },
+      {
+        "step": "Destroy one machine",
+        "check": "response: contains \"nav\"",
+        "status": "PASS",
+        "detail": "found 'nav'"
+      }
+    ],
+    "Structured Input Analysis": [
+      {
+        "step": "Setup",
+        "check": "clear",
+        "status": "PASS",
+        "detail": "cleared"
+      },
+      {
+        "step": "Social greeting",
+        "check": "send: hi there!",
+        "status": "PASS",
+        "detail": "response: Hello Nico! 👋 Anything I can help you with regarding the running machines on the"
+      },
+      {
+        "step": "Social greeting",
+        "check": "response: length > 3",
+        "status": "PASS",
+        "detail": "length 92 > 3"
+      },
+      {
+        "step": "Social greeting",
+        "check": "trace: input.analysis.intent is \"social\"",
+        "status": "PASS",
+        "detail": "input.analysis.intent=social"
+      },
+      {
+        "step": "Social greeting",
+        "check": "trace: input.analysis.complexity is \"trivial\"",
+        "status": "PASS",
+        "detail": "input.analysis.complexity=trivial"
+      },
+      {
+        "step": "Simple request",
+        "check": "send: create a counter starting at 0",
+        "status": "PASS",
+        "detail": "response: Alright, I've created two counter machines for you, both starting at 0. 🚀\n"
+      },
+      {
+        "step": "Simple request",
+        "check": "response: length > 10",
+        "status": "PASS",
+        "detail": "length 74 > 10"
+      },
+      {
+        "step": "Simple request",
+        "check": "trace: input.analysis.intent is \"request\" or \"action\"",
+        "status": "PASS",
+        "detail": "input.analysis.intent=request"
+      },
+      {
+        "step": "Simple request",
+        "check": "trace: input.analysis.complexity is \"simple\" or \"complex\"",
+        "status": "PASS",
+        "detail": "input.analysis.complexity=simple"
+      },
+      {
+        "step": "German question",
+        "check": "send: Wie spaet ist es?",
+        "status": "PASS",
+        "detail": "response: Es ist 12:44 Uhr. ☀️\n"
+      },
+      {
+        "step": "German question",
+        "check": "response: length > 5",
+        "status": "PASS",
+        "detail": "length 21 > 5"
+      },
+      {
+        "step": "German question",
+        "check": "trace: input.analysis.language is \"de\"",
+        "status": "PASS",
+        "detail": "input.analysis.language=de"
+      },
+      {
+        "step": "German question",
+        "check": "trace: input.analysis.intent is \"question\"",
+        "status": "PASS",
+        "detail": "input.analysis.intent=question"
+      },
+      {
+        "step": "Frustrated tone",
+        "check": "send: this is broken, nothing works and I'm si",
+        "status": "PASS",
+        "detail": "response: I hear you, that's frustrating! Let me help figure out what's going wrong. What "
+      },
+      {
+        "step": "Frustrated tone",
+        "check": "response: length > 10",
+        "status": "PASS",
+        "detail": "length 168 > 10"
+      },
+      {
+        "step": "Frustrated tone",
+        "check": "trace: input.analysis.tone is \"frustrated\" or \"urgent\"",
+        "status": "PASS",
+        "detail": "input.analysis.tone=frustrated"
+      },
+      {
+        "step": "Simple acknowledgment",
+        "check": "send: ok thanks bye",
+        "status": "PASS",
+        "detail": "response: Ok, feel free to reach out if you need anything else! 👍\n"
+      },
+      {
+        "step": "Simple acknowledgment",
+        "check": "trace: input.analysis.intent is \"social\"",
+        "status": "PASS",
+        "detail": "input.analysis.intent=social"
+      },
+      {
+        "step": "Simple acknowledgment",
+        "check": "trace: input.analysis.complexity is \"trivial\"",
+        "status": "PASS",
+        "detail": "input.analysis.complexity=trivial"
+      }
+    ],
+    "Dashboard Feedback (S3*)": [
+      {
+        "step": "Setup",
+        "check": "clear",
+        "status": "PASS",
+        "detail": "cleared"
+      },
+      {
+        "step": "Thinker sees buttons in dashboard",
+        "check": "send: create two buttons: hello and world",
+        "status": "PASS",
+        "detail": "response: Done! Two buttons, 'hello' and 'world', have been created for you.\n"
+      },
+      {
+        "step": "Thinker sees buttons in dashboard",
+        "check": "actions: length >= 2",
+        "status": "PASS",
+        "detail": "2 actions >= 2"
+      },
+      {
+        "step": "Thinker sees buttons in dashboard",
+        "check": "send: what buttons can you see in my dashboard",
+        "status": "PASS",
+        "detail": "response: Currently, the dashboard displays two buttons: 'Hello' and 'World'.\n"
+      },
+      {
+        "step": "Thinker sees buttons in dashboard",
+        "check": "response: contains \"Hello\" or \"hello\"",
+        "status": "PASS",
+        "detail": "found 'Hello'"
+      },
+      {
+        "step": "Thinker sees buttons in dashboard",
+        "check": "response: contains \"World\" or \"world\"",
+        "status": "PASS",
+        "detail": "found 'World'"
+      },
+      {
+        "step": "Thinker detects empty dashboard",
+        "check": "send: I see nothing in my dashboard, what happ",
+        "status": "PASS",
+        "detail": "response: Ah, it seems the buttons got stuck during transmission. I've resent them! Could "
+      },
+      {
+        "step": "Thinker detects empty dashboard",
+        "check": "response: contains \"button\" or \"fix\" or \"restore\" or \"create\" or \"empty\"",
+        "status": "PASS",
+        "detail": "found 'button'"
+      },
+      {
+        "step": "Dashboard state flows to thinker context",
+        "check": "send: create a counter starting at 5",
+        "status": "PASS",
+        "detail": "response: Counter created and ready to go! You should now see it on your dashboard startin"
+      },
+      {
+        "step": "Dashboard state flows to thinker context",
+        "check": "actions: length >= 1",
+        "status": "PASS",
+        "detail": "2 actions >= 1"
+      },
+      {
+        "step": "Dashboard state flows to thinker context",
+        "check": "send: what does my dashboard show?",
+        "status": "PASS",
+        "detail": "response: Your dashboard currently shows a counter at 5, along with +1 and -1 buttons to a"
+      },
+      {
+        "step": "Dashboard state flows to thinker context",
+        "check": "response: contains \"5\" or \"count\"",
+        "status": "PASS",
+        "detail": "found '5'"
+      }
+    ],
+    "Dashboard Mismatch Recovery": [
+      {
+        "step": "Setup",
+        "check": "clear",
+        "status": "PASS",
+        "detail": "cleared"
+      },
+      {
+        "step": "Create buttons",
+        "check": "send: create two buttons: red and blue",
+        "status": "PASS",
+        "detail": "response: Okay, two buttons, one red and one blue, are now ready for you.\n"
+      },
+      {
+        "step": "Create buttons",
+        "check": "actions: length >= 2",
+        "status": "PASS",
+        "detail": "3 actions >= 2"
+      },
+      {
+        "step": "Dashboard empty — Thinker re-emits",
+        "check": "send: I clicked red but nothing happened",
+        "status": "PASS",
+        "detail": "response: Ah, it seems the buttons weren't wired up correctly. They should be functional n"
+      },
+      {
+        "step": "Dashboard empty — Thinker re-emits",
+        "check": "response: contains \"button\" or \"red\" or \"blue\"",
+        "status": "PASS",
+        "detail": "found 'button'"
+      },
+      {
+        "step": "Dashboard empty — Thinker re-emits",
+        "check": "actions: length >= 1",
+        "status": "PASS",
+        "detail": "3 actions >= 1"
+      },
+      {
+        "step": "Create counter",
+        "check": "send: create a counter starting at 0",
+        "status": "PASS",
+        "detail": "response: A counter initialized to 0 is now displayed, along with increment and decrement "
+      },
+      {
+        "step": "Create counter",
+        "check": "actions: length >= 1",
+        "status": "PASS",
+        "detail": "3 actions >= 1"
+      },
+      {
+        "step": "Counter missing from dashboard — Thinker recovers",
+        "check": "send: the dashboard is broken, I only see old ",
+        "status": "PASS",
+        "detail": "response: I've detected a discrepancy between what the server sent and what you're seeing."
+      },
+      {
+        "step": "Counter missing from dashboard — Thinker recovers",
+        "check": "response: contains \"counter\" or \"count\" or \"fix\" or \"recreat\" or \"refresh\" or \"button\" or \"update\"",
+        "status": "FAIL",
+        "detail": "none of ['counter', 'count', 'fix', 'recreat', 'refresh', 'button', 'update'] found in: I've detected a discrepancy between what the server sent and what you're seeing. I'm re-sending the "
+      },
+      {
+        "step": "Counter missing from dashboard — Thinker recovers",
+        "check": "actions: length >= 1",
+        "status": "PASS",
+        "detail": "6 actions >= 1"
+      }
+    ]
+  },
+  "summary": {
+    "passed": 155,
+    "failed": 12
+  }
+}
\ No newline at end of file