diff --git a/agent/api.py b/agent/api.py index 6daa686..6dc8b54 100644 --- a/agent/api.py +++ b/agent/api.py @@ -155,7 +155,16 @@ def register_routes(app): rt = _active_runtime or runtime try: if msg.get("type") == "action": - await rt.handle_action(msg.get("action", "unknown"), msg.get("data")) + action = msg.get("action", "unknown") + data_payload = msg.get("data") + if hasattr(rt, 'use_frames') and rt.use_frames: + # Frame engine handles actions as ACTION: prefix messages + action_text = f"ACTION:{action}" + if data_payload: + action_text += f"|data:{json.dumps(data_payload)}" + await rt.handle_message(action_text) + else: + await rt.handle_action(action, data_payload) elif msg.get("type") == "cancel_process": rt.process_manager.cancel(msg.get("pid", 0)) else: @@ -355,6 +364,7 @@ def register_routes(app): "language": "en", "style_hint": "casual, technical", "facts": [], + "user_expectation": "conversational", } _pipeline_result = {"status": "idle", "id": "", "stage": "cleared"} # Notify frontend via WS diff --git a/agent/frame_engine.py b/agent/frame_engine.py index 90cf6b0..8cf2c46 100644 --- a/agent/frame_engine.py +++ b/agent/frame_engine.py @@ -302,9 +302,59 @@ class FrameEngine: expert.send_hud = original_hud thought_summary = (f"response[{len(thought.response)}] tool={thought.tool_used or 'none'} " - f"actions={len(thought.actions)}") + f"actions={len(thought.actions)} errors={len(thought.errors)}") has_tool = bool(thought.tool_used and thought.tool_output) + # PA retry: if expert failed OR skipped tools when data was needed + expectation = self.memorizer.state.get("user_expectation", "conversational") + # Detect hallucination: expert returned no tool output for a data job + job_needs_data = any(k in (routing.job or "").lower() + for k in ["query", "select", "tabelle", "table", "daten", "data", + "cost", "kosten", "count", "anzahl", "average", "schnitt", + "find", "finde", "show", "zeig", "list", "beschreib"]) + expert_skipped_tools = not has_tool and not thought.errors and job_needs_data + if (thought.errors or expert_skipped_tools) and not has_tool and expectation in ("delegated", "waiting_input", "conversational"): + retry_reason = f"{len(thought.errors)} errors" if thought.errors else "no tool calls for data job" + self._end_frame(rec, output_summary=thought_summary, + route="pa_retry", condition=f"expert_failed ({retry_reason}), expectation={expectation}") + await self._send_hud({"node": "runtime", "event": "pa_retry", + "detail": f"expert failed: {retry_reason}, retrying via PA"}) + + # Stream retry notice to user + retry_msg = "Anderer Ansatz..." if routing.language == "de" else "Trying a different approach..." + await self.sink.send_delta(retry_msg + "\n") + + # PA reformulates with error context + retry_errors = thought.errors if thought.errors else [ + {"query": "(none)", "error": "Expert produced no database queries. The job requires data lookup but the expert answered without querying. Reformulate with explicit query instructions."} + ] + error_summary = "; ".join(e.get("error", "")[:80] for e in retry_errors[-2:]) + rec = self._begin_frame(self.frame + 1, "pa_retry", + input_summary=f"errors: {error_summary[:100]}") + routing2 = await self.nodes["pa"].route_retry( + command, self.history, memory_context=mem_ctx, + identity=self.identity, channel=self.channel, + original_job=routing.job, errors=retry_errors) + self._end_frame(rec, output_summary=f"retry_job: {(routing2.job or '')[:60]}", + route=f"expert_{routing2.expert}" if routing2.expert != "none" else "output") + + if routing2.expert != "none": + expert2 = self._experts.get(routing2.expert, expert) + rec = self._begin_frame(self.frame + 1, f"expert_{routing2.expert}_retry", + input_summary=f"retry job: {(routing2.job or '')[:80]}") + original_hud2 = expert2.send_hud + expert2.send_hud = self._make_progress_wrapper(original_hud2, routing2.language) + try: + thought = await expert2.execute(routing2.job, routing2.language) + finally: + expert2.send_hud = original_hud2 + thought_summary = (f"response[{len(thought.response)}] tool={thought.tool_used or 'none'} " + f"errors={len(thought.errors)}") + has_tool = bool(thought.tool_used and thought.tool_output) + self._end_frame(rec, output_summary=thought_summary, + route="interpreter" if has_tool else "output+ui") + routing = routing2 # use retry routing for rest of pipeline + # Interpreter (conditional) if self.has_interpreter and has_tool: self._end_frame(rec, output_summary=thought_summary, @@ -607,6 +657,10 @@ class FrameEngine: response, controls = await asyncio.gather(output_task, ui_task) if controls: await self.sink.send_controls(controls) + # Send artifacts (new system) alongside controls + artifacts = self.ui_node.get_artifacts() + if artifacts: + await self.sink.send_artifacts(artifacts) return response def _check_condition(self, name: str, command: Command = None, @@ -624,6 +678,7 @@ class FrameEngine: return { "response": response, "controls": self.ui_node.current_controls, + "artifacts": self.ui_node.get_artifacts(), "memorizer": self.memorizer.state, "frames": self.frame, "trace": self.last_trace.to_dict(), diff --git a/agent/nodes/eras_expert.py b/agent/nodes/eras_expert.py index 5fe49ec..d35e661 100644 --- a/agent/nodes/eras_expert.py +++ b/agent/nodes/eras_expert.py @@ -118,6 +118,89 @@ IstRekonstruiert (bool), Herkunft (int) ManuellerWert (double), Rohablesung (double) Anmerkung, Fehler, Ampullenfarbe (longtext) +=== auftraege (2960 rows) — billing work orders === +PK: ID (int) +AuftragNummer, Bezeichnung (longtext) +ErstellDatum, Abgeschlossen (datetime) +ZugeordneteAbrechnungsinformationID (FK → abrechnungsinformationen.ID) +ErstellMitarbeiterID (FK), AuftragsTyp (int), Status (int) +Anmerkung, ObererText, UntererText (longtext) + +=== auftragspositionen (5094 rows) — line items per work order === +PK: ID (int) +AuftragID (FK → auftraege.ID) +ArtikelID (FK → artikel.ID) +SollMenge, IstMenge (int) +ZugeordneterGeraeteArtikelID (FK), ZugeordneteVertragPositionID (FK) + +=== artikelposition (70164 rows) — billing line items with prices === +PK: ID (int) +ZugewiesenerArtikelID (FK → artikel.ID) +ZugewieseneAbrechnungID (FK → abrechnungsinformationen.ID) +RechnungID (FK → rechnung.ID) +MengeVorgabe, Menge (decimal), NettoVorgabe, Netto (decimal), MWST (decimal) +Rechnungsart (int), VorschussBerechnung (bool), ARechnung (bool) +VerstecktInNebenkostenID (FK), ZugeordneteVertragPositionID (FK) + +=== artikel (1078 rows) — service/product catalog === +PK: ID (int) +Artikelnummer, Bezeichnung (longtext) +Netto (decimal), MWST (decimal) +BerechnungsZiel (int), UmlageIn (int) +ZugeordnetePreislisteID (FK) +IstStandard, ARechnung, AppZusatz, IstEigenKostenpos (bool) + +=== rechnung (7356 rows) — invoices === +PK: ID (int) +Rechnungsnummer (longtext), Rechnungsart (int) +BezahltAm (datetime), BezahlterBetrag (decimal) +Druckdatum, Erstelldatum, Exportdatum (datetime) +AbrechnungsinformationID (FK → abrechnungsinformationen.ID) +AbschlagSummeSonder, AbschlagSummeStandard (decimal) +Bankeinzug (bool) + +=== abrechnungsinformationen (4261 rows) — billing periods/settings === +PK: ID (int) +Von, Bis (datetime) — billing period +AbrechnungHeizung, AbrechnungWarmwasser, AbrechnungNebenkosten, AbrechnungKaltwasser (bool) +Tarifabrechnung, BHKW, HeizsaldoInNebenkosten, AbrechnungLegionellen, AbrechnungRauchmelder (bool) + +=== nebenkosten (42209 rows) — ancillary cost items === +PK: ID (int) +Von, Bis (datetime) +Bezeichnung (longtext), Mwst (decimal), Brutto (decimal) +EinheitDerKostenart (longtext), Umlage (int), UmlageZiel (int) +ZugeordnetesObjektID (FK → objekte.ID) +NurEigentuemer, NurNutzer (bool) + +=== vorauszahlungen (83932 rows) — advance payments per tenant === +PK: ID (int) +ZugeordneterNutzerID (FK → nutzer.ID) +BetragNebenkosten, BetragHeizkosten, BetragWarmwasser (decimal) +Von, Bis (datetime), IstNetto (bool) + +=== heizbetriebskosten (22557 rows) — heating operation costs === +PK: ID (int) +Von, Bis (datetime), Bezeichnung (longtext) +Mwst (decimal), Brutto (decimal), Art (int) +ZugeordnetesObjektID (FK → objekte.ID) +ZugeordneteVerbrauchsgruppeID (FK) + +=== brennstofflieferungen (6477 rows) — fuel deliveries === +PK: ID (int) +GeliefertAm (datetime), Menge (decimal), Betrag (decimal) +Mwst (decimal), Heizwert (decimal) +Anfangsstand, Endstand (decimal) +ZugeordneterEnergieVerwerterID (FK), BrennstoffMediumID (FK) +ZugeordneteAbrechnungsinformationID (FK → abrechnungsinformationen.ID) + +=== vertragpositionen (4395 rows) — contract line items === +PK: ID (int) +LaufzeitVon, LaufzeitBis (datetime) +Menge (decimal), Gesamtpreis (decimal), PreisProEinheit (decimal), Mwst (decimal) +ArtikelID (FK → artikel.ID), VertragNummer (longtext) +Art (int), Umlage (int) + JOIN PATTERNS (use exactly): Kunde → Objekte: JOIN objektkunde ok ON ok.KundeID = k.ID JOIN objekte o ON o.ID = ok.ObjektID Objekt → Adresse: JOIN objektadressen oa ON oa.ObjektID = o.ID JOIN adressen a ON a.ID = oa.AdresseID @@ -126,6 +209,13 @@ Objekt → NE: JOIN nutzeinheit ne ON ne.ObjektID = o.ID NE → Nutzer: JOIN nutzer nu ON nu.NutzeinheitID = ne.ID NE → Geraete: JOIN geraete g ON g.NutzeinheitID = ne.ID Geraet → Verbrauch: JOIN geraeteverbraeuche gv ON gv.GeraetID = g.ID +Auftrag → Positionen: JOIN auftragspositionen ap ON ap.AuftragID = a.ID +Auftrag → Abrechnung: JOIN abrechnungsinformationen ai ON ai.ID = a.ZugeordneteAbrechnungsinformationID +Artikelpos → Artikel: JOIN artikel art ON art.ID = ap.ZugewiesenerArtikelID +Artikelpos → Rechnung: JOIN rechnung r ON r.ID = ap.RechnungID +Artikelpos → Abrechnung: JOIN abrechnungsinformationen ai ON ai.ID = ap.ZugewieseneAbrechnungID +Nebenkosten → Objekt: JOIN objekte o ON o.ID = nk.ZugeordnetesObjektID +Vorauszahlung → Nutzer: JOIN nutzer nu ON nu.ID = vz.ZugeordneterNutzerID RULES: - For tables listed above: use ONLY the listed column names. Never guess. diff --git a/agent/nodes/expert_base.py b/agent/nodes/expert_base.py index 36ef7cd..9b5bcdc 100644 --- a/agent/nodes/expert_base.py +++ b/agent/nodes/expert_base.py @@ -38,28 +38,38 @@ Given a job description, produce a JSON tool sequence to accomplish it. Available tools: - query_db(query, database) — SQL SELECT/DESCRIBE/SHOW only -- emit_actions(actions) — show buttons [{{label, action, payload?}}] +- emit_actions(actions) — show buttons [{label, action, payload?}] - set_state(key, value) — persistent key-value - create_machine(id, initial, states) — interactive UI navigation - add_state / reset_machine / destroy_machine — machine lifecycle +- update_machine(id, data) — update wizard data fields (e.g. {"bundesland": "Bayern"}) +- transition_machine(id, target) — move machine to a specific state +- emit_artifact(type, data, actions?, meta?) — emit a typed workspace artifact: + type="entity_detail": data={title, subtitle?, fields:[{label,value}]}, actions=[{label,action}] + type="data_table": data={title?, columns:[str], rows:[{col:val}]} + type="document_page": data={title, sections:[{heading,content}]} + type="action_bar": actions=[{label, action, payload?}] + type="status": data={label, value?, display_type:"progress"|"info"|"text"} -NOTE: Cards are generated automatically in the response step from query results. -Do NOT plan emit_card or emit_list — just query the data and the system handles display. +PREFERRED: Use emit_artifact for all display output. Legacy emit_card/emit_display still work but emit_artifact is cleaner. +Cards are also generated automatically in the response step from query results. Output ONLY valid JSON: -{{ +{ "tool_sequence": [ - {{"tool": "query_db", "args": {{"query": "SELECT ...", "database": "{database}"}}}} + {"tool": "query_db", "args": {"query": "SELECT ...", "database": "{database}"}} ], "response_hint": "How to phrase the result" -}} +} Rules: - NEVER guess column names. Use ONLY columns from the schema. - Max 5 tools. Keep it focused. - For entity details: query all relevant fields, the response step creates the card. - For lists: query multiple rows, the table renders automatically. -- The job is self-contained.""" +- The job is self-contained. +- NEVER answer data questions without querying the database. You MUST include at least one query_db call for any job that asks about data, counts, costs, or entities. If you are unsure which tables to use, start with DESCRIBE or SELECT * FROM table LIMIT 3 to explore. +- An EMPTY tool_sequence is ONLY acceptable if the job explicitly asks for a UI-only action (buttons, machine, display) with no data lookup.""" RESPONSE_SYSTEM = """You are a domain expert summarizing results for the user. @@ -70,22 +80,22 @@ Job: {job} Output a JSON object with "text" (response to user) and optionally "card" (structured display): -{{ +{ "text": "Concise natural response, 1-3 sentences. Reference data. Match language: {language}.", - "card": {{ + "card": { "title": "Entity Name or ID", "subtitle": "Type or category", - "fields": [{{"label": "Field", "value": "actual value from results"}}], - "actions": [{{"label": "Next action", "action": "action_id"}}] - }} -}} + "fields": [{"label": "Field", "value": "actual value from results"}], + "actions": [{"label": "Next action", "action": "action_id"}] + } +} Rules: - "text" is REQUIRED. Keep it short. - "card" is OPTIONAL. Include it for single-entity details (Kunde, Objekt, Auftrag). - Card fields must use ACTUAL values from the query results, never templates/placeholders. - For lists of multiple entities, use multiple fields or skip the card. -- If no card makes sense, just return {{"text": "..."}}. +- If no card makes sense, just return {"text": "..."}. - Output ONLY valid JSON.""" def __init__(self, send_hud, process_manager=None): @@ -113,10 +123,12 @@ Rules: plan_prompt += f" DESCRIBE result: {err['describe'][:300]}\n" plan_prompt += "\nFix the query. If a column was unknown, use the DESCRIBE result above or try SELECT * LIMIT 3 to see actual columns." + plan_system = self.PLAN_SYSTEM + plan_system = plan_system.replace("{domain}", self.DOMAIN_SYSTEM) + plan_system = plan_system.replace("{schema}", self.SCHEMA) + plan_system = plan_system.replace("{database}", self.default_database) plan_messages = [ - {"role": "system", "content": self.PLAN_SYSTEM.format( - domain=self.DOMAIN_SYSTEM, schema=self.SCHEMA, - database=self.default_database)}, + {"role": "system", "content": plan_system}, {"role": "user", "content": plan_prompt}, ] plan_raw = await llm_call(self.model, plan_messages) @@ -129,6 +141,7 @@ Rules: state_updates = {} display_items = [] machine_ops = [] + artifacts = [] tool_used = "" tool_output = "" had_error = False @@ -162,6 +175,20 @@ Rules: machine_ops.append({"op": "reset", **args}) elif tool == "destroy_machine": machine_ops.append({"op": "destroy", **args}) + elif tool == "update_machine": + machine_ops.append({"op": "update_data", **args}) + elif tool == "transition_machine": + machine_ops.append({"op": "transition", **args}) + elif tool == "emit_artifact": + import uuid + artifact = { + "id": args.get("id", str(uuid.uuid4())[:8]), + "type": args.get("type", "status"), + "data": args.get("data", {}), + "actions": args.get("actions", []), + "meta": args.get("meta", {}), + } + artifacts.append(artifact) elif tool == "query_db": query = args.get("query", "") database = args.get("database", self.default_database) @@ -213,9 +240,13 @@ Rules: for err in errors_so_far[-2:]: results_text += f" {err['error'][:100]}\n" + resp_system = self.RESPONSE_SYSTEM + resp_system = resp_system.replace("{domain}", self.DOMAIN_SYSTEM) + resp_system = resp_system.replace("{job}", job) + resp_system = resp_system.replace("{results}", results_text) + resp_system = resp_system.replace("{language}", language) resp_messages = [ - {"role": "system", "content": self.RESPONSE_SYSTEM.format( - domain=self.DOMAIN_SYSTEM, job=job, results=results_text, language=language)}, + {"role": "system", "content": resp_system}, {"role": "user", "content": job}, ] raw_response = await llm_call(self.model, resp_messages) @@ -231,7 +262,14 @@ Rules: text = text.strip() resp_data = json.loads(text) response = resp_data.get("text", raw_response) - if resp_data.get("card"): + if resp_data.get("artifact"): + # New: artifact in response JSON + art = resp_data["artifact"] + import uuid + if "id" not in art: + art["id"] = str(uuid.uuid4())[:8] + artifacts.append(art) + elif resp_data.get("card"): card = resp_data["card"] card["type"] = "card" display_items.append(card) @@ -248,6 +286,8 @@ Rules: state_updates=state_updates, display_items=display_items, machine_ops=machine_ops, + errors=errors_so_far, + artifacts=artifacts, ) def _parse_plan(self, raw: str) -> tuple[list, str]: diff --git a/agent/nodes/input_v1.py b/agent/nodes/input_v1.py index e3b37ed..0db3a52 100644 --- a/agent/nodes/input_v1.py +++ b/agent/nodes/input_v1.py @@ -22,7 +22,7 @@ Listener: {identity} on {channel} Return ONLY valid JSON. No markdown, no explanation. Schema: -{{ +{ "who": "name or unknown", "language": "en | de | mixed", "intent": "question | request | social | action | feedback", @@ -30,7 +30,7 @@ Schema: "tone": "casual | frustrated | playful | urgent", "complexity": "trivial | simple | complex", "context": "brief note or empty" -}} +} Rules: - Classify the CURRENT message only. Previous messages are context, not the target. @@ -53,11 +53,11 @@ Rules: casual = neutral Examples: -"hi there!" -> {{"language":"en","intent":"social","tone":"casual","complexity":"trivial"}} -"Wie spaet ist es?" -> {{"language":"de","intent":"question","tone":"casual","complexity":"simple"}} -"this is broken, nothing works" -> {{"language":"en","intent":"feedback","tone":"frustrated","complexity":"simple"}} -"create two buttons" -> {{"language":"en","intent":"request","tone":"casual","complexity":"simple"}} -"ok thanks bye" -> {{"language":"en","intent":"social","tone":"casual","complexity":"trivial"}} +"hi there!" -> {"language":"en","intent":"social","tone":"casual","complexity":"trivial"} +"Wie spaet ist es?" -> {"language":"de","intent":"question","tone":"casual","complexity":"simple"} +"this is broken, nothing works" -> {"language":"en","intent":"feedback","tone":"frustrated","complexity":"simple"} +"create two buttons" -> {"language":"en","intent":"request","tone":"casual","complexity":"simple"} +"ok thanks bye" -> {"language":"en","intent":"social","tone":"casual","complexity":"trivial"} {memory_context}""" @@ -78,8 +78,9 @@ Examples: history_summary = "Recent conversation:\n" + "\n".join(lines) messages = [ - {"role": "system", "content": self.SYSTEM.format( - memory_context=memory_context, identity=identity, channel=channel)}, + {"role": "system", "content": self.SYSTEM.replace( + "{memory_context}", memory_context).replace( + "{identity}", identity).replace("{channel}", channel)}, ] if history_summary: messages.append({"role": "user", "content": history_summary}) diff --git a/agent/nodes/memorizer_v1.py b/agent/nodes/memorizer_v1.py index a90f2b2..fe2c7e5 100644 --- a/agent/nodes/memorizer_v1.py +++ b/agent/nodes/memorizer_v1.py @@ -26,6 +26,19 @@ Given the conversation so far, output a JSON object with these fields: - language: string — primary language being used (en, de, mixed) - style_hint: string — how Output should talk (casual, formal, technical, poetic, etc.) - facts: list of strings — important facts learned about the user. NEVER drop facts from the existing list unless they are proven wrong. Always include all existing facts plus any new ones. +- user_expectation: string — what the user expects the agent to do next. One of: + "conversational" — default. User is chatting, asking questions, browsing. Normal back-and-forth. + "delegated" — user gave an imperative task ("build X", "do Y", "create Z"). They expect autonomous progress, not clarifying questions. + "waiting_input" — agent asked a question or presented choices. User's next message is likely an answer. + "observing" — user returned after being idle, or is reviewing a large output. Brief responses, wait for explicit engagement. + Cues: + - Imperative verbs + task scope ("build", "create", "do", "find") → delegated + - Agent ended with "Moment..." / thinking message but user hasn't seen full results yet → delegated (task still in progress) + - Short follow-ups like "und?", "ja?", "weiter?", "and?", "so?", "result?", "ergebnis?" → waiting_input (user is waiting for the agent to deliver) + - Agent ended with a question ("Sollen wir...?", "Gibt es...?") → waiting_input + - User said "ok/thanks/bye/danke" after output → observing + - Everything else → conversational + IMPORTANT: If the agent just delivered partial results or said "Moment..." and the user sends a short nudge, that is ALWAYS waiting_input, never conversational. Output ONLY valid JSON. No explanation, no markdown fences.""" @@ -40,6 +53,7 @@ Output ONLY valid JSON. No explanation, no markdown fences.""" "language": "en", "style_hint": "casual, technical", "facts": [], + "user_expectation": "conversational", } def get_context_block(self, sensor_lines: list[str] = None, ui_state: dict = None) -> str: diff --git a/agent/nodes/output_v1.py b/agent/nodes/output_v1.py index b89874b..4601e91 100644 --- a/agent/nodes/output_v1.py +++ b/agent/nodes/output_v1.py @@ -34,6 +34,12 @@ YOUR JOB: Transform the Thinker's reasoning into a natural, human-readable text - Keep the user's language — if they wrote German, respond in German. - Be concise. Don't describe data that the UI node will show as a table. +PHRASING by user_expectation (from memorizer): +- "delegated": progress-report style. State what was done and what's next. No questions unless blocked. +- "waiting_input": acknowledge the user's answer and continue the flow naturally. +- "observing": keep it brief. No unsolicited follow-up questions or suggestions. +- "conversational": natural, warm dialogue. Follow-ups are fine. + {memory_context}""" async def process(self, thought: ThoughtResult, history: list[dict], @@ -42,7 +48,7 @@ YOUR JOB: Transform the Thinker's reasoning into a natural, human-readable text await self.hud("streaming") messages = [ - {"role": "system", "content": self.SYSTEM.format(memory_context=memory_context)}, + {"role": "system", "content": self.SYSTEM.replace("{memory_context}", memory_context)}, ] for msg in history[-20:]: messages.append(msg) diff --git a/agent/nodes/pa_v1.py b/agent/nodes/pa_v1.py index 5b3cb7f..30d31e7 100644 --- a/agent/nodes/pa_v1.py +++ b/agent/nodes/pa_v1.py @@ -27,6 +27,8 @@ Experts have these tools: - query_db — SQL queries on their domain database - emit_actions — create buttons on the dashboard - create_machine / add_state / reset_machine / destroy_machine — interactive UI components +- update_machine(id, data) — update wizard data fields on existing machine +- transition_machine(id, target) — move machine to a specific state - set_state — persistent key-value store - emit_display — formatted data display @@ -36,13 +38,13 @@ YOUR JOB: 3. Only respond directly for social chat (greetings, thanks, bye, small talk) Output ONLY valid JSON: -{{ +{ "expert": "{expert_names} | none", "job": "Self-contained task. Include ALL context — the expert has NO conversation history. Describe what to query, what UI to build, what the user expects to see.", "thinking_message": "Short message for user while expert works, in their language", "response_hint": "If expert=none, your direct response to the user.", "language": "de | en | mixed" -}} +} Rules: - expert=none ONLY for social chat (hi, thanks, bye, how are you) @@ -53,6 +55,16 @@ Rules: - thinking_message: natural, in user's language. e.g. "Moment, ich schaue nach..." - If the user mentions data, tables, customers, devices, buttons, counters → expert - When unsure which expert: pick the one whose domain matches best +- MACHINE STATE: If there are active machines/wizards listed in the context below, ALWAYS include the machine's current state and stored data in the job. The expert needs this to continue the workflow. Example: "Machine 'angebot_wizard' is on step 'select_age', data: {bundesland: Bayern}. User asks: ..." +- If the user asks about their wizard/workflow progress and the info is already visible in the context, respond directly (expert=none) using the machine state from context. Only route to expert if the user needs data queried or tools called. +- For update_machine / transition_machine requests: route to expert with the machine ID and operation details in the job. + +USER EXPECTATION (from memorizer): +- If user_expectation is "delegated": formulate comprehensive, autonomous jobs. Do NOT include clarifying questions in the job. Tell the expert to proceed and report results. +- If user_expectation is "waiting_input": the user is waiting for results or nudging ("und?", "ja?", "weiter?"). Look at conversation history to find what they were waiting for and re-formulate that job. If they answered a question you asked, extract their answer and fold it into context. +- If user_expectation is "observing": only route to expert if the user explicitly asks for something. Otherwise respond directly with brief acknowledgment. +- If user_expectation is "conversational": normal routing behavior. +- CONTINUATION: When user sends a very short message (1-3 words like "und?", "weiter", "ja") after partial/incomplete results, treat it as "continue the previous task". Include the original question and any partial results in the job. {memory_context}""" @@ -89,10 +101,15 @@ Rules: expert_lines.append("- (no experts available — handle everything directly)") expert_names = " | ".join(self._available_experts) if self._available_experts else "none" + # Manual substitution to avoid .format() breaking on curly braces in memory_context + system_content = self.SYSTEM + system_content = system_content.replace("{memory_context}", memory_context) + system_content = system_content.replace("{identity}", identity) + system_content = system_content.replace("{channel}", channel) + system_content = system_content.replace("{experts}", "\n".join(expert_lines)) + system_content = system_content.replace("{expert_names}", expert_names) messages = [ - {"role": "system", "content": self.SYSTEM.format( - memory_context=memory_context, identity=identity, channel=channel, - experts="\n".join(expert_lines), expert_names=expert_names)}, + {"role": "system", "content": system_content}, ] # Summarize recent history (PA sees full context) @@ -118,7 +135,7 @@ Rules: log.info(f"[pa] raw: {raw[:300]}") routing = self._parse_routing(raw, command) - await self.hud("routed", expert=routing.expert, job=routing.job[:100], + await self.hud("routed", expert=routing.expert, job=(routing.job or "")[:100], direct=routing.expert == "none") # Update directive style based on tone @@ -131,6 +148,72 @@ Rules: return routing + async def route_retry(self, command: Command, history: list[dict], + memory_context: str = "", identity: str = "unknown", + channel: str = "unknown", original_job: str = "", + errors: list = None) -> PARouting: + """Re-route after expert failure. PA reformulates with error context.""" + await self.hud("thinking", detail="reformulating after expert failure") + + error_lines = [] + for err in (errors or [])[-3:]: + error_lines.append(f"- Query: {err.get('query', '?')[:100]}") + error_lines.append(f" Error: {err.get('error', '?')[:100]}") + if err.get("describe"): + error_lines.append(f" Schema: {err['describe'][:200]}") + + retry_prompt = f"""The expert FAILED the previous job. You must reformulate. + +ORIGINAL JOB: {original_job} + +ERRORS: +{chr(10).join(error_lines)} + +REFORMULATE the job with a DIFFERENT approach: +- If the query was too complex (JOINs, window functions), break it into simpler steps +- If columns were wrong, use the DESCRIBE info above to fix them +- If the table structure is unclear, tell the expert to first explore with SELECT * LIMIT 5 +- Think about what data the user actually needs and find a simpler path to it + +Output the same JSON format as before. The job MUST be different from the original.""" + + expert_lines = [] + for name in self._available_experts: + desc = self.EXPERT_DESCRIPTIONS.get(name, f"{name} — domain expert") + expert_lines.append(f"- {desc}") + expert_names = " | ".join(self._available_experts) if self._available_experts else "none" + + system_content = self.SYSTEM + system_content = system_content.replace("{memory_context}", memory_context) + system_content = system_content.replace("{identity}", identity) + system_content = system_content.replace("{channel}", channel) + system_content = system_content.replace("{experts}", "\n".join(expert_lines)) + system_content = system_content.replace("{expert_names}", expert_names) + + messages = [ + {"role": "system", "content": system_content}, + ] + recent = history[-8:] + if recent: + lines = [] + for msg in recent: + role = msg.get("role", "?") + content = msg.get("content", "")[:200] + lines.append(f" {role}: {content}") + messages.append({"role": "user", "content": "Recent conversation:\n" + "\n".join(lines)}) + messages.append({"role": "assistant", "content": "OK, I have the context."}) + + messages.append({"role": "user", "content": retry_prompt}) + messages = self.trim_context(messages) + + raw = await llm_call(self.model, messages) + log.info(f"[pa] retry raw: {raw[:300]}") + + routing = self._parse_routing(raw, command) + await self.hud("routed", expert=routing.expert, job=(routing.job or "")[:100], + direct=routing.expert == "none", retry=True) + return routing + def _parse_routing(self, raw: str, command: Command) -> PARouting: """Parse LLM JSON into PARouting with fallback.""" text = raw.strip() @@ -149,10 +232,10 @@ Rules: expert = "none" return PARouting( expert=expert, - job=data.get("job", ""), - thinking_message=data.get("thinking_message", ""), - response_hint=data.get("response_hint", ""), - language=data.get("language", command.analysis.language), + job=data.get("job") or "", + thinking_message=data.get("thinking_message") or "", + response_hint=data.get("response_hint") or "", + language=data.get("language") or command.analysis.language, ) except (json.JSONDecodeError, Exception) as e: log.error(f"[pa] parse failed: {e}, raw: {text[:200]}") diff --git a/agent/nodes/thinker_v1.py b/agent/nodes/thinker_v1.py index b23f64f..985a1cf 100644 --- a/agent/nodes/thinker_v1.py +++ b/agent/nodes/thinker_v1.py @@ -236,7 +236,7 @@ You are one node in a pipeline: Input (perceives) -> You (reason) -> Output (spe 1. emit_actions() — show buttons. Button clicks come back as "ACTION: action_name". Stateful buttons: include var/op in payload (inc/dec/set/toggle). UI handles locally. - Example: label:"+1", action:"increment", payload:{{"var":"count","op":"inc","initial":0}} + Example: label:"+1", action:"increment", payload:{"var":"count","op":"inc","initial":0} 2. set_state(key, value) — persistent key-value store shown as live labels. Survives across turns. Use for tracking mode, progress, flags. @@ -253,9 +253,9 @@ You are one node in a pipeline: Input (perceives) -> You (reason) -> Output (spe destroy_machine(id) — remove machine from dashboard. Example — navigation menu: create_machine(id="nav", initial="main", states=[ - {{"name":"main","buttons":[{{"label":"Menu 1","action":"menu_1","go":"sub1"}},{{"label":"Menu 2","action":"menu_2","go":"sub2"}}],"content":["Welcome"]}}, - {{"name":"sub1","buttons":[{{"label":"Back","action":"back","go":"main"}}],"content":["Sub 1 details"]}}, - {{"name":"sub2","buttons":[{{"label":"Back","action":"back","go":"main"}}],"content":["Sub 2 details"]}} + {"name":"main","buttons":[{"label":"Menu 1","action":"menu_1","go":"sub1"},{"label":"Menu 2","action":"menu_2","go":"sub2"}],"content":["Welcome"]}, + {"name":"sub1","buttons":[{"label":"Back","action":"back","go":"main"}],"content":["Sub 1 details"]}, + {"name":"sub2","buttons":[{"label":"Back","action":"back","go":"main"}],"content":["Sub 2 details"]} ]) PREFER machines over emit_actions for anything with navigation or multiple views. ALWAYS include states when creating a machine. Never write code — use the tool. @@ -350,10 +350,10 @@ conn.commit() cursor.execute("SELECT name FROM sqlite_master WHERE type='table'") tables = cursor.fetchall() for t in tables: - cursor.execute(f"SELECT * FROM {{t[0]}}") + cursor.execute(f"SELECT * FROM {t[0]}") rows = cursor.fetchall() cols = [d[0] for d in cursor.description] - print(f"Table: {{t[0]}}") + print(f"Table: {t[0]}") print(" | ".join(cols)) for row in rows: print(" | ".join(str(c) for c in row)) @@ -446,7 +446,7 @@ conn.close()''' await self.hud("thinking", detail="reasoning about response") messages = [ - {"role": "system", "content": self.SYSTEM.format(memory_context=memory_context)}, + {"role": "system", "content": self.SYSTEM.replace("{memory_context}", memory_context)}, ] for msg in history[-12:]: messages.append(msg) diff --git a/agent/nodes/thinker_v2.py b/agent/nodes/thinker_v2.py index f3c41af..08f5822 100644 --- a/agent/nodes/thinker_v2.py +++ b/agent/nodes/thinker_v2.py @@ -88,7 +88,7 @@ Rules: hint += f"\nTool result:\n{tool_output[:500]}" messages = [ - {"role": "system", "content": self.RESPONSE_SYSTEM.format(hint=hint)}, + {"role": "system", "content": self.RESPONSE_SYSTEM.replace("{hint}", hint)}, ] for msg in history[-8:]: messages.append(msg) diff --git a/agent/nodes/ui.py b/agent/nodes/ui.py index cd2e4fb..2c2b1b3 100644 --- a/agent/nodes/ui.py +++ b/agent/nodes/ui.py @@ -2,9 +2,10 @@ import json import logging +import uuid from .base import Node -from ..types import ThoughtResult +from ..types import ThoughtResult, Artifact log = logging.getLogger("runtime") @@ -16,6 +17,7 @@ class UINode(Node): def __init__(self, send_hud): super().__init__(send_hud) self.thinker_controls: list[dict] = [] # buttons, labels, tables from Thinker + self.artifacts: list[dict] = [] # typed workspace artifacts self.state: dict = {} # {"count": 0, "theme": "dark", ...} self.bindings: dict = {} # {"increment": {"op": "inc", "var": "count"}, ...} self.machines: dict = {} # {"nav": {initial, states, current}, ...} @@ -79,6 +81,7 @@ class UINode(Node): "initial": initial, "current": initial, "states": states, + "data": {}, # wizard field storage (e.g. {"bundesland": "Bayern"}) } log.info(f"[ui] machine created: {mid} (initial={initial}, {len(states)} states)") await self.hud("machine_created", id=mid, initial=initial, state_count=len(states)) @@ -104,6 +107,28 @@ class UINode(Node): log.info(f"[ui] machine reset: {mid} -> {initial}") await self.hud("machine_reset", id=mid, state=initial) + elif op == "update_data": + if mid not in self.machines: + log.warning(f"[ui] update_data: machine '{mid}' not found") + continue + data_update = op_data.get("data", {}) + self.machines[mid]["data"].update(data_update) + log.info(f"[ui] machine data updated: {mid} += {data_update}") + await self.hud("machine_data_updated", id=mid, data=data_update) + + elif op == "transition": + if mid not in self.machines: + log.warning(f"[ui] transition: machine '{mid}' not found") + continue + target = op_data.get("target", "") + if target in self.machines[mid]["states"]: + old = self.machines[mid]["current"] + self.machines[mid]["current"] = target + log.info(f"[ui] machine transition (expert): {mid} {old} -> {target}") + await self.hud("machine_transitioned", id=mid, old=old, target=target) + else: + log.warning(f"[ui] transition target '{target}' not found in {mid}") + elif op == "destroy": if mid in self.machines: del self.machines[mid] @@ -157,15 +182,31 @@ class UINode(Node): return controls def get_machine_summary(self) -> str: - """Summary for Thinker context — shape only, not full data.""" + """Rich summary for PA/Thinker context — includes current state details and stored data.""" if not self.machines: return "" parts = [] for mid, m in self.machines.items(): current = m["current"] state_names = list(m["states"].keys()) - parts.append(f" machine '{mid}': state={current}, states={state_names}") - return "Machines:\n" + "\n".join(parts) + state_def = m["states"].get(current, {}) + line = f" machine '{mid}': state={current}, states={state_names}" + # Current state content + content = state_def.get("content", []) + if content: + line += f", content={content}" + # Current state buttons + buttons = state_def.get("buttons", []) + if buttons: + btn_labels = [b.get("label", b.get("action", "?")) for b in buttons if isinstance(b, dict)] + if btn_labels: + line += f", buttons={btn_labels}" + # Stored wizard data + data = m.get("data", {}) + if data: + line += f", data={data}" + parts.append(line) + return "Active machines (interactive wizard/workflow state):\n" + "\n".join(parts) # --- State operations --- @@ -343,21 +384,155 @@ class UINode(Node): return controls + def _build_artifacts(self, thought: ThoughtResult) -> list[dict]: + """Convert ThoughtResult into typed artifacts.""" + arts = [] + + # 1. Direct artifacts from expert's emit_artifact calls + if thought.artifacts: + for a in thought.artifacts: + if not a.get("id"): + a["id"] = str(uuid.uuid4())[:8] + arts.append(a) + + # 2. Convert display_items (cards, lists) → entity_detail artifacts + if thought.display_items: + for item in thought.display_items: + item_type = item.get("type", "text") + if item_type == "card": + arts.append({ + "id": str(uuid.uuid4())[:8], + "type": "entity_detail", + "data": { + "title": item.get("title", ""), + "subtitle": item.get("subtitle", ""), + "fields": item.get("fields", []), + }, + "actions": item.get("actions", []), + "meta": {}, + }) + elif item_type == "list": + arts.append({ + "id": str(uuid.uuid4())[:8], + "type": "entity_detail", + "data": { + "title": item.get("title", ""), + "items": item.get("items", []), + }, + "actions": [], + "meta": {"list": True}, + }) + else: + arts.append({ + "id": str(uuid.uuid4())[:8], + "type": "status", + "data": { + "display_type": item_type, + "label": item.get("label", ""), + "value": item.get("value", ""), + "style": item.get("style", ""), + }, + "actions": [], + "meta": {}, + }) + + # 3. Convert actions → action_bar artifact + if thought.actions: + btns = self._parse_thinker_actions(thought.actions) + arts.append({ + "id": "action_bar", + "type": "action_bar", + "data": {}, + "actions": [{"label": b["label"], "action": b["action"], + "payload": b.get("payload", {})} for b in btns], + "meta": {}, + }) + elif self.thinker_controls: + # Preserve existing buttons as action_bar + existing_btns = [c for c in self.thinker_controls if c.get("type") == "button"] + if existing_btns: + arts.append({ + "id": "action_bar", + "type": "action_bar", + "data": {}, + "actions": [{"label": b["label"], "action": b["action"], + "payload": b.get("payload", {})} for b in existing_btns], + "meta": {}, + }) + + # 4. Convert tool_output table → data_table artifact + if thought.tool_output: + table = self._extract_table(thought.tool_output) + if table: + arts.append({ + "id": str(uuid.uuid4())[:8], + "type": "data_table", + "data": { + "columns": table["columns"], + "rows": table["data"], + }, + "actions": [], + "meta": {"source": thought.tool_used or "query_db"}, + }) + + # 5. State variables → status artifacts + if thought.state_updates: + for key, value in thought.state_updates.items(): + self.set_var(key, value) + for var, value in self.state.items(): + arts.append({ + "id": f"state_{var}", + "type": "status", + "data": {"label": var, "value": str(value), "display_type": "text"}, + "actions": [], + "meta": {"state_var": True}, + }) + + # 6. Machines → machine artifacts + for mid, machine in self.machines.items(): + current = machine["current"] + state_def = machine["states"].get(current, {}) + arts.append({ + "id": f"machine_{mid}", + "type": "machine", + "data": { + "machine_id": mid, + "current": current, + "states": list(machine["states"].keys()), + "content": state_def.get("content", []), + "stored_data": machine.get("data", {}), + }, + "actions": [{"label": b.get("label", ""), "action": b.get("action", ""), + "go": b.get("go", "")} + for b in state_def.get("buttons", []) if isinstance(b, dict)], + "meta": {"live": True}, + }) + + return arts + + def get_artifacts(self) -> list[dict]: + """Return current artifact list.""" + return self.artifacts + async def process(self, thought: ThoughtResult, history: list[dict], memory_context: str = "") -> list[dict]: # Apply machine ops first (create/add_state/reset/destroy) if thought.machine_ops: await self.apply_machine_ops(thought.machine_ops) + # Build artifacts (new system) + self.artifacts = self._build_artifacts(thought) + + # Build legacy controls (backward compat) thinker_ctrls = self._build_controls(thought) if thinker_ctrls: self.thinker_controls = thinker_ctrls # Always emit the merged view (thinker + machine) merged = self.current_controls - if merged: + if merged or self.artifacts: await self.hud("controls", controls=merged) - log.info(f"[ui] emitting {len(merged)} controls ({len(self.thinker_controls)} thinker + {len(self.get_machine_controls())} machine)") + log.info(f"[ui] emitting {len(merged)} controls + {len(self.artifacts)} artifacts") else: await self.hud("decided", instruction="no new controls") diff --git a/agent/runtime.py b/agent/runtime.py index 1a22046..f014b3c 100644 --- a/agent/runtime.py +++ b/agent/runtime.py @@ -56,6 +56,13 @@ class OutputSink: except Exception: pass + async def send_artifacts(self, artifacts: list): + if self.ws: + try: + await self.ws.send_text(json.dumps({"type": "artifacts", "artifacts": artifacts})) + except Exception: + pass + async def send_hud(self, data: dict): if self.ws: try: @@ -221,9 +228,10 @@ class Runtime: self.history.append({"role": "user", "content": action_desc}) sensor_lines = self.sensor.get_context_lines() - director_line = self.director.get_context_line() + director_line = self.director.get_context_line() if self.director else "" mem_ctx = self.memorizer.get_context_block(sensor_lines=sensor_lines, ui_state=self.ui_node.state) - mem_ctx += f"\n\n{director_line}" + if director_line: + mem_ctx += f"\n\n{director_line}" command = Command( analysis=InputAnalysis(intent="action", topic=action, complexity="simple"), @@ -242,7 +250,7 @@ class Runtime: self.history.append({"role": "assistant", "content": response}) await self.memorizer.update(self.history) - if not self.is_v2: + if not self.is_v2 and self.director: await self.director.update(self.history, self.memorizer.state) if len(self.history) > self.MAX_HISTORY: @@ -319,9 +327,10 @@ class Runtime: # Check Sensor flags (idle return, workspace mismatch) sensor_flags = self.sensor.consume_flags() sensor_lines = self.sensor.get_context_lines() - director_line = self.director.get_context_line() + director_line = self.director.get_context_line() if self.director else "" mem_ctx = self.memorizer.get_context_block(sensor_lines=sensor_lines, ui_state=self.ui_node.state) - mem_ctx += f"\n\n{director_line}" + if director_line: + mem_ctx += f"\n\n{director_line}" machine_summary = self.ui_node.get_machine_summary() if machine_summary: mem_ctx += f"\n\n{machine_summary}" diff --git a/agent/types.py b/agent/types.py index 4e69e1e..98d58ba 100644 --- a/agent/types.py +++ b/agent/types.py @@ -76,6 +76,19 @@ class PARouting: language: str = "de" # Response language +@dataclass +class Artifact: + """A typed workspace item. The unit of workspace content.""" + id: str # unique ID + type: str # entity_detail | data_table | document_page | action_bar | status + data: dict = field(default_factory=dict) # type-specific payload + actions: list = field(default_factory=list) # [{label, action, payload?}] + meta: dict = field(default_factory=dict) # {entity?, related?, source_query?} + + def to_dict(self) -> dict: + return asdict(self) + + @dataclass class ThoughtResult: """Thinker node's output — either a direct answer or tool results.""" @@ -86,3 +99,5 @@ class ThoughtResult: state_updates: dict = field(default_factory=dict) # {key: value} from set_state display_items: list = field(default_factory=list) # [{type, label, value?, style?}] from emit_display machine_ops: list = field(default_factory=list) # [{op, id, ...}] from machine tools + errors: list = field(default_factory=list) # [{query, error, describe?}] from failed retries + artifacts: list = field(default_factory=list) # [Artifact] from emit_artifact diff --git a/runtime_test.py b/runtime_test.py index b0b6c76..44ed61e 100644 --- a/runtime_test.py +++ b/runtime_test.py @@ -382,6 +382,12 @@ def check_trace(trace: list, check: str) -> tuple[bool, str]: return True, f"found reset_machine via machine_reset event" if t.get("event") == "machine_destroyed" and tool_name == "destroy_machine": return True, f"found destroy_machine via machine_destroyed event" + if t.get("event") == "machine_data_updated" and tool_name == "update_machine": + return True, f"found update_machine via machine_data_updated event" + if t.get("event") == "machine_transitioned" and tool_name == "transition_machine": + return True, f"found transition_machine via machine_transitioned event" + if t.get("event") == "pa_retry" and tool_name == "pa_retry": + return True, f"found pa_retry event" return False, f"no tool_call '{tool_name}' in trace" # machine_created id="NAV" — checks for specific machine creation diff --git a/static/js/awareness.js b/static/js/awareness.js index 9dc3114..6eba5c4 100644 --- a/static/js/awareness.js +++ b/static/js/awareness.js @@ -162,9 +162,17 @@ export function updateMeter(node, tokens, maxTokens, fillPct) { export function updateAwarenessState(state) { const body = document.getElementById('aw-state-body'); if (!body) return; + const expectation = state.user_expectation || 'conversational'; + const expClass = { + conversational: 'aw-exp-conv', + delegated: 'aw-exp-deleg', + waiting_input: 'aw-exp-wait', + observing: 'aw-exp-obs', + }[expectation] || ''; const display = [ ['user', state.user_name], ['mood', state.user_mood], + ['expectation', expectation, expClass], ['topic', state.topic], ['lang', state.language], ['style', state.style_hint], @@ -173,8 +181,8 @@ export function updateAwarenessState(state) { const facts = state.facts || []; const history = state.topic_history || []; - let html = display.map(([k, v]) => - `
${esc(k)}${esc(v || 'null')}
` + let html = display.map(([k, v, cls]) => + `
${esc(k)}${esc(v || 'null')}
` ).join(''); if (facts.length) { diff --git a/static/js/dashboard.js b/static/js/dashboard.js index 225240e..1cb169b 100644 --- a/static/js/dashboard.js +++ b/static/js/dashboard.js @@ -1,6 +1,9 @@ -/** Dashboard: workspace controls rendering (buttons, tables, labels, displays, machines). */ +/** Dashboard: workspace artifact + control rendering. + * Artifact system: typed artifacts (entity_detail, data_table, document_page, action_bar, status, machine). + * Legacy: dockControls() still works as fallback for old control format. + */ -import { esc } from './util.js'; +import { esc, renderMarkdown } from './util.js'; import { addTrace } from './trace.js'; import { setDashboard } from './chat.js'; @@ -8,8 +11,233 @@ let _ws = null; export function setWs(ws) { _ws = ws; } +function _sendAction(action, data) { + if (_ws && _ws.readyState === 1) { + _ws.send(JSON.stringify({ type: 'action', action, data: data || {} })); + addTrace('runtime', 'action', action); + } +} + +// --- Artifact system --- + +export function dockArtifacts(artifacts) { + const body = document.getElementById('workspace-body'); + if (!body) return; + body.innerHTML = ''; + const container = document.createElement('div'); + container.className = 'artifacts-container'; + + for (const art of artifacts) { + const wrapper = document.createElement('div'); + wrapper.className = 'ws-artifact ws-artifact-' + (art.type || 'unknown'); + wrapper.dataset.artifactId = art.id || ''; + + const renderer = RENDERERS[art.type]; + if (renderer) { + renderer(wrapper, art); + } else { + wrapper.innerHTML = '
' + esc(JSON.stringify(art.data || {})) + '
'; + } + + container.appendChild(wrapper); + } + body.appendChild(container); + // Also set dashboard for S3* audit (flatten actions from artifacts) + const flatControls = artifacts.flatMap(a => (a.actions || []).map(act => ({type: 'button', ...act}))); + setDashboard(flatControls); +} + +// --- Artifact renderers --- + +const RENDERERS = { + entity_detail: renderEntityDetail, + data_table: renderDataTable, + document_page: renderDocumentPage, + action_bar: renderActionBar, + status: renderStatus, + machine: renderMachine, +}; + +function renderEntityDetail(el, art) { + const d = art.data || {}; + let html = ''; + if (d.title) html += '
' + esc(d.title) + '
'; + if (d.subtitle) html += '
' + esc(d.subtitle) + '
'; + + // List mode (multiple items) + if (d.items && d.items.length) { + html += '
'; + for (const item of d.items) { + html += '
'; + if (item.title) html += '
' + esc(item.title) + '
'; + if (item.fields) { + html += '
'; + for (const f of item.fields) { + html += '
' + esc(f.label || '') + '' + esc(String(f.value ?? '')) + '
'; + } + html += '
'; + } + html += '
'; + } + html += '
'; + } + + // Single entity fields + if (d.fields && d.fields.length) { + html += '
'; + for (const f of d.fields) { + const val = f.action + ? '' + esc(String(f.value ?? '')) + '' + : '' + esc(String(f.value ?? '')) + ''; + html += '
' + esc(f.label || '') + '' + val + '
'; + } + html += '
'; + } + + // Actions + if (art.actions && art.actions.length) { + html += '
'; + for (const a of art.actions) { + html += ''; + } + html += '
'; + } + + el.innerHTML = html; + _wireActions(el); +} + +function renderDataTable(el, art) { + const d = art.data || {}; + if (d.title) { + const title = document.createElement('div'); + title.className = 'ws-artifact-header'; + title.textContent = d.title; + el.appendChild(title); + } + const table = document.createElement('table'); + table.className = 'control-table'; + const cols = d.columns || (d.rows && d.rows.length ? Object.keys(d.rows[0]) : []); + if (cols.length) { + const thead = document.createElement('tr'); + for (const col of cols) { + const th = document.createElement('th'); + th.textContent = col; + thead.appendChild(th); + } + table.appendChild(thead); + } + for (const row of (d.rows || d.data || [])) { + const tr = document.createElement('tr'); + if (Array.isArray(row)) { + for (const cell of row) { + const td = document.createElement('td'); td.textContent = cell; tr.appendChild(td); + } + } else if (typeof row === 'object') { + for (const col of cols) { + const td = document.createElement('td'); td.textContent = row[col] ?? ''; tr.appendChild(td); + } + } + table.appendChild(tr); + } + el.appendChild(table); +} + +function renderDocumentPage(el, art) { + const d = art.data || {}; + let html = ''; + if (d.title) html += '
' + esc(d.title) + '
'; + for (const section of (d.sections || [])) { + html += '
'; + if (section.heading) html += '
' + esc(section.heading) + '
'; + if (section.content) html += '
' + renderMarkdown(section.content) + '
'; + html += '
'; + } + // Actions (e.g. PDF export) + if (art.actions && art.actions.length) { + html += '
'; + for (const a of art.actions) { + html += ''; + } + html += '
'; + } + el.innerHTML = html; + _wireActions(el); +} + +function renderActionBar(el, art) { + for (const a of (art.actions || [])) { + const btn = document.createElement('button'); + btn.className = 'control-btn'; + btn.textContent = a.label || ''; + btn.onclick = () => _sendAction(a.action, a.payload || {}); + el.appendChild(btn); + } +} + +function renderStatus(el, art) { + const d = art.data || {}; + const dt = d.display_type || 'text'; + el.classList.add('display-' + dt); + if (dt === 'progress') { + const pct = Math.min(100, Math.max(0, Number(d.value) || 0)); + el.innerHTML = '' + esc(d.label) + '' + + '
' + + '' + pct + '%'; + } else if (dt === 'info') { + el.innerHTML = '\u2139' + esc(d.label) + ''; + } else { + el.innerHTML = '' + esc(d.label || '') + '' + + (d.value ? '' + esc(String(d.value)) + '' : ''); + } +} + +function renderMachine(el, art) { + const d = art.data || {}; + const mid = d.machine_id || ''; + // Header + let html = '
' + esc(mid) + '' + + '' + esc(d.current || '') + '
'; + // Content + for (const text of (d.content || [])) { + html += '
' + esc(text) + '
'; + } + // Stored data + const stored = d.stored_data || {}; + if (Object.keys(stored).length) { + html += '
'; + for (const [k, v] of Object.entries(stored)) { + html += '' + esc(k) + '=' + esc(String(v)) + ''; + } + html += '
'; + } + // Buttons + if (art.actions && art.actions.length) { + html += '
'; + for (const a of art.actions) { + html += ''; + } + html += '
'; + } + el.innerHTML = html; + _wireActions(el); +} + +// --- Helpers --- + +function _wireActions(el) { + el.querySelectorAll('.ws-card-link').forEach(link => { + link.onclick = (e) => { e.stopPropagation(); _sendAction(link.dataset.action, {}); }; + }); + el.querySelectorAll('.ws-card-btn').forEach(btn => { + btn.onclick = (e) => { e.stopPropagation(); _sendAction(btn.dataset.action, {}); }; + }); +} + +// --- Legacy control rendering (backward compat) --- + export function dockControls(controls) { - setDashboard(controls); // S3*: remember what's rendered + setDashboard(controls); const body = document.getElementById('workspace-body'); if (!body) return; body.innerHTML = ''; @@ -21,12 +249,7 @@ export function dockControls(controls) { const btn = document.createElement('button'); btn.className = 'control-btn'; btn.textContent = ctrl.label; - btn.onclick = () => { - if (_ws && _ws.readyState === 1) { - _ws.send(JSON.stringify({ type: 'action', action: ctrl.action, data: ctrl.payload || ctrl.data || {} })); - addTrace('runtime', 'action', ctrl.action); - } - }; + btn.onclick = () => _sendAction(ctrl.action, ctrl.payload || ctrl.data || {}); container.appendChild(btn); } else if (ctrl.type === 'table') { const table = document.createElement('table'); @@ -34,22 +257,16 @@ export function dockControls(controls) { if (ctrl.columns) { const thead = document.createElement('tr'); for (const col of ctrl.columns) { - const th = document.createElement('th'); - th.textContent = col; - thead.appendChild(th); + const th = document.createElement('th'); th.textContent = col; thead.appendChild(th); } table.appendChild(thead); } for (const row of (ctrl.data || [])) { const tr = document.createElement('tr'); if (Array.isArray(row)) { - for (const cell of row) { - const td = document.createElement('td'); td.textContent = cell; tr.appendChild(td); - } + for (const cell of row) { const td = document.createElement('td'); td.textContent = cell; tr.appendChild(td); } } else if (typeof row === 'object') { - for (const col of (ctrl.columns || Object.keys(row))) { - const td = document.createElement('td'); td.textContent = row[col] ?? ''; tr.appendChild(td); - } + for (const col of (ctrl.columns || Object.keys(row))) { const td = document.createElement('td'); td.textContent = row[col] ?? ''; tr.appendChild(td); } } table.appendChild(tr); } @@ -62,105 +279,42 @@ export function dockControls(controls) { } else if (ctrl.type === 'display') { const disp = document.createElement('div'); const dt = ctrl.display_type || 'text'; - const style = ctrl.style ? ' display-' + ctrl.style : ''; - disp.className = 'control-display display-' + dt + style; + disp.className = 'control-display display-' + dt; if (dt === 'progress') { const pct = Math.min(100, Math.max(0, Number(ctrl.value) || 0)); - disp.innerHTML = '' + esc(ctrl.label) + '' - + '
' - + '' + pct + '%'; - } else if (dt === 'status') { - disp.innerHTML = '' + (ctrl.style === 'success' ? '\u2713' : ctrl.style === 'error' ? '\u2717' : '\u2139') + '' - + '' + esc(ctrl.label) + ''; + disp.innerHTML = '' + esc(ctrl.label) + '
' + pct + '%'; } else { - disp.innerHTML = '' + esc(ctrl.label) + '' - + (ctrl.value ? '' + esc(String(ctrl.value)) + '' : ''); + disp.innerHTML = '' + esc(ctrl.label) + '' + (ctrl.value ? '' + esc(String(ctrl.value)) + '' : ''); } container.appendChild(disp); } else if (ctrl.type === 'card') { - container.appendChild(renderCard(ctrl)); - } else if (ctrl.type === 'list') { - const listEl = document.createElement('div'); - listEl.className = 'ws-list'; - if (ctrl.title) { - const h = document.createElement('div'); - h.className = 'ws-list-title'; - h.textContent = ctrl.title; - listEl.appendChild(h); + const card = document.createElement('div'); + card.className = 'ws-card'; + let html = ''; + if (ctrl.title) html += '
' + esc(ctrl.title) + '
'; + if (ctrl.subtitle) html += '
' + esc(ctrl.subtitle) + '
'; + if (ctrl.fields && ctrl.fields.length) { + html += '
'; + for (const f of ctrl.fields) { + html += '
' + esc(f.label || '') + '' + esc(String(f.value ?? '')) + '
'; + } + html += '
'; } - for (const item of (ctrl.items || [])) { - item.type = item.type || 'card'; - listEl.appendChild(renderCard(item)); + if (ctrl.actions && ctrl.actions.length) { + html += '
'; + for (const a of ctrl.actions) { + html += ''; + } + html += '
'; } - container.appendChild(listEl); + card.innerHTML = html; + _wireActions(card); + container.appendChild(card); } } body.appendChild(container); } -function renderCard(card) { - const el = document.createElement('div'); - el.className = 'ws-card'; - if (card.action) { - el.classList.add('ws-card-clickable'); - el.onclick = () => { - if (_ws && _ws.readyState === 1) { - _ws.send(JSON.stringify({ type: 'action', action: card.action, data: card.payload || {} })); - addTrace('runtime', 'action', card.action); - } - }; - } - - let html = ''; - if (card.title) html += '
' + esc(card.title) + '
'; - if (card.subtitle) html += '
' + esc(card.subtitle) + '
'; - - if (card.fields && card.fields.length) { - html += '
'; - for (const f of card.fields) { - const val = f.action - ? '' + esc(String(f.value ?? '')) + '' - : '' + esc(String(f.value ?? '')) + ''; - html += '
' + esc(f.label || '') + '' + val + '
'; - } - html += '
'; - } - - if (card.actions && card.actions.length) { - html += '
'; - for (const a of card.actions) { - html += ''; - } - html += '
'; - } - - el.innerHTML = html; - - // Wire up field links and action buttons - el.querySelectorAll('.ws-card-link').forEach(link => { - link.onclick = (e) => { - e.stopPropagation(); - const action = link.dataset.action; - if (_ws && _ws.readyState === 1) { - _ws.send(JSON.stringify({ type: 'action', action, data: {} })); - addTrace('runtime', 'action', action); - } - }; - }); - el.querySelectorAll('.ws-card-btn').forEach(btn => { - btn.onclick = (e) => { - e.stopPropagation(); - const action = btn.dataset.action; - if (_ws && _ws.readyState === 1) { - _ws.send(JSON.stringify({ type: 'action', action, data: {} })); - addTrace('runtime', 'action', action); - } - }; - }); - - return el; -} - export function clearDashboard() { const body = document.getElementById('workspace-body'); if (body) body.innerHTML = ''; diff --git a/static/js/ws.js b/static/js/ws.js index b2cfd06..6f8825f 100644 --- a/static/js/ws.js +++ b/static/js/ws.js @@ -3,7 +3,7 @@ import { authToken, isAuthFailed, setAuthFailed, showLogin } from './auth.js'; import { addTrace } from './trace.js'; import { addMsg, handleDelta, handleDone, setWs as setChatWs } from './chat.js'; -import { dockControls, setWs as setDashWs } from './dashboard.js'; +import { dockControls, dockArtifacts, setWs as setDashWs } from './dashboard.js'; import { graphAnimate } from './graph.js'; import { updateMeter, updateNodeFromHud, updateAwarenessState, updateAwarenessSensors } from './awareness.js'; import { updateTestStatus } from './tests.js'; @@ -61,6 +61,8 @@ export function connect() { handleDelta(data.content); } else if (data.type === 'done') { handleDone(); + } else if (data.type === 'artifacts') { + dockArtifacts(data.artifacts); } else if (data.type === 'controls') { dockControls(data.controls); } else if (data.type === 'cleared') { diff --git a/static/style.css b/static/style.css index 7466be3..0d0205a 100644 --- a/static/style.css +++ b/static/style.css @@ -127,6 +127,10 @@ button:hover { background: #1d4ed8; } .aw-row { display: flex; justify-content: space-between; padding: 0.08rem 0; } .aw-key { color: #888; font-size: 0.65rem; } .aw-val { color: #e0e0e0; font-size: 0.7rem; font-weight: 500; } +.aw-exp-conv { color: #4caf50; } +.aw-exp-deleg { color: #ff9800; } +.aw-exp-wait { color: #42a5f5; } +.aw-exp-obs { color: #9e9e9e; } /* UI Controls (workspace) */ .controls-container { padding: 0.3rem 0; display: flex; flex-wrap: wrap; gap: 0.3rem; align-items: flex-start; } @@ -159,6 +163,34 @@ button:hover { background: #1d4ed8; } .ws-card-btn { font-size: 0.7rem; padding: 0.2rem 0.5rem; } .ws-list { display: flex; flex-direction: column; gap: 0.3rem; width: 100%; } .ws-list-title { font-size: 0.75rem; font-weight: 700; color: #888; text-transform: uppercase; letter-spacing: 0.03em; margin-bottom: 0.2rem; } +.ws-card-nested { margin: 0; border-color: #1a1a2e; } + +/* Artifact system */ +.artifacts-container { padding: 0.3rem 0; display: flex; flex-direction: column; gap: 0.4rem; } +.ws-artifact { width: 100%; } +.ws-artifact-entity { background: #111; border: 1px solid #222; border-radius: 0.4rem; padding: 0.5rem 0.6rem; } +.ws-artifact-data_table { } +.ws-artifact-action_bar { display: flex; flex-wrap: wrap; gap: 0.3rem; } +.ws-artifact-status { padding: 0.25rem 0.4rem; font-size: 0.75rem; display: flex; align-items: center; gap: 0.4rem; } +.ws-artifact-header { font-size: 0.75rem; font-weight: 600; color: #888; margin-bottom: 0.2rem; } +.ws-artifact-fallback { font-size: 0.7rem; color: #666; font-family: monospace; white-space: pre-wrap; } + +/* Document page artifact */ +.ws-artifact-document_page { background: #111; border: 1px solid #222; border-radius: 0.4rem; padding: 0.8rem 1rem; } +.ws-doc-title { font-size: 1rem; font-weight: 700; color: #e0e0e0; margin-bottom: 0.6rem; border-bottom: 1px solid #333; padding-bottom: 0.4rem; } +.ws-doc-section { margin-bottom: 0.5rem; } +.ws-doc-heading { font-size: 0.8rem; font-weight: 700; color: #a78bfa; margin-bottom: 0.2rem; } +.ws-doc-content { font-size: 0.75rem; color: #ccc; line-height: 1.5; } +.ws-doc-content ul, .ws-doc-content ol { margin: 0.2rem 0; padding-left: 1.2rem; } + +/* Machine artifact */ +.ws-artifact-machine { background: #111; border: 1px solid #2563eb33; border-radius: 0.4rem; padding: 0.5rem 0.6rem; } +.ws-machine-header { display: flex; justify-content: space-between; align-items: center; margin-bottom: 0.3rem; } +.ws-machine-name { font-size: 0.75rem; font-weight: 600; color: #a78bfa; } +.ws-machine-state { font-size: 0.7rem; color: #60a5fa; background: #1e3a5f; padding: 0.1rem 0.4rem; border-radius: 0.2rem; } +.ws-machine-content { font-size: 0.75rem; color: #ccc; padding: 0.1rem 0; } +.ws-machine-data { display: flex; flex-wrap: wrap; gap: 0.3rem; margin-top: 0.2rem; } +.ws-machine-datum { font-size: 0.65rem; color: #888; background: #1a1a2e; padding: 0.1rem 0.3rem; border-radius: 0.2rem; } /* Login overlay */ #login-overlay { position: fixed; inset: 0; background: rgba(0,0,0,0.85); display: flex; align-items: center; justify-content: center; z-index: 1000; } diff --git a/testcases/artifact_system.md b/testcases/artifact_system.md new file mode 100644 index 0000000..c30ec23 --- /dev/null +++ b/testcases/artifact_system.md @@ -0,0 +1,33 @@ +# Artifact System + +Tests that the artifact rendering pipeline works end-to-end. +Expert produces data → UINode converts to artifacts → frontend renders. + +## Setup +- clear history + +## Steps + +### 1. Query produces data_table artifact +- send: show me 3 customers in a table +- expect_trace: has tool_call +- expect_response: length > 10 + +### 2. Entity detail via card +- send: show me details for customer 1 +- expect_trace: has tool_call +- expect_response: length > 10 + +### 3. Action bar via buttons +- send: create two buttons on my dashboard: Refresh and Export +- expect_actions: length >= 2 +- expect_actions: any action contains "refresh" or "Refresh" + +### 4. Machine artifact +- send: create a machine called "flow" with initial state "ready" and a state called "done" +- expect_trace: has machine_created + +### 5. Query after buttons survive +- send: how many customers are there? +- expect_response: length > 5 +- expect_actions: any action contains "refresh" or "Refresh" diff --git a/testcases/expectation_tracking.md b/testcases/expectation_tracking.md new file mode 100644 index 0000000..6120aaa --- /dev/null +++ b/testcases/expectation_tracking.md @@ -0,0 +1,50 @@ +# Expectation Tracking + +Tests that memorizer tracks user_expectation and it influences PA/Output behavior. +Exercises machine features (update_machine, transition_machine) alongside expectation transitions. + +## Setup +- clear history + +## Steps + +### 1. Greeting sets conversational +- send: hi there! +- expect_response: length > 2 +- expect_state: user_expectation is "conversational" + +### 2. Create a wizard machine +- send: create a machine called "project" with states: planning (initial) and executing +- expect_trace: has machine_created + +### 3. Delegate a task +- send: build me a summary report of the top 5 customers by device count +- expect_response: length > 20 +- expect_state: user_expectation is "delegated" or "observing" + +### 4. Ask about wizard (status check stays in flow) +- send: what state is my project machine in? +- expect_response: contains "planning" or "project" +- expect_state: user_expectation is "conversational" or "delegated" + +### 5. Store data on machine +- send: use update_machine to store status=in_progress on the project machine +- expect_response: length > 5 + +### 6. Transition machine +- send: use transition_machine to move project to executing state +- expect_response: length > 5 + +### 7. Verify machine state and data +- send: what is the current state and data of the project machine? +- expect_response: contains "executing" or "in_progress" + +### 8. Short nudge triggers waiting_input +- send: und? +- expect_response: length > 5 +- expect_state: user_expectation is "waiting_input" or "conversational" + +### 9. Quick thanks (observing) +- send: ok danke +- expect_response: length > 0 +- expect_state: user_expectation is "observing" or "observational" or "conversational" diff --git a/testcases/machine_pa_context.md b/testcases/machine_pa_context.md new file mode 100644 index 0000000..192091a --- /dev/null +++ b/testcases/machine_pa_context.md @@ -0,0 +1,41 @@ +# Machine State → PA Context + +Tests that PA reads machine state when routing, and experts can write back to machines. +Validates: enriched machine summary, update_machine, transition_machine. + +## Setup +- clear history + +## Steps + +### 1. Create a machine +- send: create a navigation machine called "wizard" with initial state "start" and a second state called "details" +- expect_trace: has machine_created + +### 2. PA sees machine in context +- send: what machines are active on my dashboard? +- expect_response: contains "wizard" or "start" + +### 3. Expert stores data on machine +- send: use update_machine to store region=Bayern on the wizard machine +- expect_response: contains "Bayern" or "region" or "stored" or "updated" + +### 4. PA sees stored data +- send: what data is stored in my wizard machine? +- expect_response: contains "Bayern" or "region" + +### 5. Expert transitions machine to details +- send: use transition_machine to move wizard to details state +- expect_response: length > 5 + +### 6. PA sees updated state +- send: what state is the wizard in now? +- expect_response: contains "details" + +### 7. Expert transitions back +- send: use transition_machine to move wizard back to start +- expect_response: length > 5 + +### 8. Final state check +- send: tell me the current wizard state and stored data +- expect_response: contains "start" diff --git a/testcases/pa_retry.md b/testcases/pa_retry.md new file mode 100644 index 0000000..f97dbd9 --- /dev/null +++ b/testcases/pa_retry.md @@ -0,0 +1,19 @@ +# PA Retry on Expert Failure + +Tests that when expert fails, PA reformulates and retries with a different approach. + +## Setup +- clear history + +## Steps + +### 1. Complex analytical query that may need retry +- send: Finde KWZ-Geraete mit verdaechtigen Verbrauchsspruengen - also wo der Verbrauch zwischen zwei Ablesungen stark ansteigt +- expect_response: length > 20 + +### 2. Verify results contain device data +- expect_response: contains "Gera" or "gera" or "KWZ" or "kwz" or "Verbrauch" or "device" + +### 3. Follow up with details +- send: zeig mir die Verbraeuche von einem dieser Geraete +- expect_response: length > 10 diff --git a/testcases/results.json b/testcases/results.json index 4eaf67c..fe7aa86 100644 --- a/testcases/results.json +++ b/testcases/results.json @@ -1,7 +1,7 @@ { - "timestamp": "2026-03-29 06:04:47", + "timestamp": "2026-03-30 00:02:55", "testcases": { - "S3* Audit Corrections": [ + "Artifact System": [ { "step": "Setup", "check": "clear", @@ -9,93 +9,355 @@ "detail": "cleared" }, { - "step": "Tool calls produce results (baseline)", - "check": "send: create two buttons: Alpha and Beta", + "step": "Query produces data_table artifact", + "check": "send: show me 3 customers in a table", "status": "PASS", - "detail": "response: 👍 Okay, I've created buttons labeled \"Alpha\" and \"Beta\".\n" + "detail": "response: The database contains information for three customers: Kathrin Jager, Leon Schre" }, { - "step": "Tool calls produce results (baseline)", - "check": "actions: length >= 1", - "status": "PASS", - "detail": "2 actions >= 1" - }, - { - "step": "Tool calls produce results (baseline)", - "check": "actions: any action contains \"alpha\" or \"Alpha\"", - "status": "PASS", - "detail": "found 'alpha' in actions" - }, - { - "step": "Dashboard mismatch triggers re-emit", - "check": "send: I see nothing on my dashboard, fix it", - "status": "PASS", - "detail": "response: 👍 Done — Alpha and Beta buttons are now live on your dashboard. They should appe" - }, - { - "step": "Dashboard mismatch triggers re-emit", - "check": "response: not contains \"sorry\" or \"apologize\"", - "status": "PASS", - "detail": "none of ['sorry', 'apologize'] found (as expected)" - }, - { - "step": "Dashboard mismatch triggers re-emit", - "check": "actions: length >= 1", - "status": "PASS", - "detail": "2 actions >= 1" - }, - { - "step": "DB error triggers retry with corrected SQL", - "check": "send: SELECT * FROM NichtExistent LIMIT 5", - "status": "PASS", - "detail": "response: Ah, it seems like the table `NichtExistent` does not exist. Double-check the tab" - }, - { - "step": "DB error triggers retry with corrected SQL", + "step": "Query produces data_table artifact", "check": "trace: has tool_call", "status": "PASS", "detail": "found event 'tool_call'" }, { - "step": "DB error triggers retry with corrected SQL", - "check": "response: not contains \"1146\"", - "status": "PASS", - "detail": "none of ['1146'] found (as expected)" - }, - { - "step": "DB error triggers retry with corrected SQL", + "step": "Query produces data_table artifact", "check": "response: length > 10", "status": "PASS", - "detail": "length 163 > 10" + "detail": "length 138 > 10" }, { - "step": "Complex request gets Director plan", - "check": "send: investigate which customers have the mos", + "step": "Entity detail via card", + "check": "send: show me details for customer 1", "status": "PASS", - "detail": "response: Okay, I'll look into which customers have the most devices. This might take a mo" + "detail": "response: ```tool_code\nquery_db({\"query\":\"SELECT * FROM customers WHERE customer_id = 1\"})" }, { - "step": "Complex request gets Director plan", - "check": "trace: has director_plan", - "status": "FAIL", - "detail": "no 'director_plan' event in trace" - }, - { - "step": "Complex request gets Director plan", + "step": "Entity detail via card", "check": "trace: has tool_call", "status": "PASS", "detail": "found event 'tool_call'" }, { - "step": "Complex request gets Director plan", + "step": "Entity detail via card", + "check": "response: length > 10", + "status": "PASS", + "detail": "length 84 > 10" + }, + { + "step": "Action bar via buttons", + "check": "send: create two buttons on my dashboard: Refr", + "status": "PASS", + "detail": "response: I have added the 'Refresh' and 'Export' buttons to your dashboard. These buttons" + }, + { + "step": "Action bar via buttons", + "check": "actions: length >= 2", + "status": "PASS", + "detail": "2 actions >= 2" + }, + { + "step": "Action bar via buttons", + "check": "actions: any action contains \"refresh\" or \"Refresh\"", + "status": "PASS", + "detail": "found 'refresh' in actions" + }, + { + "step": "Machine artifact", + "check": "send: create a machine called \"flow\" with init", + "status": "PASS", + "detail": "response: OK, I've created a new interactive machine called 'flow' with the initial state " + }, + { + "step": "Machine artifact", + "check": "trace: has machine_created", + "status": "PASS", + "detail": "found event 'machine_created'" + }, + { + "step": "Query after buttons survive", + "check": "send: how many customers are there?", + "status": "PASS", + "detail": "response: There are 693 customers in the database.\n" + }, + { + "step": "Query after buttons survive", + "check": "response: length > 5", + "status": "PASS", + "detail": "length 41 > 5" + }, + { + "step": "Query after buttons survive", + "check": "actions: any action contains \"refresh\" or \"Refresh\"", + "status": "PASS", + "detail": "found 'refresh' in actions" + } + ], + "Fast v4": [ + { + "step": "Setup", + "check": "clear", + "status": "PASS", + "detail": "cleared" + }, + { + "step": "Reflex", + "check": "send: hi!", + "status": "PASS", + "detail": "response: Hey Nico! 👋 How can I help you today?\n" + }, + { + "step": "Reflex", + "check": "response: length > 2", + "status": "PASS", + "detail": "length 38 > 2" + }, + { + "step": "PA routes to expert", + "check": "send: show me 3 customers", + "status": "PASS", + "detail": "response: Alright, I've fetched 3 customer records for you. You can see the ID, Name detai" + }, + { + "step": "PA routes to expert", + "check": "trace: has routed", + "status": "PASS", + "detail": "found event 'routed'" + }, + { + "step": "PA routes to expert", + "check": "trace: has tool_call", + "status": "PASS", + "detail": "found event 'tool_call'" + }, + { + "step": "PA routes to expert", + "check": "response: length > 10", + "status": "PASS", + "detail": "length 181 > 10" + }, + { + "step": "German query", + "check": "send: Zeig mir alle Tabellen in der Datenbank", + "status": "PASS", + "detail": "response: Okay, ich habe eine Liste aller Tabellen in der \"eras2_production\" Datenbank abg" + }, + { + "step": "German query", + "check": "trace: has tool_call", + "status": "PASS", + "detail": "found event 'tool_call'" + }, + { + "step": "German query", + "check": "response: length > 10", + "status": "PASS", + "detail": "length 303 > 10" + }, + { + "step": "Schema discovery", + "check": "send: describe the kunden table", + "status": "PASS", + "detail": "response: The `kunden` table stores customer information, including names, customer number" + }, + { + "step": "Schema discovery", + "check": "trace: has tool_call", + "status": "PASS", + "detail": "found event 'tool_call'" + }, + { + "step": "Schema discovery", + "check": "response: length > 10", + "status": "PASS", + "detail": "length 391 > 10" + }, + { + "step": "Count query (cached schema)", + "check": "send: how many customers are there?", + "status": "PASS", + "detail": "response: There are 693 customers in the database.\n" + }, + { + "step": "Count query (cached schema)", + "check": "trace: has tool_call", + "status": "PASS", + "detail": "found event 'tool_call'" + }, + { + "step": "Count query (cached schema)", + "check": "response: length > 5", + "status": "PASS", + "detail": "length 41 > 5" + }, + { + "step": "Complex query", + "check": "send: which customers have the most devices?", + "status": "PASS", + "detail": "response: The query results list the top 10 customers with the most devices. Anne Bürger h" + }, + { + "step": "Complex query", + "check": "trace: has tool_call", + "status": "PASS", + "detail": "found event 'tool_call'" + }, + { + "step": "Complex query", "check": "response: length > 20", "status": "PASS", - "detail": "length 86 > 20" + "detail": "length 166 > 20" + }, + { + "step": "Error recovery", + "check": "send: SELECT * FROM nichtexistiert", + "status": "PASS", + "detail": "response: Oops! It seems like the table `nichtexistiert` doesn't exist in the database. 🤔 " + }, + { + "step": "Error recovery", + "check": "trace: has tool_call", + "status": "PASS", + "detail": "found event 'tool_call'" + }, + { + "step": "Error recovery", + "check": "response: length > 10", + "status": "PASS", + "detail": "length 396 > 10" + }, + { + "step": "Memorizer", + "check": "send: my name is Nico", + "status": "PASS", + "detail": "response: Thanks, Nico! I'll remember that. Do you have any other questions? 😊\n" + }, + { + "step": "Memorizer", + "check": "state: facts any contains \"Nico\"", + "status": "PASS", + "detail": "found 'Nico' in facts" + }, + { + "step": "Language switch", + "check": "send: Hallo wie gehts?", + "status": "PASS", + "detail": "response: Mir geht es gut, danke der Nachfrage, Nico! Und selbst? Gibt es etwas, bei dem i" + }, + { + "step": "Language switch", + "check": "state: language is \"de\" or \"mixed\"", + "status": "PASS", + "detail": "language=mixed" + }, + { + "step": "Bye", + "check": "send: ok bye", + "status": "PASS", + "detail": "response: Bye Nico! 👋 If you need anything else, just let me know. 😊\n" + }, + { + "step": "Bye", + "check": "response: length > 2", + "status": "PASS", + "detail": "length 59 > 2" + } + ], + "Dashboard Integration": [ + { + "step": "Setup", + "check": "clear", + "status": "PASS", + "detail": "cleared" + }, + { + "step": "Expert creates buttons", + "check": "send: create two buttons on my dashboard: Repo", + "status": "PASS", + "detail": "response: I have added 'Report' and 'Export' buttons to your dashboard.\n\n(UI buttons shown" + }, + { + "step": "Expert creates buttons", + "check": "actions: length >= 2", + "status": "PASS", + "detail": "3 actions >= 2" + }, + { + "step": "Expert creates buttons", + "check": "actions: any action contains \"report\" or \"Report\"", + "status": "PASS", + "detail": "found 'report' in actions" + }, + { + "step": "Buttons survive a query", + "check": "send: how many customers are there?", + "status": "PASS", + "detail": "response: I'm running a query to count all customer IDs. One moment...\n" + }, + { + "step": "Buttons survive a query", + "check": "response: length > 5", + "status": "PASS", + "detail": "length 61 > 5" + }, + { + "step": "Buttons survive a query", + "check": "actions: any action contains \"report\" or \"Report\"", + "status": "PASS", + "detail": "found 'report' in actions" + }, + { + "step": "Expert creates a machine", + "check": "send: create a navigation machine called \"work", + "status": "PASS", + "detail": "response: I've created the 'workflow' machine with 'start' and 'step2' states. The 'start'" + }, + { + "step": "Expert creates a machine", + "check": "trace: has tool_call create_machine", + "status": "PASS", + "detail": "found create_machine via machine_created event" + }, + { + "step": "Expert shows data table", + "check": "send: show me 5 customers in a table", + "status": "PASS", + "detail": "response: Here are five customer entries with their IDs, names, object count, and status:\n" + }, + { + "step": "Expert shows data table", + "check": "trace: has tool_call", + "status": "PASS", + "detail": "found event 'tool_call'" + }, + { + "step": "Expert shows data table", + "check": "response: length > 10", + "status": "PASS", + "detail": "length 118 > 10" + }, + { + "step": "Expert replaces buttons", + "check": "send: remove all buttons and create one button", + "status": "PASS", + "detail": "response: I have removed the existing 'Report' and 'Export' buttons from the dashboard and" + }, + { + "step": "Expert replaces buttons", + "check": "actions: length >= 1", + "status": "PASS", + "detail": "2 actions >= 1" + }, + { + "step": "Expert replaces buttons", + "check": "actions: any action contains \"reset\" or \"Reset\"", + "status": "PASS", + "detail": "found 'reset' in actions" } ] }, "summary": { - "passed": 14, - "failed": 1 + "passed": 58, + "failed": 0 } } \ No newline at end of file diff --git a/testcases/results_v1.json b/testcases/results_v1.json new file mode 100644 index 0000000..d7b94fa --- /dev/null +++ b/testcases/results_v1.json @@ -0,0 +1,1033 @@ +{ + "timestamp": "2026-03-29 06:11:18", + "testcases": { + "Button Persistence": [ + { + "step": "Setup", + "check": "clear", + "status": "PASS", + "detail": "cleared" + }, + { + "step": "Create buttons", + "check": "send: create two buttons: Poodle Bark and Bolo", + "status": "PASS", + "detail": "response: Okay, I've created two buttons for you: \"Poodle Bark\" and \"Bolonka Bark\". 🐶 \n" + }, + { + "step": "Create buttons", + "check": "actions: length >= 2", + "status": "PASS", + "detail": "2 actions >= 2" + }, + { + "step": "Create buttons", + "check": "actions: any action contains \"poodle\" or \"Poodle\"", + "status": "PASS", + "detail": "found 'poodle' in actions" + }, + { + "step": "Create buttons", + "check": "actions: any action contains \"bolonka\" or \"Bolonka\"", + "status": "PASS", + "detail": "found 'bolonka' in actions" + }, + { + "step": "Ask unrelated question (buttons must survive)", + "check": "send: what time is it?", + "status": "PASS", + "detail": "response: The current time is 6:08 AM. ⏰\n" + }, + { + "step": "Ask unrelated question (buttons must survive)", + "check": "response: contains \":\" or \"time\" or \"clock\"", + "status": "PASS", + "detail": "found ':'" + }, + { + "step": "Ask unrelated question (buttons must survive)", + "check": "actions: any action contains \"poodle\" or \"Poodle\"", + "status": "PASS", + "detail": "found 'poodle' in actions" + }, + { + "step": "Ask unrelated question (buttons must survive)", + "check": "actions: any action contains \"bolonka\" or \"Bolonka\"", + "status": "PASS", + "detail": "found 'bolonka' in actions" + }, + { + "step": "Ask another question (buttons still there)", + "check": "send: say hello in German", + "status": "PASS", + "detail": "response: Hallo! 👋\n" + }, + { + "step": "Ask another question (buttons still there)", + "check": "response: contains \"Hallo\" or \"hallo\" or \"German\"", + "status": "PASS", + "detail": "found 'Hallo'" + }, + { + "step": "Ask another question (buttons still there)", + "check": "actions: any action contains \"poodle\" or \"Poodle\"", + "status": "PASS", + "detail": "found 'poodle' in actions" + }, + { + "step": "Explicitly replace buttons", + "check": "send: remove all buttons and create one button", + "status": "PASS", + "detail": "response: You got it! I've removed the existing buttons and created a single button labele" + }, + { + "step": "Explicitly replace buttons", + "check": "actions: length >= 1", + "status": "PASS", + "detail": "1 actions >= 1" + }, + { + "step": "Explicitly replace buttons", + "check": "actions: any action contains \"reset\" or \"Reset\"", + "status": "PASS", + "detail": "found 'reset' in actions" + } + ], + "Counter State": [ + { + "step": "Setup", + "check": "clear", + "status": "PASS", + "detail": "cleared" + }, + { + "step": "Create counter", + "check": "send: create a counter starting at 0 with incr", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Create counter", + "check": "response: contains \"counter\" or \"count\"", + "status": "FAIL", + "detail": "none of ['counter', 'count'] found in: " + }, + { + "step": "Create counter", + "check": "actions: length >= 2", + "status": "FAIL", + "detail": "1 actions < 2" + }, + { + "step": "Create counter", + "check": "actions: any action contains \"increment\" or \"inc\" or \"plus\" or \"add\"", + "status": "FAIL", + "detail": "none of ['increment', 'inc', 'plus', 'add'] found in 1 buttons" + }, + { + "step": "Create counter", + "check": "actions: any action contains \"decrement\" or \"dec\" or \"minus\" or \"sub\"", + "status": "FAIL", + "detail": "none of ['decrement', 'dec', 'minus', 'sub'] found in 1 buttons" + }, + { + "step": "Check state", + "check": "state: topic contains \"counter\" or \"count\" or \"button\"", + "status": "FAIL", + "detail": "topic=UI creation doesn't contain any of ['counter', 'count', 'button']" + }, + { + "step": "Ask for current value", + "check": "send: what is the current count?", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Ask for current value", + "check": "response: contains \"0\" or \"zero\"", + "status": "FAIL", + "detail": "none of ['0', 'zero'] found in: " + }, + { + "step": "Increment", + "check": "action matching 'inc'", + "status": "FAIL", + "detail": "no action matching 'inc' in ['reset']" + }, + { + "step": "Increment", + "check": "response: contains \"1\" or \"one\" or \"increment\" or \"Navigated\"", + "status": "FAIL", + "detail": "none of ['1', 'one', 'increment', 'Navigated'] found in: " + }, + { + "step": "Increment again", + "check": "action matching 'inc'", + "status": "FAIL", + "detail": "no action matching 'inc' in ['reset']" + }, + { + "step": "Increment again", + "check": "response: contains \"2\" or \"two\" or \"increment\" or \"Navigated\"", + "status": "FAIL", + "detail": "none of ['2', 'two', 'increment', 'Navigated'] found in: " + }, + { + "step": "Decrement", + "check": "action matching 'dec'", + "status": "FAIL", + "detail": "no action matching 'dec' in ['reset']" + }, + { + "step": "Decrement", + "check": "response: contains \"1\" or \"one\" or \"decrement\" or \"Navigated\"", + "status": "FAIL", + "detail": "none of ['1', 'one', 'decrement', 'Navigated'] found in: " + }, + { + "step": "Verify memorizer tracks it", + "check": "state: topic contains \"count\"", + "status": "FAIL", + "detail": "topic=UI creation doesn't contain any of ['count']" + } + ], + "DB Exploration": [ + { + "step": "Setup", + "check": "clear", + "status": "PASS", + "detail": "cleared" + }, + { + "step": "Query renders table in workspace", + "check": "send: show me 5 customers from the database", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Query renders table in workspace", + "check": "trace: has tool_call", + "status": "FAIL", + "detail": "no 'tool_call' event in trace" + }, + { + "step": "Query renders table in workspace", + "check": "actions: has table", + "status": "FAIL", + "detail": "no table in 1 controls" + }, + { + "step": "Query renders table in workspace", + "check": "response: not contains \"---|\" or \"| ID\"", + "status": "PASS", + "detail": "none of ['---|', '| ID'] found (as expected)" + }, + { + "step": "Chat summarizes, does not dump data", + "check": "response: contains \"customer\" or \"Kunde\" or \"5\" or \"table\"", + "status": "FAIL", + "detail": "none of ['customer', 'Kunde', '5', 'table'] found in: " + }, + { + "step": "Chat summarizes, does not dump data", + "check": "response: length > 10", + "status": "FAIL", + "detail": "length 0 <= 10" + }, + { + "step": "Thinker builds exploration UI (not describes it)", + "check": "send: select customer 2 Kathrin Jager, add but", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Thinker builds exploration UI (not describes it)", + "check": "actions: length >= 1", + "status": "PASS", + "detail": "1 actions >= 1" + }, + { + "step": "Thinker builds exploration UI (not describes it)", + "check": "response: not contains \"UI team\" or \"will add\" or \"will create\"", + "status": "PASS", + "detail": "none of ['UI team', 'will add', 'will create'] found (as expected)" + }, + { + "step": "Error recovery on bad query", + "check": "send: SELECT * FROM nichtexistiert LIMIT 5", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Error recovery on bad query", + "check": "trace: has tool_call", + "status": "FAIL", + "detail": "no 'tool_call' event in trace" + }, + { + "step": "Error recovery on bad query", + "check": "response: not contains \"1146\"", + "status": "PASS", + "detail": "none of ['1146'] found (as expected)" + }, + { + "step": "Error recovery on bad query", + "check": "response: length > 10", + "status": "FAIL", + "detail": "length 0 <= 10" + } + ], + "Director Node": [ + { + "step": "Setup", + "check": "clear", + "status": "PASS", + "detail": "cleared" + }, + { + "step": "Casual chat establishes mode", + "check": "send: hey, just hanging out, what's up?", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Casual chat establishes mode", + "check": "response: length > 5", + "status": "FAIL", + "detail": "length 0 <= 5" + }, + { + "step": "Casual chat establishes mode", + "check": "trace: has director_updated", + "status": "PASS", + "detail": "found event 'director_updated'" + }, + { + "step": "Director picks up frustration", + "check": "send: ugh this is so annoying, nothing makes s", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Director picks up frustration", + "check": "response: length > 10", + "status": "FAIL", + "detail": "length 0 <= 10" + }, + { + "step": "Director picks up frustration", + "check": "trace: has director_updated", + "status": "PASS", + "detail": "found event 'director_updated'" + }, + { + "step": "Switch to building mode", + "check": "send: ok let's build a todo list app", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Switch to building mode", + "check": "response: length > 10", + "status": "FAIL", + "detail": "length 0 <= 10" + }, + { + "step": "Switch to building mode", + "check": "trace: has director_updated", + "status": "PASS", + "detail": "found event 'director_updated'" + } + ], + "Pub Conversation": [ + { + "step": "Setup", + "check": "clear", + "status": "PASS", + "detail": "cleared" + }, + { + "step": "Set the scene", + "check": "send: Hey, Alice and I are heading to the pub ", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Set the scene", + "check": "response: length > 10", + "status": "FAIL", + "detail": "length 0 <= 10" + }, + { + "step": "Set the scene", + "check": "state: situation contains \"pub\" or \"Alice\"", + "status": "FAIL", + "detail": "situation=local session doesn't contain any of ['pub', 'Alice']" + }, + { + "step": "Language switch to German", + "check": "send: Wir sind jetzt im Biergarten angekommen", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Language switch to German", + "check": "response: length > 10", + "status": "FAIL", + "detail": "length 0 <= 10" + }, + { + "step": "Language switch to German", + "check": "state: language is \"de\" or \"mixed\"", + "status": "PASS", + "detail": "language=mixed" + }, + { + "step": "Context awareness", + "check": "send: Was sollen wir bestellen?", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Context awareness", + "check": "response: length > 10", + "status": "FAIL", + "detail": "length 0 <= 10" + }, + { + "step": "Context awareness", + "check": "state: topic contains \"bestell\" or \"order\" or \"pub\" or \"Biergarten\"", + "status": "FAIL", + "detail": "topic=UI creation doesn't contain any of ['bestell', 'order', 'pub', 'Biergarten']" + }, + { + "step": "Alice speaks", + "check": "send: Alice says: I'll have a Hefeweizen pleas", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Alice speaks", + "check": "response: length > 10", + "status": "FAIL", + "detail": "length 0 <= 10" + }, + { + "step": "Alice speaks", + "check": "state: facts any contains \"Alice\" or \"Hefeweizen\"", + "status": "FAIL", + "detail": "none of ['Alice', 'Hefeweizen'] found in facts: []" + }, + { + "step": "Ask for time (tool use)", + "check": "send: wie spaet ist es eigentlich?", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Ask for time (tool use)", + "check": "response: matches \\d{1,2}:\\d{2}", + "status": "FAIL", + "detail": "/\\d{1,2}:\\d{2}/ not found in: " + }, + { + "step": "Back to English", + "check": "send: Let's switch to English, what was the la", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Back to English", + "check": "state: language is \"en\" or \"mixed\"", + "status": "PASS", + "detail": "language=mixed" + }, + { + "step": "Back to English", + "check": "response: contains \"Alice\" or \"Hefeweizen\"", + "status": "FAIL", + "detail": "none of ['Alice', 'Hefeweizen'] found in: " + }, + { + "step": "Mood check", + "check": "send: This is really fun!", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Mood check", + "check": "state: user_mood is \"happy\" or \"playful\" or \"excited\"", + "status": "FAIL", + "detail": "user_mood=neutral not in ['happy', 'playful', 'excited']" + } + ], + "Reflex Path": [ + { + "step": "Setup", + "check": "clear", + "status": "PASS", + "detail": "cleared" + }, + { + "step": "Greeting triggers reflex", + "check": "send: hey!", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Greeting triggers reflex", + "check": "response: length > 2", + "status": "FAIL", + "detail": "length 0 <= 2" + }, + { + "step": "Greeting triggers reflex", + "check": "trace: has reflex_path", + "status": "FAIL", + "detail": "no 'reflex_path' event in trace" + }, + { + "step": "Thanks triggers reflex", + "check": "send: thanks", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Thanks triggers reflex", + "check": "response: length > 2", + "status": "FAIL", + "detail": "length 0 <= 2" + }, + { + "step": "Thanks triggers reflex", + "check": "trace: has reflex_path", + "status": "FAIL", + "detail": "no 'reflex_path' event in trace" + }, + { + "step": "Complex request does NOT trigger reflex", + "check": "send: explain how neural networks work in deta", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Complex request does NOT trigger reflex", + "check": "response: length > 20", + "status": "FAIL", + "detail": "length 0 <= 20" + }, + { + "step": "Complex request does NOT trigger reflex", + "check": "trace: input.analysis.intent is \"question\" or \"request\"", + "status": "PASS", + "detail": "input.analysis.intent=request" + }, + { + "step": "Complex request does NOT trigger reflex", + "check": "trace: has decided", + "status": "FAIL", + "detail": "no 'decided' event in trace" + } + ], + "S3* Audit Corrections": [ + { + "step": "Setup", + "check": "clear", + "status": "PASS", + "detail": "cleared" + }, + { + "step": "Tool calls produce results (baseline)", + "check": "send: create two buttons: Alpha and Beta", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Tool calls produce results (baseline)", + "check": "actions: length >= 1", + "status": "PASS", + "detail": "1 actions >= 1" + }, + { + "step": "Tool calls produce results (baseline)", + "check": "actions: any action contains \"alpha\" or \"Alpha\"", + "status": "FAIL", + "detail": "none of ['alpha', 'Alpha'] found in 1 buttons" + }, + { + "step": "Dashboard mismatch triggers re-emit", + "check": "send: I see nothing on my dashboard, fix it", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Dashboard mismatch triggers re-emit", + "check": "response: not contains \"sorry\" or \"apologize\"", + "status": "PASS", + "detail": "none of ['sorry', 'apologize'] found (as expected)" + }, + { + "step": "Dashboard mismatch triggers re-emit", + "check": "actions: length >= 1", + "status": "PASS", + "detail": "1 actions >= 1" + }, + { + "step": "DB error triggers retry with corrected SQL", + "check": "send: SELECT * FROM NichtExistent LIMIT 5", + "status": "PASS", + "detail": "response: " + }, + { + "step": "DB error triggers retry with corrected SQL", + "check": "trace: has tool_call", + "status": "FAIL", + "detail": "no 'tool_call' event in trace" + }, + { + "step": "DB error triggers retry with corrected SQL", + "check": "response: not contains \"1146\"", + "status": "PASS", + "detail": "none of ['1146'] found (as expected)" + }, + { + "step": "DB error triggers retry with corrected SQL", + "check": "response: length > 10", + "status": "FAIL", + "detail": "length 0 <= 10" + }, + { + "step": "Complex request gets Director plan", + "check": "send: investigate which customers have the mos", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Complex request gets Director plan", + "check": "trace: has director_plan", + "status": "FAIL", + "detail": "no 'director_plan' event in trace" + }, + { + "step": "Complex request gets Director plan", + "check": "trace: has tool_call", + "status": "FAIL", + "detail": "no 'tool_call' event in trace" + }, + { + "step": "Complex request gets Director plan", + "check": "response: length > 20", + "status": "FAIL", + "detail": "length 0 <= 20" + } + ], + "State Machines": [ + { + "step": "Setup", + "check": "clear", + "status": "PASS", + "detail": "cleared" + }, + { + "step": "Create a machine", + "check": "send: create a navigation machine called \"nav\"", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Create a machine", + "check": "trace: has tool_call create_machine", + "status": "FAIL", + "detail": "no tool_call 'create_machine' in trace" + }, + { + "step": "Create a machine", + "check": "trace: machine_created id=\"nav\"", + "status": "FAIL", + "detail": "no machine_created event with id='nav'" + }, + { + "step": "Verify machine renders", + "check": "send: what machines are on my dashboard?", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Verify machine renders", + "check": "response: contains \"nav\" or \"machine\"", + "status": "FAIL", + "detail": "none of ['nav', 'machine'] found in: " + }, + { + "step": "Navigate via button click (local transition)", + "check": "action matching 'menu_1'", + "status": "FAIL", + "detail": "no action matching 'menu_1' in ['reset']" + }, + { + "step": "Navigate via button click (local transition)", + "check": "trace: has machine_transition", + "status": "FAIL", + "detail": "no 'machine_transition' event in trace" + }, + { + "step": "Navigate via button click (local transition)", + "check": "trace: no thinker", + "status": "PASS", + "detail": "no 'thinker' event (as expected)" + }, + { + "step": "Add a state to existing machine", + "check": "send: add a state \"sub3\" to the nav machine wi", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Add a state to existing machine", + "check": "trace: has tool_call add_state", + "status": "FAIL", + "detail": "no tool_call 'add_state' in trace" + }, + { + "step": "Reset machine", + "check": "send: reset the nav machine to its initial sta", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Reset machine", + "check": "trace: has tool_call reset_machine", + "status": "FAIL", + "detail": "no tool_call 'reset_machine' in trace" + }, + { + "step": "Reset machine", + "check": "response: contains \"main\" or \"reset\" or \"initial\"", + "status": "FAIL", + "detail": "none of ['main', 'reset', 'initial'] found in: " + }, + { + "step": "Create second machine alongside first", + "check": "send: create a counter machine called \"clicks\"", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Create second machine alongside first", + "check": "trace: has tool_call create_machine", + "status": "FAIL", + "detail": "no tool_call 'create_machine' in trace" + }, + { + "step": "Create second machine alongside first", + "check": "trace: machine_created id=\"clicks\"", + "status": "FAIL", + "detail": "no machine_created event with id='clicks'" + }, + { + "step": "Both machines coexist", + "check": "send: what machines are running?", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Both machines coexist", + "check": "response: contains \"nav\"", + "status": "FAIL", + "detail": "none of ['nav'] found in: " + }, + { + "step": "Both machines coexist", + "check": "response: contains \"click\"", + "status": "FAIL", + "detail": "none of ['click'] found in: " + }, + { + "step": "Destroy one machine", + "check": "send: destroy the clicks machine", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Destroy one machine", + "check": "trace: has tool_call destroy_machine", + "status": "FAIL", + "detail": "no tool_call 'destroy_machine' in trace" + }, + { + "step": "Destroy one machine", + "check": "send: what machines are running?", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Destroy one machine", + "check": "response: contains \"nav\"", + "status": "FAIL", + "detail": "none of ['nav'] found in: " + } + ], + "Structured Input Analysis": [ + { + "step": "Setup", + "check": "clear", + "status": "PASS", + "detail": "cleared" + }, + { + "step": "Social greeting", + "check": "send: hi there!", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Social greeting", + "check": "response: length > 3", + "status": "FAIL", + "detail": "length 0 <= 3" + }, + { + "step": "Social greeting", + "check": "trace: input.analysis.intent is \"social\"", + "status": "FAIL", + "detail": "input.analysis.intent=request, expected one of ['social']" + }, + { + "step": "Social greeting", + "check": "trace: input.analysis.complexity is \"trivial\"", + "status": "FAIL", + "detail": "input.analysis.complexity=simple, expected one of ['trivial']" + }, + { + "step": "Simple request", + "check": "send: create a counter starting at 0", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Simple request", + "check": "response: length > 10", + "status": "FAIL", + "detail": "length 0 <= 10" + }, + { + "step": "Simple request", + "check": "trace: input.analysis.intent is \"request\" or \"action\"", + "status": "PASS", + "detail": "input.analysis.intent=request" + }, + { + "step": "Simple request", + "check": "trace: input.analysis.complexity is \"simple\" or \"complex\"", + "status": "PASS", + "detail": "input.analysis.complexity=simple" + }, + { + "step": "German question", + "check": "send: Wie spaet ist es?", + "status": "PASS", + "detail": "response: " + }, + { + "step": "German question", + "check": "response: length > 5", + "status": "FAIL", + "detail": "length 0 <= 5" + }, + { + "step": "German question", + "check": "trace: input.analysis.language is \"de\"", + "status": "FAIL", + "detail": "input.analysis.language=en, expected one of ['de']" + }, + { + "step": "German question", + "check": "trace: input.analysis.intent is \"question\"", + "status": "FAIL", + "detail": "input.analysis.intent=request, expected one of ['question']" + }, + { + "step": "Frustrated tone", + "check": "send: this is broken, nothing works and I'm si", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Frustrated tone", + "check": "response: length > 10", + "status": "FAIL", + "detail": "length 0 <= 10" + }, + { + "step": "Frustrated tone", + "check": "trace: input.analysis.tone is \"frustrated\" or \"urgent\"", + "status": "FAIL", + "detail": "input.analysis.tone=casual, expected one of ['frustrated', 'urgent']" + }, + { + "step": "Simple acknowledgment", + "check": "send: ok thanks bye", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Simple acknowledgment", + "check": "trace: input.analysis.intent is \"social\"", + "status": "FAIL", + "detail": "input.analysis.intent=request, expected one of ['social']" + }, + { + "step": "Simple acknowledgment", + "check": "trace: input.analysis.complexity is \"trivial\"", + "status": "FAIL", + "detail": "input.analysis.complexity=simple, expected one of ['trivial']" + } + ], + "Dashboard Feedback (S3*)": [ + { + "step": "Setup", + "check": "clear", + "status": "PASS", + "detail": "cleared" + }, + { + "step": "Thinker sees buttons in dashboard", + "check": "send: create two buttons: hello and world", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Thinker sees buttons in dashboard", + "check": "actions: length >= 2", + "status": "FAIL", + "detail": "1 actions < 2" + }, + { + "step": "Thinker sees buttons in dashboard", + "check": "send: what buttons can you see in my dashboard", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Thinker sees buttons in dashboard", + "check": "response: contains \"Hello\" or \"hello\"", + "status": "FAIL", + "detail": "none of ['Hello', 'hello'] found in: " + }, + { + "step": "Thinker sees buttons in dashboard", + "check": "response: contains \"World\" or \"world\"", + "status": "FAIL", + "detail": "none of ['World', 'world'] found in: " + }, + { + "step": "Thinker detects empty dashboard", + "check": "send: I see nothing in my dashboard, what happ", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Thinker detects empty dashboard", + "check": "response: contains \"button\" or \"fix\" or \"restore\" or \"create\" or \"empty\"", + "status": "FAIL", + "detail": "none of ['button', 'fix', 'restore', 'create', 'empty'] found in: " + }, + { + "step": "Dashboard state flows to thinker context", + "check": "send: create a counter starting at 5", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Dashboard state flows to thinker context", + "check": "actions: length >= 1", + "status": "PASS", + "detail": "1 actions >= 1" + }, + { + "step": "Dashboard state flows to thinker context", + "check": "send: what does my dashboard show?", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Dashboard state flows to thinker context", + "check": "response: contains \"5\" or \"count\"", + "status": "FAIL", + "detail": "none of ['5', 'count'] found in: " + } + ], + "Dashboard Mismatch Recovery": [ + { + "step": "Setup", + "check": "clear", + "status": "PASS", + "detail": "cleared" + }, + { + "step": "Create buttons", + "check": "send: create two buttons: red and blue", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Create buttons", + "check": "actions: length >= 2", + "status": "FAIL", + "detail": "1 actions < 2" + }, + { + "step": "Dashboard empty — Thinker re-emits", + "check": "send: I clicked red but nothing happened", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Dashboard empty — Thinker re-emits", + "check": "response: contains \"button\" or \"red\" or \"blue\"", + "status": "FAIL", + "detail": "none of ['button', 'red', 'blue'] found in: " + }, + { + "step": "Dashboard empty — Thinker re-emits", + "check": "actions: length >= 1", + "status": "PASS", + "detail": "1 actions >= 1" + }, + { + "step": "Create counter", + "check": "send: create a counter starting at 0", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Create counter", + "check": "actions: length >= 1", + "status": "PASS", + "detail": "1 actions >= 1" + }, + { + "step": "Counter missing from dashboard — Thinker recovers", + "check": "send: the dashboard is broken, I only see old ", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Counter missing from dashboard — Thinker recovers", + "check": "response: contains \"counter\" or \"count\" or \"fix\" or \"recreat\" or \"refresh\" or \"button\" or \"update\"", + "status": "FAIL", + "detail": "none of ['counter', 'count', 'fix', 'recreat', 'refresh', 'button', 'update'] found in: " + }, + { + "step": "Counter missing from dashboard — Thinker recovers", + "check": "actions: length >= 1", + "status": "PASS", + "detail": "1 actions >= 1" + } + ] + }, + "summary": { + "passed": 90, + "failed": 77 + } +} \ No newline at end of file diff --git a/testcases/results_v2.json b/testcases/results_v2.json new file mode 100644 index 0000000..57121f6 --- /dev/null +++ b/testcases/results_v2.json @@ -0,0 +1,1033 @@ +{ + "timestamp": "2026-03-29 12:22:42", + "testcases": { + "Button Persistence": [ + { + "step": "Setup", + "check": "clear", + "status": "PASS", + "detail": "cleared" + }, + { + "step": "Create buttons", + "check": "send: create two buttons: Poodle Bark and Bolo", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Create buttons", + "check": "actions: length >= 2", + "status": "FAIL", + "detail": "0 actions < 2" + }, + { + "step": "Create buttons", + "check": "actions: any action contains \"poodle\" or \"Poodle\"", + "status": "FAIL", + "detail": "none of ['poodle', 'Poodle'] found in 0 buttons" + }, + { + "step": "Create buttons", + "check": "actions: any action contains \"bolonka\" or \"Bolonka\"", + "status": "FAIL", + "detail": "none of ['bolonka', 'Bolonka'] found in 0 buttons" + }, + { + "step": "Ask unrelated question (buttons must survive)", + "check": "send: what time is it?", + "status": "PASS", + "detail": "response: Alright, I've created two bark buttons for you: Poodle Bark and Bolonka Bark. 🐶 " + }, + { + "step": "Ask unrelated question (buttons must survive)", + "check": "response: contains \":\" or \"time\" or \"clock\"", + "status": "PASS", + "detail": "found ':'" + }, + { + "step": "Ask unrelated question (buttons must survive)", + "check": "actions: any action contains \"poodle\" or \"Poodle\"", + "status": "PASS", + "detail": "found 'poodle' in actions" + }, + { + "step": "Ask unrelated question (buttons must survive)", + "check": "actions: any action contains \"bolonka\" or \"Bolonka\"", + "status": "PASS", + "detail": "found 'bolonka' in actions" + }, + { + "step": "Ask another question (buttons still there)", + "check": "send: say hello in German", + "status": "PASS", + "detail": "response: Hallo! Or, if you prefer something more formal, Guten Tag!\n" + }, + { + "step": "Ask another question (buttons still there)", + "check": "response: contains \"Hallo\" or \"hallo\" or \"German\"", + "status": "PASS", + "detail": "found 'Hallo'" + }, + { + "step": "Ask another question (buttons still there)", + "check": "actions: any action contains \"poodle\" or \"Poodle\"", + "status": "PASS", + "detail": "found 'poodle' in actions" + }, + { + "step": "Explicitly replace buttons", + "check": "send: remove all buttons and create one button", + "status": "PASS", + "detail": "response: All buttons have been removed, and a new button called \"Reset\" has been created." + }, + { + "step": "Explicitly replace buttons", + "check": "actions: length >= 1", + "status": "PASS", + "detail": "1 actions >= 1" + }, + { + "step": "Explicitly replace buttons", + "check": "actions: any action contains \"reset\" or \"Reset\"", + "status": "PASS", + "detail": "found 'reset' in actions" + } + ], + "Counter State": [ + { + "step": "Setup", + "check": "clear", + "status": "PASS", + "detail": "cleared" + }, + { + "step": "Create counter", + "check": "send: create a counter starting at 0 with incr", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Create counter", + "check": "response: contains \"counter\" or \"count\"", + "status": "FAIL", + "detail": "none of ['counter', 'count'] found in: " + }, + { + "step": "Create counter", + "check": "actions: length >= 2", + "status": "FAIL", + "detail": "1 actions < 2" + }, + { + "step": "Create counter", + "check": "actions: any action contains \"increment\" or \"inc\" or \"plus\" or \"add\"", + "status": "FAIL", + "detail": "none of ['increment', 'inc', 'plus', 'add'] found in 1 buttons" + }, + { + "step": "Create counter", + "check": "actions: any action contains \"decrement\" or \"dec\" or \"minus\" or \"sub\"", + "status": "FAIL", + "detail": "none of ['decrement', 'dec', 'minus', 'sub'] found in 1 buttons" + }, + { + "step": "Check state", + "check": "state: topic contains \"counter\" or \"count\" or \"button\"", + "status": "PASS", + "detail": "topic=creating reset button contains 'button'" + }, + { + "step": "Ask for current value", + "check": "send: what is the current count?", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Ask for current value", + "check": "response: contains \"0\" or \"zero\"", + "status": "FAIL", + "detail": "none of ['0', 'zero'] found in: " + }, + { + "step": "Increment", + "check": "action matching 'inc'", + "status": "FAIL", + "detail": "no action matching 'inc' in ['reset']" + }, + { + "step": "Increment", + "check": "response: contains \"1\" or \"one\" or \"increment\" or \"Navigated\"", + "status": "FAIL", + "detail": "none of ['1', 'one', 'increment', 'Navigated'] found in: " + }, + { + "step": "Increment again", + "check": "action matching 'inc'", + "status": "FAIL", + "detail": "no action matching 'inc' in ['reset']" + }, + { + "step": "Increment again", + "check": "response: contains \"2\" or \"two\" or \"increment\" or \"Navigated\"", + "status": "FAIL", + "detail": "none of ['2', 'two', 'increment', 'Navigated'] found in: " + }, + { + "step": "Decrement", + "check": "action matching 'dec'", + "status": "FAIL", + "detail": "no action matching 'dec' in ['reset']" + }, + { + "step": "Decrement", + "check": "response: contains \"1\" or \"one\" or \"decrement\" or \"Navigated\"", + "status": "FAIL", + "detail": "none of ['1', 'one', 'decrement', 'Navigated'] found in: " + }, + { + "step": "Verify memorizer tracks it", + "check": "state: topic contains \"count\"", + "status": "FAIL", + "detail": "topic=creating reset button doesn't contain any of ['count']" + } + ], + "DB Exploration": [ + { + "step": "Setup", + "check": "clear", + "status": "PASS", + "detail": "cleared" + }, + { + "step": "Query renders table in workspace", + "check": "send: show me 5 customers from the database", + "status": "PASS", + "detail": "response: Here are 5 customers from the database. The detailed data is shown in the table." + }, + { + "step": "Query renders table in workspace", + "check": "trace: has tool_call", + "status": "PASS", + "detail": "found event 'tool_call'" + }, + { + "step": "Query renders table in workspace", + "check": "actions: has table", + "status": "PASS", + "detail": "table found: 23 cols, 5 rows" + }, + { + "step": "Query renders table in workspace", + "check": "response: not contains \"---|\" or \"| ID\"", + "status": "PASS", + "detail": "none of ['---|', '| ID'] found (as expected)" + }, + { + "step": "Chat summarizes, does not dump data", + "check": "response: contains \"customer\" or \"Kunde\" or \"5\" or \"table\"", + "status": "PASS", + "detail": "found 'customer'" + }, + { + "step": "Chat summarizes, does not dump data", + "check": "response: length > 10", + "status": "PASS", + "detail": "length 81 > 10" + }, + { + "step": "Thinker builds exploration UI (not describes it)", + "check": "send: select customer 2 Kathrin Jager, add but", + "status": "PASS", + "detail": "response: Okay, I've selected customer Kathrin Jager. You can now explore her objects and " + }, + { + "step": "Thinker builds exploration UI (not describes it)", + "check": "actions: length >= 1", + "status": "PASS", + "detail": "5 actions >= 1" + }, + { + "step": "Thinker builds exploration UI (not describes it)", + "check": "response: not contains \"UI team\" or \"will add\" or \"will create\"", + "status": "PASS", + "detail": "none of ['UI team', 'will add', 'will create'] found (as expected)" + }, + { + "step": "Error recovery on bad query", + "check": "send: SELECT * FROM nichtexistiert LIMIT 5", + "status": "PASS", + "detail": "response: I encountered an error. The table `eras2_production.nichtexistiert` does not exi" + }, + { + "step": "Error recovery on bad query", + "check": "trace: has tool_call", + "status": "PASS", + "detail": "found event 'tool_call'" + }, + { + "step": "Error recovery on bad query", + "check": "response: not contains \"1146\"", + "status": "PASS", + "detail": "none of ['1146'] found (as expected)" + }, + { + "step": "Error recovery on bad query", + "check": "response: length > 10", + "status": "PASS", + "detail": "length 277 > 10" + } + ], + "Director Node": [ + { + "step": "Setup", + "check": "clear", + "status": "PASS", + "detail": "cleared" + }, + { + "step": "Casual chat establishes mode", + "check": "send: hey, just hanging out, what's up?", + "status": "PASS", + "detail": "response: Alright! Since we're exploring Kathrin Jager's profile, would you like to see he" + }, + { + "step": "Casual chat establishes mode", + "check": "response: length > 5", + "status": "PASS", + "detail": "length 107 > 5" + }, + { + "step": "Casual chat establishes mode", + "check": "trace: has director_updated", + "status": "FAIL", + "detail": "no 'director_updated' event in trace" + }, + { + "step": "Director picks up frustration", + "check": "send: ugh this is so annoying, nothing makes s", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Director picks up frustration", + "check": "response: length > 10", + "status": "FAIL", + "detail": "length 0 <= 10" + }, + { + "step": "Director picks up frustration", + "check": "trace: has director_updated", + "status": "FAIL", + "detail": "no 'director_updated' event in trace" + }, + { + "step": "Switch to building mode", + "check": "send: ok let's build a todo list app", + "status": "PASS", + "detail": "response: Frustration noted! Let's ditch that and build a fun todo app instead. Start addi" + }, + { + "step": "Switch to building mode", + "check": "response: length > 10", + "status": "PASS", + "detail": "length 90 > 10" + }, + { + "step": "Switch to building mode", + "check": "trace: has director_updated", + "status": "FAIL", + "detail": "no 'director_updated' event in trace" + } + ], + "Pub Conversation": [ + { + "step": "Setup", + "check": "clear", + "status": "PASS", + "detail": "cleared" + }, + { + "step": "Set the scene", + "check": "send: Hey, Alice and I are heading to the pub ", + "status": "PASS", + "detail": "response: Nice! Have a great time at the pub tonight. Let me know if you need anything whe" + }, + { + "step": "Set the scene", + "check": "response: length > 10", + "status": "PASS", + "detail": "length 147 > 10" + }, + { + "step": "Set the scene", + "check": "state: situation contains \"pub\" or \"Alice\"", + "status": "PASS", + "detail": "situation=at a pub with Alice later tonight contains 'pub'" + }, + { + "step": "Language switch to German", + "check": "send: Wir sind jetzt im Biergarten angekommen", + "status": "PASS", + "detail": "response: Super! Viel Spaß im Biergarten mit Alice! Genießt die Zeit. 🍺\n" + }, + { + "step": "Language switch to German", + "check": "response: length > 10", + "status": "PASS", + "detail": "length 62 > 10" + }, + { + "step": "Language switch to German", + "check": "state: language is \"de\" or \"mixed\"", + "status": "PASS", + "detail": "language=mixed" + }, + { + "step": "Context awareness", + "check": "send: Was sollen wir bestellen?", + "status": "PASS", + "detail": "response: Perfekt fürs Biergarten-Feeling: Probiert ein kühles Weizenbier oder ein Helles " + }, + { + "step": "Context awareness", + "check": "response: length > 10", + "status": "PASS", + "detail": "length 168 > 10" + }, + { + "step": "Context awareness", + "check": "state: topic contains \"bestell\" or \"order\" or \"pub\" or \"Biergarten\"", + "status": "PASS", + "detail": "topic=ordering drinks at the Biergarten contains 'order'" + }, + { + "step": "Alice speaks", + "check": "send: Alice says: I'll have a Hefeweizen pleas", + "status": "PASS", + "detail": "response: Super Wahl! Hefeweizen passt perfekt zu einem Biergarten. Prost! 🍺\n" + }, + { + "step": "Alice speaks", + "check": "response: length > 10", + "status": "PASS", + "detail": "length 67 > 10" + }, + { + "step": "Alice speaks", + "check": "state: facts any contains \"Alice\" or \"Hefeweizen\"", + "status": "PASS", + "detail": "found 'Alice' in facts" + }, + { + "step": "Ask for time (tool use)", + "check": "send: wie spaet ist es eigentlich?", + "status": "PASS", + "detail": "response: Es ist jetzt 12:17 Uhr. Zeit fürs Mittagessen im Biergarten! ☀️\n" + }, + { + "step": "Ask for time (tool use)", + "check": "response: matches \\d{1,2}:\\d{2}", + "status": "PASS", + "detail": "matched /\\d{1,2}:\\d{2}/" + }, + { + "step": "Back to English", + "check": "send: Let's switch to English, what was the la", + "status": "PASS", + "detail": "response: Switching to English!\n\nAlice said she'd have a Hefeweizen. \n" + }, + { + "step": "Back to English", + "check": "state: language is \"en\" or \"mixed\"", + "status": "PASS", + "detail": "language=mixed" + }, + { + "step": "Back to English", + "check": "response: contains \"Alice\" or \"Hefeweizen\"", + "status": "PASS", + "detail": "found 'Alice'" + }, + { + "step": "Mood check", + "check": "send: This is really fun!", + "status": "PASS", + "detail": "response: Glad you're enjoying it! 🎉\n" + }, + { + "step": "Mood check", + "check": "state: user_mood is \"happy\" or \"playful\" or \"excited\"", + "status": "PASS", + "detail": "user_mood=happy" + } + ], + "Reflex Path": [ + { + "step": "Setup", + "check": "clear", + "status": "PASS", + "detail": "cleared" + }, + { + "step": "Greeting triggers reflex", + "check": "send: hey!", + "status": "PASS", + "detail": "response: Hey! 😊 What's up?\n" + }, + { + "step": "Greeting triggers reflex", + "check": "response: length > 2", + "status": "PASS", + "detail": "length 18 > 2" + }, + { + "step": "Greeting triggers reflex", + "check": "trace: has reflex_path", + "status": "PASS", + "detail": "found event 'reflex_path'" + }, + { + "step": "Thanks triggers reflex", + "check": "send: thanks", + "status": "PASS", + "detail": "response: You're welcome! How can I help you further?\n" + }, + { + "step": "Thanks triggers reflex", + "check": "response: length > 2", + "status": "PASS", + "detail": "length 44 > 2" + }, + { + "step": "Thanks triggers reflex", + "check": "trace: has reflex_path", + "status": "PASS", + "detail": "found event 'reflex_path'" + }, + { + "step": "Complex request does NOT trigger reflex", + "check": "send: explain how neural networks work in deta", + "status": "PASS", + "detail": "response: Alright, let's break down neural networks!\n\n**(1) The Basic Unit: The Neuron**\n\n" + }, + { + "step": "Complex request does NOT trigger reflex", + "check": "response: length > 20", + "status": "PASS", + "detail": "length 1624 > 20" + }, + { + "step": "Complex request does NOT trigger reflex", + "check": "trace: input.analysis.intent is \"question\" or \"request\"", + "status": "PASS", + "detail": "input.analysis.intent=request" + }, + { + "step": "Complex request does NOT trigger reflex", + "check": "trace: has decided", + "status": "PASS", + "detail": "found event 'decided'" + } + ], + "S3* Audit Corrections": [ + { + "step": "Setup", + "check": "clear", + "status": "PASS", + "detail": "cleared" + }, + { + "step": "Tool calls produce results (baseline)", + "check": "send: create two buttons: Alpha and Beta", + "status": "PASS", + "detail": "response: OK. I've created the Alpha and Beta buttons.\n" + }, + { + "step": "Tool calls produce results (baseline)", + "check": "actions: length >= 1", + "status": "PASS", + "detail": "2 actions >= 1" + }, + { + "step": "Tool calls produce results (baseline)", + "check": "actions: any action contains \"alpha\" or \"Alpha\"", + "status": "PASS", + "detail": "found 'alpha' in actions" + }, + { + "step": "Dashboard mismatch triggers re-emit", + "check": "send: I see nothing on my dashboard, fix it", + "status": "PASS", + "detail": "response: My apologies! I'm re-creating the Alpha and Beta buttons. You should see them no" + }, + { + "step": "Dashboard mismatch triggers re-emit", + "check": "response: not contains \"sorry\" or \"apologize\"", + "status": "PASS", + "detail": "none of ['sorry', 'apologize'] found (as expected)" + }, + { + "step": "Dashboard mismatch triggers re-emit", + "check": "actions: length >= 1", + "status": "PASS", + "detail": "2 actions >= 1" + }, + { + "step": "DB error triggers retry with corrected SQL", + "check": "send: SELECT * FROM NichtExistent LIMIT 5", + "status": "PASS", + "detail": "response: That table doesn't exist.\n\nCould you clarify which table you'd like to query? Pe" + }, + { + "step": "DB error triggers retry with corrected SQL", + "check": "trace: has tool_call", + "status": "PASS", + "detail": "found event 'tool_call'" + }, + { + "step": "DB error triggers retry with corrected SQL", + "check": "response: not contains \"1146\"", + "status": "PASS", + "detail": "none of ['1146'] found (as expected)" + }, + { + "step": "DB error triggers retry with corrected SQL", + "check": "response: length > 10", + "status": "PASS", + "detail": "length 111 > 10" + }, + { + "step": "Complex request gets Director plan", + "check": "send: investigate which customers have the mos", + "status": "PASS", + "detail": "response: Sorry, I ran into an error when trying to determine which customers have the mos" + }, + { + "step": "Complex request gets Director plan", + "check": "trace: has director_plan", + "status": "FAIL", + "detail": "no 'director_plan' event in trace" + }, + { + "step": "Complex request gets Director plan", + "check": "trace: has tool_call", + "status": "PASS", + "detail": "found event 'tool_call'" + }, + { + "step": "Complex request gets Director plan", + "check": "response: length > 20", + "status": "PASS", + "detail": "length 209 > 20" + } + ], + "State Machines": [ + { + "step": "Setup", + "check": "clear", + "status": "PASS", + "detail": "cleared" + }, + { + "step": "Create a machine", + "check": "send: create a navigation machine called \"nav\"", + "status": "PASS", + "detail": "response: Okay, a navigation machine named `nav` has been created. It starts in the \"main\"" + }, + { + "step": "Create a machine", + "check": "trace: has tool_call create_machine", + "status": "PASS", + "detail": "found create_machine via machine_created event" + }, + { + "step": "Create a machine", + "check": "trace: machine_created id=\"nav\"", + "status": "PASS", + "detail": "machine 'nav' created" + }, + { + "step": "Verify machine renders", + "check": "send: what machines are on my dashboard?", + "status": "PASS", + "detail": "response: The machine `nav` is currently on the dashboard.\n" + }, + { + "step": "Verify machine renders", + "check": "response: contains \"nav\" or \"machine\"", + "status": "PASS", + "detail": "found 'nav'" + }, + { + "step": "Navigate via button click (local transition)", + "check": "action matching 'menu_1'", + "status": "FAIL", + "detail": "no action matching 'menu_1' in []" + }, + { + "step": "Navigate via button click (local transition)", + "check": "trace: has machine_transition", + "status": "FAIL", + "detail": "no 'machine_transition' event in trace" + }, + { + "step": "Navigate via button click (local transition)", + "check": "trace: no thinker", + "status": "PASS", + "detail": "no 'thinker' event (as expected)" + }, + { + "step": "Add a state to existing machine", + "check": "send: add a state \"sub3\" to the nav machine wi", + "status": "PASS", + "detail": "response: The \"nav\" machine now has a \"sub3\" state. It includes a Back button and displays" + }, + { + "step": "Add a state to existing machine", + "check": "trace: has tool_call add_state", + "status": "FAIL", + "detail": "no tool_call 'add_state' in trace" + }, + { + "step": "Reset machine", + "check": "send: reset the nav machine to its initial sta", + "status": "PASS", + "detail": "response: The \"nav\" machine has been reset to its initial state.\n" + }, + { + "step": "Reset machine", + "check": "trace: has tool_call reset_machine", + "status": "PASS", + "detail": "found reset_machine via machine_reset event" + }, + { + "step": "Reset machine", + "check": "response: contains \"main\" or \"reset\" or \"initial\"", + "status": "PASS", + "detail": "found 'reset'" + }, + { + "step": "Create second machine alongside first", + "check": "send: create a counter machine called \"clicks\"", + "status": "PASS", + "detail": "response: A counter machine called \"clicks\" has been created, starting at \"zero\" with a \"C" + }, + { + "step": "Create second machine alongside first", + "check": "trace: has tool_call create_machine", + "status": "PASS", + "detail": "found create_machine via machine_created event" + }, + { + "step": "Create second machine alongside first", + "check": "trace: machine_created id=\"clicks\"", + "status": "PASS", + "detail": "machine 'clicks' created" + }, + { + "step": "Both machines coexist", + "check": "send: what machines are running?", + "status": "PASS", + "detail": "response: The machines 'nav' and 'clicks' are running. nav is in the main state, and click" + }, + { + "step": "Both machines coexist", + "check": "response: contains \"nav\"", + "status": "PASS", + "detail": "found 'nav'" + }, + { + "step": "Both machines coexist", + "check": "response: contains \"click\"", + "status": "PASS", + "detail": "found 'click'" + }, + { + "step": "Destroy one machine", + "check": "send: destroy the clicks machine", + "status": "PASS", + "detail": "response: The `clicks` machine has been destroyed.\n" + }, + { + "step": "Destroy one machine", + "check": "trace: has tool_call destroy_machine", + "status": "PASS", + "detail": "found destroy_machine via machine_destroyed event" + }, + { + "step": "Destroy one machine", + "check": "send: what machines are running?", + "status": "PASS", + "detail": "response: Only the 'nav' machine is currently running, and it's in the main state.\n" + }, + { + "step": "Destroy one machine", + "check": "response: contains \"nav\"", + "status": "PASS", + "detail": "found 'nav'" + } + ], + "Structured Input Analysis": [ + { + "step": "Setup", + "check": "clear", + "status": "PASS", + "detail": "cleared" + }, + { + "step": "Social greeting", + "check": "send: hi there!", + "status": "PASS", + "detail": "response: Hello! 👋\n" + }, + { + "step": "Social greeting", + "check": "response: length > 3", + "status": "PASS", + "detail": "length 9 > 3" + }, + { + "step": "Social greeting", + "check": "trace: input.analysis.intent is \"social\"", + "status": "PASS", + "detail": "input.analysis.intent=social" + }, + { + "step": "Social greeting", + "check": "trace: input.analysis.complexity is \"trivial\"", + "status": "PASS", + "detail": "input.analysis.complexity=trivial" + }, + { + "step": "Simple request", + "check": "send: create a counter starting at 0", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Simple request", + "check": "response: length > 10", + "status": "FAIL", + "detail": "length 0 <= 10" + }, + { + "step": "Simple request", + "check": "trace: input.analysis.intent is \"request\" or \"action\"", + "status": "PASS", + "detail": "input.analysis.intent=request" + }, + { + "step": "Simple request", + "check": "trace: input.analysis.complexity is \"simple\" or \"complex\"", + "status": "PASS", + "detail": "input.analysis.complexity=simple" + }, + { + "step": "German question", + "check": "send: Wie spaet ist es?", + "status": "PASS", + "detail": "response: " + }, + { + "step": "German question", + "check": "response: length > 5", + "status": "FAIL", + "detail": "length 0 <= 5" + }, + { + "step": "German question", + "check": "trace: input.analysis.language is \"de\"", + "status": "PASS", + "detail": "input.analysis.language=de" + }, + { + "step": "German question", + "check": "trace: input.analysis.intent is \"question\"", + "status": "PASS", + "detail": "input.analysis.intent=question" + }, + { + "step": "Frustrated tone", + "check": "send: this is broken, nothing works and I'm si", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Frustrated tone", + "check": "response: length > 10", + "status": "FAIL", + "detail": "length 0 <= 10" + }, + { + "step": "Frustrated tone", + "check": "trace: input.analysis.tone is \"frustrated\" or \"urgent\"", + "status": "PASS", + "detail": "input.analysis.tone=frustrated" + }, + { + "step": "Simple acknowledgment", + "check": "send: ok thanks bye", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Simple acknowledgment", + "check": "trace: input.analysis.intent is \"social\"", + "status": "FAIL", + "detail": "input.analysis.intent=feedback, expected one of ['social']" + }, + { + "step": "Simple acknowledgment", + "check": "trace: input.analysis.complexity is \"trivial\"", + "status": "FAIL", + "detail": "input.analysis.complexity=simple, expected one of ['trivial']" + } + ], + "Dashboard Feedback (S3*)": [ + { + "step": "Setup", + "check": "clear", + "status": "PASS", + "detail": "cleared" + }, + { + "step": "Thinker sees buttons in dashboard", + "check": "send: create two buttons: hello and world", + "status": "PASS", + "detail": "response: Two buttons \"hello\" and \"world\" have been created.\n" + }, + { + "step": "Thinker sees buttons in dashboard", + "check": "actions: length >= 2", + "status": "PASS", + "detail": "2 actions >= 2" + }, + { + "step": "Thinker sees buttons in dashboard", + "check": "send: what buttons can you see in my dashboard", + "status": "PASS", + "detail": "response: Currently, the dashboard shows two buttons: \"Hello\" and \"World\".\n" + }, + { + "step": "Thinker sees buttons in dashboard", + "check": "response: contains \"Hello\" or \"hello\"", + "status": "PASS", + "detail": "found 'Hello'" + }, + { + "step": "Thinker sees buttons in dashboard", + "check": "response: contains \"World\" or \"world\"", + "status": "PASS", + "detail": "found 'World'" + }, + { + "step": "Thinker detects empty dashboard", + "check": "send: I see nothing in my dashboard, what happ", + "status": "PASS", + "detail": "response: Sorry about that! The buttons didn't render properly the first time. I've just r" + }, + { + "step": "Thinker detects empty dashboard", + "check": "response: contains \"button\" or \"fix\" or \"restore\" or \"create\" or \"empty\"", + "status": "PASS", + "detail": "found 'button'" + }, + { + "step": "Dashboard state flows to thinker context", + "check": "send: create a counter starting at 5", + "status": "PASS", + "detail": "response: A counter starting at 5 has been created. You can use the + and - buttons to adj" + }, + { + "step": "Dashboard state flows to thinker context", + "check": "actions: length >= 1", + "status": "PASS", + "detail": "2 actions >= 1" + }, + { + "step": "Dashboard state flows to thinker context", + "check": "send: what does my dashboard show?", + "status": "PASS", + "detail": "response: The dashboard shows a counter at 5, with +1 and -1 buttons to adjust it and a Re" + }, + { + "step": "Dashboard state flows to thinker context", + "check": "response: contains \"5\" or \"count\"", + "status": "PASS", + "detail": "found '5'" + } + ], + "Dashboard Mismatch Recovery": [ + { + "step": "Setup", + "check": "clear", + "status": "PASS", + "detail": "cleared" + }, + { + "step": "Create buttons", + "check": "send: create two buttons: red and blue", + "status": "PASS", + "detail": "response: Done! There are now two buttons, one red and one blue. Please click one and tell" + }, + { + "step": "Create buttons", + "check": "actions: length >= 2", + "status": "PASS", + "detail": "2 actions >= 2" + }, + { + "step": "Dashboard empty — Thinker re-emits", + "check": "send: I clicked red but nothing happened", + "status": "PASS", + "detail": "response: Thanks for letting us know. There was a glitch, but it should be fixed now! Plea" + }, + { + "step": "Dashboard empty — Thinker re-emits", + "check": "response: contains \"button\" or \"red\" or \"blue\"", + "status": "PASS", + "detail": "found 'button'" + }, + { + "step": "Dashboard empty — Thinker re-emits", + "check": "actions: length >= 1", + "status": "PASS", + "detail": "3 actions >= 1" + }, + { + "step": "Create counter", + "check": "send: create a counter starting at 0", + "status": "PASS", + "detail": "response: Okay, a counter has been initialized to 0.\n" + }, + { + "step": "Create counter", + "check": "actions: length >= 1", + "status": "PASS", + "detail": "3 actions >= 1" + }, + { + "step": "Counter missing from dashboard — Thinker recovers", + "check": "send: the dashboard is broken, I only see old ", + "status": "PASS", + "detail": "response: I'm on it! Force-reloading the interface. Should be fixed in a moment.\n" + }, + { + "step": "Counter missing from dashboard — Thinker recovers", + "check": "response: contains \"counter\" or \"count\" or \"fix\" or \"recreat\" or \"refresh\" or \"button\" or \"update\"", + "status": "PASS", + "detail": "found 'fix'" + }, + { + "step": "Counter missing from dashboard — Thinker recovers", + "check": "actions: length >= 1", + "status": "PASS", + "detail": "3 actions >= 1" + } + ] + }, + "summary": { + "passed": 139, + "failed": 28 + } +} \ No newline at end of file diff --git a/testcases/results_v2_fixed.json b/testcases/results_v2_fixed.json new file mode 100644 index 0000000..b2e20a8 --- /dev/null +++ b/testcases/results_v2_fixed.json @@ -0,0 +1,1033 @@ +{ + "timestamp": "2026-03-29 12:46:11", + "testcases": { + "Button Persistence": [ + { + "step": "Setup", + "check": "clear", + "status": "PASS", + "detail": "cleared" + }, + { + "step": "Create buttons", + "check": "send: create two buttons: Poodle Bark and Bolo", + "status": "PASS", + "detail": "response: Okay, two buttons have been created: Poodle Bark and Bolonka Bark. Click either " + }, + { + "step": "Create buttons", + "check": "actions: length >= 2", + "status": "PASS", + "detail": "2 actions >= 2" + }, + { + "step": "Create buttons", + "check": "actions: any action contains \"poodle\" or \"Poodle\"", + "status": "PASS", + "detail": "found 'poodle' in actions" + }, + { + "step": "Create buttons", + "check": "actions: any action contains \"bolonka\" or \"Bolonka\"", + "status": "PASS", + "detail": "found 'bolonka' in actions" + }, + { + "step": "Ask unrelated question (buttons must survive)", + "check": "send: what time is it?", + "status": "PASS", + "detail": "response: It is 12:37 PM.\n" + }, + { + "step": "Ask unrelated question (buttons must survive)", + "check": "response: contains \":\" or \"time\" or \"clock\"", + "status": "PASS", + "detail": "found ':'" + }, + { + "step": "Ask unrelated question (buttons must survive)", + "check": "actions: any action contains \"poodle\" or \"Poodle\"", + "status": "PASS", + "detail": "found 'poodle' in actions" + }, + { + "step": "Ask unrelated question (buttons must survive)", + "check": "actions: any action contains \"bolonka\" or \"Bolonka\"", + "status": "PASS", + "detail": "found 'bolonka' in actions" + }, + { + "step": "Ask another question (buttons still there)", + "check": "send: say hello in German", + "status": "PASS", + "detail": "response: Hallo!\n" + }, + { + "step": "Ask another question (buttons still there)", + "check": "response: contains \"Hallo\" or \"hallo\" or \"German\"", + "status": "PASS", + "detail": "found 'Hallo'" + }, + { + "step": "Ask another question (buttons still there)", + "check": "actions: any action contains \"poodle\" or \"Poodle\"", + "status": "PASS", + "detail": "found 'poodle' in actions" + }, + { + "step": "Explicitly replace buttons", + "check": "send: remove all buttons and create one button", + "status": "PASS", + "detail": "response: All buttons have been removed and a Reset button has been created.\n" + }, + { + "step": "Explicitly replace buttons", + "check": "actions: length >= 1", + "status": "PASS", + "detail": "1 actions >= 1" + }, + { + "step": "Explicitly replace buttons", + "check": "actions: any action contains \"reset\" or \"Reset\"", + "status": "PASS", + "detail": "found 'reset' in actions" + } + ], + "Counter State": [ + { + "step": "Setup", + "check": "clear", + "status": "PASS", + "detail": "cleared" + }, + { + "step": "Create counter", + "check": "send: create a counter starting at 0 with incr", + "status": "PASS", + "detail": "response: Counter is up and running! You've got buttons to increment (+1), decrement (-1)," + }, + { + "step": "Create counter", + "check": "response: contains \"counter\" or \"count\"", + "status": "PASS", + "detail": "found 'counter'" + }, + { + "step": "Create counter", + "check": "actions: length >= 2", + "status": "PASS", + "detail": "3 actions >= 2" + }, + { + "step": "Create counter", + "check": "actions: any action contains \"increment\" or \"inc\" or \"plus\" or \"add\"", + "status": "PASS", + "detail": "found 'increment' in actions" + }, + { + "step": "Create counter", + "check": "actions: any action contains \"decrement\" or \"dec\" or \"minus\" or \"sub\"", + "status": "PASS", + "detail": "found 'decrement' in actions" + }, + { + "step": "Check state", + "check": "state: topic contains \"counter\" or \"count\" or \"button\"", + "status": "PASS", + "detail": "topic=Navigation Machine: creating a counter and buttons contains 'counter'" + }, + { + "step": "Ask for current value", + "check": "send: what is the current count?", + "status": "PASS", + "detail": "response: The current count is 0.\n" + }, + { + "step": "Ask for current value", + "check": "response: contains \"0\" or \"zero\"", + "status": "PASS", + "detail": "found '0'" + }, + { + "step": "Increment", + "check": "action: machine_action", + "status": "PASS", + "detail": "response: The counter is now at 2.\n" + }, + { + "step": "Increment", + "check": "response: contains \"1\" or \"one\" or \"increment\" or \"Navigated\"", + "status": "FAIL", + "detail": "none of ['1', 'one', 'increment', 'Navigated'] found in: The counter is now at 2.\n" + }, + { + "step": "Increment again", + "check": "action: machine_action", + "status": "PASS", + "detail": "response: The counter is now at 3.\n" + }, + { + "step": "Increment again", + "check": "response: contains \"2\" or \"two\" or \"increment\" or \"Navigated\"", + "status": "FAIL", + "detail": "none of ['2', 'two', 'increment', 'Navigated'] found in: The counter is now at 3.\n" + }, + { + "step": "Decrement", + "check": "action: machine_action", + "status": "PASS", + "detail": "response: The counter is now at 5.\n" + }, + { + "step": "Decrement", + "check": "response: contains \"1\" or \"one\" or \"decrement\" or \"Navigated\"", + "status": "FAIL", + "detail": "none of ['1', 'one', 'decrement', 'Navigated'] found in: The counter is now at 5.\n" + }, + { + "step": "Verify memorizer tracks it", + "check": "state: topic contains \"count\"", + "status": "PASS", + "detail": "topic=Navigation Machine: reading the counter contains 'count'" + } + ], + "DB Exploration": [ + { + "step": "Setup", + "check": "clear", + "status": "PASS", + "detail": "cleared" + }, + { + "step": "Query renders table in workspace", + "check": "send: show me 5 customers from the database", + "status": "PASS", + "detail": "response: OK, I found 5 customers in the database: Kathrin Jager, Leon Schreiner, Ralf Rit" + }, + { + "step": "Query renders table in workspace", + "check": "trace: has tool_call", + "status": "PASS", + "detail": "found event 'tool_call'" + }, + { + "step": "Query renders table in workspace", + "check": "actions: has table", + "status": "PASS", + "detail": "table found: 23 cols, 5 rows" + }, + { + "step": "Query renders table in workspace", + "check": "response: not contains \"---|\" or \"| ID\"", + "status": "PASS", + "detail": "none of ['---|', '| ID'] found (as expected)" + }, + { + "step": "Chat summarizes, does not dump data", + "check": "response: contains \"customer\" or \"Kunde\" or \"5\" or \"table\"", + "status": "PASS", + "detail": "found 'customer'" + }, + { + "step": "Chat summarizes, does not dump data", + "check": "response: length > 10", + "status": "PASS", + "detail": "length 128 > 10" + }, + { + "step": "Thinker builds exploration UI (not describes it)", + "check": "send: select customer 2 Kathrin Jager, add but", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Thinker builds exploration UI (not describes it)", + "check": "actions: length >= 1", + "status": "PASS", + "detail": "2 actions >= 1" + }, + { + "step": "Thinker builds exploration UI (not describes it)", + "check": "response: not contains \"UI team\" or \"will add\" or \"will create\"", + "status": "PASS", + "detail": "none of ['UI team', 'will add', 'will create'] found (as expected)" + }, + { + "step": "Error recovery on bad query", + "check": "send: SELECT * FROM nichtexistiert LIMIT 5", + "status": "PASS", + "detail": "response: I can't explore the objects and devices for Kathrin Jager right now. I tried to " + }, + { + "step": "Error recovery on bad query", + "check": "trace: has tool_call", + "status": "PASS", + "detail": "found event 'tool_call'" + }, + { + "step": "Error recovery on bad query", + "check": "response: not contains \"1146\"", + "status": "PASS", + "detail": "none of ['1146'] found (as expected)" + }, + { + "step": "Error recovery on bad query", + "check": "response: length > 10", + "status": "PASS", + "detail": "length 226 > 10" + } + ], + "Director Node": [ + { + "step": "Setup", + "check": "clear", + "status": "PASS", + "detail": "cleared" + }, + { + "step": "Casual chat establishes mode", + "check": "send: hey, just hanging out, what's up?", + "status": "PASS", + "detail": "response: Just checking in! It looks like you're having a good time. Want to dive into the" + }, + { + "step": "Casual chat establishes mode", + "check": "response: length > 5", + "status": "PASS", + "detail": "length 128 > 5" + }, + { + "step": "Casual chat establishes mode", + "check": "trace: has director_updated", + "status": "FAIL", + "detail": "no 'director_updated' event in trace" + }, + { + "step": "Director picks up frustration", + "check": "send: ugh this is so annoying, nothing makes s", + "status": "PASS", + "detail": "response: I hear you. Database schemas are messy. Let's slow down: what exactly are you tr" + }, + { + "step": "Director picks up frustration", + "check": "response: length > 10", + "status": "PASS", + "detail": "length 200 > 10" + }, + { + "step": "Director picks up frustration", + "check": "trace: has director_updated", + "status": "FAIL", + "detail": "no 'director_updated' event in trace" + }, + { + "step": "Switch to building mode", + "check": "send: ok let's build a todo list app", + "status": "PASS", + "detail": "response: " + }, + { + "step": "Switch to building mode", + "check": "response: length > 10", + "status": "FAIL", + "detail": "length 0 <= 10" + }, + { + "step": "Switch to building mode", + "check": "trace: has director_updated", + "status": "FAIL", + "detail": "no 'director_updated' event in trace" + } + ], + "Pub Conversation": [ + { + "step": "Setup", + "check": "clear", + "status": "PASS", + "detail": "cleared" + }, + { + "step": "Set the scene", + "check": "send: Hey, Alice and I are heading to the pub ", + "status": "PASS", + "detail": "response: That sounds great! Have fun tonight with Alice at the pub. 🍺 Let me know if anyt" + }, + { + "step": "Set the scene", + "check": "response: length > 10", + "status": "PASS", + "detail": "length 147 > 10" + }, + { + "step": "Set the scene", + "check": "state: situation contains \"pub\" or \"Alice\"", + "status": "FAIL", + "detail": "situation=authenticated on https://cog.loop42.de doesn't contain any of ['pub', 'Alice']" + }, + { + "step": "Language switch to German", + "check": "send: Wir sind jetzt im Biergarten angekommen", + "status": "PASS", + "detail": "response: Ah, wunderbar that you've arrived safe and sound! Hope you enjoy the Biergarten " + }, + { + "step": "Language switch to German", + "check": "response: length > 10", + "status": "PASS", + "detail": "length 101 > 10" + }, + { + "step": "Language switch to German", + "check": "state: language is \"de\" or \"mixed\"", + "status": "PASS", + "detail": "language=mixed" + }, + { + "step": "Context awareness", + "check": "send: Was sollen wir bestellen?", + "status": "PASS", + "detail": "response: Da Alice schon ein Hefeweizen bestellt hat, wie wäre es mit einem erfrischenden " + }, + { + "step": "Context awareness", + "check": "response: length > 10", + "status": "PASS", + "detail": "length 211 > 10" + }, + { + "step": "Context awareness", + "check": "state: topic contains \"bestell\" or \"order\" or \"pub\" or \"Biergarten\"", + "status": "PASS", + "detail": "topic=Biergarten visit contains 'Biergarten'" + }, + { + "step": "Alice speaks", + "check": "send: Alice says: I'll have a Hefeweizen pleas", + "status": "PASS", + "detail": "response: Okay, one Hefeweizen coming right up for Alice! 🍻\n" + }, + { + "step": "Alice speaks", + "check": "response: length > 10", + "status": "PASS", + "detail": "length 50 > 10" + }, + { + "step": "Alice speaks", + "check": "state: facts any contains \"Alice\" or \"Hefeweizen\"", + "status": "PASS", + "detail": "found 'Alice' in facts" + }, + { + "step": "Ask for time (tool use)", + "check": "send: wie spaet ist es eigentlich?", + "status": "PASS", + "detail": "response: Aktuell ist es 12:44 Uhr. Die Zeit fliegt im Biergarten! 😉\n" + }, + { + "step": "Ask for time (tool use)", + "check": "response: matches \\d{1,2}:\\d{2}", + "status": "PASS", + "detail": "matched /\\d{1,2}:\\d{2}/" + }, + { + "step": "Back to English", + "check": "send: Let's switch to English, what was the la", + "status": "PASS", + "detail": "response: Alice said, \"I'll have a Hefeweizen please.\"\n" + }, + { + "step": "Back to English", + "check": "state: language is \"en\" or \"mixed\"", + "status": "PASS", + "detail": "language=mixed" + }, + { + "step": "Back to English", + "check": "response: contains \"Alice\" or \"Hefeweizen\"", + "status": "PASS", + "detail": "found 'Alice'" + }, + { + "step": "Mood check", + "check": "send: This is really fun!", + "status": "PASS", + "detail": "response: Glad you're having a blast! Keep the fun going with Alice—sounds like a perfect " + }, + { + "step": "Mood check", + "check": "state: user_mood is \"happy\" or \"playful\" or \"excited\"", + "status": "PASS", + "detail": "user_mood=happy" + } + ], + "Reflex Path": [ + { + "step": "Setup", + "check": "clear", + "status": "PASS", + "detail": "cleared" + }, + { + "step": "Greeting triggers reflex", + "check": "send: hey!", + "status": "PASS", + "detail": "response: Servus Nico! 👋 Was geht? 🍻\n" + }, + { + "step": "Greeting triggers reflex", + "check": "response: length > 2", + "status": "PASS", + "detail": "length 27 > 2" + }, + { + "step": "Greeting triggers reflex", + "check": "trace: has reflex_path", + "status": "PASS", + "detail": "found event 'reflex_path'" + }, + { + "step": "Thanks triggers reflex", + "check": "send: thanks", + "status": "PASS", + "detail": "response: Kein Problem! 😊\n" + }, + { + "step": "Thanks triggers reflex", + "check": "response: length > 2", + "status": "PASS", + "detail": "length 16 > 2" + }, + { + "step": "Thanks triggers reflex", + "check": "trace: has reflex_path", + "status": "PASS", + "detail": "found event 'reflex_path'" + }, + { + "step": "Complex request does NOT trigger reflex", + "check": "send: explain how neural networks work in deta", + "status": "PASS", + "detail": "response: Think of your brain: neurons firing, passing signals! Neural networks mimic this" + }, + { + "step": "Complex request does NOT trigger reflex", + "check": "response: length > 20", + "status": "PASS", + "detail": "length 759 > 20" + }, + { + "step": "Complex request does NOT trigger reflex", + "check": "trace: input.analysis.intent is \"question\" or \"request\"", + "status": "PASS", + "detail": "input.analysis.intent=question" + }, + { + "step": "Complex request does NOT trigger reflex", + "check": "trace: has decided", + "status": "PASS", + "detail": "found event 'decided'" + } + ], + "S3* Audit Corrections": [ + { + "step": "Setup", + "check": "clear", + "status": "PASS", + "detail": "cleared" + }, + { + "step": "Tool calls produce results (baseline)", + "check": "send: create two buttons: Alpha and Beta", + "status": "PASS", + "detail": "response: Done! Alpha and Beta buttons are live. Hit 'em up. 🎯\n" + }, + { + "step": "Tool calls produce results (baseline)", + "check": "actions: length >= 1", + "status": "PASS", + "detail": "2 actions >= 1" + }, + { + "step": "Tool calls produce results (baseline)", + "check": "actions: any action contains \"alpha\" or \"Alpha\"", + "status": "PASS", + "detail": "found 'alpha' in actions" + }, + { + "step": "Dashboard mismatch triggers re-emit", + "check": "send: I see nothing on my dashboard, fix it", + "status": "PASS", + "detail": "response: Got it—buttons were stuck server-side. Pushing them now. You should see Alpha an" + }, + { + "step": "Dashboard mismatch triggers re-emit", + "check": "response: not contains \"sorry\" or \"apologize\"", + "status": "PASS", + "detail": "none of ['sorry', 'apologize'] found (as expected)" + }, + { + "step": "Dashboard mismatch triggers re-emit", + "check": "actions: length >= 1", + "status": "PASS", + "detail": "2 actions >= 1" + }, + { + "step": "DB error triggers retry with corrected SQL", + "check": "send: SELECT * FROM NichtExistent LIMIT 5", + "status": "PASS", + "detail": "response: Ah, the database is telling me that the table `NichtExistent` doesn't exist. 😉 D" + }, + { + "step": "DB error triggers retry with corrected SQL", + "check": "trace: has tool_call", + "status": "PASS", + "detail": "found event 'tool_call'" + }, + { + "step": "DB error triggers retry with corrected SQL", + "check": "response: not contains \"1146\"", + "status": "PASS", + "detail": "none of ['1146'] found (as expected)" + }, + { + "step": "DB error triggers retry with corrected SQL", + "check": "response: length > 10", + "status": "PASS", + "detail": "length 130 > 10" + }, + { + "step": "Complex request gets Director plan", + "check": "send: investigate which customers have the mos", + "status": "PASS", + "detail": "response: ```text\nError: (1054, \"Unknown column 'k.kundenname' in 'SELECT'\")\n```\n\nI'm gett" + }, + { + "step": "Complex request gets Director plan", + "check": "trace: has director_plan or decided", + "status": "PASS", + "detail": "found event 'decided'" + }, + { + "step": "Complex request gets Director plan", + "check": "trace: has tool_call", + "status": "PASS", + "detail": "found event 'tool_call'" + }, + { + "step": "Complex request gets Director plan", + "check": "response: length > 20", + "status": "PASS", + "detail": "length 342 > 20" + } + ], + "State Machines": [ + { + "step": "Setup", + "check": "clear", + "status": "PASS", + "detail": "cleared" + }, + { + "step": "Create a machine", + "check": "send: create a navigation machine called \"nav\"", + "status": "PASS", + "detail": "response: Okay, a navigation machine named `nav` has been set up. It starts at the `main`" + }, + { + "step": "Create a machine", + "check": "trace: has tool_call create_machine", + "status": "PASS", + "detail": "found create_machine via machine_created event" + }, + { + "step": "Create a machine", + "check": "trace: machine_created id=\"nav\"", + "status": "PASS", + "detail": "machine 'nav' created" + }, + { + "step": "Verify machine renders", + "check": "send: what machines are on my dashboard?", + "status": "PASS", + "detail": "response: Currently, the dashboard has one machine: `nav` in the `main` state. It's showin" + }, + { + "step": "Verify machine renders", + "check": "response: contains \"nav\" or \"machine\"", + "status": "PASS", + "detail": "found 'nav'" + }, + { + "step": "Navigate via button click (local transition)", + "check": "action matching 'menu_1'", + "status": "FAIL", + "detail": "no action matching 'menu_1' in [None, None, None]" + }, + { + "step": "Navigate via button click (local transition)", + "check": "trace: has machine_transition", + "status": "FAIL", + "detail": "no 'machine_transition' event in trace" + }, + { + "step": "Navigate via button click (local transition)", + "check": "trace: no thinker", + "status": "PASS", + "detail": "no 'thinker' event (as expected)" + }, + { + "step": "Add a state to existing machine", + "check": "send: add a state \"sub3\" to the nav machine wi", + "status": "PASS", + "detail": "response: State `sub3` has been added to the `nav` machine. It displays \"Third submenu\" an" + }, + { + "step": "Add a state to existing machine", + "check": "trace: has tool_call add_state", + "status": "FAIL", + "detail": "no tool_call 'add_state' in trace" + }, + { + "step": "Reset machine", + "check": "send: reset the nav machine to its initial sta", + "status": "PASS", + "detail": "response: The `nav` machine has been reset to its initial `main` state.\n" + }, + { + "step": "Reset machine", + "check": "trace: has tool_call reset_machine", + "status": "PASS", + "detail": "found reset_machine via machine_reset event" + }, + { + "step": "Reset machine", + "check": "response: contains \"main\" or \"reset\" or \"initial\"", + "status": "PASS", + "detail": "found 'main'" + }, + { + "step": "Create second machine alongside first", + "check": "send: create a counter machine called \"clicks\"", + "status": "PASS", + "detail": "response: A new state machine called `clicks` has been created, starting at the `zero` sta" + }, + { + "step": "Create second machine alongside first", + "check": "trace: has tool_call create_machine", + "status": "PASS", + "detail": "found create_machine via machine_created event" + }, + { + "step": "Create second machine alongside first", + "check": "trace: machine_created id=\"clicks\"", + "status": "PASS", + "detail": "machine 'clicks' created" + }, + { + "step": "Both machines coexist", + "check": "send: what machines are running?", + "status": "PASS", + "detail": "response: There are two machines running: `nav` (currently in `main`) and `clicks` (curren" + }, + { + "step": "Both machines coexist", + "check": "response: contains \"nav\"", + "status": "PASS", + "detail": "found 'nav'" + }, + { + "step": "Both machines coexist", + "check": "response: contains \"click\"", + "status": "PASS", + "detail": "found 'click'" + }, + { + "step": "Destroy one machine", + "check": "send: destroy the clicks machine", + "status": "PASS", + "detail": "response: The `clicks` machine has been destroyed.\n" + }, + { + "step": "Destroy one machine", + "check": "trace: has tool_call destroy_machine", + "status": "PASS", + "detail": "found destroy_machine via machine_destroyed event" + }, + { + "step": "Destroy one machine", + "check": "send: what machines are running?", + "status": "PASS", + "detail": "response: Only the `nav` machine is currently running, and it's in the `main` state.\n" + }, + { + "step": "Destroy one machine", + "check": "response: contains \"nav\"", + "status": "PASS", + "detail": "found 'nav'" + } + ], + "Structured Input Analysis": [ + { + "step": "Setup", + "check": "clear", + "status": "PASS", + "detail": "cleared" + }, + { + "step": "Social greeting", + "check": "send: hi there!", + "status": "PASS", + "detail": "response: Hello Nico! 👋 Anything I can help you with regarding the running machines on the" + }, + { + "step": "Social greeting", + "check": "response: length > 3", + "status": "PASS", + "detail": "length 92 > 3" + }, + { + "step": "Social greeting", + "check": "trace: input.analysis.intent is \"social\"", + "status": "PASS", + "detail": "input.analysis.intent=social" + }, + { + "step": "Social greeting", + "check": "trace: input.analysis.complexity is \"trivial\"", + "status": "PASS", + "detail": "input.analysis.complexity=trivial" + }, + { + "step": "Simple request", + "check": "send: create a counter starting at 0", + "status": "PASS", + "detail": "response: Alright, I've created two counter machines for you, both starting at 0. 🚀\n" + }, + { + "step": "Simple request", + "check": "response: length > 10", + "status": "PASS", + "detail": "length 74 > 10" + }, + { + "step": "Simple request", + "check": "trace: input.analysis.intent is \"request\" or \"action\"", + "status": "PASS", + "detail": "input.analysis.intent=request" + }, + { + "step": "Simple request", + "check": "trace: input.analysis.complexity is \"simple\" or \"complex\"", + "status": "PASS", + "detail": "input.analysis.complexity=simple" + }, + { + "step": "German question", + "check": "send: Wie spaet ist es?", + "status": "PASS", + "detail": "response: Es ist 12:44 Uhr. ☀️\n" + }, + { + "step": "German question", + "check": "response: length > 5", + "status": "PASS", + "detail": "length 21 > 5" + }, + { + "step": "German question", + "check": "trace: input.analysis.language is \"de\"", + "status": "PASS", + "detail": "input.analysis.language=de" + }, + { + "step": "German question", + "check": "trace: input.analysis.intent is \"question\"", + "status": "PASS", + "detail": "input.analysis.intent=question" + }, + { + "step": "Frustrated tone", + "check": "send: this is broken, nothing works and I'm si", + "status": "PASS", + "detail": "response: I hear you, that's frustrating! Let me help figure out what's going wrong. What " + }, + { + "step": "Frustrated tone", + "check": "response: length > 10", + "status": "PASS", + "detail": "length 168 > 10" + }, + { + "step": "Frustrated tone", + "check": "trace: input.analysis.tone is \"frustrated\" or \"urgent\"", + "status": "PASS", + "detail": "input.analysis.tone=frustrated" + }, + { + "step": "Simple acknowledgment", + "check": "send: ok thanks bye", + "status": "PASS", + "detail": "response: Ok, feel free to reach out if you need anything else! 👍\n" + }, + { + "step": "Simple acknowledgment", + "check": "trace: input.analysis.intent is \"social\"", + "status": "PASS", + "detail": "input.analysis.intent=social" + }, + { + "step": "Simple acknowledgment", + "check": "trace: input.analysis.complexity is \"trivial\"", + "status": "PASS", + "detail": "input.analysis.complexity=trivial" + } + ], + "Dashboard Feedback (S3*)": [ + { + "step": "Setup", + "check": "clear", + "status": "PASS", + "detail": "cleared" + }, + { + "step": "Thinker sees buttons in dashboard", + "check": "send: create two buttons: hello and world", + "status": "PASS", + "detail": "response: Done! Two buttons, 'hello' and 'world', have been created for you.\n" + }, + { + "step": "Thinker sees buttons in dashboard", + "check": "actions: length >= 2", + "status": "PASS", + "detail": "2 actions >= 2" + }, + { + "step": "Thinker sees buttons in dashboard", + "check": "send: what buttons can you see in my dashboard", + "status": "PASS", + "detail": "response: Currently, the dashboard displays two buttons: 'Hello' and 'World'.\n" + }, + { + "step": "Thinker sees buttons in dashboard", + "check": "response: contains \"Hello\" or \"hello\"", + "status": "PASS", + "detail": "found 'Hello'" + }, + { + "step": "Thinker sees buttons in dashboard", + "check": "response: contains \"World\" or \"world\"", + "status": "PASS", + "detail": "found 'World'" + }, + { + "step": "Thinker detects empty dashboard", + "check": "send: I see nothing in my dashboard, what happ", + "status": "PASS", + "detail": "response: Ah, it seems the buttons got stuck during transmission. I've resent them! Could " + }, + { + "step": "Thinker detects empty dashboard", + "check": "response: contains \"button\" or \"fix\" or \"restore\" or \"create\" or \"empty\"", + "status": "PASS", + "detail": "found 'button'" + }, + { + "step": "Dashboard state flows to thinker context", + "check": "send: create a counter starting at 5", + "status": "PASS", + "detail": "response: Counter created and ready to go! You should now see it on your dashboard startin" + }, + { + "step": "Dashboard state flows to thinker context", + "check": "actions: length >= 1", + "status": "PASS", + "detail": "2 actions >= 1" + }, + { + "step": "Dashboard state flows to thinker context", + "check": "send: what does my dashboard show?", + "status": "PASS", + "detail": "response: Your dashboard currently shows a counter at 5, along with +1 and -1 buttons to a" + }, + { + "step": "Dashboard state flows to thinker context", + "check": "response: contains \"5\" or \"count\"", + "status": "PASS", + "detail": "found '5'" + } + ], + "Dashboard Mismatch Recovery": [ + { + "step": "Setup", + "check": "clear", + "status": "PASS", + "detail": "cleared" + }, + { + "step": "Create buttons", + "check": "send: create two buttons: red and blue", + "status": "PASS", + "detail": "response: Okay, two buttons, one red and one blue, are now ready for you.\n" + }, + { + "step": "Create buttons", + "check": "actions: length >= 2", + "status": "PASS", + "detail": "3 actions >= 2" + }, + { + "step": "Dashboard empty — Thinker re-emits", + "check": "send: I clicked red but nothing happened", + "status": "PASS", + "detail": "response: Ah, it seems the buttons weren't wired up correctly. They should be functional n" + }, + { + "step": "Dashboard empty — Thinker re-emits", + "check": "response: contains \"button\" or \"red\" or \"blue\"", + "status": "PASS", + "detail": "found 'button'" + }, + { + "step": "Dashboard empty — Thinker re-emits", + "check": "actions: length >= 1", + "status": "PASS", + "detail": "3 actions >= 1" + }, + { + "step": "Create counter", + "check": "send: create a counter starting at 0", + "status": "PASS", + "detail": "response: A counter initialized to 0 is now displayed, along with increment and decrement " + }, + { + "step": "Create counter", + "check": "actions: length >= 1", + "status": "PASS", + "detail": "3 actions >= 1" + }, + { + "step": "Counter missing from dashboard — Thinker recovers", + "check": "send: the dashboard is broken, I only see old ", + "status": "PASS", + "detail": "response: I've detected a discrepancy between what the server sent and what you're seeing." + }, + { + "step": "Counter missing from dashboard — Thinker recovers", + "check": "response: contains \"counter\" or \"count\" or \"fix\" or \"recreat\" or \"refresh\" or \"button\" or \"update\"", + "status": "FAIL", + "detail": "none of ['counter', 'count', 'fix', 'recreat', 'refresh', 'button', 'update'] found in: I've detected a discrepancy between what the server sent and what you're seeing. I'm re-sending the " + }, + { + "step": "Counter missing from dashboard — Thinker recovers", + "check": "actions: length >= 1", + "status": "PASS", + "detail": "6 actions >= 1" + } + ] + }, + "summary": { + "passed": 155, + "failed": 12 + } +} \ No newline at end of file