v0.17.0: User expectation tracking, PA retry loop, machine state in PA context

- Memorizer tracks user_expectation (conversational/delegated/waiting_input/observing)
- Output node adjusts phrasing per expectation
- PA retry loop: reformulates job on expert failure (all retries exhausted or tool skip)
- Machine state in PA context: get_machine_summary includes current state, buttons, stored data
- Expert writes to machine state via update_machine + transition_machine
- Expanded baked schema coverage
- Awareness panel shows color-coded expectation state
- Dashboard and workspace component updates

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Nico 2026-03-30 19:03:07 +02:00
parent 09374674e3
commit 925fff731f
26 changed files with 4436 additions and 232 deletions

View File

@ -155,7 +155,16 @@ def register_routes(app):
rt = _active_runtime or runtime
try:
if msg.get("type") == "action":
await rt.handle_action(msg.get("action", "unknown"), msg.get("data"))
action = msg.get("action", "unknown")
data_payload = msg.get("data")
if hasattr(rt, 'use_frames') and rt.use_frames:
# Frame engine handles actions as ACTION: prefix messages
action_text = f"ACTION:{action}"
if data_payload:
action_text += f"|data:{json.dumps(data_payload)}"
await rt.handle_message(action_text)
else:
await rt.handle_action(action, data_payload)
elif msg.get("type") == "cancel_process":
rt.process_manager.cancel(msg.get("pid", 0))
else:
@ -355,6 +364,7 @@ def register_routes(app):
"language": "en",
"style_hint": "casual, technical",
"facts": [],
"user_expectation": "conversational",
}
_pipeline_result = {"status": "idle", "id": "", "stage": "cleared"}
# Notify frontend via WS

View File

@ -302,9 +302,59 @@ class FrameEngine:
expert.send_hud = original_hud
thought_summary = (f"response[{len(thought.response)}] tool={thought.tool_used or 'none'} "
f"actions={len(thought.actions)}")
f"actions={len(thought.actions)} errors={len(thought.errors)}")
has_tool = bool(thought.tool_used and thought.tool_output)
# PA retry: if expert failed OR skipped tools when data was needed
expectation = self.memorizer.state.get("user_expectation", "conversational")
# Detect hallucination: expert returned no tool output for a data job
job_needs_data = any(k in (routing.job or "").lower()
for k in ["query", "select", "tabelle", "table", "daten", "data",
"cost", "kosten", "count", "anzahl", "average", "schnitt",
"find", "finde", "show", "zeig", "list", "beschreib"])
expert_skipped_tools = not has_tool and not thought.errors and job_needs_data
if (thought.errors or expert_skipped_tools) and not has_tool and expectation in ("delegated", "waiting_input", "conversational"):
retry_reason = f"{len(thought.errors)} errors" if thought.errors else "no tool calls for data job"
self._end_frame(rec, output_summary=thought_summary,
route="pa_retry", condition=f"expert_failed ({retry_reason}), expectation={expectation}")
await self._send_hud({"node": "runtime", "event": "pa_retry",
"detail": f"expert failed: {retry_reason}, retrying via PA"})
# Stream retry notice to user
retry_msg = "Anderer Ansatz..." if routing.language == "de" else "Trying a different approach..."
await self.sink.send_delta(retry_msg + "\n")
# PA reformulates with error context
retry_errors = thought.errors if thought.errors else [
{"query": "(none)", "error": "Expert produced no database queries. The job requires data lookup but the expert answered without querying. Reformulate with explicit query instructions."}
]
error_summary = "; ".join(e.get("error", "")[:80] for e in retry_errors[-2:])
rec = self._begin_frame(self.frame + 1, "pa_retry",
input_summary=f"errors: {error_summary[:100]}")
routing2 = await self.nodes["pa"].route_retry(
command, self.history, memory_context=mem_ctx,
identity=self.identity, channel=self.channel,
original_job=routing.job, errors=retry_errors)
self._end_frame(rec, output_summary=f"retry_job: {(routing2.job or '')[:60]}",
route=f"expert_{routing2.expert}" if routing2.expert != "none" else "output")
if routing2.expert != "none":
expert2 = self._experts.get(routing2.expert, expert)
rec = self._begin_frame(self.frame + 1, f"expert_{routing2.expert}_retry",
input_summary=f"retry job: {(routing2.job or '')[:80]}")
original_hud2 = expert2.send_hud
expert2.send_hud = self._make_progress_wrapper(original_hud2, routing2.language)
try:
thought = await expert2.execute(routing2.job, routing2.language)
finally:
expert2.send_hud = original_hud2
thought_summary = (f"response[{len(thought.response)}] tool={thought.tool_used or 'none'} "
f"errors={len(thought.errors)}")
has_tool = bool(thought.tool_used and thought.tool_output)
self._end_frame(rec, output_summary=thought_summary,
route="interpreter" if has_tool else "output+ui")
routing = routing2 # use retry routing for rest of pipeline
# Interpreter (conditional)
if self.has_interpreter and has_tool:
self._end_frame(rec, output_summary=thought_summary,
@ -607,6 +657,10 @@ class FrameEngine:
response, controls = await asyncio.gather(output_task, ui_task)
if controls:
await self.sink.send_controls(controls)
# Send artifacts (new system) alongside controls
artifacts = self.ui_node.get_artifacts()
if artifacts:
await self.sink.send_artifacts(artifacts)
return response
def _check_condition(self, name: str, command: Command = None,
@ -624,6 +678,7 @@ class FrameEngine:
return {
"response": response,
"controls": self.ui_node.current_controls,
"artifacts": self.ui_node.get_artifacts(),
"memorizer": self.memorizer.state,
"frames": self.frame,
"trace": self.last_trace.to_dict(),

View File

@ -118,6 +118,89 @@ IstRekonstruiert (bool), Herkunft (int)
ManuellerWert (double), Rohablesung (double)
Anmerkung, Fehler, Ampullenfarbe (longtext)
=== auftraege (2960 rows) billing work orders ===
PK: ID (int)
AuftragNummer, Bezeichnung (longtext)
ErstellDatum, Abgeschlossen (datetime)
ZugeordneteAbrechnungsinformationID (FK abrechnungsinformationen.ID)
ErstellMitarbeiterID (FK), AuftragsTyp (int), Status (int)
Anmerkung, ObererText, UntererText (longtext)
=== auftragspositionen (5094 rows) line items per work order ===
PK: ID (int)
AuftragID (FK auftraege.ID)
ArtikelID (FK artikel.ID)
SollMenge, IstMenge (int)
ZugeordneterGeraeteArtikelID (FK), ZugeordneteVertragPositionID (FK)
=== artikelposition (70164 rows) billing line items with prices ===
PK: ID (int)
ZugewiesenerArtikelID (FK artikel.ID)
ZugewieseneAbrechnungID (FK abrechnungsinformationen.ID)
RechnungID (FK rechnung.ID)
MengeVorgabe, Menge (decimal), NettoVorgabe, Netto (decimal), MWST (decimal)
Rechnungsart (int), VorschussBerechnung (bool), ARechnung (bool)
VerstecktInNebenkostenID (FK), ZugeordneteVertragPositionID (FK)
=== artikel (1078 rows) service/product catalog ===
PK: ID (int)
Artikelnummer, Bezeichnung (longtext)
Netto (decimal), MWST (decimal)
BerechnungsZiel (int), UmlageIn (int)
ZugeordnetePreislisteID (FK)
IstStandard, ARechnung, AppZusatz, IstEigenKostenpos (bool)
=== rechnung (7356 rows) invoices ===
PK: ID (int)
Rechnungsnummer (longtext), Rechnungsart (int)
BezahltAm (datetime), BezahlterBetrag (decimal)
Druckdatum, Erstelldatum, Exportdatum (datetime)
AbrechnungsinformationID (FK abrechnungsinformationen.ID)
AbschlagSummeSonder, AbschlagSummeStandard (decimal)
Bankeinzug (bool)
=== abrechnungsinformationen (4261 rows) billing periods/settings ===
PK: ID (int)
Von, Bis (datetime) billing period
AbrechnungHeizung, AbrechnungWarmwasser, AbrechnungNebenkosten, AbrechnungKaltwasser (bool)
Tarifabrechnung, BHKW, HeizsaldoInNebenkosten, AbrechnungLegionellen, AbrechnungRauchmelder (bool)
=== nebenkosten (42209 rows) ancillary cost items ===
PK: ID (int)
Von, Bis (datetime)
Bezeichnung (longtext), Mwst (decimal), Brutto (decimal)
EinheitDerKostenart (longtext), Umlage (int), UmlageZiel (int)
ZugeordnetesObjektID (FK objekte.ID)
NurEigentuemer, NurNutzer (bool)
=== vorauszahlungen (83932 rows) advance payments per tenant ===
PK: ID (int)
ZugeordneterNutzerID (FK nutzer.ID)
BetragNebenkosten, BetragHeizkosten, BetragWarmwasser (decimal)
Von, Bis (datetime), IstNetto (bool)
=== heizbetriebskosten (22557 rows) heating operation costs ===
PK: ID (int)
Von, Bis (datetime), Bezeichnung (longtext)
Mwst (decimal), Brutto (decimal), Art (int)
ZugeordnetesObjektID (FK objekte.ID)
ZugeordneteVerbrauchsgruppeID (FK)
=== brennstofflieferungen (6477 rows) fuel deliveries ===
PK: ID (int)
GeliefertAm (datetime), Menge (decimal), Betrag (decimal)
Mwst (decimal), Heizwert (decimal)
Anfangsstand, Endstand (decimal)
ZugeordneterEnergieVerwerterID (FK), BrennstoffMediumID (FK)
ZugeordneteAbrechnungsinformationID (FK abrechnungsinformationen.ID)
=== vertragpositionen (4395 rows) contract line items ===
PK: ID (int)
LaufzeitVon, LaufzeitBis (datetime)
Menge (decimal), Gesamtpreis (decimal), PreisProEinheit (decimal), Mwst (decimal)
ArtikelID (FK artikel.ID), VertragNummer (longtext)
Art (int), Umlage (int)
JOIN PATTERNS (use exactly):
Kunde Objekte: JOIN objektkunde ok ON ok.KundeID = k.ID JOIN objekte o ON o.ID = ok.ObjektID
Objekt Adresse: JOIN objektadressen oa ON oa.ObjektID = o.ID JOIN adressen a ON a.ID = oa.AdresseID
@ -126,6 +209,13 @@ Objekt → NE: JOIN nutzeinheit ne ON ne.ObjektID = o.ID
NE Nutzer: JOIN nutzer nu ON nu.NutzeinheitID = ne.ID
NE Geraete: JOIN geraete g ON g.NutzeinheitID = ne.ID
Geraet Verbrauch: JOIN geraeteverbraeuche gv ON gv.GeraetID = g.ID
Auftrag Positionen: JOIN auftragspositionen ap ON ap.AuftragID = a.ID
Auftrag Abrechnung: JOIN abrechnungsinformationen ai ON ai.ID = a.ZugeordneteAbrechnungsinformationID
Artikelpos Artikel: JOIN artikel art ON art.ID = ap.ZugewiesenerArtikelID
Artikelpos Rechnung: JOIN rechnung r ON r.ID = ap.RechnungID
Artikelpos Abrechnung: JOIN abrechnungsinformationen ai ON ai.ID = ap.ZugewieseneAbrechnungID
Nebenkosten Objekt: JOIN objekte o ON o.ID = nk.ZugeordnetesObjektID
Vorauszahlung Nutzer: JOIN nutzer nu ON nu.ID = vz.ZugeordneterNutzerID
RULES:
- For tables listed above: use ONLY the listed column names. Never guess.

View File

@ -38,28 +38,38 @@ Given a job description, produce a JSON tool sequence to accomplish it.
Available tools:
- query_db(query, database) SQL SELECT/DESCRIBE/SHOW only
- emit_actions(actions) show buttons [{{label, action, payload?}}]
- emit_actions(actions) show buttons [{label, action, payload?}]
- set_state(key, value) persistent key-value
- create_machine(id, initial, states) interactive UI navigation
- add_state / reset_machine / destroy_machine machine lifecycle
- update_machine(id, data) update wizard data fields (e.g. {"bundesland": "Bayern"})
- transition_machine(id, target) move machine to a specific state
- emit_artifact(type, data, actions?, meta?) emit a typed workspace artifact:
type="entity_detail": data={title, subtitle?, fields:[{label,value}]}, actions=[{label,action}]
type="data_table": data={title?, columns:[str], rows:[{col:val}]}
type="document_page": data={title, sections:[{heading,content}]}
type="action_bar": actions=[{label, action, payload?}]
type="status": data={label, value?, display_type:"progress"|"info"|"text"}
NOTE: Cards are generated automatically in the response step from query results.
Do NOT plan emit_card or emit_list just query the data and the system handles display.
PREFERRED: Use emit_artifact for all display output. Legacy emit_card/emit_display still work but emit_artifact is cleaner.
Cards are also generated automatically in the response step from query results.
Output ONLY valid JSON:
{{
{
"tool_sequence": [
{{"tool": "query_db", "args": {{"query": "SELECT ...", "database": "{database}"}}}}
{"tool": "query_db", "args": {"query": "SELECT ...", "database": "{database}"}}
],
"response_hint": "How to phrase the result"
}}
}
Rules:
- NEVER guess column names. Use ONLY columns from the schema.
- Max 5 tools. Keep it focused.
- For entity details: query all relevant fields, the response step creates the card.
- For lists: query multiple rows, the table renders automatically.
- The job is self-contained."""
- The job is self-contained.
- NEVER answer data questions without querying the database. You MUST include at least one query_db call for any job that asks about data, counts, costs, or entities. If you are unsure which tables to use, start with DESCRIBE or SELECT * FROM table LIMIT 3 to explore.
- An EMPTY tool_sequence is ONLY acceptable if the job explicitly asks for a UI-only action (buttons, machine, display) with no data lookup."""
RESPONSE_SYSTEM = """You are a domain expert summarizing results for the user.
@ -70,22 +80,22 @@ Job: {job}
Output a JSON object with "text" (response to user) and optionally "card" (structured display):
{{
{
"text": "Concise natural response, 1-3 sentences. Reference data. Match language: {language}.",
"card": {{
"card": {
"title": "Entity Name or ID",
"subtitle": "Type or category",
"fields": [{{"label": "Field", "value": "actual value from results"}}],
"actions": [{{"label": "Next action", "action": "action_id"}}]
}}
}}
"fields": [{"label": "Field", "value": "actual value from results"}],
"actions": [{"label": "Next action", "action": "action_id"}]
}
}
Rules:
- "text" is REQUIRED. Keep it short.
- "card" is OPTIONAL. Include it for single-entity details (Kunde, Objekt, Auftrag).
- Card fields must use ACTUAL values from the query results, never templates/placeholders.
- For lists of multiple entities, use multiple fields or skip the card.
- If no card makes sense, just return {{"text": "..."}}.
- If no card makes sense, just return {"text": "..."}.
- Output ONLY valid JSON."""
def __init__(self, send_hud, process_manager=None):
@ -113,10 +123,12 @@ Rules:
plan_prompt += f" DESCRIBE result: {err['describe'][:300]}\n"
plan_prompt += "\nFix the query. If a column was unknown, use the DESCRIBE result above or try SELECT * LIMIT 3 to see actual columns."
plan_system = self.PLAN_SYSTEM
plan_system = plan_system.replace("{domain}", self.DOMAIN_SYSTEM)
plan_system = plan_system.replace("{schema}", self.SCHEMA)
plan_system = plan_system.replace("{database}", self.default_database)
plan_messages = [
{"role": "system", "content": self.PLAN_SYSTEM.format(
domain=self.DOMAIN_SYSTEM, schema=self.SCHEMA,
database=self.default_database)},
{"role": "system", "content": plan_system},
{"role": "user", "content": plan_prompt},
]
plan_raw = await llm_call(self.model, plan_messages)
@ -129,6 +141,7 @@ Rules:
state_updates = {}
display_items = []
machine_ops = []
artifacts = []
tool_used = ""
tool_output = ""
had_error = False
@ -162,6 +175,20 @@ Rules:
machine_ops.append({"op": "reset", **args})
elif tool == "destroy_machine":
machine_ops.append({"op": "destroy", **args})
elif tool == "update_machine":
machine_ops.append({"op": "update_data", **args})
elif tool == "transition_machine":
machine_ops.append({"op": "transition", **args})
elif tool == "emit_artifact":
import uuid
artifact = {
"id": args.get("id", str(uuid.uuid4())[:8]),
"type": args.get("type", "status"),
"data": args.get("data", {}),
"actions": args.get("actions", []),
"meta": args.get("meta", {}),
}
artifacts.append(artifact)
elif tool == "query_db":
query = args.get("query", "")
database = args.get("database", self.default_database)
@ -213,9 +240,13 @@ Rules:
for err in errors_so_far[-2:]:
results_text += f" {err['error'][:100]}\n"
resp_system = self.RESPONSE_SYSTEM
resp_system = resp_system.replace("{domain}", self.DOMAIN_SYSTEM)
resp_system = resp_system.replace("{job}", job)
resp_system = resp_system.replace("{results}", results_text)
resp_system = resp_system.replace("{language}", language)
resp_messages = [
{"role": "system", "content": self.RESPONSE_SYSTEM.format(
domain=self.DOMAIN_SYSTEM, job=job, results=results_text, language=language)},
{"role": "system", "content": resp_system},
{"role": "user", "content": job},
]
raw_response = await llm_call(self.model, resp_messages)
@ -231,7 +262,14 @@ Rules:
text = text.strip()
resp_data = json.loads(text)
response = resp_data.get("text", raw_response)
if resp_data.get("card"):
if resp_data.get("artifact"):
# New: artifact in response JSON
art = resp_data["artifact"]
import uuid
if "id" not in art:
art["id"] = str(uuid.uuid4())[:8]
artifacts.append(art)
elif resp_data.get("card"):
card = resp_data["card"]
card["type"] = "card"
display_items.append(card)
@ -248,6 +286,8 @@ Rules:
state_updates=state_updates,
display_items=display_items,
machine_ops=machine_ops,
errors=errors_so_far,
artifacts=artifacts,
)
def _parse_plan(self, raw: str) -> tuple[list, str]:

View File

@ -22,7 +22,7 @@ Listener: {identity} on {channel}
Return ONLY valid JSON. No markdown, no explanation.
Schema:
{{
{
"who": "name or unknown",
"language": "en | de | mixed",
"intent": "question | request | social | action | feedback",
@ -30,7 +30,7 @@ Schema:
"tone": "casual | frustrated | playful | urgent",
"complexity": "trivial | simple | complex",
"context": "brief note or empty"
}}
}
Rules:
- Classify the CURRENT message only. Previous messages are context, not the target.
@ -53,11 +53,11 @@ Rules:
casual = neutral
Examples:
"hi there!" -> {{"language":"en","intent":"social","tone":"casual","complexity":"trivial"}}
"Wie spaet ist es?" -> {{"language":"de","intent":"question","tone":"casual","complexity":"simple"}}
"this is broken, nothing works" -> {{"language":"en","intent":"feedback","tone":"frustrated","complexity":"simple"}}
"create two buttons" -> {{"language":"en","intent":"request","tone":"casual","complexity":"simple"}}
"ok thanks bye" -> {{"language":"en","intent":"social","tone":"casual","complexity":"trivial"}}
"hi there!" -> {"language":"en","intent":"social","tone":"casual","complexity":"trivial"}
"Wie spaet ist es?" -> {"language":"de","intent":"question","tone":"casual","complexity":"simple"}
"this is broken, nothing works" -> {"language":"en","intent":"feedback","tone":"frustrated","complexity":"simple"}
"create two buttons" -> {"language":"en","intent":"request","tone":"casual","complexity":"simple"}
"ok thanks bye" -> {"language":"en","intent":"social","tone":"casual","complexity":"trivial"}
{memory_context}"""
@ -78,8 +78,9 @@ Examples:
history_summary = "Recent conversation:\n" + "\n".join(lines)
messages = [
{"role": "system", "content": self.SYSTEM.format(
memory_context=memory_context, identity=identity, channel=channel)},
{"role": "system", "content": self.SYSTEM.replace(
"{memory_context}", memory_context).replace(
"{identity}", identity).replace("{channel}", channel)},
]
if history_summary:
messages.append({"role": "user", "content": history_summary})

View File

@ -26,6 +26,19 @@ Given the conversation so far, output a JSON object with these fields:
- language: string primary language being used (en, de, mixed)
- style_hint: string how Output should talk (casual, formal, technical, poetic, etc.)
- facts: list of strings important facts learned about the user. NEVER drop facts from the existing list unless they are proven wrong. Always include all existing facts plus any new ones.
- user_expectation: string what the user expects the agent to do next. One of:
"conversational" default. User is chatting, asking questions, browsing. Normal back-and-forth.
"delegated" user gave an imperative task ("build X", "do Y", "create Z"). They expect autonomous progress, not clarifying questions.
"waiting_input" agent asked a question or presented choices. User's next message is likely an answer.
"observing" user returned after being idle, or is reviewing a large output. Brief responses, wait for explicit engagement.
Cues:
- Imperative verbs + task scope ("build", "create", "do", "find") delegated
- Agent ended with "Moment..." / thinking message but user hasn't seen full results yet → delegated (task still in progress)
- Short follow-ups like "und?", "ja?", "weiter?", "and?", "so?", "result?", "ergebnis?" waiting_input (user is waiting for the agent to deliver)
- Agent ended with a question ("Sollen wir...?", "Gibt es...?") waiting_input
- User said "ok/thanks/bye/danke" after output observing
- Everything else conversational
IMPORTANT: If the agent just delivered partial results or said "Moment..." and the user sends a short nudge, that is ALWAYS waiting_input, never conversational.
Output ONLY valid JSON. No explanation, no markdown fences."""
@ -40,6 +53,7 @@ Output ONLY valid JSON. No explanation, no markdown fences."""
"language": "en",
"style_hint": "casual, technical",
"facts": [],
"user_expectation": "conversational",
}
def get_context_block(self, sensor_lines: list[str] = None, ui_state: dict = None) -> str:

View File

@ -34,6 +34,12 @@ YOUR JOB: Transform the Thinker's reasoning into a natural, human-readable text
- Keep the user's language — if they wrote German, respond in German.
- Be concise. Don't describe data that the UI node will show as a table.
PHRASING by user_expectation (from memorizer):
- "delegated": progress-report style. State what was done and what's next. No questions unless blocked.
- "waiting_input": acknowledge the user's answer and continue the flow naturally.
- "observing": keep it brief. No unsolicited follow-up questions or suggestions.
- "conversational": natural, warm dialogue. Follow-ups are fine.
{memory_context}"""
async def process(self, thought: ThoughtResult, history: list[dict],
@ -42,7 +48,7 @@ YOUR JOB: Transform the Thinker's reasoning into a natural, human-readable text
await self.hud("streaming")
messages = [
{"role": "system", "content": self.SYSTEM.format(memory_context=memory_context)},
{"role": "system", "content": self.SYSTEM.replace("{memory_context}", memory_context)},
]
for msg in history[-20:]:
messages.append(msg)

View File

@ -27,6 +27,8 @@ Experts have these tools:
- query_db SQL queries on their domain database
- emit_actions create buttons on the dashboard
- create_machine / add_state / reset_machine / destroy_machine interactive UI components
- update_machine(id, data) update wizard data fields on existing machine
- transition_machine(id, target) move machine to a specific state
- set_state persistent key-value store
- emit_display formatted data display
@ -36,13 +38,13 @@ YOUR JOB:
3. Only respond directly for social chat (greetings, thanks, bye, small talk)
Output ONLY valid JSON:
{{
{
"expert": "{expert_names} | none",
"job": "Self-contained task. Include ALL context — the expert has NO conversation history. Describe what to query, what UI to build, what the user expects to see.",
"thinking_message": "Short message for user while expert works, in their language",
"response_hint": "If expert=none, your direct response to the user.",
"language": "de | en | mixed"
}}
}
Rules:
- expert=none ONLY for social chat (hi, thanks, bye, how are you)
@ -53,6 +55,16 @@ Rules:
- thinking_message: natural, in user's language. e.g. "Moment, ich schaue nach..."
- If the user mentions data, tables, customers, devices, buttons, counters expert
- When unsure which expert: pick the one whose domain matches best
- MACHINE STATE: If there are active machines/wizards listed in the context below, ALWAYS include the machine's current state and stored data in the job. The expert needs this to continue the workflow. Example: "Machine 'angebot_wizard' is on step 'select_age', data: {bundesland: Bayern}. User asks: ..."
- If the user asks about their wizard/workflow progress and the info is already visible in the context, respond directly (expert=none) using the machine state from context. Only route to expert if the user needs data queried or tools called.
- For update_machine / transition_machine requests: route to expert with the machine ID and operation details in the job.
USER EXPECTATION (from memorizer):
- If user_expectation is "delegated": formulate comprehensive, autonomous jobs. Do NOT include clarifying questions in the job. Tell the expert to proceed and report results.
- If user_expectation is "waiting_input": the user is waiting for results or nudging ("und?", "ja?", "weiter?"). Look at conversation history to find what they were waiting for and re-formulate that job. If they answered a question you asked, extract their answer and fold it into context.
- If user_expectation is "observing": only route to expert if the user explicitly asks for something. Otherwise respond directly with brief acknowledgment.
- If user_expectation is "conversational": normal routing behavior.
- CONTINUATION: When user sends a very short message (1-3 words like "und?", "weiter", "ja") after partial/incomplete results, treat it as "continue the previous task". Include the original question and any partial results in the job.
{memory_context}"""
@ -89,10 +101,15 @@ Rules:
expert_lines.append("- (no experts available — handle everything directly)")
expert_names = " | ".join(self._available_experts) if self._available_experts else "none"
# Manual substitution to avoid .format() breaking on curly braces in memory_context
system_content = self.SYSTEM
system_content = system_content.replace("{memory_context}", memory_context)
system_content = system_content.replace("{identity}", identity)
system_content = system_content.replace("{channel}", channel)
system_content = system_content.replace("{experts}", "\n".join(expert_lines))
system_content = system_content.replace("{expert_names}", expert_names)
messages = [
{"role": "system", "content": self.SYSTEM.format(
memory_context=memory_context, identity=identity, channel=channel,
experts="\n".join(expert_lines), expert_names=expert_names)},
{"role": "system", "content": system_content},
]
# Summarize recent history (PA sees full context)
@ -118,7 +135,7 @@ Rules:
log.info(f"[pa] raw: {raw[:300]}")
routing = self._parse_routing(raw, command)
await self.hud("routed", expert=routing.expert, job=routing.job[:100],
await self.hud("routed", expert=routing.expert, job=(routing.job or "")[:100],
direct=routing.expert == "none")
# Update directive style based on tone
@ -131,6 +148,72 @@ Rules:
return routing
async def route_retry(self, command: Command, history: list[dict],
                      memory_context: str = "", identity: str = "unknown",
                      channel: str = "unknown", original_job: str = "",
                      errors: list = None) -> PARouting:
    """Re-route after expert failure. PA reformulates with error context.

    Builds a retry prompt from the original job and the most recent expert
    errors, sends it to the model together with the regular PA system
    prompt and a short history digest, and parses the reply into a routing.

    Args:
        command: Current command; passed to ``_parse_routing``.
        history: Conversation messages; only the last 8 are summarized,
            each truncated to 200 characters.
        memory_context: Text substituted for ``{memory_context}`` in SYSTEM.
        identity: Text substituted for ``{identity}`` in SYSTEM.
        channel: Text substituted for ``{channel}`` in SYSTEM.
        original_job: The job text the expert failed on.
        errors: Error dicts read via the keys ``query``, ``error`` and
            (optionally) ``describe``. Only the last 3 are included.

    Returns:
        The PARouting produced by ``_parse_routing`` from the model reply.
    """
    import re  # local import: used only for the single-pass substitution below

    await self.hud("thinking", detail="reformulating after expert failure")

    # Summarize at most the three latest errors. `or '?'` (instead of a
    # .get default) also covers keys that are present but set to None.
    error_lines = []
    for err in (errors or [])[-3:]:
        error_lines.append(f"- Query: {str(err.get('query') or '?')[:100]}")
        error_lines.append(f" Error: {str(err.get('error') or '?')[:100]}")
        if err.get("describe"):
            error_lines.append(f" Schema: {str(err['describe'])[:200]}")
    if not error_lines:
        # Keep the ERRORS section meaningful when no details were passed in.
        error_lines.append("- (no error details were reported)")
    error_block = "\n".join(error_lines)

    retry_prompt = f"""The expert FAILED the previous job. You must reformulate.
ORIGINAL JOB: {original_job}
ERRORS:
{error_block}
REFORMULATE the job with a DIFFERENT approach:
- If the query was too complex (JOINs, window functions), break it into simpler steps
- If columns were wrong, use the DESCRIBE info above to fix them
- If the table structure is unclear, tell the expert to first explore with SELECT * LIMIT 5
- Think about what data the user actually needs and find a simpler path to it
Output the same JSON format as before. The job MUST be different from the original."""

    expert_lines = []
    for name in self._available_experts:
        desc = self.EXPERT_DESCRIPTIONS.get(name, f"{name} — domain expert")
        expert_lines.append(f"- {desc}")
    expert_names = " | ".join(self._available_experts) if self._available_experts else "none"

    # Fill all placeholders in ONE pass. Chained str.replace() calls would
    # re-scan already inserted text, so a "{channel}" or "{identity}" token
    # inside memory_context would be expanded by a later call. str.format()
    # is not an option because SYSTEM contains literal JSON braces.
    substitutions = {
        "memory_context": memory_context,
        "identity": identity,
        "channel": channel,
        "experts": "\n".join(expert_lines),
        "expert_names": expert_names,
    }
    placeholder = re.compile(
        r"\{(" + "|".join(re.escape(key) for key in substitutions) + r")\}")
    # A callable replacement avoids backslash-escape processing of the values.
    system_content = placeholder.sub(
        lambda match: substitutions[match.group(1)], self.SYSTEM)

    messages = [
        {"role": "system", "content": system_content},
    ]

    recent = history[-8:]
    if recent:
        lines = []
        for msg in recent:
            role = msg.get("role", "?")
            # Content may be present but None; coerce before truncating.
            content = str(msg.get("content") or "")[:200]
            lines.append(f" {role}: {content}")
        messages.append({"role": "user", "content": "Recent conversation:\n" + "\n".join(lines)})
        # Synthetic assistant turn so the retry prompt follows as a new user turn.
        messages.append({"role": "assistant", "content": "OK, I have the context."})

    messages.append({"role": "user", "content": retry_prompt})
    messages = self.trim_context(messages)

    raw = await llm_call(self.model, messages)
    log.info(f"[pa] retry raw: {raw[:300]}")

    routing = self._parse_routing(raw, command)
    await self.hud("routed", expert=routing.expert, job=(routing.job or "")[:100],
                   direct=routing.expert == "none", retry=True)
    return routing
def _parse_routing(self, raw: str, command: Command) -> PARouting:
"""Parse LLM JSON into PARouting with fallback."""
text = raw.strip()
@ -149,10 +232,10 @@ Rules:
expert = "none"
return PARouting(
expert=expert,
job=data.get("job", ""),
thinking_message=data.get("thinking_message", ""),
response_hint=data.get("response_hint", ""),
language=data.get("language", command.analysis.language),
job=data.get("job") or "",
thinking_message=data.get("thinking_message") or "",
response_hint=data.get("response_hint") or "",
language=data.get("language") or command.analysis.language,
)
except (json.JSONDecodeError, Exception) as e:
log.error(f"[pa] parse failed: {e}, raw: {text[:200]}")

View File

@ -236,7 +236,7 @@ You are one node in a pipeline: Input (perceives) -> You (reason) -> Output (spe
1. emit_actions() show buttons. Button clicks come back as "ACTION: action_name".
Stateful buttons: include var/op in payload (inc/dec/set/toggle). UI handles locally.
Example: label:"+1", action:"increment", payload:{{"var":"count","op":"inc","initial":0}}
Example: label:"+1", action:"increment", payload:{"var":"count","op":"inc","initial":0}
2. set_state(key, value) persistent key-value store shown as live labels.
Survives across turns. Use for tracking mode, progress, flags.
@ -253,9 +253,9 @@ You are one node in a pipeline: Input (perceives) -> You (reason) -> Output (spe
destroy_machine(id) remove machine from dashboard.
Example navigation menu:
create_machine(id="nav", initial="main", states=[
{{"name":"main","buttons":[{{"label":"Menu 1","action":"menu_1","go":"sub1"}},{{"label":"Menu 2","action":"menu_2","go":"sub2"}}],"content":["Welcome"]}},
{{"name":"sub1","buttons":[{{"label":"Back","action":"back","go":"main"}}],"content":["Sub 1 details"]}},
{{"name":"sub2","buttons":[{{"label":"Back","action":"back","go":"main"}}],"content":["Sub 2 details"]}}
{"name":"main","buttons":[{"label":"Menu 1","action":"menu_1","go":"sub1"},{"label":"Menu 2","action":"menu_2","go":"sub2"}],"content":["Welcome"]},
{"name":"sub1","buttons":[{"label":"Back","action":"back","go":"main"}],"content":["Sub 1 details"]},
{"name":"sub2","buttons":[{"label":"Back","action":"back","go":"main"}],"content":["Sub 2 details"]}
])
PREFER machines over emit_actions for anything with navigation or multiple views.
ALWAYS include states when creating a machine. Never write code use the tool.
@ -350,10 +350,10 @@ conn.commit()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
tables = cursor.fetchall()
for t in tables:
cursor.execute(f"SELECT * FROM {{t[0]}}")
cursor.execute(f"SELECT * FROM {t[0]}")
rows = cursor.fetchall()
cols = [d[0] for d in cursor.description]
print(f"Table: {{t[0]}}")
print(f"Table: {t[0]}")
print(" | ".join(cols))
for row in rows:
print(" | ".join(str(c) for c in row))
@ -446,7 +446,7 @@ conn.close()'''
await self.hud("thinking", detail="reasoning about response")
messages = [
{"role": "system", "content": self.SYSTEM.format(memory_context=memory_context)},
{"role": "system", "content": self.SYSTEM.replace("{memory_context}", memory_context)},
]
for msg in history[-12:]:
messages.append(msg)

View File

@ -88,7 +88,7 @@ Rules:
hint += f"\nTool result:\n{tool_output[:500]}"
messages = [
{"role": "system", "content": self.RESPONSE_SYSTEM.format(hint=hint)},
{"role": "system", "content": self.RESPONSE_SYSTEM.replace("{hint}", hint)},
]
for msg in history[-8:]:
messages.append(msg)

View File

@ -2,9 +2,10 @@
import json
import logging
import uuid
from .base import Node
from ..types import ThoughtResult
from ..types import ThoughtResult, Artifact
log = logging.getLogger("runtime")
@ -16,6 +17,7 @@ class UINode(Node):
def __init__(self, send_hud):
super().__init__(send_hud)
self.thinker_controls: list[dict] = [] # buttons, labels, tables from Thinker
self.artifacts: list[dict] = [] # typed workspace artifacts
self.state: dict = {} # {"count": 0, "theme": "dark", ...}
self.bindings: dict = {} # {"increment": {"op": "inc", "var": "count"}, ...}
self.machines: dict = {} # {"nav": {initial, states, current}, ...}
@ -79,6 +81,7 @@ class UINode(Node):
"initial": initial,
"current": initial,
"states": states,
"data": {}, # wizard field storage (e.g. {"bundesland": "Bayern"})
}
log.info(f"[ui] machine created: {mid} (initial={initial}, {len(states)} states)")
await self.hud("machine_created", id=mid, initial=initial, state_count=len(states))
@ -104,6 +107,28 @@ class UINode(Node):
log.info(f"[ui] machine reset: {mid} -> {initial}")
await self.hud("machine_reset", id=mid, state=initial)
elif op == "update_data":
if mid not in self.machines:
log.warning(f"[ui] update_data: machine '{mid}' not found")
continue
data_update = op_data.get("data", {})
self.machines[mid]["data"].update(data_update)
log.info(f"[ui] machine data updated: {mid} += {data_update}")
await self.hud("machine_data_updated", id=mid, data=data_update)
elif op == "transition":
if mid not in self.machines:
log.warning(f"[ui] transition: machine '{mid}' not found")
continue
target = op_data.get("target", "")
if target in self.machines[mid]["states"]:
old = self.machines[mid]["current"]
self.machines[mid]["current"] = target
log.info(f"[ui] machine transition (expert): {mid} {old} -> {target}")
await self.hud("machine_transitioned", id=mid, old=old, target=target)
else:
log.warning(f"[ui] transition target '{target}' not found in {mid}")
elif op == "destroy":
if mid in self.machines:
del self.machines[mid]
@ -157,15 +182,31 @@ class UINode(Node):
return controls
def get_machine_summary(self) -> str:
"""Summary for Thinker context — shape only, not full data."""
"""Rich summary for PA/Thinker context — includes current state details and stored data."""
if not self.machines:
return ""
parts = []
for mid, m in self.machines.items():
current = m["current"]
state_names = list(m["states"].keys())
parts.append(f" machine '{mid}': state={current}, states={state_names}")
return "Machines:\n" + "\n".join(parts)
state_def = m["states"].get(current, {})
line = f" machine '{mid}': state={current}, states={state_names}"
# Current state content
content = state_def.get("content", [])
if content:
line += f", content={content}"
# Current state buttons
buttons = state_def.get("buttons", [])
if buttons:
btn_labels = [b.get("label", b.get("action", "?")) for b in buttons if isinstance(b, dict)]
if btn_labels:
line += f", buttons={btn_labels}"
# Stored wizard data
data = m.get("data", {})
if data:
line += f", data={data}"
parts.append(line)
return "Active machines (interactive wizard/workflow state):\n" + "\n".join(parts)
# --- State operations ---
@ -343,21 +384,155 @@ class UINode(Node):
return controls
def _build_artifacts(self, thought: ThoughtResult) -> list[dict]:
    """Convert a ThoughtResult plus the current UI state into typed artifacts.

    Artifacts are appended in a fixed order: the expert's own artifacts,
    converted display items, at most one action bar, a data table extracted
    from the tool output, one status artifact per state variable, and one
    machine artifact per registered machine.

    Side effect: every entry of ``thought.state_updates`` is applied through
    ``self.set_var`` before the state variables are rendered.

    Args:
        thought: Result of the thinker/expert turn.

    Returns:
        A new list of artifact dicts. Generated artifacts carry ``id``,
        ``type``, ``data``, ``actions`` and ``meta``; artifacts supplied by
        the expert are passed through with whatever keys they had, plus a
        generated ``id`` when missing.
    """

    def short_id() -> str:
        # First eight characters of a random UUID.
        return str(uuid.uuid4())[:8]

    arts: list[dict] = []

    # 1. Direct artifacts from the expert's emit_artifact calls.
    #    Entries may be plain dicts or Artifact instances (anything exposing
    #    to_dict); dicts are copied so the caller's objects are not mutated.
    for raw in thought.artifacts or []:
        if isinstance(raw, dict):
            art = dict(raw)
        elif hasattr(raw, "to_dict"):
            art = raw.to_dict()
        else:
            log.warning(f"[ui] skipping unsupported artifact: {raw!r}")
            continue
        if not art.get("id"):
            art["id"] = short_id()
        arts.append(art)

    # 2. Convert display_items (cards, lists) -> entity_detail artifacts;
    #    every other display type becomes a status artifact.
    for item in thought.display_items or []:
        if not isinstance(item, dict):
            continue  # malformed entry: nothing to render
        item_type = item.get("type", "text")
        if item_type == "card":
            arts.append({
                "id": short_id(),
                "type": "entity_detail",
                "data": {
                    "title": item.get("title", ""),
                    "subtitle": item.get("subtitle", ""),
                    "fields": item.get("fields", []),
                },
                "actions": item.get("actions", []),
                "meta": {},
            })
        elif item_type == "list":
            arts.append({
                "id": short_id(),
                "type": "entity_detail",
                "data": {
                    "title": item.get("title", ""),
                    "items": item.get("items", []),
                },
                "actions": [],
                "meta": {"list": True},
            })
        else:
            arts.append({
                "id": short_id(),
                "type": "status",
                "data": {
                    "display_type": item_type,
                    "label": item.get("label", ""),
                    "value": item.get("value", ""),
                    "style": item.get("style", ""),
                },
                "actions": [],
                "meta": {},
            })

    # 3. Convert actions -> action_bar artifact. Without new actions, the
    #    buttons already held in thinker_controls are kept as the action bar.
    if thought.actions:
        btns = self._parse_thinker_actions(thought.actions)
        arts.append({
            "id": "action_bar",
            "type": "action_bar",
            "data": {},
            "actions": [{"label": b["label"], "action": b["action"],
                         "payload": b.get("payload", {})} for b in btns],
            "meta": {},
        })
    elif self.thinker_controls:
        existing_btns = [c for c in self.thinker_controls if c.get("type") == "button"]
        if existing_btns:
            arts.append({
                "id": "action_bar",
                "type": "action_bar",
                "data": {},
                "actions": [{"label": b["label"], "action": b["action"],
                             "payload": b.get("payload", {})} for b in existing_btns],
                "meta": {},
            })

    # 4. Convert tool_output table -> data_table artifact.
    if thought.tool_output:
        table = self._extract_table(thought.tool_output)
        if table:
            arts.append({
                "id": short_id(),
                "type": "data_table",
                "data": {
                    "columns": table["columns"],
                    "rows": table["data"],
                },
                "actions": [],
                "meta": {"source": thought.tool_used or "query_db"},
            })

    # 5. State variables -> status artifacts (after applying this turn's updates).
    if thought.state_updates:
        for key, value in thought.state_updates.items():
            self.set_var(key, value)
    for var, value in self.state.items():
        arts.append({
            "id": f"state_{var}",
            "type": "status",
            "data": {"label": var, "value": str(value), "display_type": "text"},
            "actions": [],
            "meta": {"state_var": True},
        })

    # 6. Machines -> machine artifacts showing the current state only.
    for mid, machine in self.machines.items():
        current = machine["current"]
        state_def = machine["states"].get(current, {})
        arts.append({
            "id": f"machine_{mid}",
            "type": "machine",
            "data": {
                "machine_id": mid,
                "current": current,
                "states": list(machine["states"].keys()),
                "content": state_def.get("content", []),
                "stored_data": machine.get("data", {}),
            },
            "actions": [{"label": b.get("label", ""), "action": b.get("action", ""),
                         "go": b.get("go", "")}
                        for b in state_def.get("buttons", []) if isinstance(b, dict)],
            "meta": {"live": True},
        })

    return arts
def get_artifacts(self) -> list[dict]:
    """Expose the artifact list currently stored on this node."""
    current = self.artifacts
    return current
async def process(self, thought: ThoughtResult, history: list[dict],
memory_context: str = "") -> list[dict]:
# Apply machine ops first (create/add_state/reset/destroy)
if thought.machine_ops:
await self.apply_machine_ops(thought.machine_ops)
# Build artifacts (new system)
self.artifacts = self._build_artifacts(thought)
# Build legacy controls (backward compat)
thinker_ctrls = self._build_controls(thought)
if thinker_ctrls:
self.thinker_controls = thinker_ctrls
# Always emit the merged view (thinker + machine)
merged = self.current_controls
if merged:
if merged or self.artifacts:
await self.hud("controls", controls=merged)
log.info(f"[ui] emitting {len(merged)} controls ({len(self.thinker_controls)} thinker + {len(self.get_machine_controls())} machine)")
log.info(f"[ui] emitting {len(merged)} controls + {len(self.artifacts)} artifacts")
else:
await self.hud("decided", instruction="no new controls")

View File

@ -56,6 +56,13 @@ class OutputSink:
except Exception:
pass
async def send_artifacts(self, artifacts: list):
    """Push the artifact list to the client as an ``artifacts`` message.

    Best effort: does nothing without a websocket, and any error raised
    while serialising or sending is swallowed.
    """
    if not self.ws:
        return
    try:
        message = {"type": "artifacts", "artifacts": artifacts}
        await self.ws.send_text(json.dumps(message))
    except Exception:
        pass
async def send_hud(self, data: dict):
if self.ws:
try:
@ -221,9 +228,10 @@ class Runtime:
self.history.append({"role": "user", "content": action_desc})
sensor_lines = self.sensor.get_context_lines()
director_line = self.director.get_context_line()
director_line = self.director.get_context_line() if self.director else ""
mem_ctx = self.memorizer.get_context_block(sensor_lines=sensor_lines, ui_state=self.ui_node.state)
mem_ctx += f"\n\n{director_line}"
if director_line:
mem_ctx += f"\n\n{director_line}"
command = Command(
analysis=InputAnalysis(intent="action", topic=action, complexity="simple"),
@ -242,7 +250,7 @@ class Runtime:
self.history.append({"role": "assistant", "content": response})
await self.memorizer.update(self.history)
if not self.is_v2:
if not self.is_v2 and self.director:
await self.director.update(self.history, self.memorizer.state)
if len(self.history) > self.MAX_HISTORY:
@ -319,9 +327,10 @@ class Runtime:
# Check Sensor flags (idle return, workspace mismatch)
sensor_flags = self.sensor.consume_flags()
sensor_lines = self.sensor.get_context_lines()
director_line = self.director.get_context_line()
director_line = self.director.get_context_line() if self.director else ""
mem_ctx = self.memorizer.get_context_block(sensor_lines=sensor_lines, ui_state=self.ui_node.state)
mem_ctx += f"\n\n{director_line}"
if director_line:
mem_ctx += f"\n\n{director_line}"
machine_summary = self.ui_node.get_machine_summary()
if machine_summary:
mem_ctx += f"\n\n{machine_summary}"

View File

@ -76,6 +76,19 @@ class PARouting:
language: str = "de" # Response language
@dataclass
class Artifact:
    """A typed workspace item. The unit of workspace content.

    ``type`` selects how the item is rendered; ``data``, ``actions`` and
    ``meta`` carry the type-specific payload described on each field.
    """
    id: str  # unique ID
    type: str  # entity_detail | data_table | document_page | action_bar | status | machine
    data: dict = field(default_factory=dict)  # type-specific payload
    actions: list = field(default_factory=list)  # [{label, action, payload?}]
    meta: dict = field(default_factory=dict)  # {entity?, related?, source_query?}

    def to_dict(self) -> dict:
        """Return this artifact as a plain dict (via ``dataclasses.asdict``)."""
        return asdict(self)
@dataclass
class ThoughtResult:
"""Thinker node's output — either a direct answer or tool results."""
@ -86,3 +99,5 @@ class ThoughtResult:
state_updates: dict = field(default_factory=dict) # {key: value} from set_state
display_items: list = field(default_factory=list) # [{type, label, value?, style?}] from emit_display
machine_ops: list = field(default_factory=list) # [{op, id, ...}] from machine tools
errors: list = field(default_factory=list) # [{query, error, describe?}] from failed retries
artifacts: list = field(default_factory=list) # [Artifact] from emit_artifact

View File

@ -382,6 +382,12 @@ def check_trace(trace: list, check: str) -> tuple[bool, str]:
return True, f"found reset_machine via machine_reset event"
if t.get("event") == "machine_destroyed" and tool_name == "destroy_machine":
return True, f"found destroy_machine via machine_destroyed event"
if t.get("event") == "machine_data_updated" and tool_name == "update_machine":
return True, f"found update_machine via machine_data_updated event"
if t.get("event") == "machine_transitioned" and tool_name == "transition_machine":
return True, f"found transition_machine via machine_transitioned event"
if t.get("event") == "pa_retry" and tool_name == "pa_retry":
return True, f"found pa_retry event"
return False, f"no tool_call '{tool_name}' in trace"
# machine_created id="NAV" — checks for specific machine creation

View File

@ -162,9 +162,17 @@ export function updateMeter(node, tokens, maxTokens, fillPct) {
export function updateAwarenessState(state) {
const body = document.getElementById('aw-state-body');
if (!body) return;
const expectation = state.user_expectation || 'conversational';
const expClass = {
conversational: 'aw-exp-conv',
delegated: 'aw-exp-deleg',
waiting_input: 'aw-exp-wait',
observing: 'aw-exp-obs',
}[expectation] || '';
const display = [
['user', state.user_name],
['mood', state.user_mood],
['expectation', expectation, expClass],
['topic', state.topic],
['lang', state.language],
['style', state.style_hint],
@ -173,8 +181,8 @@ export function updateAwarenessState(state) {
const facts = state.facts || [];
const history = state.topic_history || [];
let html = display.map(([k, v]) =>
`<div class="aw-row"><span class="aw-key">${esc(k)}</span><span class="aw-val">${esc(v || 'null')}</span></div>`
let html = display.map(([k, v, cls]) =>
`<div class="aw-row"><span class="aw-key">${esc(k)}</span><span class="aw-val ${cls || ''}">${esc(v || 'null')}</span></div>`
).join('');
if (facts.length) {

View File

@ -1,6 +1,9 @@
/** Dashboard: workspace controls rendering (buttons, tables, labels, displays, machines). */
/** Dashboard: workspace artifact + control rendering.
* Artifact system: typed artifacts (entity_detail, data_table, document_page, action_bar, status, machine).
* Legacy: dockControls() still works as fallback for old control format.
*/
import { esc } from './util.js';
import { esc, renderMarkdown } from './util.js';
import { addTrace } from './trace.js';
import { setDashboard } from './chat.js';
@ -8,8 +11,233 @@ let _ws = null;
export function setWs(ws) { _ws = ws; }
/**
 * Send a workspace action to the backend over the dashboard WebSocket.
 * Nothing is sent (and no trace entry is added) unless a socket is set
 * and its readyState is 1.
 */
function _sendAction(action, data) {
  const socketReady = Boolean(_ws) && _ws.readyState === 1;
  if (!socketReady) return;

  const message = { type: 'action', action, data: data || {} };
  _ws.send(JSON.stringify(message));
  addTrace('runtime', 'action', action);
}
// --- Artifact system ---
/**
 * Replace the workspace body with one wrapper element per artifact.
 *
 * Each wrapper gets the classes `ws-artifact ws-artifact-<type>` and is filled
 * by the renderer registered for the artifact's type; unknown types fall back
 * to an escaped JSON dump of `art.data`. The actions of all artifacts are then
 * flattened into button controls and handed to setDashboard.
 *
 * @param {Array<object>} artifacts Artifact objects; a missing or non-array
 *   value, and null/non-object entries, are treated as "nothing to render".
 */
export function dockArtifacts(artifacts) {
  const body = document.getElementById('workspace-body');
  if (!body) return;

  // Tolerate a malformed message instead of throwing inside the WS handler.
  const list = (Array.isArray(artifacts) ? artifacts : [])
    .filter((a) => a && typeof a === 'object');

  body.innerHTML = '';
  const container = document.createElement('div');
  container.className = 'artifacts-container';

  for (const art of list) {
    const wrapper = document.createElement('div');
    wrapper.className = 'ws-artifact ws-artifact-' + (art.type || 'unknown');
    wrapper.dataset.artifactId = art.id || '';

    // Own-property lookup: a type such as "constructor" or "toString" must not
    // resolve to a function inherited from Object.prototype.
    const renderer = Object.prototype.hasOwnProperty.call(RENDERERS, art.type)
      ? RENDERERS[art.type]
      : null;
    if (renderer) {
      renderer(wrapper, art);
    } else {
      wrapper.innerHTML = '<div class="ws-artifact-fallback">' + esc(JSON.stringify(art.data || {})) + '</div>';
    }
    container.appendChild(wrapper);
  }
  body.appendChild(container);

  // Also set dashboard for S3* audit (flatten actions from artifacts)
  const flatControls = list.flatMap((a) => (a.actions || []).map((act) => ({ type: 'button', ...act })));
  setDashboard(flatControls);
}
// --- Artifact renderers ---
// Maps an artifact's `type` to the function that renders it into its wrapper element.
// Each renderer is called as renderer(wrapperElement, artifact).
const RENDERERS = {
  entity_detail: renderEntityDetail,
  data_table: renderDataTable,
  document_page: renderDocumentPage,
  action_bar: renderActionBar,
  status: renderStatus,
  machine: renderMachine,
};
/**
 * Render an entity_detail artifact: optional title/subtitle, an optional list
 * of nested item cards, the entity's own fields (a field with `action` becomes
 * a clickable link), and a row of action buttons. Click handlers are attached
 * afterwards by _wireActions.
 */
function renderEntityDetail(el, art) {
  const data = art.data || {};
  const parts = [];

  // Plain label/value row, used for the fields of nested list items.
  const plainField = (f) =>
    '<div class="ws-card-field"><span class="ws-card-key">' + esc(f.label || '') + '</span>'
    + '<span class="ws-card-val">' + esc(String(f.value ?? '')) + '</span></div>';

  if (data.title) parts.push('<div class="ws-card-title">' + esc(data.title) + '</div>');
  if (data.subtitle) parts.push('<div class="ws-card-subtitle">' + esc(data.subtitle) + '</div>');

  // List mode (multiple items)
  if (data.items && data.items.length) {
    const cards = Array.from(data.items, (item) => {
      let card = '<div class="ws-card ws-card-nested">';
      if (item.title) card += '<div class="ws-card-title">' + esc(item.title) + '</div>';
      if (item.fields) {
        card += '<div class="ws-card-fields">' + Array.from(item.fields, plainField).join('') + '</div>';
      }
      return card + '</div>';
    });
    parts.push('<div class="ws-list">' + cards.join('') + '</div>');
  }

  // Single entity fields
  if (data.fields && data.fields.length) {
    const rows = Array.from(data.fields, (f) => {
      const valueHtml = f.action
        ? '<span class="ws-card-link" data-action="' + esc(f.action) + '">' + esc(String(f.value ?? '')) + '</span>'
        : '<span class="ws-card-val">' + esc(String(f.value ?? '')) + '</span>';
      return '<div class="ws-card-field"><span class="ws-card-key">' + esc(f.label || '') + '</span>' + valueHtml + '</div>';
    });
    parts.push('<div class="ws-card-fields">' + rows.join('') + '</div>');
  }

  // Actions
  if (art.actions && art.actions.length) {
    const buttons = Array.from(art.actions, (a) =>
      '<button class="control-btn ws-card-btn" data-action="' + esc(a.action || '') + '">' + esc(a.label || '') + '</button>');
    parts.push('<div class="ws-card-actions">' + buttons.join('') + '</div>');
  }

  el.innerHTML = parts.join('');
  _wireActions(el);
}
/**
 * Render a data_table artifact as a <table class="control-table">.
 *
 * Rows are read from `data.rows`, falling back to `data.data`. Column names
 * come from `data.columns`; when that is missing or empty they are taken from
 * the keys of the first row, provided that row is a plain object (array rows
 * have no names, so no header is drawn for them). Array rows are written cell
 * by cell; object rows are written in column order with '' for missing keys.
 * Rows that are null or not objects produce an empty <tr>.
 */
function renderDataTable(el, art) {
  const d = art.data || {};
  if (d.title) {
    const title = document.createElement('div');
    title.className = 'ws-artifact-header';
    title.textContent = d.title;
    el.appendChild(title);
  }

  const table = document.createElement('table');
  table.className = 'control-table';

  // Resolve the row source once so header and body agree on it.
  const rows = d.rows || d.data || [];
  const first = rows.length ? rows[0] : null;
  const firstIsRecord = first !== null && typeof first === 'object' && !Array.isArray(first);
  const cols = (d.columns && d.columns.length)
    ? d.columns
    : (firstIsRecord ? Object.keys(first) : []);

  if (cols.length) {
    const thead = document.createElement('tr');
    for (const col of cols) {
      const th = document.createElement('th');
      th.textContent = col;
      thead.appendChild(th);
    }
    table.appendChild(thead);
  }

  for (const row of rows) {
    const tr = document.createElement('tr');
    if (Array.isArray(row)) {
      for (const cell of row) {
        const td = document.createElement('td');
        td.textContent = cell;
        tr.appendChild(td);
      }
    } else if (row && typeof row === 'object') {
      // `row &&` guards against null, whose typeof is also 'object'.
      for (const col of cols) {
        const td = document.createElement('td');
        td.textContent = row[col] ?? '';
        tr.appendChild(td);
      }
    }
    table.appendChild(tr);
  }
  el.appendChild(table);
}
/**
 * Render a document_page artifact: a title, then one block per section
 * (heading plus markdown-rendered content), then optional action buttons
 * whose click handlers are attached by _wireActions.
 */
function renderDocumentPage(el, art) {
  const doc = art.data || {};
  const chunks = [];

  if (doc.title) chunks.push('<div class="ws-doc-title">' + esc(doc.title) + '</div>');

  for (const section of (doc.sections || [])) {
    let block = '<div class="ws-doc-section">';
    if (section.heading) block += '<div class="ws-doc-heading">' + esc(section.heading) + '</div>';
    // NOTE(review): renderMarkdown output is inserted as HTML without esc();
    // presumably it escapes its input itself — confirm in util.js.
    if (section.content) block += '<div class="ws-doc-content">' + renderMarkdown(section.content) + '</div>';
    chunks.push(block + '</div>');
  }

  // Actions (e.g. PDF export)
  if (art.actions && art.actions.length) {
    let bar = '<div class="ws-card-actions">';
    for (const a of art.actions) {
      bar += '<button class="control-btn ws-card-btn" data-action="' + esc(a.action || '') + '">' + esc(a.label || '') + '</button>';
    }
    chunks.push(bar + '</div>');
  }

  el.innerHTML = chunks.join('');
  _wireActions(el);
}
/**
 * Render an action_bar artifact: one button per action, each sending its
 * action name together with its payload (or {}) when clicked.
 */
function renderActionBar(el, art) {
  const actions = art.actions || [];
  for (const entry of actions) {
    const button = document.createElement('button');
    button.className = 'control-btn';
    button.textContent = entry.label || '';
    button.onclick = () => {
      _sendAction(entry.action, entry.payload || {});
    };
    el.appendChild(button);
  }
}
/**
 * Render a status artifact according to `data.display_type`:
 *  - 'progress': label, a bar filled to `value` clamped to 0..100, and the percentage;
 *  - 'info': an info icon followed by the label;
 *  - anything else: label plus the value, when one is present.
 * The wrapper also receives the class `display-<display_type>`.
 */
function renderStatus(el, art) {
  const d = art.data || {};
  const dt = d.display_type || 'text';
  // Same fallback in every branch, so a missing label never reaches esc().
  const label = esc(d.label || '');
  el.classList.add('display-' + dt);

  if (dt === 'progress') {
    const pct = Math.min(100, Math.max(0, Number(d.value) || 0));
    el.innerHTML = '<span class="cd-label">' + label + '</span>'
      + '<div class="cd-bar"><div class="cd-fill" style="width:' + pct + '%"></div></div>'
      + '<span class="cd-pct">' + pct + '%</span>';
  } else if (dt === 'info') {
    el.innerHTML = '<span class="cd-icon">\u2139</span><span class="cd-label">' + label + '</span>';
  } else {
    // Only "no value at all" hides the value span; 0 and false are real values.
    const hasValue = d.value !== undefined && d.value !== null && d.value !== '';
    el.innerHTML = '<span class="cd-label">' + label + '</span>'
      + (hasValue ? '<span class="cd-value">' + esc(String(d.value)) + '</span>' : '');
  }
}
/**
 * Render a machine artifact: a header with the machine id and current state,
 * the current state's content lines, the stored key=value data, and one
 * button per action. Click handlers are attached by _wireActions.
 */
function renderMachine(el, art) {
  const data = art.data || {};
  const machineId = data.machine_id || '';
  const pieces = [];

  // Header
  pieces.push(
    '<div class="ws-machine-header"><span class="ws-machine-name">' + esc(machineId) + '</span>'
    + '<span class="ws-machine-state">' + esc(data.current || '') + '</span></div>'
  );

  // Content
  for (const line of (data.content || [])) {
    pieces.push('<div class="ws-machine-content">' + esc(line) + '</div>');
  }

  // Stored data
  const entries = Object.entries(data.stored_data || {});
  if (entries.length) {
    let block = '<div class="ws-machine-data">';
    for (const [key, value] of entries) {
      block += '<span class="ws-machine-datum">' + esc(key) + '=' + esc(String(value)) + '</span>';
    }
    pieces.push(block + '</div>');
  }

  // Buttons
  if (art.actions && art.actions.length) {
    let bar = '<div class="ws-card-actions">';
    for (const a of art.actions) {
      bar += '<button class="control-btn ws-card-btn" data-action="' + esc(a.action || '') + '">' + esc(a.label || '') + '</button>';
    }
    pieces.push(bar + '</div>');
  }

  el.innerHTML = pieces.join('');
  _wireActions(el);
}
// --- Helpers ---
/**
 * Attach click handlers to the field links (.ws-card-link) and action buttons
 * (.ws-card-btn) inside `el`. Each click stops propagation and sends the
 * element's data-action with an empty payload.
 */
function _wireActions(el) {
  const clickable = el.querySelectorAll('.ws-card-link, .ws-card-btn');
  for (const node of clickable) {
    node.onclick = (event) => {
      event.stopPropagation();
      _sendAction(node.dataset.action, {});
    };
  }
}
// --- Legacy control rendering (backward compat) ---
export function dockControls(controls) {
setDashboard(controls); // S3*: remember what's rendered
setDashboard(controls);
const body = document.getElementById('workspace-body');
if (!body) return;
body.innerHTML = '';
@ -21,12 +249,7 @@ export function dockControls(controls) {
const btn = document.createElement('button');
btn.className = 'control-btn';
btn.textContent = ctrl.label;
btn.onclick = () => {
if (_ws && _ws.readyState === 1) {
_ws.send(JSON.stringify({ type: 'action', action: ctrl.action, data: ctrl.payload || ctrl.data || {} }));
addTrace('runtime', 'action', ctrl.action);
}
};
btn.onclick = () => _sendAction(ctrl.action, ctrl.payload || ctrl.data || {});
container.appendChild(btn);
} else if (ctrl.type === 'table') {
const table = document.createElement('table');
@ -34,22 +257,16 @@ export function dockControls(controls) {
if (ctrl.columns) {
const thead = document.createElement('tr');
for (const col of ctrl.columns) {
const th = document.createElement('th');
th.textContent = col;
thead.appendChild(th);
const th = document.createElement('th'); th.textContent = col; thead.appendChild(th);
}
table.appendChild(thead);
}
for (const row of (ctrl.data || [])) {
const tr = document.createElement('tr');
if (Array.isArray(row)) {
for (const cell of row) {
const td = document.createElement('td'); td.textContent = cell; tr.appendChild(td);
}
for (const cell of row) { const td = document.createElement('td'); td.textContent = cell; tr.appendChild(td); }
} else if (typeof row === 'object') {
for (const col of (ctrl.columns || Object.keys(row))) {
const td = document.createElement('td'); td.textContent = row[col] ?? ''; tr.appendChild(td);
}
for (const col of (ctrl.columns || Object.keys(row))) { const td = document.createElement('td'); td.textContent = row[col] ?? ''; tr.appendChild(td); }
}
table.appendChild(tr);
}
@ -62,105 +279,42 @@ export function dockControls(controls) {
} else if (ctrl.type === 'display') {
const disp = document.createElement('div');
const dt = ctrl.display_type || 'text';
const style = ctrl.style ? ' display-' + ctrl.style : '';
disp.className = 'control-display display-' + dt + style;
disp.className = 'control-display display-' + dt;
if (dt === 'progress') {
const pct = Math.min(100, Math.max(0, Number(ctrl.value) || 0));
disp.innerHTML = '<span class="cd-label">' + esc(ctrl.label) + '</span>'
+ '<div class="cd-bar"><div class="cd-fill" style="width:' + pct + '%"></div></div>'
+ '<span class="cd-pct">' + pct + '%</span>';
} else if (dt === 'status') {
disp.innerHTML = '<span class="cd-icon">' + (ctrl.style === 'success' ? '\u2713' : ctrl.style === 'error' ? '\u2717' : '\u2139') + '</span>'
+ '<span class="cd-label">' + esc(ctrl.label) + '</span>';
disp.innerHTML = '<span class="cd-label">' + esc(ctrl.label) + '</span><div class="cd-bar"><div class="cd-fill" style="width:' + pct + '%"></div></div><span class="cd-pct">' + pct + '%</span>';
} else {
disp.innerHTML = '<span class="cd-label">' + esc(ctrl.label) + '</span>'
+ (ctrl.value ? '<span class="cd-value">' + esc(String(ctrl.value)) + '</span>' : '');
disp.innerHTML = '<span class="cd-label">' + esc(ctrl.label) + '</span>' + (ctrl.value ? '<span class="cd-value">' + esc(String(ctrl.value)) + '</span>' : '');
}
container.appendChild(disp);
} else if (ctrl.type === 'card') {
container.appendChild(renderCard(ctrl));
} else if (ctrl.type === 'list') {
const listEl = document.createElement('div');
listEl.className = 'ws-list';
if (ctrl.title) {
const h = document.createElement('div');
h.className = 'ws-list-title';
h.textContent = ctrl.title;
listEl.appendChild(h);
const card = document.createElement('div');
card.className = 'ws-card';
let html = '';
if (ctrl.title) html += '<div class="ws-card-title">' + esc(ctrl.title) + '</div>';
if (ctrl.subtitle) html += '<div class="ws-card-subtitle">' + esc(ctrl.subtitle) + '</div>';
if (ctrl.fields && ctrl.fields.length) {
html += '<div class="ws-card-fields">';
for (const f of ctrl.fields) {
html += '<div class="ws-card-field"><span class="ws-card-key">' + esc(f.label || '') + '</span><span class="ws-card-val">' + esc(String(f.value ?? '')) + '</span></div>';
}
html += '</div>';
}
for (const item of (ctrl.items || [])) {
item.type = item.type || 'card';
listEl.appendChild(renderCard(item));
if (ctrl.actions && ctrl.actions.length) {
html += '<div class="ws-card-actions">';
for (const a of ctrl.actions) {
html += '<button class="control-btn ws-card-btn" data-action="' + esc(a.action || '') + '">' + esc(a.label || '') + '</button>';
}
html += '</div>';
}
container.appendChild(listEl);
card.innerHTML = html;
_wireActions(card);
container.appendChild(card);
}
}
body.appendChild(container);
}
function renderCard(card) {
const el = document.createElement('div');
el.className = 'ws-card';
if (card.action) {
el.classList.add('ws-card-clickable');
el.onclick = () => {
if (_ws && _ws.readyState === 1) {
_ws.send(JSON.stringify({ type: 'action', action: card.action, data: card.payload || {} }));
addTrace('runtime', 'action', card.action);
}
};
}
let html = '';
if (card.title) html += '<div class="ws-card-title">' + esc(card.title) + '</div>';
if (card.subtitle) html += '<div class="ws-card-subtitle">' + esc(card.subtitle) + '</div>';
if (card.fields && card.fields.length) {
html += '<div class="ws-card-fields">';
for (const f of card.fields) {
const val = f.action
? '<span class="ws-card-link" data-action="' + esc(f.action) + '">' + esc(String(f.value ?? '')) + '</span>'
: '<span class="ws-card-val">' + esc(String(f.value ?? '')) + '</span>';
html += '<div class="ws-card-field"><span class="ws-card-key">' + esc(f.label || '') + '</span>' + val + '</div>';
}
html += '</div>';
}
if (card.actions && card.actions.length) {
html += '<div class="ws-card-actions">';
for (const a of card.actions) {
html += '<button class="control-btn ws-card-btn" data-action="' + esc(a.action || '') + '">' + esc(a.label || '') + '</button>';
}
html += '</div>';
}
el.innerHTML = html;
// Wire up field links and action buttons
el.querySelectorAll('.ws-card-link').forEach(link => {
link.onclick = (e) => {
e.stopPropagation();
const action = link.dataset.action;
if (_ws && _ws.readyState === 1) {
_ws.send(JSON.stringify({ type: 'action', action, data: {} }));
addTrace('runtime', 'action', action);
}
};
});
el.querySelectorAll('.ws-card-btn').forEach(btn => {
btn.onclick = (e) => {
e.stopPropagation();
const action = btn.dataset.action;
if (_ws && _ws.readyState === 1) {
_ws.send(JSON.stringify({ type: 'action', action, data: {} }));
addTrace('runtime', 'action', action);
}
};
});
return el;
}
export function clearDashboard() {
const body = document.getElementById('workspace-body');
if (body) body.innerHTML = '';

View File

@ -3,7 +3,7 @@
import { authToken, isAuthFailed, setAuthFailed, showLogin } from './auth.js';
import { addTrace } from './trace.js';
import { addMsg, handleDelta, handleDone, setWs as setChatWs } from './chat.js';
import { dockControls, setWs as setDashWs } from './dashboard.js';
import { dockControls, dockArtifacts, setWs as setDashWs } from './dashboard.js';
import { graphAnimate } from './graph.js';
import { updateMeter, updateNodeFromHud, updateAwarenessState, updateAwarenessSensors } from './awareness.js';
import { updateTestStatus } from './tests.js';
@ -61,6 +61,8 @@ export function connect() {
handleDelta(data.content);
} else if (data.type === 'done') {
handleDone();
} else if (data.type === 'artifacts') {
dockArtifacts(data.artifacts);
} else if (data.type === 'controls') {
dockControls(data.controls);
} else if (data.type === 'cleared') {

View File

@ -127,6 +127,10 @@ button:hover { background: #1d4ed8; }
.aw-row { display: flex; justify-content: space-between; padding: 0.08rem 0; }
.aw-key { color: #888; font-size: 0.65rem; }
.aw-val { color: #e0e0e0; font-size: 0.7rem; font-weight: 500; }
.aw-exp-conv { color: #4caf50; }
.aw-exp-deleg { color: #ff9800; }
.aw-exp-wait { color: #42a5f5; }
.aw-exp-obs { color: #9e9e9e; }
/* UI Controls (workspace) */
.controls-container { padding: 0.3rem 0; display: flex; flex-wrap: wrap; gap: 0.3rem; align-items: flex-start; }
@ -159,6 +163,34 @@ button:hover { background: #1d4ed8; }
.ws-card-btn { font-size: 0.7rem; padding: 0.2rem 0.5rem; }
.ws-list { display: flex; flex-direction: column; gap: 0.3rem; width: 100%; }
.ws-list-title { font-size: 0.75rem; font-weight: 700; color: #888; text-transform: uppercase; letter-spacing: 0.03em; margin-bottom: 0.2rem; }
.ws-card-nested { margin: 0; border-color: #1a1a2e; }
/* Artifact system */
.artifacts-container { padding: 0.3rem 0; display: flex; flex-direction: column; gap: 0.4rem; }
.ws-artifact { width: 100%; }
/* Wrappers get the class "ws-artifact-" + artifact type, so entity_detail artifacts
   carry .ws-artifact-entity_detail; the shorter selector is kept for compatibility. */
.ws-artifact-entity, .ws-artifact-entity_detail { background: #111; border: 1px solid #222; border-radius: 0.4rem; padding: 0.5rem 0.6rem; }
.ws-artifact-data_table { }
.ws-artifact-action_bar { display: flex; flex-wrap: wrap; gap: 0.3rem; }
.ws-artifact-status { padding: 0.25rem 0.4rem; font-size: 0.75rem; display: flex; align-items: center; gap: 0.4rem; }
.ws-artifact-header { font-size: 0.75rem; font-weight: 600; color: #888; margin-bottom: 0.2rem; }
.ws-artifact-fallback { font-size: 0.7rem; color: #666; font-family: monospace; white-space: pre-wrap; }
/* Document page artifact */
.ws-artifact-document_page { background: #111; border: 1px solid #222; border-radius: 0.4rem; padding: 0.8rem 1rem; }
.ws-doc-title { font-size: 1rem; font-weight: 700; color: #e0e0e0; margin-bottom: 0.6rem; border-bottom: 1px solid #333; padding-bottom: 0.4rem; }
.ws-doc-section { margin-bottom: 0.5rem; }
.ws-doc-heading { font-size: 0.8rem; font-weight: 700; color: #a78bfa; margin-bottom: 0.2rem; }
.ws-doc-content { font-size: 0.75rem; color: #ccc; line-height: 1.5; }
.ws-doc-content ul, .ws-doc-content ol { margin: 0.2rem 0; padding-left: 1.2rem; }
/* Machine artifact */
.ws-artifact-machine { background: #111; border: 1px solid #2563eb33; border-radius: 0.4rem; padding: 0.5rem 0.6rem; }
.ws-machine-header { display: flex; justify-content: space-between; align-items: center; margin-bottom: 0.3rem; }
.ws-machine-name { font-size: 0.75rem; font-weight: 600; color: #a78bfa; }
.ws-machine-state { font-size: 0.7rem; color: #60a5fa; background: #1e3a5f; padding: 0.1rem 0.4rem; border-radius: 0.2rem; }
.ws-machine-content { font-size: 0.75rem; color: #ccc; padding: 0.1rem 0; }
.ws-machine-data { display: flex; flex-wrap: wrap; gap: 0.3rem; margin-top: 0.2rem; }
.ws-machine-datum { font-size: 0.65rem; color: #888; background: #1a1a2e; padding: 0.1rem 0.3rem; border-radius: 0.2rem; }
/* Login overlay */
#login-overlay { position: fixed; inset: 0; background: rgba(0,0,0,0.85); display: flex; align-items: center; justify-content: center; z-index: 1000; }

View File

@ -0,0 +1,33 @@
# Artifact System
Tests that the artifact rendering pipeline works end-to-end.
Expert produces data → UINode converts to artifacts → frontend renders.
## Setup
- clear history
## Steps
### 1. Query produces data_table artifact
- send: show me 3 customers in a table
- expect_trace: has tool_call
- expect_response: length > 10
### 2. Entity detail via card
- send: show me details for customer 1
- expect_trace: has tool_call
- expect_response: length > 10
### 3. Action bar via buttons
- send: create two buttons on my dashboard: Refresh and Export
- expect_actions: length >= 2
- expect_actions: any action contains "refresh" or "Refresh"
### 4. Machine artifact
- send: create a machine called "flow" with initial state "ready" and a state called "done"
- expect_trace: has machine_created
### 5. Buttons survive a follow-up query
- send: how many customers are there?
- expect_response: length > 5
- expect_actions: any action contains "refresh" or "Refresh"

View File

@ -0,0 +1,50 @@
# Expectation Tracking
Tests that the memorizer tracks user_expectation and that it influences PA/Output behavior.
Exercises machine features (update_machine, transition_machine) alongside expectation transitions.
## Setup
- clear history
## Steps
### 1. Greeting sets conversational
- send: hi there!
- expect_response: length > 2
- expect_state: user_expectation is "conversational"
### 2. Create a wizard machine
- send: create a machine called "project" with states: planning (initial) and executing
- expect_trace: has machine_created
### 3. Delegate a task
- send: build me a summary report of the top 5 customers by device count
- expect_response: length > 20
- expect_state: user_expectation is "delegated" or "observing"
### 4. Ask about project machine (status check stays in flow)
- send: what state is my project machine in?
- expect_response: contains "planning" or "project"
- expect_state: user_expectation is "conversational" or "delegated"
### 5. Store data on machine
- send: use update_machine to store status=in_progress on the project machine
- expect_response: length > 5
### 6. Transition machine
- send: use transition_machine to move project to executing state
- expect_response: length > 5
### 7. Verify machine state and data
- send: what is the current state and data of the project machine?
- expect_response: contains "executing" or "in_progress"
### 8. Short nudge triggers waiting_input
- send: und?
- expect_response: length > 5
- expect_state: user_expectation is "waiting_input" or "conversational"
### 9. Quick thanks (observing)
- send: ok danke
- expect_response: length > 0
- expect_state: user_expectation is "observing" or "observational" or "conversational"

View File

@ -0,0 +1,41 @@
# Machine State → PA Context
Tests that the PA reads machine state when routing and that experts can write back to machines.
Validates: enriched machine summary, update_machine, transition_machine.
## Setup
- clear history
## Steps
### 1. Create a machine
- send: create a navigation machine called "wizard" with initial state "start" and a second state called "details"
- expect_trace: has machine_created
### 2. PA sees machine in context
- send: what machines are active on my dashboard?
- expect_response: contains "wizard" or "start"
### 3. Expert stores data on machine
- send: use update_machine to store region=Bayern on the wizard machine
- expect_response: contains "Bayern" or "region" or "stored" or "updated"
### 4. PA sees stored data
- send: what data is stored in my wizard machine?
- expect_response: contains "Bayern" or "region"
### 5. Expert transitions machine to details
- send: use transition_machine to move wizard to details state
- expect_response: length > 5
### 6. PA sees updated state
- send: what state is the wizard in now?
- expect_response: contains "details"
### 7. Expert transitions back
- send: use transition_machine to move wizard back to start
- expect_response: length > 5
### 8. Final state check
- send: tell me the current wizard state and stored data
- expect_response: contains "start"

19
testcases/pa_retry.md Normal file
View File

@ -0,0 +1,19 @@
# PA Retry on Expert Failure
Tests that when the expert fails, the PA reformulates the job and retries with a different approach.
## Setup
- clear history
## Steps
### 1. Complex analytical query that may need retry
- send: Finde KWZ-Geraete mit verdaechtigen Verbrauchsspruengen - also wo der Verbrauch zwischen zwei Ablesungen stark ansteigt
- expect_response: length > 20
### 2. Verify results contain device data
- expect_response: contains "Gera" or "gera" or "KWZ" or "kwz" or "Verbrauch" or "device"
### 3. Follow up with details
- send: zeig mir die Verbraeuche von einem dieser Geraete
- expect_response: length > 10

View File

@ -1,7 +1,7 @@
{
"timestamp": "2026-03-29 06:04:47",
"timestamp": "2026-03-30 00:02:55",
"testcases": {
"S3* Audit Corrections": [
"Artifact System": [
{
"step": "Setup",
"check": "clear",
@ -9,93 +9,355 @@
"detail": "cleared"
},
{
"step": "Tool calls produce results (baseline)",
"check": "send: create two buttons: Alpha and Beta",
"step": "Query produces data_table artifact",
"check": "send: show me 3 customers in a table",
"status": "PASS",
"detail": "response: 👍 Okay, I've created buttons labeled \"Alpha\" and \"Beta\".\n"
"detail": "response: The database contains information for three customers: Kathrin Jager, Leon Schre"
},
{
"step": "Tool calls produce results (baseline)",
"check": "actions: length >= 1",
"status": "PASS",
"detail": "2 actions >= 1"
},
{
"step": "Tool calls produce results (baseline)",
"check": "actions: any action contains \"alpha\" or \"Alpha\"",
"status": "PASS",
"detail": "found 'alpha' in actions"
},
{
"step": "Dashboard mismatch triggers re-emit",
"check": "send: I see nothing on my dashboard, fix it",
"status": "PASS",
"detail": "response: 👍 Done — Alpha and Beta buttons are now live on your dashboard. They should appe"
},
{
"step": "Dashboard mismatch triggers re-emit",
"check": "response: not contains \"sorry\" or \"apologize\"",
"status": "PASS",
"detail": "none of ['sorry', 'apologize'] found (as expected)"
},
{
"step": "Dashboard mismatch triggers re-emit",
"check": "actions: length >= 1",
"status": "PASS",
"detail": "2 actions >= 1"
},
{
"step": "DB error triggers retry with corrected SQL",
"check": "send: SELECT * FROM NichtExistent LIMIT 5",
"status": "PASS",
"detail": "response: Ah, it seems like the table `NichtExistent` does not exist. Double-check the tab"
},
{
"step": "DB error triggers retry with corrected SQL",
"step": "Query produces data_table artifact",
"check": "trace: has tool_call",
"status": "PASS",
"detail": "found event 'tool_call'"
},
{
"step": "DB error triggers retry with corrected SQL",
"check": "response: not contains \"1146\"",
"status": "PASS",
"detail": "none of ['1146'] found (as expected)"
},
{
"step": "DB error triggers retry with corrected SQL",
"step": "Query produces data_table artifact",
"check": "response: length > 10",
"status": "PASS",
"detail": "length 163 > 10"
"detail": "length 138 > 10"
},
{
"step": "Complex request gets Director plan",
"check": "send: investigate which customers have the mos",
"step": "Entity detail via card",
"check": "send: show me details for customer 1",
"status": "PASS",
"detail": "response: Okay, I'll look into which customers have the most devices. This might take a mo"
"detail": "response: ```tool_code\nquery_db({\"query\":\"SELECT * FROM customers WHERE customer_id = 1\"})"
},
{
"step": "Complex request gets Director plan",
"check": "trace: has director_plan",
"status": "FAIL",
"detail": "no 'director_plan' event in trace"
},
{
"step": "Complex request gets Director plan",
"step": "Entity detail via card",
"check": "trace: has tool_call",
"status": "PASS",
"detail": "found event 'tool_call'"
},
{
"step": "Complex request gets Director plan",
"step": "Entity detail via card",
"check": "response: length > 10",
"status": "PASS",
"detail": "length 84 > 10"
},
{
"step": "Action bar via buttons",
"check": "send: create two buttons on my dashboard: Refr",
"status": "PASS",
"detail": "response: I have added the 'Refresh' and 'Export' buttons to your dashboard. These buttons"
},
{
"step": "Action bar via buttons",
"check": "actions: length >= 2",
"status": "PASS",
"detail": "2 actions >= 2"
},
{
"step": "Action bar via buttons",
"check": "actions: any action contains \"refresh\" or \"Refresh\"",
"status": "PASS",
"detail": "found 'refresh' in actions"
},
{
"step": "Machine artifact",
"check": "send: create a machine called \"flow\" with init",
"status": "PASS",
"detail": "response: OK, I've created a new interactive machine called 'flow' with the initial state "
},
{
"step": "Machine artifact",
"check": "trace: has machine_created",
"status": "PASS",
"detail": "found event 'machine_created'"
},
{
"step": "Query after buttons survive",
"check": "send: how many customers are there?",
"status": "PASS",
"detail": "response: There are 693 customers in the database.\n"
},
{
"step": "Query after buttons survive",
"check": "response: length > 5",
"status": "PASS",
"detail": "length 41 > 5"
},
{
"step": "Query after buttons survive",
"check": "actions: any action contains \"refresh\" or \"Refresh\"",
"status": "PASS",
"detail": "found 'refresh' in actions"
}
],
"Fast v4": [
{
"step": "Setup",
"check": "clear",
"status": "PASS",
"detail": "cleared"
},
{
"step": "Reflex",
"check": "send: hi!",
"status": "PASS",
"detail": "response: Hey Nico! 👋 How can I help you today?\n"
},
{
"step": "Reflex",
"check": "response: length > 2",
"status": "PASS",
"detail": "length 38 > 2"
},
{
"step": "PA routes to expert",
"check": "send: show me 3 customers",
"status": "PASS",
"detail": "response: Alright, I've fetched 3 customer records for you. You can see the ID, Name detai"
},
{
"step": "PA routes to expert",
"check": "trace: has routed",
"status": "PASS",
"detail": "found event 'routed'"
},
{
"step": "PA routes to expert",
"check": "trace: has tool_call",
"status": "PASS",
"detail": "found event 'tool_call'"
},
{
"step": "PA routes to expert",
"check": "response: length > 10",
"status": "PASS",
"detail": "length 181 > 10"
},
{
"step": "German query",
"check": "send: Zeig mir alle Tabellen in der Datenbank",
"status": "PASS",
"detail": "response: Okay, ich habe eine Liste aller Tabellen in der \"eras2_production\" Datenbank abg"
},
{
"step": "German query",
"check": "trace: has tool_call",
"status": "PASS",
"detail": "found event 'tool_call'"
},
{
"step": "German query",
"check": "response: length > 10",
"status": "PASS",
"detail": "length 303 > 10"
},
{
"step": "Schema discovery",
"check": "send: describe the kunden table",
"status": "PASS",
"detail": "response: The `kunden` table stores customer information, including names, customer number"
},
{
"step": "Schema discovery",
"check": "trace: has tool_call",
"status": "PASS",
"detail": "found event 'tool_call'"
},
{
"step": "Schema discovery",
"check": "response: length > 10",
"status": "PASS",
"detail": "length 391 > 10"
},
{
"step": "Count query (cached schema)",
"check": "send: how many customers are there?",
"status": "PASS",
"detail": "response: There are 693 customers in the database.\n"
},
{
"step": "Count query (cached schema)",
"check": "trace: has tool_call",
"status": "PASS",
"detail": "found event 'tool_call'"
},
{
"step": "Count query (cached schema)",
"check": "response: length > 5",
"status": "PASS",
"detail": "length 41 > 5"
},
{
"step": "Complex query",
"check": "send: which customers have the most devices?",
"status": "PASS",
"detail": "response: The query results list the top 10 customers with the most devices. Anne Bürger h"
},
{
"step": "Complex query",
"check": "trace: has tool_call",
"status": "PASS",
"detail": "found event 'tool_call'"
},
{
"step": "Complex query",
"check": "response: length > 20",
"status": "PASS",
"detail": "length 86 > 20"
"detail": "length 166 > 20"
},
{
"step": "Error recovery",
"check": "send: SELECT * FROM nichtexistiert",
"status": "PASS",
"detail": "response: Oops! It seems like the table `nichtexistiert` doesn't exist in the database. 🤔 "
},
{
"step": "Error recovery",
"check": "trace: has tool_call",
"status": "PASS",
"detail": "found event 'tool_call'"
},
{
"step": "Error recovery",
"check": "response: length > 10",
"status": "PASS",
"detail": "length 396 > 10"
},
{
"step": "Memorizer",
"check": "send: my name is Nico",
"status": "PASS",
"detail": "response: Thanks, Nico! I'll remember that. Do you have any other questions? 😊\n"
},
{
"step": "Memorizer",
"check": "state: facts any contains \"Nico\"",
"status": "PASS",
"detail": "found 'Nico' in facts"
},
{
"step": "Language switch",
"check": "send: Hallo wie gehts?",
"status": "PASS",
"detail": "response: Mir geht es gut, danke der Nachfrage, Nico! Und selbst? Gibt es etwas, bei dem i"
},
{
"step": "Language switch",
"check": "state: language is \"de\" or \"mixed\"",
"status": "PASS",
"detail": "language=mixed"
},
{
"step": "Bye",
"check": "send: ok bye",
"status": "PASS",
"detail": "response: Bye Nico! 👋 If you need anything else, just let me know. 😊\n"
},
{
"step": "Bye",
"check": "response: length > 2",
"status": "PASS",
"detail": "length 59 > 2"
}
],
"Dashboard Integration": [
{
"step": "Setup",
"check": "clear",
"status": "PASS",
"detail": "cleared"
},
{
"step": "Expert creates buttons",
"check": "send: create two buttons on my dashboard: Repo",
"status": "PASS",
"detail": "response: I have added 'Report' and 'Export' buttons to your dashboard.\n\n(UI buttons shown"
},
{
"step": "Expert creates buttons",
"check": "actions: length >= 2",
"status": "PASS",
"detail": "3 actions >= 2"
},
{
"step": "Expert creates buttons",
"check": "actions: any action contains \"report\" or \"Report\"",
"status": "PASS",
"detail": "found 'report' in actions"
},
{
"step": "Buttons survive a query",
"check": "send: how many customers are there?",
"status": "PASS",
"detail": "response: I'm running a query to count all customer IDs. One moment...\n"
},
{
"step": "Buttons survive a query",
"check": "response: length > 5",
"status": "PASS",
"detail": "length 61 > 5"
},
{
"step": "Buttons survive a query",
"check": "actions: any action contains \"report\" or \"Report\"",
"status": "PASS",
"detail": "found 'report' in actions"
},
{
"step": "Expert creates a machine",
"check": "send: create a navigation machine called \"work",
"status": "PASS",
"detail": "response: I've created the 'workflow' machine with 'start' and 'step2' states. The 'start'"
},
{
"step": "Expert creates a machine",
"check": "trace: has tool_call create_machine",
"status": "PASS",
"detail": "found create_machine via machine_created event"
},
{
"step": "Expert shows data table",
"check": "send: show me 5 customers in a table",
"status": "PASS",
"detail": "response: Here are five customer entries with their IDs, names, object count, and status:\n"
},
{
"step": "Expert shows data table",
"check": "trace: has tool_call",
"status": "PASS",
"detail": "found event 'tool_call'"
},
{
"step": "Expert shows data table",
"check": "response: length > 10",
"status": "PASS",
"detail": "length 118 > 10"
},
{
"step": "Expert replaces buttons",
"check": "send: remove all buttons and create one button",
"status": "PASS",
"detail": "response: I have removed the existing 'Report' and 'Export' buttons from the dashboard and"
},
{
"step": "Expert replaces buttons",
"check": "actions: length >= 1",
"status": "PASS",
"detail": "2 actions >= 1"
},
{
"step": "Expert replaces buttons",
"check": "actions: any action contains \"reset\" or \"Reset\"",
"status": "PASS",
"detail": "found 'reset' in actions"
}
]
},
"summary": {
"passed": 14,
"failed": 1
"passed": 58,
"failed": 0
}
}

1033
testcases/results_v1.json Normal file

File diff suppressed because it is too large Load Diff

1033
testcases/results_v2.json Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff