diff --git a/agent/nodes/__init__.py b/agent/nodes/__init__.py index b5890e7..99777ab 100644 --- a/agent/nodes/__init__.py +++ b/agent/nodes/__init__.py @@ -5,5 +5,6 @@ from .input import InputNode from .output import OutputNode from .thinker import ThinkerNode from .memorizer import MemorizerNode +from .ui import UINode -__all__ = ["SensorNode", "InputNode", "OutputNode", "ThinkerNode", "MemorizerNode"] +__all__ = ["SensorNode", "InputNode", "OutputNode", "ThinkerNode", "MemorizerNode", "UINode"] diff --git a/agent/nodes/input.py b/agent/nodes/input.py index b3b978b..183af9c 100644 --- a/agent/nodes/input.py +++ b/agent/nodes/input.py @@ -16,14 +16,19 @@ class InputNode(Node): SYSTEM = """You are the Input node — the ear of this cognitive runtime. -Listener context: -- Authenticated user: {identity} -- Channel: {channel} (Chrome browser on Nico's Windows PC, in his room at home) -- Physical: private space, Nico lives with Tina — she may use this session too -- Security: single-user account, shared physical space — other voices are trusted household +Listener: {identity} on {channel} -Your job: describe what you heard. Who spoke, what they want, what tone, what context matters. -ONE sentence. No content, no response — just your perception of what came through. +YOUR ONLY JOB: Describe what you heard in ONE short sentence. +- Who spoke, what they want, what tone. +- Example: "Nico asks what time it is, casual tone." +- Example: "Nico wants to create a database with customer data, direct request." +- Example: "Nico reports a UI bug — he can't see a value updating, frustrated tone." + +STRICT RULES: +- ONLY output a single perception sentence. Nothing else. +- NEVER generate a response, code, HTML, or suggestions. +- NEVER answer the user's question — that's not your job. +- NEVER write more than one sentence. {memory_context}""" diff --git a/agent/nodes/output.py b/agent/nodes/output.py index de84e89..517ae53 100644 --- a/agent/nodes/output.py +++ b/agent/nodes/output.py @@ -17,20 +17,23 @@ class OutputNode(Node): model = "google/gemini-2.0-flash-001" max_context_tokens = 4000 - SYSTEM = """You are the Output node — the renderer of this cognitive runtime. + SYSTEM = """You are the Output node — the voice of this cognitive runtime. -DEVICE: The user is on a web browser (Chrome, desktop). Your output renders in an HTML chat panel. -You can use markdown: **bold**, *italic*, `code`, ```code blocks```, lists, headers. -The chat panel renders markdown to HTML — use it for structure when helpful. +YOU ARE TEXT ONLY. Your output goes to a chat bubble. You can use: +- Markdown: **bold**, *italic*, `code`, ```code blocks```, lists, headers +- Emojis when they add warmth or clarity +- Short, structured text (bullet points, numbered lists) -YOUR JOB: Transform the Thinker's reasoning into a polished, user-facing response. -- The Thinker reasons and may use tools. You receive its output and render it for the human. +NEVER output HTML, buttons, tables, labels, or any UI elements. +A separate UI node handles all interactive elements — you just speak. + +YOUR JOB: Transform the Thinker's reasoning into a natural, human-readable text response. - NEVER echo internal node names, perceptions, or system details. - NEVER say "the Thinker decided..." or "I'll process..." — just deliver the answer. -- If the Thinker ran a tool and got output, weave the results into a natural response. -- If the Thinker gave a direct answer, refine and format it — don't just repeat it. +- If the Thinker ran a tool and got output, summarize the results in text. +- If the Thinker gave a direct answer, refine the wording — don't just repeat verbatim. - Keep the user's language — if they wrote German, respond in German. -- Be concise but complete. Use formatting to make data scannable. +- Be concise. Don't describe data that the UI node will show as a table. {memory_context}""" diff --git a/agent/nodes/thinker.py b/agent/nodes/thinker.py index 2f48581..1c7fd04 100644 --- a/agent/nodes/thinker.py +++ b/agent/nodes/thinker.py @@ -18,23 +18,13 @@ class ThinkerNode(Node): max_context_tokens = 4000 SYSTEM = """You are the Thinker node — the brain of this cognitive runtime. -You receive a perception of what the user said. Decide: answer directly, use a tool, or show UI controls. +You receive a perception of what the user said. Decide: answer directly or use a tool. TOOLS — write a ```python code block and it WILL be executed. Use print() for output. - For math, databases, file ops, any computation: write python. NEVER describe code — write it. - For simple conversation: respond directly as text. -UI CONTROLS — to show interactive elements, include a JSON block: -```controls -[ - {{"type": "table", "data": [...], "columns": ["id", "name", "email"]}}, - {{"type": "button", "label": "Add Customer", "action": "add_customer"}}, - {{"type": "button", "label": "Refresh", "action": "refresh_customers"}} -] -``` -Controls render in the chat. User clicks flow back as actions you can handle. - -You can combine text + code + controls in one response. +A separate UI node handles all visual controls (buttons, tables). Just focus on reasoning and content. {memory_context}""" @@ -94,24 +84,9 @@ conn.close()''' return None - def _parse_controls(self, response: str) -> list[dict]: - """Extract ```controls JSON blocks from response.""" - controls = [] - if "```controls" not in response: - return controls - parts = response.split("```controls") - for part in parts[1:]: - end = part.find("```") - if end != -1: - try: - controls.extend(json.loads(part[:end].strip())) - except json.JSONDecodeError: - pass - return controls - - def _strip_blocks(self, response: str) -> str: - """Remove code and control blocks, return plain text.""" - text = re.sub(r'```(?:python|py|controls).*?```', '', response, flags=re.DOTALL) + def _strip_code_blocks(self, response: str) -> str: + """Remove code blocks, return plain text.""" + text = re.sub(r'```(?:python|py|sql|sqlite|sh|bash|tool_code).*?```', '', response, flags=re.DOTALL) return text.strip() async def process(self, command: Command, history: list[dict], memory_context: str = "") -> ThoughtResult: @@ -129,12 +104,10 @@ conn.close()''' max_tokens=self.max_context_tokens, fill_pct=self.context_fill_pct) response = await llm_call(self.model, messages) + if not response: + response = "[no response from LLM]" log.info(f"[thinker] response: {response[:200]}") - controls = self._parse_controls(response) - if controls: - await self.hud("controls", controls=controls) - tool_call = self._parse_tool_call(response) if tool_call: tool_name, code = tool_call @@ -147,22 +120,18 @@ conn.close()''' log.info(f"[thinker] tool output: {tool_output[:200]}") + # Second call: interpret tool output messages.append({"role": "assistant", "content": response}) messages.append({"role": "system", "content": f"Tool output:\n{tool_output}"}) - messages.append({"role": "user", "content": "Respond to the user based on the tool output. If showing data, include a ```controls block with a table. Be natural and concise."}) + messages.append({"role": "user", "content": "Respond to the user based on the tool output. Be natural and concise."}) messages = self.trim_context(messages) final = await llm_call(self.model, messages) - more_controls = self._parse_controls(final) - if more_controls: - controls.extend(more_controls) - await self.hud("controls", controls=more_controls) - - clean_text = self._strip_blocks(final) + clean_text = self._strip_code_blocks(final) await self.hud("decided", instruction=clean_text[:200]) return ThoughtResult(response=clean_text, tool_used=tool_name, - tool_output=tool_output, controls=controls) + tool_output=tool_output) - clean_text = self._strip_blocks(response) or response + clean_text = self._strip_code_blocks(response) or response await self.hud("decided", instruction="direct response (no tools)") - return ThoughtResult(response=clean_text, controls=controls) + return ThoughtResult(response=clean_text) diff --git a/agent/nodes/ui.py b/agent/nodes/ui.py new file mode 100644 index 0000000..5b089ac --- /dev/null +++ b/agent/nodes/ui.py @@ -0,0 +1,102 @@ +"""UI Node: renders interactive elements to the awareness panel workspace.""" + +import json +import logging + +from .base import Node +from ..llm import llm_call +from ..types import ThoughtResult + +log = logging.getLogger("runtime") + + +class UINode(Node): + name = "ui" + model = "google/gemini-2.0-flash-001" + max_context_tokens = 3000 + + SYSTEM = """You are the UI node of a cognitive agent runtime. + +You render interactive elements to a workspace panel in the browser. A separate Output node handles all text — you NEVER write prose, explanations, or messages. + +YOUR OUTPUT: A JSON array of UI elements, or [] if nothing to show. + +ELEMENT TYPES: + +label — display a value: + {{"type": "label", "id": "unique_id", "text": "Label Text", "value": "current value"}} + +button — clickable action: + {{"type": "button", "label": "Short Label", "action": "action_name", "payload": {{"key": "value"}}}} + +table — structured data: + {{"type": "table", "columns": ["col1", "col2"], "data": [{{"col1": "val", "col2": "val"}}]}} + +RULES: +- Output ONLY a valid JSON array. No text, no markdown, no explanation. +- Labels: show key values the user asked about or that resulted from tool execution. +- Buttons: offer clear follow-up actions. Keep labels 2-4 words. Action is snake_case. +- Tables: when tool output contains structured/tabular data. +- Return [] when the response is purely conversational with no actionable data. +- Every element you emit REPLACES the entire workspace. Include all elements that should be visible. + +CURRENT WORKSPACE: +{current_controls}""" + + def __init__(self, send_hud): + super().__init__(send_hud) + self.current_controls: list[dict] = [] + + async def process(self, thought: ThoughtResult, history: list[dict], + memory_context: str = "") -> list[dict]: + await self.hud("thinking", detail="deciding UI controls") + + # Show UI what's currently rendered + if self.current_controls: + ctrl_desc = json.dumps(self.current_controls, indent=2) + else: + ctrl_desc = "(empty)" + + messages = [ + {"role": "system", "content": self.SYSTEM.format(current_controls=ctrl_desc)}, + ] + + for msg in history[-6:]: + messages.append(msg) + + ctx = f"Thinker response: {thought.response}" + if thought.tool_used: + ctx += f"\n\nTool: {thought.tool_used}\nTool output:\n{thought.tool_output}" + messages.append({"role": "system", "content": ctx}) + messages.append({"role": "user", "content": "What UI elements should the workspace show now? Return JSON array."}) + + messages = self.trim_context(messages) + await self.hud("context", messages=messages, tokens=self.last_context_tokens, + max_tokens=self.max_context_tokens, fill_pct=self.context_fill_pct) + + raw = await llm_call(self.model, messages) + log.info(f"[ui] raw: {raw[:200]}") + + text = raw.strip() + if text.startswith("```"): + text = text.split("\n", 1)[1] if "\n" in text else text[3:] + if text.endswith("```"): + text = text[:-3] + text = text.strip() + + try: + controls = json.loads(text) + if not isinstance(controls, list): + controls = [] + except (json.JSONDecodeError, Exception) as e: + log.error(f"[ui] parse error: {e}, raw: {text[:200]}") + controls = [] + + if controls: + self.current_controls = controls + await self.hud("controls", controls=controls) + log.info(f"[ui] emitting {len(controls)} controls") + else: + await self.hud("decided", instruction="no controls needed") + + return controls diff --git a/agent/runtime.py b/agent/runtime.py index ef16f62..9d51701 100644 --- a/agent/runtime.py +++ b/agent/runtime.py @@ -1,5 +1,6 @@ """Runtime: wires all nodes together into a processing pipeline.""" +import asyncio import json import logging import time @@ -10,7 +11,7 @@ from fastapi import WebSocket from .types import Envelope, Command from .process import ProcessManager -from .nodes import SensorNode, InputNode, OutputNode, ThinkerNode, MemorizerNode +from .nodes import SensorNode, InputNode, OutputNode, ThinkerNode, MemorizerNode, UINode log = logging.getLogger("runtime") @@ -29,6 +30,7 @@ class Runtime: self.process_manager = ProcessManager(send_hud=self._send_hud) self.thinker = ThinkerNode(send_hud=self._send_hud, process_manager=self.process_manager) self.output_node = OutputNode(send_hud=self._send_hud) + self.ui_node = UINode(send_hud=self._send_hud) self.memorizer = MemorizerNode(send_hud=self._send_hud) self.sensor = SensorNode(send_hud=self._send_hud) self.sensor.start(get_memo_state=lambda: self.memorizer.state) @@ -55,36 +57,38 @@ class Runtime: log.error(f"trace write error: {e}") self._broadcast(trace_entry) - async def _stream_text(self, text: str): - """Stream pre-formed text to the client as deltas, simulating LLM output.""" - # Send in chunks to feel natural - chunk_size = 12 - for i in range(0, len(text), chunk_size): - chunk = text[i:i + chunk_size] - await self.ws.send_text(json.dumps({"type": "delta", "content": chunk})) - await self.ws.send_text(json.dumps({"type": "done"})) + async def _run_output_and_ui(self, thought, mem_ctx): + """Run Output and UI nodes in parallel. Returns (response_text, controls).""" + output_task = asyncio.create_task( + self.output_node.process(thought, self.history, self.ws, memory_context=mem_ctx)) + ui_task = asyncio.create_task( + self.ui_node.process(thought, self.history, memory_context=mem_ctx)) + + # Output streams to WS, UI returns controls — both run concurrently + response, controls = await asyncio.gather(output_task, ui_task) + + # Send controls after Output starts streaming (UI may finish first or after) + if controls: + await self.ws.send_text(json.dumps({"type": "controls", "controls": controls})) + + return response async def handle_action(self, action: str, data: dict = None): """Handle a structured UI action (button click etc.).""" - # Format as a structured message that Thinker can parse action_desc = f"ACTION: {action}" if data: action_desc += f" | data: {json.dumps(data)}" - # Add to history as a system-level event, not user speech self.history.append({"role": "user", "content": action_desc}) self.sensor.note_user_activity() sensor_lines = self.sensor.get_context_lines() mem_ctx = self.memorizer.get_context_block(sensor_lines=sensor_lines) - # Skip Input node — this isn't speech to perceive, go straight to Thinker + # Skip Input — this isn't speech, go straight to Thinker command = Command(instruction=f"User clicked UI button: {action}", source_text=action_desc) thought = await self.thinker.process(command, self.history, memory_context=mem_ctx) - if thought.controls: - await self.ws.send_text(json.dumps({"type": "controls", "controls": thought.controls})) - - response = await self.output_node.process(thought, self.history, self.ws, memory_context=mem_ctx) + response = await self._run_output_and_ui(thought, mem_ctx) self.history.append({"role": "assistant", "content": response}) await self.memorizer.update(self.history) @@ -112,12 +116,8 @@ class Runtime: thought = await self.thinker.process(command, self.history, memory_context=mem_ctx) - # Send controls inline (before response text) - if thought.controls: - await self.ws.send_text(json.dumps({"type": "controls", "controls": thought.controls})) - - # Output renders Thinker's reasoning into device-appropriate response - response = await self.output_node.process(thought, self.history, self.ws, memory_context=mem_ctx) + # Output (voice) and UI (screen) run in parallel + response = await self._run_output_and_ui(thought, mem_ctx) self.history.append({"role": "assistant", "content": response}) diff --git a/static/app.js b/static/app.js index f9b0879..dbe2c25 100644 --- a/static/app.js +++ b/static/app.js @@ -127,7 +127,11 @@ function connect() { scroll(msgs); } else if (data.type === 'done') { - if (currentEl) currentEl.classList.remove('streaming'); + if (currentEl) { + currentEl.classList.remove('streaming'); + // Render markdown now that streaming is complete + currentEl.innerHTML = renderMarkdown(currentEl.textContent); + } currentEl = null; } else if (data.type === 'controls') { @@ -256,7 +260,7 @@ function renderControls(controls) { btn.textContent = ctrl.label; btn.onclick = () => { if (ws && ws.readyState === 1) { - ws.send(JSON.stringify({ type: 'action', action: ctrl.action, data: ctrl.data || {} })); + ws.send(JSON.stringify({ type: 'action', action: ctrl.action, data: ctrl.payload || ctrl.data || {} })); addTrace('runtime', 'action', ctrl.action); } }; @@ -295,6 +299,12 @@ function renderControls(controls) { } container.appendChild(table); + } else if (ctrl.type === 'label') { + const lbl = document.createElement('div'); + lbl.className = 'control-label'; + lbl.innerHTML = '' + esc(ctrl.text || '') + '' + esc(String(ctrl.value ?? '')) + ''; + container.appendChild(lbl); + } else if (ctrl.type === 'process') { const card = document.createElement('div'); card.className = 'process-card ' + (ctrl.status || 'running'); @@ -355,6 +365,30 @@ function updateMeter(node, tokens, maxTokens, fillPct) { function scroll(el) { el.scrollTop = el.scrollHeight; } function esc(s) { const d = document.createElement('span'); d.textContent = s; return d.innerHTML; } + +function renderMarkdown(text) { + // Escape HTML first + let html = esc(text); + // Code blocks (``` ... ```) + html = html.replace(/```(\w*)\n([\s\S]*?)```/g, (_, lang, code) => '
' + code.trim() + '');
+ // Inline code
+ html = html.replace(/`([^`]+)`/g, '$1');
+ // Bold
+ html = html.replace(/\*\*(.+?)\*\*/g, '$1');
+ // Italic
+ html = html.replace(/\*(.+?)\*/g, '$1');
+ // Headers
+ html = html.replace(/^### (.+)$/gm, '