# agent-runtime/test_nodes/test_thinker_v1.py

"""Unit tests for ThinkerNode v1 — reasoning, tool calls, audit."""

from harness import HudCapture, make_command, make_history, NodeTestRunner

from agent.nodes.thinker_v1 import ThinkerNode
from agent.process import ProcessManager


def make_thinker():
    """Build a ThinkerNode wired to a fresh HUD capture.

    A single HudCapture instance is handed as ``send_hud`` to both the
    ProcessManager and the ThinkerNode, so events from either land in the
    same capture object.

    Returns:
        A ``(node, hud)`` tuple: the ThinkerNode and the HudCapture it uses.
    """
    capture = HudCapture()
    thinker = ThinkerNode(
        send_hud=capture,
        process_manager=ProcessManager(send_hud=capture),
    )
    return thinker, capture


async def test_simple_response():
    """A simple question must yield some output: text, actions, or a tool use."""
    thinker, _hud = make_thinker()
    command = make_command(
        intent="question",
        topic="greeting",
        text="say hello to me",
    )
    result = await thinker.process(command, [], memory_context="")
    # Any one of the three output channels counts as "the node did something".
    assert result.response or result.actions or result.tool_used, \
        "no response, no actions, no tool used"


async def test_no_code_in_response():
    """Response text must not contain fenced code blocks.

    The node is expected to strip them (see ``_strip_code_blocks``); this test
    only checks the observable result: no triple-backtick fence in the reply.
    """
    node, _hud = make_thinker()
    cmd = make_command(intent="request", topic="create buttons", text="create two buttons: red and blue")
    thought = await node.process(cmd, [], memory_context="")
    # The response may be missing when the model answers only through tools or
    # actions (assumption -- confirm against ThinkerNode). Normalise it to an
    # empty string so the membership test and the slice in the failure message
    # cannot raise TypeError on None.
    response = thought.response or ""
    assert "```" not in response, f"code block leaked: {response[:100]}"


async def test_emits_tool_calls_for_buttons():
    """A button-creation request must produce actions, one of them labelled Alpha."""
    thinker, _hud = make_thinker()
    command = make_command(
        intent="request",
        topic="create buttons",
        text="create two buttons: Alpha and Beta",
    )
    result = await thinker.process(command, [], memory_context="")
    assert result.actions, "no actions emitted"

    # Compare case-insensitively; actions without a label count as "".
    labels = []
    for action in result.actions:
        labels.append(action.get("label", "").lower())
    found_alpha = any("alpha" in label for label in labels)
    assert found_alpha, f"no Alpha button: {labels}"


async def test_query_db_called():
    """A database question should go through query_db or at least emit a tool_call."""
    thinker, hud = make_thinker()
    command = make_command(
        intent="request",
        topic="database customers",
        text="how many customers are in the database?",
    )
    result = await thinker.process(command, [], memory_context="")
    # Accept either signal: the recorded tool name or a tool_call HUD event.
    used_db_tool = result.tool_used == "query_db" or hud.has("tool_call")
    assert used_db_tool, \
        f"tool_used={result.tool_used}, hud events: {[evt.get('event') for evt in hud.events]}"


async def test_s3_audit_code_without_tools():
    """Smoke-check the S3* audit path for code written without tool calls.

    Triggering the audit deterministically is hard, so this test makes no
    assertions: it runs the node and reports which path was taken. Both
    outcomes (audit fired, or tools were called directly) are acceptable.
    """
    thinker, hud = make_thinker()
    command = make_command(
        intent="request",
        topic="create machine",
        text="create a state machine called test with states a and b",
    )
    result = await thinker.process(command, [], memory_context="")

    # An s3_audit HUD event means the audit fired on this run.
    audits = hud.find("s3_audit")
    if audits:
        detail = audits[0].get('detail', '')
        print(f"    S3* fired: {detail[:80]}")
        return
    # Otherwise report machine ops, if the model used the tools directly.
    if result.machine_ops:
        op_count = len(result.machine_ops)
        print(f"    Tools called directly: {op_count} machine ops")


async def test_decided_hud_emitted():
    """Processing a command must leave a 'decided' event in the HUD capture."""
    thinker, hud = make_thinker()
    command = make_command(intent="question", text="hello")
    # Only the HUD side effect matters here; the returned thought is ignored.
    await thinker.process(command, [], memory_context="")
    decided_seen = hud.has("decided")
    assert decided_seen, f"no decided event: {[evt.get('event') for evt in hud.events]}"


if __name__ == "__main__":
    # Script entry point: run every test through the harness runner and
    # print a pass/fail summary.
    runner = NodeTestRunner()
    print("\n=== ThinkerNode v1 ===")
    cases = (
        ("simple response", test_simple_response),
        ("no code in response", test_no_code_in_response),
        ("emits tool calls for buttons", test_emits_tool_calls_for_buttons),
        ("query_db called for DB question", test_query_db_called),
        ("S3* audit mechanism", test_s3_audit_code_without_tools),
        ("decided HUD emitted", test_decided_hud_emitted),
    )
    for title, case in cases:
        # Create each coroutine right before handing it to the runner.
        runner.test(title, case())
    passed, failed = runner.summary()
    print(f"\n  {passed} passed, {failed} failed")