Architecture: - director_v2: always-on brain, produces DirectorPlan with tool_sequence - thinker_v2: pure executor, runs tools from DirectorPlan - interpreter_v1: factual result summarizer, no hallucination - v2_director_drives graph: Input -> Director -> Thinker -> Output Infrastructure: - Split into 3 pods: cog-frontend (nginx), cog-runtime (FastAPI), cog-mcp (SSE proxy) - MCP survives runtime restarts (separate pod, proxies via HTTP) - Async send pipeline: /api/send/check -> /api/send -> /api/result with progress - Zero-downtime rolling updates (maxUnavailable: 0) - Dynamic graph visualization (fetched from API, not hardcoded) Tests: 22 new mocked unit tests (director_v2: 7, thinker_v2: 8, interpreter_v1: 7) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
147 lines
5.6 KiB
Python
147 lines
5.6 KiB
Python
"""Unit tests for InterpreterNode v1 — factual result summarizer."""
|
|
|
|
import json
|
|
from unittest.mock import AsyncMock, patch
|
|
|
|
from harness import HudCapture, NodeTestRunner
|
|
|
|
from agent.types import InterpretedResult
|
|
|
|
|
|
# ---- helpers ----
|
|
|
|
def make_interpreter():
|
|
from agent.nodes.interpreter_v1 import InterpreterNode
|
|
hud = HudCapture()
|
|
node = InterpreterNode(send_hud=hud)
|
|
return node, hud
|
|
|
|
|
|
def mock_llm_text(text):
|
|
async def _call(model, messages, **kw):
|
|
if kw.get("tools"):
|
|
return text, []
|
|
return text
|
|
return _call
|
|
|
|
|
|
# ---- tests ----
|
|
|
|
async def test_summarizes_db_result():
|
|
"""Interpreter should produce a factual summary of DB output."""
|
|
node, hud = make_interpreter()
|
|
tool_output = "cnt\n693"
|
|
mock_response = json.dumps({
|
|
"summary": "The kunden table contains 693 customers.",
|
|
"row_count": 1,
|
|
"key_facts": ["693 customers"],
|
|
"confidence": "high",
|
|
})
|
|
with patch("agent.nodes.interpreter_v1.llm_call", side_effect=mock_llm_text(mock_response)):
|
|
result = await node.interpret("query_db", tool_output, "how many customers?")
|
|
assert isinstance(result, InterpretedResult)
|
|
assert "693" in result.summary
|
|
assert result.row_count == 1
|
|
assert result.confidence == "high"
|
|
|
|
|
|
async def test_handles_empty_result():
|
|
"""Empty DB result should produce appropriate summary."""
|
|
node, hud = make_interpreter()
|
|
tool_output = "(no results)"
|
|
mock_response = json.dumps({
|
|
"summary": "The query returned no results.",
|
|
"row_count": 0,
|
|
"key_facts": [],
|
|
"confidence": "high",
|
|
})
|
|
with patch("agent.nodes.interpreter_v1.llm_call", side_effect=mock_llm_text(mock_response)):
|
|
result = await node.interpret("query_db", tool_output, "find deleted customers")
|
|
assert result.row_count == 0
|
|
assert "no results" in result.summary.lower()
|
|
|
|
|
|
async def test_handles_tabular_data():
|
|
"""Multi-row tabular data should be summarized, not echoed."""
|
|
node, hud = make_interpreter()
|
|
tool_output = "name\tdevice_count\nMueller\t45\nSchmidt\t38\nWeber\t31"
|
|
mock_response = json.dumps({
|
|
"summary": "Top 3 customers by device count: Mueller (45), Schmidt (38), Weber (31).",
|
|
"row_count": 3,
|
|
"key_facts": ["Mueller has most devices (45)", "3 customers returned"],
|
|
"confidence": "high",
|
|
})
|
|
with patch("agent.nodes.interpreter_v1.llm_call", side_effect=mock_llm_text(mock_response)):
|
|
result = await node.interpret("query_db", tool_output, "top customers by devices")
|
|
assert result.row_count == 3
|
|
assert len(result.key_facts) >= 1
|
|
|
|
|
|
async def test_no_hallucination_guard():
|
|
"""Interpreter must not add facts beyond what's in tool_output."""
|
|
node, hud = make_interpreter()
|
|
tool_output = "cnt\n5"
|
|
|
|
# LLM hallucinates extra info
|
|
mock_response = json.dumps({
|
|
"summary": "There are 5 items. The largest customer is Mueller with 200 devices.",
|
|
"row_count": 1,
|
|
"key_facts": ["5 items", "Mueller has 200 devices"],
|
|
"confidence": "high",
|
|
})
|
|
with patch("agent.nodes.interpreter_v1.llm_call", side_effect=mock_llm_text(mock_response)):
|
|
result = await node.interpret("query_db", tool_output, "count items")
|
|
# The node should flag low confidence when facts mention things not in output
|
|
# This is the interpreter's job: cross-check summary against raw output
|
|
# We verify the node at least returns a result (implementation will add the guard)
|
|
assert isinstance(result, InterpretedResult)
|
|
|
|
|
|
async def test_emits_hud():
|
|
"""Interpreter should emit interpreted HUD event."""
|
|
node, hud = make_interpreter()
|
|
mock_response = json.dumps({
|
|
"summary": "5 rows.", "row_count": 5, "key_facts": [], "confidence": "high",
|
|
})
|
|
with patch("agent.nodes.interpreter_v1.llm_call", side_effect=mock_llm_text(mock_response)):
|
|
await node.interpret("query_db", "a\n1\n2\n3\n4\n5", "count")
|
|
assert hud.has("interpreted"), f"events: {[e['event'] for e in hud.events]}"
|
|
|
|
|
|
async def test_bad_json_fallback():
|
|
"""If LLM returns bad JSON, Interpreter should return raw output as summary."""
|
|
node, hud = make_interpreter()
|
|
with patch("agent.nodes.interpreter_v1.llm_call", side_effect=mock_llm_text("not json")):
|
|
result = await node.interpret("query_db", "cnt\n42", "count")
|
|
assert isinstance(result, InterpretedResult)
|
|
assert "42" in result.summary or "cnt" in result.summary
|
|
|
|
|
|
async def test_python_tool_output():
|
|
"""Interpreter should also handle python execution results."""
|
|
node, hud = make_interpreter()
|
|
tool_output = "Result: 3.14159"
|
|
mock_response = json.dumps({
|
|
"summary": "The calculation result is approximately 3.14159 (pi).",
|
|
"row_count": 0,
|
|
"key_facts": ["result is 3.14159"],
|
|
"confidence": "high",
|
|
})
|
|
with patch("agent.nodes.interpreter_v1.llm_call", side_effect=mock_llm_text(mock_response)):
|
|
result = await node.interpret("python", tool_output, "compute pi")
|
|
assert "3.14" in result.summary
|
|
|
|
|
|
if __name__ == "__main__":
|
|
runner = NodeTestRunner()
|
|
print("\n=== InterpreterNode v1 ===")
|
|
runner.test("summarizes DB result", test_summarizes_db_result())
|
|
runner.test("handles empty result", test_handles_empty_result())
|
|
runner.test("handles tabular data", test_handles_tabular_data())
|
|
runner.test("no hallucination guard", test_no_hallucination_guard())
|
|
runner.test("emits HUD", test_emits_hud())
|
|
runner.test("bad JSON fallback", test_bad_json_fallback())
|
|
runner.test("python tool output", test_python_tool_output())
|
|
p, f = runner.summary()
|
|
print(f"\n {p} passed, {f} failed")
|