"""Unit tests for InterpreterNode v1 — factual result summarizer.""" import json from unittest.mock import AsyncMock, patch from harness import HudCapture, NodeTestRunner from agent.types import InterpretedResult # ---- helpers ---- def make_interpreter(): from agent.nodes.interpreter_v1 import InterpreterNode hud = HudCapture() node = InterpreterNode(send_hud=hud) return node, hud def mock_llm_text(text): async def _call(model, messages, **kw): if kw.get("tools"): return text, [] return text return _call # ---- tests ---- async def test_summarizes_db_result(): """Interpreter should produce a factual summary of DB output.""" node, hud = make_interpreter() tool_output = "cnt\n693" mock_response = json.dumps({ "summary": "The kunden table contains 693 customers.", "row_count": 1, "key_facts": ["693 customers"], "confidence": "high", }) with patch("agent.nodes.interpreter_v1.llm_call", side_effect=mock_llm_text(mock_response)): result = await node.interpret("query_db", tool_output, "how many customers?") assert isinstance(result, InterpretedResult) assert "693" in result.summary assert result.row_count == 1 assert result.confidence == "high" async def test_handles_empty_result(): """Empty DB result should produce appropriate summary.""" node, hud = make_interpreter() tool_output = "(no results)" mock_response = json.dumps({ "summary": "The query returned no results.", "row_count": 0, "key_facts": [], "confidence": "high", }) with patch("agent.nodes.interpreter_v1.llm_call", side_effect=mock_llm_text(mock_response)): result = await node.interpret("query_db", tool_output, "find deleted customers") assert result.row_count == 0 assert "no results" in result.summary.lower() async def test_handles_tabular_data(): """Multi-row tabular data should be summarized, not echoed.""" node, hud = make_interpreter() tool_output = "name\tdevice_count\nMueller\t45\nSchmidt\t38\nWeber\t31" mock_response = json.dumps({ "summary": "Top 3 customers by device count: Mueller (45), Schmidt (38), Weber (31).", "row_count": 3, "key_facts": ["Mueller has most devices (45)", "3 customers returned"], "confidence": "high", }) with patch("agent.nodes.interpreter_v1.llm_call", side_effect=mock_llm_text(mock_response)): result = await node.interpret("query_db", tool_output, "top customers by devices") assert result.row_count == 3 assert len(result.key_facts) >= 1 async def test_no_hallucination_guard(): """Interpreter must not add facts beyond what's in tool_output.""" node, hud = make_interpreter() tool_output = "cnt\n5" # LLM hallucinates extra info mock_response = json.dumps({ "summary": "There are 5 items. The largest customer is Mueller with 200 devices.", "row_count": 1, "key_facts": ["5 items", "Mueller has 200 devices"], "confidence": "high", }) with patch("agent.nodes.interpreter_v1.llm_call", side_effect=mock_llm_text(mock_response)): result = await node.interpret("query_db", tool_output, "count items") # The node should flag low confidence when facts mention things not in output # This is the interpreter's job: cross-check summary against raw output # We verify the node at least returns a result (implementation will add the guard) assert isinstance(result, InterpretedResult) async def test_emits_hud(): """Interpreter should emit interpreted HUD event.""" node, hud = make_interpreter() mock_response = json.dumps({ "summary": "5 rows.", "row_count": 5, "key_facts": [], "confidence": "high", }) with patch("agent.nodes.interpreter_v1.llm_call", side_effect=mock_llm_text(mock_response)): await node.interpret("query_db", "a\n1\n2\n3\n4\n5", "count") assert hud.has("interpreted"), f"events: {[e['event'] for e in hud.events]}" async def test_bad_json_fallback(): """If LLM returns bad JSON, Interpreter should return raw output as summary.""" node, hud = make_interpreter() with patch("agent.nodes.interpreter_v1.llm_call", side_effect=mock_llm_text("not json")): result = await node.interpret("query_db", "cnt\n42", "count") assert isinstance(result, InterpretedResult) assert "42" in result.summary or "cnt" in result.summary async def test_python_tool_output(): """Interpreter should also handle python execution results.""" node, hud = make_interpreter() tool_output = "Result: 3.14159" mock_response = json.dumps({ "summary": "The calculation result is approximately 3.14159 (pi).", "row_count": 0, "key_facts": ["result is 3.14159"], "confidence": "high", }) with patch("agent.nodes.interpreter_v1.llm_call", side_effect=mock_llm_text(mock_response)): result = await node.interpret("python", tool_output, "compute pi") assert "3.14" in result.summary if __name__ == "__main__": runner = NodeTestRunner() print("\n=== InterpreterNode v1 ===") runner.test("summarizes DB result", test_summarizes_db_result()) runner.test("handles empty result", test_handles_empty_result()) runner.test("handles tabular data", test_handles_tabular_data()) runner.test("no hallucination guard", test_no_hallucination_guard()) runner.test("emits HUD", test_emits_hud()) runner.test("bad JSON fallback", test_bad_json_fallback()) runner.test("python tool output", test_python_tool_output()) p, f = runner.summary() print(f"\n {p} passed, {f} failed")