Split 1161-line monolith into agent/ package: auth, llm, types, process, runtime, api, and nodes/ (base, sensor, input, output, thinker, memorizer). No logic changes — pure structural split. uvicorn agent:app entrypoint unchanged. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
77 lines
2.5 KiB
Python
77 lines
2.5 KiB
Python
"""LLM helper: OpenRouter calls, token estimation, context fitting."""
|
|
|
|
import json
|
|
import logging
|
|
import os
|
|
from typing import Any
|
|
|
|
import httpx
|
|
|
|
log = logging.getLogger("runtime")
|
|
|
|
API_KEY = os.environ.get("OPENROUTER_API_KEY", "")
|
|
OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"
|
|
|
|
|
|
async def llm_call(model: str, messages: list[dict], stream: bool = False) -> Any:
    """Single LLM call via OpenRouter.

    Args:
        model: OpenRouter model slug (e.g. "anthropic/claude-3.5-sonnet").
        messages: Chat messages in OpenAI format ({"role", "content"} dicts).
        stream: When True, return ``(client, response)`` so the caller can
            consume the SSE stream; the caller then owns closing both.

    Returns:
        The completion text (str), a ``"[LLM error: ...]"`` placeholder string
        when the API reports an error, or an
        ``(httpx.AsyncClient, httpx.Response)`` pair when ``stream=True``.
    """
    headers = {"Authorization": f"Bearer {API_KEY}", "Content-Type": "application/json"}
    body = {"model": model, "messages": messages, "stream": stream}

    client = httpx.AsyncClient(timeout=60)
    if stream:
        try:
            resp = await client.send(
                client.build_request("POST", OPENROUTER_URL, headers=headers, json=body),
                stream=True,
            )
        except Exception:
            # Fix: don't leak the client if the request itself fails.
            await client.aclose()
            raise
        # Caller owns both objects and must close them after streaming.
        return client, resp

    try:
        resp = await client.post(OPENROUTER_URL, headers=headers, json=body)
        data = resp.json()
    finally:
        # Fix: the original closed the client only on success; close it on
        # request/JSON-decode failures as well.
        await client.aclose()

    if "choices" not in data:
        log.error(f"LLM error: {data}")
        return f"[LLM error: {data.get('error', {}).get('message', 'unknown')}]"
    return data["choices"][0]["message"]["content"]
|
|
|
|
|
|
def estimate_tokens(text: str) -> int:
    """Approximate the token count of *text* (heuristic: ~4 chars per token)."""
    # Bit shift by 2 == floor division by 4 for non-negative lengths.
    return len(text) >> 2
|
|
|
|
|
|
def fit_context(messages: list[dict], max_tokens: int, protect_last: int = 4) -> list[dict]:
    """Trim oldest messages (after system prompt) to fit token budget.

    Always keeps: system prompt(s) at start + last `protect_last` messages.
    Middle messages are dropped oldest-first until the estimated total fits.

    Args:
        messages: Chat messages ({"role", "content"} dicts), system first.
        max_tokens: Token budget, measured with estimate_tokens().
        protect_last: Count of trailing messages never trimmed in the
            normal path (0 means no trailing messages are protected).

    Returns:
        A new list fitting the budget as well as the heuristics allow
        (the original list object when `messages` is empty).
    """
    if not messages:
        return messages

    # Split off the leading run of system prompt(s); everything after the
    # first non-system message counts as trimmable history.
    system_msgs: list[dict] = []
    rest: list[dict] = []
    for m in messages:
        if not rest and m["role"] == "system":
            system_msgs.append(m)
        else:
            rest.append(m)

    # Fix: the original used rest[-protect_last:] / rest[:-protect_last],
    # which for protect_last == 0 protected EVERYTHING (rest[-0:] == rest).
    # An explicit split index handles 0 correctly and is identical otherwise.
    split = max(len(rest) - protect_last, 0)
    middle, protected = rest[:split], rest[split:]

    fixed_tokens = sum(estimate_tokens(m["content"]) for m in system_msgs + protected)

    if fixed_tokens >= max_tokens:
        # Even the must-keep set is over budget: last resort — drop from
        # position 1 (just after the first message) until it fits or only
        # two messages remain. NOTE: this can drop extra system messages
        # or normally-protected messages; deliberate fallback behavior.
        result = system_msgs + protected
        total = fixed_tokens
        while total > max_tokens and len(result) > 2:
            removed = result.pop(1)
            total -= estimate_tokens(removed["content"])
        return result

    # Normal path: keep as many of the NEWEST middle messages as fit,
    # walking backwards so older messages are the ones dropped.
    remaining = max_tokens - fixed_tokens
    kept_middle: list[dict] = []
    for m in reversed(middle):
        t = estimate_tokens(m["content"])
        if remaining - t < 0:
            break
        kept_middle.insert(0, m)
        remaining -= t

    return system_msgs + kept_middle + protected
|