briefcase-python 2.4.1

"""
PageIndex MCP response observer for Briefcase.

Post-processes decision records from A1/A2 handlers that were generated when
PageIndex was accessed via MCP (Model Context Protocol).

Background
----------
When a LangChain agent or OpenAI Agent calls a PageIndex MCP tool, the call
appears as a regular tool invocation in the existing handlers:
  - A1 (BriefcaseLangChainHandler): recorded as decision_type="tool" with
    function_name from serialized["name"], outputs["output"] as raw string
  - A2 (OpenAIAgentsTracer): recorded as FunctionSpanData with tool_name, input, output

The MCP server returns a JSON-encoded string. This observer:
  1. Detects PageIndex MCP tool results (by tool name keywords or response structure)
  2. Parses the JSON output to extract tree metadata and doc_id
  3. Enriches the decision record in-place with pageindex.* attributes

MCP tool detection (applied in order):
  - Tool/function name contains any of: "pageindex", "page_index", "pi_search",
    "pi_chat", "pi_retrieve" (case-insensitive)
  - OR the output JSON contains "doc_id" or "retrieval_id" keys
  - OR the output JSON contains a top-level "nodes" array or a "tree" object
    (tree structure markers)

This component does NOT require pageindex to be installed — it only parses JSON.

Usage
-----
    from briefcase.integrations.frameworks import PageIndexMCPObserver

    observer = PageIndexMCPObserver()

    # Enrich a single record in-place
    enriched = observer.observe(record)  # True if enriched, False otherwise

    # Post-process all records from a handler
    for record in handler.get_decisions_as_dicts():
        observer.observe(record)
"""

import json
import logging
from typing import Any, Dict, Optional

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Keywords that identify a tool as a PageIndex MCP tool. Each keyword is tested
# as a substring of the lower-cased tool name, so the match is case-insensitive.
_PAGEINDEX_TOOL_KEYWORDS = (
    "pageindex",
    "page_index",
    "pi_search",
    "pi_chat",
    "pi_retrieve",
)

# Top-level keys whose presence in a parsed JSON response marks it as a
# PageIndex server response, even when the tool name matched no keyword.
_PAGEINDEX_RESPONSE_KEYS = ("doc_id", "retrieval_id")


# ── Main observer class ───────────────────────────────────────────────────────

class PageIndexMCPObserver:
    """
    Post-processor that detects PageIndex MCP tool call results in decision records
    and enriches them with pageindex.* tree metadata attributes.

    This class does NOT require the pageindex package to be installed.
    It operates purely on the JSON string content of MCP tool responses.

    Attributes added to matching records (in-place):
        pageindex.doc_id              (str)
        pageindex.retrieval_method    (str) — always "tree_search"
        pageindex.tree.depth          (int)
        pageindex.tree.nodes_visited  (int)
        pageindex.tree.path           (str)
        pageindex.tree.backtrack_count (int) — always 0
    """

    # Record keys that may carry the tool name, in lookup priority order.
    _NAME_KEYS = ("function_name", "tool_name", "name")

    def __init__(self) -> None:
        self._observed_count: int = 0
        self._enriched_count: int = 0

    # ── Public API ─────────────────────────────────────────────────────────────

    def observe(self, record: Dict[str, Any]) -> bool:
        """
        Inspect a decision record and enrich with pageindex.* attributes if it
        appears to be a PageIndex MCP tool call.

        Observation is best-effort: any exception raised while inspecting or
        enriching the record is logged at DEBUG level and reported as False
        instead of propagating to the caller.

        Args:
            record: Decision record dict (mutated in-place if PageIndex detected).

        Returns:
            True if the record was identified as a PageIndex MCP call and enriched.
        """
        try:
            self._observed_count += 1
            if not self._is_pageindex_record(record):
                return False
            self._enrich_record(record)
            self._enriched_count += 1
            return True
        except Exception:
            # Never let a malformed record break the caller's pipeline, but
            # leave a trace so the failure can be diagnosed.
            logger.debug(
                "PageIndex MCP observation failed; skipping record",
                exc_info=True,
            )
            return False

    def is_pageindex_mcp_response(self, record: Dict[str, Any]) -> bool:
        """
        Check whether a decision record looks like a PageIndex MCP tool call,
        without modifying the record.

        Returns False (after logging at DEBUG level) if the record cannot be
        inspected.
        """
        try:
            return self._is_pageindex_record(record)
        except Exception:
            logger.debug("PageIndex MCP detection failed", exc_info=True)
            return False

    @property
    def observed_count(self) -> int:
        """Total number of records passed to observe()."""
        return self._observed_count

    @property
    def enriched_count(self) -> int:
        """Total number of records that were identified and enriched."""
        return self._enriched_count

    # ── Internal detection ──────────────────────────────────────────────────────

    @staticmethod
    def _parse_output(record: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """
        Return the record's tool output decoded as a JSON object, or None when
        the output is missing, is not valid JSON, or is not a JSON object.
        """
        output_str = _extract_output_str(record)
        if not output_str:
            return None
        parsed = _try_parse_json(output_str)
        return parsed if isinstance(parsed, dict) else None

    def _is_pageindex_record(self, record: Dict[str, Any]) -> bool:
        """
        Return True if the record appears to originate from a PageIndex MCP tool call.
        Checks tool name first, then falls back to output content inspection.
        """
        # 1. Name-based detection (cheapest check first). Only non-empty string
        #    values are considered, so a null or non-string name field no longer
        #    aborts detection before the content-based checks can run.
        name = ""
        for key in self._NAME_KEYS:
            candidate = record.get(key)
            if isinstance(candidate, str) and candidate:
                name = candidate.lower()
                break

        if any(kw in name for kw in _PAGEINDEX_TOOL_KEYWORDS):
            return True

        # 2. Content-based detection: parse output as JSON
        parsed = self._parse_output(record)
        if parsed is None:
            return False

        # doc_id or retrieval_id → PageIndex response
        if any(key in parsed for key in _PAGEINDEX_RESPONSE_KEYS):
            return True

        # Flat tree at root level ("nodes" list) or nested tree under "tree"
        return isinstance(parsed.get("nodes"), list) or isinstance(
            parsed.get("tree"), dict
        )

    def _enrich_record(self, record: Dict[str, Any]) -> None:
        """Add pageindex.* attributes to the record in-place."""
        parsed = self._parse_output(record)

        # Compute everything before touching the record so that a failure
        # cannot leave it partially enriched.
        # Extract doc_id from output, then fall back to inputs
        doc_id = _extract_doc_id(parsed, record)

        # Extract tree structure and compute metadata
        tree_meta = _extract_tree_metadata(parsed)

        record["pageindex.doc_id"] = doc_id
        record["pageindex.retrieval_method"] = "tree_search"
        record["pageindex.tree.depth"] = tree_meta.get("depth", 0)
        record["pageindex.tree.nodes_visited"] = tree_meta.get("nodes_visited", 0)
        record["pageindex.tree.path"] = tree_meta.get("path", "")
        record["pageindex.tree.backtrack_count"] = 0


# ── Private helpers ──────────────────────────────────────────────────────────

def _extract_output_str(record: Dict[str, Any]) -> str:
    """
    Return the raw tool output stored in a decision record.

    Looks inside ``record["outputs"]`` and returns the first truthy value found
    under ``"output"`` and then ``"content"``. Returns ``""`` when ``outputs``
    is missing, empty, not a dict, or holds neither value. The value is handed
    back as stored, without conversion.
    """
    container = record.get("outputs")
    if not container or not isinstance(container, dict):
        return ""

    for key in ("output", "content"):
        value = container.get(key, "")
        if value:
            return value
    return ""


def _try_parse_json(text: str) -> Optional[Any]:
    """Decode ``text`` as JSON, returning None when it cannot be decoded."""
    try:
        decoded = json.loads(text)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError subclass, so malformed JSON is
        # covered here; TypeError covers inputs that are not str/bytes.
        return None
    return decoded


def _extract_doc_id(
    parsed: Optional[Dict[str, Any]], record: Dict[str, Any]
) -> str:
    """
    Resolve the document id for a record as a string.

    The decoded tool output (``parsed``) is consulted first. If it carries no
    truthy ``"doc_id"``, the record's ``inputs["input"]`` is decoded as JSON
    and checked for the same key. Returns ``""`` when neither source has one.
    """
    # Preferred source: the decoded tool output.
    from_output = parsed.get("doc_id", "") if isinstance(parsed, dict) else ""
    if from_output:
        return str(from_output)

    # Fallback source: the JSON-encoded tool input.
    inputs = record.get("inputs") or {}
    if not isinstance(inputs, dict):
        return ""

    decoded_input = _try_parse_json(inputs.get("input", "") or "")
    if not isinstance(decoded_input, dict):
        return ""

    from_input = decoded_input.get("doc_id", "")
    return str(from_input) if from_input else ""


def _compute_tree_depth(node: Dict[str, Any], current_depth: int = 0) -> int:
    """
    Return the depth of the deepest node below ``node``.

    Children are read from the ``"nodes"`` key. ``current_depth`` is the depth
    assigned to ``node`` itself, so a node without children yields
    ``current_depth`` unchanged.
    """
    subtrees = node.get("nodes", [])
    if subtrees:
        child_level = current_depth + 1
        return max(_compute_tree_depth(sub, child_level) for sub in subtrees)
    return current_depth


def _count_tree_nodes(node: Dict[str, Any]) -> int:
    """
    Recursively count all nodes in the subtree rooted at ``node`` (root included).

    Children are read from the ``"nodes"`` key. A missing key, an empty list
    and ``None`` (JSON ``null``) all mean "no children", which matches how
    ``_compute_tree_depth`` treats a falsy ``"nodes"`` value.
    """
    # `or []` guards against `"nodes": null`, which would otherwise raise
    # TypeError when iterated.
    children = node.get("nodes") or []
    return 1 + sum(_count_tree_nodes(child) for child in children)


def _build_tree_path(node: Dict[str, Any], max_sections: int = 3) -> str:
    """
    Build a short human-readable summary path for a tree.

    The result is the root label followed by the labels of up to
    ``max_sections`` of its direct children, joined with ``" > "``. When more
    children exist, a trailing ``"... (N more)"`` part reports how many were
    left out. A label is the node's ``"title"``, else its ``"node_id"``, else
    the placeholder ``"root"`` / ``"node"``.

    Args:
        node: Root node; children are read from its ``"nodes"`` key. A missing
            key or ``None`` (JSON ``null``) means the root has no children.
        max_sections: Maximum number of direct children to list.

    Returns:
        The joined path string.
    """
    root_title = node.get("title") or node.get("node_id") or "root"
    parts = [str(root_title)]

    # `or []` guards against `"nodes": null`, which cannot be sliced.
    children = node.get("nodes") or []
    parts.extend(
        str(child.get("title") or child.get("node_id") or "node")
        for child in children[:max_sections]
    )

    omitted = len(children) - max_sections
    if omitted > 0:
        parts.append(f"... ({omitted} more)")

    return " > ".join(parts)


def _extract_tree_metadata(parsed: Optional[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Derive tree depth, node count, and summary path from a parsed MCP response.

    The tree root is the dict under ``"tree"`` when present; otherwise the
    response itself when it has a top-level ``"nodes"`` list.

    Returns:
        A dict with ``"depth"``, ``"nodes_visited"`` and ``"path"`` keys, or an
        empty dict when ``parsed`` is not a dict, holds no tree, or the tree
        cannot be processed (callers supply their own defaults).
    """
    if not isinstance(parsed, dict):
        return {}

    try:
        nested = parsed.get("tree")
        if isinstance(nested, dict):
            root = nested
        elif isinstance(parsed.get("nodes"), list):
            # Root-level flat tree: the response object itself is the root.
            root = parsed
        else:
            return {}

        metadata: Dict[str, Any] = {}
        metadata["depth"] = _compute_tree_depth(root)
        metadata["nodes_visited"] = _count_tree_nodes(root)
        metadata["path"] = _build_tree_path(root)
        return metadata
    except Exception:
        # Best-effort: a malformed tree yields no metadata rather than an error.
        return {}