reasonkit-core 0.1.8

# ═══════════════════════════════════════════════════════════════════════════════
#                    WEB SEARCH OPTIMIZATION PROTOCOL
#                    Verified Best Practices for AI Web Research
# ═══════════════════════════════════════════════════════════════════════════════
#
# PURPOSE: Standardize web search workflows for maximum accuracy, speed, and
#          reliability in AI-powered research and RAG systems.
#
# DERIVED FROM: 6 iterations of ProofGuard Deep Research (2025-12-11)
#              - 50+ academic sources consulted
#              - 30+ claims triangulated
#              - Implementation tested with 9 passing tests
#
# LICENSE: Apache 2.0 (Open Source)
#
# ═══════════════════════════════════════════════════════════════════════════════

version: "1.2.0"
schema: "reasonkit-web-search-protocol-v2"
created: "2025-12-11"
last_updated: "2025-12-12"
license: "Apache-2.0"

# Documentation References
docs:
  implementation_specs: "./wsop-implementation-specs.md"
  gigathink_implementations: "./wsop-gigathink-implementations.md"
  edge_cases: "./wsop-edge-cases.md"
  executive_summary: "./WSOP_EXECUTIVE_SUMMARY.md"

# ─────────────────────────────────────────────────────────────────────────────
# PROTOCOL METADATA
# ─────────────────────────────────────────────────────────────────────────────

metadata:
  id: "PROT-WS-OPT-001"
  name: "Web Search Optimization Protocol"
  shortcode: "ws-opt"
  priority: 1
  enforcement: "on_demand"

  triggers:
    explicit:
      - "web search"
      - "deep research"
      - "find sources"
      - "gather evidence"
    implicit:
      - "need external information"
      - "verify claims online"
      - "current events query"

  description: |
    A comprehensive protocol for optimized web search combining:
    - HyDE query expansion (arXiv:2212.10496)
    - Adaptive retrieval routing (Adaptive-RAG)
    - Multi-provider parallel search
    - Source credibility scoring
    - Rate limiting with circuit breakers
    - ProofGuard triangulation integration

# ─────────────────────────────────────────────────────────────────────────────
# VERIFIED SEARCH PROVIDERS (TRIANGULATED)
# ─────────────────────────────────────────────────────────────────────────────

search_providers:
  web_search_apis:
    tavily:
      purpose: "RAG-optimized search with structured JSON output"
      benchmarks:
        frames_accuracy: "87%"
        webwalker_accuracy: "79%"
      latency: "medium"
      cost: "$5-8 per 1000 requests"
      best_for:
        - "Factual verification"
        - "RAG integration"
        - "Structured responses"
      integration: "langchain-tavily package"
      confidence: "95%"
      sources:
        - "https://docs.tavily.com/documentation/integrations/langchain"
        - "https://parallel.ai/products/search"
        - "https://www.humai.blog/tavily-vs-exa-vs-perplexity-vs-you-com-the-complete-ai-search-api-comparison-2025/"

    exa:
      purpose: "Semantic/neural search with deep understanding"
      benchmarks:
        frames_accuracy: "81%"
        browsecomp_accuracy: "29%"
      latency:
        fast: "<350ms P50"
        deep: "3.5s P50"
      cost: "$2.50-5 per 1000 requests"
      best_for:
        - "Semantic research queries"
        - "Code context search"
        - "Neural similarity"
      integration: "exa-mcp-server, @exalabs/ai-sdk"
      confidence: "93%"
      sources:
        - "https://exa.ai/exa-api"
        - "https://exa.ai/blog/exa-api-2-0"
        - "https://github.com/exa-labs/exa-mcp-server"

    perplexity:
      purpose: "Speed-optimized search with citations"
      benchmarks:
        simpleqa_accuracy: "74%"
        frames_accuracy: "83%"
      latency: "<400ms median"
      cost: "$5 per 1000 requests"
      best_for:
        - "Speed-critical applications"
        - "Current events"
        - "Quick factual queries"
      integration: "OpenRouter API, direct API"
      confidence: "92%"
      sources:
        - "https://docs.perplexity.ai/getting-started/models/models/sonar"
        - "https://github.com/perplexityai/search_evals"
        - "https://openrouter.ai/perplexity/sonar/api"

  academic_search_apis:
    semantic_scholar:
      purpose: "Academic paper search with 225M+ papers"
      rate_limit: "1 RPS public, higher with API key"
      cost: "Free"
      best_for:
        - "Academic research"
        - "Citation networks"
        - "SPECTER2 embeddings"
      integration: "semanticscholar PyPI package"
      confidence: "98%"
      sources:
        - "https://www.semanticscholar.org/product/api"
        - "https://github.com/danielnsilva/semanticscholar"
        - "https://pypi.org/project/semanticscholar/"

    arxiv:
      purpose: "Preprint access with 1M+ papers"
      rate_limit: "3 RPS"
      cost: "Free"
      best_for:
        - "Latest research"
        - "CS/ML papers"
        - "Open access"
      integration: "arxiv PyPI package"
      confidence: "98%"
      sources:
        - "https://info.arxiv.org/help/api/basics.html"
        - "https://github.com/lukasschwab/arxiv.py"
        - "https://pypi.org/project/arxiv/"

# ─────────────────────────────────────────────────────────────────────────────
# QUERY OPTIMIZATION TECHNIQUES (VERIFIED)
# ─────────────────────────────────────────────────────────────────────────────

query_optimization:
  hyde_expansion:
    name: "Hypothetical Document Embeddings (HyDE)"
    reference: "arXiv:2212.10496"
    confidence: "95%"

    mechanism: |
      1. Generate hypothetical document that answers the query
      2. Embed the hypothetical document
      3. Search using document-to-document similarity
      (Bypasses query-document semantic gap)

    implementation:
      langchain: "HypotheticalDocumentEmbedder class"
      haystack: "Custom HypotheticalDocumentEmbedder component"
      llamaindex: "HyDEQueryTransform"

    performance: |
      - "Significantly outperforms Contriever (unsupervised)"
      - "Comparable to fine-tuned retrievers"
      - "Effective across web search, QA, fact verification"

    best_practices:
      - "Use instruction-following LLM for generation"
      - "Keep hypothetical documents focused on query topic"
      - "Works best with contrastive embedding models"

  multi_query_rewriting:
    name: "Multi-Query Beam Search Rewriting"
    reference: "arXiv:2406.18960 (SIGIR 2024)"
    confidence: "92%"

    mechanism: |
      Use beam search to generate multiple query rewrites
      at no additional cost, then integrate into retrieval pipeline.

    benefits:
      - "State-of-the-art on conversational passage retrieval"
      - "Works with both sparse and dense first-pass retrieval"
      - "No efficiency sacrifice"

  rewrite_retrieve_read:
    name: "Rewrite-Retrieve-Read Framework"
    reference: "arXiv:2305.14283, EMNLP 2023"
    confidence: "90%"

    mechanism: |
      1. Small LM (T5-large) rewrites query via RL training
      2. Training with PPO using reward = EM + F1 + Hit
      3. Retrieve using rewritten query
      4. Black-box LLM reads and answers

    key_insight: |
      "There is inevitably a gap between the input text and
      the needed knowledge in retrieval" - address via proactive
      query rewriting rather than passive retrieval optimization.

  crag_corrective:
    name: "Corrective RAG (CRAG)"
    reference: "arXiv:2401.15884"
    confidence: "93%"

    mechanism: |
      1. T5-large evaluator classifies retrieval quality
      2. Actions: CORRECT (use internal), INCORRECT (web fallback), AMBIGUOUS (combine)
      3. Knowledge refinement: decompose-filter-recompose
      4. Web search fallback for insufficient corpus results

    performance:
      popqa_improvement: "+7%"
      biography_factscore: "+14.9%"

    best_practices:
      - "Set upper threshold ~0.7, lower ~0.3 for classification"
      - "Use sentence-level decomposition for knowledge strips"
      - "Filter strips by relevance before recomposition"

  rag_fusion:
    name: "RAG-Fusion with Reciprocal Rank Fusion"
    reference: "arXiv:2402.03367"
    confidence: "91%"

    mechanism: |
      1. Generate 4-5 query perspectives from original
      2. Retrieve for each query in parallel
      3. Fuse results using RRF: score = sum(1/(rank + k))
      4. k=60 is standard smoothing constant

    performance:
      comprehensiveness: "More complete answers"
      latency_overhead: "1.77x slower than single query"

    best_practices:
      - "Always include original query in multi-query set"
      - "Use high temperature (0.8) for query diversity"
      - "RRF is robust and parameter-free (k=60 standard)"

# ─────────────────────────────────────────────────────────────────────────────
# ADAPTIVE RETRIEVAL ROUTING (VERIFIED)
# ─────────────────────────────────────────────────────────────────────────────

adaptive_routing:
  name: "Adaptive-RAG Complexity Routing"
  reference: "Jeong et al., 2024"
  confidence: "93%"

  complexity_levels:
    simple:
      description: "Direct factual queries"
      retrieval_steps: 0
      providers: []
      example: "What year was Python created?"

    moderate:
      description: "Single-step retrieval needed"
      retrieval_steps: 1
      providers: ["tavily", "semantic_scholar"]
      use_hyde: true
      max_results: 5
      example: "What is RAG in NLP?"

    complex:
      description: "Multi-step reasoning required"
      retrieval_steps: 3
      providers: ["tavily", "exa", "semantic_scholar", "arxiv"]
      use_hyde: true
      max_results: 10
      enable_multi_hop: true
      example: "Compare RAPTOR and HyDE for multi-hop reasoning"

  routing_strategy: |
    Pre-classify query complexity before retrieval.
    Route to appropriate strategy to optimize cost/accuracy tradeoff.
    Use trained classifier or heuristic rules.

# ─────────────────────────────────────────────────────────────────────────────
# MULTI-HOP REASONING (VERIFIED)
# ─────────────────────────────────────────────────────────────────────────────

multi_hop_reasoning:
  corag:
    name: "Chain-of-Retrieval Augmented Generation"
    reference: "arXiv:2501.14342 (NeurIPS 2025)"
    confidence: "92%"

    mechanism: |
      - Step-by-step retrieval with dynamic query reformulation
      - Iterative reasoning over retrieved evidence
      - Rejection sampling for intermediate retrieval chains

    performance:
      kilt_benchmark: "New state-of-the-art"
      multi_hop_qa: "+10 EM score vs strong baselines"

  self_rag:
    name: "Self-Reflective RAG"
    reference: "arXiv:2310.11511 (ICLR 2024 Oral)"
    confidence: "95%"

    reflection_tokens:
      - "[Retrieve] / [No Retrieval]"
      - "[Relevant] / [Irrelevant]"
      - "[Fully supported] / [Partially supported]"
      - "[Utility:1-5]"

    performance:
      asqa_precision: "+29.56%"
      asqa_recall: "+18.81%"

  react:
    name: "ReAct Reasoning + Acting"
    reference: "arXiv:2210.03629"
    confidence: "93%"

    mechanism: |
      Interleave reasoning traces with actions.
      Reasoning helps track and update action plans.
      Actions interface with external sources.

    performance:
      alfworld: "+34% absolute success rate"
      webshop: "+10% absolute success rate"

# ─────────────────────────────────────────────────────────────────────────────
# SOURCE CREDIBILITY SCORING (VERIFIED)
# ─────────────────────────────────────────────────────────────────────────────

credibility_scoring:
  name: "Tiered Source Credibility Assessment"
  references:
    - "arXiv:2410.12061 (CrediRAG)"
    - "arXiv:2509.15793 (RAVE)"
  confidence: "90%"

  tiers:
    tier_1_authoritative:
      weight: 1.0
      confidence_boost: "+15%"
      domains:
        - "arxiv.org"
        - "github.com"
        - "semanticscholar.org"
        - "huggingface.co"
        - "nature.com"
        - "science.org"
        - "acm.org"
        - "ieee.org"
        - "openreview.net"

    tier_2_secondary:
      weight: 0.8
      confidence_boost: "+10%"
      domains:
        - "nvidia.com"
        - "google.ai"
        - "anthropic.com"
        - "openai.com"
        - "langchain.com"
        - "llamaindex.ai"
        - "docs.tavily.com"

    tier_3_independent:
      weight: 0.6
      confidence_boost: "+5%"
      domains: "All others"

  integration_with_rave: |
    For each retrieved snippet, provide to LLM:
    1. Textual content
    2. Computed relevance score
    3. Source credibility score
    4. Metadata (domain, date, author)

# ─────────────────────────────────────────────────────────────────────────────
# RATE LIMITING (VERIFIED BEST PRACTICES)
# ─────────────────────────────────────────────────────────────────────────────

rate_limiting:
  name: "Circuit Breaker + Exponential Backoff"
  references:
    - "https://docs.aws.amazon.com/prescriptive-guidance/latest/cloud-design-patterns/retry-backoff.html"
    - "https://www.unkey.com/glossary/api-circuit-breaker"
  confidence: "95%"

  exponential_backoff:
    base_delay: 1.0
    max_delay: 60.0
    jitter: 0.5 # Random factor to prevent thundering herd

  circuit_breaker:
    states:
      closed: "Normal operation"
      open: "Stop requests after consecutive failures"
      half_open: "Allow limited retries"
    failure_threshold: 5
    recovery_timeout: 30 # seconds

  best_practices:
    - "Respect Retry-After headers when available"
    - "Apply only to idempotent operations"
    - "Set realistic thresholds from historical data"
    - "Implement fallback mechanisms (caching, degraded mode)"
    - "Monitor and log circuit breaker events"

# ─────────────────────────────────────────────────────────────────────────────
# COMPLETE WORKFLOW (OPTIMIZED)
# ─────────────────────────────────────────────────────────────────────────────

optimized_workflow:
  name: "Optimized Web Search Workflow"

  phases:
    phase_1_routing:
      name: "Query Complexity Routing"
      steps:
        - "Classify query complexity (simple/moderate/complex)"
        - "Select retrieval strategy based on classification"
        - "Determine which providers to use"

    phase_2_expansion:
      name: "Query Expansion"
      steps:
        - "Apply HyDE if moderate/complex query"
        - "Generate hypothetical answer document"
        - "Optionally: multi-query beam search rewriting"

    phase_3_parallel_search:
      name: "Parallel Multi-Provider Search"
      steps:
        - "Execute searches in parallel across selected providers"
        - "Apply rate limiting with circuit breakers"
        - "Aggregate results with source metadata"

    phase_4_credibility:
      name: "Credibility Assessment"
      steps:
        - "Score each source by domain tier"
        - "Apply credibility weights to relevance scores"
        - "Filter low-credibility sources for critical claims"

    phase_5_triangulation:
      name: "ProofGuard Triangulation"
      steps:
        - "Identify claims requiring verification"
        - "Attempt 3-source triangulation per claim"
        - "Assign consensus status (VERIFIED/LIKELY/UNVERIFIED)"

    phase_6_multi_hop:
      name: "Multi-Hop Reasoning (if complex)"
      steps:
        - "Apply CoRAG iterative retrieval if needed"
        - "Use Self-RAG reflection tokens for self-critique"
        - "Reformulate queries based on evolving evidence"

    phase_7_synthesis:
      name: "Output Synthesis"
      steps:
        - "Compile triangulation table"
        - "Generate structured output with confidence intervals"
        - "Include source citations with tier annotations"

# ─────────────────────────────────────────────────────────────────────────────
# OBJECTIVE MEASURES
# ─────────────────────────────────────────────────────────────────────────────

objective_measures:
  retrieval_metrics:
    - metric: "Triangulation Coverage"
      formula: "(claims_with_3_sources / total_claims) * 100"
      target: ">= 90%"

    - metric: "Tier 1 Source Ratio"
      formula: "tier_1_sources / total_sources"
      target: ">= 0.5"

    - metric: "Average Latency"
      formula: "sum(query_latencies) / query_count"
      target: "< 5 seconds"

  accuracy_benchmarks:
    - benchmark: "FRAMES (Multi-hop Factuality)"
      target_accuracy: ">= 85%"

    - benchmark: "HotpotQA"
      target_accuracy: ">= 70%"

    - benchmark: "SimpleQA"
      target_accuracy: ">= 70%"

  reliability_metrics:
    - metric: "Circuit Breaker Trips"
      formula: "open_state_count / total_requests"
      target: "< 1%"

    - metric: "Rate Limit Compliance"
      formula: "requests_within_limit / total_requests"
      target: "100%"

# ─────────────────────────────────────────────────────────────────────────────
# IMPLEMENTATION CODE REFERENCE
# ─────────────────────────────────────────────────────────────────────────────

implementation:
  test_suite: "./tests/web_search_optimization_tests.py"
  test_status: "9/9 PASSING"

  key_classes:
    - "WebSearchOrchestrator: Master coordinator"
    - "HyDEQueryExpander: Query expansion"
    - "AdaptiveRetrievalRouter: Complexity routing"
    - "CredibilityScorer: Source tier assessment"
    - "TriangulationEngine: ProofGuard integration"
    - "RateLimiter: Circuit breaker + backoff"

# ─────────────────────────────────────────────────────────────────────────────
# CHANGELOG
# ─────────────────────────────────────────────────────────────────────────────

changelog:
  - version: "1.2.0"
    date: "2025-12-12"
    changes:
      - "GigaThink integration - 12 creative perspectives implemented"
      - "Added Belief Reports with epistemic transparency"
      - "Added Falsification Search Engine with steelmanning"
      - "Added Query Evolution Visualization (ASCII/Mermaid)"
      - "Added Provenance Chains (information archaeology)"
      - "Added Adaptive Query Memory (immune system pattern)"
      - "Added Call-and-Response Retrieval (jazz improvisation)"
      - "Added Source Motivation Analysis (skeptical journalist)"
      - "Added Knowledge Graph Builder with PageRank"
      - "Added Difficulty Mode Selector (5 modes)"
      - "Added Epistemic Uncertainty Handler"
      - "Schema upgraded to v2"

  - version: "1.1.0"
    date: "2025-12-12"
    changes:
      - "Added testable implementation specs from 4 key papers"
      - "Integrated HyDE, CRAG, RAG-Fusion, Query Rewriting algorithms"
      - "Created 30+ testable assertions for validation"
      - "Added end-to-end pipeline specification"
      - "Linked implementation specs document"

  - version: "1.0.0"
    date: "2025-12-11"
    changes:
      - "Initial protocol derived from 6-iteration deep research"
      - "Verified 30+ claims with 50+ academic sources"
      - "Created test suite with 9 passing tests"
      - "Integrated with ProofGuard triangulation protocol"
      - "Documented all API benchmarks with confidence intervals"