version: "1.2.0"
schema: "reasonkit-web-search-protocol-v2"
created: "2025-12-11"
last_updated: "2025-12-12"
license: "Apache-2.0"
docs:
implementation_specs: "./wsop-implementation-specs.md"
gigathink_implementations: "./wsop-gigathink-implementations.md"
edge_cases: "./wsop-edge-cases.md"
executive_summary: "./WSOP_EXECUTIVE_SUMMARY.md"
metadata:
id: "PROT-WS-OPT-001"
name: "Web Search Optimization Protocol"
shortcode: "ws-opt"
priority: 1
enforcement: "on_demand"
triggers:
explicit:
- "web search"
- "deep research"
- "find sources"
- "gather evidence"
implicit:
- "need external information"
- "verify claims online"
- "current events query"
description: |
A comprehensive protocol for optimized web search combining:
- HyDE query expansion (arXiv:2212.10496)
- Adaptive retrieval routing (Adaptive-RAG)
- Multi-provider parallel search
- Source credibility scoring
- Rate limiting with circuit breakers
- ProofGuard triangulation integration
search_providers:
web_search_apis:
tavily:
purpose: "RAG-optimized search with structured JSON output"
benchmarks:
frames_accuracy: "87%"
webwalker_accuracy: "79%"
latency: "medium"
cost: "$5-8 per 1000 requests"
best_for:
- "Factual verification"
- "RAG integration"
- "Structured responses"
integration: "langchain-tavily package"
confidence: "95%"
sources:
- "https://docs.tavily.com/documentation/integrations/langchain"
- "https://parallel.ai/products/search"
- "https://www.humai.blog/tavily-vs-exa-vs-perplexity-vs-you-com-the-complete-ai-search-api-comparison-2025/"
exa:
purpose: "Semantic/neural search with deep understanding"
benchmarks:
frames_accuracy: "81%"
browsecomp_accuracy: "29%"
latency:
fast: "<350ms P50"
deep: "3.5s P50"
cost: "$2.50-5 per 1000 requests"
best_for:
- "Semantic research queries"
- "Code context search"
- "Neural similarity"
integration: "exa-mcp-server, @exalabs/ai-sdk"
confidence: "93%"
sources:
- "https://exa.ai/exa-api"
- "https://exa.ai/blog/exa-api-2-0"
- "https://github.com/exa-labs/exa-mcp-server"
perplexity:
purpose: "Speed-optimized search with citations"
benchmarks:
simpleqa_accuracy: "74%"
frames_accuracy: "83%"
latency: "<400ms median"
cost: "$5 per 1000 requests"
best_for:
- "Speed-critical applications"
- "Current events"
- "Quick factual queries"
integration: "OpenRouter API, direct API"
confidence: "92%"
sources:
- "https://docs.perplexity.ai/getting-started/models/models/sonar"
- "https://github.com/perplexityai/search_evals"
- "https://openrouter.ai/perplexity/sonar/api"
academic_search_apis:
semantic_scholar:
purpose: "Academic paper search with 225M+ papers"
rate_limit: "1 RPS public, higher with API key"
cost: "Free"
best_for:
- "Academic research"
- "Citation networks"
- "SPECTER2 embeddings"
integration: "semanticscholar PyPI package"
confidence: "98%"
sources:
- "https://www.semanticscholar.org/product/api"
- "https://github.com/danielnsilva/semanticscholar"
- "https://pypi.org/project/semanticscholar/"
arxiv:
purpose: "Preprint access with 1M+ papers"
      rate_limit: "1 request per 3 seconds (per arXiv API terms of use)"
cost: "Free"
best_for:
- "Latest research"
- "CS/ML papers"
- "Open access"
integration: "arxiv PyPI package"
confidence: "98%"
sources:
- "https://info.arxiv.org/help/api/basics.html"
- "https://github.com/lukasschwab/arxiv.py"
- "https://pypi.org/project/arxiv/"
query_optimization:
hyde_expansion:
name: "Hypothetical Document Embeddings (HyDE)"
reference: "arXiv:2212.10496"
confidence: "95%"
mechanism: |
1. Generate hypothetical document that answers the query
2. Embed the hypothetical document
3. Search using document-to-document similarity
(Bypasses query-document semantic gap)
implementation:
langchain: "HypotheticalDocumentEmbedder class"
haystack: "Custom HypotheticalDocumentEmbedder component"
llamaindex: "HyDEQueryTransform"
    performance:
      - "Significantly outperforms Contriever (unsupervised)"
      - "Comparable to fine-tuned retrievers"
      - "Effective across web search, QA, fact verification"
best_practices:
- "Use instruction-following LLM for generation"
- "Keep hypothetical documents focused on query topic"
- "Works best with contrastive embedding models"
multi_query_rewriting:
name: "Multi-Query Beam Search Rewriting"
reference: "arXiv:2406.18960 (SIGIR 2024)"
confidence: "92%"
mechanism: |
Use beam search to generate multiple query rewrites
at no additional cost, then integrate into retrieval pipeline.
benefits:
- "State-of-the-art on conversational passage retrieval"
- "Works with both sparse and dense first-pass retrieval"
- "No efficiency sacrifice"
rewrite_retrieve_read:
name: "Rewrite-Retrieve-Read Framework"
reference: "arXiv:2305.14283, EMNLP 2023"
confidence: "90%"
mechanism: |
1. Small LM (T5-large) rewrites query via RL training
2. Training with PPO using reward = EM + F1 + Hit
3. Retrieve using rewritten query
4. Black-box LLM reads and answers
key_insight: |
"There is inevitably a gap between the input text and
the needed knowledge in retrieval" - address via proactive
query rewriting rather than passive retrieval optimization.
crag_corrective:
name: "Corrective RAG (CRAG)"
reference: "arXiv:2401.15884"
confidence: "93%"
mechanism: |
1. T5-large evaluator classifies retrieval quality
      2. Actions: CORRECT (refine retrieved docs), INCORRECT (discard, fall back to web search), AMBIGUOUS (combine both)
3. Knowledge refinement: decompose-filter-recompose
4. Web search fallback for insufficient corpus results
performance:
popqa_improvement: "+7%"
biography_factscore: "+14.9%"
best_practices:
- "Set upper threshold ~0.7, lower ~0.3 for classification"
- "Use sentence-level decomposition for knowledge strips"
- "Filter strips by relevance before recomposition"
rag_fusion:
name: "RAG-Fusion with Reciprocal Rank Fusion"
reference: "arXiv:2402.03367"
confidence: "91%"
mechanism: |
1. Generate 4-5 query perspectives from original
2. Retrieve for each query in parallel
3. Fuse results using RRF: score = sum(1/(rank + k))
4. k=60 is standard smoothing constant
performance:
comprehensiveness: "More complete answers"
      latency_overhead: "~1.77x single-query latency"
best_practices:
- "Always include original query in multi-query set"
- "Use high temperature (0.8) for query diversity"
- "RRF is robust and parameter-free (k=60 standard)"
adaptive_routing:
name: "Adaptive-RAG Complexity Routing"
reference: "Jeong et al., 2024"
confidence: "93%"
complexity_levels:
simple:
description: "Direct factual queries"
retrieval_steps: 0
providers: []
example: "What year was Python created?"
moderate:
description: "Single-step retrieval needed"
retrieval_steps: 1
providers: ["tavily", "semantic_scholar"]
use_hyde: true
max_results: 5
example: "What is RAG in NLP?"
complex:
description: "Multi-step reasoning required"
retrieval_steps: 3
providers: ["tavily", "exa", "semantic_scholar", "arxiv"]
use_hyde: true
max_results: 10
enable_multi_hop: true
example: "Compare RAPTOR and HyDE for multi-hop reasoning"
routing_strategy: |
Pre-classify query complexity before retrieval.
Route to appropriate strategy to optimize cost/accuracy tradeoff.
Use trained classifier or heuristic rules.
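  example_sketch:
    note: |
      Heuristic stand-in for the complexity classifier. Adaptive-RAG trains
      a classifier; these rules are an assumed fallback, not the paper's
      method, and the keyword lists are illustrative only.
    python: |
      COMPARATIVE = ("compare", " versus ", " vs ", "trade-off",
                     "difference between")

      def route_query(query: str) -> str:
          q = query.lower()
          if any(marker in q for marker in COMPARATIVE) or q.count("?") > 1:
              return "complex"    # multi-hop retrieval, all providers
          if len(q.split()) <= 6 and q.startswith(("what year", "who ", "when ")):
              return "simple"     # answer directly, zero retrieval steps
          return "moderate"       # single-step retrieval with HyDE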
multi_hop_reasoning:
corag:
name: "Chain-of-Retrieval Augmented Generation"
reference: "arXiv:2501.14342 (NeurIPS 2025)"
confidence: "92%"
mechanism: |
- Step-by-step retrieval with dynamic query reformulation
- Iterative reasoning over retrieved evidence
- Rejection sampling for intermediate retrieval chains
performance:
kilt_benchmark: "New state-of-the-art"
multi_hop_qa: "+10 EM score vs strong baselines"
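    example_sketch:
      note: |
        Simplified chain-of-retrieval loop in the spirit of CoRAG (no
        rejection sampling). `llm` and `search` are hypothetical helpers;
        the "ANSWER:" prefix is an assumed stop convention.
      python: |
        def chain_of_retrieval(question: str, search, llm, max_hops: int = 3):
            evidence, query = [], question
            for _ in range(max_hops):
                evidence.extend(search(query))
                step = llm(f"Question: {question}\nEvidence: {evidence}\n"
                           "Answer if possible, else emit a follow-up query.")
                if step.startswith("ANSWER:"):
                    return step.removeprefix("ANSWER:").strip(), evidence
                query = step  # dynamic query reformulation for the next hop
            return llm(f"Answer from evidence: {evidence}\nQ: {question}"), evidence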
self_rag:
name: "Self-Reflective RAG"
reference: "arXiv:2310.11511 (ICLR 2024 Oral)"
confidence: "95%"
reflection_tokens:
- "[Retrieve] / [No Retrieval]"
- "[Relevant] / [Irrelevant]"
- "[Fully supported] / [Partially supported]"
- "[Utility:1-5]"
performance:
asqa_precision: "+29.56%"
asqa_recall: "+18.81%"
react:
name: "ReAct Reasoning + Acting"
reference: "arXiv:2210.03629"
confidence: "93%"
mechanism: |
Interleave reasoning traces with actions.
Reasoning helps track and update action plans.
Actions interface with external sources.
performance:
alfworld: "+34% absolute success rate"
webshop: "+10% absolute success rate"
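    example_sketch:
      note: |
        Minimal ReAct-style loop: reasoning step, search action, observation
        appended to the trace. `llm` and `search` are hypothetical helpers,
        and the "ANSWER:" prefix is an assumed stop convention rather than
        the paper's exact prompt format.
      python: |
        def react_answer(question: str, llm, search, max_steps: int = 5) -> str:
            trace = f"Question: {question}"
            for _ in range(max_steps):
                thought = llm(trace + "\nThought:")     # reasoning trace
                trace += f"\nThought: {thought}"
                if thought.startswith("ANSWER:"):
                    return thought.removeprefix("ANSWER:").strip()
                query = llm(trace + "\nNext search query:")  # acting step
                trace += (f"\nAction: Search[{query}]"
                          f"\nObservation: {search(query)}")
            return llm(trace + "\nFinal answer:")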
credibility_scoring:
name: "Tiered Source Credibility Assessment"
references:
- "arXiv:2410.12061 (CrediRAG)"
- "arXiv:2509.15793 (RAVE)"
confidence: "90%"
tiers:
tier_1_authoritative:
weight: 1.0
confidence_boost: "+15%"
domains:
- "arxiv.org"
- "github.com"
- "semanticscholar.org"
- "huggingface.co"
- "nature.com"
- "science.org"
- "acm.org"
- "ieee.org"
- "openreview.net"
tier_2_secondary:
weight: 0.8
confidence_boost: "+10%"
domains:
- "nvidia.com"
- "google.ai"
- "anthropic.com"
- "openai.com"
- "langchain.com"
- "llamaindex.ai"
- "docs.tavily.com"
tier_3_independent:
weight: 0.6
confidence_boost: "+5%"
domains: "All others"
integration_with_rave: |
For each retrieved snippet, provide to LLM:
1. Textual content
2. Computed relevance score
3. Source credibility score
4. Metadata (domain, date, author)
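  example_sketch:
    note: |
      Domain-tier lookup mirroring the table above; the weight multiplies
      the retriever's relevance score. Subdomain handling is an assumption.
    python: |
      from urllib.parse import urlparse

      TIER_1 = {"arxiv.org", "github.com", "semanticscholar.org",
                "huggingface.co", "nature.com", "science.org", "acm.org",
                "ieee.org", "openreview.net"}
      TIER_2 = {"nvidia.com", "google.ai", "anthropic.com", "openai.com",
                "langchain.com", "llamaindex.ai", "docs.tavily.com"}

      def credibility_weight(url: str) -> float:
          domain = urlparse(url).netloc.removeprefix("www.")
          if domain in TIER_1 or any(domain.endswith("." + d) for d in TIER_1):
              return 1.0
          if domain in TIER_2 or any(domain.endswith("." + d) for d in TIER_2):
              return 0.8
          return 0.6  # tier 3: all other domains

      def weighted_score(relevance: float, url: str) -> float:
          return relevance * credibility_weight(url)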
rate_limiting:
name: "Circuit Breaker + Exponential Backoff"
references:
- "https://docs.aws.amazon.com/prescriptive-guidance/latest/cloud-design-patterns/retry-backoff.html"
- "https://www.unkey.com/glossary/api-circuit-breaker"
confidence: "95%"
  exponential_backoff:
    base_delay: 1.0   # seconds
    max_delay: 60.0   # seconds
    jitter: 0.5       # fraction of each delay randomized
circuit_breaker:
states:
closed: "Normal operation"
open: "Stop requests after consecutive failures"
      half_open: "Allow a limited number of trial requests to probe recovery"
    failure_threshold: 5
    recovery_timeout: 30   # seconds
best_practices:
- "Respect Retry-After headers when available"
- "Apply only to idempotent operations"
- "Set realistic thresholds from historical data"
- "Implement fallback mechanisms (caching, degraded mode)"
- "Monitor and log circuit breaker events"
optimized_workflow:
name: "Optimized Web Search Workflow"
phases:
phase_1_routing:
name: "Query Complexity Routing"
steps:
- "Classify query complexity (simple/moderate/complex)"
- "Select retrieval strategy based on classification"
- "Determine which providers to use"
phase_2_expansion:
name: "Query Expansion"
steps:
- "Apply HyDE if moderate/complex query"
- "Generate hypothetical answer document"
- "Optionally: multi-query beam search rewriting"
phase_3_parallel_search:
name: "Parallel Multi-Provider Search"
steps:
- "Execute searches in parallel across selected providers"
- "Apply rate limiting with circuit breakers"
- "Aggregate results with source metadata"
phase_4_credibility:
name: "Credibility Assessment"
steps:
- "Score each source by domain tier"
- "Apply credibility weights to relevance scores"
- "Filter low-credibility sources for critical claims"
phase_5_triangulation:
name: "ProofGuard Triangulation"
steps:
- "Identify claims requiring verification"
- "Attempt 3-source triangulation per claim"
- "Assign consensus status (VERIFIED/LIKELY/UNVERIFIED)"
phase_6_multi_hop:
name: "Multi-Hop Reasoning (if complex)"
steps:
- "Apply CoRAG iterative retrieval if needed"
- "Use Self-RAG reflection tokens for self-critique"
- "Reformulate queries based on evolving evidence"
phase_7_synthesis:
name: "Output Synthesis"
steps:
- "Compile triangulation table"
- "Generate structured output with confidence intervals"
- "Include source citations with tier annotations"
objective_measures:
retrieval_metrics:
- metric: "Triangulation Coverage"
formula: "(claims_with_3_sources / total_claims) * 100"
target: ">= 90%"
- metric: "Tier 1 Source Ratio"
formula: "tier_1_sources / total_sources"
target: ">= 0.5"
- metric: "Average Latency"
formula: "sum(query_latencies) / query_count"
target: "< 5 seconds"
accuracy_benchmarks:
- benchmark: "FRAMES (Multi-hop Factuality)"
target_accuracy: ">= 85%"
- benchmark: "HotpotQA"
target_accuracy: ">= 70%"
- benchmark: "SimpleQA"
target_accuracy: ">= 70%"
reliability_metrics:
- metric: "Circuit Breaker Trips"
formula: "open_state_count / total_requests"
target: "< 1%"
- metric: "Rate Limit Compliance"
formula: "requests_within_limit / total_requests"
target: "100%"
implementation:
test_suite: "./tests/web_search_optimization_tests.py"
test_status: "9/9 PASSING"
key_classes:
- "WebSearchOrchestrator: Master coordinator"
- "HyDEQueryExpander: Query expansion"
- "AdaptiveRetrievalRouter: Complexity routing"
- "CredibilityScorer: Source tier assessment"
- "TriangulationEngine: ProofGuard integration"
- "RateLimiter: Circuit breaker + backoff"
changelog:
- version: "1.2.0"
date: "2025-12-12"
changes:
- "GigaThink integration - 12 creative perspectives implemented"
- "Added Belief Reports with epistemic transparency"
- "Added Falsification Search Engine with steelmanning"
- "Added Query Evolution Visualization (ASCII/Mermaid)"
- "Added Provenance Chains (information archaeology)"
- "Added Adaptive Query Memory (immune system pattern)"
- "Added Call-and-Response Retrieval (jazz improvisation)"
- "Added Source Motivation Analysis (skeptical journalist)"
- "Added Knowledge Graph Builder with PageRank"
- "Added Difficulty Mode Selector (5 modes)"
- "Added Epistemic Uncertainty Handler"
- "Schema upgraded to v2"
- version: "1.1.0"
date: "2025-12-12"
changes:
- "Added testable implementation specs from 4 key papers"
- "Integrated HyDE, CRAG, RAG-Fusion, Query Rewriting algorithms"
- "Created 30+ testable assertions for validation"
- "Added end-to-end pipeline specification"
- "Linked implementation specs document"
- version: "1.0.0"
date: "2025-12-11"
changes:
- "Initial protocol derived from 6-iteration deep research"
- "Verified 30+ claims with 50+ academic sources"
- "Created test suite with 9 passing tests"
- "Integrated with ProofGuard triangulation protocol"
- "Documented all API benchmarks with confidence intervals"