Skip to main content

cp_query/
lib.rs

//! CP Query - RAG query engine
//!
//! Provides semantic search and chat over the knowledge graph.
//!
//! Per CP-012/CP-020: Supports filtered retrieval and query caching.

use cp_core::{AssembledContext, CPError, Chunk, ContextAssembler, Result, ScoredChunk};
use cp_tor::types::{
    MergedSearchResult, RemoteSearchResult, ResultSource, SearchResponse, SearchStatus,
    MAX_RESULTS,
};
use glob::Pattern;
use lru::LruCache;
use std::collections::{HashMap, HashSet};
use std::num::NonZeroUsize;
use std::sync::{Arc, Mutex, RwLock};
use tracing::{info, warn};
use uuid::Uuid;
19
/// Filter for search queries
///
/// Per CP-012: Supports filtering by document path, MIME type, and modification time.
/// Filters are combined conjunctively by `QueryEngine::search_filtered` (a
/// document must match every filter in the slice).
#[derive(Debug, Clone)]
pub enum Filter {
    /// Filter by document path glob pattern (e.g., "docs/*.md").
    /// Matched against the document path's lossy UTF-8 string form.
    DocumentPath(String),
    /// Filter by MIME type (e.g., "text/markdown"); exact string equality.
    MimeType(String),
    /// Filter by modification time (Unix timestamp, documents modified strictly after this time)
    ModifiedAfter(i64),
    /// Filter by modification time (Unix timestamp, documents modified strictly before this time)
    ModifiedBefore(i64),
}
34
35impl Filter {
36    /// Check if a document matches this filter
37    pub fn matches(&self, doc: &cp_core::Document) -> bool {
38        match self {
39            Filter::DocumentPath(pattern) => {
40                if let Ok(glob) = Pattern::new(pattern) {
41                    glob.matches(doc.path.to_string_lossy().as_ref())
42                } else {
43                    false
44                }
45            }
46            Filter::MimeType(mime) => doc.mime_type == *mime,
47            Filter::ModifiedAfter(ts) => doc.mtime > *ts,
48            Filter::ModifiedBefore(ts) => doc.mtime < *ts,
49        }
50    }
51}
52
/// Query cache for storing search results
///
/// Per CP-020: Caches query results keyed by query hash, invalidated on state change.
/// Stores only chunk IDs, not full chunks, so cached hits are re-hydrated
/// from the graph store (and their scores are lost — see `search_cached`).
pub struct QueryCache {
    /// LRU cache: blake3(query, k) -> chunk IDs
    cache: RwLock<LruCache<[u8; 32], Vec<Uuid>>>,
    /// Merkle state root the cached entries were computed against
    state_root: RwLock<[u8; 32]>,
}
62
63impl QueryCache {
64    /// Create a new query cache with specified capacity
65    pub fn new(capacity: usize) -> Self {
66        Self {
67            cache: RwLock::new(LruCache::new(
68                NonZeroUsize::new(capacity).unwrap_or(NonZeroUsize::new(100).unwrap()),
69            )),
70            state_root: RwLock::new([0u8; 32]),
71        }
72    }
73
74    /// Get cached results for a query with a given k
75    pub fn get(&self, query: &str, k: usize) -> Option<Vec<Uuid>> {
76        let hash = Self::hash_key(query, k);
77        self.cache.write().ok()?.get(&hash).cloned()
78    }
79
80    /// Store results for a query with a given k
81    pub fn put(&self, query: &str, k: usize, results: Vec<Uuid>) {
82        let hash = Self::hash_key(query, k);
83        if let Ok(mut cache) = self.cache.write() {
84            cache.put(hash, results);
85        }
86    }
87
88    /// Check if cache is valid for current state root
89    pub fn is_valid(&self, current_root: &[u8; 32]) -> bool {
90        if let Ok(root) = self.state_root.read() {
91            *root == *current_root
92        } else {
93            false
94        }
95    }
96
97    /// Invalidate cache and update state root
98    pub fn invalidate(&self, new_root: [u8; 32]) {
99        if let Ok(mut cache) = self.cache.write() {
100            cache.clear();
101        }
102        if let Ok(mut root) = self.state_root.write() {
103            *root = new_root;
104        }
105    }
106
107    /// Hash a query string with k to form a unique cache key
108    fn hash_key(query: &str, k: usize) -> [u8; 32] {
109        let mut hasher = blake3::Hasher::new();
110        hasher.update(query.as_bytes());
111        hasher.update(&k.to_le_bytes());
112        *hasher.finalize().as_bytes()
113    }
114}
115
116impl Default for QueryCache {
117    fn default() -> Self {
118        Self::new(100)
119    }
120}
121
/// Search result with relevance score
#[derive(Debug, Clone, serde::Serialize)]
pub struct SearchResult {
    /// The matching chunk
    pub chunk: Chunk,
    /// Relevance score. Hybrid search returns a normalized RRF score;
    /// cached or browse results use `f32::NAN` because no score is available.
    pub score: f32,
    /// Path to the source document (lossy UTF-8 form)
    pub doc_path: String,
}
132
/// Result of an LLM generation
#[derive(Debug, Clone, serde::Serialize)]
pub struct GenerationResult {
    /// The generated answer
    pub answer: String,
    /// The formatted context string that was fed to the LLM
    pub context: String,
    /// End-to-end latency (search + assembly + generation) in milliseconds
    pub latency_ms: u64,
}
143
/// A citation linking response text to source chunks
///
/// Per CP-020: Tracks which parts of a response are grounded in context.
#[derive(Debug, Clone, serde::Serialize)]
pub struct Citation {
    /// ID of the source chunk
    pub chunk_id: Uuid,
    /// Byte span in the response text (start, end), half-open
    pub span: (usize, usize),
    /// Confidence score (0.0-1.0): fraction of the response's 5-grams found in the chunk
    pub confidence: f32,
}
156
/// Result of response validation
///
/// Per CP-020: Detects potential hallucinations and measures citation coverage.
#[derive(Debug, Clone, serde::Serialize)]
pub struct ValidationResult {
    /// Whether the response is considered valid (no warnings, or it
    /// explicitly claims the information is missing from the context)
    pub is_valid: bool,
    /// Warning messages about potential issues (hallucination phrases,
    /// low citation coverage)
    pub warnings: Vec<String>,
    /// Fraction of response bytes covered by merged citation spans (0.0-1.0)
    pub citation_coverage: f32,
    /// Extracted citations, sorted by confidence descending
    pub citations: Vec<Citation>,
}
171
/// Phrases that often indicate hallucination
///
/// Matched against a lowercased copy of the response in `validate_response`,
/// so every entry here must be lowercase.
const HALLUCINATION_PHRASES: &[&str] = &[
    "from my knowledge",
    "i recall that",
    "as far as i know",
    "i believe that",
    "in my experience",
    "typically",
    "generally speaking",
    "it's commonly known",
    "as everyone knows",
    "i think that",
    "probably",
    "most likely",
    "i assume",
    "based on my understanding",
    "from what i've learned",
];
190
191/// Extract citations by finding n-gram overlaps between response and context
192///
193/// Per CP-020: Uses 5-gram overlap detection to identify grounded text.
194pub fn extract_citations(response: &str, context: &AssembledContext) -> Vec<Citation> {
195    let mut citations = Vec::new();
196    let response_lower = response.to_lowercase();
197    let response_words: Vec<&str> = response_lower.split_whitespace().collect();
198
199    if response_words.len() < 5 {
200        return citations;
201    }
202
203    for chunk in &context.chunks {
204        let chunk_lower = chunk.text.to_lowercase();
205        let chunk_words: Vec<&str> = chunk_lower.split_whitespace().collect();
206
207        if chunk_words.len() < 5 {
208            continue;
209        }
210
211        // Find 5-gram overlaps
212        let mut overlap_count = 0;
213        let mut matched_positions: Vec<usize> = Vec::new();
214
215        for i in 0..=response_words.len().saturating_sub(5) {
216            let response_ngram: Vec<&str> = response_words[i..i + 5].to_vec();
217
218            for j in 0..=chunk_words.len().saturating_sub(5) {
219                let chunk_ngram: Vec<&str> = chunk_words[j..j + 5].to_vec();
220
221                if response_ngram == chunk_ngram {
222                    overlap_count += 1;
223                    matched_positions.push(i);
224                    break;
225                }
226            }
227        }
228
229        if overlap_count > 0 {
230            // Calculate confidence as ratio of matched n-grams
231            let max_ngrams = (response_words.len().saturating_sub(4)).max(1);
232            let confidence = (overlap_count as f32) / (max_ngrams as f32);
233
234            // Find byte span from word positions
235            let start_pos = matched_positions.first().copied().unwrap_or(0);
236            let end_pos = matched_positions.last().copied().unwrap_or(0) + 5;
237
238            // Convert word positions to byte offsets (approximate)
239            let mut byte_start = 0;
240            let mut byte_end = response.len();
241
242            let mut word_idx = 0;
243            for (i, c) in response.char_indices() {
244                if c.is_whitespace() {
245                    word_idx += 1;
246                    if word_idx == start_pos {
247                        byte_start = i + 1;
248                    }
249                    if word_idx == end_pos.min(response_words.len()) {
250                        byte_end = i;
251                        break;
252                    }
253                }
254            }
255
256            citations.push(Citation {
257                chunk_id: chunk.chunk_id,
258                span: (byte_start, byte_end),
259                confidence,
260            });
261        }
262    }
263
264    // Sort by confidence descending
265    citations.sort_by(|a, b| b.confidence.partial_cmp(&a.confidence).unwrap_or(std::cmp::Ordering::Equal));
266
267    citations
268}
269
270/// Validate a response for potential hallucinations
271///
272/// Per CP-020: Checks for hallucination phrases and low citation coverage.
273pub fn validate_response(response: &str, context: &AssembledContext) -> ValidationResult {
274    let mut warnings = Vec::new();
275
276    // Extract citations
277    let citations = extract_citations(response, context);
278
279    // Calculate citation coverage (merge overlapping spans first)
280    let total_response_len = response.len() as f32;
281    let mut spans: Vec<(usize, usize)> = citations.iter().map(|c| c.span).collect();
282    spans.sort_by_key(|s| s.0);
283    let mut merged: Vec<(usize, usize)> = Vec::new();
284    for span in &spans {
285        if let Some(last) = merged.last_mut() {
286            if span.0 <= last.1 {
287                last.1 = last.1.max(span.1);
288                continue;
289            }
290        }
291        merged.push(*span);
292    }
293    let covered_bytes: usize = merged.iter().map(|(a, b)| b.saturating_sub(*a)).sum();
294
295    let citation_coverage = if total_response_len > 0.0 {
296        (covered_bytes as f32 / total_response_len).min(1.0)
297    } else {
298        0.0
299    };
300
301    // Check for hallucination phrases
302    let response_lower = response.to_lowercase();
303    for phrase in HALLUCINATION_PHRASES {
304        if response_lower.contains(phrase) {
305            warnings.push(format!("Response contains hallucination indicator: '{}'", phrase));
306        }
307    }
308
309    // Check for low citation coverage
310    if citation_coverage < 0.3 && !response.is_empty() {
311        warnings.push(format!(
312            "Low citation coverage: {:.1}% (threshold: 30%)",
313            citation_coverage * 100.0
314        ));
315    }
316
317    // Check if response claims missing information (this is good, not a warning)
318    let good_phrases = ["information is missing", "not found in the context", "cannot find"];
319    let claims_missing = good_phrases.iter().any(|p| response_lower.contains(p));
320
321    // Determine validity
322    let is_valid = warnings.is_empty() || claims_missing;
323
324    ValidationResult {
325        is_valid,
326        warnings,
327        citation_coverage,
328        citations,
329    }
330}
331
/// Trait for the CP Intelligence Module (IM)
///
/// An IntelligenceEngine is a read-only consumer of the semantic substrate.
/// It synthesizes information from retrieved context into human-readable answers.
#[async_trait::async_trait]
pub trait IntelligenceEngine: Send + Sync {
    /// Generate a synthesized answer from the provided context and query.
    /// This is a read-only operation and cannot mutate the underlying graph.
    ///
    /// # Errors
    /// Implementations return an error when the backing model or service
    /// fails (see e.g. `OllamaGenerator`).
    async fn generate(&self, context: &AssembledContext, query: &str) -> Result<String>;
}
342
/// Ollama-based LLM generator (for desktop/server use)
///
/// Sends prompts to an Ollama server's `/api/generate` endpoint over HTTP.
pub struct OllamaGenerator {
    // Base URL of the Ollama server; "/api/generate" is appended per request.
    base_url: String,
    // Model name passed in the request payload's "model" field.
    model: String,
}
348
349impl OllamaGenerator {
350    /// Create a new Ollama generator
351    pub fn new(base_url: String, model: String) -> Self {
352        Self { base_url, model }
353    }
354}
355
356#[async_trait::async_trait]
357impl IntelligenceEngine for OllamaGenerator {
358    async fn generate(&self, context: &AssembledContext, query: &str) -> Result<String> {
359        let formatted_context = ContextAssembler::format(context);
360        
361        // Engineered for consistency with mobile prompt
362        let prompt = format!(
363            "<|im_start|>system\nYou are a context reading machine. You do not have knowledge of the outside world.\n- Read the Context below carefully.\n- If the answer to the Query is in the Context, output it.\n- If the answer is NOT in the Context, say 'Information is missing from the substrate.' and nothing else.\n- Do not make up facts.\n<|im_end|>\n<|im_start|>user\nContext:\n{}\n\nQuery: {}<|im_end|>\n<|im_start|>assistant\n",
364            formatted_context, query
365        );
366
367        let client = reqwest::Client::builder()
368            .timeout(std::time::Duration::from_secs(120))
369            .build()
370            .map_err(|e| CPError::Inference(format!("Failed to create HTTP client: {}", e)))?;
371        let payload = serde_json::json!({
372            "model": self.model,
373            "prompt": prompt,
374            "stream": false
375        });
376
377        let url = format!("{}/api/generate", self.base_url);
378        let res = client
379            .post(&url)
380            .json(&payload)
381            .send()
382            .await
383            .map_err(|e| CPError::Inference(format!("Ollama request failed: {}", e)))?;
384
385        let json: serde_json::Value = res
386            .json()
387            .await
388            .map_err(|e| CPError::Parse(e.to_string()))?;
389
390        let answer = json["response"]
391            .as_str()
392            .ok_or_else(|| CPError::Parse("Invalid Ollama response".into()))?
393            .to_string();
394
395        Ok(answer)
396    }
397}
398
/// Query engine for semantic search
///
/// Owns the graph store (behind a mutex), the embedding engine used to
/// vectorize queries, an optional LLM backend, and a per-engine result cache.
pub struct QueryEngine {
    // Knowledge-graph store; every read path locks this mutex.
    graph: Arc<Mutex<cp_graph::GraphStore>>,
    // Embedding engine used by semantic/hybrid search.
    embedder: Arc<cp_embeddings::EmbeddingEngine>,
    // Optional LLM backend; `generate_answer`/`chat` fail when unset.
    intelligence: Option<Box<dyn IntelligenceEngine>>,
    /// Query result cache
    cache: QueryCache,
    /// Token budget for context assembly
    context_budget: usize,
}
409
410impl QueryEngine {
411    /// Create a new query engine
412    pub fn new(
413        graph: Arc<Mutex<cp_graph::GraphStore>>,
414        embedder: Arc<cp_embeddings::EmbeddingEngine>,
415    ) -> Self {
416        Self {
417            graph,
418            embedder,
419            intelligence: None,
420            cache: QueryCache::default(),
421            context_budget: 2000,
422        }
423    }
424
425    /// Create a new query engine with custom cache capacity
426    pub fn with_cache_capacity(
427        graph: Arc<Mutex<cp_graph::GraphStore>>,
428        embedder: Arc<cp_embeddings::EmbeddingEngine>,
429        cache_capacity: usize,
430    ) -> Self {
431        Self {
432            graph,
433            embedder,
434            intelligence: None,
435            cache: QueryCache::new(cache_capacity),
436            context_budget: 2000,
437        }
438    }
439
440    /// Set the context token budget (builder pattern)
441    pub fn with_context_budget(mut self, budget: usize) -> Self {
442        self.context_budget = budget;
443        self
444    }
445
446    /// Set the intelligence engine for RAG (builder pattern)
447    pub fn with_intelligence(mut self, intelligence: Box<dyn IntelligenceEngine>) -> Self {
448        self.intelligence = Some(intelligence);
449        self
450    }
451
452    /// Set or replace the intelligence engine after creation
453    pub fn set_intelligence(&mut self, intelligence: Box<dyn IntelligenceEngine>) {
454        self.intelligence = Some(intelligence);
455    }
456
    /// Search for relevant chunks using a hybrid (semantic + lexical) approach
    ///
    /// Uses integer Reciprocal Rank Fusion (RRF) per CP-003 §7 for deterministic results:
    /// Score(d) = 1,000,000 / (k + rank(d))
    ///
    /// NOTE(review): the graph mutex is acquired and released several times
    /// below, so a concurrent writer could change the graph between the
    /// semantic pass, lexical pass, and final retrieval — confirm this
    /// best-effort consistency is intended.
    pub fn search(&self, query: &str, k: usize) -> Result<Vec<SearchResult>> {
        info!("Hybrid search for: '{}'", query);

        // 1. Semantic Search (Vector)
        let query_vec = self
            .embedder
            .embed_query(query)
            .map_err(|e| CPError::Embedding(format!("Failed to embed query: {}", e)))?;

        let semantic_results = self.graph.lock().expect("graph lock poisoned").search(&query_vec, k)?;

        // 2. Lexical Search (FTS5). Failure here degrades to semantic-only
        // results rather than failing the whole query.
        let lexical_results = {
            let graph = self.graph.lock().expect("graph lock poisoned");
            // Sanitize FTS5 special characters to prevent syntax injection
            let fts_query = sanitize_fts5_query(query);
            graph.search_lexical(&fts_query, k).unwrap_or_else(|e| {
                warn!("Lexical search failed: {}. Falling back to semantic only.", e);
                Vec::new()
            })
        };

        // 3. Merge Results using Integer Reciprocal Rank Fusion (RRF)
        // Per CP-003 §7: Score(d) = 1,000,000 / (k + rank(d))
        // This uses integer math for deterministic results across platforms
        const RRF_K: u64 = 60;
        const RRF_SCALE: u64 = 1_000_000;

        // Use u64 for scores to ensure integer determinism
        let mut scores: std::collections::HashMap<Uuid, u64> = std::collections::HashMap::new();

        {
            let graph = self.graph.lock().expect("graph lock poisoned");

            // Semantic hits are embedding IDs; fuse on the owning chunk ID.
            // Embeddings that no longer resolve to a chunk are skipped.
            for (i, (emb_id, _)) in semantic_results.iter().enumerate() {
                if let Ok(Some(chunk_id)) = graph.get_chunk_id_for_embedding(*emb_id) {
                    // Integer RRF: 1,000,000 / (60 + rank)
                    let score = RRF_SCALE / (RRF_K + i as u64);
                    *scores.entry(chunk_id).or_insert(0) += score;
                }
            }

            // Lexical hits are chunk IDs already.
            for (i, (chunk_id, _)) in lexical_results.iter().enumerate() {
                let score = RRF_SCALE / (RRF_K + i as u64);
                *scores.entry(*chunk_id).or_insert(0) += score;
            }
        }

        // Sort by fused score (descending), then by chunk ID (ascending) for deterministic tiebreaking
        let mut fused: Vec<(Uuid, u64)> = scores.into_iter().collect();
        fused.sort_by(|a, b| {
            b.1.cmp(&a.1) // Score descending
                .then_with(|| a.0.cmp(&b.0)) // Chunk ID ascending for tiebreak
        });
        fused.truncate(k);

        // 4. Retrieve chunks and docs; rows that fail to resolve are dropped.
        let mut search_results = Vec::with_capacity(fused.len());
        let graph = self.graph.lock().expect("graph lock poisoned");

        for (chunk_id, fused_score) in fused {
            let chunk = match graph.get_chunk(chunk_id)? {
                Some(c) => c,
                None => continue,
            };

            let doc = match graph.get_document(chunk.doc_id)? {
                Some(d) => d,
                None => continue,
            };

            // Convert to f32 for API compatibility (score is preserved proportionally)
            let normalized_score = fused_score as f32 / (RRF_SCALE * 2) as f32;

            search_results.push(SearchResult {
                chunk,
                score: normalized_score,
                doc_path: doc.path.to_string_lossy().to_string(),
            });
        }

        Ok(search_results)
    }
547
548    /// Perform purely semantic search (vector only)
549    pub fn search_semantic(&self, query: &str, k: usize) -> Result<Vec<SearchResult>> {
550        info!("Semantic search for: '{}'", query);
551
552        let query_vec = self
553            .embedder
554            .embed_query(query)
555            .map_err(|e| CPError::Embedding(format!("Failed to embed query: {}", e)))?;
556
557        let raw_results = {
558            let graph = self.graph.lock().expect("graph lock poisoned");
559            graph.search(&query_vec, k)?
560        };
561
562        let mut search_results = Vec::with_capacity(raw_results.len());
563        let graph = self.graph.lock().expect("graph lock poisoned");
564
565        for (emb_id, score) in raw_results {
566            // Semantic search returns embedding IDs, need to resolve to chunk ID
567            if let Some(chunk_id) = graph.get_chunk_id_for_embedding(emb_id)? {
568                if let Some(chunk) = graph.get_chunk(chunk_id)? {
569                    if let Some(doc) = graph.get_document(chunk.doc_id)? {
570                        search_results.push(SearchResult {
571                            chunk,
572                            score,
573                            doc_path: doc.path.to_string_lossy().to_string(),
574                        });
575                    }
576                }
577            }
578        }
579
580        Ok(search_results)
581    }
582
583    /// Perform purely lexical search (keyword only)
584    pub fn search_lexical(&self, query: &str, k: usize) -> Result<Vec<SearchResult>> {
585        info!("Lexical search for: '{}'", query);
586
587        let raw_results = {
588            let graph = self.graph.lock().expect("graph lock poisoned");
589            // Sanitize FTS5 special characters to prevent syntax injection
590            let fts_query = sanitize_fts5_query(query);
591            graph.search_lexical(&fts_query, k)?
592        };
593
594        let mut search_results = Vec::with_capacity(raw_results.len());
595        let graph = self.graph.lock().expect("graph lock poisoned");
596
597        for (chunk_id, score) in raw_results {
598            // Lexical search returns chunk IDs directly
599            if let Some(chunk) = graph.get_chunk(chunk_id)? {
600                if let Some(doc) = graph.get_document(chunk.doc_id)? {
601                    search_results.push(SearchResult {
602                        chunk,
603                        score,
604                        doc_path: doc.path.to_string_lossy().to_string(),
605                    });
606                }
607            }
608        }
609
610        Ok(search_results)
611    }
612
    /// Search with filters applied
    ///
    /// Per CP-012: Supports filtering by document path, MIME type, and modification time.
    /// All filters must match (conjunction).
    ///
    /// NOTE(review): this oversamples by fetching k*3 hybrid results and then
    /// filtering, so when the filters are highly selective it may return
    /// fewer than `k` results even though more matches exist in the graph —
    /// confirm this approximation is acceptable.
    pub fn search_filtered(&self, query: &str, k: usize, filters: &[Filter]) -> Result<Vec<SearchResult>> {
        info!("Filtered search for: '{}' with {} filters", query, filters.len());

        // Get all matching documents based on filters
        let matching_doc_ids: std::collections::HashSet<Uuid> = {
            let graph = self.graph.lock().expect("graph lock poisoned");
            let all_docs = graph.get_all_documents()?;

            all_docs
                .into_iter()
                .filter(|doc| filters.iter().all(|f| f.matches(doc)))
                .map(|doc| doc.id)
                .collect()
        };

        if matching_doc_ids.is_empty() {
            info!("No documents match filters");
            return Ok(Vec::new());
        }

        // Perform regular search
        let all_results = self.search(query, k * 3)?; // Get more results to filter

        // Filter results to only include matching documents
        let filtered_results: Vec<SearchResult> = all_results
            .into_iter()
            .filter(|r| matching_doc_ids.contains(&r.chunk.doc_id))
            .take(k)
            .collect();

        info!("Filtered search returned {} results", filtered_results.len());
        Ok(filtered_results)
    }
649
650    /// Search with caching
651    ///
652    /// Per CP-020: Uses query cache for faster repeated queries.
653    pub fn search_cached(&self, query: &str, k: usize) -> Result<Vec<SearchResult>> {
654        // Check cache validity
655        let current_root = {
656            let graph = self.graph.lock().expect("graph lock poisoned");
657            graph.compute_merkle_root()?
658        };
659
660        if !self.cache.is_valid(&current_root) {
661            self.cache.invalidate(current_root);
662        }
663
664        // Check cache (key includes both query and k)
665        if let Some(chunk_ids) = self.cache.get(query, k) {
666            info!("Cache hit for query: '{}' (k={})", query, k);
667
668            let graph = self.graph.lock().expect("graph lock poisoned");
669            let mut results = Vec::new();
670
671            for chunk_id in chunk_ids.iter().take(k) {
672                if let Some(chunk) = graph.get_chunk(*chunk_id)? {
673                    if let Some(doc) = graph.get_document(chunk.doc_id)? {
674                        results.push(SearchResult {
675                            chunk,
676                            score: f32::NAN, // Score not preserved in cache
677                            doc_path: doc.path.to_string_lossy().to_string(),
678                        });
679                    }
680                }
681            }
682
683            return Ok(results);
684        }
685
686        // Cache miss - perform search
687        let results = self.search(query, k)?;
688
689        // Store in cache (key includes both query and k)
690        let chunk_ids: Vec<Uuid> = results.iter().map(|r| r.chunk.id).collect();
691        self.cache.put(query, k, chunk_ids);
692
693        Ok(results)
694    }
695
696    /// Invalidate the query cache
697    pub fn invalidate_cache(&self) -> Result<()> {
698        let root = {
699            let graph = self.graph.lock().expect("graph lock poisoned");
700            graph.compute_merkle_root()?
701        };
702        self.cache.invalidate(root);
703        Ok(())
704    }
705
706    /// Get all chunks for a specific document
707    pub fn get_chunks_for_document(&self, doc_id: Uuid) -> Result<Vec<SearchResult>> {
708        let graph = self.graph.lock().expect("graph lock poisoned");
709        
710        let doc = graph
711            .get_document(doc_id)?
712            .ok_or_else(|| CPError::Database(format!("Doc {} not found", doc_id)))?;
713            
714        let chunks = graph.get_chunks_for_doc(doc_id)?;
715        
716        Ok(chunks.into_iter().map(|c| SearchResult {
717            chunk: c,
718            score: f32::NAN, // Browsing doesn't have a score
719            doc_path: doc.path.to_string_lossy().to_string(),
720        }).collect())
721    }
722
723    /// Access the underlying graph store (for testing/debugging)
724    pub fn graph(&self) -> Arc<Mutex<cp_graph::GraphStore>> {
725        self.graph.clone()
726    }
727
    /// Generate an answer using the knowledge graph and an LLM
    ///
    /// Pipeline: hybrid search (top 5) -> context assembly within
    /// `context_budget` -> generation via the configured intelligence engine.
    ///
    /// # Errors
    /// Returns `CPError::NotFound` when no intelligence engine is configured;
    /// otherwise propagates search, embedding, and generation failures.
    pub async fn generate_answer(&self, query: &str) -> Result<GenerationResult> {
        let start = std::time::Instant::now();
        info!("Generating answer for: '{}'", query);

        // 1. Search for relevant chunks
        let results = self.search(query, 5)?;

        // 2. Assemble context
        let assembler = ContextAssembler::with_budget(self.context_budget);
        let scored_chunks: Vec<ScoredChunk> = results
            .iter()
            .map(|r| ScoredChunk {
                chunk: r.chunk.clone(),
                score: r.score,
                document_path: r.doc_path.clone(),
            })
            .collect();

        // The Merkle root binds the assembled context to a specific graph state.
        let state_root = {
            let graph = self.graph.lock().expect("graph lock poisoned");
            graph.compute_merkle_root()?
        };

        let assembled_context = assembler.assemble(scored_chunks, query, state_root);

        // 3. Generate answer using configured intelligence engine (no fallback)
        let answer = if let Some(ref engine) = self.intelligence {
            engine.generate(&assembled_context, query).await?
        } else {
            return Err(CPError::NotFound("Intelligence engine not configured".into()));
        };

        Ok(GenerationResult {
            answer,
            context: ContextAssembler::format(&assembled_context),
            latency_ms: start.elapsed().as_millis() as u64,
        })
    }
767
    /// Generate a cryptographic proof receipt for a query.
    ///
    /// This creates a signed, verifiable record of exactly what context
    /// was available when a search was performed. The receipt includes
    /// Merkle proofs for each chunk, allowing independent verification.
    ///
    /// # Errors
    /// Propagates graph-store failures (Merkle root, chunk hashes).
    pub fn generate_proof_receipt(
        &self,
        query: &str,
        search_results: &[SearchResult],
        identity: &cp_sync::DeviceIdentity,
    ) -> Result<cp_core::ProofReceipt> {
        let query_hash = *blake3::hash(query.as_bytes()).as_bytes();

        // Assemble context from search results.
        // NOTE(review): the budget here is double `context_budget` — presumably
        // so the receipt covers more than what generation would use; confirm.
        let assembler = ContextAssembler::with_budget(self.context_budget * 2);
        let scored_chunks: Vec<ScoredChunk> = search_results
            .iter()
            .map(|r| ScoredChunk {
                chunk: r.chunk.clone(),
                score: r.score,
                document_path: r.doc_path.clone(),
            })
            .collect();

        let state_root = {
            let graph = self.graph.lock().expect("graph lock poisoned");
            graph.compute_merkle_root()?
        };

        let assembled = assembler.assemble(scored_chunks, query, state_root);
        let context_string = ContextAssembler::format(&assembled);
        let context_hash = *blake3::hash(context_string.as_bytes()).as_bytes();

        // Get sorted chunk hashes and compute chunk tree root
        let (sorted_chunk_ids, sorted_chunk_hashes, chunk_tree_root) = {
            let graph = self.graph.lock().expect("graph lock poisoned");
            let sorted = graph.get_sorted_chunk_hashes()?;
            let hashes: Vec<[u8; 32]> = sorted.iter().map(|(_, h)| *h).collect();
            let root = cp_core::proof::compute_chunk_tree_root(&hashes);
            (sorted, hashes, root)
        };

        // Build per-chunk proofs and source references
        let mut chunk_proofs = Vec::new();
        let mut sources = Vec::new();

        for result in search_results {
            let chunk_id_bytes = *result.chunk.id.as_bytes();

            // Find this chunk's index in the sorted list.
            // A chunk absent from the sorted list still gets a SourceRef below,
            // just no Merkle proof.
            if let Some(idx) = sorted_chunk_ids.iter().position(|(id, _)| *id == chunk_id_bytes) {
                let proof = cp_core::proof::build_chunk_proof(
                    chunk_id_bytes,
                    result.chunk.text_hash,
                    idx,
                    &sorted_chunk_hashes,
                );
                chunk_proofs.push(proof);
            }

            sources.push(cp_core::SourceRef {
                document_path: result.doc_path.clone(),
                chunk_id: chunk_id_bytes,
                chunk_text: result.chunk.text.clone(),
                chunk_sequence: result.chunk.sequence,
                relevance_score: result.score,
            });
        }

        // Generate timestamp (seconds since Unix epoch; clock-before-epoch
        // degrades to 0 rather than failing)
        let now = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap_or_default();
        let secs = now.as_secs();
        let timestamp = format_unix_timestamp(secs);

        // Build the receipt with a zeroed signature, then sign over its
        // canonical signing bytes and fill the signature in.
        let mut receipt = cp_core::ProofReceipt {
            version: 1,
            query: query.to_string(),
            query_hash,
            timestamp,
            context_hash,
            state_root,
            chunk_tree_root,
            chunk_proofs,
            sources,
            signature: [0u8; 64],
            signer_public_key: identity.public_key,
            device_id: identity.device_id,
        };

        let sig = identity.sign(&receipt.signing_bytes());
        receipt.signature = sig;

        Ok(receipt)
    }
865
866    /// Chat with context from the knowledge graph.
867    ///
868    /// Performs RAG search, prepends conversation history to the context,
869    /// and delegates to `generate_answer` for the LLM call.
870    pub async fn chat(&self, query: &str, history: &[Message]) -> Result<String> {
871        let _start = std::time::Instant::now();
872
873        // 1. Search using only the current user query (not history)
874        let results = self.search(query, 5)?;
875
876        // 2. Assemble context from search results
877        let assembler = ContextAssembler::with_budget(2000);
878        let scored_chunks: Vec<ScoredChunk> = results
879            .iter()
880            .map(|r| ScoredChunk {
881                chunk: r.chunk.clone(),
882                score: r.score,
883                document_path: r.doc_path.clone(),
884            })
885            .collect();
886
887        let state_root = {
888            let graph = self.graph.lock().expect("graph lock poisoned");
889            graph.compute_merkle_root()?
890        };
891        let assembled_context = assembler.assemble(scored_chunks, query, state_root);
892
893        // 3. Build the full prompt with history for the LLM
894        let mut full_prompt = String::new();
895        for msg in history {
896            let role = match msg.role {
897                Role::User => "User",
898                Role::Assistant => "Assistant",
899                Role::System => "System",
900            };
901            full_prompt.push_str(&format!("{}: {}\n", role, msg.content));
902        }
903        full_prompt.push_str(&format!("User: {}\n", query));
904
905        // 4. Generate answer with history context
906        let answer = if let Some(ref engine) = self.intelligence {
907            engine.generate(&assembled_context, &full_prompt).await?
908        } else {
909            return Err(CPError::NotFound("Intelligence engine not configured".into()));
910        };
911
912        Ok(answer)
913    }
914}
915
/// A chat message exchanged in a conversation.
#[derive(Debug, Clone)]
pub struct Message {
    /// Who produced this message (user, assistant, or system).
    pub role: Role,
    /// The message text.
    pub content: String,
}
922
/// Chat message role
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Role {
    /// A turn written by the human user.
    User,
    /// A reply produced by the model.
    Assistant,
    /// A system / instruction message.
    System,
}
930
931// ============================================================================
932// Cross-source RRF merging (CP-013 ยง16)
933// ============================================================================
934
/// A verified remote result with its peer weight and origin.
///
/// Constructed by [`verify_and_extract`] after the peer's response
/// signature has been checked.
pub struct VerifiedRemoteResult {
    /// The raw search result returned by the peer.
    pub result: RemoteSearchResult,
    /// Peer rating weight (0.3..=1.0). Unrated peers default to 0.5.
    pub weight: f64,
    /// Node id of the peer that returned this result.
    pub peer_node_id: [u8; 16],
    /// The peer's state root as reported in its response.
    pub peer_state_root: [u8; 32],
    /// The peer's signature over its response.
    pub peer_signature: [u8; 64],
}
944
945/// Merge local search results with verified remote results using
946/// cross-source Reciprocal Rank Fusion.
947///
948/// Local results have implicit weight 1.0. Remote results are weighted by
949/// the peer's rating (0.3..=1.0). Results are deduplicated by chunk_id.
950/// The final list is capped at `max_results` (at most 20).
951pub fn merge_results(
952    local: &[SearchResult],
953    remote: &[VerifiedRemoteResult],
954    max_results: usize,
955) -> Vec<MergedSearchResult> {
956    const K: f64 = 60.0;
957    let cap = max_results.min(MAX_RESULTS as usize);
958
959    // chunk_id bytes โ†’ (accumulated score, MergedSearchResult)
960    let mut scores: HashMap<[u8; 16], (f64, MergedSearchResult)> = HashMap::new();
961
962    // Score local results (weight 1.0)
963    for (rank, result) in local.iter().enumerate() {
964        let rrf = 1.0 / (K + rank as f64 + 1.0);
965        let chunk_id = *result.chunk.id.as_bytes();
966
967        let entry = scores.entry(chunk_id).or_insert_with(|| {
968            (
969                0.0,
970                MergedSearchResult {
971                    chunk_id,
972                    chunk_text: result.chunk.text.clone(),
973                    document_path: result.doc_path.clone(),
974                    score: 0.0,
975                    source: ResultSource::Local,
976                    merkle_proof: None,
977                    peer_state_root: None,
978                    peer_signature: None,
979                },
980            )
981        });
982        entry.0 += rrf;
983    }
984
985    // Score remote results (weighted by peer rating)
986    for (rank, verified) in remote.iter().enumerate() {
987        let rrf = verified.weight / (K + rank as f64 + 1.0);
988        let chunk_id = verified.result.chunk_id;
989
990        if let Some(entry) = scores.get_mut(&chunk_id) {
991            // Duplicate: found both locally and remotely
992            entry.0 += rrf;
993            entry.1.source = ResultSource::Both {
994                peer_node_id: verified.peer_node_id,
995            };
996            // Preserve remote proof/signature for verification
997            if entry.1.peer_state_root.is_none() {
998                entry.1.peer_state_root = Some(verified.peer_state_root);
999                entry.1.peer_signature = Some(verified.peer_signature);
1000                entry.1.merkle_proof = verified.result.merkle_proof.clone();
1001            }
1002        } else {
1003            scores.insert(
1004                chunk_id,
1005                (
1006                    rrf,
1007                    MergedSearchResult {
1008                        chunk_id,
1009                        chunk_text: verified.result.chunk_text.clone(),
1010                        document_path: verified.result.document_path.clone(),
1011                        score: 0.0,
1012                        source: ResultSource::Remote {
1013                            peer_node_id: verified.peer_node_id,
1014                        },
1015                        merkle_proof: verified.result.merkle_proof.clone(),
1016                        peer_state_root: Some(verified.peer_state_root),
1017                        peer_signature: Some(verified.peer_signature),
1018                    },
1019                ),
1020            );
1021        }
1022    }
1023
1024    // Sort by score descending, take top `cap`
1025    let mut merged: Vec<(f64, MergedSearchResult)> = scores.into_values().collect();
1026    merged.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
1027    merged.truncate(cap);
1028
1029    // Write final scores into the result structs
1030    merged
1031        .into_iter()
1032        .map(|(score, mut result)| {
1033            result.score = score;
1034            result
1035        })
1036        .collect()
1037}
1038
1039/// Verify a remote SearchResponse and extract weighted results.
1040///
1041/// Checks the response signature against the peer's public key.
1042/// Returns None if the response is invalid or not Ok.
1043pub fn verify_and_extract(
1044    response: &SearchResponse,
1045    peer_public_key: &[u8; 32],
1046    peer_node_id: [u8; 16],
1047    peer_rating: Option<f64>,
1048) -> Option<Vec<VerifiedRemoteResult>> {
1049    if response.status != SearchStatus::Ok {
1050        return None;
1051    }
1052
1053    // Verify response signature
1054    if cp_tor::verify_response(response, peer_public_key).is_err() {
1055        warn!(
1056            "Invalid response signature from peer {}",
1057            hex::encode(&peer_node_id[..4])
1058        );
1059        return None;
1060    }
1061
1062    let weight = peer_rating
1063        .map(|r| r.clamp(0.3, 1.0))
1064        .unwrap_or(0.5);
1065
1066    let results = response
1067        .results
1068        .iter()
1069        .map(|r| VerifiedRemoteResult {
1070            result: r.clone(),
1071            weight,
1072            peer_node_id,
1073            peer_state_root: response.peer_state_root,
1074            peer_signature: response.signature,
1075        })
1076        .collect();
1077
1078    Some(results)
1079}
1080
impl QueryEngine {
    /// Perform a live search that merges local index results with
    /// already-verified remote peer results via cross-source RRF.
    ///
    /// Local results are always included regardless of remote availability.
    /// Note: this method does not dispatch to peers itself — the caller is
    /// responsible for collecting `remote_results` (e.g. within a timeout)
    /// before calling.
    ///
    /// # Errors
    /// Propagates errors from the local hybrid search.
    pub async fn live_search(
        &self,
        query: &str,
        remote_results: Vec<VerifiedRemoteResult>,
        max_results: usize,
    ) -> Result<Vec<MergedSearchResult>> {
        info!(
            "Live search for '{}' with {} remote result(s)",
            query,
            remote_results.len()
        );

        // 1. Local hybrid search
        let local_results = self.search(query, max_results)?;

        // 2. Merge local + remote via cross-source RRF
        let merged = merge_results(&local_results, &remote_results, max_results);

        info!(
            "Live search returned {} merged results ({} local, {} remote)",
            merged.len(),
            local_results.len(),
            remote_results.len()
        );

        Ok(merged)
    }
}
1116
/// Sanitize a query string for FTS5 to prevent syntax injection.
///
/// Strips FTS5 metacharacters (operators, grouping, column filters) and
/// wraps the result in double quotes for phrase matching. Returns an empty
/// string when nothing but metacharacters/whitespace remains, so FTS
/// matches no rows instead of raising a syntax error.
fn sanitize_fts5_query(query: &str) -> String {
    let sanitized: String = query
        .chars()
        .filter(|c| !matches!(c, '"' | '*' | '^' | '+' | '-' | '(' | ')' | '{' | '}' | ':'))
        .collect();
    let trimmed = sanitized.trim();
    if trimmed.is_empty() {
        // All characters were operators โ€” return empty string so FTS returns no results
        String::new()
    } else {
        format!("\"{}\"", trimmed)
    }
}
1135
/// Format a unix timestamp (seconds since the epoch, UTC) as ISO 8601,
/// e.g. `1970-01-01T00:00:00Z`.
fn format_unix_timestamp(secs: u64) -> String {
    let days_since_epoch = secs / 86400;
    let time_of_day = secs % 86400;
    let hours = time_of_day / 3600;
    let minutes = (time_of_day % 3600) / 60;
    let seconds = time_of_day % 60;
    let (year, month, day) = days_to_date(days_since_epoch);
    format!(
        "{:04}-{:02}-{:02}T{:02}:{:02}:{:02}Z",
        year, month, day, hours, minutes, seconds
    )
}

/// Convert days since the unix epoch to a `(year, month, day)` civil date.
///
/// Integer-only implementation of Howard Hinnant's `civil_from_days`
/// algorithm for the proleptic Gregorian calendar.
fn days_to_date(days: u64) -> (u64, u64, u64) {
    // Shift the epoch to 0000-03-01 so leap days fall at the end of each
    // 400-year era (146097 days per era).
    let z = days + 719468;
    let era = z / 146097;
    let doe = z - era * 146097; // day-of-era, [0, 146096]
    let yoe = (doe - doe / 1460 + doe / 36524 - doe / 146096) / 365; // year-of-era
    let y = yoe + era * 400;
    let doy = doe - (365 * yoe + yoe / 4 - yoe / 100); // day-of-year, March-based
    let mp = (5 * doy + 2) / 153; // month index with March = 0
    let d = doy - (153 * mp + 2) / 5 + 1;
    let m = if mp < 10 { mp + 3 } else { mp - 9 };
    // January and February belong to the next civil year in the
    // March-based counting scheme.
    let y = if m <= 2 { y + 1 } else { y };
    (y, m, d)
}
1162
1163#[cfg(test)]
1164mod tests {
1165    use super::*;
1166    use cp_core::{Document, Chunk};
1167    use std::sync::{Arc, Mutex};
1168    use tempfile::TempDir;
1169
    /// A document inserted with one chunk should yield exactly that chunk
    /// when fetched back by document id.
    #[tokio::test]
    async fn test_get_chunks_for_document() {
        let temp = TempDir::new().unwrap();
        let db_path = temp.path().join("test.db");
        let mut graph = cp_graph::GraphStore::open(db_path.to_str().unwrap()).unwrap();

        let doc = Document::new("test.md".into(), b"Hello world", 100);
        graph.insert_document(&doc).unwrap();

        let chunk = Chunk {
            id: Uuid::new_v4(),
            doc_id: doc.id,
            text: "Hello world".to_string(),
            byte_offset: 0,
            byte_length: 11,
            sequence: 0,
            text_hash: [0; 32],
        };
        graph.insert_chunk(&chunk).unwrap();

        let embedder = Arc::new(cp_embeddings::EmbeddingEngine::new().unwrap());
        let qe = QueryEngine::new(Arc::new(Mutex::new(graph)), embedder);

        let results = qe.get_chunks_for_document(doc.id).unwrap();
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].chunk.text, "Hello world");
    }
1197
    /// Hybrid search should find the same chunk through both a lexical
    /// query (exact keywords) and a semantic query (related terms).
    #[tokio::test]
    async fn test_hybrid_search() {
        let temp = TempDir::new().unwrap();
        let db_path = temp.path().join("test_hybrid.db");
        let mut graph = cp_graph::GraphStore::open(db_path.to_str().unwrap()).unwrap();

        let doc = Document::new("test.md".into(), b"The quick brown fox jumps over the lazy dog", 100);
        graph.insert_document(&doc).unwrap();

        let chunk = Chunk {
            id: Uuid::new_v4(),
            doc_id: doc.id,
            text: "The quick brown fox jumps over the lazy dog".to_string(),
            byte_offset: 0,
            byte_length: 43,
            sequence: 0,
            text_hash: [0; 32],
        };
        graph.insert_chunk(&chunk).unwrap();

        // Add embedding for semantic search
        let embedder = Arc::new(cp_embeddings::EmbeddingEngine::new().unwrap());
        let vec = embedder.embed(&chunk.text).unwrap();
        let emb = cp_core::Embedding::new(chunk.id, &vec, embedder.model_hash().unwrap(), 0);
        graph.insert_embedding(&emb).unwrap();

        let qe = QueryEngine::new(Arc::new(Mutex::new(graph)), embedder);

        // Test lexical priority
        let results = qe.search("quick brown fox", 5).unwrap();
        assert!(!results.is_empty());
        assert!(results[0].chunk.text.contains("quick brown fox"));

        // Test semantic priority (using synonym/related terms)
        let results_sem = qe.search("fast auburn canine", 5).unwrap();
        assert!(!results_sem.is_empty());
        assert!(results_sem[0].chunk.text.contains("quick brown fox"));
    }
1236
1237    #[tokio::test]
1238    async fn test_search_comparison_proof() {
1239        let temp = TempDir::new().unwrap();
1240        let db_path = temp.path().join("comparison_proof.db");
1241        let mut graph = cp_graph::GraphStore::open(db_path.to_str().unwrap()).unwrap();
1242        
1243        // Setup Corpus
1244        let t1 = "The quick brown fox jumps over the lazy dog"; // Keyword: fox
1245        let t2 = "Artificial intelligence is transforming the modern world"; // Keyword: modern
1246        let t3 = "A fast auburn canine leaps across an idle hound"; // Keyword: canine, Semantic match for fox
1247        
1248        let texts = vec![t1, t2, t3];
1249        for (i, text) in texts.iter().enumerate() {
1250            let doc = Document::new(format!("doc_{}.md", i).into(), text.as_bytes(), 100);
1251            graph.insert_document(&doc).unwrap();
1252            let chunk = Chunk {
1253                id: Uuid::new_v4(),
1254                doc_id: doc.id,
1255                text: text.to_string(),
1256                byte_offset: 0,
1257                byte_length: text.len() as u64,
1258                sequence: 0,
1259                text_hash: [0; 32],
1260            };
1261            graph.insert_chunk(&chunk).unwrap();
1262            let embedder = Arc::new(cp_embeddings::EmbeddingEngine::new().unwrap());
1263            let vec = embedder.embed(text).unwrap();
1264            let emb = cp_core::Embedding::new(chunk.id, &vec, embedder.model_hash().unwrap(), 0);
1265            graph.insert_embedding(&emb).unwrap();
1266        }
1267        
1268        let embedder = Arc::new(cp_embeddings::EmbeddingEngine::new().unwrap());
1269        let qe = QueryEngine::new(Arc::new(Mutex::new(graph)), embedder);
1270        
1271        // Query: "fox"
1272        let query = "fox";
1273        
1274        // 1. Lexical Results
1275        let lexical = {
1276            let graph_lock = qe.graph();
1277            let g = graph_lock.lock().expect("graph lock poisoned");
1278            g.search_lexical(query, 5).unwrap()
1279        };
1280        // Expect result: t1 (direct hit)
1281        assert_eq!(lexical.len(), 1); 
1282
1283        // 2. Vector Results
1284        let vector = {
1285            let graph_lock = qe.graph();
1286            let g = graph_lock.lock().expect("graph lock poisoned");
1287            let e = cp_embeddings::EmbeddingEngine::new().unwrap();
1288            let q_vec = e.embed_query(query).unwrap();
1289            g.search(&q_vec, 5).unwrap()
1290        };
1291        // Expect results: t1 and t3 (canine)
1292        assert!(vector.len() >= 2);
1293
1294        // 3. Hybrid Results
1295        let hybrid = qe.search(query, 5).unwrap();
1296        
1297        println!("\n--- SEARCH PROOF FOR '{}' ---", query);
1298        println!("LEXICAL HITS: {}", lexical.len());
1299        println!("VECTOR HITS:  {}", vector.len());
1300        println!("HYBRID HITS:  {}", hybrid.len());
1301        
1302        // Proof: Hybrid should contain both direct hits and semantic relatives
1303        let texts_found: Vec<String> = hybrid.iter().map(|r| r.chunk.text.clone()).collect();
1304        assert!(texts_found.contains(&t1.to_string())); // Direct lexical + semantic
1305        assert!(texts_found.contains(&t3.to_string())); // Semantic only
1306    }
1307
1308    #[test]
1309    fn test_filter_by_mime_type() {
1310        use cp_core::Document;
1311        use std::path::PathBuf;
1312
1313        let doc_md = Document::new(PathBuf::from("test.md"), b"content", 1000);
1314        let doc_pdf = Document::new(PathBuf::from("test.pdf"), b"pdf content", 1000);
1315
1316        let filter = Filter::MimeType("text/markdown".to_string());
1317
1318        assert!(filter.matches(&doc_md));
1319        assert!(!filter.matches(&doc_pdf));
1320    }
1321
1322    #[test]
1323    fn test_filter_by_path_glob() {
1324        use cp_core::Document;
1325        use std::path::PathBuf;
1326
1327        let doc1 = Document::new(PathBuf::from("docs/readme.md"), b"content", 1000);
1328        let doc2 = Document::new(PathBuf::from("src/main.rs"), b"code", 1000);
1329
1330        let filter = Filter::DocumentPath("docs/*.md".to_string());
1331
1332        assert!(filter.matches(&doc1));
1333        assert!(!filter.matches(&doc2));
1334    }
1335
1336    #[test]
1337    fn test_filter_by_modified_time() {
1338        use cp_core::Document;
1339        use std::path::PathBuf;
1340
1341        let old_doc = Document::new(PathBuf::from("old.md"), b"content", 1000);
1342        let new_doc = Document::new(PathBuf::from("new.md"), b"content", 2000);
1343
1344        let filter_after = Filter::ModifiedAfter(1500);
1345        let filter_before = Filter::ModifiedBefore(1500);
1346
1347        assert!(!filter_after.matches(&old_doc));
1348        assert!(filter_after.matches(&new_doc));
1349
1350        assert!(filter_before.matches(&old_doc));
1351        assert!(!filter_before.matches(&new_doc));
1352    }
1353
    /// `extract_citations` should map response text back to the context
    /// chunks it overlaps with, reporting a positive confidence.
    #[test]
    fn test_citation_extraction() {
        use cp_core::{ContextChunk, ContextMetadata};

        let context = AssembledContext {
            chunks: vec![ContextChunk {
                chunk_id: Uuid::new_v4(),
                document_path: "test.md".to_string(),
                text: "The quick brown fox jumps over the lazy dog".to_string(),
                score: 1.0,
                sequence: 0,
            }],
            total_tokens: 10,
            truncated: false,
            metadata: ContextMetadata {
                query_hash: [0u8; 32],
                state_root: [0u8; 32],
            },
        };

        // Response that contains text from context
        let response = "As mentioned, the quick brown fox jumps over the lazy dog in the story.";
        let citations = extract_citations(response, &context);

        assert!(!citations.is_empty(), "Should find citations for overlapping text");
        assert!(citations[0].confidence > 0.0);
    }
1381
    /// `validate_response` should warn on hallucination-style phrasing
    /// ("From my knowledge...") and accept responses that admit the
    /// substrate lacks the information.
    #[test]
    fn test_hallucination_detection() {
        use cp_core::{ContextChunk, ContextMetadata};

        let context = AssembledContext {
            chunks: vec![ContextChunk {
                chunk_id: Uuid::new_v4(),
                document_path: "test.md".to_string(),
                text: "The capital of France is Paris".to_string(),
                score: 1.0,
                sequence: 0,
            }],
            total_tokens: 10,
            truncated: false,
            metadata: ContextMetadata {
                query_hash: [0u8; 32],
                state_root: [0u8; 32],
            },
        };

        // Response with hallucination phrase
        let bad_response = "From my knowledge, I believe that Paris is a beautiful city.";
        let result = validate_response(bad_response, &context);

        assert!(!result.warnings.is_empty(), "Should detect hallucination phrases");
        assert!(result.warnings.iter().any(|w| w.contains("hallucination")));

        // Good response that admits missing info
        let good_response = "Information is missing from the substrate.";
        let result2 = validate_response(good_response, &context);

        assert!(result2.is_valid, "Should be valid when admitting missing info");
    }
1415
    /// End-to-end retrieval over the on-disk `test_corpus` fixtures:
    /// lexical-gap identifiers, pure-semantic queries, and mixed hybrid
    /// queries. Skips cleanly when the corpus directory is absent.
    /// The corpus location can be overridden via `CANON_TEST_CORPUS`.
    #[tokio::test]
    async fn test_real_corpus_proof() {
        let temp = TempDir::new().unwrap();
        let db_path = temp.path().join("real_corpus.db");
        let mut graph = cp_graph::GraphStore::open(db_path.to_str().unwrap()).unwrap();

        // 1. Ingest actual test_corpus files
        let corpus_dir = match std::env::var("CANON_TEST_CORPUS") {
            Ok(p) => std::path::PathBuf::from(p),
            Err(_) => std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../test_corpus"),
        };

        // Skip test if corpus doesn't exist
        if !corpus_dir.exists() {
            println!("Skipping test: corpus directory not found at {:?}", corpus_dir);
            return;
        }

        let embedder = Arc::new(cp_embeddings::EmbeddingEngine::new().unwrap());

        let files = vec!["zk.md", "ethereum.md", "random.md", "zksnark.md", "lexical_gap.md", "adversarial.md"];

        for file_name in files {
            let path = corpus_dir.join(file_name);
            if !path.exists() { continue; }

            let content = std::fs::read_to_string(&path).unwrap();
            let doc = Document::new(path.clone(), content.as_bytes(), 0);
            graph.insert_document(&doc).unwrap();

            // Simple chunking (for brevity in test): one chunk per file
            let chunk = Chunk {
                id: Uuid::new_v4(),
                doc_id: doc.id,
                text: content.clone(),
                byte_offset: 0,
                byte_length: content.len() as u64,
                sequence: 0,
                text_hash: [0; 32],
            };
            graph.insert_chunk(&chunk).unwrap();

            let vec = embedder.embed(&content).unwrap();
            let emb = cp_core::Embedding::new(chunk.id, &vec, embedder.model_hash().unwrap(), 0);
            graph.insert_embedding(&emb).unwrap();
        }

        let qe = QueryEngine::new(Arc::new(Mutex::new(graph)), embedder);

        // Test 1: Lexical Gap (Strict identifier)
        let q1 = "calculate_hyper_parameter_v7";
        let res1 = qe.search(q1, 1).unwrap();
        println!("\n--- QUERY: '{}' ---", q1);
        for (i, r) in res1.iter().enumerate() {
            println!("Rank {}: [Score: {:.4}] {}", i+1, r.score, r.doc_path);
        }
        assert!(res1[0].doc_path.contains("lexical_gap.md"));

        // Test 2: Semantic (ZKP concept)
        let q2 = "cryptographic privacy statement validity"; // No "ZKP" or "SNARK" words
        let res2 = qe.search(q2, 3).unwrap();
        println!("\n--- QUERY: '{}' ---", q2);
        for (i, r) in res2.iter().enumerate() {
            println!("Rank {}: [Score: {:.4}] {}", i+1, r.score, r.doc_path);
        }
        // Should find zk.md or zksnark.md
        let found_zk = res2.iter().any(|r| r.doc_path.contains("zk"));
        assert!(found_zk);

        // Test 3: Hybrid (Privacy blockchain)
        let q3 = "Ethereum privacy zksnark";
        let res3 = qe.search(q3, 5).unwrap();
        println!("\n--- QUERY: '{}' ---", q3);
        for (i, r) in res3.iter().enumerate() {
             println!("Rank {}: [Score: {:.4}] {}", i+1, r.score, r.doc_path);
        }
    }
1493
    /// The dedicated single-mode entrypoints work independently:
    /// `search_semantic` finds related terms, `search_lexical` finds
    /// exact keywords.
    #[tokio::test]
    async fn test_search_modes() {
        let temp = TempDir::new().unwrap();
        let db_path = temp.path().join("modes.db");
        let mut graph = cp_graph::GraphStore::open(db_path.to_str().unwrap()).unwrap();

        let doc = Document::new("test.md".into(), b"The quick brown fox jumps over the lazy dog", 100);
        graph.insert_document(&doc).unwrap();
        let chunk = Chunk {
            id: Uuid::new_v4(),
            doc_id: doc.id,
            text: "The quick brown fox jumps over the lazy dog".to_string(),
            byte_offset: 0,
            byte_length: 43,
            sequence: 0,
            text_hash: [0; 32],
        };
        graph.insert_chunk(&chunk).unwrap();
        let embedder = Arc::new(cp_embeddings::EmbeddingEngine::new().unwrap());
        let vec = embedder.embed(&chunk.text).unwrap();
        let emb = cp_core::Embedding::new(chunk.id, &vec, embedder.model_hash().unwrap(), 0);
        graph.insert_embedding(&emb).unwrap();

        let qe = QueryEngine::new(Arc::new(Mutex::new(graph)), embedder);

        // Semantic
        // "canine" is semantically close to "dog" or "fox"
        let results = qe.search_semantic("canine", 5).unwrap();
        assert!(!results.is_empty());

        // Lexical
        // "fox" is in the text
        let results = qe.search_lexical("fox", 5).unwrap();
        assert!(!results.is_empty());
        assert!(results[0].chunk.text.contains("fox"));
    }
1530
1531    // ========================================================================
1532    // Cross-source RRF merge tests (CP-013 ยง16)
1533    // ========================================================================
1534
    /// Build a local `SearchResult` whose chunk id is all `chunk_id_byte`
    /// bytes, for deterministic merge-test fixtures.
    fn make_local_result(chunk_id_byte: u8, text: &str, doc_path: &str, score: f32) -> SearchResult {
        SearchResult {
            chunk: Chunk {
                id: Uuid::from_bytes([chunk_id_byte; 16]),
                doc_id: Uuid::from_bytes([0u8; 16]),
                text: text.to_string(),
                byte_offset: 0,
                byte_length: text.len() as u64,
                sequence: 0,
                text_hash: [0u8; 32],
            },
            score,
            doc_path: doc_path.to_string(),
        }
    }
1550
    /// Build a `RemoteSearchResult` whose chunk id is all `chunk_id_byte`
    /// bytes (no merkle proof attached).
    fn make_remote_result(chunk_id_byte: u8, text: &str, doc_path: &str, score: u32) -> RemoteSearchResult {
        RemoteSearchResult {
            chunk_id: [chunk_id_byte; 16],
            chunk_text: text.to_string(),
            document_path: doc_path.to_string(),
            score,
            merkle_proof: None,
        }
    }
1560
    /// Wrap a remote result as a `VerifiedRemoteResult` with the given
    /// weight and peer id (state root and signature are zeroed).
    fn make_verified(
        result: RemoteSearchResult,
        weight: f64,
        peer_byte: u8,
    ) -> VerifiedRemoteResult {
        VerifiedRemoteResult {
            result,
            weight,
            peer_node_id: [peer_byte; 16],
            peer_state_root: [0u8; 32],
            peer_signature: [0u8; 64],
        }
    }
1574
1575    #[test]
1576    fn test_merge_local_only() {
1577        let local = vec![
1578            make_local_result(1, "chunk A", "doc_a.md", 0.9),
1579            make_local_result(2, "chunk B", "doc_b.md", 0.8),
1580        ];
1581
1582        let merged = merge_results(&local, &[], 10);
1583        assert_eq!(merged.len(), 2);
1584        assert!(merged[0].score > merged[1].score);
1585        assert!(matches!(merged[0].source, ResultSource::Local));
1586        assert_eq!(merged[0].chunk_text, "chunk A");
1587    }
1588
1589    #[test]
1590    fn test_merge_remote_only() {
1591        let remote = vec![
1592            make_verified(
1593                make_remote_result(3, "remote chunk", "remote.md", 16000),
1594                0.8,
1595                99,
1596            ),
1597        ];
1598
1599        let merged = merge_results(&[], &remote, 10);
1600        assert_eq!(merged.len(), 1);
1601        assert!(matches!(merged[0].source, ResultSource::Remote { .. }));
1602        assert_eq!(merged[0].chunk_text, "remote chunk");
1603        assert!(merged[0].peer_state_root.is_some());
1604    }
1605
    /// A chunk found both locally and remotely is collapsed into a single
    /// `Both` result whose RRF score exceeds the local-only score.
    #[test]
    fn test_merge_deduplication() {
        // Same chunk_id in both local and remote
        let local = vec![make_local_result(1, "shared chunk local", "doc.md", 0.9)];
        let remote = vec![make_verified(
            make_remote_result(1, "shared chunk remote", "doc.md", 16000),
            0.7,
            50,
        )];

        let merged = merge_results(&local, &remote, 10);
        assert_eq!(merged.len(), 1, "Duplicate chunk should be merged");
        assert!(matches!(merged[0].source, ResultSource::Both { .. }));

        // Score should be higher than local-only since both contributed
        let local_only = merge_results(&local, &[], 10);
        assert!(
            merged[0].score > local_only[0].score,
            "Merged score ({}) should exceed local-only score ({})",
            merged[0].score,
            local_only[0].score
        );
    }
1629
    /// The peer weight scales the RRF contribution: a 1.0-weight peer's
    /// result outranks the same result from a 0.3-weight peer.
    #[test]
    fn test_merge_weight_affects_score() {
        let remote_high = vec![make_verified(
            make_remote_result(1, "high weight", "doc.md", 16000),
            1.0,
            10,
        )];
        let remote_low = vec![make_verified(
            make_remote_result(1, "low weight", "doc.md", 16000),
            0.3,
            20,
        )];

        let merged_high = merge_results(&[], &remote_high, 10);
        let merged_low = merge_results(&[], &remote_low, 10);

        assert!(
            merged_high[0].score > merged_low[0].score,
            "Higher weight ({}) should produce higher score ({}) vs ({})",
            1.0,
            merged_high[0].score,
            merged_low[0].score,
        );
    }
1654
1655    #[test]
1656    fn test_merge_respects_max_results() {
1657        let local: Vec<SearchResult> = (0..15)
1658            .map(|i| make_local_result(i, &format!("chunk {}", i), "doc.md", 0.9 - i as f32 * 0.01))
1659            .collect();
1660        let remote: Vec<VerifiedRemoteResult> = (15..30)
1661            .map(|i| {
1662                make_verified(
1663                    make_remote_result(i, &format!("remote {}", i), "remote.md", 10000),
1664                    0.5,
1665                    99,
1666                )
1667            })
1668            .collect();
1669
1670        let merged = merge_results(&local, &remote, 10);
1671        assert_eq!(merged.len(), 10, "Should cap at max_results");
1672    }
1673
1674    #[test]
1675    fn test_merge_cap_at_20() {
1676        let local: Vec<SearchResult> = (0..25)
1677            .map(|i| make_local_result(i, &format!("c{}", i), "d.md", 0.5))
1678            .collect();
1679
1680        // Even if max_results is larger, cap at MAX_RESULTS (20)
1681        let merged = merge_results(&local, &[], 100);
1682        assert_eq!(merged.len(), 20, "Should cap at MAX_RESULTS (20)");
1683    }
1684
1685    #[test]
1686    fn test_merge_scores_decrease() {
1687        let local = vec![
1688            make_local_result(1, "first", "a.md", 0.9),
1689            make_local_result(2, "second", "b.md", 0.8),
1690            make_local_result(3, "third", "c.md", 0.7),
1691        ];
1692        let remote = vec![
1693            make_verified(make_remote_result(4, "r1", "d.md", 16000), 0.8, 10),
1694            make_verified(make_remote_result(5, "r2", "e.md", 15000), 0.8, 10),
1695        ];
1696
1697        let merged = merge_results(&local, &remote, 20);
1698        for w in merged.windows(2) {
1699            assert!(
1700                w[0].score >= w[1].score,
1701                "Scores should be in descending order: {} >= {}",
1702                w[0].score,
1703                w[1].score
1704            );
1705        }
1706    }
1707
1708    #[test]
1709    fn test_merge_empty_inputs() {
1710        let merged = merge_results(&[], &[], 10);
1711        assert!(merged.is_empty());
1712    }
1713
1714    #[test]
1715    fn test_verify_and_extract_valid() {
1716        // Build a properly signed response
1717        let signing_key = ed25519_dalek::SigningKey::from_bytes(&[88u8; 32]);
1718        let public_key = signing_key.verifying_key().to_bytes();
1719
1720        let mut response = SearchResponse {
1721            request_id: [1u8; 16],
1722            status: SearchStatus::Ok,
1723            results: vec![RemoteSearchResult {
1724                chunk_id: [2u8; 16],
1725                chunk_text: "test chunk".to_string(),
1726                document_path: "test.md".to_string(),
1727                score: 16000,
1728                merkle_proof: None,
1729            }],
1730            peer_state_root: [3u8; 32],
1731            search_latency_ms: 50,
1732            timestamp: 1000,
1733            signature: [0u8; 64],
1734        };
1735
1736        let signing_bytes = response.signing_bytes();
1737        response.signature =
1738            ed25519_dalek::Signer::sign(&signing_key, &signing_bytes).to_bytes();
1739
1740        let extracted = verify_and_extract(&response, &public_key, [10u8; 16], Some(0.9));
1741        assert!(extracted.is_some());
1742        let results = extracted.unwrap();
1743        assert_eq!(results.len(), 1);
1744        assert!((results[0].weight - 0.9).abs() < 0.001);
1745    }
1746
1747    #[test]
1748    fn test_verify_and_extract_bad_signature() {
1749        let response = SearchResponse {
1750            request_id: [1u8; 16],
1751            status: SearchStatus::Ok,
1752            results: vec![],
1753            peer_state_root: [0u8; 32],
1754            search_latency_ms: 0,
1755            timestamp: 1000,
1756            signature: [0u8; 64], // Invalid signature
1757        };
1758
1759        let fake_key = [0u8; 32];
1760        let extracted = verify_and_extract(&response, &fake_key, [10u8; 16], None);
1761        assert!(extracted.is_none());
1762    }
1763
1764    #[test]
1765    fn test_verify_and_extract_non_ok_status() {
1766        let response = SearchResponse {
1767            request_id: [1u8; 16],
1768            status: SearchStatus::ModelMismatch,
1769            results: vec![],
1770            peer_state_root: [0u8; 32],
1771            search_latency_ms: 0,
1772            timestamp: 1000,
1773            signature: [0u8; 64],
1774        };
1775
1776        let key = [0u8; 32];
1777        let extracted = verify_and_extract(&response, &key, [10u8; 16], None);
1778        assert!(extracted.is_none(), "Non-Ok status should return None");
1779    }
1780
1781    #[test]
1782    fn test_verify_and_extract_unrated_peer() {
1783        let signing_key = ed25519_dalek::SigningKey::from_bytes(&[77u8; 32]);
1784        let public_key = signing_key.verifying_key().to_bytes();
1785
1786        let mut response = SearchResponse {
1787            request_id: [1u8; 16],
1788            status: SearchStatus::Ok,
1789            results: vec![RemoteSearchResult {
1790                chunk_id: [5u8; 16],
1791                chunk_text: "unrated".to_string(),
1792                document_path: "doc.md".to_string(),
1793                score: 14000,
1794                merkle_proof: None,
1795            }],
1796            peer_state_root: [0u8; 32],
1797            search_latency_ms: 10,
1798            timestamp: 2000,
1799            signature: [0u8; 64],
1800        };
1801
1802        let signing_bytes = response.signing_bytes();
1803        response.signature =
1804            ed25519_dalek::Signer::sign(&signing_key, &signing_bytes).to_bytes();
1805
1806        // No rating โ†’ default weight 0.5
1807        let extracted = verify_and_extract(&response, &public_key, [20u8; 16], None);
1808        assert!(extracted.is_some());
1809        assert!((extracted.unwrap()[0].weight - 0.5).abs() < 0.001);
1810    }
1811
1812    #[test]
1813    fn test_verify_and_extract_clamps_weight() {
1814        let signing_key = ed25519_dalek::SigningKey::from_bytes(&[77u8; 32]);
1815        let public_key = signing_key.verifying_key().to_bytes();
1816
1817        let mut response = SearchResponse {
1818            request_id: [1u8; 16],
1819            status: SearchStatus::Ok,
1820            results: vec![RemoteSearchResult {
1821                chunk_id: [5u8; 16],
1822                chunk_text: "clamped".to_string(),
1823                document_path: "doc.md".to_string(),
1824                score: 14000,
1825                merkle_proof: None,
1826            }],
1827            peer_state_root: [0u8; 32],
1828            search_latency_ms: 10,
1829            timestamp: 2000,
1830            signature: [0u8; 64],
1831        };
1832
1833        let signing_bytes = response.signing_bytes();
1834        response.signature =
1835            ed25519_dalek::Signer::sign(&signing_key, &signing_bytes).to_bytes();
1836
1837        // Very low rating โ†’ clamped to 0.3
1838        let extracted = verify_and_extract(&response, &public_key, [20u8; 16], Some(0.01));
1839        assert!((extracted.unwrap()[0].weight - 0.3).abs() < 0.001);
1840
1841        // Very high rating โ†’ clamped to 1.0
1842        let mut response2 = response.clone();
1843        let signing_bytes2 = response2.signing_bytes();
1844        response2.signature =
1845            ed25519_dalek::Signer::sign(&signing_key, &signing_bytes2).to_bytes();
1846
1847        let extracted2 = verify_and_extract(&response2, &public_key, [20u8; 16], Some(5.0));
1848        assert!((extracted2.unwrap()[0].weight - 1.0).abs() < 0.001);
1849    }
1850}