1use cp_core::{Chunk, CPError, Result, ContextAssembler, ScoredChunk, AssembledContext};
8use cp_tor::types::{
9 MergedSearchResult, RemoteSearchResult, ResultSource, SearchResponse, SearchStatus,
10 MAX_RESULTS,
11};
12use glob::Pattern;
13use lru::LruCache;
14use std::collections::HashMap;
15use std::num::NonZeroUsize;
16use std::sync::{Arc, Mutex, RwLock};
17use tracing::{info, warn};
18use uuid::Uuid;
19
/// Metadata filter applied to documents during filtered search.
#[derive(Debug, Clone)]
pub enum Filter {
    /// Glob pattern matched against the document's path.
    DocumentPath(String),
    /// Exact MIME-type equality.
    MimeType(String),
    /// Document mtime strictly after this Unix timestamp.
    ModifiedAfter(i64),
    /// Document mtime strictly before this Unix timestamp.
    ModifiedBefore(i64),
}
34
35impl Filter {
36 pub fn matches(&self, doc: &cp_core::Document) -> bool {
38 match self {
39 Filter::DocumentPath(pattern) => {
40 if let Ok(glob) = Pattern::new(pattern) {
41 glob.matches(doc.path.to_string_lossy().as_ref())
42 } else {
43 false
44 }
45 }
46 Filter::MimeType(mime) => doc.mime_type == *mime,
47 Filter::ModifiedAfter(ts) => doc.mtime > *ts,
48 Filter::ModifiedBefore(ts) => doc.mtime < *ts,
49 }
50 }
51}
52
/// LRU cache of search results keyed by BLAKE3(query, k), tied to a graph
/// Merkle root so any change to the underlying store invalidates it wholesale.
pub struct QueryCache {
    /// (query, k) hash -> ordered chunk ids of the cached result.
    cache: RwLock<LruCache<[u8; 32], Vec<Uuid>>>,
    /// Merkle root of the graph state the cached entries were built against.
    state_root: RwLock<[u8; 32]>,
}
62
63impl QueryCache {
64 pub fn new(capacity: usize) -> Self {
66 Self {
67 cache: RwLock::new(LruCache::new(
68 NonZeroUsize::new(capacity).unwrap_or(NonZeroUsize::new(100).unwrap()),
69 )),
70 state_root: RwLock::new([0u8; 32]),
71 }
72 }
73
74 pub fn get(&self, query: &str, k: usize) -> Option<Vec<Uuid>> {
76 let hash = Self::hash_key(query, k);
77 self.cache.write().ok()?.get(&hash).cloned()
78 }
79
80 pub fn put(&self, query: &str, k: usize, results: Vec<Uuid>) {
82 let hash = Self::hash_key(query, k);
83 if let Ok(mut cache) = self.cache.write() {
84 cache.put(hash, results);
85 }
86 }
87
88 pub fn is_valid(&self, current_root: &[u8; 32]) -> bool {
90 if let Ok(root) = self.state_root.read() {
91 *root == *current_root
92 } else {
93 false
94 }
95 }
96
97 pub fn invalidate(&self, new_root: [u8; 32]) {
99 if let Ok(mut cache) = self.cache.write() {
100 cache.clear();
101 }
102 if let Ok(mut root) = self.state_root.write() {
103 *root = new_root;
104 }
105 }
106
107 fn hash_key(query: &str, k: usize) -> [u8; 32] {
109 let mut hasher = blake3::Hasher::new();
110 hasher.update(query.as_bytes());
111 hasher.update(&k.to_le_bytes());
112 *hasher.finalize().as_bytes()
113 }
114}
115
impl Default for QueryCache {
    /// Default capacity of 100 cached queries.
    fn default() -> Self {
        Self::new(100)
    }
}
121
/// A single search hit: the chunk plus its relevance score and the path of
/// the document it came from.
#[derive(Debug, Clone, serde::Serialize)]
pub struct SearchResult {
    /// The matching chunk.
    pub chunk: Chunk,
    /// Relevance score; NaN for results produced without ranking
    /// (cache re-hydration, whole-document listings).
    pub score: f32,
    /// Lossy string form of the owning document's path.
    pub doc_path: String,
}
132
/// Output of `QueryEngine::generate_answer`.
#[derive(Debug, Clone, serde::Serialize)]
pub struct GenerationResult {
    /// Model-generated answer text.
    pub answer: String,
    /// The formatted context string the model was shown.
    pub context: String,
    /// End-to-end latency (search + assembly + generation) in milliseconds.
    pub latency_ms: u64,
}
143
/// A grounding link from a span of a generated response back to a context
/// chunk, produced by `extract_citations`.
#[derive(Debug, Clone, serde::Serialize)]
pub struct Citation {
    /// Context chunk the response text was traced to.
    pub chunk_id: Uuid,
    /// Byte range (start, end) within the response covered by the match.
    pub span: (usize, usize),
    /// Fraction of the response's word 5-grams found in the chunk.
    pub confidence: f32,
}
156
/// Outcome of `validate_response`: grounding diagnostics for one answer.
#[derive(Debug, Clone, serde::Serialize)]
pub struct ValidationResult {
    /// `true` when there are no warnings, or the response honestly claims
    /// the information is missing from the context.
    pub is_valid: bool,
    /// Human-readable issues (hallucination phrases, low coverage).
    pub warnings: Vec<String>,
    /// Fraction of response bytes covered by merged citation spans, in [0, 1].
    pub citation_coverage: f32,
    /// All extracted citations, most confident first.
    pub citations: Vec<Citation>,
}
171
/// Phrases suggesting the model is drawing on prior knowledge rather than
/// the supplied context; matched case-insensitively in `validate_response`.
const HALLUCINATION_PHRASES: &[&str] = &[
    "from my knowledge",
    "i recall that",
    "as far as i know",
    "i believe that",
    "in my experience",
    "typically",
    "generally speaking",
    "it's commonly known",
    "as everyone knows",
    "i think that",
    "probably",
    "most likely",
    "i assume",
    "based on my understanding",
    "from what i've learned",
];
190
191pub fn extract_citations(response: &str, context: &AssembledContext) -> Vec<Citation> {
195 let mut citations = Vec::new();
196 let response_lower = response.to_lowercase();
197 let response_words: Vec<&str> = response_lower.split_whitespace().collect();
198
199 if response_words.len() < 5 {
200 return citations;
201 }
202
203 for chunk in &context.chunks {
204 let chunk_lower = chunk.text.to_lowercase();
205 let chunk_words: Vec<&str> = chunk_lower.split_whitespace().collect();
206
207 if chunk_words.len() < 5 {
208 continue;
209 }
210
211 let mut overlap_count = 0;
213 let mut matched_positions: Vec<usize> = Vec::new();
214
215 for i in 0..=response_words.len().saturating_sub(5) {
216 let response_ngram: Vec<&str> = response_words[i..i + 5].to_vec();
217
218 for j in 0..=chunk_words.len().saturating_sub(5) {
219 let chunk_ngram: Vec<&str> = chunk_words[j..j + 5].to_vec();
220
221 if response_ngram == chunk_ngram {
222 overlap_count += 1;
223 matched_positions.push(i);
224 break;
225 }
226 }
227 }
228
229 if overlap_count > 0 {
230 let max_ngrams = (response_words.len().saturating_sub(4)).max(1);
232 let confidence = (overlap_count as f32) / (max_ngrams as f32);
233
234 let start_pos = matched_positions.first().copied().unwrap_or(0);
236 let end_pos = matched_positions.last().copied().unwrap_or(0) + 5;
237
238 let mut byte_start = 0;
240 let mut byte_end = response.len();
241
242 let mut word_idx = 0;
243 for (i, c) in response.char_indices() {
244 if c.is_whitespace() {
245 word_idx += 1;
246 if word_idx == start_pos {
247 byte_start = i + 1;
248 }
249 if word_idx == end_pos.min(response_words.len()) {
250 byte_end = i;
251 break;
252 }
253 }
254 }
255
256 citations.push(Citation {
257 chunk_id: chunk.chunk_id,
258 span: (byte_start, byte_end),
259 confidence,
260 });
261 }
262 }
263
264 citations.sort_by(|a, b| b.confidence.partial_cmp(&a.confidence).unwrap_or(std::cmp::Ordering::Equal));
266
267 citations
268}
269
270pub fn validate_response(response: &str, context: &AssembledContext) -> ValidationResult {
274 let mut warnings = Vec::new();
275
276 let citations = extract_citations(response, context);
278
279 let total_response_len = response.len() as f32;
281 let mut spans: Vec<(usize, usize)> = citations.iter().map(|c| c.span).collect();
282 spans.sort_by_key(|s| s.0);
283 let mut merged: Vec<(usize, usize)> = Vec::new();
284 for span in &spans {
285 if let Some(last) = merged.last_mut() {
286 if span.0 <= last.1 {
287 last.1 = last.1.max(span.1);
288 continue;
289 }
290 }
291 merged.push(*span);
292 }
293 let covered_bytes: usize = merged.iter().map(|(a, b)| b.saturating_sub(*a)).sum();
294
295 let citation_coverage = if total_response_len > 0.0 {
296 (covered_bytes as f32 / total_response_len).min(1.0)
297 } else {
298 0.0
299 };
300
301 let response_lower = response.to_lowercase();
303 for phrase in HALLUCINATION_PHRASES {
304 if response_lower.contains(phrase) {
305 warnings.push(format!("Response contains hallucination indicator: '{}'", phrase));
306 }
307 }
308
309 if citation_coverage < 0.3 && !response.is_empty() {
311 warnings.push(format!(
312 "Low citation coverage: {:.1}% (threshold: 30%)",
313 citation_coverage * 100.0
314 ));
315 }
316
317 let good_phrases = ["information is missing", "not found in the context", "cannot find"];
319 let claims_missing = good_phrases.iter().any(|p| response_lower.contains(p));
320
321 let is_valid = warnings.is_empty() || claims_missing;
323
324 ValidationResult {
325 is_valid,
326 warnings,
327 citation_coverage,
328 citations,
329 }
330}
331
/// Abstraction over an answer-generation backend (e.g. a local LLM server).
#[async_trait::async_trait]
pub trait IntelligenceEngine: Send + Sync {
    /// Generate an answer to `query` grounded in the assembled `context`.
    async fn generate(&self, context: &AssembledContext, query: &str) -> Result<String>;
}
342
/// `IntelligenceEngine` backed by an Ollama HTTP server.
pub struct OllamaGenerator {
    /// Base URL of the Ollama server (the `/api/generate` path is appended).
    base_url: String,
    /// Model name sent with each generate request.
    model: String,
}
348
impl OllamaGenerator {
    /// Create a generator for `model` hosted at `base_url`.
    pub fn new(base_url: String, model: String) -> Self {
        Self { base_url, model }
    }
}
355
356#[async_trait::async_trait]
357impl IntelligenceEngine for OllamaGenerator {
358 async fn generate(&self, context: &AssembledContext, query: &str) -> Result<String> {
359 let formatted_context = ContextAssembler::format(context);
360
361 let prompt = format!(
363 "<|im_start|>system\nYou are a context reading machine. You do not have knowledge of the outside world.\n- Read the Context below carefully.\n- If the answer to the Query is in the Context, output it.\n- If the answer is NOT in the Context, say 'Information is missing from the substrate.' and nothing else.\n- Do not make up facts.\n<|im_end|>\n<|im_start|>user\nContext:\n{}\n\nQuery: {}<|im_end|>\n<|im_start|>assistant\n",
364 formatted_context, query
365 );
366
367 let client = reqwest::Client::builder()
368 .timeout(std::time::Duration::from_secs(120))
369 .build()
370 .map_err(|e| CPError::Inference(format!("Failed to create HTTP client: {}", e)))?;
371 let payload = serde_json::json!({
372 "model": self.model,
373 "prompt": prompt,
374 "stream": false
375 });
376
377 let url = format!("{}/api/generate", self.base_url);
378 let res = client
379 .post(&url)
380 .json(&payload)
381 .send()
382 .await
383 .map_err(|e| CPError::Inference(format!("Ollama request failed: {}", e)))?;
384
385 let json: serde_json::Value = res
386 .json()
387 .await
388 .map_err(|e| CPError::Parse(e.to_string()))?;
389
390 let answer = json["response"]
391 .as_str()
392 .ok_or_else(|| CPError::Parse("Invalid Ollama response".into()))?
393 .to_string();
394
395 Ok(answer)
396 }
397}
398
/// Orchestrates retrieval (semantic + lexical over the graph store),
/// result caching, and optional answer generation.
pub struct QueryEngine {
    /// Shared graph store holding documents, chunks, and embeddings.
    graph: Arc<Mutex<cp_graph::GraphStore>>,
    /// Embedding backend used to vectorize queries.
    embedder: Arc<cp_embeddings::EmbeddingEngine>,
    /// Optional generation backend; `None` disables answer generation.
    intelligence: Option<Box<dyn IntelligenceEngine>>,
    /// Merkle-root-validated LRU cache of search results.
    cache: QueryCache,
    /// Token budget handed to the context assembler.
    context_budget: usize,
}
409
410impl QueryEngine {
411 pub fn new(
413 graph: Arc<Mutex<cp_graph::GraphStore>>,
414 embedder: Arc<cp_embeddings::EmbeddingEngine>,
415 ) -> Self {
416 Self {
417 graph,
418 embedder,
419 intelligence: None,
420 cache: QueryCache::default(),
421 context_budget: 2000,
422 }
423 }
424
425 pub fn with_cache_capacity(
427 graph: Arc<Mutex<cp_graph::GraphStore>>,
428 embedder: Arc<cp_embeddings::EmbeddingEngine>,
429 cache_capacity: usize,
430 ) -> Self {
431 Self {
432 graph,
433 embedder,
434 intelligence: None,
435 cache: QueryCache::new(cache_capacity),
436 context_budget: 2000,
437 }
438 }
439
    /// Builder-style setter for the context token budget (default 2000).
    pub fn with_context_budget(mut self, budget: usize) -> Self {
        self.context_budget = budget;
        self
    }
445
    /// Builder-style setter for the intelligence (generation) engine.
    pub fn with_intelligence(mut self, intelligence: Box<dyn IntelligenceEngine>) -> Self {
        self.intelligence = Some(intelligence);
        self
    }
451
    /// Install or replace the intelligence engine in place.
    pub fn set_intelligence(&mut self, intelligence: Box<dyn IntelligenceEngine>) {
        self.intelligence = Some(intelligence);
    }
456
457 pub fn search(&self, query: &str, k: usize) -> Result<Vec<SearchResult>> {
462 info!("Hybrid search for: '{}'", query);
463
464 let query_vec = self
466 .embedder
467 .embed_query(query)
468 .map_err(|e| CPError::Embedding(format!("Failed to embed query: {}", e)))?;
469
470 let semantic_results = {
471 let graph = self.graph.lock().expect("graph lock poisoned");
472 graph.search(&query_vec, k)?
473 };
474
475 let lexical_results = {
477 let graph = self.graph.lock().expect("graph lock poisoned");
478 let fts_query = sanitize_fts5_query(query);
480 graph.search_lexical(&fts_query, k).unwrap_or_else(|e| {
481 warn!("Lexical search failed: {}. Falling back to semantic only.", e);
482 Vec::new()
483 })
484 };
485
486 const RRF_K: u64 = 60;
490 const RRF_SCALE: u64 = 1_000_000;
491
492 let mut scores: std::collections::HashMap<Uuid, u64> = std::collections::HashMap::new();
494
495 {
496 let graph = self.graph.lock().expect("graph lock poisoned");
497
498 for (i, (emb_id, _)) in semantic_results.iter().enumerate() {
499 if let Ok(Some(chunk_id)) = graph.get_chunk_id_for_embedding(*emb_id) {
500 let score = RRF_SCALE / (RRF_K + i as u64);
502 *scores.entry(chunk_id).or_insert(0) += score;
503 }
504 }
505
506 for (i, (chunk_id, _)) in lexical_results.iter().enumerate() {
507 let score = RRF_SCALE / (RRF_K + i as u64);
508 *scores.entry(*chunk_id).or_insert(0) += score;
509 }
510 }
511
512 let mut fused: Vec<(Uuid, u64)> = scores.into_iter().collect();
514 fused.sort_by(|a, b| {
515 b.1.cmp(&a.1) .then_with(|| a.0.cmp(&b.0)) });
518 fused.truncate(k);
519
520 let mut search_results = Vec::with_capacity(fused.len());
522 let graph = self.graph.lock().expect("graph lock poisoned");
523
524 for (chunk_id, fused_score) in fused {
525 let chunk = match graph.get_chunk(chunk_id)? {
526 Some(c) => c,
527 None => continue,
528 };
529
530 let doc = match graph.get_document(chunk.doc_id)? {
531 Some(d) => d,
532 None => continue,
533 };
534
535 let normalized_score = fused_score as f32 / (RRF_SCALE * 2) as f32;
537
538 search_results.push(SearchResult {
539 chunk,
540 score: normalized_score,
541 doc_path: doc.path.to_string_lossy().to_string(),
542 });
543 }
544
545 Ok(search_results)
546 }
547
548 pub fn search_semantic(&self, query: &str, k: usize) -> Result<Vec<SearchResult>> {
550 info!("Semantic search for: '{}'", query);
551
552 let query_vec = self
553 .embedder
554 .embed_query(query)
555 .map_err(|e| CPError::Embedding(format!("Failed to embed query: {}", e)))?;
556
557 let raw_results = {
558 let graph = self.graph.lock().expect("graph lock poisoned");
559 graph.search(&query_vec, k)?
560 };
561
562 let mut search_results = Vec::with_capacity(raw_results.len());
563 let graph = self.graph.lock().expect("graph lock poisoned");
564
565 for (emb_id, score) in raw_results {
566 if let Some(chunk_id) = graph.get_chunk_id_for_embedding(emb_id)? {
568 if let Some(chunk) = graph.get_chunk(chunk_id)? {
569 if let Some(doc) = graph.get_document(chunk.doc_id)? {
570 search_results.push(SearchResult {
571 chunk,
572 score,
573 doc_path: doc.path.to_string_lossy().to_string(),
574 });
575 }
576 }
577 }
578 }
579
580 Ok(search_results)
581 }
582
583 pub fn search_lexical(&self, query: &str, k: usize) -> Result<Vec<SearchResult>> {
585 info!("Lexical search for: '{}'", query);
586
587 let raw_results = {
588 let graph = self.graph.lock().expect("graph lock poisoned");
589 let fts_query = sanitize_fts5_query(query);
591 graph.search_lexical(&fts_query, k)?
592 };
593
594 let mut search_results = Vec::with_capacity(raw_results.len());
595 let graph = self.graph.lock().expect("graph lock poisoned");
596
597 for (chunk_id, score) in raw_results {
598 if let Some(chunk) = graph.get_chunk(chunk_id)? {
600 if let Some(doc) = graph.get_document(chunk.doc_id)? {
601 search_results.push(SearchResult {
602 chunk,
603 score,
604 doc_path: doc.path.to_string_lossy().to_string(),
605 });
606 }
607 }
608 }
609
610 Ok(search_results)
611 }
612
613 pub fn search_filtered(&self, query: &str, k: usize, filters: &[Filter]) -> Result<Vec<SearchResult>> {
617 info!("Filtered search for: '{}' with {} filters", query, filters.len());
618
619 let matching_doc_ids: std::collections::HashSet<Uuid> = {
621 let graph = self.graph.lock().expect("graph lock poisoned");
622 let all_docs = graph.get_all_documents()?;
623
624 all_docs
625 .into_iter()
626 .filter(|doc| filters.iter().all(|f| f.matches(doc)))
627 .map(|doc| doc.id)
628 .collect()
629 };
630
631 if matching_doc_ids.is_empty() {
632 info!("No documents match filters");
633 return Ok(Vec::new());
634 }
635
636 let all_results = self.search(query, k * 3)?; let filtered_results: Vec<SearchResult> = all_results
641 .into_iter()
642 .filter(|r| matching_doc_ids.contains(&r.chunk.doc_id))
643 .take(k)
644 .collect();
645
646 info!("Filtered search returned {} results", filtered_results.len());
647 Ok(filtered_results)
648 }
649
650 pub fn search_cached(&self, query: &str, k: usize) -> Result<Vec<SearchResult>> {
654 let current_root = {
656 let graph = self.graph.lock().expect("graph lock poisoned");
657 graph.compute_merkle_root()?
658 };
659
660 if !self.cache.is_valid(¤t_root) {
661 self.cache.invalidate(current_root);
662 }
663
664 if let Some(chunk_ids) = self.cache.get(query, k) {
666 info!("Cache hit for query: '{}' (k={})", query, k);
667
668 let graph = self.graph.lock().expect("graph lock poisoned");
669 let mut results = Vec::new();
670
671 for chunk_id in chunk_ids.iter().take(k) {
672 if let Some(chunk) = graph.get_chunk(*chunk_id)? {
673 if let Some(doc) = graph.get_document(chunk.doc_id)? {
674 results.push(SearchResult {
675 chunk,
676 score: f32::NAN, doc_path: doc.path.to_string_lossy().to_string(),
678 });
679 }
680 }
681 }
682
683 return Ok(results);
684 }
685
686 let results = self.search(query, k)?;
688
689 let chunk_ids: Vec<Uuid> = results.iter().map(|r| r.chunk.id).collect();
691 self.cache.put(query, k, chunk_ids);
692
693 Ok(results)
694 }
695
    /// Force cache invalidation against the graph's current Merkle root.
    pub fn invalidate_cache(&self) -> Result<()> {
        let root = {
            let graph = self.graph.lock().expect("graph lock poisoned");
            graph.compute_merkle_root()?
        };
        self.cache.invalidate(root);
        Ok(())
    }
705
706 pub fn get_chunks_for_document(&self, doc_id: Uuid) -> Result<Vec<SearchResult>> {
708 let graph = self.graph.lock().expect("graph lock poisoned");
709
710 let doc = graph
711 .get_document(doc_id)?
712 .ok_or_else(|| CPError::Database(format!("Doc {} not found", doc_id)))?;
713
714 let chunks = graph.get_chunks_for_doc(doc_id)?;
715
716 Ok(chunks.into_iter().map(|c| SearchResult {
717 chunk: c,
718 score: f32::NAN, doc_path: doc.path.to_string_lossy().to_string(),
720 }).collect())
721 }
722
    /// Clone of the shared graph-store handle, for callers needing direct
    /// access to the underlying store.
    pub fn graph(&self) -> Arc<Mutex<cp_graph::GraphStore>> {
        self.graph.clone()
    }
727
728 pub async fn generate_answer(&self, query: &str) -> Result<GenerationResult> {
730 let start = std::time::Instant::now();
731 info!("Generating answer for: '{}'", query);
732
733 let results = self.search(query, 5)?;
735
736 let assembler = ContextAssembler::with_budget(self.context_budget);
738 let scored_chunks: Vec<ScoredChunk> = results
739 .iter()
740 .map(|r| ScoredChunk {
741 chunk: r.chunk.clone(),
742 score: r.score,
743 document_path: r.doc_path.clone(),
744 })
745 .collect();
746
747 let state_root = {
748 let graph = self.graph.lock().expect("graph lock poisoned");
749 graph.compute_merkle_root()?
750 };
751
752 let assembled_context = assembler.assemble(scored_chunks, query, state_root);
753
754 let answer = if let Some(ref engine) = self.intelligence {
756 engine.generate(&assembled_context, query).await?
757 } else {
758 return Err(CPError::NotFound("Intelligence engine not configured".into()));
759 };
760
761 Ok(GenerationResult {
762 answer,
763 context: ContextAssembler::format(&assembled_context),
764 latency_ms: start.elapsed().as_millis() as u64,
765 })
766 }
767
    /// Build a signed, verifiable receipt binding a query to the exact
    /// chunks and graph state that produced `search_results`.
    ///
    /// The receipt carries a Merkle inclusion proof for each result chunk
    /// against the sorted chunk tree, plus BLAKE3 hashes of the query and
    /// the assembled context, all signed by `identity`.
    pub fn generate_proof_receipt(
        &self,
        query: &str,
        search_results: &[SearchResult],
        identity: &cp_sync::DeviceIdentity,
    ) -> Result<cp_core::ProofReceipt> {
        let query_hash = *blake3::hash(query.as_bytes()).as_bytes();

        // Double the configured budget so the receipt's context covers a
        // generously sized assembly.
        let assembler = ContextAssembler::with_budget(self.context_budget * 2);
        let scored_chunks: Vec<ScoredChunk> = search_results
            .iter()
            .map(|r| ScoredChunk {
                chunk: r.chunk.clone(),
                score: r.score,
                document_path: r.doc_path.clone(),
            })
            .collect();

        let state_root = {
            let graph = self.graph.lock().expect("graph lock poisoned");
            graph.compute_merkle_root()?
        };

        let assembled = assembler.assemble(scored_chunks, query, state_root);
        let context_string = ContextAssembler::format(&assembled);
        let context_hash = *blake3::hash(context_string.as_bytes()).as_bytes();

        // Snapshot the full, sorted chunk-hash list and its Merkle root so
        // per-chunk inclusion proofs can be built against it below.
        let (sorted_chunk_ids, sorted_chunk_hashes, chunk_tree_root) = {
            let graph = self.graph.lock().expect("graph lock poisoned");
            let sorted = graph.get_sorted_chunk_hashes()?;
            let hashes: Vec<[u8; 32]> = sorted.iter().map(|(_, h)| *h).collect();
            let root = cp_core::proof::compute_chunk_tree_root(&hashes);
            (sorted, hashes, root)
        };

        let mut chunk_proofs = Vec::new();
        let mut sources = Vec::new();

        for result in search_results {
            let chunk_id_bytes = *result.chunk.id.as_bytes();

            // A chunk absent from the sorted list (e.g. deleted since the
            // search ran) simply gets no inclusion proof; its source ref is
            // still recorded below.
            if let Some(idx) = sorted_chunk_ids.iter().position(|(id, _)| *id == chunk_id_bytes) {
                let proof = cp_core::proof::build_chunk_proof(
                    chunk_id_bytes,
                    result.chunk.text_hash,
                    idx,
                    &sorted_chunk_hashes,
                );
                chunk_proofs.push(proof);
            }

            sources.push(cp_core::SourceRef {
                document_path: result.doc_path.clone(),
                chunk_id: chunk_id_bytes,
                chunk_text: result.chunk.text.clone(),
                chunk_sequence: result.chunk.sequence,
                relevance_score: result.score,
            });
        }

        // Wall-clock timestamp, formatted locally to avoid a time-crate
        // dependency.
        let now = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap_or_default();
        let secs = now.as_secs();
        let timestamp = format_unix_timestamp(secs);

        // Assemble with a zeroed placeholder signature first, then sign
        // the canonical byte encoding and patch the signature in.
        let mut receipt = cp_core::ProofReceipt {
            version: 1,
            query: query.to_string(),
            query_hash,
            timestamp,
            context_hash,
            state_root,
            chunk_tree_root,
            chunk_proofs,
            sources,
            signature: [0u8; 64],
            signer_public_key: identity.public_key,
            device_id: identity.device_id,
        };

        let sig = identity.sign(&receipt.signing_bytes());
        receipt.signature = sig;

        Ok(receipt)
    }
865
866 pub async fn chat(&self, query: &str, history: &[Message]) -> Result<String> {
871 let _start = std::time::Instant::now();
872
873 let results = self.search(query, 5)?;
875
876 let assembler = ContextAssembler::with_budget(2000);
878 let scored_chunks: Vec<ScoredChunk> = results
879 .iter()
880 .map(|r| ScoredChunk {
881 chunk: r.chunk.clone(),
882 score: r.score,
883 document_path: r.doc_path.clone(),
884 })
885 .collect();
886
887 let state_root = {
888 let graph = self.graph.lock().expect("graph lock poisoned");
889 graph.compute_merkle_root()?
890 };
891 let assembled_context = assembler.assemble(scored_chunks, query, state_root);
892
893 let mut full_prompt = String::new();
895 for msg in history {
896 let role = match msg.role {
897 Role::User => "User",
898 Role::Assistant => "Assistant",
899 Role::System => "System",
900 };
901 full_prompt.push_str(&format!("{}: {}\n", role, msg.content));
902 }
903 full_prompt.push_str(&format!("User: {}\n", query));
904
905 let answer = if let Some(ref engine) = self.intelligence {
907 engine.generate(&assembled_context, &full_prompt).await?
908 } else {
909 return Err(CPError::NotFound("Intelligence engine not configured".into()));
910 };
911
912 Ok(answer)
913 }
914}
915
/// One turn in a chat conversation.
#[derive(Debug, Clone)]
pub struct Message {
    /// Who produced this turn.
    pub role: Role,
    /// The turn's text.
    pub content: String,
}
922
/// Speaker role for a chat [`Message`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Role {
    User,
    Assistant,
    System,
}
930
/// A remote search result whose enclosing response signature has already
/// been verified (see `verify_and_extract`).
pub struct VerifiedRemoteResult {
    /// The peer-provided result payload.
    pub result: RemoteSearchResult,
    /// Trust weight in [0.3, 1.0] derived from the peer's rating
    /// (0.5 when unrated).
    pub weight: f64,
    /// Identifier of the peer that served this result.
    pub peer_node_id: [u8; 16],
    /// Peer's advertised state root at response time.
    pub peer_state_root: [u8; 32],
    /// Peer's signature over the response.
    pub peer_signature: [u8; 64],
}
944
/// Fuse local and verified remote search results with Reciprocal Rank
/// Fusion (RRF), capped at `max_results` and the protocol `MAX_RESULTS`.
///
/// Chunks present in both lists are tagged `ResultSource::Both` and retain
/// the first peer's proof material; remote-only chunks carry their Merkle
/// proof and peer signature for later verification.
pub fn merge_results(
    local: &[SearchResult],
    remote: &[VerifiedRemoteResult],
    max_results: usize,
) -> Vec<MergedSearchResult> {
    // Standard RRF damping constant.
    const K: f64 = 60.0;
    let cap = max_results.min(MAX_RESULTS as usize);

    // chunk id -> (accumulated RRF score, merged result under construction)
    let mut scores: HashMap<[u8; 16], (f64, MergedSearchResult)> = HashMap::new();

    // Local results contribute full-weight RRF by rank.
    for (rank, result) in local.iter().enumerate() {
        let rrf = 1.0 / (K + rank as f64 + 1.0);
        let chunk_id = *result.chunk.id.as_bytes();

        let entry = scores.entry(chunk_id).or_insert_with(|| {
            (
                0.0,
                MergedSearchResult {
                    chunk_id,
                    chunk_text: result.chunk.text.clone(),
                    document_path: result.doc_path.clone(),
                    score: 0.0,
                    source: ResultSource::Local,
                    merkle_proof: None,
                    peer_state_root: None,
                    peer_signature: None,
                },
            )
        });
        entry.0 += rrf;
    }

    // Remote results contribute RRF scaled by the peer's trust weight.
    for (rank, verified) in remote.iter().enumerate() {
        let rrf = verified.weight / (K + rank as f64 + 1.0);
        let chunk_id = verified.result.chunk_id;

        if let Some(entry) = scores.get_mut(&chunk_id) {
            // Also seen locally: boost the score and mark as corroborated.
            entry.0 += rrf;
            entry.1.source = ResultSource::Both {
                peer_node_id: verified.peer_node_id,
            };
            // Keep only the first peer's proof material.
            if entry.1.peer_state_root.is_none() {
                entry.1.peer_state_root = Some(verified.peer_state_root);
                entry.1.peer_signature = Some(verified.peer_signature);
                entry.1.merkle_proof = verified.result.merkle_proof.clone();
            }
        } else {
            scores.insert(
                chunk_id,
                (
                    rrf,
                    MergedSearchResult {
                        chunk_id,
                        chunk_text: verified.result.chunk_text.clone(),
                        document_path: verified.result.document_path.clone(),
                        score: 0.0,
                        source: ResultSource::Remote {
                            peer_node_id: verified.peer_node_id,
                        },
                        merkle_proof: verified.result.merkle_proof.clone(),
                        peer_state_root: Some(verified.peer_state_root),
                        peer_signature: Some(verified.peer_signature),
                    },
                ),
            );
        }
    }

    // Rank by fused score, descending. Scores are sums of positive
    // reciprocals, so the NaN fallback ordering is never exercised.
    let mut merged: Vec<(f64, MergedSearchResult)> = scores.into_values().collect();
    merged.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
    merged.truncate(cap);

    merged
        .into_iter()
        .map(|(score, mut result)| {
            result.score = score;
            result
        })
        .collect()
}
1038
1039pub fn verify_and_extract(
1044 response: &SearchResponse,
1045 peer_public_key: &[u8; 32],
1046 peer_node_id: [u8; 16],
1047 peer_rating: Option<f64>,
1048) -> Option<Vec<VerifiedRemoteResult>> {
1049 if response.status != SearchStatus::Ok {
1050 return None;
1051 }
1052
1053 if cp_tor::verify_response(response, peer_public_key).is_err() {
1055 warn!(
1056 "Invalid response signature from peer {}",
1057 hex::encode(&peer_node_id[..4])
1058 );
1059 return None;
1060 }
1061
1062 let weight = peer_rating
1063 .map(|r| r.clamp(0.3, 1.0))
1064 .unwrap_or(0.5);
1065
1066 let results = response
1067 .results
1068 .iter()
1069 .map(|r| VerifiedRemoteResult {
1070 result: r.clone(),
1071 weight,
1072 peer_node_id,
1073 peer_state_root: response.peer_state_root,
1074 peer_signature: response.signature,
1075 })
1076 .collect();
1077
1078 Some(results)
1079}
1080
impl QueryEngine {
    /// Merge local hybrid-search results for `query` with already-verified
    /// remote peer results into one ranked list of at most `max_results`
    /// entries (see `merge_results`).
    ///
    /// Remote results are expected to have passed `verify_and_extract`;
    /// this method performs no signature checks itself.
    pub async fn live_search(
        &self,
        query: &str,
        remote_results: Vec<VerifiedRemoteResult>,
        max_results: usize,
    ) -> Result<Vec<MergedSearchResult>> {
        info!(
            "Live search for '{}' with {} remote result(s)",
            query,
            remote_results.len()
        );

        let local_results = self.search(query, max_results)?;

        let merged = merge_results(&local_results, &remote_results, max_results);

        info!(
            "Live search returned {} merged results ({} local, {} remote)",
            merged.len(),
            local_results.len(),
            remote_results.len()
        );

        Ok(merged)
    }
}
1116
/// Sanitize user input for use as an SQLite FTS5 MATCH expression.
///
/// Strips FTS5 operator characters, then wraps whatever survives in double
/// quotes so it is treated as a single phrase. Returns an empty string when
/// nothing remains after sanitization.
fn sanitize_fts5_query(query: &str) -> String {
    const FTS5_SPECIAL: [char; 10] = ['"', '*', '^', '+', '-', '(', ')', '{', '}', ':'];

    let cleaned: String = query.chars().filter(|c| !FTS5_SPECIAL.contains(c)).collect();

    match cleaned.trim() {
        "" => String::new(),
        phrase => format!("\"{}\"", phrase),
    }
}
1135
/// Format Unix seconds as an ISO 8601 / RFC 3339 UTC timestamp
/// (`YYYY-MM-DDTHH:MM:SSZ`) without pulling in a time crate.
fn format_unix_timestamp(secs: u64) -> String {
    let (days, secs_today) = (secs / 86_400, secs % 86_400);
    let (year, month, day) = days_to_date(days);
    format!(
        "{:04}-{:02}-{:02}T{:02}:{:02}:{:02}Z",
        year,
        month,
        day,
        secs_today / 3_600,
        (secs_today / 60) % 60,
        secs_today % 60
    )
}

/// Convert days since the Unix epoch (1970-01-01) to a (year, month, day)
/// civil date.
///
/// Port of Howard Hinnant's `civil_from_days` algorithm: works in 400-year
/// eras (146097 days each) over March-based years so leap days fall at the
/// end of a year.
fn days_to_date(days: u64) -> (u64, u64, u64) {
    // Shift the epoch to 0000-03-01.
    let z = days + 719_468;
    let era = z / 146_097;
    let doe = z - era * 146_097; // day-of-era, [0, 146096]
    let yoe = (doe - doe / 1_460 + doe / 36_524 - doe / 146_096) / 365; // year-of-era, [0, 399]
    let y = yoe + era * 400;
    let doy = doe - (365 * yoe + yoe / 4 - yoe / 100); // day-of-year, March-based
    let mp = (5 * doy + 2) / 153; // month index, 0 = March
    let d = doy - (153 * mp + 2) / 5 + 1;
    let m = if mp < 10 { mp + 3 } else { mp - 9 };
    (if m <= 2 { y + 1 } else { y }, m, d)
}
1162
1163#[cfg(test)]
1164mod tests {
1165 use super::*;
1166 use cp_core::{Document, Chunk};
1167 use std::sync::{Arc, Mutex};
1168 use tempfile::TempDir;
1169
    /// `get_chunks_for_document` returns every stored chunk of a document.
    #[tokio::test]
    async fn test_get_chunks_for_document() {
        let temp = TempDir::new().unwrap();
        let db_path = temp.path().join("test.db");
        let mut graph = cp_graph::GraphStore::open(db_path.to_str().unwrap()).unwrap();

        let doc = Document::new("test.md".into(), b"Hello world", 100);
        graph.insert_document(&doc).unwrap();

        let chunk = Chunk {
            id: Uuid::new_v4(),
            doc_id: doc.id,
            text: "Hello world".to_string(),
            byte_offset: 0,
            byte_length: 11,
            sequence: 0,
            text_hash: [0; 32],
        };
        graph.insert_chunk(&chunk).unwrap();

        let embedder = Arc::new(cp_embeddings::EmbeddingEngine::new().unwrap());
        let qe = QueryEngine::new(Arc::new(Mutex::new(graph)), embedder);

        let results = qe.get_chunks_for_document(doc.id).unwrap();
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].chunk.text, "Hello world");
    }
1197
    /// Hybrid search finds the chunk both by exact terms and by a
    /// semantically similar paraphrase.
    #[tokio::test]
    async fn test_hybrid_search() {
        let temp = TempDir::new().unwrap();
        let db_path = temp.path().join("test_hybrid.db");
        let mut graph = cp_graph::GraphStore::open(db_path.to_str().unwrap()).unwrap();

        let doc = Document::new("test.md".into(), b"The quick brown fox jumps over the lazy dog", 100);
        graph.insert_document(&doc).unwrap();

        let chunk = Chunk {
            id: Uuid::new_v4(),
            doc_id: doc.id,
            text: "The quick brown fox jumps over the lazy dog".to_string(),
            byte_offset: 0,
            byte_length: 43,
            sequence: 0,
            text_hash: [0; 32],
        };
        graph.insert_chunk(&chunk).unwrap();

        // Insert a real embedding so the semantic path has data.
        let embedder = Arc::new(cp_embeddings::EmbeddingEngine::new().unwrap());
        let vec = embedder.embed(&chunk.text).unwrap();
        let emb = cp_core::Embedding::new(chunk.id, &vec, embedder.model_hash().unwrap(), 0);
        graph.insert_embedding(&emb).unwrap();

        let qe = QueryEngine::new(Arc::new(Mutex::new(graph)), embedder);

        // Lexical + semantic agreement on exact terms.
        let results = qe.search("quick brown fox", 5).unwrap();
        assert!(!results.is_empty());
        assert!(results[0].chunk.text.contains("quick brown fox"));

        // Paraphrase should still hit via the semantic leg.
        let results_sem = qe.search("fast auburn canine", 5).unwrap();
        assert!(!results_sem.is_empty());
        assert!(results_sem[0].chunk.text.contains("quick brown fox"));
    }
1236
    /// Demonstrates that hybrid search unions lexical hits (exact "fox")
    /// with semantic hits (the paraphrase document).
    #[tokio::test]
    async fn test_search_comparison_proof() {
        let temp = TempDir::new().unwrap();
        let db_path = temp.path().join("comparison_proof.db");
        let mut graph = cp_graph::GraphStore::open(db_path.to_str().unwrap()).unwrap();

        // t1: lexical + semantic target; t2: distractor; t3: paraphrase of t1.
        let t1 = "The quick brown fox jumps over the lazy dog"; let t2 = "Artificial intelligence is transforming the modern world"; let t3 = "A fast auburn canine leaps across an idle hound"; let texts = vec![t1, t2, t3];
        for (i, text) in texts.iter().enumerate() {
            let doc = Document::new(format!("doc_{}.md", i).into(), text.as_bytes(), 100);
            graph.insert_document(&doc).unwrap();
            let chunk = Chunk {
                id: Uuid::new_v4(),
                doc_id: doc.id,
                text: text.to_string(),
                byte_offset: 0,
                byte_length: text.len() as u64,
                sequence: 0,
                text_hash: [0; 32],
            };
            graph.insert_chunk(&chunk).unwrap();
            let embedder = Arc::new(cp_embeddings::EmbeddingEngine::new().unwrap());
            let vec = embedder.embed(text).unwrap();
            let emb = cp_core::Embedding::new(chunk.id, &vec, embedder.model_hash().unwrap(), 0);
            graph.insert_embedding(&emb).unwrap();
        }

        let embedder = Arc::new(cp_embeddings::EmbeddingEngine::new().unwrap());
        let qe = QueryEngine::new(Arc::new(Mutex::new(graph)), embedder);

        let query = "fox";

        // Lexical leg: only the literal "fox" document matches.
        let lexical = {
            let graph_lock = qe.graph();
            let g = graph_lock.lock().expect("graph lock poisoned");
            g.search_lexical(query, 5).unwrap()
        };
        assert_eq!(lexical.len(), 1);

        // Vector leg: the paraphrase is also reachable.
        let vector = {
            let graph_lock = qe.graph();
            let g = graph_lock.lock().expect("graph lock poisoned");
            let e = cp_embeddings::EmbeddingEngine::new().unwrap();
            let q_vec = e.embed_query(query).unwrap();
            g.search(&q_vec, 5).unwrap()
        };
        assert!(vector.len() >= 2);

        let hybrid = qe.search(query, 5).unwrap();

        println!("\n--- SEARCH PROOF FOR '{}' ---", query);
        println!("LEXICAL HITS: {}", lexical.len());
        println!("VECTOR HITS: {}", vector.len());
        println!("HYBRID HITS: {}", hybrid.len());

        // Hybrid must contain both the exact match and the paraphrase.
        let texts_found: Vec<String> = hybrid.iter().map(|r| r.chunk.text.clone()).collect();
        assert!(texts_found.contains(&t1.to_string())); assert!(texts_found.contains(&t3.to_string())); }
1307
1308 #[test]
1309 fn test_filter_by_mime_type() {
1310 use cp_core::Document;
1311 use std::path::PathBuf;
1312
1313 let doc_md = Document::new(PathBuf::from("test.md"), b"content", 1000);
1314 let doc_pdf = Document::new(PathBuf::from("test.pdf"), b"pdf content", 1000);
1315
1316 let filter = Filter::MimeType("text/markdown".to_string());
1317
1318 assert!(filter.matches(&doc_md));
1319 assert!(!filter.matches(&doc_pdf));
1320 }
1321
1322 #[test]
1323 fn test_filter_by_path_glob() {
1324 use cp_core::Document;
1325 use std::path::PathBuf;
1326
1327 let doc1 = Document::new(PathBuf::from("docs/readme.md"), b"content", 1000);
1328 let doc2 = Document::new(PathBuf::from("src/main.rs"), b"code", 1000);
1329
1330 let filter = Filter::DocumentPath("docs/*.md".to_string());
1331
1332 assert!(filter.matches(&doc1));
1333 assert!(!filter.matches(&doc2));
1334 }
1335
1336 #[test]
1337 fn test_filter_by_modified_time() {
1338 use cp_core::Document;
1339 use std::path::PathBuf;
1340
1341 let old_doc = Document::new(PathBuf::from("old.md"), b"content", 1000);
1342 let new_doc = Document::new(PathBuf::from("new.md"), b"content", 2000);
1343
1344 let filter_after = Filter::ModifiedAfter(1500);
1345 let filter_before = Filter::ModifiedBefore(1500);
1346
1347 assert!(!filter_after.matches(&old_doc));
1348 assert!(filter_after.matches(&new_doc));
1349
1350 assert!(filter_before.matches(&old_doc));
1351 assert!(!filter_before.matches(&new_doc));
1352 }
1353
1354 #[test]
1355 fn test_citation_extraction() {
1356 use cp_core::{ContextChunk, ContextMetadata};
1357
1358 let context = AssembledContext {
1359 chunks: vec![ContextChunk {
1360 chunk_id: Uuid::new_v4(),
1361 document_path: "test.md".to_string(),
1362 text: "The quick brown fox jumps over the lazy dog".to_string(),
1363 score: 1.0,
1364 sequence: 0,
1365 }],
1366 total_tokens: 10,
1367 truncated: false,
1368 metadata: ContextMetadata {
1369 query_hash: [0u8; 32],
1370 state_root: [0u8; 32],
1371 },
1372 };
1373
1374 let response = "As mentioned, the quick brown fox jumps over the lazy dog in the story.";
1376 let citations = extract_citations(response, &context);
1377
1378 assert!(!citations.is_empty(), "Should find citations for overlapping text");
1379 assert!(citations[0].confidence > 0.0);
1380 }
1381
1382 #[test]
1383 fn test_hallucination_detection() {
1384 use cp_core::{ContextChunk, ContextMetadata};
1385
1386 let context = AssembledContext {
1387 chunks: vec![ContextChunk {
1388 chunk_id: Uuid::new_v4(),
1389 document_path: "test.md".to_string(),
1390 text: "The capital of France is Paris".to_string(),
1391 score: 1.0,
1392 sequence: 0,
1393 }],
1394 total_tokens: 10,
1395 truncated: false,
1396 metadata: ContextMetadata {
1397 query_hash: [0u8; 32],
1398 state_root: [0u8; 32],
1399 },
1400 };
1401
1402 let bad_response = "From my knowledge, I believe that Paris is a beautiful city.";
1404 let result = validate_response(bad_response, &context);
1405
1406 assert!(!result.warnings.is_empty(), "Should detect hallucination phrases");
1407 assert!(result.warnings.iter().any(|w| w.contains("hallucination")));
1408
1409 let good_response = "Information is missing from the substrate.";
1411 let result2 = validate_response(good_response, &context);
1412
1413 assert!(result2.is_valid, "Should be valid when admitting missing info");
1414 }
1415
1416 #[tokio::test]
1417 async fn test_real_corpus_proof() {
1418 let temp = TempDir::new().unwrap();
1419 let db_path = temp.path().join("real_corpus.db");
1420 let mut graph = cp_graph::GraphStore::open(db_path.to_str().unwrap()).unwrap();
1421
1422 let corpus_dir = match std::env::var("CANON_TEST_CORPUS") {
1424 Ok(p) => std::path::PathBuf::from(p),
1425 Err(_) => std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../test_corpus"),
1426 };
1427
1428 if !corpus_dir.exists() {
1430 println!("Skipping test: corpus directory not found at {:?}", corpus_dir);
1431 return;
1432 }
1433
1434 let embedder = Arc::new(cp_embeddings::EmbeddingEngine::new().unwrap());
1435
1436 let files = vec!["zk.md", "ethereum.md", "random.md", "zksnark.md", "lexical_gap.md", "adversarial.md"];
1437
1438 for file_name in files {
1439 let path = corpus_dir.join(file_name);
1440 if !path.exists() { continue; }
1441
1442 let content = std::fs::read_to_string(&path).unwrap();
1443 let doc = Document::new(path.clone(), content.as_bytes(), 0);
1444 graph.insert_document(&doc).unwrap();
1445
1446 let chunk = Chunk {
1448 id: Uuid::new_v4(),
1449 doc_id: doc.id,
1450 text: content.clone(),
1451 byte_offset: 0,
1452 byte_length: content.len() as u64,
1453 sequence: 0,
1454 text_hash: [0; 32],
1455 };
1456 graph.insert_chunk(&chunk).unwrap();
1457
1458 let vec = embedder.embed(&content).unwrap();
1459 let emb = cp_core::Embedding::new(chunk.id, &vec, embedder.model_hash().unwrap(), 0);
1460 graph.insert_embedding(&emb).unwrap();
1461 }
1462
1463 let qe = QueryEngine::new(Arc::new(Mutex::new(graph)), embedder);
1464
1465 let q1 = "calculate_hyper_parameter_v7";
1467 let res1 = qe.search(q1, 1).unwrap();
1468 println!("\n--- QUERY: '{}' ---", q1);
1469 for (i, r) in res1.iter().enumerate() {
1470 println!("Rank {}: [Score: {:.4}] {}", i+1, r.score, r.doc_path);
1471 }
1472 assert!(res1[0].doc_path.contains("lexical_gap.md"));
1473
1474 let q2 = "cryptographic privacy statement validity"; let res2 = qe.search(q2, 3).unwrap();
1477 println!("\n--- QUERY: '{}' ---", q2);
1478 for (i, r) in res2.iter().enumerate() {
1479 println!("Rank {}: [Score: {:.4}] {}", i+1, r.score, r.doc_path);
1480 }
1481 let found_zk = res2.iter().any(|r| r.doc_path.contains("zk"));
1483 assert!(found_zk);
1484
1485 let q3 = "Ethereum privacy zksnark";
1487 let res3 = qe.search(q3, 5).unwrap();
1488 println!("\n--- QUERY: '{}' ---", q3);
1489 for (i, r) in res3.iter().enumerate() {
1490 println!("Rank {}: [Score: {:.4}] {}", i+1, r.score, r.doc_path);
1491 }
1492 }
1493
1494 #[tokio::test]
1495 async fn test_search_modes() {
1496 let temp = TempDir::new().unwrap();
1497 let db_path = temp.path().join("modes.db");
1498 let mut graph = cp_graph::GraphStore::open(db_path.to_str().unwrap()).unwrap();
1499
1500 let doc = Document::new("test.md".into(), b"The quick brown fox jumps over the lazy dog", 100);
1501 graph.insert_document(&doc).unwrap();
1502 let chunk = Chunk {
1503 id: Uuid::new_v4(),
1504 doc_id: doc.id,
1505 text: "The quick brown fox jumps over the lazy dog".to_string(),
1506 byte_offset: 0,
1507 byte_length: 43,
1508 sequence: 0,
1509 text_hash: [0; 32],
1510 };
1511 graph.insert_chunk(&chunk).unwrap();
1512 let embedder = Arc::new(cp_embeddings::EmbeddingEngine::new().unwrap());
1513 let vec = embedder.embed(&chunk.text).unwrap();
1514 let emb = cp_core::Embedding::new(chunk.id, &vec, embedder.model_hash().unwrap(), 0);
1515 graph.insert_embedding(&emb).unwrap();
1516
1517 let qe = QueryEngine::new(Arc::new(Mutex::new(graph)), embedder);
1518
1519 let results = qe.search_semantic("canine", 5).unwrap();
1522 assert!(!results.is_empty());
1523
1524 let results = qe.search_lexical("fox", 5).unwrap();
1527 assert!(!results.is_empty());
1528 assert!(results[0].chunk.text.contains("fox"));
1529 }
1530
1531 fn make_local_result(chunk_id_byte: u8, text: &str, doc_path: &str, score: f32) -> SearchResult {
1536 SearchResult {
1537 chunk: Chunk {
1538 id: Uuid::from_bytes([chunk_id_byte; 16]),
1539 doc_id: Uuid::from_bytes([0u8; 16]),
1540 text: text.to_string(),
1541 byte_offset: 0,
1542 byte_length: text.len() as u64,
1543 sequence: 0,
1544 text_hash: [0u8; 32],
1545 },
1546 score,
1547 doc_path: doc_path.to_string(),
1548 }
1549 }
1550
1551 fn make_remote_result(chunk_id_byte: u8, text: &str, doc_path: &str, score: u32) -> RemoteSearchResult {
1552 RemoteSearchResult {
1553 chunk_id: [chunk_id_byte; 16],
1554 chunk_text: text.to_string(),
1555 document_path: doc_path.to_string(),
1556 score,
1557 merkle_proof: None,
1558 }
1559 }
1560
1561 fn make_verified(
1562 result: RemoteSearchResult,
1563 weight: f64,
1564 peer_byte: u8,
1565 ) -> VerifiedRemoteResult {
1566 VerifiedRemoteResult {
1567 result,
1568 weight,
1569 peer_node_id: [peer_byte; 16],
1570 peer_state_root: [0u8; 32],
1571 peer_signature: [0u8; 64],
1572 }
1573 }
1574
1575 #[test]
1576 fn test_merge_local_only() {
1577 let local = vec![
1578 make_local_result(1, "chunk A", "doc_a.md", 0.9),
1579 make_local_result(2, "chunk B", "doc_b.md", 0.8),
1580 ];
1581
1582 let merged = merge_results(&local, &[], 10);
1583 assert_eq!(merged.len(), 2);
1584 assert!(merged[0].score > merged[1].score);
1585 assert!(matches!(merged[0].source, ResultSource::Local));
1586 assert_eq!(merged[0].chunk_text, "chunk A");
1587 }
1588
1589 #[test]
1590 fn test_merge_remote_only() {
1591 let remote = vec![
1592 make_verified(
1593 make_remote_result(3, "remote chunk", "remote.md", 16000),
1594 0.8,
1595 99,
1596 ),
1597 ];
1598
1599 let merged = merge_results(&[], &remote, 10);
1600 assert_eq!(merged.len(), 1);
1601 assert!(matches!(merged[0].source, ResultSource::Remote { .. }));
1602 assert_eq!(merged[0].chunk_text, "remote chunk");
1603 assert!(merged[0].peer_state_root.is_some());
1604 }
1605
1606 #[test]
1607 fn test_merge_deduplication() {
1608 let local = vec![make_local_result(1, "shared chunk local", "doc.md", 0.9)];
1610 let remote = vec![make_verified(
1611 make_remote_result(1, "shared chunk remote", "doc.md", 16000),
1612 0.7,
1613 50,
1614 )];
1615
1616 let merged = merge_results(&local, &remote, 10);
1617 assert_eq!(merged.len(), 1, "Duplicate chunk should be merged");
1618 assert!(matches!(merged[0].source, ResultSource::Both { .. }));
1619
1620 let local_only = merge_results(&local, &[], 10);
1622 assert!(
1623 merged[0].score > local_only[0].score,
1624 "Merged score ({}) should exceed local-only score ({})",
1625 merged[0].score,
1626 local_only[0].score
1627 );
1628 }
1629
1630 #[test]
1631 fn test_merge_weight_affects_score() {
1632 let remote_high = vec![make_verified(
1633 make_remote_result(1, "high weight", "doc.md", 16000),
1634 1.0,
1635 10,
1636 )];
1637 let remote_low = vec![make_verified(
1638 make_remote_result(1, "low weight", "doc.md", 16000),
1639 0.3,
1640 20,
1641 )];
1642
1643 let merged_high = merge_results(&[], &remote_high, 10);
1644 let merged_low = merge_results(&[], &remote_low, 10);
1645
1646 assert!(
1647 merged_high[0].score > merged_low[0].score,
1648 "Higher weight ({}) should produce higher score ({}) vs ({})",
1649 1.0,
1650 merged_high[0].score,
1651 merged_low[0].score,
1652 );
1653 }
1654
1655 #[test]
1656 fn test_merge_respects_max_results() {
1657 let local: Vec<SearchResult> = (0..15)
1658 .map(|i| make_local_result(i, &format!("chunk {}", i), "doc.md", 0.9 - i as f32 * 0.01))
1659 .collect();
1660 let remote: Vec<VerifiedRemoteResult> = (15..30)
1661 .map(|i| {
1662 make_verified(
1663 make_remote_result(i, &format!("remote {}", i), "remote.md", 10000),
1664 0.5,
1665 99,
1666 )
1667 })
1668 .collect();
1669
1670 let merged = merge_results(&local, &remote, 10);
1671 assert_eq!(merged.len(), 10, "Should cap at max_results");
1672 }
1673
1674 #[test]
1675 fn test_merge_cap_at_20() {
1676 let local: Vec<SearchResult> = (0..25)
1677 .map(|i| make_local_result(i, &format!("c{}", i), "d.md", 0.5))
1678 .collect();
1679
1680 let merged = merge_results(&local, &[], 100);
1682 assert_eq!(merged.len(), 20, "Should cap at MAX_RESULTS (20)");
1683 }
1684
1685 #[test]
1686 fn test_merge_scores_decrease() {
1687 let local = vec![
1688 make_local_result(1, "first", "a.md", 0.9),
1689 make_local_result(2, "second", "b.md", 0.8),
1690 make_local_result(3, "third", "c.md", 0.7),
1691 ];
1692 let remote = vec![
1693 make_verified(make_remote_result(4, "r1", "d.md", 16000), 0.8, 10),
1694 make_verified(make_remote_result(5, "r2", "e.md", 15000), 0.8, 10),
1695 ];
1696
1697 let merged = merge_results(&local, &remote, 20);
1698 for w in merged.windows(2) {
1699 assert!(
1700 w[0].score >= w[1].score,
1701 "Scores should be in descending order: {} >= {}",
1702 w[0].score,
1703 w[1].score
1704 );
1705 }
1706 }
1707
1708 #[test]
1709 fn test_merge_empty_inputs() {
1710 let merged = merge_results(&[], &[], 10);
1711 assert!(merged.is_empty());
1712 }
1713
1714 #[test]
1715 fn test_verify_and_extract_valid() {
1716 let signing_key = ed25519_dalek::SigningKey::from_bytes(&[88u8; 32]);
1718 let public_key = signing_key.verifying_key().to_bytes();
1719
1720 let mut response = SearchResponse {
1721 request_id: [1u8; 16],
1722 status: SearchStatus::Ok,
1723 results: vec![RemoteSearchResult {
1724 chunk_id: [2u8; 16],
1725 chunk_text: "test chunk".to_string(),
1726 document_path: "test.md".to_string(),
1727 score: 16000,
1728 merkle_proof: None,
1729 }],
1730 peer_state_root: [3u8; 32],
1731 search_latency_ms: 50,
1732 timestamp: 1000,
1733 signature: [0u8; 64],
1734 };
1735
1736 let signing_bytes = response.signing_bytes();
1737 response.signature =
1738 ed25519_dalek::Signer::sign(&signing_key, &signing_bytes).to_bytes();
1739
1740 let extracted = verify_and_extract(&response, &public_key, [10u8; 16], Some(0.9));
1741 assert!(extracted.is_some());
1742 let results = extracted.unwrap();
1743 assert_eq!(results.len(), 1);
1744 assert!((results[0].weight - 0.9).abs() < 0.001);
1745 }
1746
1747 #[test]
1748 fn test_verify_and_extract_bad_signature() {
1749 let response = SearchResponse {
1750 request_id: [1u8; 16],
1751 status: SearchStatus::Ok,
1752 results: vec![],
1753 peer_state_root: [0u8; 32],
1754 search_latency_ms: 0,
1755 timestamp: 1000,
1756 signature: [0u8; 64], };
1758
1759 let fake_key = [0u8; 32];
1760 let extracted = verify_and_extract(&response, &fake_key, [10u8; 16], None);
1761 assert!(extracted.is_none());
1762 }
1763
1764 #[test]
1765 fn test_verify_and_extract_non_ok_status() {
1766 let response = SearchResponse {
1767 request_id: [1u8; 16],
1768 status: SearchStatus::ModelMismatch,
1769 results: vec![],
1770 peer_state_root: [0u8; 32],
1771 search_latency_ms: 0,
1772 timestamp: 1000,
1773 signature: [0u8; 64],
1774 };
1775
1776 let key = [0u8; 32];
1777 let extracted = verify_and_extract(&response, &key, [10u8; 16], None);
1778 assert!(extracted.is_none(), "Non-Ok status should return None");
1779 }
1780
1781 #[test]
1782 fn test_verify_and_extract_unrated_peer() {
1783 let signing_key = ed25519_dalek::SigningKey::from_bytes(&[77u8; 32]);
1784 let public_key = signing_key.verifying_key().to_bytes();
1785
1786 let mut response = SearchResponse {
1787 request_id: [1u8; 16],
1788 status: SearchStatus::Ok,
1789 results: vec![RemoteSearchResult {
1790 chunk_id: [5u8; 16],
1791 chunk_text: "unrated".to_string(),
1792 document_path: "doc.md".to_string(),
1793 score: 14000,
1794 merkle_proof: None,
1795 }],
1796 peer_state_root: [0u8; 32],
1797 search_latency_ms: 10,
1798 timestamp: 2000,
1799 signature: [0u8; 64],
1800 };
1801
1802 let signing_bytes = response.signing_bytes();
1803 response.signature =
1804 ed25519_dalek::Signer::sign(&signing_key, &signing_bytes).to_bytes();
1805
1806 let extracted = verify_and_extract(&response, &public_key, [20u8; 16], None);
1808 assert!(extracted.is_some());
1809 assert!((extracted.unwrap()[0].weight - 0.5).abs() < 0.001);
1810 }
1811
1812 #[test]
1813 fn test_verify_and_extract_clamps_weight() {
1814 let signing_key = ed25519_dalek::SigningKey::from_bytes(&[77u8; 32]);
1815 let public_key = signing_key.verifying_key().to_bytes();
1816
1817 let mut response = SearchResponse {
1818 request_id: [1u8; 16],
1819 status: SearchStatus::Ok,
1820 results: vec![RemoteSearchResult {
1821 chunk_id: [5u8; 16],
1822 chunk_text: "clamped".to_string(),
1823 document_path: "doc.md".to_string(),
1824 score: 14000,
1825 merkle_proof: None,
1826 }],
1827 peer_state_root: [0u8; 32],
1828 search_latency_ms: 10,
1829 timestamp: 2000,
1830 signature: [0u8; 64],
1831 };
1832
1833 let signing_bytes = response.signing_bytes();
1834 response.signature =
1835 ed25519_dalek::Signer::sign(&signing_key, &signing_bytes).to_bytes();
1836
1837 let extracted = verify_and_extract(&response, &public_key, [20u8; 16], Some(0.01));
1839 assert!((extracted.unwrap()[0].weight - 0.3).abs() < 0.001);
1840
1841 let mut response2 = response.clone();
1843 let signing_bytes2 = response2.signing_bytes();
1844 response2.signature =
1845 ed25519_dalek::Signer::sign(&signing_key, &signing_bytes2).to_bytes();
1846
1847 let extracted2 = verify_and_extract(&response2, &public_key, [20u8; 16], Some(5.0));
1848 assert!((extracted2.unwrap()[0].weight - 1.0).abs() < 0.001);
1849 }
1850}