1use cp_core::{Chunk, CPError, Result, ContextAssembler, ScoredChunk, AssembledContext};
8use glob::Pattern;
9use lru::LruCache;
10use std::num::NonZeroUsize;
11use std::sync::{Arc, Mutex, RwLock};
12use tracing::{info, warn};
13use uuid::Uuid;
14use std::collections::HashMap;
15
/// Metadata predicates used to narrow search results to matching documents.
#[derive(Debug, Clone)]
pub enum Filter {
    /// Glob pattern matched against the document path (e.g. "docs/*.md").
    DocumentPath(String),
    /// Exact MIME type match (e.g. "text/markdown").
    MimeType(String),
    /// Documents whose mtime is strictly greater than this Unix timestamp.
    ModifiedAfter(i64),
    /// Documents whose mtime is strictly less than this Unix timestamp.
    ModifiedBefore(i64),
}
30
31impl Filter {
32 pub fn matches(&self, doc: &cp_core::Document) -> bool {
34 match self {
35 Filter::DocumentPath(pattern) => {
36 if let Ok(glob) = Pattern::new(pattern) {
37 glob.matches(doc.path.to_string_lossy().as_ref())
38 } else {
39 false
40 }
41 }
42 Filter::MimeType(mime) => doc.mime_type == *mime,
43 Filter::ModifiedAfter(ts) => doc.mtime > *ts,
44 Filter::ModifiedBefore(ts) => doc.mtime < *ts,
45 }
46 }
47}
48
/// LRU cache of query-text → ranked chunk ids, tied to the substrate's
/// Merkle state root so results are never served across state changes.
pub struct QueryCache {
    // Query-hash → ranked chunk ids. RwLock because even LRU reads mutate
    // recency order, so lookups take the write lock.
    cache: RwLock<LruCache<[u8; 32], Vec<Uuid>>>,
    // Merkle root the cached entries were computed against.
    state_root: RwLock<[u8; 32]>,
}
58
59impl QueryCache {
60 pub fn new(capacity: usize) -> Self {
62 Self {
63 cache: RwLock::new(LruCache::new(
64 NonZeroUsize::new(capacity).unwrap_or(NonZeroUsize::new(100).unwrap()),
65 )),
66 state_root: RwLock::new([0u8; 32]),
67 }
68 }
69
70 pub fn get(&self, query: &str) -> Option<Vec<Uuid>> {
72 let hash = Self::hash_query(query);
73 self.cache.write().ok()?.get(&hash).cloned()
74 }
75
76 pub fn put(&self, query: &str, results: Vec<Uuid>) {
78 let hash = Self::hash_query(query);
79 if let Ok(mut cache) = self.cache.write() {
80 cache.put(hash, results);
81 }
82 }
83
84 pub fn is_valid(&self, current_root: &[u8; 32]) -> bool {
86 if let Ok(root) = self.state_root.read() {
87 *root == *current_root
88 } else {
89 false
90 }
91 }
92
93 pub fn invalidate(&self, new_root: [u8; 32]) {
95 if let Ok(mut cache) = self.cache.write() {
96 cache.clear();
97 }
98 if let Ok(mut root) = self.state_root.write() {
99 *root = new_root;
100 }
101 }
102
103 fn hash_query(query: &str) -> [u8; 32] {
105 *blake3::hash(query.as_bytes()).as_bytes()
106 }
107}
108
impl Default for QueryCache {
    /// Default cache holds up to 100 entries.
    fn default() -> Self {
        Self::new(100)
    }
}
114
/// A single ranked hit returned by the search APIs.
#[derive(Debug, Clone, serde::Serialize)]
pub struct SearchResult {
    /// The matching chunk.
    pub chunk: Chunk,
    /// Relevance score. The scale depends on the search mode (raw vector
    /// similarity, lexical rank, normalized RRF score, or a 1.0 placeholder
    /// for cache hits / document listings).
    pub score: f32,
    /// Path of the document the chunk belongs to.
    pub doc_path: String,
}
125
/// Output of [`QueryEngine::generate_answer`].
#[derive(Debug, Clone, serde::Serialize)]
pub struct GenerationResult {
    /// The model's answer text.
    pub answer: String,
    /// The formatted context that was supplied to the model.
    pub context: String,
    /// End-to-end latency (search + assembly + generation) in milliseconds.
    pub latency_ms: u64,
}
136
/// A span of the response text attributed to a specific context chunk.
#[derive(Debug, Clone, serde::Serialize)]
pub struct Citation {
    /// Chunk the span was matched against.
    pub chunk_id: Uuid,
    /// Byte range (start, end) within the response string.
    pub span: (usize, usize),
    /// Fraction of the response's 5-gram windows found in the chunk (0..=1).
    pub confidence: f32,
}
149
/// Result of checking a generated response against its source context.
#[derive(Debug, Clone, serde::Serialize)]
pub struct ValidationResult {
    /// True when no warnings fired, or when the response explicitly admits
    /// the information is missing from the context.
    pub is_valid: bool,
    /// Human-readable issues (hallucination phrases, low coverage).
    pub warnings: Vec<String>,
    /// Fraction of response bytes covered by citation spans (0..=1).
    pub citation_coverage: f32,
    /// Extracted citations, sorted by descending confidence.
    pub citations: Vec<Citation>,
}
164
/// Phrases suggesting the model is answering from prior knowledge rather
/// than the supplied context; matched case-insensitively against the
/// lowercased response in [`validate_response`].
const HALLUCINATION_PHRASES: &[&str] = &[
    "from my knowledge",
    "i recall that",
    "as far as i know",
    "i believe that",
    "in my experience",
    "typically",
    "generally speaking",
    "it's commonly known",
    "as everyone knows",
    "i think that",
    "probably",
    "most likely",
    "i assume",
    "based on my understanding",
    "from what i've learned",
];
183
184pub fn extract_citations(response: &str, context: &AssembledContext) -> Vec<Citation> {
188 let mut citations = Vec::new();
189 let response_lower = response.to_lowercase();
190 let response_words: Vec<&str> = response_lower.split_whitespace().collect();
191
192 if response_words.len() < 5 {
193 return citations;
194 }
195
196 for chunk in &context.chunks {
197 let chunk_lower = chunk.text.to_lowercase();
198 let chunk_words: Vec<&str> = chunk_lower.split_whitespace().collect();
199
200 if chunk_words.len() < 5 {
201 continue;
202 }
203
204 let mut overlap_count = 0;
206 let mut matched_positions: Vec<usize> = Vec::new();
207
208 for i in 0..=response_words.len().saturating_sub(5) {
209 let response_ngram: Vec<&str> = response_words[i..i + 5].to_vec();
210
211 for j in 0..=chunk_words.len().saturating_sub(5) {
212 let chunk_ngram: Vec<&str> = chunk_words[j..j + 5].to_vec();
213
214 if response_ngram == chunk_ngram {
215 overlap_count += 1;
216 matched_positions.push(i);
217 break;
218 }
219 }
220 }
221
222 if overlap_count > 0 {
223 let max_ngrams = (response_words.len().saturating_sub(4)).max(1);
225 let confidence = (overlap_count as f32) / (max_ngrams as f32);
226
227 let start_pos = matched_positions.first().copied().unwrap_or(0);
229 let end_pos = matched_positions.last().copied().unwrap_or(0) + 5;
230
231 let mut byte_start = 0;
233 let mut byte_end = response.len();
234
235 let mut word_idx = 0;
236 for (i, c) in response.char_indices() {
237 if c.is_whitespace() {
238 word_idx += 1;
239 if word_idx == start_pos {
240 byte_start = i + 1;
241 }
242 if word_idx == end_pos.min(response_words.len()) {
243 byte_end = i;
244 break;
245 }
246 }
247 }
248
249 citations.push(Citation {
250 chunk_id: chunk.chunk_id,
251 span: (byte_start, byte_end),
252 confidence,
253 });
254 }
255 }
256
257 citations.sort_by(|a, b| b.confidence.partial_cmp(&a.confidence).unwrap_or(std::cmp::Ordering::Equal));
259
260 citations
261}
262
263pub fn validate_response(response: &str, context: &AssembledContext) -> ValidationResult {
267 let mut warnings = Vec::new();
268
269 let citations = extract_citations(response, context);
271
272 let total_response_len = response.len() as f32;
274 let mut covered_bytes = 0usize;
275
276 for citation in &citations {
277 covered_bytes += citation.span.1.saturating_sub(citation.span.0);
278 }
279
280 let citation_coverage = if total_response_len > 0.0 {
281 (covered_bytes as f32 / total_response_len).min(1.0)
282 } else {
283 0.0
284 };
285
286 let response_lower = response.to_lowercase();
288 for phrase in HALLUCINATION_PHRASES {
289 if response_lower.contains(phrase) {
290 warnings.push(format!("Response contains hallucination indicator: '{}'", phrase));
291 }
292 }
293
294 if citation_coverage < 0.3 && !response.is_empty() {
296 warnings.push(format!(
297 "Low citation coverage: {:.1}% (threshold: 30%)",
298 citation_coverage * 100.0
299 ));
300 }
301
302 let good_phrases = ["information is missing", "not found in the context", "cannot find"];
304 let claims_missing = good_phrases.iter().any(|p| response_lower.contains(p));
305
306 let is_valid = warnings.is_empty() || claims_missing;
308
309 ValidationResult {
310 is_valid,
311 warnings,
312 citation_coverage,
313 citations,
314 }
315}
316
/// Abstraction over a text-generation backend (e.g. a local LLM server).
#[async_trait::async_trait]
pub trait IntelligenceEngine: Send + Sync {
    /// Generate an answer to `query` grounded strictly in `context`.
    async fn generate(&self, context: &AssembledContext, query: &str) -> Result<String>;
}
327
/// [`IntelligenceEngine`] backed by an Ollama HTTP server.
pub struct OllamaGenerator {
    // Server base URL; "/api/generate" is appended per request.
    base_url: String,
    // Ollama model name passed in the request payload.
    model: String,
}
333
impl OllamaGenerator {
    /// Create a generator targeting `model` on the Ollama server at `base_url`.
    pub fn new(base_url: String, model: String) -> Self {
        Self { base_url, model }
    }
}
340
#[async_trait::async_trait]
impl IntelligenceEngine for OllamaGenerator {
    /// Send a context-grounded prompt to Ollama's `/api/generate` endpoint
    /// (non-streaming) and return the model's text response.
    ///
    /// # Errors
    /// Returns `CPError::Embedding` for client/transport failures and
    /// `CPError::Parse` for malformed response bodies.
    async fn generate(&self, context: &AssembledContext, query: &str) -> Result<String> {
        let formatted_context = ContextAssembler::format(context);

        // ChatML-style prompt: the system message confines the model to the
        // provided context and prescribes a fixed "missing" sentinel answer.
        let prompt = format!(
            "<|im_start|>system\nYou are a context reading machine. You do not have knowledge of the outside world.\n- Read the Context below carefully.\n- If the answer to the Query is in the Context, output it.\n- If the answer is NOT in the Context, say 'Information is missing from the substrate.' and nothing else.\n- Do not make up facts.\n<|im_end|>\n<|im_start|>user\nContext:\n{}\n\nQuery: {}<|im_end|>\n<|im_start|>assistant\n",
            formatted_context, query
        );

        // Generous 120s timeout: local model generation can be slow.
        let client = reqwest::Client::builder()
            .timeout(std::time::Duration::from_secs(120))
            .build()
            // NOTE(review): reuses the Embedding error variant for HTTP
            // failures — consider a dedicated transport variant.
            .map_err(|e| CPError::Embedding(format!("Failed to create HTTP client: {}", e)))?;
        let payload = serde_json::json!({
            "model": self.model,
            "prompt": prompt,
            "stream": false
        });

        let url = format!("{}/api/generate", self.base_url);
        let res = client
            .post(&url)
            .json(&payload)
            .send()
            .await
            .map_err(|e| CPError::Embedding(format!("Ollama request failed: {}", e)))?;

        let json: serde_json::Value = res
            .json()
            .await
            .map_err(|e| CPError::Parse(e.to_string()))?;

        // Non-streaming Ollama responses carry the text in "response".
        let answer = json["response"]
            .as_str()
            .ok_or_else(|| CPError::Parse("Invalid Ollama response".into()))?
            .to_string();

        Ok(answer)
    }
}
383
/// Search and retrieval front-end over the graph store, with an optional
/// LLM generation backend and a Merkle-root-validated query cache.
pub struct QueryEngine {
    // Shared, mutex-guarded graph/vector/FTS store.
    graph: Arc<Mutex<cp_graph::GraphStore>>,
    // Embedding model used to vectorize queries.
    embedder: Arc<cp_embeddings::EmbeddingEngine>,
    // Optional generation backend; required by `generate_answer`.
    intelligence: Option<Box<dyn IntelligenceEngine>>,
    // Query-result cache, invalidated whenever the state root changes.
    cache: QueryCache,
}
392
impl QueryEngine {
    /// Build a query engine over `graph` using `embedder`, with no
    /// intelligence engine and a default-capacity query cache.
    pub fn new(
        graph: Arc<Mutex<cp_graph::GraphStore>>,
        embedder: Arc<cp_embeddings::EmbeddingEngine>,
    ) -> Self {
        Self {
            graph,
            embedder,
            intelligence: None,
            cache: QueryCache::default(),
        }
    }

    /// Like [`QueryEngine::new`] but with an explicit query-cache capacity.
    pub fn with_cache_capacity(
        graph: Arc<Mutex<cp_graph::GraphStore>>,
        embedder: Arc<cp_embeddings::EmbeddingEngine>,
        cache_capacity: usize,
    ) -> Self {
        Self {
            graph,
            embedder,
            intelligence: None,
            cache: QueryCache::new(cache_capacity),
        }
    }

    /// Builder-style setter for the generation backend.
    pub fn with_intelligence(mut self, intelligence: Box<dyn IntelligenceEngine>) -> Self {
        self.intelligence = Some(intelligence);
        self
    }

    /// In-place setter for the generation backend.
    pub fn set_intelligence(&mut self, intelligence: Box<dyn IntelligenceEngine>) {
        self.intelligence = Some(intelligence);
    }

    /// Hybrid search: runs semantic (vector) and lexical (FTS) search and
    /// fuses the two rankings with Reciprocal Rank Fusion, returning at
    /// most `k` results.
    pub fn search(&self, query: &str, k: usize) -> Result<Vec<SearchResult>> {
        info!("Hybrid search for: '{}'", query);

        let query_vec = self
            .embedder
            .embed(query)
            .map_err(|e| CPError::Embedding(format!("Failed to embed query: {}", e)))?;

        let semantic_results = {
            let graph = self.graph.lock().unwrap();
            graph.search(&query_vec, k)?
        };

        let lexical_results = {
            let graph = self.graph.lock().unwrap();
            // Multi-word queries are quoted so FTS treats them as a phrase;
            // embedded quotes are stripped to keep the FTS syntax valid.
            let fts_query = if query.contains(' ') {
                format!("\"{}\"", query.replace('"', ""))
            } else {
                query.to_string()
            };
            // Lexical search is best-effort: on failure, fall back to
            // semantic-only instead of erroring the whole query.
            graph.search_lexical(&fts_query, k).unwrap_or_else(|e| {
                warn!("Lexical search failed: {}. Falling back to semantic only.", e);
                Vec::new()
            })
        };

        // RRF: score(d) = sum over rankings of 1 / (K + rank). Computed in
        // scaled integer arithmetic to keep fusion deterministic.
        const RRF_K: u64 = 60;
        const RRF_SCALE: u64 = 1_000_000;

        let mut scores: std::collections::HashMap<Uuid, u64> = std::collections::HashMap::new();

        {
            let graph = self.graph.lock().unwrap();

            // Semantic hits are keyed by embedding id; map back to chunk ids.
            for (i, (emb_id, _)) in semantic_results.iter().enumerate() {
                if let Ok(Some(chunk_id)) = graph.get_chunk_id_for_embedding(*emb_id) {
                    let score = RRF_SCALE / (RRF_K + i as u64);
                    *scores.entry(chunk_id).or_insert(0) += score;
                }
            }

            for (i, (chunk_id, _)) in lexical_results.iter().enumerate() {
                let score = RRF_SCALE / (RRF_K + i as u64);
                *scores.entry(*chunk_id).or_insert(0) += score;
            }
        }

        // Sort by fused score desc; ties broken by chunk id for determinism.
        let mut fused: Vec<(Uuid, u64)> = scores.into_iter().collect();
        fused.sort_by(|a, b| {
            b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0))
        });
        fused.truncate(k);

        let mut search_results = Vec::with_capacity(fused.len());
        let graph = self.graph.lock().unwrap();

        for (chunk_id, fused_score) in fused {
            // Chunks/documents may have been deleted since indexing; skip.
            let chunk = match graph.get_chunk(chunk_id)? {
                Some(c) => c,
                None => continue,
            };

            let doc = match graph.get_document(chunk.doc_id)? {
                Some(d) => d,
                None => continue,
            };

            // NOTE(review): normalizes against 2 * RRF_SCALE, but the max
            // attainable fused score is 2 * RRF_SCALE / RRF_K, so values
            // stay well below 1.0 — confirm the intended scale.
            let normalized_score = fused_score as f32 / (RRF_SCALE * 2) as f32;

            search_results.push(SearchResult {
                chunk,
                score: normalized_score,
                doc_path: doc.path.to_string_lossy().to_string(),
            });
        }

        Ok(search_results)
    }

    /// Pure vector-similarity search: embed `query` and return the top `k`
    /// chunks by embedding score, with no lexical component.
    pub fn search_semantic(&self, query: &str, k: usize) -> Result<Vec<SearchResult>> {
        info!("Semantic search for: '{}'", query);

        let query_vec = self
            .embedder
            .embed(query)
            .map_err(|e| CPError::Embedding(format!("Failed to embed query: {}", e)))?;

        let raw_results = {
            let graph = self.graph.lock().unwrap();
            graph.search(&query_vec, k)?
        };

        let mut search_results = Vec::with_capacity(raw_results.len());
        let graph = self.graph.lock().unwrap();

        for (emb_id, score) in raw_results {
            // Silently skip hits whose chunk or document no longer exists.
            if let Some(chunk_id) = graph.get_chunk_id_for_embedding(emb_id)? {
                if let Some(chunk) = graph.get_chunk(chunk_id)? {
                    if let Some(doc) = graph.get_document(chunk.doc_id)? {
                        search_results.push(SearchResult {
                            chunk,
                            score,
                            doc_path: doc.path.to_string_lossy().to_string(),
                        });
                    }
                }
            }
        }

        Ok(search_results)
    }

    /// Pure full-text search over chunk text, returning up to `k` results.
    pub fn search_lexical(&self, query: &str, k: usize) -> Result<Vec<SearchResult>> {
        info!("Lexical search for: '{}'", query);

        let raw_results = {
            let graph = self.graph.lock().unwrap();
            // Same phrase-quoting rule as in hybrid `search`.
            let fts_query = if query.contains(' ') {
                format!("\"{}\"", query.replace('"', ""))
            } else {
                query.to_string()
            };
            graph.search_lexical(&fts_query, k)?
        };

        let mut search_results = Vec::with_capacity(raw_results.len());
        let graph = self.graph.lock().unwrap();

        for (chunk_id, score) in raw_results {
            if let Some(chunk) = graph.get_chunk(chunk_id)? {
                if let Some(doc) = graph.get_document(chunk.doc_id)? {
                    search_results.push(SearchResult {
                        chunk,
                        score,
                        doc_path: doc.path.to_string_lossy().to_string(),
                    });
                }
            }
        }

        Ok(search_results)
    }

    /// Hybrid search restricted to documents satisfying ALL `filters`.
    ///
    /// Over-fetches (k * 3) from the unfiltered search and filters the
    /// results, so fewer than `k` hits may come back when matches are sparse.
    pub fn search_filtered(&self, query: &str, k: usize, filters: &[Filter]) -> Result<Vec<SearchResult>> {
        info!("Filtered search for: '{}' with {} filters", query, filters.len());

        // Resolve the allowed document set up front (filters AND together).
        let matching_doc_ids: std::collections::HashSet<Uuid> = {
            let graph = self.graph.lock().unwrap();
            let all_docs = graph.get_all_documents()?;

            all_docs
                .into_iter()
                .filter(|doc| filters.iter().all(|f| f.matches(doc)))
                .map(|doc| doc.id)
                .collect()
        };

        if matching_doc_ids.is_empty() {
            info!("No documents match filters");
            return Ok(Vec::new());
        }

        let all_results = self.search(query, k * 3)?;
        let filtered_results: Vec<SearchResult> = all_results
            .into_iter()
            .filter(|r| matching_doc_ids.contains(&r.chunk.doc_id))
            .take(k)
            .collect();

        info!("Filtered search returned {} results", filtered_results.len());
        Ok(filtered_results)
    }

    /// Hybrid search fronted by the LRU query cache, which is validated
    /// against the store's current Merkle root before use.
    ///
    /// Cache hits return `score: 1.0` placeholders because only chunk ids
    /// (not scores) are cached.
    pub fn search_cached(&self, query: &str, k: usize) -> Result<Vec<SearchResult>> {
        let current_root = {
            let graph = self.graph.lock().unwrap();
            graph.compute_merkle_root()?
        };

        // Any substrate change invalidates every cached entry.
        if !self.cache.is_valid(&current_root) {
            self.cache.invalidate(current_root);
        }

        if let Some(chunk_ids) = self.cache.get(query) {
            info!("Cache hit for query: '{}'", query);

            let graph = self.graph.lock().unwrap();
            let mut results = Vec::new();

            for chunk_id in chunk_ids.iter().take(k) {
                if let Some(chunk) = graph.get_chunk(*chunk_id)? {
                    if let Some(doc) = graph.get_document(chunk.doc_id)? {
                        results.push(SearchResult {
                            chunk,
                            score: 1.0, // original score not cached
                            doc_path: doc.path.to_string_lossy().to_string(),
                        });
                    }
                }
            }

            return Ok(results);
        }

        let results = self.search(query, k)?;

        let chunk_ids: Vec<Uuid> = results.iter().map(|r| r.chunk.id).collect();
        self.cache.put(query, chunk_ids);

        Ok(results)
    }

    /// Recompute the current Merkle root and reset the query cache to it.
    pub fn invalidate_cache(&self) -> Result<()> {
        let root = {
            let graph = self.graph.lock().unwrap();
            graph.compute_merkle_root()?
        };
        self.cache.invalidate(root);
        Ok(())
    }

    /// Return every chunk belonging to `doc_id`, each with a fixed 1.0 score.
    ///
    /// # Errors
    /// Returns `CPError::Database` when the document does not exist.
    pub fn get_chunks_for_document(&self, doc_id: Uuid) -> Result<Vec<SearchResult>> {
        let graph = self.graph.lock().unwrap();

        let doc = graph
            .get_document(doc_id)?
            .ok_or_else(|| CPError::Database(format!("Doc {} not found", doc_id)))?;

        let chunks = graph.get_chunks_for_doc(doc_id)?;

        Ok(chunks.into_iter().map(|c| SearchResult {
            chunk: c,
            score: 1.0, // listing, not ranking: placeholder score
            doc_path: doc.path.to_string_lossy().to_string(),
        }).collect())
    }

    /// Shared handle to the underlying graph store.
    pub fn graph(&self) -> Arc<Mutex<cp_graph::GraphStore>> {
        self.graph.clone()
    }

    /// Full RAG pipeline: hybrid-search the top 5 chunks, assemble them
    /// into a token-budgeted context, and ask the configured intelligence
    /// engine for an answer.
    ///
    /// # Errors
    /// Returns `CPError::NotFound` when no intelligence engine is set.
    pub async fn generate_answer(&self, query: &str) -> Result<GenerationResult> {
        let start = std::time::Instant::now();
        info!("Generating answer for: '{}'", query);

        let results = self.search(query, 5)?;

        // ~2000-token budget for the assembled context.
        let assembler = ContextAssembler::with_budget(2000);
        let scored_chunks: Vec<ScoredChunk> = results
            .iter()
            .map(|r| ScoredChunk {
                chunk: r.chunk.clone(),
                score: r.score,
                document_path: r.doc_path.clone(),
            })
            .collect();

        let state_root = {
            let graph = self.graph.lock().unwrap();
            graph.compute_merkle_root()?
        };

        let assembled_context = assembler.assemble(scored_chunks, query, state_root);

        let answer = if let Some(ref engine) = self.intelligence {
            engine.generate(&assembled_context, query).await?
        } else {
            return Err(CPError::NotFound("Intelligence engine not configured".into()));
        };

        Ok(GenerationResult {
            answer,
            context: ContextAssembler::format(&assembled_context),
            latency_ms: start.elapsed().as_millis() as u64,
        })
    }

    /// Build a signed, verifiable receipt for a query: hashes of the query
    /// and the assembled context, the substrate state root, Merkle inclusion
    /// proofs for each result chunk, and the full source excerpts — all
    /// signed by `identity`.
    pub fn generate_proof_receipt(
        &self,
        query: &str,
        search_results: &[SearchResult],
        identity: &cp_sync::DeviceIdentity,
    ) -> Result<cp_core::ProofReceipt> {
        let query_hash = *blake3::hash(query.as_bytes()).as_bytes();

        // Reassemble the context exactly as presented, then hash it.
        let assembler = ContextAssembler::with_budget(4000);
        let scored_chunks: Vec<ScoredChunk> = search_results
            .iter()
            .map(|r| ScoredChunk {
                chunk: r.chunk.clone(),
                score: r.score,
                document_path: r.doc_path.clone(),
            })
            .collect();

        let state_root = {
            let graph = self.graph.lock().unwrap();
            graph.compute_merkle_root()?
        };

        let assembled = assembler.assemble(scored_chunks, query, state_root);
        let context_string = ContextAssembler::format(&assembled);
        let context_hash = *blake3::hash(context_string.as_bytes()).as_bytes();

        // Canonical (sorted) chunk-hash list and the Merkle root over it.
        let (sorted_chunk_ids, sorted_chunk_hashes, chunk_tree_root) = {
            let graph = self.graph.lock().unwrap();
            let sorted = graph.get_sorted_chunk_hashes()?;
            let hashes: Vec<[u8; 32]> = sorted.iter().map(|(_, h)| *h).collect();
            let root = cp_core::proof::compute_chunk_tree_root(&hashes);
            (sorted, hashes, root)
        };

        let mut chunk_proofs = Vec::new();
        let mut sources = Vec::new();

        for result in search_results {
            let chunk_id_bytes = *result.chunk.id.as_bytes();

            // Inclusion proof only when the chunk is present in the tree.
            if let Some(idx) = sorted_chunk_ids.iter().position(|(id, _)| *id == chunk_id_bytes) {
                let proof = cp_core::proof::build_chunk_proof(
                    chunk_id_bytes,
                    result.chunk.text_hash,
                    idx,
                    &sorted_chunk_hashes,
                );
                chunk_proofs.push(proof);
            }

            sources.push(cp_core::SourceRef {
                document_path: result.doc_path.clone(),
                chunk_id: chunk_id_bytes,
                chunk_text: result.chunk.text.clone(),
                chunk_sequence: result.chunk.sequence,
                relevance_score: result.score,
            });
        }

        // UTC timestamp rendered without external date/time crates.
        let now = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap_or_default();
        let secs = now.as_secs();
        let timestamp = format_unix_timestamp(secs);

        // Sign over the receipt with a zeroed signature field, then write
        // the signature back in.
        let mut receipt = cp_core::ProofReceipt {
            version: 1,
            query: query.to_string(),
            query_hash,
            timestamp,
            context_hash,
            state_root,
            chunk_tree_root,
            chunk_proofs,
            sources,
            signature: [0u8; 64],
            signer_public_key: identity.public_key,
            device_id: identity.device_id,
            git: None,
        };

        let sig = identity.sign(&receipt.signing_bytes());
        receipt.signature = sig;

        Ok(receipt)
    }

    /// Chat is not yet implemented; use [`QueryEngine::generate_answer`].
    pub fn chat(&self, _query: &str, _history: &[Message]) -> Result<String> {
        Err(CPError::NotFound("Use generate_answer for Phase 2 initial integration".into()))
    }
}
856
/// A single turn in a chat conversation.
#[derive(Debug, Clone)]
pub struct Message {
    /// Who produced this message.
    pub role: Role,
    /// Message text.
    pub content: String,
}
863
/// Speaker role for a chat [`Message`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Role {
    User,
    Assistant,
    System,
}
871
/// Render a Unix timestamp (seconds since 1970-01-01T00:00:00Z) as an
/// RFC 3339 UTC string, e.g. "2023-11-14T22:13:20Z".
fn format_unix_timestamp(secs: u64) -> String {
    let days = secs / 86400;
    let rem = secs % 86400;
    let (year, month, day) = days_to_date(days);
    format!(
        "{:04}-{:02}-{:02}T{:02}:{:02}:{:02}Z",
        year,
        month,
        day,
        rem / 3600,
        (rem % 3600) / 60,
        rem % 60
    )
}

/// Convert a day count since 1970-01-01 to a (year, month, day) civil date.
///
/// Uses Howard Hinnant's `civil_from_days` algorithm, specialized to
/// non-negative day counts (dates on or after the Unix epoch).
fn days_to_date(days: u64) -> (u64, u64, u64) {
    // Shift the epoch from 1970-01-01 to 0000-03-01 so leap days land at
    // the end of the (March-based) year.
    let z = days + 719468;
    let era = z / 146097; // 400-year eras
    let doe = z % 146097; // day-of-era, [0, 146096]
    let yoe = (doe - doe / 1460 + doe / 36524 - doe / 146096) / 365; // year-of-era
    let doy = doe - (365 * yoe + yoe / 4 - yoe / 100); // day-of-year, March-based
    let mp = (5 * doy + 2) / 153; // month index, March = 0
    let day = doy - (153 * mp + 2) / 5 + 1;
    let month = if mp < 10 { mp + 3 } else { mp - 9 };
    let year = yoe + era * 400 + if month <= 2 { 1 } else { 0 };
    (year, month, day)
}
899
#[cfg(test)]
mod tests {
    use super::*;
    use cp_core::{Document, Chunk};
    use std::sync::{Arc, Mutex};
    use tempfile::TempDir;

    // Chunks attached to a document are returned verbatim by the engine.
    #[tokio::test]
    async fn test_get_chunks_for_document() {
        let temp = TempDir::new().unwrap();
        let db_path = temp.path().join("test.db");
        let mut graph = cp_graph::GraphStore::open(db_path.to_str().unwrap()).unwrap();

        let doc = Document::new("test.md".into(), b"Hello world", 100);
        graph.insert_document(&doc).unwrap();

        let chunk = Chunk {
            id: Uuid::new_v4(),
            doc_id: doc.id,
            text: "Hello world".to_string(),
            byte_offset: 0,
            byte_length: 11,
            sequence: 0,
            text_hash: [0; 32],
        };
        graph.insert_chunk(&chunk).unwrap();

        let embedder = Arc::new(cp_embeddings::EmbeddingEngine::new().unwrap());
        let qe = QueryEngine::new(Arc::new(Mutex::new(graph)), embedder);

        let results = qe.get_chunks_for_document(doc.id).unwrap();
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].chunk.text, "Hello world");
    }

    // Hybrid search finds both the exact phrase (lexical) and a paraphrase
    // (semantic) of the same chunk.
    #[tokio::test]
    async fn test_hybrid_search() {
        let temp = TempDir::new().unwrap();
        let db_path = temp.path().join("test_hybrid.db");
        let mut graph = cp_graph::GraphStore::open(db_path.to_str().unwrap()).unwrap();

        let doc = Document::new("test.md".into(), b"The quick brown fox jumps over the lazy dog", 100);
        graph.insert_document(&doc).unwrap();

        let chunk = Chunk {
            id: Uuid::new_v4(),
            doc_id: doc.id,
            text: "The quick brown fox jumps over the lazy dog".to_string(),
            byte_offset: 0,
            byte_length: 43,
            sequence: 0,
            text_hash: [0; 32],
        };
        graph.insert_chunk(&chunk).unwrap();

        let embedder = Arc::new(cp_embeddings::EmbeddingEngine::new().unwrap());
        let vec = embedder.embed(&chunk.text).unwrap();
        let emb = cp_core::Embedding::new(chunk.id, &vec, embedder.model_hash(), 0);
        graph.insert_embedding(&emb).unwrap();

        let qe = QueryEngine::new(Arc::new(Mutex::new(graph)), embedder);

        // Exact-phrase query should hit via the lexical leg.
        let results = qe.search("quick brown fox", 5).unwrap();
        assert!(!results.is_empty());
        assert!(results[0].chunk.text.contains("quick brown fox"));

        // Paraphrase query should still hit via the semantic leg.
        let results_sem = qe.search("fast auburn canine", 5).unwrap();
        assert!(!results_sem.is_empty());
        assert!(results_sem[0].chunk.text.contains("quick brown fox"));
    }

    // Compares lexical-only, vector-only and hybrid hit counts on a tiny
    // three-document corpus; hybrid must surface both the exact match and
    // the paraphrase.
    #[tokio::test]
    async fn test_search_comparison_proof() {
        let temp = TempDir::new().unwrap();
        let db_path = temp.path().join("comparison_proof.db");
        let mut graph = cp_graph::GraphStore::open(db_path.to_str().unwrap()).unwrap();

        let t1 = "The quick brown fox jumps over the lazy dog";
        let t2 = "Artificial intelligence is transforming the modern world";
        let t3 = "A fast auburn canine leaps across an idle hound";
        let texts = vec![t1, t2, t3];
        for (i, text) in texts.iter().enumerate() {
            let doc = Document::new(format!("doc_{}.md", i).into(), text.as_bytes(), 100);
            graph.insert_document(&doc).unwrap();
            let chunk = Chunk {
                id: Uuid::new_v4(),
                doc_id: doc.id,
                text: text.to_string(),
                byte_offset: 0,
                byte_length: text.len() as u64,
                sequence: 0,
                text_hash: [0; 32],
            };
            graph.insert_chunk(&chunk).unwrap();
            let embedder = Arc::new(cp_embeddings::EmbeddingEngine::new().unwrap());
            let vec = embedder.embed(text).unwrap();
            let emb = cp_core::Embedding::new(chunk.id, &vec, embedder.model_hash(), 0);
            graph.insert_embedding(&emb).unwrap();
        }

        let embedder = Arc::new(cp_embeddings::EmbeddingEngine::new().unwrap());
        let qe = QueryEngine::new(Arc::new(Mutex::new(graph)), embedder);

        let query = "fox";

        // Lexical: only the literal "fox" document matches.
        let lexical = {
            let graph_lock = qe.graph();
            let g = graph_lock.lock().unwrap();
            g.search_lexical(query, 5).unwrap()
        };
        assert_eq!(lexical.len(), 1);

        // Vector: the paraphrase should also rank.
        let vector = {
            let graph_lock = qe.graph();
            let g = graph_lock.lock().unwrap();
            let e = cp_embeddings::EmbeddingEngine::new().unwrap();
            let q_vec = e.embed(query).unwrap();
            g.search(&q_vec, 5).unwrap()
        };
        assert!(vector.len() >= 2);

        let hybrid = qe.search(query, 5).unwrap();

        println!("\n--- SEARCH PROOF FOR '{}' ---", query);
        println!("LEXICAL HITS: {}", lexical.len());
        println!("VECTOR HITS: {}", vector.len());
        println!("HYBRID HITS: {}", hybrid.len());

        // Hybrid must include both the exact match and the paraphrase.
        let texts_found: Vec<String> = hybrid.iter().map(|r| r.chunk.text.clone()).collect();
        assert!(texts_found.contains(&t1.to_string()));
        assert!(texts_found.contains(&t3.to_string()));
    }

    // MIME-type filter matches only documents of that type.
    #[test]
    fn test_filter_by_mime_type() {
        use cp_core::Document;
        use std::path::PathBuf;

        let doc_md = Document::new(PathBuf::from("test.md"), b"content", 1000);
        let doc_pdf = Document::new(PathBuf::from("test.pdf"), b"pdf content", 1000);

        let filter = Filter::MimeType("text/markdown".to_string());

        assert!(filter.matches(&doc_md));
        assert!(!filter.matches(&doc_pdf));
    }

    // Glob path filter matches only paths under the pattern.
    #[test]
    fn test_filter_by_path_glob() {
        use cp_core::Document;
        use std::path::PathBuf;

        let doc1 = Document::new(PathBuf::from("docs/readme.md"), b"content", 1000);
        let doc2 = Document::new(PathBuf::from("src/main.rs"), b"code", 1000);

        let filter = Filter::DocumentPath("docs/*.md".to_string());

        assert!(filter.matches(&doc1));
        assert!(!filter.matches(&doc2));
    }

    // ModifiedAfter/ModifiedBefore are strict comparisons on mtime.
    #[test]
    fn test_filter_by_modified_time() {
        use cp_core::Document;
        use std::path::PathBuf;

        let old_doc = Document::new(PathBuf::from("old.md"), b"content", 1000);
        let new_doc = Document::new(PathBuf::from("new.md"), b"content", 2000);

        let filter_after = Filter::ModifiedAfter(1500);
        let filter_before = Filter::ModifiedBefore(1500);

        assert!(!filter_after.matches(&old_doc));
        assert!(filter_after.matches(&new_doc));

        assert!(filter_before.matches(&old_doc));
        assert!(!filter_before.matches(&new_doc));
    }

    // A response that quotes the chunk should produce a citation with
    // positive confidence.
    #[test]
    fn test_citation_extraction() {
        use cp_core::{ContextChunk, ContextMetadata};

        let context = AssembledContext {
            chunks: vec![ContextChunk {
                chunk_id: Uuid::new_v4(),
                document_path: "test.md".to_string(),
                text: "The quick brown fox jumps over the lazy dog".to_string(),
                score: 1.0,
                sequence: 0,
            }],
            total_tokens: 10,
            truncated: false,
            metadata: ContextMetadata {
                query_hash: [0u8; 32],
                state_root: [0u8; 32],
            },
        };

        let response = "As mentioned, the quick brown fox jumps over the lazy dog in the story.";
        let citations = extract_citations(response, &context);

        assert!(!citations.is_empty(), "Should find citations for overlapping text");
        assert!(citations[0].confidence > 0.0);
    }

    // Hallucination phrases trigger warnings; admitting missing info keeps
    // the response valid.
    #[test]
    fn test_hallucination_detection() {
        use cp_core::{ContextChunk, ContextMetadata};

        let context = AssembledContext {
            chunks: vec![ContextChunk {
                chunk_id: Uuid::new_v4(),
                document_path: "test.md".to_string(),
                text: "The capital of France is Paris".to_string(),
                score: 1.0,
                sequence: 0,
            }],
            total_tokens: 10,
            truncated: false,
            metadata: ContextMetadata {
                query_hash: [0u8; 32],
                state_root: [0u8; 32],
            },
        };

        let bad_response = "From my knowledge, I believe that Paris is a beautiful city.";
        let result = validate_response(bad_response, &context);

        assert!(!result.warnings.is_empty(), "Should detect hallucination phrases");
        assert!(result.warnings.iter().any(|w| w.contains("hallucination")));

        let good_response = "Information is missing from the substrate.";
        let result2 = validate_response(good_response, &context);

        assert!(result2.is_valid, "Should be valid when admitting missing info");
    }

    // End-to-end check against a local test corpus; skipped when the corpus
    // directory is not present on this machine.
    #[tokio::test]
    async fn test_real_corpus_proof() {
        let temp = TempDir::new().unwrap();
        let db_path = temp.path().join("real_corpus.db");
        let mut graph = cp_graph::GraphStore::open(db_path.to_str().unwrap()).unwrap();

        // NOTE(review): machine-specific absolute path — consider an env var.
        let corpus_dir = std::path::PathBuf::from("/Users/nadeem/dev/CP/test_corpus");

        if !corpus_dir.exists() {
            println!("Skipping test: corpus directory not found at {:?}", corpus_dir);
            return;
        }

        let embedder = Arc::new(cp_embeddings::EmbeddingEngine::new().unwrap());

        let files = vec!["zk.md", "ethereum.md", "random.md", "zksnark.md", "lexical_gap.md", "adversarial.md"];

        for file_name in files {
            let path = corpus_dir.join(file_name);
            if !path.exists() { continue; }

            let content = std::fs::read_to_string(&path).unwrap();
            let doc = Document::new(path.clone(), content.as_bytes(), 0);
            graph.insert_document(&doc).unwrap();

            // One chunk per file — enough for ranking checks.
            let chunk = Chunk {
                id: Uuid::new_v4(),
                doc_id: doc.id,
                text: content.clone(),
                byte_offset: 0,
                byte_length: content.len() as u64,
                sequence: 0,
                text_hash: [0; 32],
            };
            graph.insert_chunk(&chunk).unwrap();

            let vec = embedder.embed(&content).unwrap();
            let emb = cp_core::Embedding::new(chunk.id, &vec, embedder.model_hash(), 0);
            graph.insert_embedding(&emb).unwrap();
        }

        let qe = QueryEngine::new(Arc::new(Mutex::new(graph)), embedder);

        // An exotic identifier should be found lexically despite no
        // semantic similarity.
        let q1 = "calculate_hyper_parameter_v7";
        let res1 = qe.search(q1, 1).unwrap();
        println!("\n--- QUERY: '{}' ---", q1);
        for (i, r) in res1.iter().enumerate() {
            println!("Rank {}: [Score: {:.4}] {}", i+1, r.score, r.doc_path);
        }
        assert!(res1[0].doc_path.contains("lexical_gap.md"));

        // A conceptual query should surface the zk documents semantically.
        let q2 = "cryptographic privacy statement validity";
        let res2 = qe.search(q2, 3).unwrap();
        println!("\n--- QUERY: '{}' ---", q2);
        for (i, r) in res2.iter().enumerate() {
            println!("Rank {}: [Score: {:.4}] {}", i+1, r.score, r.doc_path);
        }
        let found_zk = res2.iter().any(|r| r.doc_path.contains("zk"));
        assert!(found_zk);

        // Mixed query: printed for manual inspection only.
        let q3 = "Ethereum privacy zksnark";
        let res3 = qe.search(q3, 5).unwrap();
        println!("\n--- QUERY: '{}' ---", q3);
        for (i, r) in res3.iter().enumerate() {
            println!("Rank {}: [Score: {:.4}] {}", i+1, r.score, r.doc_path);
        }
    }

    // Semantic-only and lexical-only modes each find the chunk on their own.
    #[tokio::test]
    async fn test_search_modes() {
        let temp = TempDir::new().unwrap();
        let db_path = temp.path().join("modes.db");
        let mut graph = cp_graph::GraphStore::open(db_path.to_str().unwrap()).unwrap();

        let doc = Document::new("test.md".into(), b"The quick brown fox jumps over the lazy dog", 100);
        graph.insert_document(&doc).unwrap();
        let chunk = Chunk {
            id: Uuid::new_v4(),
            doc_id: doc.id,
            text: "The quick brown fox jumps over the lazy dog".to_string(),
            byte_offset: 0,
            byte_length: 43,
            sequence: 0,
            text_hash: [0; 32],
        };
        graph.insert_chunk(&chunk).unwrap();
        let embedder = Arc::new(cp_embeddings::EmbeddingEngine::new().unwrap());
        let vec = embedder.embed(&chunk.text).unwrap();
        let emb = cp_core::Embedding::new(chunk.id, &vec, embedder.model_hash(), 0);
        graph.insert_embedding(&emb).unwrap();

        let qe = QueryEngine::new(Arc::new(Mutex::new(graph)), embedder);

        let results = qe.search_semantic("canine", 5).unwrap();
        assert!(!results.is_empty());

        let results = qe.search_lexical("fox", 5).unwrap();
        assert!(!results.is_empty());
        assert!(results[0].chunk.text.contains("fox"));
    }
}