1use cp_core::{Chunk, CPError, Result, ContextAssembler, ScoredChunk, AssembledContext};
8use glob::Pattern;
9use lru::LruCache;
10use std::num::NonZeroUsize;
11use std::sync::{Arc, Mutex, RwLock};
12use tracing::{info, warn};
13use uuid::Uuid;
14use std::collections::HashMap;
15
/// Metadata predicates used to narrow search results to matching documents.
#[derive(Debug, Clone)]
pub enum Filter {
    /// Glob pattern matched against the document path (e.g. "docs/*.md").
    DocumentPath(String),
    /// Exact MIME type match (e.g. "text/markdown").
    MimeType(String),
    /// Documents whose mtime is strictly greater than this Unix timestamp.
    ModifiedAfter(i64),
    /// Documents whose mtime is strictly less than this Unix timestamp.
    ModifiedBefore(i64),
}
30
31impl Filter {
32 pub fn matches(&self, doc: &cp_core::Document) -> bool {
34 match self {
35 Filter::DocumentPath(pattern) => {
36 if let Ok(glob) = Pattern::new(pattern) {
37 glob.matches(doc.path.to_string_lossy().as_ref())
38 } else {
39 false
40 }
41 }
42 Filter::MimeType(mime) => doc.mime_type == *mime,
43 Filter::ModifiedAfter(ts) => doc.mtime > *ts,
44 Filter::ModifiedBefore(ts) => doc.mtime < *ts,
45 }
46 }
47}
48
/// LRU cache of query-text → ranked chunk ids, tied to the substrate's
/// Merkle state root so results are never served across state changes.
pub struct QueryCache {
    // Query-hash → ranked chunk ids. RwLock because even LRU reads mutate
    // recency order, so lookups take the write lock.
    cache: RwLock<LruCache<[u8; 32], Vec<Uuid>>>,
    // Merkle root the cached entries were computed against.
    state_root: RwLock<[u8; 32]>,
}
58
59impl QueryCache {
60 pub fn new(capacity: usize) -> Self {
62 Self {
63 cache: RwLock::new(LruCache::new(
64 NonZeroUsize::new(capacity).unwrap_or(NonZeroUsize::new(100).unwrap()),
65 )),
66 state_root: RwLock::new([0u8; 32]),
67 }
68 }
69
70 pub fn get(&self, query: &str) -> Option<Vec<Uuid>> {
72 let hash = Self::hash_query(query);
73 self.cache.write().ok()?.get(&hash).cloned()
74 }
75
76 pub fn put(&self, query: &str, results: Vec<Uuid>) {
78 let hash = Self::hash_query(query);
79 if let Ok(mut cache) = self.cache.write() {
80 cache.put(hash, results);
81 }
82 }
83
84 pub fn is_valid(&self, current_root: &[u8; 32]) -> bool {
86 if let Ok(root) = self.state_root.read() {
87 *root == *current_root
88 } else {
89 false
90 }
91 }
92
93 pub fn invalidate(&self, new_root: [u8; 32]) {
95 if let Ok(mut cache) = self.cache.write() {
96 cache.clear();
97 }
98 if let Ok(mut root) = self.state_root.write() {
99 *root = new_root;
100 }
101 }
102
103 fn hash_query(query: &str) -> [u8; 32] {
105 *blake3::hash(query.as_bytes()).as_bytes()
106 }
107}
108
impl Default for QueryCache {
    /// Default cache holds up to 100 entries.
    fn default() -> Self {
        Self::new(100)
    }
}
114
/// A single ranked hit returned by the search APIs.
#[derive(Debug, Clone, serde::Serialize)]
pub struct SearchResult {
    /// The matching chunk.
    pub chunk: Chunk,
    /// Relevance score. The scale depends on the search mode (raw vector
    /// similarity, lexical rank, normalized RRF score, or a 1.0 placeholder
    /// for cache hits / document listings).
    pub score: f32,
    /// Path of the document the chunk belongs to.
    pub doc_path: String,
}
125
/// Output of [`QueryEngine::generate_answer`].
#[derive(Debug, Clone, serde::Serialize)]
pub struct GenerationResult {
    /// The model's answer text.
    pub answer: String,
    /// The formatted context that was supplied to the model.
    pub context: String,
    /// End-to-end latency (search + assembly + generation) in milliseconds.
    pub latency_ms: u64,
}
136
/// A span of the response text attributed to a specific context chunk.
#[derive(Debug, Clone, serde::Serialize)]
pub struct Citation {
    /// Chunk the span was matched against.
    pub chunk_id: Uuid,
    /// Byte range (start, end) within the response string.
    pub span: (usize, usize),
    /// Fraction of the response's 5-gram windows found in the chunk (0..=1).
    pub confidence: f32,
}
149
/// Result of checking a generated response against its source context.
#[derive(Debug, Clone, serde::Serialize)]
pub struct ValidationResult {
    /// True when no warnings fired, or when the response explicitly admits
    /// the information is missing from the context.
    pub is_valid: bool,
    /// Human-readable issues (hallucination phrases, low coverage).
    pub warnings: Vec<String>,
    /// Fraction of response bytes covered by citation spans (0..=1).
    pub citation_coverage: f32,
    /// Extracted citations, sorted by descending confidence.
    pub citations: Vec<Citation>,
}
164
/// Phrases suggesting the model is answering from prior knowledge rather
/// than the supplied context; matched case-insensitively against the
/// lowercased response in [`validate_response`].
const HALLUCINATION_PHRASES: &[&str] = &[
    "from my knowledge",
    "i recall that",
    "as far as i know",
    "i believe that",
    "in my experience",
    "typically",
    "generally speaking",
    "it's commonly known",
    "as everyone knows",
    "i think that",
    "probably",
    "most likely",
    "i assume",
    "based on my understanding",
    "from what i've learned",
];
183
184pub fn extract_citations(response: &str, context: &AssembledContext) -> Vec<Citation> {
188 let mut citations = Vec::new();
189 let response_lower = response.to_lowercase();
190 let response_words: Vec<&str> = response_lower.split_whitespace().collect();
191
192 if response_words.len() < 5 {
193 return citations;
194 }
195
196 for chunk in &context.chunks {
197 let chunk_lower = chunk.text.to_lowercase();
198 let chunk_words: Vec<&str> = chunk_lower.split_whitespace().collect();
199
200 if chunk_words.len() < 5 {
201 continue;
202 }
203
204 let mut overlap_count = 0;
206 let mut matched_positions: Vec<usize> = Vec::new();
207
208 for i in 0..=response_words.len().saturating_sub(5) {
209 let response_ngram: Vec<&str> = response_words[i..i + 5].to_vec();
210
211 for j in 0..=chunk_words.len().saturating_sub(5) {
212 let chunk_ngram: Vec<&str> = chunk_words[j..j + 5].to_vec();
213
214 if response_ngram == chunk_ngram {
215 overlap_count += 1;
216 matched_positions.push(i);
217 break;
218 }
219 }
220 }
221
222 if overlap_count > 0 {
223 let max_ngrams = (response_words.len().saturating_sub(4)).max(1);
225 let confidence = (overlap_count as f32) / (max_ngrams as f32);
226
227 let start_pos = matched_positions.first().copied().unwrap_or(0);
229 let end_pos = matched_positions.last().copied().unwrap_or(0) + 5;
230
231 let mut byte_start = 0;
233 let mut byte_end = response.len();
234
235 let mut word_idx = 0;
236 for (i, c) in response.char_indices() {
237 if c.is_whitespace() {
238 word_idx += 1;
239 if word_idx == start_pos {
240 byte_start = i + 1;
241 }
242 if word_idx == end_pos.min(response_words.len()) {
243 byte_end = i;
244 break;
245 }
246 }
247 }
248
249 citations.push(Citation {
250 chunk_id: chunk.chunk_id,
251 span: (byte_start, byte_end),
252 confidence,
253 });
254 }
255 }
256
257 citations.sort_by(|a, b| b.confidence.partial_cmp(&a.confidence).unwrap_or(std::cmp::Ordering::Equal));
259
260 citations
261}
262
263pub fn validate_response(response: &str, context: &AssembledContext) -> ValidationResult {
267 let mut warnings = Vec::new();
268
269 let citations = extract_citations(response, context);
271
272 let total_response_len = response.len() as f32;
274 let mut covered_bytes = 0usize;
275
276 for citation in &citations {
277 covered_bytes += citation.span.1.saturating_sub(citation.span.0);
278 }
279
280 let citation_coverage = if total_response_len > 0.0 {
281 (covered_bytes as f32 / total_response_len).min(1.0)
282 } else {
283 0.0
284 };
285
286 let response_lower = response.to_lowercase();
288 for phrase in HALLUCINATION_PHRASES {
289 if response_lower.contains(phrase) {
290 warnings.push(format!("Response contains hallucination indicator: '{}'", phrase));
291 }
292 }
293
294 if citation_coverage < 0.3 && !response.is_empty() {
296 warnings.push(format!(
297 "Low citation coverage: {:.1}% (threshold: 30%)",
298 citation_coverage * 100.0
299 ));
300 }
301
302 let good_phrases = ["information is missing", "not found in the context", "cannot find"];
304 let claims_missing = good_phrases.iter().any(|p| response_lower.contains(p));
305
306 let is_valid = warnings.is_empty() || claims_missing;
308
309 ValidationResult {
310 is_valid,
311 warnings,
312 citation_coverage,
313 citations,
314 }
315}
316
/// Abstraction over a text-generation backend (e.g. a local LLM server).
#[async_trait::async_trait]
pub trait IntelligenceEngine: Send + Sync {
    /// Generate an answer to `query` grounded strictly in `context`.
    async fn generate(&self, context: &AssembledContext, query: &str) -> Result<String>;
}
327
/// [`IntelligenceEngine`] backed by an Ollama HTTP server.
pub struct OllamaGenerator {
    // Server base URL; "/api/generate" is appended per request.
    base_url: String,
    // Ollama model name passed in the request payload.
    model: String,
}
333
impl OllamaGenerator {
    /// Create a generator targeting `model` on the Ollama server at `base_url`.
    pub fn new(base_url: String, model: String) -> Self {
        Self { base_url, model }
    }
}
340
#[async_trait::async_trait]
impl IntelligenceEngine for OllamaGenerator {
    /// Send a context-grounded prompt to Ollama's `/api/generate` endpoint
    /// (non-streaming) and return the model's text response.
    ///
    /// # Errors
    /// Returns `CPError::Embedding` for client/transport failures and
    /// `CPError::Parse` for malformed response bodies.
    async fn generate(&self, context: &AssembledContext, query: &str) -> Result<String> {
        let formatted_context = ContextAssembler::format(context);

        // ChatML-style prompt: the system message confines the model to the
        // provided context and prescribes a fixed "missing" sentinel answer.
        let prompt = format!(
            "<|im_start|>system\nYou are a context reading machine. You do not have knowledge of the outside world.\n- Read the Context below carefully.\n- If the answer to the Query is in the Context, output it.\n- If the answer is NOT in the Context, say 'Information is missing from the substrate.' and nothing else.\n- Do not make up facts.\n<|im_end|>\n<|im_start|>user\nContext:\n{}\n\nQuery: {}<|im_end|>\n<|im_start|>assistant\n",
            formatted_context, query
        );

        // Generous 120s timeout: local model generation can be slow.
        let client = reqwest::Client::builder()
            .timeout(std::time::Duration::from_secs(120))
            .build()
            // NOTE(review): reuses the Embedding error variant for HTTP
            // failures — consider a dedicated transport variant.
            .map_err(|e| CPError::Embedding(format!("Failed to create HTTP client: {}", e)))?;
        let payload = serde_json::json!({
            "model": self.model,
            "prompt": prompt,
            "stream": false
        });

        let url = format!("{}/api/generate", self.base_url);
        let res = client
            .post(&url)
            .json(&payload)
            .send()
            .await
            .map_err(|e| CPError::Embedding(format!("Ollama request failed: {}", e)))?;

        let json: serde_json::Value = res
            .json()
            .await
            .map_err(|e| CPError::Parse(e.to_string()))?;

        // Non-streaming Ollama responses carry the text in "response".
        let answer = json["response"]
            .as_str()
            .ok_or_else(|| CPError::Parse("Invalid Ollama response".into()))?
            .to_string();

        Ok(answer)
    }
}
383
/// Search and retrieval front-end over the graph store, with an optional
/// LLM generation backend and a Merkle-root-validated query cache.
pub struct QueryEngine {
    // Shared, mutex-guarded graph/vector/FTS store.
    graph: Arc<Mutex<cp_graph::GraphStore>>,
    // Embedding model used to vectorize queries.
    embedder: Arc<cp_embeddings::EmbeddingEngine>,
    // Optional generation backend; required by `generate_answer`.
    intelligence: Option<Box<dyn IntelligenceEngine>>,
    // Query-result cache, invalidated whenever the state root changes.
    cache: QueryCache,
}
392
impl QueryEngine {
    /// Build a query engine over `graph` using `embedder`, with no
    /// intelligence engine and a default-capacity query cache.
    pub fn new(
        graph: Arc<Mutex<cp_graph::GraphStore>>,
        embedder: Arc<cp_embeddings::EmbeddingEngine>,
    ) -> Self {
        Self {
            graph,
            embedder,
            intelligence: None,
            cache: QueryCache::default(),
        }
    }

    /// Like [`QueryEngine::new`] but with an explicit query-cache capacity.
    pub fn with_cache_capacity(
        graph: Arc<Mutex<cp_graph::GraphStore>>,
        embedder: Arc<cp_embeddings::EmbeddingEngine>,
        cache_capacity: usize,
    ) -> Self {
        Self {
            graph,
            embedder,
            intelligence: None,
            cache: QueryCache::new(cache_capacity),
        }
    }

    /// Builder-style setter for the generation backend.
    pub fn with_intelligence(mut self, intelligence: Box<dyn IntelligenceEngine>) -> Self {
        self.intelligence = Some(intelligence);
        self
    }

    /// In-place setter for the generation backend.
    pub fn set_intelligence(&mut self, intelligence: Box<dyn IntelligenceEngine>) {
        self.intelligence = Some(intelligence);
    }

    /// Hybrid search: runs semantic (vector) and lexical (FTS) search and
    /// fuses the two rankings with Reciprocal Rank Fusion, returning at
    /// most `k` results.
    pub fn search(&self, query: &str, k: usize) -> Result<Vec<SearchResult>> {
        info!("Hybrid search for: '{}'", query);

        let query_vec = self
            .embedder
            .embed(query)
            .map_err(|e| CPError::Embedding(format!("Failed to embed query: {}", e)))?;

        let semantic_results = {
            let graph = self.graph.lock().unwrap();
            graph.search(&query_vec, k)?
        };

        let lexical_results = {
            let graph = self.graph.lock().unwrap();
            // Multi-word queries are quoted so FTS treats them as a phrase;
            // embedded quotes are stripped to keep the FTS syntax valid.
            let fts_query = if query.contains(' ') {
                format!("\"{}\"", query.replace('"', ""))
            } else {
                query.to_string()
            };
            // Lexical search is best-effort: on failure, fall back to
            // semantic-only instead of erroring the whole query.
            graph.search_lexical(&fts_query, k).unwrap_or_else(|e| {
                warn!("Lexical search failed: {}. Falling back to semantic only.", e);
                Vec::new()
            })
        };

        // RRF: score(d) = sum over rankings of 1 / (K + rank). Computed in
        // scaled integer arithmetic to keep fusion deterministic.
        const RRF_K: u64 = 60;
        const RRF_SCALE: u64 = 1_000_000;

        let mut scores: std::collections::HashMap<Uuid, u64> = std::collections::HashMap::new();

        {
            let graph = self.graph.lock().unwrap();

            // Semantic hits are keyed by embedding id; map back to chunk ids.
            for (i, (emb_id, _)) in semantic_results.iter().enumerate() {
                if let Ok(Some(chunk_id)) = graph.get_chunk_id_for_embedding(*emb_id) {
                    let score = RRF_SCALE / (RRF_K + i as u64);
                    *scores.entry(chunk_id).or_insert(0) += score;
                }
            }

            for (i, (chunk_id, _)) in lexical_results.iter().enumerate() {
                let score = RRF_SCALE / (RRF_K + i as u64);
                *scores.entry(*chunk_id).or_insert(0) += score;
            }
        }

        // Sort by fused score desc; ties broken by chunk id for determinism.
        let mut fused: Vec<(Uuid, u64)> = scores.into_iter().collect();
        fused.sort_by(|a, b| {
            b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0))
        });
        fused.truncate(k);

        let mut search_results = Vec::with_capacity(fused.len());
        let graph = self.graph.lock().unwrap();

        for (chunk_id, fused_score) in fused {
            // Chunks/documents may have been deleted since indexing; skip.
            let chunk = match graph.get_chunk(chunk_id)? {
                Some(c) => c,
                None => continue,
            };

            let doc = match graph.get_document(chunk.doc_id)? {
                Some(d) => d,
                None => continue,
            };

            // NOTE(review): normalizes against 2 * RRF_SCALE, but the max
            // attainable fused score is 2 * RRF_SCALE / RRF_K, so values
            // stay well below 1.0 — confirm the intended scale.
            let normalized_score = fused_score as f32 / (RRF_SCALE * 2) as f32;

            search_results.push(SearchResult {
                chunk,
                score: normalized_score,
                doc_path: doc.path.to_string_lossy().to_string(),
            });
        }

        Ok(search_results)
    }

    /// Pure vector-similarity search: embed `query` and return the top `k`
    /// chunks by embedding score, with no lexical component.
    pub fn search_semantic(&self, query: &str, k: usize) -> Result<Vec<SearchResult>> {
        info!("Semantic search for: '{}'", query);

        let query_vec = self
            .embedder
            .embed(query)
            .map_err(|e| CPError::Embedding(format!("Failed to embed query: {}", e)))?;

        let raw_results = {
            let graph = self.graph.lock().unwrap();
            graph.search(&query_vec, k)?
        };

        let mut search_results = Vec::with_capacity(raw_results.len());
        let graph = self.graph.lock().unwrap();

        for (emb_id, score) in raw_results {
            // Silently skip hits whose chunk or document no longer exists.
            if let Some(chunk_id) = graph.get_chunk_id_for_embedding(emb_id)? {
                if let Some(chunk) = graph.get_chunk(chunk_id)? {
                    if let Some(doc) = graph.get_document(chunk.doc_id)? {
                        search_results.push(SearchResult {
                            chunk,
                            score,
                            doc_path: doc.path.to_string_lossy().to_string(),
                        });
                    }
                }
            }
        }

        Ok(search_results)
    }

    /// Pure full-text search over chunk text, returning up to `k` results.
    pub fn search_lexical(&self, query: &str, k: usize) -> Result<Vec<SearchResult>> {
        info!("Lexical search for: '{}'", query);

        let raw_results = {
            let graph = self.graph.lock().unwrap();
            // Same phrase-quoting rule as in hybrid `search`.
            let fts_query = if query.contains(' ') {
                format!("\"{}\"", query.replace('"', ""))
            } else {
                query.to_string()
            };
            graph.search_lexical(&fts_query, k)?
        };

        let mut search_results = Vec::with_capacity(raw_results.len());
        let graph = self.graph.lock().unwrap();

        for (chunk_id, score) in raw_results {
            if let Some(chunk) = graph.get_chunk(chunk_id)? {
                if let Some(doc) = graph.get_document(chunk.doc_id)? {
                    search_results.push(SearchResult {
                        chunk,
                        score,
                        doc_path: doc.path.to_string_lossy().to_string(),
                    });
                }
            }
        }

        Ok(search_results)
    }

    /// Hybrid search restricted to documents satisfying ALL `filters`.
    ///
    /// Over-fetches (k * 3) from the unfiltered search and filters the
    /// results, so fewer than `k` hits may come back when matches are sparse.
    pub fn search_filtered(&self, query: &str, k: usize, filters: &[Filter]) -> Result<Vec<SearchResult>> {
        info!("Filtered search for: '{}' with {} filters", query, filters.len());

        // Resolve the allowed document set up front (filters AND together).
        let matching_doc_ids: std::collections::HashSet<Uuid> = {
            let graph = self.graph.lock().unwrap();
            let all_docs = graph.get_all_documents()?;

            all_docs
                .into_iter()
                .filter(|doc| filters.iter().all(|f| f.matches(doc)))
                .map(|doc| doc.id)
                .collect()
        };

        if matching_doc_ids.is_empty() {
            info!("No documents match filters");
            return Ok(Vec::new());
        }

        let all_results = self.search(query, k * 3)?;
        let filtered_results: Vec<SearchResult> = all_results
            .into_iter()
            .filter(|r| matching_doc_ids.contains(&r.chunk.doc_id))
            .take(k)
            .collect();

        info!("Filtered search returned {} results", filtered_results.len());
        Ok(filtered_results)
    }

    /// Hybrid search fronted by the LRU query cache, which is validated
    /// against the store's current Merkle root before use.
    ///
    /// Cache hits return `score: 1.0` placeholders because only chunk ids
    /// (not scores) are cached.
    pub fn search_cached(&self, query: &str, k: usize) -> Result<Vec<SearchResult>> {
        let current_root = {
            let graph = self.graph.lock().unwrap();
            graph.compute_merkle_root()?
        };

        // Any substrate change invalidates every cached entry.
        if !self.cache.is_valid(&current_root) {
            self.cache.invalidate(current_root);
        }

        if let Some(chunk_ids) = self.cache.get(query) {
            info!("Cache hit for query: '{}'", query);

            let graph = self.graph.lock().unwrap();
            let mut results = Vec::new();

            for chunk_id in chunk_ids.iter().take(k) {
                if let Some(chunk) = graph.get_chunk(*chunk_id)? {
                    if let Some(doc) = graph.get_document(chunk.doc_id)? {
                        results.push(SearchResult {
                            chunk,
                            score: 1.0, // original score not cached
                            doc_path: doc.path.to_string_lossy().to_string(),
                        });
                    }
                }
            }

            return Ok(results);
        }

        let results = self.search(query, k)?;

        let chunk_ids: Vec<Uuid> = results.iter().map(|r| r.chunk.id).collect();
        self.cache.put(query, chunk_ids);

        Ok(results)
    }

    /// Recompute the current Merkle root and reset the query cache to it.
    pub fn invalidate_cache(&self) -> Result<()> {
        let root = {
            let graph = self.graph.lock().unwrap();
            graph.compute_merkle_root()?
        };
        self.cache.invalidate(root);
        Ok(())
    }

    /// Return every chunk belonging to `doc_id`, each with a fixed 1.0 score.
    ///
    /// # Errors
    /// Returns `CPError::Database` when the document does not exist.
    pub fn get_chunks_for_document(&self, doc_id: Uuid) -> Result<Vec<SearchResult>> {
        let graph = self.graph.lock().unwrap();

        let doc = graph
            .get_document(doc_id)?
            .ok_or_else(|| CPError::Database(format!("Doc {} not found", doc_id)))?;

        let chunks = graph.get_chunks_for_doc(doc_id)?;

        Ok(chunks.into_iter().map(|c| SearchResult {
            chunk: c,
            score: 1.0, // listing, not ranking: placeholder score
            doc_path: doc.path.to_string_lossy().to_string(),
        }).collect())
    }

    /// Shared handle to the underlying graph store.
    pub fn graph(&self) -> Arc<Mutex<cp_graph::GraphStore>> {
        self.graph.clone()
    }

    /// Full RAG pipeline: hybrid-search the top 5 chunks, assemble them
    /// into a token-budgeted context, and ask the configured intelligence
    /// engine for an answer.
    ///
    /// # Errors
    /// Returns `CPError::NotFound` when no intelligence engine is set.
    pub async fn generate_answer(&self, query: &str) -> Result<GenerationResult> {
        let start = std::time::Instant::now();
        info!("Generating answer for: '{}'", query);

        let results = self.search(query, 5)?;

        // ~2000-token budget for the assembled context.
        let assembler = ContextAssembler::with_budget(2000);
        let scored_chunks: Vec<ScoredChunk> = results
            .iter()
            .map(|r| ScoredChunk {
                chunk: r.chunk.clone(),
                score: r.score,
                document_path: r.doc_path.clone(),
            })
            .collect();

        let state_root = {
            let graph = self.graph.lock().unwrap();
            graph.compute_merkle_root()?
        };

        let assembled_context = assembler.assemble(scored_chunks, query, state_root);

        let answer = if let Some(ref engine) = self.intelligence {
            engine.generate(&assembled_context, query).await?
        } else {
            return Err(CPError::NotFound("Intelligence engine not configured".into()));
        };

        Ok(GenerationResult {
            answer,
            context: ContextAssembler::format(&assembled_context),
            latency_ms: start.elapsed().as_millis() as u64,
        })
    }

    /// Build a signed, verifiable receipt for a query: hashes of the query
    /// and the assembled context, the substrate state root, Merkle inclusion
    /// proofs for each result chunk, and the full source excerpts — all
    /// signed by `identity`.
    pub fn generate_proof_receipt(
        &self,
        query: &str,
        search_results: &[SearchResult],
        identity: &cp_sync::DeviceIdentity,
    ) -> Result<cp_core::ProofReceipt> {
        let query_hash = *blake3::hash(query.as_bytes()).as_bytes();

        // Reassemble the context exactly as presented, then hash it.
        let assembler = ContextAssembler::with_budget(4000);
        let scored_chunks: Vec<ScoredChunk> = search_results
            .iter()
            .map(|r| ScoredChunk {
                chunk: r.chunk.clone(),
                score: r.score,
                document_path: r.doc_path.clone(),
            })
            .collect();

        let state_root = {
            let graph = self.graph.lock().unwrap();
            graph.compute_merkle_root()?
        };

        let assembled = assembler.assemble(scored_chunks, query, state_root);
        let context_string = ContextAssembler::format(&assembled);
        let context_hash = *blake3::hash(context_string.as_bytes()).as_bytes();

        // Canonical (sorted) chunk-hash list and the Merkle root over it.
        let (sorted_chunk_ids, sorted_chunk_hashes, chunk_tree_root) = {
            let graph = self.graph.lock().unwrap();
            let sorted = graph.get_sorted_chunk_hashes()?;
            let hashes: Vec<[u8; 32]> = sorted.iter().map(|(_, h)| *h).collect();
            let root = cp_core::proof::compute_chunk_tree_root(&hashes);
            (sorted, hashes, root)
        };

        let mut chunk_proofs = Vec::new();
        let mut sources = Vec::new();

        for result in search_results {
            let chunk_id_bytes = *result.chunk.id.as_bytes();

            // Inclusion proof only when the chunk is present in the tree.
            if let Some(idx) = sorted_chunk_ids.iter().position(|(id, _)| *id == chunk_id_bytes) {
                let proof = cp_core::proof::build_chunk_proof(
                    chunk_id_bytes,
                    result.chunk.text_hash,
                    idx,
                    &sorted_chunk_hashes,
                );
                chunk_proofs.push(proof);
            }

            sources.push(cp_core::SourceRef {
                document_path: result.doc_path.clone(),
                chunk_id: chunk_id_bytes,
                chunk_text: result.chunk.text.clone(),
                chunk_sequence: result.chunk.sequence,
                relevance_score: result.score,
            });
        }

        // UTC timestamp rendered without external date/time crates.
        let now = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap_or_default();
        let secs = now.as_secs();
        let timestamp = format_unix_timestamp(secs);

        // Sign over the receipt with a zeroed signature field, then write
        // the signature back in.
        let mut receipt = cp_core::ProofReceipt {
            version: 1,
            query: query.to_string(),
            query_hash,
            timestamp,
            context_hash,
            state_root,
            chunk_tree_root,
            chunk_proofs,
            sources,
            signature: [0u8; 64],
            signer_public_key: identity.public_key,
            device_id: identity.device_id,
            git: None,
        };

        let sig = identity.sign(&receipt.signing_bytes());
        receipt.signature = sig;

        Ok(receipt)
    }

    /// Chat is not yet implemented; use [`QueryEngine::generate_answer`].
    pub fn chat(&self, _query: &str, _history: &[Message]) -> Result<String> {
        Err(CPError::NotFound("Use generate_answer for Phase 2 initial integration".into()))
    }
}
856
/// A single turn in a chat conversation.
#[derive(Debug, Clone)]
pub struct Message {
    /// Who produced this message.
    pub role: Role,
    /// Message text.
    pub content: String,
}
863
/// Speaker role for a chat [`Message`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Role {
    User,
    Assistant,
    System,
}
871
/// Render a Unix timestamp (seconds since 1970-01-01T00:00:00Z) as an
/// RFC 3339 UTC string, e.g. "2023-11-14T22:13:20Z".
fn format_unix_timestamp(secs: u64) -> String {
    let days = secs / 86400;
    let rem = secs % 86400;
    let (year, month, day) = days_to_date(days);
    format!(
        "{:04}-{:02}-{:02}T{:02}:{:02}:{:02}Z",
        year,
        month,
        day,
        rem / 3600,
        (rem % 3600) / 60,
        rem % 60
    )
}

/// Convert a day count since 1970-01-01 to a (year, month, day) civil date.
///
/// Uses Howard Hinnant's `civil_from_days` algorithm, specialized to
/// non-negative day counts (dates on or after the Unix epoch).
fn days_to_date(days: u64) -> (u64, u64, u64) {
    // Shift the epoch from 1970-01-01 to 0000-03-01 so leap days land at
    // the end of the (March-based) year.
    let z = days + 719468;
    let era = z / 146097; // 400-year eras
    let doe = z % 146097; // day-of-era, [0, 146096]
    let yoe = (doe - doe / 1460 + doe / 36524 - doe / 146096) / 365; // year-of-era
    let doy = doe - (365 * yoe + yoe / 4 - yoe / 100); // day-of-year, March-based
    let mp = (5 * doy + 2) / 153; // month index, March = 0
    let day = doy - (153 * mp + 2) / 5 + 1;
    let month = if mp < 10 { mp + 3 } else { mp - 9 };
    let year = yoe + era * 400 + if month <= 2 { 1 } else { 0 };
    (year, month, day)
}
899
#[cfg(test)]
mod tests {
    use super::*;
    use cp_core::{Document, Chunk};
    use std::sync::{Arc, Mutex};
    use tempfile::TempDir;

    // Chunks attached to a document are returned verbatim by the engine.
    #[tokio::test]
    async fn test_get_chunks_for_document() {
        let temp = TempDir::new().unwrap();
        let db_path = temp.path().join("test.db");
        let mut graph = cp_graph::GraphStore::open(db_path.to_str().unwrap()).unwrap();

        let doc = Document::new("test.md".into(), b"Hello world", 100);
        graph.insert_document(&doc).unwrap();

        let chunk = Chunk {
            id: Uuid::new_v4(),
            doc_id: doc.id,
            text: "Hello world".to_string(),
            byte_offset: 0,
            byte_length: 11,
            sequence: 0,
            text_hash: [0; 32],
        };
        graph.insert_chunk(&chunk).unwrap();

        let embedder = Arc::new(cp_embeddings::EmbeddingEngine::new().unwrap());
        let qe = QueryEngine::new(Arc::new(Mutex::new(graph)), embedder);

        let results = qe.get_chunks_for_document(doc.id).unwrap();
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].chunk.text, "Hello world");
    }

    // Hybrid search finds both the exact phrase (lexical) and a paraphrase
    // (semantic) of the same chunk.
    #[tokio::test]
    async fn test_hybrid_search() {
        let temp = TempDir::new().unwrap();
        let db_path = temp.path().join("test_hybrid.db");
        let mut graph = cp_graph::GraphStore::open(db_path.to_str().unwrap()).unwrap();

        let doc = Document::new("test.md".into(), b"The quick brown fox jumps over the lazy dog", 100);
        graph.insert_document(&doc).unwrap();

        let chunk = Chunk {
            id: Uuid::new_v4(),
            doc_id: doc.id,
            text: "The quick brown fox jumps over the lazy dog".to_string(),
            byte_offset: 0,
            byte_length: 43,
            sequence: 0,
            text_hash: [0; 32],
        };
        graph.insert_chunk(&chunk).unwrap();

        let embedder = Arc::new(cp_embeddings::EmbeddingEngine::new().unwrap());
        let vec = embedder.embed(&chunk.text).unwrap();
        let emb = cp_core::Embedding::new(chunk.id, &vec, embedder.model_hash(), 0);
        graph.insert_embedding(&emb).unwrap();

        let qe = QueryEngine::new(Arc::new(Mutex::new(graph)), embedder);

        // Exact-phrase query should hit via the lexical leg.
        let results = qe.search("quick brown fox", 5).unwrap();
        assert!(!results.is_empty());
        assert!(results[0].chunk.text.contains("quick brown fox"));

        // Paraphrase query should still hit via the semantic leg.
        let results_sem = qe.search("fast auburn canine", 5).unwrap();
        assert!(!results_sem.is_empty());
        assert!(results_sem[0].chunk.text.contains("quick brown fox"));
    }

    // Compares lexical-only, vector-only and hybrid hit counts on a tiny
    // three-document corpus; hybrid must surface both the exact match and
    // the paraphrase.
    #[tokio::test]
    async fn test_search_comparison_proof() {
        let temp = TempDir::new().unwrap();
        let db_path = temp.path().join("comparison_proof.db");
        let mut graph = cp_graph::GraphStore::open(db_path.to_str().unwrap()).unwrap();

        let t1 = "The quick brown fox jumps over the lazy dog";
        let t2 = "Artificial intelligence is transforming the modern world";
        let t3 = "A fast auburn canine leaps across an idle hound";
        let texts = vec![t1, t2, t3];
        for (i, text) in texts.iter().enumerate() {
            let doc = Document::new(format!("doc_{}.md", i).into(), text.as_bytes(), 100);
            graph.insert_document(&doc).unwrap();
            let chunk = Chunk {
                id: Uuid::new_v4(),
                doc_id: doc.id,
                text: text.to_string(),
                byte_offset: 0,
                byte_length: text.len() as u64,
                sequence: 0,
                text_hash: [0; 32],
            };
            graph.insert_chunk(&chunk).unwrap();
            let embedder = Arc::new(cp_embeddings::EmbeddingEngine::new().unwrap());
            let vec = embedder.embed(text).unwrap();
            let emb = cp_core::Embedding::new(chunk.id, &vec, embedder.model_hash(), 0);
            graph.insert_embedding(&emb).unwrap();
        }

        let embedder = Arc::new(cp_embeddings::EmbeddingEngine::new().unwrap());
        let qe = QueryEngine::new(Arc::new(Mutex::new(graph)), embedder);

        let query = "fox";

        // Lexical: only the literal "fox" document matches.
        let lexical = {
            let graph_lock = qe.graph();
            let g = graph_lock.lock().unwrap();
            g.search_lexical(query, 5).unwrap()
        };
        assert_eq!(lexical.len(), 1);

        // Vector: the paraphrase should also rank.
        let vector = {
            let graph_lock = qe.graph();
            let g = graph_lock.lock().unwrap();
            let e = cp_embeddings::EmbeddingEngine::new().unwrap();
            let q_vec = e.embed(query).unwrap();
            g.search(&q_vec, 5).unwrap()
        };
        assert!(vector.len() >= 2);

        let hybrid = qe.search(query, 5).unwrap();

        println!("\n--- SEARCH PROOF FOR '{}' ---", query);
        println!("LEXICAL HITS: {}", lexical.len());
        println!("VECTOR HITS: {}", vector.len());
        println!("HYBRID HITS: {}", hybrid.len());

        // Hybrid must include both the exact match and the paraphrase.
        let texts_found: Vec<String> = hybrid.iter().map(|r| r.chunk.text.clone()).collect();
        assert!(texts_found.contains(&t1.to_string()));
        assert!(texts_found.contains(&t3.to_string()));
    }

    // MIME-type filter matches only documents of that type.
    #[test]
    fn test_filter_by_mime_type() {
        use cp_core::Document;
        use std::path::PathBuf;

        let doc_md = Document::new(PathBuf::from("test.md"), b"content", 1000);
        let doc_pdf = Document::new(PathBuf::from("test.pdf"), b"pdf content", 1000);

        let filter = Filter::MimeType("text/markdown".to_string());

        assert!(filter.matches(&doc_md));
        assert!(!filter.matches(&doc_pdf));
    }

    // Glob path filter matches only paths under the pattern.
    #[test]
    fn test_filter_by_path_glob() {
        use cp_core::Document;
        use std::path::PathBuf;

        let doc1 = Document::new(PathBuf::from("docs/readme.md"), b"content", 1000);
        let doc2 = Document::new(PathBuf::from("src/main.rs"), b"code", 1000);

        let filter = Filter::DocumentPath("docs/*.md".to_string());

        assert!(filter.matches(&doc1));
        assert!(!filter.matches(&doc2));
    }

    // ModifiedAfter/ModifiedBefore are strict comparisons on mtime.
    #[test]
    fn test_filter_by_modified_time() {
        use cp_core::Document;
        use std::path::PathBuf;

        let old_doc = Document::new(PathBuf::from("old.md"), b"content", 1000);
        let new_doc = Document::new(PathBuf::from("new.md"), b"content", 2000);

        let filter_after = Filter::ModifiedAfter(1500);
        let filter_before = Filter::ModifiedBefore(1500);

        assert!(!filter_after.matches(&old_doc));
        assert!(filter_after.matches(&new_doc));

        assert!(filter_before.matches(&old_doc));
        assert!(!filter_before.matches(&new_doc));
    }

    // A response that quotes the chunk should produce a citation with
    // positive confidence.
    #[test]
    fn test_citation_extraction() {
        use cp_core::{ContextChunk, ContextMetadata};

        let context = AssembledContext {
            chunks: vec![ContextChunk {
                chunk_id: Uuid::new_v4(),
                document_path: "test.md".to_string(),
                text: "The quick brown fox jumps over the lazy dog".to_string(),
                score: 1.0,
                sequence: 0,
            }],
            total_tokens: 10,
            truncated: false,
            metadata: ContextMetadata {
                query_hash: [0u8; 32],
                state_root: [0u8; 32],
            },
        };

        let response = "As mentioned, the quick brown fox jumps over the lazy dog in the story.";
        let citations = extract_citations(response, &context);

        assert!(!citations.is_empty(), "Should find citations for overlapping text");
        assert!(citations[0].confidence > 0.0);
    }

    // Hallucination phrases trigger warnings; admitting missing info keeps
    // the response valid.
    #[test]
    fn test_hallucination_detection() {
        use cp_core::{ContextChunk, ContextMetadata};

        let context = AssembledContext {
            chunks: vec![ContextChunk {
                chunk_id: Uuid::new_v4(),
                document_path: "test.md".to_string(),
                text: "The capital of France is Paris".to_string(),
                score: 1.0,
                sequence: 0,
            }],
            total_tokens: 10,
            truncated: false,
            metadata: ContextMetadata {
                query_hash: [0u8; 32],
                state_root: [0u8; 32],
            },
        };

        let bad_response = "From my knowledge, I believe that Paris is a beautiful city.";
        let result = validate_response(bad_response, &context);

        assert!(!result.warnings.is_empty(), "Should detect hallucination phrases");
        assert!(result.warnings.iter().any(|w| w.contains("hallucination")));

        let good_response = "Information is missing from the substrate.";
        let result2 = validate_response(good_response, &context);

        assert!(result2.is_valid, "Should be valid when admitting missing info");
    }

    // End-to-end check against a local test corpus; skipped when the corpus
    // directory is not present on this machine.
    #[tokio::test]
    async fn test_real_corpus_proof() {
        let temp = TempDir::new().unwrap();
        let db_path = temp.path().join("real_corpus.db");
        let mut graph = cp_graph::GraphStore::open(db_path.to_str().unwrap()).unwrap();

        // NOTE(review): machine-specific absolute path — consider an env var.
        let corpus_dir = std::path::PathBuf::from("/Users/nadeem/dev/CP/test_corpus");

        if !corpus_dir.exists() {
            println!("Skipping test: corpus directory not found at {:?}", corpus_dir);
            return;
        }

        let embedder = Arc::new(cp_embeddings::EmbeddingEngine::new().unwrap());

        let files = vec!["zk.md", "ethereum.md", "random.md", "zksnark.md", "lexical_gap.md", "adversarial.md"];

        for file_name in files {
            let path = corpus_dir.join(file_name);
            if !path.exists() { continue; }

            let content = std::fs::read_to_string(&path).unwrap();
            let doc = Document::new(path.clone(), content.as_bytes(), 0);
            graph.insert_document(&doc).unwrap();

            // One chunk per file — enough for ranking checks.
            let chunk = Chunk {
                id: Uuid::new_v4(),
                doc_id: doc.id,
                text: content.clone(),
                byte_offset: 0,
                byte_length: content.len() as u64,
                sequence: 0,
                text_hash: [0; 32],
            };
            graph.insert_chunk(&chunk).unwrap();

            let vec = embedder.embed(&content).unwrap();
            let emb = cp_core::Embedding::new(chunk.id, &vec, embedder.model_hash(), 0);
            graph.insert_embedding(&emb).unwrap();
        }

        let qe = QueryEngine::new(Arc::new(Mutex::new(graph)), embedder);

        // An exotic identifier should be found lexically despite no
        // semantic similarity.
        let q1 = "calculate_hyper_parameter_v7";
        let res1 = qe.search(q1, 1).unwrap();
        println!("\n--- QUERY: '{}' ---", q1);
        for (i, r) in res1.iter().enumerate() {
            println!("Rank {}: [Score: {:.4}] {}", i+1, r.score, r.doc_path);
        }
        assert!(res1[0].doc_path.contains("lexical_gap.md"));

        // A conceptual query should surface the zk documents semantically.
        let q2 = "cryptographic privacy statement validity";
        let res2 = qe.search(q2, 3).unwrap();
        println!("\n--- QUERY: '{}' ---", q2);
        for (i, r) in res2.iter().enumerate() {
            println!("Rank {}: [Score: {:.4}] {}", i+1, r.score, r.doc_path);
        }
        let found_zk = res2.iter().any(|r| r.doc_path.contains("zk"));
        assert!(found_zk);

        // Mixed query: printed for manual inspection only.
        let q3 = "Ethereum privacy zksnark";
        let res3 = qe.search(q3, 5).unwrap();
        println!("\n--- QUERY: '{}' ---", q3);
        for (i, r) in res3.iter().enumerate() {
            println!("Rank {}: [Score: {:.4}] {}", i+1, r.score, r.doc_path);
        }
    }

    // Semantic-only and lexical-only modes each find the chunk on their own.
    #[tokio::test]
    async fn test_search_modes() {
        let temp = TempDir::new().unwrap();
        let db_path = temp.path().join("modes.db");
        let mut graph = cp_graph::GraphStore::open(db_path.to_str().unwrap()).unwrap();

        let doc = Document::new("test.md".into(), b"The quick brown fox jumps over the lazy dog", 100);
        graph.insert_document(&doc).unwrap();
        let chunk = Chunk {
            id: Uuid::new_v4(),
            doc_id: doc.id,
            text: "The quick brown fox jumps over the lazy dog".to_string(),
            byte_offset: 0,
            byte_length: 43,
            sequence: 0,
            text_hash: [0; 32],
        };
        graph.insert_chunk(&chunk).unwrap();
        let embedder = Arc::new(cp_embeddings::EmbeddingEngine::new().unwrap());
        let vec = embedder.embed(&chunk.text).unwrap();
        let emb = cp_core::Embedding::new(chunk.id, &vec, embedder.model_hash(), 0);
        graph.insert_embedding(&emb).unwrap();

        let qe = QueryEngine::new(Arc::new(Mutex::new(graph)), embedder);

        let results = qe.search_semantic("canine", 5).unwrap();
        assert!(!results.is_empty());

        let results = qe.search_lexical("fox", 5).unwrap();
        assert!(!results.is_empty());
        assert!(results[0].chunk.text.contains("fox"));
    }
}