//! Integration between RDF terms and the vector store: term registration,
//! similarity search over registered terms, and RDF-aware metadata extraction.

use crate::{similarity::SimilarityMetric, Vector, VectorId, VectorStoreTrait};
use anyhow::{anyhow, Result};
use oxirs_core::model::{GraphName, Literal, NamedNode, Term};
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};
use std::hash::{Hash, Hasher};
use std::sync::{Arc, RwLock};

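/// Configuration options for mapping RDF terms into the vector store.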
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RdfVectorConfig {
    pub uri_decomposition: bool,
    pub include_literal_types: bool,
    pub graph_context: bool,
    pub namespace_aware: bool,
    pub default_metric: SimilarityMetric,
    pub cache_size: usize,
}

impl Default for RdfVectorConfig {
    fn default() -> Self {
        Self {
            uri_decomposition: true,
            include_literal_types: true,
            graph_context: true,
            namespace_aware: true,
            default_metric: SimilarityMetric::Cosine,
            cache_size: 10000,
        }
    }
}

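/// A registered RDF term together with its vector ID, optional graph context, and metadata.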
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RdfTermMapping {
    pub term: Term,
    pub vector_id: VectorId,
    pub graph_context: Option<GraphName>,
    pub metadata: RdfTermMetadata,
}

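/// Metadata extracted from an RDF term: type, namespace, datatype, language, and a complexity score.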
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RdfTermMetadata {
    pub term_type: RdfTermType,
    pub namespace: Option<String>,
    pub local_name: Option<String>,
    pub datatype: Option<NamedNode>,
    pub language: Option<String>,
    pub complexity_score: f32,
}

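/// The kind of RDF term a vector was derived from.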
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum RdfTermType {
    NamedNode,
    BlankNode,
    Literal,
    Variable,
    QuotedTriple,
}

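/// A single result from an RDF-aware similarity search.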
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RdfVectorSearchResult {
    pub term: Term,
    pub score: f32,
    pub vector_id: VectorId,
    pub graph_context: Option<GraphName>,
    pub metadata: SearchMetadata,
}

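/// Diagnostic information attached to each search result.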
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchMetadata {
    pub algorithm: String,
    pub processing_time_us: u64,
    pub confidence: f32,
    pub explanation: Option<String>,
}

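/// Bidirectional mapping between RDF terms and vectors, with graph-scoped search support.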
pub struct RdfVectorIntegration {
    config: RdfVectorConfig,
    term_mappings: Arc<RwLock<HashMap<TermHash, RdfTermMapping>>>,
    vector_mappings: Arc<RwLock<HashMap<VectorId, RdfTermMapping>>>,
    graph_cache: Arc<RwLock<HashMap<GraphName, HashSet<VectorId>>>>,
    namespace_registry: Arc<RwLock<HashMap<String, String>>>,
    vector_store: Arc<RwLock<dyn VectorStoreTrait>>,
}

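/// Stable hash of an RDF term, used as the key for term lookups.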
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
struct TermHash(u64);

impl TermHash {
    fn from_term(term: &Term) -> Self {
        use std::collections::hash_map::DefaultHasher;
        let mut hasher = DefaultHasher::new();

        match term {
            Term::NamedNode(node) => {
                "NamedNode".hash(&mut hasher);
                node.as_str().hash(&mut hasher);
            }
            Term::BlankNode(node) => {
                "BlankNode".hash(&mut hasher);
                node.as_str().hash(&mut hasher);
            }
            Term::Literal(literal) => {
                "Literal".hash(&mut hasher);
                literal.value().hash(&mut hasher);
                if let Some(lang) = literal.language() {
                    lang.hash(&mut hasher);
                }
                literal.datatype().as_str().hash(&mut hasher);
            }
            Term::Variable(var) => {
                "Variable".hash(&mut hasher);
                var.as_str().hash(&mut hasher);
            }
            Term::QuotedTriple(_) => {
                // Note: all quoted triples currently hash to the same value,
                // since the triple's contents are not included in the hash.
                "QuotedTriple".hash(&mut hasher);
                "quoted_triple".hash(&mut hasher);
            }
        }

        TermHash(hasher.finish())
    }
}

impl RdfVectorIntegration {
    pub fn new(config: RdfVectorConfig, vector_store: Arc<RwLock<dyn VectorStoreTrait>>) -> Self {
        Self {
            config,
            term_mappings: Arc::new(RwLock::new(HashMap::new())),
            vector_mappings: Arc::new(RwLock::new(HashMap::new())),
            graph_cache: Arc::new(RwLock::new(HashMap::new())),
            namespace_registry: Arc::new(RwLock::new(HashMap::new())),
            vector_store,
        }
    }

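    /// Registers an RDF term with its vector, returning the assigned vector ID.
    /// The term is indexed by hash, by vector ID, and (if given) by graph context.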
    pub fn register_term(
        &self,
        term: Term,
        vector: Vector,
        graph_context: Option<GraphName>,
    ) -> Result<VectorId> {
        let vector_id = self
            .vector_store
            .write()
            .expect("lock poisoned")
            .add_vector(vector)?;
        let metadata = self.extract_term_metadata(&term)?;

        let mapping = RdfTermMapping {
            term: term.clone(),
            vector_id: vector_id.clone(),
            graph_context: graph_context.clone(),
            metadata,
        };

        let term_hash = TermHash::from_term(&term);

        // Forward mapping: term hash -> mapping.
        {
            let mut term_mappings = self.term_mappings.write().expect("lock poisoned");
            term_mappings.insert(term_hash, mapping.clone());
        }

        // Reverse mapping: vector ID -> mapping.
        {
            let mut vector_mappings = self.vector_mappings.write().expect("lock poisoned");
            vector_mappings.insert(vector_id.clone(), mapping);
        }

        // Index the vector under its graph context, if one was supplied.
        if let Some(graph) = graph_context {
            let mut graph_cache = self.graph_cache.write().expect("lock poisoned");
            graph_cache
                .entry(graph)
                .or_default()
                .insert(vector_id.clone());
        }

        Ok(vector_id)
    }

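    /// Finds terms similar to `query_term`, optionally restricted to a graph and a minimum score.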
    pub fn find_similar_terms(
        &self,
        query_term: &Term,
        limit: usize,
        threshold: Option<f32>,
        graph_context: Option<&GraphName>,
    ) -> Result<Vec<RdfVectorSearchResult>> {
        let start_time = std::time::Instant::now();

        let query_vector_id = self
            .get_vector_id(query_term)?
            .ok_or_else(|| anyhow!("Query term not found in vector store"))?;

        let query_vector = self
            .vector_store
            .read()
            .expect("lock poisoned")
            .get_vector(&query_vector_id)?
            .ok_or_else(|| anyhow!("Query vector not found"))?;

        // Restrict candidates to the requested graph when a context is given.
        let candidate_vectors = if let Some(graph) = graph_context {
            let graph_cache = self.graph_cache.read().expect("lock poisoned");
            graph_cache
                .get(graph)
                .map(|set| set.iter().cloned().collect::<Vec<_>>())
                .unwrap_or_default()
        } else {
            self.vector_store
                .read()
                .expect("lock poisoned")
                .get_all_vector_ids()?
        };

        let mut results = Vec::new();
        for vector_id in candidate_vectors {
            // Skip the query term itself.
            if vector_id == query_vector_id {
                continue;
            }

            if let Ok(Some(vector)) = self
                .vector_store
                .read()
                .expect("lock poisoned")
                .get_vector(&vector_id)
            {
                let similarity = self.config.default_metric.compute(&query_vector, &vector)?;

                if let Some(thresh) = threshold {
                    if similarity < thresh {
                        continue;
                    }
                }

                let vector_mappings = self.vector_mappings.read().expect("lock poisoned");
                if let Some(mapping) = vector_mappings.get(&vector_id) {
                    let processing_time = start_time.elapsed().as_micros() as u64;

                    results.push(RdfVectorSearchResult {
                        term: mapping.term.clone(),
                        score: similarity,
                        vector_id: vector_id.clone(),
                        graph_context: mapping.graph_context.clone(),
                        metadata: SearchMetadata {
                            algorithm: "vector_similarity".to_string(),
                            processing_time_us: processing_time,
                            confidence: self.calculate_confidence(similarity, &mapping.metadata),
                            explanation: self.generate_explanation(&mapping.metadata, similarity),
                        },
                    });
                }
            }
        }

        results.sort_by(|a, b| {
            b.score
                .partial_cmp(&a.score)
                .unwrap_or(std::cmp::Ordering::Equal)
        });

        results.truncate(limit);

        Ok(results)
    }

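    /// Searches registered terms against free text by embedding the text and comparing vectors.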
    pub fn search_by_text(
        &self,
        query_text: &str,
        limit: usize,
        threshold: Option<f32>,
        graph_context: Option<&GraphName>,
    ) -> Result<Vec<RdfVectorSearchResult>> {
        let literal = Literal::new_simple_literal(query_text);
        let _query_term = Term::Literal(literal);

        let query_vector = self.generate_text_embedding(query_text)?;

        // Temporarily add the query embedding so the store can be searched against it.
        let temp_vector_id = self
            .vector_store
            .write()
            .expect("lock poisoned")
            .add_vector(query_vector.clone())?;

        let candidate_vectors = if let Some(graph) = graph_context {
            let graph_cache = self.graph_cache.read().expect("lock poisoned");
            graph_cache
                .get(graph)
                .map(|set| set.iter().cloned().collect::<Vec<_>>())
                .unwrap_or_default()
        } else {
            self.vector_store
                .read()
                .expect("lock poisoned")
                .get_all_vector_ids()?
        };

        let mut results = Vec::new();
        let start_time = std::time::Instant::now();

        for vector_id in candidate_vectors {
            if let Ok(Some(vector)) = self
                .vector_store
                .read()
                .expect("lock poisoned")
                .get_vector(&vector_id)
            {
                let similarity = self.config.default_metric.compute(&query_vector, &vector)?;

                if let Some(thresh) = threshold {
                    if similarity < thresh {
                        continue;
                    }
                }

                // The temporary query vector has no term mapping, so it is skipped here.
                let vector_mappings = self.vector_mappings.read().expect("lock poisoned");
                if let Some(mapping) = vector_mappings.get(&vector_id) {
                    let processing_time = start_time.elapsed().as_micros() as u64;

                    results.push(RdfVectorSearchResult {
                        term: mapping.term.clone(),
                        score: similarity,
                        vector_id: vector_id.clone(),
                        graph_context: mapping.graph_context.clone(),
                        metadata: SearchMetadata {
                            algorithm: "text_similarity".to_string(),
                            processing_time_us: processing_time,
                            confidence: self.calculate_confidence(similarity, &mapping.metadata),
                            explanation: Some(format!("Text similarity match: '{query_text}'")),
                        },
                    });
                }
            }
        }

        // Best-effort cleanup of the temporary query vector.
        let _ = self
            .vector_store
            .write()
            .expect("lock poisoned")
            .remove_vector(&temp_vector_id);

        results.sort_by(|a, b| {
            b.score
                .partial_cmp(&a.score)
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        results.truncate(limit);

        Ok(results)
    }

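    /// Returns the vector ID previously registered for `term`, if any.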
    pub fn get_vector_id(&self, term: &Term) -> Result<Option<VectorId>> {
        let term_hash = TermHash::from_term(term);
        let term_mappings = self.term_mappings.read().expect("lock poisoned");
        Ok(term_mappings
            .get(&term_hash)
            .map(|mapping| mapping.vector_id.clone()))
    }

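    /// Returns the RDF term associated with `vector_id`, if any.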
    pub fn get_term(&self, vector_id: VectorId) -> Result<Option<Term>> {
        let vector_mappings = self.vector_mappings.read().expect("lock poisoned");
        Ok(vector_mappings
            .get(&vector_id)
            .map(|mapping| mapping.term.clone()))
    }

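    /// Registers a namespace prefix-to-URI mapping.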
    pub fn register_namespace(&self, prefix: String, uri: String) -> Result<()> {
        let mut registry = self.namespace_registry.write().expect("lock poisoned");
        registry.insert(prefix, uri);
        Ok(())
    }

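    /// Extracts type, namespace, datatype, language, and complexity metadata from a term.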
    fn extract_term_metadata(&self, term: &Term) -> Result<RdfTermMetadata> {
        match term {
            Term::NamedNode(node) => {
                let uri = node.as_str();
                let (namespace, local_name) = self.split_uri(uri);

                Ok(RdfTermMetadata {
                    term_type: RdfTermType::NamedNode,
                    namespace,
                    local_name,
                    datatype: None,
                    language: None,
                    complexity_score: self.calculate_uri_complexity(uri),
                })
            }
            Term::BlankNode(_) => Ok(RdfTermMetadata {
                term_type: RdfTermType::BlankNode,
                namespace: None,
                local_name: None,
                datatype: None,
                language: None,
                complexity_score: 0.5,
            }),
            Term::Literal(literal) => Ok(RdfTermMetadata {
                term_type: RdfTermType::Literal,
                namespace: None,
                local_name: None,
                datatype: Some(literal.datatype().into()),
                language: literal.language().map(|s| s.to_string()),
                complexity_score: self.calculate_literal_complexity(literal),
            }),
            Term::Variable(_) => Ok(RdfTermMetadata {
                term_type: RdfTermType::Variable,
                namespace: None,
                local_name: None,
                datatype: None,
                language: None,
                complexity_score: 0.3,
            }),
            Term::QuotedTriple(_) => Ok(RdfTermMetadata {
                term_type: RdfTermType::QuotedTriple,
                namespace: None,
                local_name: None,
                datatype: None,
                language: None,
                complexity_score: 1.0,
            }),
        }
    }

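    /// Splits a URI into namespace and local name at the last '#' or '/'.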
    fn split_uri(&self, uri: &str) -> (Option<String>, Option<String>) {
        if let Some(pos) = uri.rfind(&['#', '/'][..]) {
            let namespace = uri[..pos + 1].to_string();
            let local_name = uri[pos + 1..].to_string();
            (Some(namespace), Some(local_name))
        } else {
            (None, Some(uri.to_string()))
        }
    }

    fn calculate_uri_complexity(&self, uri: &str) -> f32 {
        let length_factor = (uri.len() as f32 / 100.0).min(1.0);
        let segment_count = uri.matches(&['/', '#'][..]).count() as f32 / 10.0;
        let query_params = if uri.contains('?') { 0.2 } else { 0.0 };

        (length_factor + segment_count + query_params).min(1.0)
    }

    fn calculate_literal_complexity(&self, literal: &Literal) -> f32 {
        let value_length = literal.value().len() as f32 / 200.0;
        let datatype_complexity =
            if literal.datatype().as_str() == "http://www.w3.org/2001/XMLSchema#string" {
                0.3
            } else {
                0.7
            };
        let language_bonus = if literal.language().is_some() {
            0.2
        } else {
            0.0
        };

        (value_length + datatype_complexity + language_bonus).min(1.0)
    }

    fn calculate_confidence(&self, similarity: f32, metadata: &RdfTermMetadata) -> f32 {
        let base_confidence = similarity;
        let complexity_bonus = metadata.complexity_score * 0.1;
        let type_bonus = match metadata.term_type {
            RdfTermType::NamedNode => 0.1,
            RdfTermType::Literal => 0.05,
            RdfTermType::BlankNode => 0.02,
            RdfTermType::Variable => 0.01,
            RdfTermType::QuotedTriple => 0.15,
        };

        (base_confidence + complexity_bonus + type_bonus).min(1.0)
    }

    fn generate_explanation(&self, metadata: &RdfTermMetadata, similarity: f32) -> Option<String> {
        let term_type_str = match metadata.term_type {
            RdfTermType::NamedNode => "Named Node",
            RdfTermType::BlankNode => "Blank Node",
            RdfTermType::Literal => "Literal",
            RdfTermType::Variable => "Variable",
            RdfTermType::QuotedTriple => "Quoted Triple",
        };

        let mut explanation = format!(
            "{} with {:.2}% similarity",
            term_type_str,
            similarity * 100.0
        );

        if let Some(namespace) = &metadata.namespace {
            explanation.push_str(&format!(", namespace: {namespace}"));
        }

        if let Some(language) = &metadata.language {
            explanation.push_str(&format!(", language: {language}"));
        }

        Some(explanation)
    }

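    /// Generates a simple hash-based bag-of-words embedding for free text
    /// (384 dimensions, L2-normalized).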
    fn generate_text_embedding(&self, text: &str) -> Result<Vector> {
        let words: Vec<&str> = text.split_whitespace().collect();
        let dimension = 384;
        let mut vector_data = vec![0.0; dimension];

        for word in words.iter() {
            let word_hash = {
                use std::collections::hash_map::DefaultHasher;
                let mut hasher = DefaultHasher::new();
                word.hash(&mut hasher);
                hasher.finish()
            };

            // Spread each word's weight over the dimensions starting at its hash
            // position, decaying with distance so that different words produce
            // different (non-uniform) distributions.
            for j in 0..dimension {
                let index = (word_hash as usize + j) % dimension;
                vector_data[index] += 1.0 / ((j as f32 + 1.0) * words.len() as f32);
            }
        }

        // L2-normalize the vector.
        let norm: f32 = vector_data.iter().map(|x| x * x).sum::<f32>().sqrt();
        if norm > 0.0 {
            for value in &mut vector_data {
                *value /= norm;
            }
        }

        Ok(Vector::new(vector_data))
    }

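    /// Returns summary statistics about registered terms, graphs, and namespaces.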
    pub fn get_statistics(&self) -> RdfIntegrationStats {
        let term_mappings = self.term_mappings.read().expect("lock poisoned");
        let graph_cache = self.graph_cache.read().expect("lock poisoned");
        let namespace_registry = self.namespace_registry.read().expect("lock poisoned");

        let mut type_counts = HashMap::new();
        for mapping in term_mappings.values() {
            *type_counts.entry(mapping.metadata.term_type).or_insert(0) += 1;
        }

        RdfIntegrationStats {
            total_terms: term_mappings.len(),
            total_graphs: graph_cache.len(),
            total_namespaces: namespace_registry.len(),
            type_distribution: type_counts,
            // Fixed estimate: cache hit tracking is not implemented here.
            cache_hit_ratio: 0.95,
        }
    }
}

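/// Summary statistics for the RDF-vector integration layer.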
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RdfIntegrationStats {
    pub total_terms: usize,
    pub total_graphs: usize,
    pub total_namespaces: usize,
    pub type_distribution: HashMap<RdfTermType, usize>,
    pub cache_hit_ratio: f32,
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::VectorStore;
    use oxirs_core::model::{NamedNode, Term};

    #[test]
    fn test_rdf_term_registration() {
        let config = RdfVectorConfig::default();
        let vector_store = Arc::new(RwLock::new(VectorStore::new()));
        let integration = RdfVectorIntegration::new(config, vector_store);

        let named_node = NamedNode::new("http://example.org/person").unwrap();
        let term = Term::NamedNode(named_node);
        let vector = Vector::new(vec![1.0, 0.0, 0.0]);

        let vector_id = integration
            .register_term(term.clone(), vector, None)
            .unwrap();

        assert!(integration.get_vector_id(&term).unwrap().is_some());
        assert_eq!(
            integration.get_vector_id(&term).unwrap().unwrap(),
            vector_id
        );
    }

    #[test]
    fn test_uri_splitting() {
        let config = RdfVectorConfig::default();
        let vector_store = Arc::new(RwLock::new(VectorStore::new()));
        let integration = RdfVectorIntegration::new(config, vector_store);

        let (namespace, local_name) = integration.split_uri("http://example.org/ontology#Person");
        assert_eq!(namespace, Some("http://example.org/ontology#".to_string()));
        assert_eq!(local_name, Some("Person".to_string()));
    }

    #[test]
    fn test_metadata_extraction() {
        let config = RdfVectorConfig::default();
        let vector_store = Arc::new(RwLock::new(VectorStore::new()));
        let integration = RdfVectorIntegration::new(config, vector_store);

        let literal = Literal::new_language_tagged_literal("Hello", "en").unwrap();
        let term = Term::Literal(literal);

        let metadata = integration.extract_term_metadata(&term).unwrap();
        assert_eq!(metadata.term_type, RdfTermType::Literal);
        assert_eq!(metadata.language, Some("en".to_string()));
    }
}