oxirs_vec/rdf_integration.rs

//! RDF term support integration with oxirs-core
//!
//! This module provides seamless integration between oxirs-vec's vector operations
//! and oxirs-core's RDF term system, enabling semantic vector search on RDF data.
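//!
//! A minimal usage sketch (illustrative only; it mirrors the tests below and
//! assumes a `VectorStore` that implements `VectorStoreTrait`):
//!
//! ```ignore
//! use std::sync::{Arc, RwLock};
//! use oxirs_core::model::{NamedNode, Term};
//!
//! let integration = RdfVectorIntegration::new(
//!     RdfVectorConfig::default(),
//!     Arc::new(RwLock::new(VectorStore::new())),
//! );
//!
//! let term = Term::NamedNode(NamedNode::new("http://example.org/person").unwrap());
//! let id = integration
//!     .register_term(term.clone(), Vector::new(vec![1.0, 0.0, 0.0]), None)
//!     .unwrap();
//! let similar = integration.find_similar_terms(&term, 10, Some(0.5), None).unwrap();
//! ```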

use crate::{similarity::SimilarityMetric, Vector, VectorId, VectorStoreTrait};
use anyhow::{anyhow, Result};
use oxirs_core::model::{GraphName, Literal, NamedNode, Term};
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};
use std::hash::{Hash, Hasher};
use std::sync::{Arc, RwLock};

/// Configuration for RDF-vector integration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RdfVectorConfig {
    /// Enable automatic URI decomposition for embeddings
    pub uri_decomposition: bool,
    /// Include literal types in embeddings
    pub include_literal_types: bool,
    /// Enable graph context awareness
    pub graph_context: bool,
    /// Namespace prefix handling
    pub namespace_aware: bool,
    /// Default similarity metric for RDF term comparisons
    pub default_metric: SimilarityMetric,
    /// Cache size for term-to-vector mappings
    pub cache_size: usize,
}

impl Default for RdfVectorConfig {
    fn default() -> Self {
        Self {
            uri_decomposition: true,
            include_literal_types: true,
            graph_context: true,
            namespace_aware: true,
            default_metric: SimilarityMetric::Cosine,
            cache_size: 10000,
        }
    }
}

/// Mapping between RDF terms and vector identifiers
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RdfTermMapping {
    /// Original RDF term
    pub term: Term,
    /// Associated vector identifier
    pub vector_id: VectorId,
    /// Graph context (if applicable)
    pub graph_context: Option<GraphName>,
    /// Term metadata for enhanced processing
    pub metadata: RdfTermMetadata,
}

/// Metadata for RDF terms to enhance vector processing
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RdfTermMetadata {
    /// Term type for specialized processing
    pub term_type: RdfTermType,
    /// Namespace information
    pub namespace: Option<String>,
    /// Local name component
    pub local_name: Option<String>,
    /// Literal datatype (if applicable)
    pub datatype: Option<NamedNode>,
    /// Language tag (if applicable)
    pub language: Option<String>,
    /// Term complexity score for weighting
    pub complexity_score: f32,
}

/// RDF term type enumeration for processing
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum RdfTermType {
    NamedNode,
    BlankNode,
    Literal,
    Variable,
    QuotedTriple,
}

/// Result of RDF-aware vector search
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RdfVectorSearchResult {
    /// Matching RDF term
    pub term: Term,
    /// Similarity score
    pub score: f32,
    /// Vector identifier
    pub vector_id: VectorId,
    /// Graph context
    pub graph_context: Option<GraphName>,
    /// Search metadata
    pub metadata: SearchMetadata,
}

/// Search metadata for RDF vector results
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchMetadata {
    /// Search algorithm used
    pub algorithm: String,
    /// Processing time in microseconds
    pub processing_time_us: u64,
    /// Term matching confidence
    pub confidence: f32,
    /// Explanation of result relevance
    pub explanation: Option<String>,
}

/// RDF-Vector integration engine
pub struct RdfVectorIntegration {
    /// Configuration
    config: RdfVectorConfig,
    /// Term to vector mappings
    term_mappings: Arc<RwLock<HashMap<TermHash, RdfTermMapping>>>,
    /// Vector to term reverse mappings
    vector_mappings: Arc<RwLock<HashMap<VectorId, RdfTermMapping>>>,
    /// Graph context cache
    graph_cache: Arc<RwLock<HashMap<GraphName, HashSet<VectorId>>>>,
    /// Namespace registry
    namespace_registry: Arc<RwLock<HashMap<String, String>>>,
    /// Vector store reference
    vector_store: Arc<RwLock<dyn VectorStoreTrait>>,
}

/// Hash wrapper for RDF terms to enable HashMap keys
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
struct TermHash(u64);

impl TermHash {
    fn from_term(term: &Term) -> Self {
        use std::collections::hash_map::DefaultHasher;
        let mut hasher = DefaultHasher::new();

        match term {
            Term::NamedNode(node) => {
                "NamedNode".hash(&mut hasher);
                node.as_str().hash(&mut hasher);
            }
            Term::BlankNode(node) => {
                "BlankNode".hash(&mut hasher);
                node.as_str().hash(&mut hasher);
            }
            Term::Literal(literal) => {
                "Literal".hash(&mut hasher);
                literal.value().hash(&mut hasher);
                if let Some(lang) = literal.language() {
                    lang.hash(&mut hasher);
                }
                literal.datatype().as_str().hash(&mut hasher);
            }
            Term::Variable(var) => {
                "Variable".hash(&mut hasher);
                var.as_str().hash(&mut hasher);
            }
            Term::QuotedTriple(_) => {
                "QuotedTriple".hash(&mut hasher);
                // Simplified hash: all quoted triples share a single bucket,
                // so distinct quoted triples collide in the term mappings.
                "quoted_triple".hash(&mut hasher);
            }
        }

        TermHash(hasher.finish())
    }
}

impl RdfVectorIntegration {
    /// Create a new RDF-vector integration instance
    pub fn new(config: RdfVectorConfig, vector_store: Arc<RwLock<dyn VectorStoreTrait>>) -> Self {
        Self {
            config,
            term_mappings: Arc::new(RwLock::new(HashMap::new())),
            vector_mappings: Arc::new(RwLock::new(HashMap::new())),
            graph_cache: Arc::new(RwLock::new(HashMap::new())),
            namespace_registry: Arc::new(RwLock::new(HashMap::new())),
            vector_store,
        }
    }

    /// Register an RDF term with vector representation
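    ///
    /// The vector is stored in the underlying `VectorStoreTrait` implementation,
    /// the term-to-vector and vector-to-term mappings are recorded, and the graph
    /// cache is updated when a graph context is supplied.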
    pub fn register_term(
        &self,
        term: Term,
        vector: Vector,
        graph_context: Option<GraphName>,
    ) -> Result<VectorId> {
        let vector_id = self
            .vector_store
            .write()
            .expect("lock poisoned")
            .add_vector(vector)?;
        let metadata = self.extract_term_metadata(&term)?;

        let mapping = RdfTermMapping {
            term: term.clone(),
            vector_id: vector_id.clone(),
            graph_context: graph_context.clone(),
            metadata,
        };

        let term_hash = TermHash::from_term(&term);

        // Update mappings
        {
            let mut term_mappings = self.term_mappings.write().expect("lock poisoned");
            term_mappings.insert(term_hash, mapping.clone());
        }

        {
            let mut vector_mappings = self.vector_mappings.write().expect("lock poisoned");
            vector_mappings.insert(vector_id.clone(), mapping);
        }

        // Update graph cache if applicable
        if let Some(graph) = graph_context {
            let mut graph_cache = self.graph_cache.write().expect("lock poisoned");
            graph_cache
                .entry(graph)
                .or_default()
                .insert(vector_id.clone());
        }

        Ok(vector_id)
    }

    /// Find similar RDF terms using vector similarity
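    ///
    /// The query term must already be registered. Candidates are restricted to
    /// the given graph context when one is provided, scored with the configured
    /// `default_metric`, filtered by `threshold`, sorted by descending score,
    /// and truncated to `limit`.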
    pub fn find_similar_terms(
        &self,
        query_term: &Term,
        limit: usize,
        threshold: Option<f32>,
        graph_context: Option<&GraphName>,
    ) -> Result<Vec<RdfVectorSearchResult>> {
        let start_time = std::time::Instant::now();

        // Get vector for query term
        let query_vector_id = self
            .get_vector_id(query_term)?
            .ok_or_else(|| anyhow!("Query term not found in vector store"))?;

        let query_vector = self
            .vector_store
            .read()
            .expect("lock poisoned")
            .get_vector(&query_vector_id)?
            .ok_or_else(|| anyhow!("Query vector not found"))?;

        // Filter by graph context if specified
        let candidate_vectors = if let Some(graph) = graph_context {
            let graph_cache = self.graph_cache.read().expect("lock poisoned");
            graph_cache
                .get(graph)
                .map(|set| set.iter().cloned().collect::<Vec<_>>())
                .unwrap_or_default()
        } else {
            // Use all vectors if no graph context specified
            self.vector_store
                .read()
                .expect("lock poisoned")
                .get_all_vector_ids()?
        };

        // Perform similarity search
        let mut results = Vec::new();
        for vector_id in candidate_vectors {
            if vector_id == query_vector_id {
                continue; // Skip self
            }

            if let Ok(Some(vector)) = self
                .vector_store
                .read()
                .expect("lock poisoned")
                .get_vector(&vector_id)
            {
                let similarity = self.config.default_metric.compute(&query_vector, &vector)?;

                // Apply threshold filtering
                if let Some(thresh) = threshold {
                    if similarity < thresh {
                        continue;
                    }
                }

                // Get term mapping
                let vector_mappings = self.vector_mappings.read().expect("lock poisoned");
                if let Some(mapping) = vector_mappings.get(&vector_id) {
                    let processing_time = start_time.elapsed().as_micros() as u64;

                    results.push(RdfVectorSearchResult {
                        term: mapping.term.clone(),
                        score: similarity,
                        vector_id: vector_id.clone(),
                        graph_context: mapping.graph_context.clone(),
                        metadata: SearchMetadata {
                            algorithm: "vector_similarity".to_string(),
                            processing_time_us: processing_time,
                            confidence: self.calculate_confidence(similarity, &mapping.metadata),
                            explanation: self.generate_explanation(&mapping.metadata, similarity),
                        },
                    });
                }
            }
        }

        // Sort by similarity score (descending)
        results.sort_by(|a, b| {
            b.score
                .partial_cmp(&a.score)
                .unwrap_or(std::cmp::Ordering::Equal)
        });

        // Apply limit
        results.truncate(limit);

        Ok(results)
    }

    /// Search for terms by text content with RDF-aware processing
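    ///
    /// An illustrative call (the query embedding comes from the placeholder
    /// `generate_text_embedding`, not a real model):
    ///
    /// ```ignore
    /// let hits = integration.search_by_text("person", 5, Some(0.2), None)?;
    /// for hit in &hits {
    ///     println!("{:?} scored {:.3}", hit.term, hit.score);
    /// }
    /// ```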
    pub fn search_by_text(
        &self,
        query_text: &str,
        limit: usize,
        threshold: Option<f32>,
        graph_context: Option<&GraphName>,
    ) -> Result<Vec<RdfVectorSearchResult>> {
        // Create a temporary literal term for text search
        let literal = Literal::new_simple_literal(query_text);
        let _query_term = Term::Literal(literal);

        // For text search, we would typically generate an embedding
        // This is a simplified version - in practice, you'd use an embedding model
        let query_vector = self.generate_text_embedding(query_text)?;

        // Register temporary term (optional - for caching)
        let temp_vector_id = self
            .vector_store
            .write()
            .expect("lock poisoned")
            .add_vector(query_vector.clone())?;

        // Perform similarity search against all terms
        let candidate_vectors = if let Some(graph) = graph_context {
            let graph_cache = self.graph_cache.read().expect("lock poisoned");
            graph_cache
                .get(graph)
                .map(|set| set.iter().cloned().collect::<Vec<_>>())
                .unwrap_or_default()
        } else {
            self.vector_store
                .read()
                .expect("lock poisoned")
                .get_all_vector_ids()?
        };

        let mut results = Vec::new();
        let start_time = std::time::Instant::now();

        for vector_id in candidate_vectors {
            if let Ok(Some(vector)) = self
                .vector_store
                .read()
                .expect("lock poisoned")
                .get_vector(&vector_id)
            {
                let similarity = self.config.default_metric.compute(&query_vector, &vector)?;

                if let Some(thresh) = threshold {
                    if similarity < thresh {
                        continue;
                    }
                }

                let vector_mappings = self.vector_mappings.read().expect("lock poisoned");
                if let Some(mapping) = vector_mappings.get(&vector_id) {
                    let processing_time = start_time.elapsed().as_micros() as u64;

                    results.push(RdfVectorSearchResult {
                        term: mapping.term.clone(),
                        score: similarity,
                        vector_id: vector_id.clone(),
                        graph_context: mapping.graph_context.clone(),
                        metadata: SearchMetadata {
                            algorithm: "text_similarity".to_string(),
                            processing_time_us: processing_time,
                            confidence: self.calculate_confidence(similarity, &mapping.metadata),
                            explanation: Some(format!("Text similarity match: '{query_text}'")),
                        },
                    });
                }
            }
        }

        // Clean up temporary vector
        let _ = self
            .vector_store
            .write()
            .expect("lock poisoned")
            .remove_vector(&temp_vector_id);

        // Sort and limit results
        results.sort_by(|a, b| {
            b.score
                .partial_cmp(&a.score)
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        results.truncate(limit);

        Ok(results)
    }

    /// Get vector ID for an RDF term
    pub fn get_vector_id(&self, term: &Term) -> Result<Option<VectorId>> {
        let term_hash = TermHash::from_term(term);
        let term_mappings = self.term_mappings.read().expect("lock poisoned");
        Ok(term_mappings
            .get(&term_hash)
            .map(|mapping| mapping.vector_id.clone()))
    }

    /// Get RDF term for a vector ID
    pub fn get_term(&self, vector_id: VectorId) -> Result<Option<Term>> {
        let vector_mappings = self.vector_mappings.read().expect("lock poisoned");
        Ok(vector_mappings
            .get(&vector_id)
            .map(|mapping| mapping.term.clone()))
    }

    /// Register a namespace prefix
    pub fn register_namespace(&self, prefix: String, uri: String) -> Result<()> {
        let mut registry = self.namespace_registry.write().expect("lock poisoned");
        registry.insert(prefix, uri);
        Ok(())
    }

    /// Extract metadata from RDF term
    fn extract_term_metadata(&self, term: &Term) -> Result<RdfTermMetadata> {
        match term {
            Term::NamedNode(node) => {
                let uri = node.as_str();
                let (namespace, local_name) = self.split_uri(uri);

                Ok(RdfTermMetadata {
                    term_type: RdfTermType::NamedNode,
                    namespace,
                    local_name,
                    datatype: None,
                    language: None,
                    complexity_score: self.calculate_uri_complexity(uri),
                })
            }
            Term::BlankNode(_) => {
                Ok(RdfTermMetadata {
                    term_type: RdfTermType::BlankNode,
                    namespace: None,
                    local_name: None,
                    datatype: None,
                    language: None,
                    complexity_score: 0.5, // Blank nodes have medium complexity
                })
            }
            Term::Literal(literal) => Ok(RdfTermMetadata {
                term_type: RdfTermType::Literal,
                namespace: None,
                local_name: None,
                datatype: Some(literal.datatype().into()),
                language: literal.language().map(|s| s.to_string()),
                complexity_score: self.calculate_literal_complexity(literal),
            }),
            Term::Variable(_) => {
                Ok(RdfTermMetadata {
                    term_type: RdfTermType::Variable,
                    namespace: None,
                    local_name: None,
                    datatype: None,
                    language: None,
                    complexity_score: 0.3, // Variables have low complexity
                })
            }
            Term::QuotedTriple(_) => {
                Ok(RdfTermMetadata {
                    term_type: RdfTermType::QuotedTriple,
                    namespace: None,
                    local_name: None,
                    datatype: None,
                    language: None,
                    complexity_score: 1.0, // Quoted triples have high complexity
                })
            }
        }
    }

    /// Split URI into namespace and local name
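    ///
    /// Splits at the last `#` or `/`, e.g. `"http://example.org/ontology#Person"`
    /// becomes `(Some("http://example.org/ontology#"), Some("Person"))`.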
    fn split_uri(&self, uri: &str) -> (Option<String>, Option<String>) {
        // Simple URI splitting logic - can be enhanced
        if let Some(pos) = uri.rfind(&['#', '/'][..]) {
            let namespace = uri[..pos + 1].to_string();
            let local_name = uri[pos + 1..].to_string();
            (Some(namespace), Some(local_name))
        } else {
            (None, Some(uri.to_string()))
        }
    }

    /// Calculate URI complexity score
    fn calculate_uri_complexity(&self, uri: &str) -> f32 {
        let length_factor = (uri.len() as f32 / 100.0).min(1.0);
        let segment_count = uri.matches(&['/', '#'][..]).count() as f32 / 10.0;
        let query_params = if uri.contains('?') { 0.2 } else { 0.0 };

        (length_factor + segment_count + query_params).min(1.0)
    }

    /// Calculate literal complexity score
    fn calculate_literal_complexity(&self, literal: &Literal) -> f32 {
        let value_length = literal.value().len() as f32 / 200.0;
        let datatype_complexity =
            if literal.datatype().as_str() == "http://www.w3.org/2001/XMLSchema#string" {
                0.3
            } else {
                0.7
            };
        let language_bonus = if literal.language().is_some() {
            0.2
        } else {
            0.0
        };

        (value_length + datatype_complexity + language_bonus).min(1.0)
    }

    /// Calculate confidence score for search results
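    ///
    /// Confidence is `min(similarity + 0.1 * complexity_score + type_bonus, 1.0)`;
    /// for example, a named node with similarity 0.8 and complexity 0.6 yields
    /// `min(0.8 + 0.06 + 0.1, 1.0) = 0.96`.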
    fn calculate_confidence(&self, similarity: f32, metadata: &RdfTermMetadata) -> f32 {
        let base_confidence = similarity;
        let complexity_bonus = metadata.complexity_score * 0.1;
        let type_bonus = match metadata.term_type {
            RdfTermType::NamedNode => 0.1,
            RdfTermType::Literal => 0.05,
            RdfTermType::BlankNode => 0.02,
            RdfTermType::Variable => 0.01,
            RdfTermType::QuotedTriple => 0.15,
        };

        (base_confidence + complexity_bonus + type_bonus).min(1.0)
    }

    /// Generate explanation for search results
    fn generate_explanation(&self, metadata: &RdfTermMetadata, similarity: f32) -> Option<String> {
        let term_type_str = match metadata.term_type {
            RdfTermType::NamedNode => "Named Node",
            RdfTermType::BlankNode => "Blank Node",
            RdfTermType::Literal => "Literal",
            RdfTermType::Variable => "Variable",
            RdfTermType::QuotedTriple => "Quoted Triple",
        };

        let mut explanation = format!(
            "{} with {:.2}% similarity",
            term_type_str,
            similarity * 100.0
        );

        if let Some(namespace) = &metadata.namespace {
            explanation.push_str(&format!(", namespace: {namespace}"));
        }

        if let Some(language) = &metadata.language {
            explanation.push_str(&format!(", language: {language}"));
        }

        Some(explanation)
    }

    /// Generate text embedding (placeholder implementation)
    fn generate_text_embedding(&self, text: &str) -> Result<Vector> {
        // This is a simplified implementation
        // In production, you would use a proper embedding model
        let words: Vec<&str> = text.split_whitespace().collect();
        let dimension = 384; // Standard sentence transformer dimension

        let mut vector_data = vec![0.0; dimension];

        // Simple word-based embedding generation
        for word in words.iter() {
            let word_hash = {
                use std::collections::hash_map::DefaultHasher;
                let mut hasher = DefaultHasher::new();
                word.hash(&mut hasher);
                hasher.finish()
            };

            // Map each word onto a single dimension chosen by its hash so that
            // different texts yield different (sparse) vectors
            let index = (word_hash as usize) % dimension;
            vector_data[index] += 1.0 / (words.len() as f32);
        }

        // Normalize vector
        let norm: f32 = vector_data.iter().map(|x| x * x).sum::<f32>().sqrt();
        if norm > 0.0 {
            for value in &mut vector_data {
                *value /= norm;
            }
        }

        Ok(Vector::new(vector_data))
    }

    /// Get statistics about the RDF-vector integration
    pub fn get_statistics(&self) -> RdfIntegrationStats {
        let term_mappings = self.term_mappings.read().expect("lock poisoned");
        let graph_cache = self.graph_cache.read().expect("lock poisoned");
        let namespace_registry = self.namespace_registry.read().expect("lock poisoned");

        let mut type_counts = HashMap::new();
        for mapping in term_mappings.values() {
            *type_counts.entry(mapping.metadata.term_type).or_insert(0) += 1;
        }

        RdfIntegrationStats {
            total_terms: term_mappings.len(),
            total_graphs: graph_cache.len(),
            total_namespaces: namespace_registry.len(),
            type_distribution: type_counts,
            cache_hit_ratio: 0.95, // Placeholder
        }
    }
}

/// Statistics for RDF-vector integration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RdfIntegrationStats {
    pub total_terms: usize,
    pub total_graphs: usize,
    pub total_namespaces: usize,
    pub type_distribution: HashMap<RdfTermType, usize>,
    pub cache_hit_ratio: f32,
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::VectorStore;
    use oxirs_core::model::{NamedNode, Term};

    #[test]
    fn test_rdf_term_registration() {
        let config = RdfVectorConfig::default();
        let vector_store = Arc::new(RwLock::new(VectorStore::new()));
        let integration = RdfVectorIntegration::new(config, vector_store);

        let named_node = NamedNode::new("http://example.org/person").unwrap();
        let term = Term::NamedNode(named_node);
        let vector = Vector::new(vec![1.0, 0.0, 0.0]);

        let vector_id = integration
            .register_term(term.clone(), vector, None)
            .unwrap();

        assert!(integration.get_vector_id(&term).unwrap().is_some());
        assert_eq!(
            integration.get_vector_id(&term).unwrap().unwrap(),
            vector_id
        );
    }

    #[test]
    fn test_uri_splitting() {
        let config = RdfVectorConfig::default();
        let vector_store = Arc::new(RwLock::new(VectorStore::new()));
        let integration = RdfVectorIntegration::new(config, vector_store);

        let (namespace, local_name) = integration.split_uri("http://example.org/ontology#Person");
        assert_eq!(namespace, Some("http://example.org/ontology#".to_string()));
        assert_eq!(local_name, Some("Person".to_string()));
    }

    #[test]
    fn test_metadata_extraction() {
        let config = RdfVectorConfig::default();
        let vector_store = Arc::new(RwLock::new(VectorStore::new()));
        let integration = RdfVectorIntegration::new(config, vector_store);

        let literal = Literal::new_language_tagged_literal("Hello", "en").unwrap();
        let term = Term::Literal(literal);

        let metadata = integration.extract_term_metadata(&term).unwrap();
        assert_eq!(metadata.term_type, RdfTermType::Literal);
        assert_eq!(metadata.language, Some("en".to_string()));
    }
}