oxirs_vec/
rdf_integration.rs

1//! RDF term support integration with oxirs-core
2//!
3//! This module provides seamless integration between oxirs-vec's vector operations
4//! and oxirs-core's RDF term system, enabling semantic vector search on RDF data.
5
6use crate::{similarity::SimilarityMetric, Vector, VectorId, VectorStoreTrait};
7use anyhow::{anyhow, Result};
8use oxirs_core::model::{GraphName, Literal, NamedNode, Term};
9use serde::{Deserialize, Serialize};
10use std::collections::{HashMap, HashSet};
11use std::hash::{Hash, Hasher};
12use std::sync::{Arc, RwLock};
13
14/// Configuration for RDF-vector integration
15#[derive(Debug, Clone, Serialize, Deserialize)]
16pub struct RdfVectorConfig {
17    /// Enable automatic URI decomposition for embeddings
18    pub uri_decomposition: bool,
19    /// Include literal types in embeddings
20    pub include_literal_types: bool,
21    /// Enable graph context awareness
22    pub graph_context: bool,
23    /// Namespace prefix handling
24    pub namespace_aware: bool,
25    /// Default similarity metric for RDF term comparisons
26    pub default_metric: SimilarityMetric,
27    /// Cache size for term-to-vector mappings
28    pub cache_size: usize,
29}
30
31impl Default for RdfVectorConfig {
32    fn default() -> Self {
33        Self {
34            uri_decomposition: true,
35            include_literal_types: true,
36            graph_context: true,
37            namespace_aware: true,
38            default_metric: SimilarityMetric::Cosine,
39            cache_size: 10000,
40        }
41    }
42}
43
44/// Mapping between RDF terms and vector identifiers
45#[derive(Debug, Clone, Serialize, Deserialize)]
46pub struct RdfTermMapping {
47    /// Original RDF term
48    pub term: Term,
49    /// Associated vector identifier
50    pub vector_id: VectorId,
51    /// Graph context (if applicable)
52    pub graph_context: Option<GraphName>,
53    /// Term metadata for enhanced processing
54    pub metadata: RdfTermMetadata,
55}
56
57/// Metadata for RDF terms to enhance vector processing
58#[derive(Debug, Clone, Serialize, Deserialize)]
59pub struct RdfTermMetadata {
60    /// Term type for specialized processing
61    pub term_type: RdfTermType,
62    /// Namespace information
63    pub namespace: Option<String>,
64    /// Local name component
65    pub local_name: Option<String>,
66    /// Literal datatype (if applicable)
67    pub datatype: Option<NamedNode>,
68    /// Language tag (if applicable)
69    pub language: Option<String>,
70    /// Term complexity score for weighting
71    pub complexity_score: f32,
72}
73
74/// RDF term type enumeration for processing
75#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
76pub enum RdfTermType {
77    NamedNode,
78    BlankNode,
79    Literal,
80    Variable,
81    QuotedTriple,
82}
83
84/// Result of RDF-aware vector search
85#[derive(Debug, Clone, Serialize, Deserialize)]
86pub struct RdfVectorSearchResult {
87    /// Matching RDF term
88    pub term: Term,
89    /// Similarity score
90    pub score: f32,
91    /// Vector identifier
92    pub vector_id: VectorId,
93    /// Graph context
94    pub graph_context: Option<GraphName>,
95    /// Search metadata
96    pub metadata: SearchMetadata,
97}
98
99/// Search metadata for RDF vector results
100#[derive(Debug, Clone, Serialize, Deserialize)]
101pub struct SearchMetadata {
102    /// Search algorithm used
103    pub algorithm: String,
104    /// Processing time in microseconds
105    pub processing_time_us: u64,
106    /// Term matching confidence
107    pub confidence: f32,
108    /// Explanation of result relevance
109    pub explanation: Option<String>,
110}
111
112/// RDF-Vector integration engine
113pub struct RdfVectorIntegration {
114    /// Configuration
115    config: RdfVectorConfig,
116    /// Term to vector mappings
117    term_mappings: Arc<RwLock<HashMap<TermHash, RdfTermMapping>>>,
118    /// Vector to term reverse mappings
119    vector_mappings: Arc<RwLock<HashMap<VectorId, RdfTermMapping>>>,
120    /// Graph context cache
121    graph_cache: Arc<RwLock<HashMap<GraphName, HashSet<VectorId>>>>,
122    /// Namespace registry
123    namespace_registry: Arc<RwLock<HashMap<String, String>>>,
124    /// Vector store reference
125    vector_store: Arc<RwLock<dyn VectorStoreTrait>>,
126}
127
128/// Hash wrapper for RDF terms to enable HashMap keys
129#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
130struct TermHash(u64);
131
132impl TermHash {
133    fn from_term(term: &Term) -> Self {
134        use std::collections::hash_map::DefaultHasher;
135        let mut hasher = DefaultHasher::new();
136
137        match term {
138            Term::NamedNode(node) => {
139                "NamedNode".hash(&mut hasher);
140                node.as_str().hash(&mut hasher);
141            }
142            Term::BlankNode(node) => {
143                "BlankNode".hash(&mut hasher);
144                node.as_str().hash(&mut hasher);
145            }
146            Term::Literal(literal) => {
147                "Literal".hash(&mut hasher);
148                literal.value().hash(&mut hasher);
149                if let Some(lang) = literal.language() {
150                    lang.hash(&mut hasher);
151                }
152                literal.datatype().as_str().hash(&mut hasher);
153            }
154            Term::Variable(var) => {
155                "Variable".hash(&mut hasher);
156                var.as_str().hash(&mut hasher);
157            }
158            Term::QuotedTriple(_) => {
159                "QuotedTriple".hash(&mut hasher);
160                // Simplified hash for quoted triples
161                "quoted_triple".hash(&mut hasher);
162            }
163        }
164
165        TermHash(hasher.finish())
166    }
167}
168
169impl RdfVectorIntegration {
170    /// Create a new RDF-vector integration instance
171    pub fn new(config: RdfVectorConfig, vector_store: Arc<RwLock<dyn VectorStoreTrait>>) -> Self {
172        Self {
173            config,
174            term_mappings: Arc::new(RwLock::new(HashMap::new())),
175            vector_mappings: Arc::new(RwLock::new(HashMap::new())),
176            graph_cache: Arc::new(RwLock::new(HashMap::new())),
177            namespace_registry: Arc::new(RwLock::new(HashMap::new())),
178            vector_store,
179        }
180    }
181
182    /// Register an RDF term with vector representation
183    pub fn register_term(
184        &self,
185        term: Term,
186        vector: Vector,
187        graph_context: Option<GraphName>,
188    ) -> Result<VectorId> {
189        let vector_id = self.vector_store.write().unwrap().add_vector(vector)?;
190        let metadata = self.extract_term_metadata(&term)?;
191
192        let mapping = RdfTermMapping {
193            term: term.clone(),
194            vector_id: vector_id.clone(),
195            graph_context: graph_context.clone(),
196            metadata,
197        };
198
199        let term_hash = TermHash::from_term(&term);
200
201        // Update mappings
202        {
203            let mut term_mappings = self.term_mappings.write().unwrap();
204            term_mappings.insert(term_hash, mapping.clone());
205        }
206
207        {
208            let mut vector_mappings = self.vector_mappings.write().unwrap();
209            vector_mappings.insert(vector_id.clone(), mapping);
210        }
211
212        // Update graph cache if applicable
213        if let Some(graph) = graph_context {
214            let mut graph_cache = self.graph_cache.write().unwrap();
215            graph_cache
216                .entry(graph)
217                .or_default()
218                .insert(vector_id.clone());
219        }
220
221        Ok(vector_id)
222    }
223
224    /// Find similar RDF terms using vector similarity
225    pub fn find_similar_terms(
226        &self,
227        query_term: &Term,
228        limit: usize,
229        threshold: Option<f32>,
230        graph_context: Option<&GraphName>,
231    ) -> Result<Vec<RdfVectorSearchResult>> {
232        let start_time = std::time::Instant::now();
233
234        // Get vector for query term
235        let query_vector_id = self
236            .get_vector_id(query_term)?
237            .ok_or_else(|| anyhow!("Query term not found in vector store"))?;
238
239        let query_vector = self
240            .vector_store
241            .read()
242            .unwrap()
243            .get_vector(&query_vector_id)?
244            .ok_or_else(|| anyhow!("Query vector not found"))?;
245
246        // Filter by graph context if specified
247        let candidate_vectors = if let Some(graph) = graph_context {
248            let graph_cache = self.graph_cache.read().unwrap();
249            graph_cache
250                .get(graph)
251                .map(|set| set.iter().cloned().collect::<Vec<_>>())
252                .unwrap_or_default()
253        } else {
254            // Use all vectors if no graph context specified
255            self.vector_store.read().unwrap().get_all_vector_ids()?
256        };
257
258        // Perform similarity search
259        let mut results = Vec::new();
260        for vector_id in candidate_vectors {
261            if *vector_id == query_vector_id {
262                continue; // Skip self
263            }
264
265            if let Ok(Some(vector)) = self.vector_store.read().unwrap().get_vector(&vector_id) {
266                let similarity = self.config.default_metric.compute(&query_vector, &vector)?;
267
268                // Apply threshold filtering
269                if let Some(thresh) = threshold {
270                    if similarity < thresh {
271                        continue;
272                    }
273                }
274
275                // Get term mapping
276                let vector_mappings = self.vector_mappings.read().unwrap();
277                if let Some(mapping) = vector_mappings.get(&vector_id) {
278                    let processing_time = start_time.elapsed().as_micros() as u64;
279
280                    results.push(RdfVectorSearchResult {
281                        term: mapping.term.clone(),
282                        score: similarity,
283                        vector_id: vector_id.clone(),
284                        graph_context: mapping.graph_context.clone(),
285                        metadata: SearchMetadata {
286                            algorithm: "vector_similarity".to_string(),
287                            processing_time_us: processing_time,
288                            confidence: self.calculate_confidence(similarity, &mapping.metadata),
289                            explanation: self.generate_explanation(&mapping.metadata, similarity),
290                        },
291                    });
292                }
293            }
294        }
295
296        // Sort by similarity score (descending)
297        results.sort_by(|a, b| {
298            b.score
299                .partial_cmp(&a.score)
300                .unwrap_or(std::cmp::Ordering::Equal)
301        });
302
303        // Apply limit
304        results.truncate(limit);
305
306        Ok(results)
307    }
308
309    /// Search for terms by text content with RDF-aware processing
310    pub fn search_by_text(
311        &self,
312        query_text: &str,
313        limit: usize,
314        threshold: Option<f32>,
315        graph_context: Option<&GraphName>,
316    ) -> Result<Vec<RdfVectorSearchResult>> {
317        // Create a temporary literal term for text search
318        let literal = Literal::new_simple_literal(query_text);
319        let _query_term = Term::Literal(literal);
320
321        // For text search, we would typically generate an embedding
322        // This is a simplified version - in practice, you'd use an embedding model
323        let query_vector = self.generate_text_embedding(query_text)?;
324
325        // Register temporary term (optional - for caching)
326        let temp_vector_id = self
327            .vector_store
328            .write()
329            .unwrap()
330            .add_vector(query_vector.clone())?;
331
332        // Perform similarity search against all terms
333        let candidate_vectors = if let Some(graph) = graph_context {
334            let graph_cache = self.graph_cache.read().unwrap();
335            graph_cache
336                .get(graph)
337                .map(|set| set.iter().cloned().collect::<Vec<_>>())
338                .unwrap_or_default()
339        } else {
340            self.vector_store.read().unwrap().get_all_vector_ids()?
341        };
342
343        let mut results = Vec::new();
344        let start_time = std::time::Instant::now();
345
346        for vector_id in candidate_vectors {
347            if let Ok(Some(vector)) = self.vector_store.read().unwrap().get_vector(&vector_id) {
348                let similarity = self.config.default_metric.compute(&query_vector, &vector)?;
349
350                if let Some(thresh) = threshold {
351                    if similarity < thresh {
352                        continue;
353                    }
354                }
355
356                let vector_mappings = self.vector_mappings.read().unwrap();
357                if let Some(mapping) = vector_mappings.get(&vector_id) {
358                    let processing_time = start_time.elapsed().as_micros() as u64;
359
360                    results.push(RdfVectorSearchResult {
361                        term: mapping.term.clone(),
362                        score: similarity,
363                        vector_id: vector_id.clone(),
364                        graph_context: mapping.graph_context.clone(),
365                        metadata: SearchMetadata {
366                            algorithm: "text_similarity".to_string(),
367                            processing_time_us: processing_time,
368                            confidence: self.calculate_confidence(similarity, &mapping.metadata),
369                            explanation: Some(format!("Text similarity match: '{query_text}'")),
370                        },
371                    });
372                }
373            }
374        }
375
376        // Clean up temporary vector
377        let _ = self
378            .vector_store
379            .write()
380            .unwrap()
381            .remove_vector(&temp_vector_id);
382
383        // Sort and limit results
384        results.sort_by(|a, b| {
385            b.score
386                .partial_cmp(&a.score)
387                .unwrap_or(std::cmp::Ordering::Equal)
388        });
389        results.truncate(limit);
390
391        Ok(results)
392    }
393
394    /// Get vector ID for an RDF term
395    pub fn get_vector_id(&self, term: &Term) -> Result<Option<VectorId>> {
396        let term_hash = TermHash::from_term(term);
397        let term_mappings = self.term_mappings.read().unwrap();
398        Ok(term_mappings
399            .get(&term_hash)
400            .map(|mapping| mapping.vector_id.clone()))
401    }
402
403    /// Get RDF term for a vector ID
404    pub fn get_term(&self, vector_id: VectorId) -> Result<Option<Term>> {
405        let vector_mappings = self.vector_mappings.read().unwrap();
406        Ok(vector_mappings
407            .get(&vector_id)
408            .map(|mapping| mapping.term.clone()))
409    }
410
411    /// Register a namespace prefix
412    pub fn register_namespace(&self, prefix: String, uri: String) -> Result<()> {
413        let mut registry = self.namespace_registry.write().unwrap();
414        registry.insert(prefix, uri);
415        Ok(())
416    }
417
418    /// Extract metadata from RDF term
419    fn extract_term_metadata(&self, term: &Term) -> Result<RdfTermMetadata> {
420        match term {
421            Term::NamedNode(node) => {
422                let uri = node.as_str();
423                let (namespace, local_name) = self.split_uri(uri);
424
425                Ok(RdfTermMetadata {
426                    term_type: RdfTermType::NamedNode,
427                    namespace,
428                    local_name,
429                    datatype: None,
430                    language: None,
431                    complexity_score: self.calculate_uri_complexity(uri),
432                })
433            }
434            Term::BlankNode(_) => {
435                Ok(RdfTermMetadata {
436                    term_type: RdfTermType::BlankNode,
437                    namespace: None,
438                    local_name: None,
439                    datatype: None,
440                    language: None,
441                    complexity_score: 0.5, // Blank nodes have medium complexity
442                })
443            }
444            Term::Literal(literal) => Ok(RdfTermMetadata {
445                term_type: RdfTermType::Literal,
446                namespace: None,
447                local_name: None,
448                datatype: Some(literal.datatype().into()),
449                language: literal.language().map(|s| s.to_string()),
450                complexity_score: self.calculate_literal_complexity(literal),
451            }),
452            Term::Variable(_) => {
453                Ok(RdfTermMetadata {
454                    term_type: RdfTermType::Variable,
455                    namespace: None,
456                    local_name: None,
457                    datatype: None,
458                    language: None,
459                    complexity_score: 0.3, // Variables have low complexity
460                })
461            }
462            Term::QuotedTriple(_) => {
463                Ok(RdfTermMetadata {
464                    term_type: RdfTermType::QuotedTriple,
465                    namespace: None,
466                    local_name: None,
467                    datatype: None,
468                    language: None,
469                    complexity_score: 1.0, // Quoted triples have high complexity
470                })
471            }
472        }
473    }
474
475    /// Split URI into namespace and local name
476    fn split_uri(&self, uri: &str) -> (Option<String>, Option<String>) {
477        // Simple URI splitting logic - can be enhanced
478        if let Some(pos) = uri.rfind(&['#', '/'][..]) {
479            let namespace = uri[..pos + 1].to_string();
480            let local_name = uri[pos + 1..].to_string();
481            (Some(namespace), Some(local_name))
482        } else {
483            (None, Some(uri.to_string()))
484        }
485    }
486
487    /// Calculate URI complexity score
488    fn calculate_uri_complexity(&self, uri: &str) -> f32 {
489        let length_factor = (uri.len() as f32 / 100.0).min(1.0);
490        let segment_count = uri.matches(&['/', '#'][..]).count() as f32 / 10.0;
491        let query_params = if uri.contains('?') { 0.2 } else { 0.0 };
492
493        (length_factor + segment_count + query_params).min(1.0)
494    }
495
496    /// Calculate literal complexity score
497    fn calculate_literal_complexity(&self, literal: &Literal) -> f32 {
498        let value_length = literal.value().len() as f32 / 200.0;
499        let datatype_complexity =
500            if literal.datatype().as_str() == "http://www.w3.org/2001/XMLSchema#string" {
501                0.3
502            } else {
503                0.7
504            };
505        let language_bonus = if literal.language().is_some() {
506            0.2
507        } else {
508            0.0
509        };
510
511        (value_length + datatype_complexity + language_bonus).min(1.0)
512    }
513
514    /// Calculate confidence score for search results
515    fn calculate_confidence(&self, similarity: f32, metadata: &RdfTermMetadata) -> f32 {
516        let base_confidence = similarity;
517        let complexity_bonus = metadata.complexity_score * 0.1;
518        let type_bonus = match metadata.term_type {
519            RdfTermType::NamedNode => 0.1,
520            RdfTermType::Literal => 0.05,
521            RdfTermType::BlankNode => 0.02,
522            RdfTermType::Variable => 0.01,
523            RdfTermType::QuotedTriple => 0.15,
524        };
525
526        (base_confidence + complexity_bonus + type_bonus).min(1.0)
527    }
528
529    /// Generate explanation for search results
530    fn generate_explanation(&self, metadata: &RdfTermMetadata, similarity: f32) -> Option<String> {
531        let term_type_str = match metadata.term_type {
532            RdfTermType::NamedNode => "Named Node",
533            RdfTermType::BlankNode => "Blank Node",
534            RdfTermType::Literal => "Literal",
535            RdfTermType::Variable => "Variable",
536            RdfTermType::QuotedTriple => "Quoted Triple",
537        };
538
539        let mut explanation = format!(
540            "{} with {:.2}% similarity",
541            term_type_str,
542            similarity * 100.0
543        );
544
545        if let Some(namespace) = &metadata.namespace {
546            explanation.push_str(&format!(", namespace: {namespace}"));
547        }
548
549        if let Some(language) = &metadata.language {
550            explanation.push_str(&format!(", language: {language}"));
551        }
552
553        Some(explanation)
554    }
555
556    /// Generate text embedding (placeholder implementation)
557    fn generate_text_embedding(&self, text: &str) -> Result<Vector> {
558        // This is a simplified implementation
559        // In production, you would use a proper embedding model
560        let words: Vec<&str> = text.split_whitespace().collect();
561        let dimension = 384; // Standard sentence transformer dimension
562
563        let mut vector_data = vec![0.0; dimension];
564
565        // Simple word-based embedding generation
566        for word in words.iter() {
567            let word_hash = {
568                use std::collections::hash_map::DefaultHasher;
569                let mut hasher = DefaultHasher::new();
570                word.hash(&mut hasher);
571                hasher.finish()
572            };
573
574            // Distribute word influence across vector dimensions
575            for j in 0..dimension {
576                let index = (word_hash as usize + j) % dimension;
577                vector_data[index] += 1.0 / (words.len() as f32);
578            }
579        }
580
581        // Normalize vector
582        let norm: f32 = vector_data.iter().map(|x| x * x).sum::<f32>().sqrt();
583        if norm > 0.0 {
584            for value in &mut vector_data {
585                *value /= norm;
586            }
587        }
588
589        Ok(Vector::new(vector_data))
590    }
591
592    /// Get statistics about the RDF-vector integration
593    pub fn get_statistics(&self) -> RdfIntegrationStats {
594        let term_mappings = self.term_mappings.read().unwrap();
595        let graph_cache = self.graph_cache.read().unwrap();
596        let namespace_registry = self.namespace_registry.read().unwrap();
597
598        let mut type_counts = HashMap::new();
599        for mapping in term_mappings.values() {
600            *type_counts.entry(mapping.metadata.term_type).or_insert(0) += 1;
601        }
602
603        RdfIntegrationStats {
604            total_terms: term_mappings.len(),
605            total_graphs: graph_cache.len(),
606            total_namespaces: namespace_registry.len(),
607            type_distribution: type_counts,
608            cache_hit_ratio: 0.95, // Placeholder
609        }
610    }
611}
612
613/// Statistics for RDF-vector integration
614#[derive(Debug, Clone, Serialize, Deserialize)]
615pub struct RdfIntegrationStats {
616    pub total_terms: usize,
617    pub total_graphs: usize,
618    pub total_namespaces: usize,
619    pub type_distribution: HashMap<RdfTermType, usize>,
620    pub cache_hit_ratio: f32,
621}
622
#[cfg(test)]
mod tests {
    use super::*;
    use crate::VectorStore;
    use oxirs_core::model::{NamedNode, Term};

    /// Integration engine with default settings over a fresh `VectorStore`.
    fn default_integration() -> RdfVectorIntegration {
        let store = Arc::new(RwLock::new(VectorStore::new()));
        RdfVectorIntegration::new(RdfVectorConfig::default(), store)
    }

    /// A registered term can be looked up and resolves to the returned id.
    #[test]
    fn test_rdf_term_registration() {
        let integration = default_integration();

        let term = Term::NamedNode(NamedNode::new("http://example.org/person").unwrap());
        let vector = Vector::new(vec![1.0, 0.0, 0.0]);

        let registered_id = integration
            .register_term(term.clone(), vector, None)
            .unwrap();

        let looked_up = integration.get_vector_id(&term).unwrap();
        assert!(looked_up.is_some());
        assert_eq!(looked_up.unwrap(), registered_id);
    }

    /// A hash URI splits after the `#`, which stays with the namespace.
    #[test]
    fn test_uri_splitting() {
        let integration = default_integration();

        let (namespace, local_name) = integration.split_uri("http://example.org/ontology#Person");

        assert_eq!(namespace.as_deref(), Some("http://example.org/ontology#"));
        assert_eq!(local_name.as_deref(), Some("Person"));
    }

    /// Metadata for a language-tagged literal carries its type and tag.
    #[test]
    fn test_metadata_extraction() {
        let integration = default_integration();

        let term = Term::Literal(Literal::new_language_tagged_literal("Hello", "en").unwrap());
        let metadata = integration.extract_term_metadata(&term).unwrap();

        assert_eq!(metadata.term_type, RdfTermType::Literal);
        assert_eq!(metadata.language.as_deref(), Some("en"));
    }
}