use crate::{similarity::SimilarityMetric, Vector, VectorId, VectorStoreTrait};
use anyhow::{anyhow, Result};
use oxirs_core::model::{GraphName, Literal, NamedNode, Term};
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};
use std::hash::{Hash, Hasher};
use std::sync::{Arc, RwLock};

/// Configuration for RDF term vectorization.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RdfVectorConfig {
    /// Decompose URIs into namespace and local name.
    pub uri_decomposition: bool,
    /// Include literal datatype information in term metadata.
    pub include_literal_types: bool,
    /// Track named-graph context for registered terms.
    pub graph_context: bool,
    /// Enable namespace-aware term handling via the namespace registry.
    pub namespace_aware: bool,
    /// Similarity metric used for vector comparisons.
    pub default_metric: SimilarityMetric,
    /// Maximum number of cached entries.
    pub cache_size: usize,
}

impl Default for RdfVectorConfig {
    fn default() -> Self {
        Self {
            uri_decomposition: true,
            include_literal_types: true,
            graph_context: true,
            namespace_aware: true,
            default_metric: SimilarityMetric::Cosine,
            cache_size: 10000,
        }
    }
}

/// Mapping between an RDF term and its stored vector.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RdfTermMapping {
    /// The RDF term.
    pub term: Term,
    /// Identifier of the vector stored for this term.
    pub vector_id: VectorId,
    /// Named graph the term was registered in, if any.
    pub graph_context: Option<GraphName>,
    /// Metadata extracted from the term.
    pub metadata: RdfTermMetadata,
}

/// Metadata describing an RDF term.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RdfTermMetadata {
    /// Kind of RDF term.
    pub term_type: RdfTermType,
    /// Namespace portion of a URI, if applicable.
    pub namespace: Option<String>,
    /// Local name portion of a URI, if applicable.
    pub local_name: Option<String>,
    /// Datatype of a literal, if applicable.
    pub datatype: Option<NamedNode>,
    /// Language tag of a literal, if applicable.
    pub language: Option<String>,
    /// Heuristic complexity score in the range [0.0, 1.0].
    pub complexity_score: f32,
}

/// Kind of RDF term.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum RdfTermType {
    NamedNode,
    BlankNode,
    Literal,
    Variable,
    QuotedTriple,
}

/// Result of a vector similarity search over RDF terms.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RdfVectorSearchResult {
    /// The matching RDF term.
    pub term: Term,
    /// Similarity score.
    pub score: f32,
    /// Identifier of the matching vector.
    pub vector_id: VectorId,
    /// Named graph the term belongs to, if any.
    pub graph_context: Option<GraphName>,
    /// Additional information about the search.
    pub metadata: SearchMetadata,
}

/// Additional information attached to each search result.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchMetadata {
    /// Algorithm used to produce the result.
    pub algorithm: String,
    /// Processing time in microseconds.
    pub processing_time_us: u64,
    /// Confidence estimate in the range [0.0, 1.0].
    pub confidence: f32,
    /// Optional human-readable explanation.
    pub explanation: Option<String>,
}
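
/// Integration layer that maps RDF terms to vectors in a backing store and
/// supports similarity search over them.
///
/// Illustrative usage (a sketch, not a doctest; it assumes the crate's
/// `VectorStore` implementing `VectorStoreTrait`, as in the tests below):
///
/// ```ignore
/// let integration = RdfVectorIntegration::new(
///     RdfVectorConfig::default(),
///     Arc::new(RwLock::new(VectorStore::new())),
/// );
/// let term = Term::NamedNode(NamedNode::new("http://example.org/person")?);
/// integration.register_term(term.clone(), Vector::new(vec![1.0, 0.0, 0.0]), None)?;
/// let similar = integration.find_similar_terms(&term, 10, None, None)?;
/// ```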
pub struct RdfVectorIntegration {
    config: RdfVectorConfig,
    /// Mappings from term hashes to term/vector mappings.
    term_mappings: Arc<RwLock<HashMap<TermHash, RdfTermMapping>>>,
    /// Reverse mappings from vector ids back to terms.
    vector_mappings: Arc<RwLock<HashMap<VectorId, RdfTermMapping>>>,
    /// Vector ids grouped by named graph.
    graph_cache: Arc<RwLock<HashMap<GraphName, HashSet<VectorId>>>>,
    /// Registered namespace prefixes mapped to their URIs.
    namespace_registry: Arc<RwLock<HashMap<String, String>>>,
    /// Backing vector store.
    vector_store: Arc<RwLock<dyn VectorStoreTrait>>,
}

/// Compact hash-based key used to index RDF terms.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
struct TermHash(u64);

impl TermHash {
    fn from_term(term: &Term) -> Self {
        use std::collections::hash_map::DefaultHasher;
        let mut hasher = DefaultHasher::new();

        match term {
            Term::NamedNode(node) => {
                "NamedNode".hash(&mut hasher);
                node.as_str().hash(&mut hasher);
            }
            Term::BlankNode(node) => {
                "BlankNode".hash(&mut hasher);
                node.as_str().hash(&mut hasher);
            }
            Term::Literal(literal) => {
                "Literal".hash(&mut hasher);
                literal.value().hash(&mut hasher);
                if let Some(lang) = literal.language() {
                    lang.hash(&mut hasher);
                }
                literal.datatype().as_str().hash(&mut hasher);
            }
            Term::Variable(var) => {
                "Variable".hash(&mut hasher);
                var.as_str().hash(&mut hasher);
            }
            Term::QuotedTriple(_) => {
                // All quoted triples currently hash to the same bucket.
                "QuotedTriple".hash(&mut hasher);
                "quoted_triple".hash(&mut hasher);
            }
        }

        TermHash(hasher.finish())
    }
}

impl RdfVectorIntegration {
    /// Creates a new integration backed by the given vector store.
    pub fn new(config: RdfVectorConfig, vector_store: Arc<RwLock<dyn VectorStoreTrait>>) -> Self {
        Self {
            config,
            term_mappings: Arc::new(RwLock::new(HashMap::new())),
            vector_mappings: Arc::new(RwLock::new(HashMap::new())),
            graph_cache: Arc::new(RwLock::new(HashMap::new())),
            namespace_registry: Arc::new(RwLock::new(HashMap::new())),
            vector_store,
        }
    }

    /// Registers an RDF term together with its vector and optional graph
    /// context, returning the id of the stored vector.
    pub fn register_term(
        &self,
        term: Term,
        vector: Vector,
        graph_context: Option<GraphName>,
    ) -> Result<VectorId> {
        let vector_id = self.vector_store.write().unwrap().add_vector(vector)?;
        let metadata = self.extract_term_metadata(&term)?;

        let mapping = RdfTermMapping {
            term: term.clone(),
            vector_id: vector_id.clone(),
            graph_context: graph_context.clone(),
            metadata,
        };

        let term_hash = TermHash::from_term(&term);

        // Index by term hash for term -> vector lookups.
        {
            let mut term_mappings = self.term_mappings.write().unwrap();
            term_mappings.insert(term_hash, mapping.clone());
        }

        // Index by vector id for vector -> term lookups.
        {
            let mut vector_mappings = self.vector_mappings.write().unwrap();
            vector_mappings.insert(vector_id.clone(), mapping);
        }

        // Track graph membership when a graph context is provided.
        if let Some(graph) = graph_context {
            let mut graph_cache = self.graph_cache.write().unwrap();
            graph_cache
                .entry(graph)
                .or_default()
                .insert(vector_id.clone());
        }

        Ok(vector_id)
    }

    /// Finds terms whose vectors are most similar to the vector registered
    /// for `query_term`, optionally restricted to a named graph and filtered
    /// by a minimum similarity threshold.
    pub fn find_similar_terms(
        &self,
        query_term: &Term,
        limit: usize,
        threshold: Option<f32>,
        graph_context: Option<&GraphName>,
    ) -> Result<Vec<RdfVectorSearchResult>> {
        let start_time = std::time::Instant::now();

        let query_vector_id = self
            .get_vector_id(query_term)?
            .ok_or_else(|| anyhow!("Query term not found in vector store"))?;

        let query_vector = self
            .vector_store
            .read()
            .unwrap()
            .get_vector(&query_vector_id)?
            .ok_or_else(|| anyhow!("Query vector not found"))?;

        // Restrict candidates to the requested graph when a context is given.
        let candidate_vectors = if let Some(graph) = graph_context {
            let graph_cache = self.graph_cache.read().unwrap();
            graph_cache
                .get(graph)
                .map(|set| set.iter().cloned().collect::<Vec<_>>())
                .unwrap_or_default()
        } else {
            self.vector_store.read().unwrap().get_all_vector_ids()?
        };

        let mut results = Vec::new();
        for vector_id in candidate_vectors {
            // Skip the query term itself.
            if vector_id == query_vector_id {
                continue;
            }

            if let Ok(Some(vector)) = self.vector_store.read().unwrap().get_vector(&vector_id) {
                let similarity = self.config.default_metric.compute(&query_vector, &vector)?;

                if let Some(thresh) = threshold {
                    if similarity < thresh {
                        continue;
                    }
                }

                let vector_mappings = self.vector_mappings.read().unwrap();
                if let Some(mapping) = vector_mappings.get(&vector_id) {
                    let processing_time = start_time.elapsed().as_micros() as u64;

                    results.push(RdfVectorSearchResult {
                        term: mapping.term.clone(),
                        score: similarity,
                        vector_id: vector_id.clone(),
                        graph_context: mapping.graph_context.clone(),
                        metadata: SearchMetadata {
                            algorithm: "vector_similarity".to_string(),
                            processing_time_us: processing_time,
                            confidence: self.calculate_confidence(similarity, &mapping.metadata),
                            explanation: self.generate_explanation(&mapping.metadata, similarity),
                        },
                    });
                }
            }
        }

        // Sort by descending similarity and keep the top `limit` results.
        results.sort_by(|a, b| {
            b.score
                .partial_cmp(&a.score)
                .unwrap_or(std::cmp::Ordering::Equal)
        });

        results.truncate(limit);

        Ok(results)
    }
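
    /// Searches registered terms by free-text similarity: the query text is
    /// embedded with `generate_text_embedding` and compared against all
    /// candidate vectors.
    ///
    /// Illustrative call (a sketch, not a doctest):
    ///
    /// ```ignore
    /// let hits = integration.search_by_text("person", 5, Some(0.2), None)?;
    /// ```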
    pub fn search_by_text(
        &self,
        query_text: &str,
        limit: usize,
        threshold: Option<f32>,
        graph_context: Option<&GraphName>,
    ) -> Result<Vec<RdfVectorSearchResult>> {
        let literal = Literal::new_simple_literal(query_text);
        let _query_term = Term::Literal(literal);

        let query_vector = self.generate_text_embedding(query_text)?;

        // Temporarily add the query vector so it can be compared through the store.
        let temp_vector_id = self
            .vector_store
            .write()
            .unwrap()
            .add_vector(query_vector.clone())?;

        let candidate_vectors = if let Some(graph) = graph_context {
            let graph_cache = self.graph_cache.read().unwrap();
            graph_cache
                .get(graph)
                .map(|set| set.iter().cloned().collect::<Vec<_>>())
                .unwrap_or_default()
        } else {
            self.vector_store.read().unwrap().get_all_vector_ids()?
        };

        let mut results = Vec::new();
        let start_time = std::time::Instant::now();

        for vector_id in candidate_vectors {
            if let Ok(Some(vector)) = self.vector_store.read().unwrap().get_vector(&vector_id) {
                let similarity = self.config.default_metric.compute(&query_vector, &vector)?;

                if let Some(thresh) = threshold {
                    if similarity < thresh {
                        continue;
                    }
                }

                let vector_mappings = self.vector_mappings.read().unwrap();
                if let Some(mapping) = vector_mappings.get(&vector_id) {
                    let processing_time = start_time.elapsed().as_micros() as u64;

                    results.push(RdfVectorSearchResult {
                        term: mapping.term.clone(),
                        score: similarity,
                        vector_id: vector_id.clone(),
                        graph_context: mapping.graph_context.clone(),
                        metadata: SearchMetadata {
                            algorithm: "text_similarity".to_string(),
                            processing_time_us: processing_time,
                            confidence: self.calculate_confidence(similarity, &mapping.metadata),
                            explanation: Some(format!("Text similarity match: '{query_text}'")),
                        },
                    });
                }
            }
        }

        // Remove the temporary query vector; a failure here is not fatal.
        let _ = self
            .vector_store
            .write()
            .unwrap()
            .remove_vector(&temp_vector_id);

        results.sort_by(|a, b| {
            b.score
                .partial_cmp(&a.score)
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        results.truncate(limit);

        Ok(results)
    }

    /// Returns the vector id registered for `term`, if any.
    pub fn get_vector_id(&self, term: &Term) -> Result<Option<VectorId>> {
        let term_hash = TermHash::from_term(term);
        let term_mappings = self.term_mappings.read().unwrap();
        Ok(term_mappings
            .get(&term_hash)
            .map(|mapping| mapping.vector_id.clone()))
    }

    /// Returns the term registered for `vector_id`, if any.
    pub fn get_term(&self, vector_id: VectorId) -> Result<Option<Term>> {
        let vector_mappings = self.vector_mappings.read().unwrap();
        Ok(vector_mappings
            .get(&vector_id)
            .map(|mapping| mapping.term.clone()))
    }

    /// Registers a namespace prefix and its URI in the namespace registry.
    pub fn register_namespace(&self, prefix: String, uri: String) -> Result<()> {
        let mut registry = self.namespace_registry.write().unwrap();
        registry.insert(prefix, uri);
        Ok(())
    }

    /// Extracts type, namespace, datatype, language, and complexity metadata
    /// from an RDF term.
    fn extract_term_metadata(&self, term: &Term) -> Result<RdfTermMetadata> {
        match term {
            Term::NamedNode(node) => {
                let uri = node.as_str();
                let (namespace, local_name) = self.split_uri(uri);

                Ok(RdfTermMetadata {
                    term_type: RdfTermType::NamedNode,
                    namespace,
                    local_name,
                    datatype: None,
                    language: None,
                    complexity_score: self.calculate_uri_complexity(uri),
                })
            }
            Term::BlankNode(_) => Ok(RdfTermMetadata {
                term_type: RdfTermType::BlankNode,
                namespace: None,
                local_name: None,
                datatype: None,
                language: None,
                // Moderate default complexity for blank nodes.
                complexity_score: 0.5,
            }),
            Term::Literal(literal) => Ok(RdfTermMetadata {
                term_type: RdfTermType::Literal,
                namespace: None,
                local_name: None,
                datatype: Some(literal.datatype().into()),
                language: literal.language().map(|s| s.to_string()),
                complexity_score: self.calculate_literal_complexity(literal),
            }),
            Term::Variable(_) => Ok(RdfTermMetadata {
                term_type: RdfTermType::Variable,
                namespace: None,
                local_name: None,
                datatype: None,
                language: None,
                // Low default complexity for variables.
                complexity_score: 0.3,
            }),
            Term::QuotedTriple(_) => Ok(RdfTermMetadata {
                term_type: RdfTermType::QuotedTriple,
                namespace: None,
                local_name: None,
                datatype: None,
                language: None,
                // Quoted triples are treated as the most complex term kind.
                complexity_score: 1.0,
            }),
        }
    }

    /// Splits a URI into namespace and local name at the last '#' or '/'.
    fn split_uri(&self, uri: &str) -> (Option<String>, Option<String>) {
        if let Some(pos) = uri.rfind(&['#', '/'][..]) {
            let namespace = uri[..pos + 1].to_string();
            let local_name = uri[pos + 1..].to_string();
            (Some(namespace), Some(local_name))
        } else {
            (None, Some(uri.to_string()))
        }
    }

    /// Heuristic URI complexity based on length, segment count, and the
    /// presence of query parameters, clamped to [0.0, 1.0].
    fn calculate_uri_complexity(&self, uri: &str) -> f32 {
        let length_factor = (uri.len() as f32 / 100.0).min(1.0);
        let segment_count = uri.matches(&['/', '#'][..]).count() as f32 / 10.0;
        let query_params = if uri.contains('?') { 0.2 } else { 0.0 };

        (length_factor + segment_count + query_params).min(1.0)
    }

    /// Heuristic literal complexity based on value length, datatype, and
    /// language tag, clamped to [0.0, 1.0].
    fn calculate_literal_complexity(&self, literal: &Literal) -> f32 {
        let value_length = literal.value().len() as f32 / 200.0;
        let datatype_complexity =
            if literal.datatype().as_str() == "http://www.w3.org/2001/XMLSchema#string" {
                0.3
            } else {
                0.7
            };
        let language_bonus = if literal.language().is_some() {
            0.2
        } else {
            0.0
        };

        (value_length + datatype_complexity + language_bonus).min(1.0)
    }

    /// Derives a confidence score from the raw similarity plus complexity and
    /// term-type bonuses, clamped to [0.0, 1.0].
    fn calculate_confidence(&self, similarity: f32, metadata: &RdfTermMetadata) -> f32 {
        let base_confidence = similarity;
        let complexity_bonus = metadata.complexity_score * 0.1;
        let type_bonus = match metadata.term_type {
            RdfTermType::NamedNode => 0.1,
            RdfTermType::Literal => 0.05,
            RdfTermType::BlankNode => 0.02,
            RdfTermType::Variable => 0.01,
            RdfTermType::QuotedTriple => 0.15,
        };

        (base_confidence + complexity_bonus + type_bonus).min(1.0)
    }

    /// Builds a human-readable explanation of a match.
    fn generate_explanation(&self, metadata: &RdfTermMetadata, similarity: f32) -> Option<String> {
        let term_type_str = match metadata.term_type {
            RdfTermType::NamedNode => "Named Node",
            RdfTermType::BlankNode => "Blank Node",
            RdfTermType::Literal => "Literal",
            RdfTermType::Variable => "Variable",
            RdfTermType::QuotedTriple => "Quoted Triple",
        };

        let mut explanation = format!(
            "{} with {:.2}% similarity",
            term_type_str,
            similarity * 100.0
        );

        if let Some(namespace) = &metadata.namespace {
            explanation.push_str(&format!(", namespace: {namespace}"));
        }

        if let Some(language) = &metadata.language {
            explanation.push_str(&format!(", language: {language}"));
        }

        Some(explanation)
    }

    /// Generates a deterministic, hash-based bag-of-words embedding for `text`.
    fn generate_text_embedding(&self, text: &str) -> Result<Vector> {
        let words: Vec<&str> = text.split_whitespace().collect();
        // Fixed embedding dimensionality.
        let dimension = 384;
        let mut vector_data = vec![0.0; dimension];

        for word in words.iter() {
            let word_hash = {
                use std::collections::hash_map::DefaultHasher;
                let mut hasher = DefaultHasher::new();
                word.hash(&mut hasher);
                hasher.finish()
            };

            // Spread each word's contribution across the vector, offset by its hash.
            for j in 0..dimension {
                let index = (word_hash as usize + j) % dimension;
                vector_data[index] += 1.0 / (words.len() as f32);
            }
        }

        // L2-normalize the embedding.
        let norm: f32 = vector_data.iter().map(|x| x * x).sum::<f32>().sqrt();
        if norm > 0.0 {
            for value in &mut vector_data {
                *value /= norm;
            }
        }

        Ok(Vector::new(vector_data))
    }

    /// Returns summary statistics about registered terms, graphs, and namespaces.
    pub fn get_statistics(&self) -> RdfIntegrationStats {
        let term_mappings = self.term_mappings.read().unwrap();
        let graph_cache = self.graph_cache.read().unwrap();
        let namespace_registry = self.namespace_registry.read().unwrap();

        let mut type_counts = HashMap::new();
        for mapping in term_mappings.values() {
            *type_counts.entry(mapping.metadata.term_type).or_insert(0) += 1;
        }

        RdfIntegrationStats {
            total_terms: term_mappings.len(),
            total_graphs: graph_cache.len(),
            total_namespaces: namespace_registry.len(),
            type_distribution: type_counts,
            // Placeholder value; hit-rate tracking is not implemented here.
            cache_hit_ratio: 0.95,
        }
    }
}

/// Summary statistics for the RDF vector integration.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RdfIntegrationStats {
    pub total_terms: usize,
    pub total_graphs: usize,
    pub total_namespaces: usize,
    pub type_distribution: HashMap<RdfTermType, usize>,
    pub cache_hit_ratio: f32,
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::VectorStore;
    use oxirs_core::model::{NamedNode, Term};

    #[test]
    fn test_rdf_term_registration() {
        let config = RdfVectorConfig::default();
        let vector_store = Arc::new(RwLock::new(VectorStore::new()));
        let integration = RdfVectorIntegration::new(config, vector_store);

        let named_node = NamedNode::new("http://example.org/person").unwrap();
        let term = Term::NamedNode(named_node);
        let vector = Vector::new(vec![1.0, 0.0, 0.0]);

        let vector_id = integration
            .register_term(term.clone(), vector, None)
            .unwrap();

        assert!(integration.get_vector_id(&term).unwrap().is_some());
        assert_eq!(
            integration.get_vector_id(&term).unwrap().unwrap(),
            vector_id
        );
    }

    #[test]
    fn test_uri_splitting() {
        let config = RdfVectorConfig::default();
        let vector_store = Arc::new(RwLock::new(VectorStore::new()));
        let integration = RdfVectorIntegration::new(config, vector_store);

        let (namespace, local_name) = integration.split_uri("http://example.org/ontology#Person");
        assert_eq!(namespace, Some("http://example.org/ontology#".to_string()));
        assert_eq!(local_name, Some("Person".to_string()));
    }

    #[test]
    fn test_metadata_extraction() {
        let config = RdfVectorConfig::default();
        let vector_store = Arc::new(RwLock::new(VectorStore::new()));
        let integration = RdfVectorIntegration::new(config, vector_store);

        let literal = Literal::new_language_tagged_literal("Hello", "en").unwrap();
        let term = Term::Literal(literal);

        let metadata = integration.extract_term_metadata(&term).unwrap();
        assert_eq!(metadata.term_type, RdfTermType::Literal);
        assert_eq!(metadata.language, Some("en".to_string()));
    }
}