use crate::{similarity::SimilarityMetric, Vector, VectorId, VectorStoreTrait};
use anyhow::{anyhow, Result};
use oxirs_core::model::{GraphName, Literal, NamedNode, Term};
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};
use std::hash::{Hash, Hasher};
use std::sync::{Arc, RwLock};
/// Configuration options for the RDF term <-> vector integration layer.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RdfVectorConfig {
    /// Decompose URIs into namespace + local name (not read in this module yet — TODO confirm intended use).
    pub uri_decomposition: bool,
    /// Include literal datatype information (not read in this module yet).
    pub include_literal_types: bool,
    /// Track per-graph vector membership (not read in this module yet).
    pub graph_context: bool,
    /// Enable namespace-aware handling (not read in this module yet).
    pub namespace_aware: bool,
    /// Similarity metric applied to all vector comparisons during search.
    pub default_metric: SimilarityMetric,
    /// Intended cache capacity (not read in this module yet).
    pub cache_size: usize,
}
impl Default for RdfVectorConfig {
fn default() -> Self {
Self {
uri_decomposition: true,
include_literal_types: true,
graph_context: true,
namespace_aware: true,
default_metric: SimilarityMetric::Cosine,
cache_size: 10000,
}
}
}
/// Associates an RDF term with the id of its stored vector.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RdfTermMapping {
    /// The RDF term this mapping describes.
    pub term: Term,
    /// Id of the vector stored for the term in the backing vector store.
    pub vector_id: VectorId,
    /// Graph the term was registered under, if any.
    pub graph_context: Option<GraphName>,
    /// Metadata derived from the term (type, namespace, complexity score, ...).
    pub metadata: RdfTermMetadata,
}
/// Metadata extracted from an RDF term at registration time.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RdfTermMetadata {
    /// Which kind of RDF term this is.
    pub term_type: RdfTermType,
    /// Namespace portion of a named node's URI (up to and including the last `#` or `/`).
    pub namespace: Option<String>,
    /// Local-name portion of a named node's URI (after the last `#` or `/`).
    pub local_name: Option<String>,
    /// Datatype IRI, populated for literals only.
    pub datatype: Option<NamedNode>,
    /// Language tag, populated for language-tagged literals only.
    pub language: Option<String>,
    /// Heuristic complexity estimate, clamped to [0, 1].
    pub complexity_score: f32,
}
/// The syntactic category of an RDF term (mirrors the variants of [`Term`]).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum RdfTermType {
    NamedNode,
    BlankNode,
    Literal,
    Variable,
    QuotedTriple,
}
/// One hit returned by a similarity search over registered RDF terms.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RdfVectorSearchResult {
    /// The matching term.
    pub term: Term,
    /// Similarity score as computed by the configured metric (higher is better).
    pub score: f32,
    /// Id of the matching term's vector.
    pub vector_id: VectorId,
    /// Graph the matching term was registered under, if any.
    pub graph_context: Option<GraphName>,
    /// Diagnostic information about how this result was produced.
    pub metadata: SearchMetadata,
}
/// Diagnostics attached to each search result.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchMetadata {
    /// Name of the algorithm that produced the result (e.g. "vector_similarity").
    pub algorithm: String,
    /// Elapsed microseconds from search start to when this result was produced.
    pub processing_time_us: u64,
    /// Heuristic confidence derived from score plus term-type/complexity bonuses.
    pub confidence: f32,
    /// Optional human-readable explanation of the match.
    pub explanation: Option<String>,
}
/// Bridges RDF terms and a vector store: registers term embeddings and
/// answers similarity queries over them.
pub struct RdfVectorIntegration {
    // Behavioural configuration (metric, feature flags).
    config: RdfVectorConfig,
    // Term-hash -> mapping, for term -> vector lookups.
    term_mappings: Arc<RwLock<HashMap<TermHash, RdfTermMapping>>>,
    // Vector id -> mapping, for vector -> term lookups.
    vector_mappings: Arc<RwLock<HashMap<VectorId, RdfTermMapping>>>,
    // Per-graph index of the vector ids registered under that graph.
    graph_cache: Arc<RwLock<HashMap<GraphName, HashSet<VectorId>>>>,
    // Prefix -> namespace URI registry.
    namespace_registry: Arc<RwLock<HashMap<String, String>>>,
    // The backing store holding the actual vectors.
    vector_store: Arc<RwLock<dyn VectorStoreTrait>>,
}
/// Compact, hashable fingerprint of an RDF term, used as the key for term lookups.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
struct TermHash(u64);
impl TermHash {
    /// Computes a stable fingerprint for an RDF term.
    ///
    /// A variant tag is hashed first so terms of different kinds with equal
    /// content cannot collide; the term's own content follows.
    fn from_term(term: &Term) -> Self {
        use std::collections::hash_map::DefaultHasher;
        let mut hasher = DefaultHasher::new();
        match term {
            Term::NamedNode(node) => {
                "NamedNode".hash(&mut hasher);
                node.as_str().hash(&mut hasher);
            }
            Term::BlankNode(node) => {
                "BlankNode".hash(&mut hasher);
                node.as_str().hash(&mut hasher);
            }
            Term::Literal(literal) => {
                "Literal".hash(&mut hasher);
                literal.value().hash(&mut hasher);
                if let Some(lang) = literal.language() {
                    lang.hash(&mut hasher);
                }
                literal.datatype().as_str().hash(&mut hasher);
            }
            Term::Variable(var) => {
                "Variable".hash(&mut hasher);
                var.as_str().hash(&mut hasher);
            }
            Term::QuotedTriple(triple) => {
                "QuotedTriple".hash(&mut hasher);
                // Hash the triple's content (via its Debug rendering) rather
                // than a fixed sentinel: previously every quoted triple hashed
                // identically, so registering a second quoted triple silently
                // overwrote the first entry in `term_mappings`.
                format!("{triple:?}").hash(&mut hasher);
            }
        }
        TermHash(hasher.finish())
    }
}
impl RdfVectorIntegration {
    /// Creates a new integration layer backed by the given vector store.
    pub fn new(config: RdfVectorConfig, vector_store: Arc<RwLock<dyn VectorStoreTrait>>) -> Self {
        Self {
            config,
            term_mappings: Arc::new(RwLock::new(HashMap::new())),
            vector_mappings: Arc::new(RwLock::new(HashMap::new())),
            graph_cache: Arc::new(RwLock::new(HashMap::new())),
            namespace_registry: Arc::new(RwLock::new(HashMap::new())),
            vector_store,
        }
    }

    /// Stores `vector` and records the bidirectional term <-> vector mapping,
    /// optionally indexing the id under `graph_context`.
    ///
    /// # Errors
    /// Propagates failures from the underlying vector store.
    pub fn register_term(
        &self,
        term: Term,
        vector: Vector,
        graph_context: Option<GraphName>,
    ) -> Result<VectorId> {
        // Extract metadata before touching the store so a metadata failure
        // cannot leave an orphaned vector behind.
        let metadata = self.extract_term_metadata(&term)?;
        let vector_id = self
            .vector_store
            .write()
            .expect("lock poisoned")
            .add_vector(vector)?;
        let mapping = RdfTermMapping {
            term: term.clone(),
            vector_id: vector_id.clone(),
            graph_context: graph_context.clone(),
            metadata,
        };
        let term_hash = TermHash::from_term(&term);
        // Scope each write lock so only one map is held at a time.
        {
            let mut term_mappings = self.term_mappings.write().expect("lock poisoned");
            term_mappings.insert(term_hash, mapping.clone());
        }
        {
            let mut vector_mappings = self.vector_mappings.write().expect("lock poisoned");
            vector_mappings.insert(vector_id.clone(), mapping);
        }
        if let Some(graph) = graph_context {
            let mut graph_cache = self.graph_cache.write().expect("lock poisoned");
            graph_cache
                .entry(graph)
                .or_default()
                .insert(vector_id.clone());
        }
        Ok(vector_id)
    }

    /// Finds registered terms whose vectors are most similar to `query_term`'s.
    ///
    /// `threshold`, when given, drops candidates scoring below it.
    /// `graph_context`, when given, restricts candidates to vectors registered
    /// under that graph. Results are sorted by descending score and truncated
    /// to `limit`.
    ///
    /// # Errors
    /// Fails if the query term (or its vector) is not registered, or if the
    /// similarity metric fails.
    pub fn find_similar_terms(
        &self,
        query_term: &Term,
        limit: usize,
        threshold: Option<f32>,
        graph_context: Option<&GraphName>,
    ) -> Result<Vec<RdfVectorSearchResult>> {
        let start_time = std::time::Instant::now();
        let query_vector_id = self
            .get_vector_id(query_term)?
            .ok_or_else(|| anyhow!("Query term not found in vector store"))?;
        let query_vector = self
            .vector_store
            .read()
            .expect("lock poisoned")
            .get_vector(&query_vector_id)?
            .ok_or_else(|| anyhow!("Query vector not found"))?;
        let candidate_vectors = self.candidate_ids(graph_context)?;
        // Acquire the read locks once, outside the loop, instead of once per
        // candidate (the previous version re-locked `vector_mappings` and the
        // store on every iteration).
        let vector_mappings = self.vector_mappings.read().expect("lock poisoned");
        let vector_store = self.vector_store.read().expect("lock poisoned");
        let mut results = Vec::new();
        for vector_id in candidate_vectors {
            // Skip the query's own vector. (Previously written `*vector_id`,
            // which needlessly dereferenced the owned id.)
            if vector_id == query_vector_id {
                continue;
            }
            if let Ok(Some(vector)) = vector_store.get_vector(&vector_id) {
                let similarity = self.config.default_metric.compute(&query_vector, &vector)?;
                if let Some(thresh) = threshold {
                    if similarity < thresh {
                        continue;
                    }
                }
                // Only ids with a registered term mapping can produce results.
                if let Some(mapping) = vector_mappings.get(&vector_id) {
                    let processing_time = start_time.elapsed().as_micros() as u64;
                    results.push(RdfVectorSearchResult {
                        term: mapping.term.clone(),
                        score: similarity,
                        vector_id: vector_id.clone(),
                        graph_context: mapping.graph_context.clone(),
                        metadata: SearchMetadata {
                            algorithm: "vector_similarity".to_string(),
                            processing_time_us: processing_time,
                            confidence: self.calculate_confidence(similarity, &mapping.metadata),
                            explanation: self.generate_explanation(&mapping.metadata, similarity),
                        },
                    });
                }
            }
        }
        // Highest score first; NaN-safe comparison falls back to Equal.
        results.sort_by(|a, b| {
            b.score
                .partial_cmp(&a.score)
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        results.truncate(limit);
        Ok(results)
    }

    /// Finds registered terms whose vectors are similar to an embedding of
    /// `query_text`.
    ///
    /// The query embedding is computed on the fly and never written to the
    /// store. (The previous implementation inserted a temporary vector and
    /// removed it after the loop, which leaked the vector whenever an error
    /// exited the function early.)
    ///
    /// # Errors
    /// Fails if the similarity metric fails on any candidate.
    pub fn search_by_text(
        &self,
        query_text: &str,
        limit: usize,
        threshold: Option<f32>,
        graph_context: Option<&GraphName>,
    ) -> Result<Vec<RdfVectorSearchResult>> {
        let query_vector = self.generate_text_embedding(query_text)?;
        let candidate_vectors = self.candidate_ids(graph_context)?;
        let vector_mappings = self.vector_mappings.read().expect("lock poisoned");
        let vector_store = self.vector_store.read().expect("lock poisoned");
        let mut results = Vec::new();
        let start_time = std::time::Instant::now();
        for vector_id in candidate_vectors {
            if let Ok(Some(vector)) = vector_store.get_vector(&vector_id) {
                let similarity = self.config.default_metric.compute(&query_vector, &vector)?;
                if let Some(thresh) = threshold {
                    if similarity < thresh {
                        continue;
                    }
                }
                if let Some(mapping) = vector_mappings.get(&vector_id) {
                    let processing_time = start_time.elapsed().as_micros() as u64;
                    results.push(RdfVectorSearchResult {
                        term: mapping.term.clone(),
                        score: similarity,
                        vector_id: vector_id.clone(),
                        graph_context: mapping.graph_context.clone(),
                        metadata: SearchMetadata {
                            algorithm: "text_similarity".to_string(),
                            processing_time_us: processing_time,
                            confidence: self.calculate_confidence(similarity, &mapping.metadata),
                            explanation: Some(format!("Text similarity match: '{query_text}'")),
                        },
                    });
                }
            }
        }
        results.sort_by(|a, b| {
            b.score
                .partial_cmp(&a.score)
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        results.truncate(limit);
        Ok(results)
    }

    /// Collects candidate vector ids for a search: the ids cached for
    /// `graph_context` when given, otherwise every id in the store.
    fn candidate_ids(&self, graph_context: Option<&GraphName>) -> Result<Vec<VectorId>> {
        match graph_context {
            Some(graph) => {
                let graph_cache = self.graph_cache.read().expect("lock poisoned");
                Ok(graph_cache
                    .get(graph)
                    .map(|set| set.iter().cloned().collect())
                    .unwrap_or_default())
            }
            None => self
                .vector_store
                .read()
                .expect("lock poisoned")
                .get_all_vector_ids(),
        }
    }

    /// Looks up the vector id registered for `term`, if any.
    pub fn get_vector_id(&self, term: &Term) -> Result<Option<VectorId>> {
        let term_hash = TermHash::from_term(term);
        let term_mappings = self.term_mappings.read().expect("lock poisoned");
        Ok(term_mappings
            .get(&term_hash)
            .map(|mapping| mapping.vector_id.clone()))
    }

    /// Looks up the term registered for `vector_id`, if any.
    pub fn get_term(&self, vector_id: VectorId) -> Result<Option<Term>> {
        let vector_mappings = self.vector_mappings.read().expect("lock poisoned");
        Ok(vector_mappings
            .get(&vector_id)
            .map(|mapping| mapping.term.clone()))
    }

    /// Records a prefix -> namespace-URI association, overwriting any
    /// previous entry for the same prefix.
    pub fn register_namespace(&self, prefix: String, uri: String) -> Result<()> {
        let mut registry = self.namespace_registry.write().expect("lock poisoned");
        registry.insert(prefix, uri);
        Ok(())
    }

    /// Derives [`RdfTermMetadata`] for a term. Infallible today; returns
    /// `Result` to leave room for fallible extraction later.
    fn extract_term_metadata(&self, term: &Term) -> Result<RdfTermMetadata> {
        match term {
            Term::NamedNode(node) => {
                let uri = node.as_str();
                let (namespace, local_name) = self.split_uri(uri);
                Ok(RdfTermMetadata {
                    term_type: RdfTermType::NamedNode,
                    namespace,
                    local_name,
                    datatype: None,
                    language: None,
                    complexity_score: self.calculate_uri_complexity(uri),
                })
            }
            Term::BlankNode(_) => Ok(RdfTermMetadata {
                term_type: RdfTermType::BlankNode,
                namespace: None,
                local_name: None,
                datatype: None,
                language: None,
                complexity_score: 0.5, // fixed mid-range score for anonymous nodes
            }),
            Term::Literal(literal) => Ok(RdfTermMetadata {
                term_type: RdfTermType::Literal,
                namespace: None,
                local_name: None,
                datatype: Some(literal.datatype().into()),
                language: literal.language().map(|s| s.to_string()),
                complexity_score: self.calculate_literal_complexity(literal),
            }),
            Term::Variable(_) => Ok(RdfTermMetadata {
                term_type: RdfTermType::Variable,
                namespace: None,
                local_name: None,
                datatype: None,
                language: None,
                complexity_score: 0.3, // variables carry little structure
            }),
            Term::QuotedTriple(_) => Ok(RdfTermMetadata {
                term_type: RdfTermType::QuotedTriple,
                namespace: None,
                local_name: None,
                datatype: None,
                language: None,
                complexity_score: 1.0, // nested triples are maximally complex
            }),
        }
    }

    /// Splits a URI at its last `#` or `/` into (namespace, local name).
    /// The namespace keeps the separator; a URI with no separator yields
    /// `(None, Some(uri))`.
    fn split_uri(&self, uri: &str) -> (Option<String>, Option<String>) {
        if let Some(pos) = uri.rfind(&['#', '/'][..]) {
            // `#` and `/` are single-byte ASCII, so `pos + 1` is a valid char boundary.
            let namespace = uri[..pos + 1].to_string();
            let local_name = uri[pos + 1..].to_string();
            (Some(namespace), Some(local_name))
        } else {
            (None, Some(uri.to_string()))
        }
    }

    /// Heuristic URI complexity in [0, 1]: grows with length, path depth,
    /// and the presence of query parameters.
    fn calculate_uri_complexity(&self, uri: &str) -> f32 {
        let length_factor = (uri.len() as f32 / 100.0).min(1.0);
        let segment_count = uri.matches(&['/', '#'][..]).count() as f32 / 10.0;
        let query_params = if uri.contains('?') { 0.2 } else { 0.0 };
        (length_factor + segment_count + query_params).min(1.0)
    }

    /// Heuristic literal complexity in [0, 1]: grows with value length,
    /// non-string datatypes, and the presence of a language tag.
    fn calculate_literal_complexity(&self, literal: &Literal) -> f32 {
        let value_length = literal.value().len() as f32 / 200.0;
        let datatype_complexity =
            if literal.datatype().as_str() == "http://www.w3.org/2001/XMLSchema#string" {
                0.3
            } else {
                0.7
            };
        let language_bonus = if literal.language().is_some() {
            0.2
        } else {
            0.0
        };
        (value_length + datatype_complexity + language_bonus).min(1.0)
    }

    /// Confidence in [0, 1]: the raw similarity plus small bonuses for term
    /// complexity and term type.
    fn calculate_confidence(&self, similarity: f32, metadata: &RdfTermMetadata) -> f32 {
        let base_confidence = similarity;
        let complexity_bonus = metadata.complexity_score * 0.1;
        let type_bonus = match metadata.term_type {
            RdfTermType::NamedNode => 0.1,
            RdfTermType::Literal => 0.05,
            RdfTermType::BlankNode => 0.02,
            RdfTermType::Variable => 0.01,
            RdfTermType::QuotedTriple => 0.15,
        };
        (base_confidence + complexity_bonus + type_bonus).min(1.0)
    }

    /// Builds a short human-readable description of a match: term type,
    /// similarity percentage, and (when present) namespace and language.
    fn generate_explanation(&self, metadata: &RdfTermMetadata, similarity: f32) -> Option<String> {
        let term_type_str = match metadata.term_type {
            RdfTermType::NamedNode => "Named Node",
            RdfTermType::BlankNode => "Blank Node",
            RdfTermType::Literal => "Literal",
            RdfTermType::Variable => "Variable",
            RdfTermType::QuotedTriple => "Quoted Triple",
        };
        let mut explanation = format!(
            "{} with {:.2}% similarity",
            term_type_str,
            similarity * 100.0
        );
        if let Some(namespace) = &metadata.namespace {
            explanation.push_str(&format!(", namespace: {namespace}"));
        }
        if let Some(language) = &metadata.language {
            explanation.push_str(&format!(", language: {language}"));
        }
        Some(explanation)
    }

    /// Produces a deterministic, L2-normalized bag-of-words embedding by
    /// hashing each word into a few dimensions. This is a lightweight stand-in
    /// for a real text-embedding model.
    fn generate_text_embedding(&self, text: &str) -> Result<Vector> {
        let words: Vec<&str> = text.split_whitespace().collect();
        let dimension = 384;
        let mut vector_data = vec![0.0; dimension];
        for word in words.iter() {
            let word_hash = {
                use std::collections::hash_map::DefaultHasher;
                let mut hasher = DefaultHasher::new();
                word.hash(&mut hasher);
                hasher.finish()
            };
            // Spread each word over a small number of hash-derived dimensions.
            // (Previously the inner loop swept every dimension — `(hash + j) %
            // dim` for j in 0..dim covers all indices — so every non-empty
            // text produced the identical uniform vector.)
            for j in 0..8 {
                let index = (word_hash as usize).wrapping_add(j * 47) % dimension;
                vector_data[index] += 1.0 / (words.len() as f32);
            }
        }
        // L2-normalize; an empty text yields the zero vector unchanged.
        let norm: f32 = vector_data.iter().map(|x| x * x).sum::<f32>().sqrt();
        if norm > 0.0 {
            for value in &mut vector_data {
                *value /= norm;
            }
        }
        Ok(Vector::new(vector_data))
    }

    /// Returns summary statistics over the currently registered terms.
    pub fn get_statistics(&self) -> RdfIntegrationStats {
        let term_mappings = self.term_mappings.read().expect("lock poisoned");
        let graph_cache = self.graph_cache.read().expect("lock poisoned");
        let namespace_registry = self.namespace_registry.read().expect("lock poisoned");
        let mut type_counts = HashMap::new();
        for mapping in term_mappings.values() {
            *type_counts.entry(mapping.metadata.term_type).or_insert(0) += 1;
        }
        RdfIntegrationStats {
            total_terms: term_mappings.len(),
            total_graphs: graph_cache.len(),
            total_namespaces: namespace_registry.len(),
            type_distribution: type_counts,
            // NOTE(review): placeholder — no cache instrumentation exists yet.
            cache_hit_ratio: 0.95,
        }
    }
}
/// Snapshot of integration-layer statistics, as returned by `get_statistics`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RdfIntegrationStats {
    /// Number of registered term mappings.
    pub total_terms: usize,
    /// Number of graphs with at least one registered vector.
    pub total_graphs: usize,
    /// Number of registered namespace prefixes.
    pub total_namespaces: usize,
    /// Count of registered terms per term type.
    pub type_distribution: HashMap<RdfTermType, usize>,
    /// Cache hit ratio — currently a hard-coded placeholder (0.95), not measured.
    pub cache_hit_ratio: f32,
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::VectorStore;
    use anyhow::Result;
    use oxirs_core::model::{NamedNode, Term};

    /// Builds a fresh integration over an empty in-memory store with defaults.
    fn new_integration() -> RdfVectorIntegration {
        RdfVectorIntegration::new(
            RdfVectorConfig::default(),
            Arc::new(RwLock::new(VectorStore::new())),
        )
    }

    #[test]
    fn test_rdf_term_registration() -> Result<()> {
        let integration = new_integration();
        let term = Term::NamedNode(NamedNode::new("http://example.org/person")?);
        let vector = Vector::new(vec![1.0, 0.0, 0.0]);
        let registered_id = integration.register_term(term.clone(), vector, None)?;
        // Round-trip: the registered term must resolve back to the same id.
        let looked_up = integration.get_vector_id(&term)?;
        assert!(looked_up.is_some());
        assert_eq!(looked_up.expect("mapping must exist"), registered_id);
        Ok(())
    }

    #[test]
    fn test_uri_splitting() {
        let integration = new_integration();
        let (namespace, local_name) =
            integration.split_uri("http://example.org/ontology#Person");
        assert_eq!(namespace.as_deref(), Some("http://example.org/ontology#"));
        assert_eq!(local_name.as_deref(), Some("Person"));
    }

    #[test]
    fn test_metadata_extraction() -> Result<()> {
        let integration = new_integration();
        let term = Term::Literal(Literal::new_language_tagged_literal("Hello", "en")?);
        let metadata = integration.extract_term_metadata(&term)?;
        assert_eq!(metadata.term_type, RdfTermType::Literal);
        assert_eq!(metadata.language.as_deref(), Some("en"));
        Ok(())
    }
}