use crate::converters::{ShExToUnified, ShaclToUnified};
use crate::unified_constraints::{UnifiedConstraint, UnifiedConstraintModel};
use crate::{DataGeneratorError, Result};
use shex_ast::ast::{ShapeDecl, ShapeExpr, TripleExpr};
use shex_ast::compact::ShExParser;
use std::collections::HashMap;
use std::path::Path;
/// Processed view of a single shape declaration.
///
/// Produced by `ShapeProcessor::process_single_shape`, which walks the shape's
/// triple expression and records the properties and shape references it finds.
#[derive(Debug, Clone)]
pub struct ShapeInfo {
/// Clone of the declaration this information was derived from.
pub declaration: ShapeDecl,
/// One entry per triple constraint whose value expression is a shape reference.
pub dependencies: Vec<ShapeDependency>,
/// One entry per triple constraint found in the shape's triple expression.
pub properties: Vec<PropertyInfo>,
}
/// A reference from one shape to another, carried by a triple constraint.
#[derive(Debug, Clone)]
pub struct ShapeDependency {
/// String form of the referenced shape.
pub target_shape: String,
/// String form of the predicate of the triple constraint holding the reference.
pub property: String,
/// Minimum cardinality: `Some(1)` when the constraint declares neither bound,
/// otherwise the declared minimum (which may be `None`).
pub min_cardinality: Option<i32>,
/// Maximum cardinality: `Some(1)` when the constraint declares neither bound,
/// otherwise the declared maximum (which may be `None`).
pub max_cardinality: Option<i32>,
}
/// Description of one triple constraint of a shape.
#[derive(Debug, Clone)]
pub struct PropertyInfo {
/// String form of the constraint's predicate.
pub property_iri: String,
/// Datatype IRI for literal-valued properties; `None` when the value is a
/// shape reference. The extractor in this file falls back to `xsd:string`
/// when no datatype is declared.
pub datatype: Option<String>,
/// String form of the referenced shape when the value expression is a
/// shape reference, `None` otherwise.
pub shape_ref: Option<String>,
/// Minimum cardinality: `Some(1)` when the constraint declares neither bound,
/// otherwise the declared minimum (which may be `None`).
pub min_cardinality: Option<i32>,
/// Maximum cardinality: `Some(1)` when the constraint declares neither bound,
/// otherwise the declared maximum (which may be `None`).
pub max_cardinality: Option<i32>,
/// Additional unified constraints. The extractor in this file always
/// leaves this empty.
pub constraints: Vec<UnifiedConstraint>,
}
/// Extracts per-shape information from ShEx schemas and loads ShEx/SHACL
/// schemas into a unified constraint model.
pub struct ShapeProcessor {
/// Processed shapes keyed by the string form of the shape id.
shapes: HashMap<String, ShapeInfo>,
/// For each shape id, the target shapes it references (one entry per
/// reference, duplicates are kept).
dependency_graph: HashMap<String, Vec<String>>,
/// Model produced by the most recent successful `load_*` call, if any.
unified_model: Option<UnifiedConstraintModel>,
/// Converter used by `load_shex_schema`.
shex_converter: ShExToUnified,
/// Converter used by `load_shacl_schema`.
shacl_converter: ShaclToUnified,
}
impl Default for ShapeProcessor {
/// Returns an empty processor; delegates to `ShapeProcessor::new`.
fn default() -> Self {
Self::new()
}
}
impl ShapeProcessor {
/// Creates a processor with no shapes, an empty dependency graph and no
/// unified model loaded.
pub fn new() -> Self {
    let shapes = HashMap::new();
    let dependency_graph = HashMap::new();
    Self {
        shapes,
        dependency_graph,
        unified_model: None,
        shex_converter: ShExToUnified,
        shacl_converter: ShaclToUnified,
    }
}
/// Parses the ShEx file at `shex_path` and rebuilds the processor's shape
/// table and dependency graph from its shape declarations.
///
/// Parsing is handed to `tokio::task::spawn_blocking` so it does not run on
/// the async executor thread. The parsed declarations are returned to the
/// caller after they have been processed.
///
/// # Errors
///
/// Returns `DataGeneratorError::ShexParsing` when the file cannot be parsed
/// or the schema contains no shapes. A failure of the blocking task itself
/// and any error from processing the shapes are propagated as well.
pub async fn extract_shapes<P: AsRef<Path>>(&mut self, shex_path: P) -> Result<Vec<ShapeDecl>> {
    // The blocking closure must own its input, hence the `PathBuf` copy.
    let schema_file = shex_path.as_ref().to_path_buf();

    let parse_task = tokio::task::spawn_blocking(move || {
        let parsed = ShExParser::parse_buf(&schema_file, None).map_err(|e| {
            DataGeneratorError::ShexParsing(format!("Failed to parse ShEx: {e}"))
        })?;
        parsed.shapes().ok_or_else(|| {
            DataGeneratorError::ShexParsing("No shapes found in schema".to_string())
        })
    });

    // First `?` surfaces a failed join, second `?` the parser's own error.
    let parse_outcome = parse_task.await?;
    let declarations = parse_outcome?;

    self.process_shapes(&declarations).await?;
    Ok(declarations)
}
/// Replaces the stored shapes and dependency graph with data derived from
/// `shapes`.
///
/// Both tables are cleared before any shape is processed, so they stay
/// empty if processing fails. When two declarations share an id, the later
/// one overwrites the earlier one.
async fn process_shapes(&mut self, shapes: &[ShapeDecl]) -> Result<()> {
    self.shapes.clear();
    self.dependency_graph.clear();

    let pending = shapes.iter().map(|decl| self.process_single_shape(decl));
    let infos: Vec<ShapeInfo> = futures::future::try_join_all(pending).await?;

    for info in infos {
        let key = info.declaration.id.to_string();
        let targets = info
            .dependencies
            .iter()
            .map(|dependency| dependency.target_shape.clone())
            .collect::<Vec<String>>();
        self.dependency_graph.insert(key.clone(), targets);
        self.shapes.insert(key, info);
    }

    Ok(())
}
/// Builds the `ShapeInfo` for one declaration.
///
/// Only declarations whose shape expression is a plain `ShapeExpr::Shape`
/// with a triple expression contribute properties and dependencies; every
/// other form yields an info with both lists empty. The current
/// implementation always returns `Ok`.
async fn process_single_shape(&self, shape: &ShapeDecl) -> Result<ShapeInfo> {
    let mut info = ShapeInfo {
        declaration: shape.clone(),
        dependencies: Vec::new(),
        properties: Vec::new(),
    };

    let triple_expr = match &shape.shape_expr {
        ShapeExpr::Shape(plain_shape) => plain_shape.expression.as_ref(),
        _ => None,
    };

    if let Some(wrapper) = triple_expr {
        Self::extract_dependencies_and_properties(
            &wrapper.te,
            &mut info.dependencies,
            &mut info.properties,
        );
    }

    Ok(info)
}
/// Walks `expr` and appends what it finds to `dependencies` and `properties`.
///
/// * `EachOf` / `OneOf` are traversed recursively.
/// * Every `TripleConstraint` adds one `PropertyInfo`; when its value
///   expression is a shape reference it also adds one `ShapeDependency`.
/// * `TripleExprRef` is ignored.
///
/// A constraint that declares neither `min` nor `max` is recorded with
/// cardinality `1..1`; otherwise the declared bounds are copied unchanged.
/// Literal-valued properties without a declared datatype (including value
/// expressions that are neither a reference nor a node constraint, and
/// constraints with no value expression) are recorded as `xsd:string`.
fn extract_dependencies_and_properties(
    expr: &TripleExpr,
    dependencies: &mut Vec<ShapeDependency>,
    properties: &mut Vec<PropertyInfo>,
) {
    // Datatype recorded whenever the schema does not name one.
    const FALLBACK_DATATYPE: &str = "http://www.w3.org/2001/XMLSchema#string";

    match expr {
        TripleExpr::TripleExprRef(_) => {}
        TripleExpr::EachOf { expressions, .. } | TripleExpr::OneOf { expressions, .. } => {
            for child in expressions {
                Self::extract_dependencies_and_properties(&child.te, dependencies, properties);
            }
        }
        TripleExpr::TripleConstraint {
            predicate,
            value_expr,
            min,
            max,
            ..
        } => {
            let property_iri = predicate.to_string();
            let (min_cardinality, max_cardinality) = match (*min, *max) {
                (None, None) => (Some(1), Some(1)),
                declared => declared,
            };

            // Decide what the property points at; a shape reference also
            // registers a dependency on the referenced shape.
            let (datatype, shape_ref) = match value_expr.as_deref() {
                Some(ShapeExpr::Ref(ref_to)) => {
                    let target_shape = ref_to.to_string();
                    dependencies.push(ShapeDependency {
                        target_shape: target_shape.clone(),
                        property: property_iri.clone(),
                        min_cardinality,
                        max_cardinality,
                    });
                    (None, Some(target_shape))
                }
                Some(ShapeExpr::NodeConstraint(node_constraint)) => {
                    let declared = node_constraint.datatype().map(|dt| dt.to_string());
                    (
                        Some(declared.unwrap_or_else(|| FALLBACK_DATATYPE.to_string())),
                        None,
                    )
                }
                _ => (Some(FALLBACK_DATATYPE.to_string()), None),
            };

            properties.push(PropertyInfo {
                property_iri,
                datatype,
                shape_ref,
                min_cardinality,
                max_cardinality,
                constraints: vec![],
            });
        }
    }
}
/// Returns all processed shapes, keyed by the string form of their id.
pub fn get_shapes(&self) -> &HashMap<String, ShapeInfo> {
&self.shapes
}
/// Returns the dependency graph: for each shape id, the target shapes that
/// shape references.
pub fn get_dependency_graph(&self) -> &HashMap<String, Vec<String>> {
&self.dependency_graph
}
/// Returns the shapes of the dependency graph in topological order.
///
/// In the returned list a shape appears before the shapes it references.
///
/// # Errors
///
/// Returns `DataGeneratorError::GraphGeneration` when the graph contains a
/// cycle, which includes a shape that references itself.
pub fn get_topological_order(&self) -> Result<Vec<String>> {
topological_sort(&self.dependency_graph)
}
/// Looks up a processed shape by the string form of its id; `None` when no
/// such shape has been processed.
pub fn get_shape(&self, shape_id: &str) -> Option<&ShapeInfo> {
self.shapes.get(shape_id)
}
/// Converts the ShEx file at `path` with the ShEx converter and stores the
/// result as the current unified model.
///
/// # Errors
///
/// Propagates the converter's error; the previously stored model is left
/// untouched in that case.
pub async fn load_shex_schema<P: AsRef<Path>>(&mut self, path: P) -> Result<()> {
    // `?` returns before the assignment happens, so a failed conversion
    // never overwrites the existing model.
    self.unified_model = Some(self.shex_converter.convert_file(path).await?);
    Ok(())
}
/// Converts the SHACL file at `path` with the SHACL converter and stores
/// the result as the current unified model.
///
/// # Errors
///
/// Propagates the converter's error; the previously stored model is left
/// untouched in that case.
pub async fn load_shacl_schema<P: AsRef<Path>>(&mut self, path: P) -> Result<()> {
    // `?` returns before the assignment happens, so a failed conversion
    // never overwrites the existing model.
    self.unified_model = Some(self.shacl_converter.convert_file(path).await?);
    Ok(())
}
/// Loads a schema, choosing the loader from the file name.
///
/// Paths ending in `.ttl`, `.rdf` or `.nt` are loaded as SHACL; everything
/// else (including `.shex`) is loaded as ShEx. The suffix comparison is
/// ASCII case-insensitive, so `SCHEMA.TTL` is handled like `schema.ttl`.
///
/// # Errors
///
/// Propagates the error of whichever loader was selected.
pub async fn load_schema_auto<P: AsRef<Path>>(&mut self, path: P) -> Result<()> {
    // Suffixes that select the SHACL loader.
    const SHACL_SUFFIXES: [&str; 3] = [".ttl", ".rdf", ".nt"];

    // Lower-case once so upper- or mixed-case extensions are recognised.
    // The result is an owned `String`, so `path` can be moved below.
    let lowered = path.as_ref().to_string_lossy().to_ascii_lowercase();
    let is_shacl = SHACL_SUFFIXES
        .iter()
        .any(|suffix| lowered.ends_with(*suffix));

    if is_shacl {
        self.load_shacl_schema(path).await
    } else {
        // ShEx is the default for `.shex` and for unknown extensions.
        self.load_shex_schema(path).await
    }
}
/// Returns the unified model stored by the last successful `load_*` call,
/// or `None` when no schema has been loaded yet.
pub fn get_unified_model(&self) -> Option<&UnifiedConstraintModel> {
self.unified_model.as_ref()
}
}
/// Orders the nodes of `graph` so that every node precedes the nodes it
/// depends on.
///
/// `graph` maps a node to the list of nodes it depends on. Nodes that only
/// appear as a dependency (no key of their own) are included in the result
/// and treated as having no dependencies.
///
/// The result is deterministic: roots are visited in sorted key order and
/// dependencies in their stored order, so the same graph always yields the
/// same sequence regardless of `HashMap` iteration order.
///
/// # Errors
///
/// Returns `DataGeneratorError::GraphGeneration` when a cycle is found,
/// which includes a node that lists itself as a dependency.
fn topological_sort(graph: &HashMap<String, Vec<String>>) -> Result<Vec<String>> {
    use std::collections::HashSet;

    /// Depth-first visit that appends `node` to `order` after all of its
    /// dependencies. `in_progress` holds the current DFS path and is what
    /// detects cycles; `done` holds fully processed nodes.
    fn visit<'g>(
        node: &'g str,
        graph: &'g HashMap<String, Vec<String>>,
        done: &mut HashSet<&'g str>,
        in_progress: &mut HashSet<&'g str>,
        order: &mut Vec<String>,
    ) -> Result<()> {
        if in_progress.contains(node) {
            return Err(DataGeneratorError::GraphGeneration(format!(
                "Circular dependency detected involving shape: {node}"
            )));
        }
        if done.contains(node) {
            return Ok(());
        }

        in_progress.insert(node);
        if let Some(dependencies) = graph.get(node) {
            for dep in dependencies {
                visit(dep, graph, done, in_progress, order)?;
            }
        }
        in_progress.remove(node);

        done.insert(node);
        order.push(node.to_string());
        Ok(())
    }

    let mut order = Vec::with_capacity(graph.len());
    // The sets borrow node names from `graph` instead of allocating copies.
    let mut done: HashSet<&str> = HashSet::with_capacity(graph.len());
    let mut in_progress: HashSet<&str> = HashSet::new();

    // Sort the roots: `HashMap` key order varies between runs, which would
    // otherwise leak into the result whenever several orders are valid.
    let mut roots: Vec<&str> = graph.keys().map(String::as_str).collect();
    roots.sort_unstable();

    for root in roots {
        if !done.contains(root) {
            visit(root, graph, &mut done, &mut in_progress, &mut order)?;
        }
    }

    // Post-order lists dependencies first; reversing puts dependents first.
    order.reverse();
    Ok(order)
}