use crate::Result;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::path::{Path, PathBuf};
/// Root configuration object, composed of four sections.
///
/// It can be loaded with `from_toml_file` / `from_json_file` and written back
/// with `to_toml_file`. None of the sections carries `#[serde(default)]`, so
/// all four must be present in a deserialized document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GeneratorConfig {
/// Entity-level generation settings (count, seed, distribution, property options).
pub generation: GenerationConfig,
/// Default, per-datatype and per-property field generator settings.
pub field_generators: FieldGeneratorConfig,
/// Output path, format and writer options.
pub output: OutputConfig,
/// Worker-thread, batch-size and parallelism switches.
pub parallel: ParallelConfig,
}
/// Settings stored under `GeneratorConfig::generation`.
///
/// The first five fields have no serde default and must be present in a
/// deserialized document (`seed` and `schema_format` are `Option`s, which
/// serde treats as `None` when the key is absent). The remaining fields fall
/// back to the defaults noted on each of them.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GenerationConfig {
/// Entity count. `validate` rejects `0`; `GeneratorConfig::default()` uses `1000`.
/// Can be replaced through `merge_cli_overrides`.
pub entity_count: usize,
/// Optional seed; `None` in `GeneratorConfig::default()`. Can be set through
/// `merge_cli_overrides`.
/// NOTE(review): presumably seeds the random generator for reproducible runs — confirm.
pub seed: Option<u64>,
/// How entities are distributed; see `EntityDistribution`.
pub entity_distribution: EntityDistribution,
/// Cardinality strategy; see `CardinalityStrategy`.
pub cardinality_strategy: CardinalityStrategy,
/// Optional schema format; `None` in `GeneratorConfig::default()`.
/// NOTE(review): the meaning of `None` is decided by the consumer — confirm.
pub schema_format: Option<SchemaFormat>,
/// Defaults to `1.0` (via `default_property_fill_probability`) when the key
/// is absent. `validate` rejects values below `0.0` or above `1.0`.
#[serde(default = "default_property_fill_probability")]
pub property_fill_probability: f64,
/// Defaults to `false` when the key is absent.
#[serde(default)]
pub ignore_min_cardinality: bool,
/// Defaults to `0` when the key is absent.
/// NOTE(review): `0` presumably means "no limit" — confirm against the consumer.
#[serde(default)]
pub max_properties_per_instance: usize,
/// Defaults to `PropertySelectionStrategy::All` when the key is absent.
#[serde(default)]
pub property_selection_strategy: PropertySelectionStrategy,
/// Defaults to `0.0` when the key is absent. `validate` rejects values below
/// `0.0` or above `1.0`.
#[serde(default)]
pub property_count_variance: f64,
/// Defaults to an empty list when the key is absent.
#[serde(default)]
pub excluded_properties: Vec<String>,
/// Overrides keyed by a string; defaults to an empty map when the key is absent.
/// NOTE(review): the key is presumably a type/shape name — confirm.
#[serde(default)]
pub type_overrides: HashMap<String, TypeOverrideConfig>,
}
/// Serde fallback for `GenerationConfig::property_fill_probability`.
///
/// Referenced by name from the field's `#[serde(default = "...")]` attribute,
/// so it is used whenever the key is missing from a deserialized document.
fn default_property_fill_probability() -> f64 {
    const FALLBACK: f64 = 1.0;
    FALLBACK
}
/// Named strategies for property selection.
///
/// `All` is the `Default` variant (via `#[default]`), so it is what a
/// `#[serde(default)]` field of this type falls back to.
/// NOTE(review): what each variant does is implemented outside this section — confirm there.
#[derive(Default, Debug, Clone, Copy, Serialize, Deserialize, PartialEq)]
pub enum PropertySelectionStrategy {
/// The default variant.
#[default]
All,
Random,
Weighted,
}
/// Value type of `GenerationConfig::type_overrides`.
///
/// Every field is an `Option` carrying the same name and inner type as a
/// field of `GenerationConfig`; serde leaves an absent key as `None`.
/// NOTE(review): `None` presumably means "use the global value" — confirm
/// against the code that applies the overrides.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TypeOverrideConfig {
/// Optional counterpart of `GenerationConfig::property_fill_probability`.
pub property_fill_probability: Option<f64>,
/// Optional counterpart of `GenerationConfig::ignore_min_cardinality`.
pub ignore_min_cardinality: Option<bool>,
/// Optional counterpart of `GenerationConfig::max_properties_per_instance`.
pub max_properties_per_instance: Option<usize>,
/// Optional counterpart of `GenerationConfig::property_selection_strategy`.
pub property_selection_strategy: Option<PropertySelectionStrategy>,
/// Optional counterpart of `GenerationConfig::property_count_variance`.
pub property_count_variance: Option<f64>,
/// Optional counterpart of `GenerationConfig::excluded_properties`.
pub excluded_properties: Option<Vec<String>>,
}
/// Schema formats that `GenerationConfig::schema_format` can name.
///
/// No serde rename is applied, so the variants are (de)serialized under
/// their Rust names, `ShEx` and `Shacl`.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq)]
pub enum SchemaFormat {
ShEx,
Shacl,
}
/// Ways of distributing entities, as selected by
/// `GenerationConfig::entity_distribution`.
///
/// NOTE(review): the map keys are presumably shape/type names — confirm
/// against the consumer.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum EntityDistribution {
/// No parameters; used by `GeneratorConfig::default()`.
Equal,
/// Floating-point weight per key. `validate` requires the weights to sum to
/// a positive value.
Weighted(HashMap<String, f64>),
/// Integer value per key.
/// NOTE(review): presumably an absolute entity count per key — confirm.
Custom(HashMap<String, usize>),
}
/// Named cardinality strategies, as selected by
/// `GenerationConfig::cardinality_strategy`.
///
/// `GeneratorConfig::default()` uses `Balanced`.
/// NOTE(review): what each variant does is implemented outside this section — confirm there.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq)]
pub enum CardinalityStrategy {
Minimum,
Maximum,
Random,
Balanced,
}
/// Settings stored under `GeneratorConfig::field_generators`.
///
/// `default` is required when deserializing; the two maps fall back to empty
/// maps when their keys are absent.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FieldGeneratorConfig {
/// Baseline locale and quality settings.
pub default: DefaultFieldConfig,
/// Generator settings keyed by a string.
/// NOTE(review): the key is presumably a datatype identifier — confirm.
#[serde(default)]
pub datatypes: HashMap<String, DatatypeConfig>,
/// Generator settings keyed by a string.
/// NOTE(review): the key is presumably a property identifier — confirm.
#[serde(default)]
pub properties: HashMap<String, PropertyConfig>,
}
/// Baseline field-generation settings (`FieldGeneratorConfig::default`).
///
/// Both fields are required when deserializing.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DefaultFieldConfig {
/// Locale string; `"en"` in `GeneratorConfig::default()`.
pub locale: String,
/// Data quality level; `DataQuality::Medium` in `GeneratorConfig::default()`.
pub quality: DataQuality,
}
/// Three-level quality setting used by `DefaultFieldConfig::quality`.
///
/// `GeneratorConfig::default()` uses `Medium`.
/// NOTE(review): what each level changes in the generated data is decided
/// outside this section — confirm there.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq)]
pub enum DataQuality {
Low, Medium, High, }
/// Value type of `FieldGeneratorConfig::datatypes`.
///
/// Neither field has a serde default, so both must be present when
/// deserializing.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatatypeConfig {
/// Generator identifier, stored as a free-form string.
pub generator: String,
/// Arbitrary JSON values keyed by parameter name.
/// NOTE(review): the accepted keys presumably depend on the chosen generator — confirm.
pub parameters: HashMap<String, serde_json::Value>,
}
/// Value type of `FieldGeneratorConfig::properties`.
///
/// `generator` and `parameters` must be present when deserializing;
/// `templates` is an `Option` and is `None` when absent.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PropertyConfig {
/// Generator identifier, stored as a free-form string.
pub generator: String,
/// Arbitrary JSON values keyed by parameter name.
/// NOTE(review): the accepted keys presumably depend on the chosen generator — confirm.
pub parameters: HashMap<String, serde_json::Value>,
/// Optional list of template strings.
pub templates: Option<Vec<String>>,
}
/// Settings stored under `GeneratorConfig::output`.
///
/// No field has a serde default, so all of them must be present when
/// deserializing.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OutputConfig {
/// Output path; `output.ttl` in `GeneratorConfig::default()`. Can be replaced
/// through `merge_cli_overrides`.
pub path: PathBuf,
/// Output serialization; `OutputFormat::Turtle` in `GeneratorConfig::default()`.
pub format: OutputFormat,
/// `false` in `GeneratorConfig::default()`.
pub compress: bool,
/// `true` in `GeneratorConfig::default()`.
pub write_stats: bool,
/// When `false` and `parallel_file_count` is `0`, `get_optimal_file_count`
/// returns `1`. `false` in `GeneratorConfig::default()`.
pub parallel_writing: bool,
/// Explicit file count: any value above `0` is returned unchanged by
/// `get_optimal_file_count`; `0` (the `GeneratorConfig::default()` value)
/// lets that method pick a count itself.
pub parallel_file_count: usize,
}
/// Output serializations that `OutputConfig::format` can name.
///
/// No serde rename is applied, so the variants are (de)serialized under
/// their Rust names, `Turtle` and `NTriples`.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq)]
pub enum OutputFormat {
Turtle,
NTriples,
}
/// Settings stored under `GeneratorConfig::parallel`.
///
/// `worker_threads` is an `Option` and is `None` when absent; the other
/// fields must be present when deserializing.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ParallelConfig {
/// Optional thread count; `None` in `GeneratorConfig::default()`.
/// NOTE(review): `None` presumably lets the runtime pick a count — confirm.
pub worker_threads: Option<usize>,
/// Batch size; `validate` rejects `0`. `100` in `GeneratorConfig::default()`.
pub batch_size: usize,
/// `true` in `GeneratorConfig::default()`.
pub parallel_shapes: bool,
/// `true` in `GeneratorConfig::default()`.
pub parallel_fields: bool,
}
impl Default for GeneratorConfig {
    /// Builds the baseline configuration.
    ///
    /// Each section is assembled separately and then combined: 1000 entities
    /// with an equal distribution and balanced cardinality, the `"en"` locale
    /// at medium quality, a single uncompressed Turtle file at `output.ttl`
    /// with statistics enabled, and batches of 100 with both parallel
    /// switches turned on.
    fn default() -> Self {
        let generation = GenerationConfig {
            entity_count: 1000,
            seed: None,
            entity_distribution: EntityDistribution::Equal,
            cardinality_strategy: CardinalityStrategy::Balanced,
            schema_format: None,
            property_fill_probability: 1.0,
            ignore_min_cardinality: false,
            max_properties_per_instance: 0,
            property_selection_strategy: PropertySelectionStrategy::All,
            property_count_variance: 0.0,
            excluded_properties: Vec::new(),
            type_overrides: HashMap::new(),
        };

        let field_generators = FieldGeneratorConfig {
            default: DefaultFieldConfig {
                locale: String::from("en"),
                quality: DataQuality::Medium,
            },
            datatypes: HashMap::new(),
            properties: HashMap::new(),
        };

        let output = OutputConfig {
            path: PathBuf::from("output.ttl"),
            format: OutputFormat::Turtle,
            compress: false,
            write_stats: true,
            parallel_writing: false,
            // 0 leaves the file count to `get_optimal_file_count`.
            parallel_file_count: 0,
        };

        let parallel = ParallelConfig {
            worker_threads: None,
            batch_size: 100,
            parallel_shapes: true,
            parallel_fields: true,
        };

        Self {
            generation,
            field_generators,
            output,
            parallel,
        }
    }
}
impl OutputConfig {
    /// Decides how many output files to write for `total_triples` triples.
    ///
    /// * A non-zero `parallel_file_count` is returned unchanged.
    /// * Otherwise, with `parallel_writing` disabled, the answer is `1`.
    /// * Otherwise the count is derived from the available CPU parallelism
    ///   (assumed to be 4 when it cannot be queried) and the triple count:
    ///   up to 1000 triples use one file, up to 5000 use at most 4, up to
    ///   50000 use at most 8, and anything larger uses at most 16.
    ///
    /// The auto-detected result is logged at `info` level.
    pub fn get_optimal_file_count(&self, total_triples: usize) -> usize {
        match self.parallel_file_count {
            0 if !self.parallel_writing => return 1,
            0 => {}
            explicit => return explicit,
        }

        let cores = std::thread::available_parallelism().map_or(4, |n| n.get());

        let file_count = if total_triples <= 1_000 {
            1
        } else if total_triples <= 5_000 {
            cores.min(4)
        } else if total_triples <= 50_000 {
            (cores * 2).min(8)
        } else {
            (cores * 2).min(16)
        };

        tracing::info!(
            "Auto-detected optimal parallel file count: {} (CPU cores: {}, triples: {})",
            file_count,
            cores,
            total_triples
        );

        file_count
    }
}
impl GeneratorConfig {
    /// Reads the file at `path` and deserializes it as TOML.
    ///
    /// The result is not validated here; call [`GeneratorConfig::validate`]
    /// before relying on it.
    ///
    /// # Errors
    /// Propagates the I/O error if the file cannot be read, or the TOML error
    /// if the content does not describe a `GeneratorConfig`.
    pub fn from_toml_file<P: AsRef<Path>>(path: P) -> Result<Self> {
        let content = std::fs::read_to_string(path)?;
        let config: Self = toml::from_str(&content)?;
        Ok(config)
    }

    /// Reads the file at `path` and deserializes it as JSON.
    ///
    /// The result is not validated here; call [`GeneratorConfig::validate`]
    /// before relying on it.
    ///
    /// # Errors
    /// Propagates the I/O error if the file cannot be read, or the JSON error
    /// if the content does not describe a `GeneratorConfig`.
    pub fn from_json_file<P: AsRef<Path>>(path: P) -> Result<Self> {
        let content = std::fs::read_to_string(path)?;
        let config: Self = serde_json::from_str(&content)?;
        Ok(config)
    }

    /// Serializes this configuration as pretty-printed TOML and writes it to
    /// `path`.
    ///
    /// # Errors
    /// Returns `DataGeneratorError::Config` carrying the serializer's message
    /// if serialization fails, or propagates the I/O error if the write fails.
    pub fn to_toml_file<P: AsRef<Path>>(&self, path: P) -> Result<()> {
        let content = toml::to_string_pretty(self)
            .map_err(|e| crate::DataGeneratorError::Config(e.to_string()))?;
        std::fs::write(path, content)?;
        Ok(())
    }

    /// Applies command-line overrides on top of the loaded configuration.
    ///
    /// Each argument that is `Some` replaces the corresponding setting
    /// (`generation.entity_count`, `output.path`, `generation.seed`); `None`
    /// leaves the existing value untouched.
    pub fn merge_cli_overrides(
        &mut self,
        entity_count: Option<usize>,
        output_path: Option<PathBuf>,
        seed: Option<u64>,
    ) {
        if let Some(count) = entity_count {
            self.generation.entity_count = count;
        }
        if let Some(path) = output_path {
            self.output.path = path;
        }
        // Only overwrite when a seed was supplied, so a configured seed survives.
        if seed.is_some() {
            self.generation.seed = seed;
        }
    }

    /// Checks the configuration for values that cannot be used.
    ///
    /// Checks, in order:
    /// 1. `generation.entity_count` is non-zero.
    /// 2. `parallel.batch_size` is non-zero.
    /// 3. For a `Weighted` distribution, every weight is finite and
    ///    non-negative and the weights sum to a positive, finite value.
    /// 4. `generation.property_fill_probability` and
    ///    `generation.property_count_variance` lie in `[0.0, 1.0]`.
    /// 5. The same two settings, when present in any `type_overrides` entry,
    ///    lie in `[0.0, 1.0]`.
    ///
    /// NaN is rejected by every floating-point check.
    ///
    /// # Errors
    /// Returns `DataGeneratorError::Config` describing the first failed check.
    /// When several `type_overrides` entries are invalid, which one is
    /// reported depends on the map's iteration order.
    pub fn validate(&self) -> Result<()> {
        // `RangeInclusive::contains` is false for NaN, so NaN is rejected; a
        // plain `< 0.0 || > 1.0` test would let it through because every
        // comparison with NaN is false.
        let ensure_unit_interval = |name: &str, value: f64| -> Result<()> {
            if (0.0..=1.0).contains(&value) {
                Ok(())
            } else {
                Err(crate::DataGeneratorError::Config(format!(
                    "{} must be between 0.0 and 1.0",
                    name
                )))
            }
        };

        let generation = &self.generation;

        if generation.entity_count == 0 {
            return Err(crate::DataGeneratorError::Config(
                "entity_count must be greater than 0".to_string(),
            ));
        }
        if self.parallel.batch_size == 0 {
            return Err(crate::DataGeneratorError::Config(
                "batch_size must be greater than 0".to_string(),
            ));
        }

        if let EntityDistribution::Weighted(ref weights) = generation.entity_distribution {
            // Checking only the sum would accept NaN weights and negative
            // weights that are offset by larger positive ones.
            if weights.values().any(|w| !w.is_finite() || *w < 0.0) {
                return Err(crate::DataGeneratorError::Config(
                    "Weighted distribution weights must be finite and non-negative".to_string(),
                ));
            }
            let total: f64 = weights.values().sum();
            if total <= 0.0 {
                return Err(crate::DataGeneratorError::Config(
                    "Weighted distribution weights must sum to a positive value".to_string(),
                ));
            }
            // Finite weights can still overflow to infinity when added up.
            if !total.is_finite() {
                return Err(crate::DataGeneratorError::Config(
                    "Weighted distribution weights must sum to a finite value".to_string(),
                ));
            }
        }

        ensure_unit_interval(
            "property_fill_probability",
            generation.property_fill_probability,
        )?;
        ensure_unit_interval(
            "property_count_variance",
            generation.property_count_variance,
        )?;

        // Overrides replace the global values per key, so they are held to
        // the same ranges.
        for (type_name, overrides) in &generation.type_overrides {
            if let Some(probability) = overrides.property_fill_probability {
                let field = format!("type_overrides.{}.property_fill_probability", type_name);
                ensure_unit_interval(field.as_str(), probability)?;
            }
            if let Some(variance) = overrides.property_count_variance {
                let field = format!("type_overrides.{}.property_count_variance", type_name);
                ensure_unit_interval(field.as_str(), variance)?;
            }
        }

        Ok(())
    }
}