1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
use crate::Result;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::path::{Path, PathBuf};
/// Main configuration structure for the data generator.
///
/// Groups the four configuration sections below. The type derives serde's
/// `Serialize` / `Deserialize` without any rename attributes, so the field
/// names are also the keys used in configuration files.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GeneratorConfig {
/// Generation parameters: entity count, seed, distribution and cardinality strategy
pub generation: GenerationConfig,
/// Field value generator settings: defaults plus per-datatype and per-property entries
pub field_generators: FieldGeneratorConfig,
/// Output destination, format and writing options
pub output: OutputConfig,
/// Worker-thread, batching and parallel-processing settings
pub parallel: ParallelConfig,
}
/// Configuration for data generation parameters.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GenerationConfig {
/// Number of entities to generate; `GeneratorConfig::validate` rejects 0
pub entity_count: usize,
/// Random seed for reproducible generation
// NOTE(review): `None` presumably means "pick a non-deterministic seed" — confirm in the generator.
pub seed: Option<u64>,
/// Distribution strategy for entities across shapes
pub entity_distribution: EntityDistribution,
/// Cardinality generation strategy
pub cardinality_strategy: CardinalityStrategy,
/// Schema format specification; `None` (the default) means the format is auto-detected
pub schema_format: Option<SchemaFormat>,
}
/// Schema format for the generator.
///
/// Used through `GenerationConfig::schema_format`, where `None` stands for
/// auto-detection. The variant names are also the serialized values.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq)]
pub enum SchemaFormat {
/// Schema written in ShEx (Shape Expressions)
ShEx,
/// Schema written in SHACL (Shapes Constraint Language)
SHACL,
}
/// How to distribute entities across different shapes.
///
/// `GeneratorConfig::default` selects `Equal`.
// NOTE(review): the map keys are presumably shape identifiers — confirm against the generator.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum EntityDistribution {
/// Equal distribution across all shapes
Equal,
/// Weighted distribution based on shape importance.
/// `GeneratorConfig::validate` requires the weights to sum to a positive value.
Weighted(HashMap<String, f64>),
/// Custom distribution with explicit counts per shape
Custom(HashMap<String, usize>),
}
/// Strategy for handling cardinalities in relationships.
///
/// `GeneratorConfig::default` selects `Balanced`.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq)]
pub enum CardinalityStrategy {
/// Use minimum cardinalities
Minimum,
/// Use maximum cardinalities (with reasonable bounds)
// NOTE(review): the actual upper bound is not defined in this part of the file — see the generator.
Maximum,
/// Random within cardinality bounds
Random,
/// Balanced approach favoring realistic distributions
Balanced,
}
/// Configuration for field value generators.
///
/// Both maps are empty in `GeneratorConfig::default`.
// NOTE(review): map keys are presumably datatype / property identifiers (e.g. IRIs) — confirm against callers.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FieldGeneratorConfig {
/// Default generator settings
pub default: DefaultFieldConfig,
/// Per-datatype specific configurations
pub datatypes: HashMap<String, DatatypeConfig>,
/// Per-property specific configurations
pub properties: HashMap<String, PropertyConfig>,
}
/// Default settings for field value generation.
///
/// `GeneratorConfig::default` uses locale `"en"` and `DataQuality::Medium`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DefaultFieldConfig {
/// Locale for text generation (e.g., "en", "es", "fr")
pub locale: String,
/// Quality level for generated data (low, medium, high)
pub quality: DataQuality,
}
/// Quality level requested for generated data.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq)]
pub enum DataQuality {
/// Simple random data
Low,
/// Realistic patterns
Medium,
/// Complex realistic data with correlations
High,
}
/// Generator selection and parameters for one datatype.
///
/// Stored as the values of `FieldGeneratorConfig::datatypes`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatatypeConfig {
/// Generator type to use for this datatype
// NOTE(review): the set of valid generator names is not defined here — confirm against the generator registry.
pub generator: String,
/// Additional parameters for the generator, as free-form JSON values keyed by parameter name
pub parameters: HashMap<String, serde_json::Value>,
}
/// Generator selection and parameters for one property.
///
/// Stored as the values of `FieldGeneratorConfig::properties`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PropertyConfig {
/// Generator type to use for this property
// NOTE(review): the set of valid generator names is not defined here — confirm against the generator registry.
pub generator: String,
/// Additional parameters for the generator, as free-form JSON values keyed by parameter name
pub parameters: HashMap<String, serde_json::Value>,
/// Value templates or patterns; `None` when no templates are configured
pub templates: Option<Vec<String>>,
}
/// Output configuration.
///
/// `GeneratorConfig::default` writes uncompressed Turtle to `output.ttl`,
/// with statistics enabled and parallel writing disabled.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OutputConfig {
/// Output file path
pub path: PathBuf,
/// Output format; only the variants of `OutputFormat` (Turtle and NTriples) are available
pub format: OutputFormat,
/// Whether to compress output
pub compress: bool,
/// Write statistics file
pub write_stats: bool,
/// Enable parallel writing to multiple files
pub parallel_writing: bool,
/// Number of parallel output files (when parallel_writing is true)
/// If set to 0, the system will automatically determine the optimal count
/// (see `OutputConfig::get_optimal_file_count`)
pub parallel_file_count: usize,
}
/// Serialization format of the generated output.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq)]
pub enum OutputFormat {
/// Turtle output (the default; the default output path is `output.ttl`)
Turtle,
/// N-Triples output
NTriples,
// NOTE: Only Turtle and NTriples are supported.
// JsonLd and RdfXml were removed to avoid serialization issues.
}
/// Parallelization configuration.
///
/// `GeneratorConfig::default` uses auto-detected threads, a batch size of
/// 100, and enables both parallel shape and parallel field processing.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ParallelConfig {
/// Number of worker threads (None = auto-detect)
pub worker_threads: Option<usize>,
/// Batch size for parallel processing; `GeneratorConfig::validate` rejects 0
pub batch_size: usize,
/// Enable parallel shape processing
pub parallel_shapes: bool,
/// Enable parallel field generation
pub parallel_fields: bool,
}
impl Default for GeneratorConfig {
fn default() -> Self {
Self {
generation: GenerationConfig {
entity_count: 1000,
seed: None,
entity_distribution: EntityDistribution::Equal,
cardinality_strategy: CardinalityStrategy::Balanced,
schema_format: None, // Auto-detect
},
field_generators: FieldGeneratorConfig {
default: DefaultFieldConfig {
locale: "en".to_string(),
quality: DataQuality::Medium,
},
datatypes: HashMap::new(),
properties: HashMap::new(),
},
output: OutputConfig {
path: PathBuf::from("output.ttl"),
format: OutputFormat::Turtle,
compress: false,
write_stats: true,
parallel_writing: false,
parallel_file_count: 0, // 0 means auto-detect optimal count
},
parallel: ParallelConfig {
worker_threads: None,
batch_size: 100,
parallel_shapes: true,
parallel_fields: true,
},
}
}
}
impl OutputConfig {
    /// Calculate the number of output files to write, based on the
    /// configuration, the dataset size and the available CPU parallelism.
    ///
    /// Resolution order:
    /// 1. If `parallel_writing` is disabled the result is always `1`;
    ///    `parallel_file_count` only applies when parallel writing is on.
    /// 2. If `parallel_file_count` is greater than 0, that explicit value is
    ///    returned unchanged.
    /// 3. Otherwise the count is derived from `total_triples` and the number
    ///    of CPU cores (falling back to 4 when it cannot be detected), and
    ///    the decision is logged at `info` level.
    ///
    /// The returned value is always at least 1.
    pub fn get_optimal_file_count(&self, total_triples: usize) -> usize {
        // Parallel writing disabled: a single file, whatever count is configured.
        // This must be checked before the explicit count, otherwise a leftover
        // `parallel_file_count` would request several files for a sequential run.
        if !self.parallel_writing {
            return 1;
        }

        // If the user explicitly set a count, use it
        if self.parallel_file_count > 0 {
            return self.parallel_file_count;
        }

        // Detect CPU cores (with fallback to 4)
        let cpu_count = std::thread::available_parallelism()
            .map(|n| n.get())
            .unwrap_or(4);

        // Calculate optimal file count based on dataset size
        let optimal_count = match total_triples {
            0..=1000 => 1,                          // Small datasets: single file
            1001..=5000 => cpu_count.min(4),        // Small-medium: up to 4 files
            5001..=50000 => (cpu_count * 2).min(8), // Medium: up to 2x CPU cores, max 8
            _ => (cpu_count * 2).min(16),           // Large: up to 2x CPU cores, max 16
        };

        tracing::info!(
            "Auto-detected optimal parallel file count: {} (CPU cores: {}, triples: {})",
            optimal_count,
            cpu_count,
            total_triples
        );

        optimal_count
    }
}
impl GeneratorConfig {
    /// Load configuration from a TOML file.
    ///
    /// The loaded configuration is not validated; call
    /// [`GeneratorConfig::validate`] afterwards.
    ///
    /// # Errors
    ///
    /// Returns an error if the file cannot be read or if its contents cannot
    /// be deserialized into a `GeneratorConfig`.
    pub fn from_toml_file<P: AsRef<Path>>(path: P) -> Result<Self> {
        let content = std::fs::read_to_string(path)?;
        let config: Self = toml::from_str(&content)?;
        Ok(config)
    }

    /// Load configuration from a JSON file.
    ///
    /// The loaded configuration is not validated; call
    /// [`GeneratorConfig::validate`] afterwards.
    ///
    /// # Errors
    ///
    /// Returns an error if the file cannot be read or if its contents cannot
    /// be deserialized into a `GeneratorConfig`.
    pub fn from_json_file<P: AsRef<Path>>(path: P) -> Result<Self> {
        let content = std::fs::read_to_string(path)?;
        let config: Self = serde_json::from_str(&content)?;
        Ok(config)
    }

    /// Save configuration to a TOML file, pretty-printed.
    ///
    /// # Errors
    ///
    /// Returns a `Config` error carrying the serializer's message if the
    /// configuration cannot be serialized, or an error if the file cannot be
    /// written.
    pub fn to_toml_file<P: AsRef<Path>>(&self, path: P) -> Result<()> {
        let content = toml::to_string_pretty(self)
            .map_err(|e| crate::DataGeneratorError::Config(e.to_string()))?;
        std::fs::write(path, content)?;
        Ok(())
    }

    /// Merge with command-line overrides.
    ///
    /// Each argument that is `Some` replaces the corresponding configured
    /// value; `None` leaves the existing value untouched (in particular, a
    /// `None` seed does not clear a seed that is already configured).
    pub fn merge_cli_overrides(
        &mut self,
        entity_count: Option<usize>,
        output_path: Option<PathBuf>,
        seed: Option<u64>,
    ) {
        if let Some(count) = entity_count {
            self.generation.entity_count = count;
        }
        if let Some(path) = output_path {
            self.output.path = path;
        }
        if seed.is_some() {
            self.generation.seed = seed;
        }
    }

    /// Validate configuration.
    ///
    /// # Errors
    ///
    /// Returns a `Config` error when:
    /// - `generation.entity_count` is 0;
    /// - `parallel.batch_size` is 0;
    /// - the entity distribution is `Weighted` and any weight is NaN,
    ///   infinite or negative, or the weights do not sum to a finite,
    ///   positive value (an empty weight map sums to 0 and is rejected).
    pub fn validate(&self) -> Result<()> {
        if self.generation.entity_count == 0 {
            return Err(crate::DataGeneratorError::Config(
                "entity_count must be greater than 0".to_string(),
            ));
        }

        if self.parallel.batch_size == 0 {
            return Err(crate::DataGeneratorError::Config(
                "batch_size must be greater than 0".to_string(),
            ));
        }

        if let EntityDistribution::Weighted(ref weights) = self.generation.entity_distribution {
            // Check every weight on its own: a NaN would make the sum NaN (and
            // `NaN <= 0.0` is false, so it would slip through the total check),
            // and a negative weight can hide behind a positive total.
            let invalid = weights
                .iter()
                .find(|(_, weight)| !weight.is_finite() || **weight < 0.0);
            if let Some((shape, weight)) = invalid {
                return Err(crate::DataGeneratorError::Config(format!(
                    "Weighted distribution weight for '{}' must be a finite, non-negative number (got {})",
                    shape, weight
                )));
            }

            // The total can still overflow to infinity for huge finite weights.
            let total: f64 = weights.values().sum();
            if !total.is_finite() || total <= 0.0 {
                return Err(crate::DataGeneratorError::Config(
                    "Weighted distribution weights must sum to a positive value".to_string(),
                ));
            }
        }

        Ok(())
    }
}