// hedl_cli/batch/operations.rs
//
// Dweve HEDL - Hierarchical Entity Data Language
//
// Copyright (c) 2025 Dweve IP B.V. and individual contributors.
//
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License in the LICENSE file at the
// root of this repository or at: http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
18//! Standard and streaming batch operations.
19
20use super::traits::{BatchOperation, StreamingBatchOperation};
21use crate::error::CliError;
22use std::collections::HashSet;
23use std::path::Path;
24
25// ============================================================================
26// Standard Operations
27// ============================================================================
28
/// Batch validation operation.
///
/// Validates multiple HEDL files in parallel, checking syntax and optionally
/// enforcing strict reference resolution.
///
/// On success each file yields a [`ValidationStats`] with node, field, and
/// list counts plus the set of `type:id` identifiers encountered.
#[derive(Debug, Clone)]
pub struct ValidationOperation {
    /// Enable strict reference validation: parsing uses
    /// `ReferenceMode::Strict` so unresolved references fail the file.
    /// When false, `ReferenceMode::Lenient` is used (syntax-only checks).
    pub strict: bool,
}
38
39impl BatchOperation for ValidationOperation {
40 type Output = ValidationStats;
41
42 fn process_file(&self, path: &Path) -> Result<Self::Output, CliError> {
43 use hedl_core::{parse_with_limits, Item, Node, ParseOptions, ReferenceMode};
44
45 let content = std::fs::read_to_string(path).map_err(|e| CliError::io_error(path, e))?;
46
47 let options = ParseOptions {
48 reference_mode: if self.strict {
49 ReferenceMode::Strict
50 } else {
51 ReferenceMode::Lenient
52 },
53 ..ParseOptions::default()
54 };
55
56 let doc = parse_with_limits(content.as_bytes(), options)
57 .map_err(|e| CliError::parse(e.to_string()))?;
58
59 // Collect statistics from the parsed document
60 let mut stats = ValidationStats::new();
61
62 // Get version from document metadata
63 stats.version = format!("{}.{}", doc.version.0, doc.version.1);
64
65 // Recursive helper to count nodes
66 fn count_node(node: &Node, stats: &mut ValidationStats) {
67 stats.node_count += 1;
68 stats.field_count += node.fields.len();
69 let full_id = format!("{}:{}", node.type_name, node.id);
70 stats.seen_ids.insert(full_id);
71
72 // Count children recursively
73 if let Some(ref children) = node.children {
74 for child_nodes in children.values() {
75 for child in child_nodes {
76 count_node(child, stats);
77 }
78 }
79 }
80 }
81
82 // Recursive helper to traverse items
83 fn traverse_item(item: &Item, stats: &mut ValidationStats) {
84 match item {
85 Item::List(list) => {
86 stats.list_count += 1;
87 for node in &list.rows {
88 count_node(node, stats);
89 }
90 }
91 Item::Object(obj) => {
92 for child_item in obj.values() {
93 traverse_item(child_item, stats);
94 }
95 }
96 Item::Scalar(_) => {
97 // Scalars don't contribute to node counts
98 }
99 }
100 }
101
102 // Traverse all items in the document root
103 for item in doc.root.values() {
104 traverse_item(item, &mut stats);
105 }
106
107 Ok(stats)
108 }
109
110 fn name(&self) -> &'static str {
111 "validate"
112 }
113}
114
/// Batch format operation.
///
/// Formats multiple HEDL files to canonical form, optionally checking if files
/// are already canonical.
#[derive(Debug, Clone)]
pub struct FormatOperation {
    /// Only check if files are canonical (don't write); a file whose content
    /// differs from its canonical form yields `CliError::NotCanonical`.
    pub check: bool,
    /// Use ditto optimization when producing canonical output.
    pub ditto: bool,
    /// Add count hints to matrix lists before canonicalizing.
    pub with_counts: bool,
}
128
129impl BatchOperation for FormatOperation {
130 type Output = String;
131
132 fn process_file(&self, path: &Path) -> Result<Self::Output, CliError> {
133 use hedl_c14n::{canonicalize_with_config, CanonicalConfig};
134 use hedl_core::parse;
135
136 let content = std::fs::read_to_string(path).map_err(|e| CliError::io_error(path, e))?;
137
138 let mut doc = parse(content.as_bytes()).map_err(|e| CliError::parse(e.to_string()))?;
139
140 // Add count hints if requested
141 if self.with_counts {
142 add_count_hints(&mut doc);
143 }
144
145 let config = CanonicalConfig::new().with_ditto(self.ditto);
146
147 let canonical = canonicalize_with_config(&doc, &config)
148 .map_err(|e| CliError::canonicalization(e.to_string()))?;
149
150 if self.check && canonical != content {
151 return Err(CliError::NotCanonical);
152 }
153
154 Ok(canonical)
155 }
156
157 fn name(&self) -> &str {
158 if self.check {
159 "format-check"
160 } else {
161 "format"
162 }
163 }
164}
165
/// Batch lint operation.
///
/// Lints multiple HEDL files for best practices and common issues.
/// Each successfully linted file yields its diagnostic messages as strings.
#[derive(Debug, Clone)]
pub struct LintOperation {
    /// Treat warnings as errors: any diagnostic at all makes the file fail
    /// with `CliError::LintErrors` instead of returning the messages.
    pub warn_error: bool,
}
174
175impl BatchOperation for LintOperation {
176 type Output = Vec<String>;
177
178 fn process_file(&self, path: &Path) -> Result<Self::Output, CliError> {
179 use hedl_core::parse;
180 use hedl_lint::lint;
181
182 let content = std::fs::read_to_string(path).map_err(|e| CliError::io_error(path, e))?;
183
184 let doc = parse(content.as_bytes()).map_err(|e| CliError::parse(e.to_string()))?;
185
186 let diagnostics = lint(&doc);
187
188 if self.warn_error && !diagnostics.is_empty() {
189 return Err(CliError::LintErrors);
190 }
191
192 Ok(diagnostics
193 .iter()
194 .map(std::string::ToString::to_string)
195 .collect())
196 }
197
198 fn name(&self) -> &'static str {
199 "lint"
200 }
201}
202
203// ============================================================================
204// Streaming Operations
205// ============================================================================
206
/// Statistics collected during validation (standard or streaming).
///
/// Aggregates per-document counts — lists, nodes, and fields — together
/// with the set of `type:id` identifiers observed, which strict reference
/// validation uses to track seen entities.
#[derive(Debug, Clone, Default)]
pub struct ValidationStats {
    /// HEDL version string
    pub version: String,
    /// Number of lists encountered
    pub list_count: usize,
    /// Total number of nodes processed
    pub node_count: usize,
    /// Total number of fields across all nodes
    pub field_count: usize,
    /// Set of seen IDs for strict reference validation (type:id format)
    pub seen_ids: HashSet<String>,
}

impl ValidationStats {
    /// Create new empty validation statistics (all counts zero, no IDs).
    #[must_use]
    pub fn new() -> Self {
        ValidationStats::default()
    }
}
232
/// Streaming validation operation for memory-efficient validation of large files.
///
/// Uses the streaming parser from `hedl-stream` to validate files with O(1) memory
/// usage regardless of file size. Ideal for:
/// - Files larger than 100MB
/// - Validating thousands of files with limited RAM
/// - Container environments with memory limits
///
/// # Memory Profile
///
/// - **Input**: O(1) - buffer size only (~8KB)
/// - **Working**: `O(n_ids)` - seen ID set for strict validation
/// - **Output**: O(1) - small statistics struct
/// - **Peak**: ~8KB + ID set size (vs. full file size in standard mode)
///
/// # Examples
///
/// ```rust,no_run
/// use hedl_cli::batch::{BatchExecutor, StreamingValidationOperation};
/// use std::path::PathBuf;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let processor = BatchExecutor::default_config();
/// let files = vec![PathBuf::from("large-file.hedl")];
///
/// let operation = StreamingValidationOperation { strict: false };
/// let results = processor.process_streaming(&files, operation, true)?;
///
/// println!("Validated {} files with constant memory", results.success_count());
/// # Ok(())
/// # }
/// ```
#[derive(Debug, Clone)]
pub struct StreamingValidationOperation {
    /// Enable strict reference validation.
    ///
    /// NOTE(review): the streaming implementation currently records seen IDs
    /// regardless of this flag; full strict reference resolution is not yet
    /// implemented there — confirm against `process_file_streaming`.
    pub strict: bool,
}
270
271impl StreamingBatchOperation for StreamingValidationOperation {
272 type Output = ValidationStats;
273
274 fn process_file_streaming(&self, path: &Path) -> Result<Self::Output, CliError> {
275 use hedl_stream::{NodeEvent, StreamError, StreamingParser};
276 use std::fs::File;
277 use std::io::BufReader;
278
279 let file = File::open(path).map_err(|e| CliError::io_error(path, e))?;
280 let reader = BufReader::with_capacity(8192, file);
281
282 let parser = StreamingParser::new(reader)
283 .map_err(|e: StreamError| CliError::parse(e.to_string()))?;
284
285 let mut stats = ValidationStats::new();
286 let mut _current_type = String::new();
287
288 // Process events incrementally
289 for event in parser {
290 let event = event.map_err(|e: StreamError| CliError::parse(e.to_string()))?;
291
292 match event {
293 NodeEvent::Header(info) => {
294 // Validate version exists
295 let version_str = format!("{}.{}", info.version.0, info.version.1);
296 if version_str.is_empty() {
297 return Err(CliError::parse("Missing VERSION".to_string()));
298 }
299 stats.version = version_str;
300 }
301 NodeEvent::ListStart { type_name, .. } => {
302 stats.list_count += 1;
303 _current_type = type_name;
304 }
305 NodeEvent::Node(node) => {
306 stats.node_count += 1;
307 stats.field_count += node.fields.len();
308
309 // Track IDs for strict mode validation
310 let full_id = format!("{}:{}", node.type_name, node.id);
311
312 if self.strict {
313 // In strict mode, validate references
314 // For now, just track IDs - full reference validation
315 // would require accumulating references and validating at end
316 stats.seen_ids.insert(full_id);
317 } else {
318 stats.seen_ids.insert(full_id);
319 }
320 }
321 NodeEvent::ListEnd { .. } => {
322 // List validation complete
323 }
324 NodeEvent::Scalar { .. } => {
325 // Scalar validation - no action needed
326 }
327 NodeEvent::ObjectStart { .. } => {
328 // Object start - no action needed
329 }
330 NodeEvent::ObjectEnd { .. } => {
331 // Object end - no action needed
332 }
333 NodeEvent::EndOfDocument => {
334 // Document complete
335 break;
336 }
337 }
338 }
339
340 Ok(stats)
341 }
342
343 fn name(&self) -> &'static str {
344 "validate-streaming"
345 }
346
347 fn supports_streaming(&self) -> bool {
348 true
349 }
350}
351
352// ============================================================================
353// Helper Functions for Count Hints
354// ============================================================================
355
356/// Recursively add count hints to all matrix lists in the document
357fn add_count_hints(doc: &mut hedl_core::Document) {
358 for item in doc.root.values_mut() {
359 add_count_hints_to_item(item);
360 }
361}
362
363/// Recursively add count hints to an item
364fn add_count_hints_to_item(item: &mut hedl_core::Item) {
365 use hedl_core::Item;
366
367 match item {
368 Item::List(list) => {
369 // Set count hint based on actual row count
370 list.count_hint = Some(list.rows.len());
371
372 // Recursively add child counts to each node
373 for node in &mut list.rows {
374 add_child_count_to_node(node);
375 }
376 }
377 Item::Object(map) => {
378 // Recursively process nested objects
379 for nested_item in map.values_mut() {
380 add_count_hints_to_item(nested_item);
381 }
382 }
383 Item::Scalar(_) => {
384 // Scalars don't have matrix lists
385 }
386 }
387}
388
389/// Recursively set `child_count` on nodes that have children
390fn add_child_count_to_node(node: &mut hedl_core::Node) {
391 // Calculate total number of direct children across all child types
392 let total_children: usize = node
393 .children()
394 .map_or(0, |c| c.values().map(std::vec::Vec::len).sum());
395
396 if total_children > 0 {
397 node.child_count = total_children.min(u16::MAX as usize) as u16;
398
399 // Recursively process all child nodes
400 if let Some(children) = node.children_mut() {
401 for child_list in children.values_mut() {
402 for child_node in child_list {
403 add_child_count_to_node(child_node);
404 }
405 }
406 }
407 }
408}