Skip to main content

hedl_cli/batch/
operations.rs

1// Dweve HEDL - Hierarchical Entity Data Language
2//
3// Copyright (c) 2025 Dweve IP B.V. and individual contributors.
4//
5// SPDX-License-Identifier: Apache-2.0
6//
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License in the LICENSE file at the
10// root of this repository or at: http://www.apache.org/licenses/LICENSE-2.0
11//
12// Unless required by applicable law or agreed to in writing, software
13// distributed under the License is distributed on an "AS IS" BASIS,
14// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15// See the License for the specific language governing permissions and
16// limitations under the License.
17
18//! Standard and streaming batch operations.
19
20use super::traits::{BatchOperation, StreamingBatchOperation};
21use crate::error::CliError;
22use std::collections::HashSet;
23use std::path::Path;
24
25// ============================================================================
26// Standard Operations
27// ============================================================================
28
/// Per-file validation step for batch runs.
///
/// Each file is read fully into memory, parsed, and summarised as a
/// [`ValidationStats`] value. A file that fails to read or parse is reported
/// as an error for that file.
#[derive(Clone, Debug)]
pub struct ValidationOperation {
    /// Selects the parser's reference mode: `true` parses with
    /// `ReferenceMode::Strict`, `false` with `ReferenceMode::Lenient`.
    pub strict: bool,
}
38
39impl BatchOperation for ValidationOperation {
40    type Output = ValidationStats;
41
42    fn process_file(&self, path: &Path) -> Result<Self::Output, CliError> {
43        use hedl_core::{parse_with_limits, Item, Node, ParseOptions, ReferenceMode};
44
45        let content = std::fs::read_to_string(path).map_err(|e| CliError::io_error(path, e))?;
46
47        let options = ParseOptions {
48            reference_mode: if self.strict {
49                ReferenceMode::Strict
50            } else {
51                ReferenceMode::Lenient
52            },
53            ..ParseOptions::default()
54        };
55
56        let doc = parse_with_limits(content.as_bytes(), options)
57            .map_err(|e| CliError::parse(e.to_string()))?;
58
59        // Collect statistics from the parsed document
60        let mut stats = ValidationStats::new();
61
62        // Get version from document metadata
63        stats.version = format!("{}.{}", doc.version.0, doc.version.1);
64
65        // Recursive helper to count nodes
66        fn count_node(node: &Node, stats: &mut ValidationStats) {
67            stats.node_count += 1;
68            stats.field_count += node.fields.len();
69            let full_id = format!("{}:{}", node.type_name, node.id);
70            stats.seen_ids.insert(full_id);
71
72            // Count children recursively
73            if let Some(ref children) = node.children {
74                for child_nodes in children.values() {
75                    for child in child_nodes {
76                        count_node(child, stats);
77                    }
78                }
79            }
80        }
81
82        // Recursive helper to traverse items
83        fn traverse_item(item: &Item, stats: &mut ValidationStats) {
84            match item {
85                Item::List(list) => {
86                    stats.list_count += 1;
87                    for node in &list.rows {
88                        count_node(node, stats);
89                    }
90                }
91                Item::Object(obj) => {
92                    for child_item in obj.values() {
93                        traverse_item(child_item, stats);
94                    }
95                }
96                Item::Scalar(_) => {
97                    // Scalars don't contribute to node counts
98                }
99            }
100        }
101
102        // Traverse all items in the document root
103        for item in doc.root.values() {
104            traverse_item(item, &mut stats);
105        }
106
107        Ok(stats)
108    }
109
110    fn name(&self) -> &'static str {
111        "validate"
112    }
113}
114
/// Per-file canonical-formatting step for batch runs.
///
/// `process_file` parses a file and returns its canonical text. It never
/// writes to disk itself; what happens to the returned text is up to the
/// caller.
#[derive(Clone, Debug)]
pub struct FormatOperation {
    /// When set, a file whose content differs from its canonical rendering is
    /// reported as `CliError::NotCanonical` instead of yielding the text.
    pub check: bool,
    /// Forwarded to `CanonicalConfig::with_ditto`.
    pub ditto: bool,
    /// When set, every list gets a count hint equal to its row count (and
    /// nodes get child counts) before the document is canonicalized.
    pub with_counts: bool,
}
128
129impl BatchOperation for FormatOperation {
130    type Output = String;
131
132    fn process_file(&self, path: &Path) -> Result<Self::Output, CliError> {
133        use hedl_c14n::{canonicalize_with_config, CanonicalConfig};
134        use hedl_core::parse;
135
136        let content = std::fs::read_to_string(path).map_err(|e| CliError::io_error(path, e))?;
137
138        let mut doc = parse(content.as_bytes()).map_err(|e| CliError::parse(e.to_string()))?;
139
140        // Add count hints if requested
141        if self.with_counts {
142            add_count_hints(&mut doc);
143        }
144
145        let config = CanonicalConfig::new().with_ditto(self.ditto);
146
147        let canonical = canonicalize_with_config(&doc, &config)
148            .map_err(|e| CliError::canonicalization(e.to_string()))?;
149
150        if self.check && canonical != content {
151            return Err(CliError::NotCanonical);
152        }
153
154        Ok(canonical)
155    }
156
157    fn name(&self) -> &str {
158        if self.check {
159            "format-check"
160        } else {
161            "format"
162        }
163    }
164}
165
/// Per-file lint step for batch runs.
///
/// Each file is parsed and passed to the linter; the resulting diagnostics
/// are returned as display strings.
#[derive(Clone, Debug)]
pub struct LintOperation {
    /// When set, a file with at least one diagnostic fails with
    /// `CliError::LintErrors` instead of returning its diagnostics.
    pub warn_error: bool,
}
174
175impl BatchOperation for LintOperation {
176    type Output = Vec<String>;
177
178    fn process_file(&self, path: &Path) -> Result<Self::Output, CliError> {
179        use hedl_core::parse;
180        use hedl_lint::lint;
181
182        let content = std::fs::read_to_string(path).map_err(|e| CliError::io_error(path, e))?;
183
184        let doc = parse(content.as_bytes()).map_err(|e| CliError::parse(e.to_string()))?;
185
186        let diagnostics = lint(&doc);
187
188        if self.warn_error && !diagnostics.is_empty() {
189            return Err(CliError::LintErrors);
190        }
191
192        Ok(diagnostics
193            .iter()
194            .map(std::string::ToString::to_string)
195            .collect())
196    }
197
198    fn name(&self) -> &'static str {
199        "lint"
200    }
201}
202
203// ============================================================================
204// Streaming Operations
205// ============================================================================
206
/// Summary of one validated document.
///
/// Filled in by both the standard and the streaming validation operations in
/// this module: one counter per list, node and field encountered, plus the
/// set of node identifiers seen.
#[derive(Clone, Debug, Default)]
pub struct ValidationStats {
    /// Document version rendered as `"<major>.<minor>"`; empty until set.
    pub version: String,
    /// How many lists were encountered.
    pub list_count: usize,
    /// How many nodes were encountered, nested ones included.
    pub node_count: usize,
    /// Sum of the field counts of all counted nodes.
    pub field_count: usize,
    /// Identifiers of all counted nodes, each formatted as `type:id`.
    pub seen_ids: HashSet<String>,
}

impl ValidationStats {
    /// Returns statistics with every counter at zero, an empty version string
    /// and an empty ID set (identical to [`Default::default`]).
    #[must_use]
    pub fn new() -> Self {
        Default::default()
    }
}
232
/// Validation step that consumes a file as a stream of parser events.
///
/// Unlike [`ValidationOperation`], the file is not read into a string first:
/// it is opened, wrapped in a buffered reader and handed to the streaming
/// parser from `hedl-stream`, whose events are tallied one at a time. This is
/// meant for inputs where holding the whole file in memory is undesirable,
/// such as very large files or memory-constrained environments.
///
/// # Memory Profile
///
/// - **Input**: read through a `BufReader` with an 8192-byte buffer.
/// - **Working**: the seen-ID set gains one `type:id` string per node, so it
///   grows with the number of nodes in the file (in both strict and
///   non-strict mode).
/// - **Output**: a single [`ValidationStats`] value, which owns that ID set.
///
/// NOTE(review): any additional buffering done inside the streaming parser is
/// not visible from this module.
///
/// # Examples
///
/// ```rust,no_run
/// use hedl_cli::batch::{BatchExecutor, StreamingValidationOperation};
/// use std::path::PathBuf;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let processor = BatchExecutor::default_config();
/// let files = vec![PathBuf::from("large-file.hedl")];
///
/// let operation = StreamingValidationOperation { strict: false };
/// let results = processor.process_streaming(&files, operation, true)?;
///
/// println!("Validated {} files with constant memory", results.success_count());
/// # Ok(())
/// # }
/// ```
#[derive(Clone, Debug)]
pub struct StreamingValidationOperation {
    /// Requests strict reference validation. The streaming implementation in
    /// this module records node IDs the same way whether or not this is set;
    /// it does not resolve references itself.
    pub strict: bool,
}
270
271impl StreamingBatchOperation for StreamingValidationOperation {
272    type Output = ValidationStats;
273
274    fn process_file_streaming(&self, path: &Path) -> Result<Self::Output, CliError> {
275        use hedl_stream::{NodeEvent, StreamError, StreamingParser};
276        use std::fs::File;
277        use std::io::BufReader;
278
279        let file = File::open(path).map_err(|e| CliError::io_error(path, e))?;
280        let reader = BufReader::with_capacity(8192, file);
281
282        let parser = StreamingParser::new(reader)
283            .map_err(|e: StreamError| CliError::parse(e.to_string()))?;
284
285        let mut stats = ValidationStats::new();
286        let mut _current_type = String::new();
287
288        // Process events incrementally
289        for event in parser {
290            let event = event.map_err(|e: StreamError| CliError::parse(e.to_string()))?;
291
292            match event {
293                NodeEvent::Header(info) => {
294                    // Validate version exists
295                    let version_str = format!("{}.{}", info.version.0, info.version.1);
296                    if version_str.is_empty() {
297                        return Err(CliError::parse("Missing VERSION".to_string()));
298                    }
299                    stats.version = version_str;
300                }
301                NodeEvent::ListStart { type_name, .. } => {
302                    stats.list_count += 1;
303                    _current_type = type_name;
304                }
305                NodeEvent::Node(node) => {
306                    stats.node_count += 1;
307                    stats.field_count += node.fields.len();
308
309                    // Track IDs for strict mode validation
310                    let full_id = format!("{}:{}", node.type_name, node.id);
311
312                    if self.strict {
313                        // In strict mode, validate references
314                        // For now, just track IDs - full reference validation
315                        // would require accumulating references and validating at end
316                        stats.seen_ids.insert(full_id);
317                    } else {
318                        stats.seen_ids.insert(full_id);
319                    }
320                }
321                NodeEvent::ListEnd { .. } => {
322                    // List validation complete
323                }
324                NodeEvent::Scalar { .. } => {
325                    // Scalar validation - no action needed
326                }
327                NodeEvent::ObjectStart { .. } => {
328                    // Object start - no action needed
329                }
330                NodeEvent::ObjectEnd { .. } => {
331                    // Object end - no action needed
332                }
333                NodeEvent::EndOfDocument => {
334                    // Document complete
335                    break;
336                }
337            }
338        }
339
340        Ok(stats)
341    }
342
343    fn name(&self) -> &'static str {
344        "validate-streaming"
345    }
346
347    fn supports_streaming(&self) -> bool {
348        true
349    }
350}
351
352// ============================================================================
353// Helper Functions for Count Hints
354// ============================================================================
355
356/// Recursively add count hints to all matrix lists in the document
357fn add_count_hints(doc: &mut hedl_core::Document) {
358    for item in doc.root.values_mut() {
359        add_count_hints_to_item(item);
360    }
361}
362
363/// Recursively add count hints to an item
364fn add_count_hints_to_item(item: &mut hedl_core::Item) {
365    use hedl_core::Item;
366
367    match item {
368        Item::List(list) => {
369            // Set count hint based on actual row count
370            list.count_hint = Some(list.rows.len());
371
372            // Recursively add child counts to each node
373            for node in &mut list.rows {
374                add_child_count_to_node(node);
375            }
376        }
377        Item::Object(map) => {
378            // Recursively process nested objects
379            for nested_item in map.values_mut() {
380                add_count_hints_to_item(nested_item);
381            }
382        }
383        Item::Scalar(_) => {
384            // Scalars don't have matrix lists
385        }
386    }
387}
388
389/// Recursively set `child_count` on nodes that have children
390fn add_child_count_to_node(node: &mut hedl_core::Node) {
391    // Calculate total number of direct children across all child types
392    let total_children: usize = node
393        .children()
394        .map_or(0, |c| c.values().map(std::vec::Vec::len).sum());
395
396    if total_children > 0 {
397        node.child_count = total_children.min(u16::MAX as usize) as u16;
398
399        // Recursively process all child nodes
400        if let Some(children) = node.children_mut() {
401            for child_list in children.values_mut() {
402                for child_node in child_list {
403                    add_child_count_to_node(child_node);
404                }
405            }
406        }
407    }
408}