hedl_cli/batch/config.rs
1// Dweve HEDL - Hierarchical Entity Data Language
2//
3// Copyright (c) 2025 Dweve IP B.V. and individual contributors.
4//
5// SPDX-License-Identifier: Apache-2.0
6//
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License in the LICENSE file at the
10// root of this repository or at: http://www.apache.org/licenses/LICENSE-2.0
11//
12// Unless required by applicable law or agreed to in writing, software
13// distributed under the License is distributed on an "AS IS" BASIS,
14// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15// See the License for the specific language governing permissions and
16// limitations under the License.
17
18//! Batch processing configuration.
19
20use crate::error::CliError;
21use colored::Colorize;
22
/// Tunable settings for a batch processing run.
///
/// Groups the knobs that decide when work is parallelized, how many threads
/// are used, how often progress is reported, and how many files a single
/// batch may contain.
///
/// # Examples
///
/// ```rust
/// use hedl_cli::batch::BatchConfig;
///
/// // Start from the defaults (automatic parallelization).
/// let config = BatchConfig::default();
///
/// // Or spell out every field.
/// let config = BatchConfig {
///     parallel_threshold: 5,   // go parallel at 5 or more files
///     max_threads: Some(4),    // never use more than 4 threads
///     progress_interval: 10,   // report progress every 10 files
///     verbose: true,           // print detailed per-file progress
///     max_files: Some(10_000), // refuse batches above 10,000 files
/// };
/// ```
#[derive(Debug, Clone)]
pub struct BatchConfig {
    /// File count at which processing switches from serial to parallel.
    ///
    /// Smaller batches are handled serially so that thread pool overhead is
    /// not paid for a handful of files.
    ///
    /// Default: 10
    pub parallel_threshold: usize,

    /// Upper bound on the number of worker threads for parallel processing.
    ///
    /// # Behavior
    ///
    /// - `None` (default): Rayon's global thread pool is used (typically one
    ///   thread per CPU core).
    /// - `Some(n)`: a local thread pool with exactly `n` threads is created
    ///   for this batch operation only.
    ///
    /// # Thread Pool Isolation
    ///
    /// A local pool keeps the setting scoped to one operation:
    /// - global state is not modified
    /// - concurrent batch operations may each use a different thread count
    /// - the setting either takes effect or an explicit error is reported
    /// - the pool lives only for the duration of the `process()` call
    ///
    /// # Performance Considerations
    ///
    /// Creating a local pool has a small cost (~0.5-1ms) plus memory for each
    /// thread (~2-8MB). Leave this as `None` when the default configuration
    /// is sufficient and maximum performance is wanted.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use hedl_cli::batch::BatchConfig;
    ///
    /// // Global pool (default).
    /// let config = BatchConfig::default();
    ///
    /// // Dedicated local pool with 4 threads.
    /// let config = BatchConfig {
    ///     max_threads: Some(4),
    ///     ..Default::default()
    /// };
    /// ```
    ///
    /// Default: None
    pub max_threads: Option<usize>,

    /// How many files are processed between two progress updates.
    ///
    /// A value of N prints progress after every N files; 0 turns progress
    /// output off.
    ///
    /// Default: 1 (update after each file)
    pub progress_interval: usize,

    /// Whether detailed progress output is enabled.
    ///
    /// When `true`, file names and a detailed status are shown for each file.
    ///
    /// Default: false
    pub verbose: bool,

    /// Largest number of files a single batch operation may contain.
    ///
    /// Guards against resource exhaustion on very large file sets.
    /// - `Some(n)`: at most `n` files are accepted
    /// - `None`: no limit (use with caution)
    ///
    /// The `Default` implementation sets this to
    /// `Some(get_max_batch_files())`, which is 10,000 unless the
    /// `HEDL_MAX_BATCH_FILES` environment variable overrides it.
    ///
    /// # Security
    ///
    /// The limit protects against:
    /// - memory exhaustion from storing millions of file paths
    /// - file descriptor exhaustion from concurrent operations
    /// - excessive CPU time from unbounded processing
    ///
    /// # Configuration
    ///
    /// The limit can be set through:
    /// - the `HEDL_MAX_BATCH_FILES` environment variable
    /// - the `--max-files <N>` CLI flag
    /// - code: `BatchConfig { max_files: Some(n), .. }`
    ///
    /// # Examples
    ///
    /// ```rust
    /// use hedl_cli::batch::BatchConfig;
    ///
    /// // Default limit (10,000 files).
    /// let config = BatchConfig::default();
    ///
    /// // Raised limit.
    /// let config = BatchConfig {
    ///     max_files: Some(50_000),
    ///     ..Default::default()
    /// };
    ///
    /// // No limit at all (use with caution).
    /// let config = BatchConfig {
    ///     max_files: None,
    ///     ..Default::default()
    /// };
    /// ```
    pub max_files: Option<usize>,
}
147
148impl Default for BatchConfig {
149 fn default() -> Self {
150 Self {
151 parallel_threshold: 10,
152 max_threads: None,
153 progress_interval: 1,
154 verbose: false,
155 max_files: Some(get_max_batch_files()),
156 }
157 }
158}
159
/// Get maximum batch files from environment variable or default.
///
/// Reads the `HEDL_MAX_BATCH_FILES` environment variable and parses it as a
/// `usize`. Leading and trailing whitespace is ignored, so a value such as
/// `"50000 "` or one carrying a trailing newline (common when the variable is
/// populated from a script or an env file) is accepted instead of being
/// silently discarded.
///
/// # Returns
///
/// The parsed limit, or `DEFAULT_MAX_BATCH_FILES` (10,000) when the variable
/// is unset, is not valid Unicode, or does not hold a non-negative integer
/// that fits in `usize`.
///
/// # Examples
///
/// ```bash
/// export HEDL_MAX_BATCH_FILES=50000
/// hedl batch-validate "*.hedl"
/// ```
pub fn get_max_batch_files() -> usize {
    const DEFAULT_MAX_BATCH_FILES: usize = 10_000;

    std::env::var("HEDL_MAX_BATCH_FILES")
        .ok()
        // Integer parsing treats surrounding whitespace as an error, so trim
        // before parsing rather than falling back on an otherwise valid value.
        .and_then(|raw| raw.trim().parse::<usize>().ok())
        .unwrap_or(DEFAULT_MAX_BATCH_FILES)
}
179
180/// Validate file count against configured limit.
181///
182/// # Arguments
183///
184/// * `file_count` - Number of files to process
185/// * `max_files` - Maximum allowed files (None = unlimited)
186///
187/// # Returns
188///
189/// * `Ok(())` - File count is within limit
190/// * `Err(CliError)` - File count exceeds limit
191///
192/// # Examples
193///
194/// ```rust
195/// use hedl_cli::batch::validate_file_count;
196///
197/// // Within limit
198/// assert!(validate_file_count(100, Some(1000)).is_ok());
199///
200/// // Exceeds limit
201/// assert!(validate_file_count(2000, Some(1000)).is_err());
202///
203/// // Unlimited
204/// assert!(validate_file_count(1_000_000, None).is_ok());
205/// ```
206pub fn validate_file_count(file_count: usize, max_files: Option<usize>) -> Result<(), CliError> {
207 if let Some(limit) = max_files {
208 if file_count > limit {
209 return Err(CliError::invalid_input(format!(
210 "File count ({file_count}) exceeds maximum limit ({limit}). \
211 Consider:\n \
212 - Refining glob patterns to match fewer files\n \
213 - Using --max-files flag to increase limit\n \
214 - Setting HEDL_MAX_BATCH_FILES environment variable\n \
215 - Processing files in smaller batches"
216 )));
217 }
218 }
219 Ok(())
220}
221
222/// Warn if file count is large and suggest verbose mode.
223///
224/// Prints a warning when processing many files to inform user of operation scale.
225///
226/// # Arguments
227///
228/// * `file_count` - Number of files to process
229/// * `verbose` - Whether verbose mode is enabled
230///
231/// # Threshold
232///
233/// Warns if `file_count` >= 1000 and not already in verbose mode.
234pub fn warn_large_batch(file_count: usize, verbose: bool) {
235 const WARN_THRESHOLD: usize = 1_000;
236
237 if file_count >= WARN_THRESHOLD && !verbose {
238 eprintln!(
239 "{} Processing {} files. Consider using {} for progress updates.",
240 "Warning:".yellow().bold(),
241 file_count.to_string().bright_white(),
242 "--verbose".bright_cyan()
243 );
244 }
245}