Skip to main content

hedl_cli/batch/
config.rs

1// Dweve HEDL - Hierarchical Entity Data Language
2//
3// Copyright (c) 2025 Dweve IP B.V. and individual contributors.
4//
5// SPDX-License-Identifier: Apache-2.0
6//
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License in the LICENSE file at the
10// root of this repository or at: http://www.apache.org/licenses/LICENSE-2.0
11//
12// Unless required by applicable law or agreed to in writing, software
13// distributed under the License is distributed on an "AS IS" BASIS,
14// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15// See the License for the specific language governing permissions and
16// limitations under the License.
17
18//! Batch processing configuration.
19
20use crate::error::CliError;
21use colored::Colorize;
22
/// Configuration for batch processing operations.
///
/// Controls parallelization strategy, progress reporting, and error handling behavior.
///
/// The type implements `PartialEq`/`Eq`, so two configurations can be compared
/// directly (useful in tests and when detecting configuration changes).
///
/// # Examples
///
/// ```rust
/// use hedl_cli::batch::BatchConfig;
///
/// // Default configuration (auto parallelization)
/// let config = BatchConfig::default();
///
/// // Custom configuration
/// let config = BatchConfig {
///     parallel_threshold: 5,  // Parallelize if >= 5 files
///     max_threads: Some(4),   // Use at most 4 threads
///     progress_interval: 10,  // Update progress every 10 files
///     verbose: true,          // Show detailed progress
///     max_files: Some(10_000), // Limit to 10,000 files
/// };
///
/// // Configurations compare field by field
/// assert_eq!(config.clone(), config);
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct BatchConfig {
    /// Minimum number of files to trigger parallel processing.
    ///
    /// Files below this threshold are processed serially to avoid thread pool overhead.
    /// Default: 10
    pub parallel_threshold: usize,

    /// Maximum number of threads to use for parallel processing.
    ///
    /// When set, creates a local thread pool isolated to this batch operation.
    /// This ensures configuration always takes effect and prevents global state pollution.
    ///
    /// # Behavior
    ///
    /// - `None` (default): Uses Rayon's global thread pool (typically number of CPU cores)
    /// - `Some(n)`: Creates a local thread pool with exactly `n` threads for this operation
    ///
    /// # Thread Pool Isolation
    ///
    /// Local thread pools provide complete isolation:
    /// - No global state modification
    /// - Concurrent batch operations can use different thread counts
    /// - Configuration is guaranteed to take effect or error explicitly
    /// - Thread pool lifetime matches the `process()` call duration
    ///
    /// # Performance Considerations
    ///
    /// Local thread pool creation has small overhead (~0.5-1ms) and memory cost (~2-8MB per thread).
    /// For maximum performance with default configuration, leave as `None`.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use hedl_cli::batch::BatchConfig;
    ///
    /// // Default: uses global pool
    /// let config = BatchConfig::default();
    ///
    /// // Custom: creates local pool with 4 threads
    /// let config = BatchConfig {
    ///     max_threads: Some(4),
    ///     ..Default::default()
    /// };
    /// ```
    ///
    /// Default: None
    pub max_threads: Option<usize>,

    /// Number of files between progress updates.
    ///
    /// Progress is printed every N files processed. Set to 0 to disable.
    /// Default: 1 (update after each file)
    pub progress_interval: usize,

    /// Enable verbose progress reporting.
    ///
    /// When true, shows file names and detailed status for each file.
    /// Default: false
    pub verbose: bool,

    /// Maximum number of files allowed in a batch operation.
    ///
    /// This prevents resource exhaustion when processing very large file sets.
    /// - `Some(n)`: Limit to n files (default: 10,000)
    /// - `None`: No limit (use with caution)
    ///
    /// The value used by `BatchConfig::default()` is resolved through
    /// `get_max_batch_files()`, so it is 10,000 unless the
    /// `HEDL_MAX_BATCH_FILES` environment variable holds a valid number.
    ///
    /// # Security
    ///
    /// Protects against:
    /// - Memory exhaustion from storing millions of file paths
    /// - File descriptor exhaustion from concurrent operations
    /// - Excessive CPU time from unbounded processing
    ///
    /// # Configuration
    ///
    /// Can be overridden via:
    /// - Environment variable: `HEDL_MAX_BATCH_FILES`
    /// - CLI flag: `--max-files <N>`
    /// - Programmatic: `BatchConfig { max_files: Some(n), .. }`
    ///
    /// # Examples
    ///
    /// ```rust
    /// use hedl_cli::batch::BatchConfig;
    ///
    /// // Default limit (10,000 files)
    /// let config = BatchConfig::default();
    ///
    /// // Custom limit
    /// let config = BatchConfig {
    ///     max_files: Some(50_000),
    ///     ..Default::default()
    /// };
    ///
    /// // Unlimited (use with caution)
    /// let config = BatchConfig {
    ///     max_files: None,
    ///     ..Default::default()
    /// };
    /// ```
    pub max_files: Option<usize>,
}
147
148impl Default for BatchConfig {
149    fn default() -> Self {
150        Self {
151            parallel_threshold: 10,
152            max_threads: None,
153            progress_interval: 1,
154            verbose: false,
155            max_files: Some(get_max_batch_files()),
156        }
157    }
158}
159
/// Get maximum batch files from environment variable or default.
///
/// Reads the `HEDL_MAX_BATCH_FILES` environment variable and parses it as a
/// `usize`. Leading and trailing whitespace (including a trailing newline) is
/// ignored, so a value such as `"50000 "` is honored rather than silently
/// discarded.
///
/// Falls back to `DEFAULT_MAX_BATCH_FILES` (10,000) if the variable is not
/// set, is not valid Unicode, or does not contain a valid unsigned integer.
///
/// # Examples
///
/// ```bash
/// export HEDL_MAX_BATCH_FILES=50000
/// hedl batch-validate "*.hedl"
/// ```
pub fn get_max_batch_files() -> usize {
    const DEFAULT_MAX_BATCH_FILES: usize = 10_000;

    std::env::var("HEDL_MAX_BATCH_FILES")
        .ok()
        // Trim before parsing: `usize::from_str` rejects surrounding
        // whitespace, which would otherwise turn a harmless stray space or
        // newline into a silent fallback to the default limit.
        .and_then(|raw| raw.trim().parse::<usize>().ok())
        .unwrap_or(DEFAULT_MAX_BATCH_FILES)
}
179
180/// Validate file count against configured limit.
181///
182/// # Arguments
183///
184/// * `file_count` - Number of files to process
185/// * `max_files` - Maximum allowed files (None = unlimited)
186///
187/// # Returns
188///
189/// * `Ok(())` - File count is within limit
190/// * `Err(CliError)` - File count exceeds limit
191///
192/// # Examples
193///
194/// ```rust
195/// use hedl_cli::batch::validate_file_count;
196///
197/// // Within limit
198/// assert!(validate_file_count(100, Some(1000)).is_ok());
199///
200/// // Exceeds limit
201/// assert!(validate_file_count(2000, Some(1000)).is_err());
202///
203/// // Unlimited
204/// assert!(validate_file_count(1_000_000, None).is_ok());
205/// ```
206pub fn validate_file_count(file_count: usize, max_files: Option<usize>) -> Result<(), CliError> {
207    if let Some(limit) = max_files {
208        if file_count > limit {
209            return Err(CliError::invalid_input(format!(
210                "File count ({file_count}) exceeds maximum limit ({limit}). \
211                 Consider:\n  \
212                 - Refining glob patterns to match fewer files\n  \
213                 - Using --max-files flag to increase limit\n  \
214                 - Setting HEDL_MAX_BATCH_FILES environment variable\n  \
215                 - Processing files in smaller batches"
216            )));
217        }
218    }
219    Ok(())
220}
221
222/// Warn if file count is large and suggest verbose mode.
223///
224/// Prints a warning when processing many files to inform user of operation scale.
225///
226/// # Arguments
227///
228/// * `file_count` - Number of files to process
229/// * `verbose` - Whether verbose mode is enabled
230///
231/// # Threshold
232///
233/// Warns if `file_count` >= 1000 and not already in verbose mode.
234pub fn warn_large_batch(file_count: usize, verbose: bool) {
235    const WARN_THRESHOLD: usize = 1_000;
236
237    if file_count >= WARN_THRESHOLD && !verbose {
238        eprintln!(
239            "{} Processing {} files. Consider using {} for progress updates.",
240            "Warning:".yellow().bold(),
241            file_count.to_string().bright_white(),
242            "--verbose".bright_cyan()
243        );
244    }
245}