llm_utl/
config.rs

1use crate::error::{Error, Result};
2use crate::filter::{FileFilterConfig, FilterConfig};
3use crate::preset::PresetKind;
4use crate::token::TokenizerKind;
5use std::collections::HashMap;
6use std::path::PathBuf;
7
/// Token limit per chunk used when the builder is given no explicit value.
const DEFAULT_MAX_TOKENS: usize = 100_000;
/// Number of overlap tokens between chunks used when none is configured.
const DEFAULT_OVERLAP_TOKENS: usize = 1_000;
/// Safety margin (in tokens) used when none is configured.
const DEFAULT_CHUNK_SAFETY_MARGIN: usize = 2_000;
/// Output filename pattern used when none is configured.
const DEFAULT_OUTPUT_PATTERN: &str = "prompt_{index:03}.{ext}";
12
/// The format in which generated prompts are written.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OutputFormat {
    /// Markdown output using code blocks.
    Markdown,
    /// XML output using structured tags.
    Xml,
    /// JSON output carrying metadata.
    Json,
    /// User-defined output rendered from an external template.
    Custom,
}
25
26impl OutputFormat {
27    /// Returns the file extension for this format.
28    ///
29    /// For Custom format, returns a default extension "txt".
30    /// Use `Config::custom_extension` for the actual custom extension.
31    #[must_use]
32    pub const fn extension(self) -> &'static str {
33        match self {
34            Self::Markdown => "md",
35            Self::Xml => "xml",
36            Self::Json => "json",
37            Self::Custom => "txt",
38        }
39    }
40
41    /// Returns the template name for this format.
42    ///
43    /// For Custom format, returns a default name "custom".
44    /// Use `Config::custom_format_name` for the actual custom template name.
45    #[must_use]
46    pub const fn template_name(self) -> &'static str {
47        match self {
48            Self::Markdown => "markdown",
49            Self::Xml => "xml",
50            Self::Json => "json",
51            Self::Custom => "custom",
52        }
53    }
54}
55
56/// Configuration for the llm-utl pipeline.
57///
58/// Use [`Config::builder()`] to construct a new configuration.
59#[derive(Debug, Clone)]
60#[non_exhaustive]
61pub struct Config {
62    /// Root directory to scan for files
63    pub root_dir: PathBuf,
64
65    /// Output directory for generated prompts
66    pub output_dir: PathBuf,
67
68    /// Output filename pattern (supports {index}, {index:03}, {ext})
69    pub output_pattern: String,
70
71    /// Output format
72    pub format: OutputFormat,
73
74    /// Maximum tokens per chunk
75    pub max_tokens: usize,
76
77    /// Overlap tokens between chunks for context continuity
78    pub overlap_tokens: usize,
79
80    /// Safety margin to prevent exceeding limits
81    pub chunk_safety_margin: usize,
82
83    /// Tokenizer implementation to use
84    pub tokenizer: TokenizerKind,
85
86    /// Whether to prefer splitting at line boundaries
87    pub prefer_line_boundaries: bool,
88
89    /// Code filtering configuration
90    pub filter_config: FilterConfig,
91
92    /// Code filtering configuration
93    pub file_filter_config: FileFilterConfig,
94
95    /// LLM preset for specialized output
96    pub preset: Option<PresetKind>,
97
98    /// Dry run mode (no file writes)
99    pub dry_run: bool,
100
101    /// Include binary files in output
102    pub include_binary_files: bool,
103
104    /// Create backups of existing files
105    pub backup_existing: bool,
106
107    /// Path to external template file
108    pub template_path: Option<PathBuf>,
109
110    /// Custom format name (used with Custom output format)
111    pub custom_format_name: Option<String>,
112
113    /// Custom file extension (used with Custom output format)
114    pub custom_extension: Option<String>,
115
116    /// Custom data to pass to templates
117    pub custom_data: HashMap<String, serde_json::Value>,
118}
119
120impl Config {
121    /// Creates a new configuration builder.
122    ///
123    /// # Examples
124    ///
125    /// ```
126    /// use llm_utl::Config;
127    ///
128    /// let config = Config::builder()
129    ///     .root_dir("./src")
130    ///     .max_tokens(50_000)
131    ///     .build()
132    ///     .expect("valid configuration");
133    /// ```
134    #[must_use]
135    pub fn builder() -> ConfigBuilder {
136        ConfigBuilder::default()
137    }
138
139    /// Validates the configuration.
140    ///
141    /// # Errors
142    ///
143    /// Returns an error if:
144    /// - Root directory doesn't exist
145    /// - Token limits are invalid
146    /// - Output pattern is invalid
147    pub fn validate(&self) -> Result<()> {
148        // Validate root directory
149        if !self.root_dir.exists() {
150            return Err(Error::config(format!(
151                "Root directory does not exist: {}",
152                self.root_dir.display()
153            )));
154        }
155
156        if !self.root_dir.is_dir() {
157            return Err(Error::config(format!(
158                "Root path is not a directory: {}",
159                self.root_dir.display()
160            )));
161        }
162
163        // Validate token limits
164        if self.max_tokens == 0 {
165            return Err(Error::config("max_tokens must be greater than 0"));
166        }
167
168        if self.overlap_tokens >= self.max_tokens {
169            return Err(Error::config(format!(
170                "overlap_tokens ({}) must be less than max_tokens ({})",
171                self.overlap_tokens, self.max_tokens
172            )));
173        }
174
175        if self.chunk_safety_margin >= self.max_tokens {
176            return Err(Error::config(format!(
177                "chunk_safety_margin ({}) must be less than max_tokens ({})",
178                self.chunk_safety_margin, self.max_tokens
179            )));
180        }
181
182        // Validate output pattern
183        if !self.output_pattern.contains("{index") {
184            return Err(Error::invalid_pattern(
185                &self.output_pattern,
186                "Pattern must contain {index} or {index:03} placeholder",
187            ));
188        }
189
190        if !self.output_pattern.contains("{ext}") {
191            return Err(Error::invalid_pattern(
192                &self.output_pattern,
193                "Pattern must contain {ext} placeholder",
194            ));
195        }
196
197        // Validate template configuration
198        if let Some(ref template_path) = self.template_path {
199            // Validate template file exists and is valid
200            if !template_path.exists() {
201                return Err(Error::config(format!(
202                    "Template file does not exist: {}",
203                    template_path.display()
204                )));
205            }
206
207            if !template_path.is_file() {
208                return Err(Error::config(format!(
209                    "Template path is not a file: {}",
210                    template_path.display()
211                )));
212            }
213
214            // Validate template using TemplateValidator
215            crate::template_validator::TemplateValidator::validate_template(template_path)?;
216        }
217
218        // Validate Custom format requirements
219        if matches!(self.format, OutputFormat::Custom) {
220            if self.custom_format_name.is_none() {
221                return Err(Error::config(
222                    "Custom format requires custom_format_name. \
223                    Use Config::builder().custom_format_name(\"my_format\")",
224                ));
225            }
226
227            if self.custom_extension.is_none() {
228                return Err(Error::config(
229                    "Custom format requires custom_extension. \
230                    Use Config::builder().custom_extension(\"txt\")",
231                ));
232            }
233
234            if self.template_path.is_none() {
235                return Err(Error::config(
236                    "Custom format requires template_path. \
237                    Use Config::builder().template_path(\"./template.tera\")",
238                ));
239            }
240        } else {
241            // Warn if custom settings are provided for non-Custom format
242            if self.custom_format_name.is_some() || self.custom_extension.is_some() {
243                tracing::warn!(
244                    "custom_format_name and custom_extension are only used with OutputFormat::Custom. \
245                    Current format: {:?}",
246                    self.format
247                );
248            }
249        }
250
251        Ok(())
252    }
253
254    /// Returns the effective chunk size after applying safety margin.
255    #[must_use]
256    pub const fn effective_chunk_size(&self) -> usize {
257        self.max_tokens.saturating_sub(self.chunk_safety_margin)
258    }
259}
260
261impl Default for Config {
262    fn default() -> Self {
263        Self {
264            root_dir: PathBuf::from("."),
265            output_dir: PathBuf::from("out"),
266            output_pattern: DEFAULT_OUTPUT_PATTERN.to_string(),
267            format: OutputFormat::Markdown,
268            max_tokens: DEFAULT_MAX_TOKENS,
269            overlap_tokens: DEFAULT_OVERLAP_TOKENS,
270            chunk_safety_margin: DEFAULT_CHUNK_SAFETY_MARGIN,
271            tokenizer: TokenizerKind::Simple,
272            prefer_line_boundaries: true,
273            filter_config: FilterConfig::default(),
274            file_filter_config: FileFilterConfig::default(),
275            preset: None,
276            dry_run: false,
277            include_binary_files: false,
278            backup_existing: true,
279            template_path: None,
280            custom_format_name: None,
281            custom_extension: None,
282            custom_data: HashMap::new(),
283        }
284    }
285}
286
287/// Builder for creating a [`Config`].
288#[derive(Debug, Default)]
289pub struct ConfigBuilder {
290    root_dir: Option<PathBuf>,
291    output_dir: Option<PathBuf>,
292    output_pattern: Option<String>,
293    format: Option<OutputFormat>,
294    max_tokens: Option<usize>,
295    overlap_tokens: Option<usize>,
296    chunk_safety_margin: Option<usize>,
297    tokenizer: Option<TokenizerKind>,
298    prefer_line_boundaries: Option<bool>,
299    filter_config: Option<FilterConfig>,
300    file_filter_config: Option<FileFilterConfig>,
301    preset: Option<PresetKind>,
302    dry_run: bool,
303    include_binary_files: bool,
304    backup_existing: Option<bool>,
305    template_path: Option<PathBuf>,
306    custom_format_name: Option<String>,
307    custom_extension: Option<String>,
308    custom_data: HashMap<String, serde_json::Value>,
309}
310
311impl ConfigBuilder {
312    /// Sets the root directory to scan.
313    #[must_use]
314    pub fn root_dir(mut self, path: impl Into<PathBuf>) -> Self {
315        self.root_dir = Some(path.into());
316        self
317    }
318
319    /// Sets the output directory for generated files.
320    #[must_use]
321    pub fn output_dir(mut self, path: impl Into<PathBuf>) -> Self {
322        self.output_dir = Some(path.into());
323        self
324    }
325
326    /// Sets the output filename pattern.
327    ///
328    /// Pattern must contain `{index}` and `{ext}` placeholders.
329    #[must_use]
330    pub fn output_pattern(mut self, pattern: impl Into<String>) -> Self {
331        self.output_pattern = Some(pattern.into());
332        self
333    }
334
335    /// Sets the output format.
336    #[must_use]
337    pub fn format(mut self, format: OutputFormat) -> Self {
338        self.format = Some(format);
339        self
340    }
341
342    /// Sets the maximum tokens per chunk.
343    #[must_use]
344    pub fn max_tokens(mut self, tokens: usize) -> Self {
345        self.max_tokens = Some(tokens);
346        self
347    }
348
349    /// Sets the overlap tokens between chunks.
350    #[must_use]
351    pub fn overlap_tokens(mut self, tokens: usize) -> Self {
352        self.overlap_tokens = Some(tokens);
353        self
354    }
355
356    /// Sets the chunk safety margin.
357    #[must_use]
358    pub fn chunk_safety_margin(mut self, margin: usize) -> Self {
359        self.chunk_safety_margin = Some(margin);
360        self
361    }
362
363    /// Sets the tokenizer implementation.
364    #[must_use]
365    pub fn tokenizer(mut self, kind: TokenizerKind) -> Self {
366        self.tokenizer = Some(kind);
367        self
368    }
369
370    /// Enables or disables line boundary preference.
371    #[must_use]
372    pub fn prefer_line_boundaries(mut self, enabled: bool) -> Self {
373        self.prefer_line_boundaries = Some(enabled);
374        self
375    }
376
377    /// Enables dry run mode (no file writes).
378    #[must_use]
379    pub fn dry_run(mut self, enabled: bool) -> Self {
380        self.dry_run = enabled;
381        self
382    }
383
384    /// Enables or disables binary file inclusion.
385    #[must_use]
386    pub fn include_binary_files(mut self, enabled: bool) -> Self {
387        self.include_binary_files = enabled;
388        self
389    }
390
391    /// Enables or disables backup creation.
392    #[must_use]
393    pub fn backup_existing(mut self, enabled: bool) -> Self {
394        self.backup_existing = Some(enabled);
395        self
396    }
397
398    /// Sets the code filtering configuration.
399    #[must_use]
400    pub fn filter_config(mut self, config: FilterConfig) -> Self {
401        self.filter_config = Some(config);
402        self
403    }
404
405    /// Sets the code filtering configuration.
406    #[must_use]
407    pub fn file_filter_config(mut self, config: FileFilterConfig) -> Self {
408        self.file_filter_config = Some(config);
409        self
410    }
411
412    /// Sets the LLM preset.
413    #[must_use]
414    pub fn preset(mut self, preset: PresetKind) -> Self {
415        self.preset = Some(preset);
416        self
417    }
418
419    /// Sets the path to an external template file.
420    ///
421    /// When provided, this template will be used instead of the built-in template
422    /// for the selected format. The template file must exist and contain valid Tera syntax.
423    #[must_use]
424    pub fn template_path(mut self, path: impl Into<PathBuf>) -> Self {
425        self.template_path = Some(path.into());
426        self
427    }
428
429    /// Sets the custom format name.
430    ///
431    /// Required when using `OutputFormat::Custom`. This name will be used
432    /// internally to identify the custom template.
433    #[must_use]
434    pub fn custom_format_name(mut self, name: impl Into<String>) -> Self {
435        self.custom_format_name = Some(name.into());
436        self
437    }
438
439    /// Sets the custom file extension.
440    ///
441    /// Required when using `OutputFormat::Custom`. This extension will be used
442    /// for output files (without the leading dot).
443    ///
444    /// # Examples
445    ///
446    /// ```no_run
447    /// use llm_utl::{Config, OutputFormat};
448    ///
449    /// let config = Config::builder()
450    ///     .root_dir(".")
451    ///     .format(OutputFormat::Custom)
452    ///     .custom_extension("txt")  // Files will be .txt
453    ///     .custom_format_name("my_format")
454    ///     .template_path("./template.tera")
455    ///     .build()
456    ///     .expect("valid config");
457    /// ```
458    #[must_use]
459    pub fn custom_extension(mut self, ext: impl Into<String>) -> Self {
460        self.custom_extension = Some(ext.into());
461        self
462    }
463
464    /// Sets custom data to be passed to templates.
465    ///
466    /// This data will be available in templates under the `ctx.custom` namespace.
467    ///
468    /// # Examples
469    ///
470    /// ```no_run
471    /// use llm_utl::Config;
472    /// use std::collections::HashMap;
473    /// use serde_json::Value;
474    ///
475    /// let mut custom_data = HashMap::new();
476    /// custom_data.insert("version".to_string(), Value::String("1.0.0".to_string()));
477    /// custom_data.insert("author".to_string(), Value::String("John Doe".to_string()));
478    ///
479    /// let config = Config::builder()
480    ///     .root_dir(".")
481    ///     .custom_data(custom_data)
482    ///     .build()
483    ///     .expect("valid config");
484    /// ```
485    #[must_use]
486    pub fn custom_data(mut self, data: HashMap<String, serde_json::Value>) -> Self {
487        self.custom_data = data;
488        self
489    }
490
491    /// Builds the configuration.
492    ///
493    /// # Errors
494    ///
495    /// Returns an error if validation fails.
496    pub fn build(self) -> Result<Config> {
497        let config = Config {
498            root_dir: self.root_dir.unwrap_or_else(|| PathBuf::from(".")),
499            output_dir: self.output_dir.unwrap_or_else(|| PathBuf::from("out")),
500            output_pattern: self
501                .output_pattern
502                .unwrap_or_else(|| DEFAULT_OUTPUT_PATTERN.to_string()),
503            format: self.format.unwrap_or(OutputFormat::Markdown),
504            max_tokens: self.max_tokens.unwrap_or(DEFAULT_MAX_TOKENS),
505            overlap_tokens: self.overlap_tokens.unwrap_or(DEFAULT_OVERLAP_TOKENS),
506            chunk_safety_margin: self
507                .chunk_safety_margin
508                .unwrap_or(DEFAULT_CHUNK_SAFETY_MARGIN),
509            tokenizer: self.tokenizer.unwrap_or(TokenizerKind::Simple),
510            prefer_line_boundaries: self.prefer_line_boundaries.unwrap_or(true),
511            filter_config: self.filter_config.unwrap_or_default(),
512            file_filter_config: self.file_filter_config.unwrap_or_default(),
513            preset: self.preset,
514            dry_run: self.dry_run,
515            include_binary_files: self.include_binary_files,
516            backup_existing: self.backup_existing.unwrap_or(true),
517            template_path: self.template_path,
518            custom_format_name: self.custom_format_name,
519            custom_extension: self.custom_extension,
520            custom_data: self.custom_data,
521        };
522
523        config.validate()?;
524        Ok(config)
525    }
526}
527
#[cfg(test)]
mod tests {
    use super::*;

    /// Creates a fresh temporary directory to serve as a valid root.
    fn scratch_dir() -> assert_fs::TempDir {
        assert_fs::TempDir::new().unwrap()
    }

    /// A builder given only a valid root yields the default limits and format.
    #[test]
    fn test_default_config() {
        let root = scratch_dir();
        let built = Config::builder().root_dir(root.path()).build().unwrap();

        assert_eq!(built.max_tokens, DEFAULT_MAX_TOKENS);
        assert_eq!(built.format, OutputFormat::Markdown);
    }

    /// A root directory that does not exist is rejected.
    #[test]
    fn test_invalid_root_dir() {
        let builder = Config::builder().root_dir("/nonexistent/path/that/should/not/exist");

        assert!(builder.build().is_err());
    }

    /// An overlap equal to the token limit is rejected.
    #[test]
    fn test_invalid_token_limits() {
        let root = scratch_dir();
        let builder = Config::builder()
            .root_dir(root.path())
            .max_tokens(1000)
            .overlap_tokens(1000);

        assert!(builder.build().is_err());
    }

    /// A pattern without the required placeholders is rejected.
    #[test]
    fn test_invalid_pattern() {
        let root = scratch_dir();
        let builder = Config::builder()
            .root_dir(root.path())
            .output_pattern("invalid_pattern");

        assert!(builder.build().is_err());
    }
}