1use crate::error::{Error, Result};
2use crate::filter::{FileFilterConfig, FilterConfig};
3use crate::preset::PresetKind;
4use crate::token::TokenizerKind;
5use std::collections::HashMap;
6use std::path::PathBuf;
7
/// Value used for `Config::max_tokens` when the caller does not set one.
const DEFAULT_MAX_TOKENS: usize = 100 * 1_000;
/// Value used for `Config::overlap_tokens` when the caller does not set one.
const DEFAULT_OVERLAP_TOKENS: usize = 1000;
/// Value used for `Config::chunk_safety_margin` when the caller does not set one.
const DEFAULT_CHUNK_SAFETY_MARGIN: usize = 2 * 1_000;
/// Value used for `Config::output_pattern` when the caller does not set one.
///
/// Contains both placeholders that `Config::validate` requires
/// (`{index…}` and `{ext}`).
const DEFAULT_OUTPUT_PATTERN: &str = "prompt_{index:03}.{ext}";
12
/// Output format selected for the generated files.
///
/// The enum is a plain, fieldless value type, so it derives the full set of
/// common comparison traits (including `Hash`, which allows it to be used as a
/// `HashMap`/`HashSet` key).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum OutputFormat {
    /// Markdown output.
    Markdown,
    /// XML output.
    Xml,
    /// JSON output.
    Json,
    /// Caller-defined format.
    ///
    /// `Config::validate` requires `custom_format_name`, `custom_extension`
    /// and `template_path` to be set when this variant is selected.
    Custom,
}
25
26impl OutputFormat {
27 #[must_use]
32 pub const fn extension(self) -> &'static str {
33 match self {
34 Self::Markdown => "md",
35 Self::Xml => "xml",
36 Self::Json => "json",
37 Self::Custom => "txt",
38 }
39 }
40
41 #[must_use]
46 pub const fn template_name(self) -> &'static str {
47 match self {
48 Self::Markdown => "markdown",
49 Self::Xml => "xml",
50 Self::Json => "json",
51 Self::Custom => "custom",
52 }
53 }
54}
55
56#[derive(Debug, Clone)]
60#[non_exhaustive]
61pub struct Config {
62 pub root_dir: PathBuf,
64
65 pub output_dir: PathBuf,
67
68 pub output_pattern: String,
70
71 pub format: OutputFormat,
73
74 pub max_tokens: usize,
76
77 pub overlap_tokens: usize,
79
80 pub chunk_safety_margin: usize,
82
83 pub tokenizer: TokenizerKind,
85
86 pub prefer_line_boundaries: bool,
88
89 pub filter_config: FilterConfig,
91
92 pub file_filter_config: FileFilterConfig,
94
95 pub preset: Option<PresetKind>,
97
98 pub dry_run: bool,
100
101 pub include_binary_files: bool,
103
104 pub backup_existing: bool,
106
107 pub template_path: Option<PathBuf>,
109
110 pub custom_format_name: Option<String>,
112
113 pub custom_extension: Option<String>,
115
116 pub custom_data: HashMap<String, serde_json::Value>,
118}
119
120impl Config {
121 #[must_use]
135 pub fn builder() -> ConfigBuilder {
136 ConfigBuilder::default()
137 }
138
139 pub fn validate(&self) -> Result<()> {
148 if !self.root_dir.exists() {
150 return Err(Error::config(format!(
151 "Root directory does not exist: {}",
152 self.root_dir.display()
153 )));
154 }
155
156 if !self.root_dir.is_dir() {
157 return Err(Error::config(format!(
158 "Root path is not a directory: {}",
159 self.root_dir.display()
160 )));
161 }
162
163 if self.max_tokens == 0 {
165 return Err(Error::config("max_tokens must be greater than 0"));
166 }
167
168 if self.overlap_tokens >= self.max_tokens {
169 return Err(Error::config(format!(
170 "overlap_tokens ({}) must be less than max_tokens ({})",
171 self.overlap_tokens, self.max_tokens
172 )));
173 }
174
175 if self.chunk_safety_margin >= self.max_tokens {
176 return Err(Error::config(format!(
177 "chunk_safety_margin ({}) must be less than max_tokens ({})",
178 self.chunk_safety_margin, self.max_tokens
179 )));
180 }
181
182 if !self.output_pattern.contains("{index") {
184 return Err(Error::invalid_pattern(
185 &self.output_pattern,
186 "Pattern must contain {index} or {index:03} placeholder",
187 ));
188 }
189
190 if !self.output_pattern.contains("{ext}") {
191 return Err(Error::invalid_pattern(
192 &self.output_pattern,
193 "Pattern must contain {ext} placeholder",
194 ));
195 }
196
197 if let Some(ref template_path) = self.template_path {
199 if !template_path.exists() {
201 return Err(Error::config(format!(
202 "Template file does not exist: {}",
203 template_path.display()
204 )));
205 }
206
207 if !template_path.is_file() {
208 return Err(Error::config(format!(
209 "Template path is not a file: {}",
210 template_path.display()
211 )));
212 }
213
214 crate::template_validator::TemplateValidator::validate_template(template_path)?;
216 }
217
218 if matches!(self.format, OutputFormat::Custom) {
220 if self.custom_format_name.is_none() {
221 return Err(Error::config(
222 "Custom format requires custom_format_name. \
223 Use Config::builder().custom_format_name(\"my_format\")",
224 ));
225 }
226
227 if self.custom_extension.is_none() {
228 return Err(Error::config(
229 "Custom format requires custom_extension. \
230 Use Config::builder().custom_extension(\"txt\")",
231 ));
232 }
233
234 if self.template_path.is_none() {
235 return Err(Error::config(
236 "Custom format requires template_path. \
237 Use Config::builder().template_path(\"./template.tera\")",
238 ));
239 }
240 } else {
241 if self.custom_format_name.is_some() || self.custom_extension.is_some() {
243 tracing::warn!(
244 "custom_format_name and custom_extension are only used with OutputFormat::Custom. \
245 Current format: {:?}",
246 self.format
247 );
248 }
249 }
250
251 Ok(())
252 }
253
254 #[must_use]
256 pub const fn effective_chunk_size(&self) -> usize {
257 self.max_tokens.saturating_sub(self.chunk_safety_margin)
258 }
259}
260
261impl Default for Config {
262 fn default() -> Self {
263 Self {
264 root_dir: PathBuf::from("."),
265 output_dir: PathBuf::from("out"),
266 output_pattern: DEFAULT_OUTPUT_PATTERN.to_string(),
267 format: OutputFormat::Markdown,
268 max_tokens: DEFAULT_MAX_TOKENS,
269 overlap_tokens: DEFAULT_OVERLAP_TOKENS,
270 chunk_safety_margin: DEFAULT_CHUNK_SAFETY_MARGIN,
271 tokenizer: TokenizerKind::Simple,
272 prefer_line_boundaries: true,
273 filter_config: FilterConfig::default(),
274 file_filter_config: FileFilterConfig::default(),
275 preset: None,
276 dry_run: false,
277 include_binary_files: false,
278 backup_existing: true,
279 template_path: None,
280 custom_format_name: None,
281 custom_extension: None,
282 custom_data: HashMap::new(),
283 }
284 }
285}
286
287#[derive(Debug, Default)]
289pub struct ConfigBuilder {
290 root_dir: Option<PathBuf>,
291 output_dir: Option<PathBuf>,
292 output_pattern: Option<String>,
293 format: Option<OutputFormat>,
294 max_tokens: Option<usize>,
295 overlap_tokens: Option<usize>,
296 chunk_safety_margin: Option<usize>,
297 tokenizer: Option<TokenizerKind>,
298 prefer_line_boundaries: Option<bool>,
299 filter_config: Option<FilterConfig>,
300 file_filter_config: Option<FileFilterConfig>,
301 preset: Option<PresetKind>,
302 dry_run: bool,
303 include_binary_files: bool,
304 backup_existing: Option<bool>,
305 template_path: Option<PathBuf>,
306 custom_format_name: Option<String>,
307 custom_extension: Option<String>,
308 custom_data: HashMap<String, serde_json::Value>,
309}
310
311impl ConfigBuilder {
312 #[must_use]
314 pub fn root_dir(mut self, path: impl Into<PathBuf>) -> Self {
315 self.root_dir = Some(path.into());
316 self
317 }
318
319 #[must_use]
321 pub fn output_dir(mut self, path: impl Into<PathBuf>) -> Self {
322 self.output_dir = Some(path.into());
323 self
324 }
325
326 #[must_use]
330 pub fn output_pattern(mut self, pattern: impl Into<String>) -> Self {
331 self.output_pattern = Some(pattern.into());
332 self
333 }
334
335 #[must_use]
337 pub fn format(mut self, format: OutputFormat) -> Self {
338 self.format = Some(format);
339 self
340 }
341
342 #[must_use]
344 pub fn max_tokens(mut self, tokens: usize) -> Self {
345 self.max_tokens = Some(tokens);
346 self
347 }
348
349 #[must_use]
351 pub fn overlap_tokens(mut self, tokens: usize) -> Self {
352 self.overlap_tokens = Some(tokens);
353 self
354 }
355
356 #[must_use]
358 pub fn chunk_safety_margin(mut self, margin: usize) -> Self {
359 self.chunk_safety_margin = Some(margin);
360 self
361 }
362
363 #[must_use]
365 pub fn tokenizer(mut self, kind: TokenizerKind) -> Self {
366 self.tokenizer = Some(kind);
367 self
368 }
369
370 #[must_use]
372 pub fn prefer_line_boundaries(mut self, enabled: bool) -> Self {
373 self.prefer_line_boundaries = Some(enabled);
374 self
375 }
376
377 #[must_use]
379 pub fn dry_run(mut self, enabled: bool) -> Self {
380 self.dry_run = enabled;
381 self
382 }
383
384 #[must_use]
386 pub fn include_binary_files(mut self, enabled: bool) -> Self {
387 self.include_binary_files = enabled;
388 self
389 }
390
391 #[must_use]
393 pub fn backup_existing(mut self, enabled: bool) -> Self {
394 self.backup_existing = Some(enabled);
395 self
396 }
397
398 #[must_use]
400 pub fn filter_config(mut self, config: FilterConfig) -> Self {
401 self.filter_config = Some(config);
402 self
403 }
404
405 #[must_use]
407 pub fn file_filter_config(mut self, config: FileFilterConfig) -> Self {
408 self.file_filter_config = Some(config);
409 self
410 }
411
412 #[must_use]
414 pub fn preset(mut self, preset: PresetKind) -> Self {
415 self.preset = Some(preset);
416 self
417 }
418
419 #[must_use]
424 pub fn template_path(mut self, path: impl Into<PathBuf>) -> Self {
425 self.template_path = Some(path.into());
426 self
427 }
428
429 #[must_use]
434 pub fn custom_format_name(mut self, name: impl Into<String>) -> Self {
435 self.custom_format_name = Some(name.into());
436 self
437 }
438
439 #[must_use]
459 pub fn custom_extension(mut self, ext: impl Into<String>) -> Self {
460 self.custom_extension = Some(ext.into());
461 self
462 }
463
464 #[must_use]
486 pub fn custom_data(mut self, data: HashMap<String, serde_json::Value>) -> Self {
487 self.custom_data = data;
488 self
489 }
490
491 pub fn build(self) -> Result<Config> {
497 let config = Config {
498 root_dir: self.root_dir.unwrap_or_else(|| PathBuf::from(".")),
499 output_dir: self.output_dir.unwrap_or_else(|| PathBuf::from("out")),
500 output_pattern: self
501 .output_pattern
502 .unwrap_or_else(|| DEFAULT_OUTPUT_PATTERN.to_string()),
503 format: self.format.unwrap_or(OutputFormat::Markdown),
504 max_tokens: self.max_tokens.unwrap_or(DEFAULT_MAX_TOKENS),
505 overlap_tokens: self.overlap_tokens.unwrap_or(DEFAULT_OVERLAP_TOKENS),
506 chunk_safety_margin: self
507 .chunk_safety_margin
508 .unwrap_or(DEFAULT_CHUNK_SAFETY_MARGIN),
509 tokenizer: self.tokenizer.unwrap_or(TokenizerKind::Simple),
510 prefer_line_boundaries: self.prefer_line_boundaries.unwrap_or(true),
511 filter_config: self.filter_config.unwrap_or_default(),
512 file_filter_config: self.file_filter_config.unwrap_or_default(),
513 preset: self.preset,
514 dry_run: self.dry_run,
515 include_binary_files: self.include_binary_files,
516 backup_existing: self.backup_existing.unwrap_or(true),
517 template_path: self.template_path,
518 custom_format_name: self.custom_format_name,
519 custom_extension: self.custom_extension,
520 custom_data: self.custom_data,
521 };
522
523 config.validate()?;
524 Ok(config)
525 }
526}
527
#[cfg(test)]
mod tests {
    use super::*;

    /// Creates a temporary directory that satisfies the root-directory checks.
    fn temp_root() -> assert_fs::TempDir {
        assert_fs::TempDir::new().unwrap()
    }

    #[test]
    fn test_default_config() {
        let temp = temp_root();
        let config = Config::builder().root_dir(temp.path()).build().unwrap();

        assert_eq!(config.max_tokens, DEFAULT_MAX_TOKENS);
        assert_eq!(config.overlap_tokens, DEFAULT_OVERLAP_TOKENS);
        assert_eq!(config.chunk_safety_margin, DEFAULT_CHUNK_SAFETY_MARGIN);
        assert_eq!(config.output_pattern, DEFAULT_OUTPUT_PATTERN);
        assert_eq!(config.format, OutputFormat::Markdown);
    }

    #[test]
    fn test_invalid_root_dir() {
        let result = Config::builder()
            .root_dir("/nonexistent/path/that/should/not/exist")
            .build();

        assert!(result.is_err());
    }

    #[test]
    fn test_invalid_token_limits() {
        let temp = temp_root();

        // The safety margin is pinned below `max_tokens`; otherwise the
        // default margin (2000) would fail validation on its own and hide a
        // regression in the overlap check.
        let baseline = Config::builder()
            .root_dir(temp.path())
            .max_tokens(1000)
            .overlap_tokens(100)
            .chunk_safety_margin(100)
            .build();
        assert!(baseline.is_ok());

        let result = Config::builder()
            .root_dir(temp.path())
            .max_tokens(1000)
            .overlap_tokens(1000)
            .chunk_safety_margin(100)
            .build();

        assert!(result.is_err());
    }

    #[test]
    fn test_invalid_safety_margin() {
        let temp = temp_root();

        let result = Config::builder()
            .root_dir(temp.path())
            .max_tokens(1000)
            .overlap_tokens(100)
            .chunk_safety_margin(1000)
            .build();

        assert!(result.is_err());
    }

    #[test]
    fn test_invalid_pattern() {
        let temp = temp_root();

        let result = Config::builder()
            .root_dir(temp.path())
            .output_pattern("invalid_pattern")
            .build();

        assert!(result.is_err());
    }

    #[test]
    fn test_pattern_requires_each_placeholder() {
        let temp = temp_root();

        let missing_ext = Config::builder()
            .root_dir(temp.path())
            .output_pattern("prompt_{index:03}.md")
            .build();
        assert!(missing_ext.is_err());

        let missing_index = Config::builder()
            .root_dir(temp.path())
            .output_pattern("prompt.{ext}")
            .build();
        assert!(missing_index.is_err());
    }
}