kreuzberg-cli 4.9.8

Command-line interface for Kreuzberg document intelligence
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
//! Kreuzberg CLI - Command-line interface for document intelligence.
//!
//! This binary provides a command-line interface to the Kreuzberg document intelligence
//! library, supporting document extraction, MIME type detection, caching, and batch operations.
//!
//! # Architecture
//!
//! The CLI is built using `clap` for argument parsing and provides the following main commands:
//! - `extract`: Extract text/data from a single document
//! - `batch`: Process multiple documents in parallel
//! - `detect`: Identify MIME type of a file
//! - `cache`: Manage cache (clear, stats)
//! - `serve`: Start API server (requires `api` feature)
//! - `version`: Show version information
//!
//! # Configuration
//!
//! The CLI supports configuration files in TOML, YAML, or JSON formats:
//! - Explicit: `--config path/to/config.toml`
//! - Auto-discovery: Searches for `kreuzberg.{toml,yaml,json}` in current and parent directories
//! - Inline JSON: `--config-json '{"ocr": {"backend": "tesseract"}}'`
//! - Command-line flags override config file settings
//!
//! Configuration precedence (highest to lowest):
//! 1. Individual CLI flags (--output-format, --ocr, etc.)
//! 2. Inline JSON config (--config-json or --config-json-base64)
//! 3. Config file (--config path.toml)
//! 4. Default values
//!
//! # Exit Codes
//!
//! - 0: Success
//! - Non-zero: Error (see stderr for details)
//!
//! # Examples
//!
//! ```bash
//! # Extract text from a PDF
//! kreuzberg extract document.pdf
//!
//! # Extract with OCR enabled
//! kreuzberg extract scanned.pdf --ocr true
//!
//! # Extract with inline JSON config
//! kreuzberg extract doc.pdf --config-json '{"ocr":{"backend":"tesseract"}}'
//!
//! # Batch processing
//! kreuzberg batch *.pdf --output-format json
//!
//! # Detect MIME type
//! kreuzberg detect unknown-file.bin
//! ```

#![deny(unsafe_code)]

mod commands;
mod style;

use anyhow::{Context, Result};
use base64::{Engine as _, engine::general_purpose::STANDARD};
use clap::{CommandFactory, Parser, Subcommand};
#[cfg(feature = "embeddings")]
use commands::embed_command;
#[cfg(feature = "mcp")]
use commands::mcp_command;
use commands::overrides::ExtractionOverrides;
#[cfg(feature = "api")]
use commands::serve_command;
use commands::{
    batch_command, chunk_command, clear_command, extract_command,
    extract_structured::{ExtractStructuredArgs, extract_structured_command},
    load_config, manifest_command, stats_command, warm_command,
};
use kreuzberg::{OutputFormat as ContentOutputFormat, detect_mime_type};
use serde_json::json;
use std::path::{Path, PathBuf};
use tracing_subscriber::EnvFilter;

// Top-level argument parser for the binary. This type derives `clap::Parser`, and clap
// uses the `///` doc comments on the type and its fields as help text, so rewording
// them changes what `--help` prints.
/// Kreuzberg document intelligence CLI
#[derive(Parser)]
#[command(name = "kreuzberg")]
#[command(version, about, long_about = None)]
struct Cli {
    // `global = true` makes the flag usable together with any subcommand. `main` passes
    // the value to `EnvFilter::try_new` and falls back to an "info" filter when that
    // fails; when the flag is absent the filter comes from the default env instead.
    /// Set log level (trace, debug, info, warn, error). Overrides RUST_LOG env var.
    #[arg(long, global = true)]
    log_level: Option<String>,

    // The selected subcommand; `main` matches on it to dispatch.
    #[command(subcommand)]
    command: Commands,
}

// Subcommands of the `kreuzberg` binary. Variants carrying a `#[cfg(feature = ...)]`
// attribute only exist when that Cargo feature is enabled.
//
// The `///` doc comments below are consumed by clap as help text, so they are
// user-visible output rather than plain documentation.
//
// NOTE(review): several `format` help strings say "text or json", but
// `WireFormat::from_str` in this file also accepts "toon". Whether every command can
// actually render that format is not visible from here — confirm before updating the
// help text.
#[derive(Subcommand)]
enum Commands {
    /// Extract text from a document
    Extract {
        /// Path to the document
        path: PathBuf,

        /// Path to config file (TOML, YAML, or JSON). If not specified, searches for kreuzberg.toml/yaml/json in current and parent directories.
        #[arg(short, long)]
        config: Option<PathBuf>,

        /// Inline JSON configuration. Applied after config file but before individual flags.
        ///
        /// Example: --config-json '{"ocr":{"backend":"tesseract"},"chunking":{"max_chars":1000}}'
        #[arg(long)]
        config_json: Option<String>,

        /// Base64-encoded JSON configuration. Useful for shell environments where quotes are problematic.
        ///
        /// Example: --config-json-base64 eyJvY3IiOnsiYmFja2VuZCI6InRlc3NlcmFjdCJ9fQ==
        #[arg(long)]
        config_json_base64: Option<String>,

        /// MIME type hint (auto-detected if not provided)
        #[arg(short, long)]
        mime_type: Option<String>,

        /// Output format for CLI results (text or json).
        ///
        /// Controls how the CLI displays results, not the extraction content format.
        #[arg(short, long, default_value = "text")]
        format: WireFormat,

        /// Extraction configuration overrides
        #[command(flatten)]
        overrides: ExtractionOverrides,
    },

    /// Extract structured data from a document using an LLM
    ExtractStructured {
        /// Path to the document file
        path: PathBuf,

        /// Path to JSON schema file defining the output structure
        #[arg(long)]
        schema: PathBuf,

        /// LLM model (e.g., "openai/gpt-4o")
        #[arg(long)]
        model: String,

        /// API key for the LLM provider
        #[arg(long)]
        api_key: Option<String>,

        /// Custom Jinja2 prompt template
        #[arg(long)]
        prompt: Option<String>,

        // NOTE(review): a `default_value` is supplied, so this `Option` is presumably
        // always `Some` after parsing — confirm before relying on `None`.
        /// Schema name
        #[arg(long, default_value = "extraction")]
        schema_name: Option<String>,

        /// Enable strict mode
        #[arg(long)]
        strict: bool,

        /// Config file path
        #[arg(short, long)]
        config: Option<PathBuf>,

        /// Output format (text or json)
        #[arg(short, long, default_value = "json")]
        format: WireFormat,
    },

    /// Batch extract from multiple documents
    Batch {
        // `main` rejects an empty list via `validate_batch_paths` before doing any work.
        /// Paths to documents
        paths: Vec<PathBuf>,

        /// Path to config file (TOML, YAML, or JSON). If not specified, searches for kreuzberg.toml/yaml/json in current and parent directories.
        #[arg(short, long)]
        config: Option<PathBuf>,

        /// Inline JSON configuration. Applied after config file but before individual flags.
        ///
        /// Example: --config-json '{"ocr":{"backend":"tesseract"},"chunking":{"max_chars":1000}}'
        #[arg(long)]
        config_json: Option<String>,

        /// Base64-encoded JSON configuration. Useful for shell environments where quotes are problematic.
        ///
        /// Example: --config-json-base64 eyJvY3IiOnsiYmFja2VuZCI6InRlc3NlcmFjdCJ9fQ==
        #[arg(long)]
        config_json_base64: Option<String>,

        /// Output format for CLI results (text or json).
        ///
        /// Controls how the CLI displays results, not the extraction content format.
        #[arg(short, long, default_value = "json")]
        format: WireFormat,

        /// Extraction configuration overrides
        #[command(flatten)]
        overrides: ExtractionOverrides,

        /// Path to a JSON file mapping file paths to per-file extraction config overrides.
        /// The JSON should be an object where keys are file paths and values are FileExtractionConfig objects.
        /// Example: {"doc1.pdf": {"force_ocr": true}, "doc2.pdf": {"output_format": "markdown"}}
        #[arg(long)]
        file_configs: Option<PathBuf>,
    },

    /// Detect MIME type of a file
    Detect {
        /// Path to the file
        path: PathBuf,

        /// Output format (text or json)
        #[arg(short, long, default_value = "text")]
        format: WireFormat,
    },

    /// List all supported document formats
    Formats {
        /// Output format (text or json)
        #[arg(short, long, default_value = "text")]
        format: WireFormat,
    },

    /// Show version information
    Version {
        /// Output format (text or json)
        #[arg(short, long, default_value = "text")]
        format: WireFormat,
    },

    /// Cache management operations
    Cache {
        #[command(subcommand)]
        command: CacheCommands,
    },

    /// Start the API server
    ///
    /// Configuration is loaded with the following precedence (highest to lowest):
    /// 1. CLI arguments (--host, --port)
    /// 2. Environment variables (KREUZBERG_HOST, KREUZBERG_PORT)
    /// 3. Config file (TOML, YAML, or JSON)
    /// 4. Built-in defaults (127.0.0.1:8000)
    ///
    /// The config file can contain both extraction and server settings under [server] section.
    #[cfg(feature = "api")]
    Serve {
        // The short flag is an explicit uppercase `-H`; `host` and `port` stay `None`
        // when the flags are not given.
        /// Host to bind to (e.g., "127.0.0.1" or "0.0.0.0"). CLI arg overrides config file and env vars.
        #[arg(short = 'H', long)]
        host: Option<String>,

        /// Port to bind to. CLI arg overrides config file and env vars.
        #[arg(short, long)]
        port: Option<u16>,

        /// Path to config file (TOML, YAML, or JSON). If not specified, searches for kreuzberg.toml/yaml/json in current and parent directories.
        #[arg(short, long)]
        config: Option<PathBuf>,
    },

    /// Start the MCP (Model Context Protocol) server
    #[cfg(feature = "mcp")]
    Mcp {
        /// Path to config file (TOML, YAML, or JSON). If not specified, searches for kreuzberg.toml/yaml/json in current and parent directories.
        #[arg(short, long)]
        config: Option<PathBuf>,

        // Free-form string at the parser level; any validation of the value is not
        // visible in this enum.
        /// Transport mode: stdio (default) or http
        #[arg(long, default_value = "stdio")]
        transport: String,

        /// HTTP host (only for --transport http)
        #[arg(long, default_value = "127.0.0.1")]
        host: String,

        /// HTTP port (only for --transport http)
        #[arg(long, default_value = "8001")]
        port: u16,
    },

    /// API utilities
    #[cfg(feature = "api")]
    Api {
        #[command(subcommand)]
        command: ApiCommands,
    },

    /// Generate embeddings for text
    ///
    /// Generates vector embeddings for one or more text inputs using a specified preset model
    /// or an LLM provider. Reads from --text flag or stdin if no text is provided.
    #[cfg(feature = "embeddings")]
    Embed {
        /// Text to embed. Can be specified multiple times for batch embedding.
        #[arg(long)]
        text: Vec<String>,

        /// Embedding preset (fast, balanced, quality, multilingual). Used with --provider local.
        #[arg(long, default_value = "balanced")]
        preset: String,

        /// Embedding provider: "local" (default, ONNX) or "llm" (liter-llm)
        #[arg(long, default_value = "local")]
        provider: String,

        /// LLM model for provider-hosted embeddings (e.g., "openai/text-embedding-3-small").
        /// Required when --provider is "llm".
        #[arg(long)]
        model: Option<String>,

        /// API key for the LLM provider
        #[arg(long)]
        api_key: Option<String>,

        /// Output format (text or json)
        #[arg(short, long, default_value = "json")]
        format: WireFormat,
    },

    /// Chunk text for processing
    ///
    /// Splits text into chunks using configurable size and overlap.
    /// Reads from --text flag or stdin if no text is provided.
    Chunk {
        /// Text to chunk. If not provided, reads from stdin.
        #[arg(long)]
        text: Option<String>,

        /// Path to config file (TOML, YAML, or JSON)
        #[arg(short, long)]
        config: Option<PathBuf>,

        /// Chunk size in characters
        #[arg(long)]
        chunk_size: Option<usize>,

        /// Chunk overlap in characters
        #[arg(long)]
        chunk_overlap: Option<usize>,

        /// Chunker type: text, markdown, yaml, or semantic
        #[arg(long, default_value = "text")]
        chunker_type: String,

        /// Tokenizer model for token-based chunk sizing (e.g., "Xenova/gpt-4o").
        /// Requires the chunking-tokenizers feature.
        #[arg(long)]
        chunking_tokenizer: Option<String>,

        /// Topic threshold for semantic chunking (0.0-1.0, default: 0.75)
        #[arg(long)]
        topic_threshold: Option<f32>,

        /// Output format (text or json)
        #[arg(short, long, default_value = "json")]
        format: WireFormat,
    },

    /// Generate shell completions
    ///
    /// Outputs shell completion scripts for the specified shell.
    /// Install with: eval "$(kreuzberg completions bash)"
    Completions {
        /// Shell to generate completions for
        #[arg(value_enum)]
        shell: clap_complete::Shell,
    },
}

// Nested subcommands of `Commands::Api`. Like that variant, this enum is only compiled
// when the `api` feature is enabled. The doc comments are used by clap as help text.
#[cfg(feature = "api")]
#[derive(Subcommand)]
enum ApiCommands {
    /// Output the OpenAPI schema (JSON)
    ///
    /// Prints the full OpenAPI 3.1 specification for the kreuzberg REST API.
    /// Useful for code generation, documentation, and API client tooling.
    Schema,
}

// Nested subcommands of `Commands::Cache`. The `///` doc comments are used by clap as
// help text, so they are user-visible output.
//
// NOTE(review): the `format` help strings say "text or json", while
// `WireFormat::from_str` in this file also accepts "toon" — confirm whether the help
// text should mention it.
#[derive(Subcommand)]
enum CacheCommands {
    /// Show cache statistics
    Stats {
        /// Cache directory (default: .kreuzberg in current directory)
        #[arg(short, long)]
        cache_dir: Option<PathBuf>,

        /// Output format (text or json)
        #[arg(short, long, default_value = "text")]
        format: WireFormat,
    },

    /// Clear the cache
    Clear {
        /// Cache directory (default: .kreuzberg in current directory)
        #[arg(short, long)]
        cache_dir: Option<PathBuf>,

        /// Output format (text or json)
        #[arg(short, long, default_value = "text")]
        format: WireFormat,
    },

    /// Output model manifest (expected model files, checksums, sizes)
    ///
    /// Outputs a JSON manifest of all model files required by kreuzberg,
    /// including their relative paths, SHA256 checksums, and sizes.
    /// Used for pre-populating model caches in containerized deployments.
    Manifest {
        /// Output format (text or json)
        #[arg(short, long, default_value = "json")]
        format: WireFormat,
    },

    /// Download all models eagerly
    ///
    /// Downloads all PaddleOCR and layout detection models for all supported
    /// languages. Unlike normal operation which downloads lazily on first use,
    /// this ensures all models are present in the cache directory.
    ///
    /// Use --all-embeddings to also download all 4 embedding model presets,
    /// or --embedding-model <preset> to download a specific one.
    ///
    /// By default, only the core layout models (rtdetr + tatr) are downloaded.
    /// Use --all-table-models to also download SLANeXT variants (~730MB).
    Warm {
        /// Cache directory (default: .kreuzberg in current directory, or KREUZBERG_CACHE_DIR)
        #[arg(short, long)]
        cache_dir: Option<PathBuf>,

        /// Output format (text or json)
        #[arg(short, long, default_value = "text")]
        format: WireFormat,

        /// Download all embedding model presets (fast, balanced, quality, multilingual)
        #[arg(long)]
        all_embeddings: bool,

        /// Download a specific embedding model preset
        #[arg(long, value_name = "PRESET")]
        embedding_model: Option<String>,

        // The explicit `help` string below repeats the doc comment text verbatim.
        /// Download all table structure models including SLANeXT variants (~730MB)
        #[arg(
            long,
            help = "Download all table structure models including SLANeXT variants (~730MB)"
        )]
        all_table_models: bool,

        /// Download all tree-sitter grammar parsers
        #[arg(long)]
        all_grammars: bool,

        // `value_delimiter = ','` splits one argument such as `web,systems` into
        // separate entries of the vector.
        /// Download specific tree-sitter grammar groups (comma-separated: web,systems,scripting,data,jvm,functional)
        #[arg(long, value_name = "GROUPS", value_delimiter = ',')]
        grammar_groups: Option<Vec<String>>,

        /// Download specific tree-sitter grammars by language name (comma-separated)
        #[arg(long, value_name = "LANGUAGES", value_delimiter = ',')]
        grammars: Option<Vec<String>>,
    },
}

/// Format in which the CLI prints its results.
///
/// This is the type of the `format` argument on the subcommands; values are
/// parsed from strings through the [`std::str::FromStr`] implementation below.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum WireFormat {
    /// Parsed from `"text"`.
    Text,
    /// Parsed from `"json"`.
    Json,
    /// Parsed from `"toon"`.
    Toon,
}

impl std::str::FromStr for WireFormat {
    type Err = String;

    /// Parses a format name, ignoring letter case.
    ///
    /// # Errors
    ///
    /// Returns a message that echoes the rejected input when it is not one of
    /// `text`, `json`, or `toon`.
    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
        // Lookup table of the accepted (lowercase) names and the variant each selects.
        let known = [
            ("text", WireFormat::Text),
            ("json", WireFormat::Json),
            ("toon", WireFormat::Toon),
        ];

        known
            .iter()
            .find(|entry| s.eq_ignore_ascii_case(entry.0))
            .map(|entry| entry.1)
            .ok_or_else(|| format!("Invalid format: {}. Use 'text', 'json', or 'toon'", s))
    }
}

// CLI-side mirror of the library's output format enum (imported in this file as
// `ContentOutputFormat`); the `From` impl in this file converts each variant to the
// library variant of the same name. The type derives `clap::ValueEnum`, so the `///`
// comments on the variants double as help text for the possible values.
/// Content output format for extraction results.
///
/// Controls the format of the extracted content (not the CLI output format).
#[derive(Clone, Copy, Debug, PartialEq, Eq, clap::ValueEnum)]
enum ContentOutputFormatArg {
    /// Plain text (default)
    Plain,
    /// Markdown format
    Markdown,
    /// Djot markup format
    Djot,
    /// HTML format
    Html,
    /// JSON tree format with heading-driven sections
    Json,
}

impl From<ContentOutputFormatArg> for ContentOutputFormat {
    fn from(arg: ContentOutputFormatArg) -> Self {
        match arg {
            ContentOutputFormatArg::Plain => ContentOutputFormat::Plain,
            ContentOutputFormatArg::Markdown => ContentOutputFormat::Markdown,
            ContentOutputFormatArg::Djot => ContentOutputFormat::Djot,
            ContentOutputFormatArg::Html => ContentOutputFormat::Html,
            ContentOutputFormatArg::Json => ContentOutputFormat::Json,
        }
    }
}

/// Checks that `path` refers to an existing regular file.
///
/// The path's metadata is queried once. A failed query is reported as a missing
/// file, and a successful query for anything other than a regular file (for
/// example a directory) is reported as a non-file path.
///
/// # Errors
///
/// Returns an error if:
/// - The metadata for the path cannot be read (the path does not exist or is not accessible)
/// - The path exists but is not a regular file
fn validate_file_exists(path: &Path) -> Result<()> {
    match std::fs::metadata(path) {
        Err(_) => anyhow::bail!(
            "File not found: '{}'. Please check that the file exists and is accessible.",
            path.display()
        ),
        Ok(meta) if !meta.is_file() => anyhow::bail!(
            "Path is not a file: '{}'. Please provide a path to a regular file.",
            path.display()
        ),
        Ok(_) => Ok(()),
    }
}

/// Sanity-checks the chunk size and overlap supplied by the caller.
///
/// Either value may be absent. The size is checked on its own first; the overlap
/// is only checked when both values are present.
///
/// # Errors
///
/// Returns an error if:
/// - `chunk_size` is 0
/// - `chunk_size` is greater than 1,000,000
/// - both values are given and `chunk_overlap` is greater than or equal to `chunk_size`
fn validate_chunk_params(chunk_size: Option<usize>, chunk_overlap: Option<usize>) -> Result<()> {
    // Largest chunk size accepted; anything above this is rejected.
    const MAX_CHUNK_SIZE: usize = 1_000_000;

    match chunk_size {
        Some(size) if size == 0 => {
            anyhow::bail!("Invalid chunk size: {}. Chunk size must be greater than 0.", size)
        }
        Some(size) if size > MAX_CHUNK_SIZE => anyhow::bail!(
            "Invalid chunk size: {}. Chunk size must be less than 1,000,000 characters to avoid excessive memory usage.",
            size
        ),
        _ => {}
    }

    match (chunk_overlap, chunk_size) {
        (Some(overlap), Some(size)) if overlap >= size => anyhow::bail!(
            "Invalid chunk overlap: {}. Overlap ({}) must be less than chunk size ({}).",
            overlap,
            overlap,
            size
        ),
        _ => Ok(()),
    }
}

/// Checks the list of input paths for a batch run.
///
/// The list must be non-empty, and each entry is passed through
/// `validate_file_exists` in order. The first failing entry stops the check and
/// its error is annotated with the entry's 1-based position.
///
/// # Errors
///
/// Returns an error if:
/// - `paths` is empty
/// - Any entry fails `validate_file_exists`
fn validate_batch_paths(paths: &[PathBuf]) -> Result<()> {
    if paths.is_empty() {
        anyhow::bail!("No files provided for batch extraction. Please provide at least one file path.");
    }

    paths.iter().enumerate().try_for_each(|(index, candidate)| {
        // Positions in the message are 1-based, hence `index + 1`.
        validate_file_exists(candidate).with_context(|| format!("Invalid file at position {}", index + 1))
    })
}

/// Merges an inline JSON override into `config`, replacing it with the merged result.
///
/// At most one source is used: when `config_json` is present it is applied and
/// `config_json_base64` is ignored; otherwise `config_json_base64` is decoded
/// (standard base64, then UTF-8) and applied. With neither present, `config` is
/// left untouched.
///
/// # Errors
///
/// Returns an error if the base64 or UTF-8 decoding fails, if the text is not
/// valid JSON, or if merging into the existing config fails.
fn apply_json_overrides(
    config: &mut kreuzberg::ExtractionConfig,
    config_json: Option<String>,
    config_json_base64: Option<String>,
) -> Result<()> {
    match (config_json, config_json_base64) {
        // Plain inline JSON wins over the base64 variant when both are given.
        (Some(inline), _) => {
            let parsed: serde_json::Value =
                serde_json::from_str(&inline).context("Failed to parse --config-json as JSON")?;
            let merged =
                merge_json_into_config(config, parsed).context("Failed to merge --config-json with file config")?;
            *config = merged;
        }
        (None, Some(encoded)) => {
            let raw_bytes = STANDARD
                .decode(&encoded)
                .context("Failed to decode base64 in --config-json-base64")?;
            let decoded = String::from_utf8(raw_bytes).context("Base64-decoded content is not valid UTF-8")?;
            let parsed: serde_json::Value =
                serde_json::from_str(&decoded).context("Failed to parse decoded --config-json-base64 as JSON")?;
            let merged = merge_json_into_config(config, parsed)
                .context("Failed to merge --config-json-base64 with file config")?;
            *config = merged;
        }
        (None, None) => {}
    }

    Ok(())
}

/// Produces a new extraction config by merging `json_value` on top of `base_config`.
///
/// Delegates to `kreuzberg::core::config::merge::merge_config_json`; `base_config`
/// itself is only borrowed.
///
/// # Errors
///
/// Returns an `anyhow` error carrying the display text of the library's merge error.
fn merge_json_into_config(
    base_config: &kreuzberg::ExtractionConfig,
    json_value: serde_json::Value,
) -> Result<kreuzberg::ExtractionConfig> {
    match kreuzberg::core::config::merge::merge_config_json(base_config, json_value) {
        Ok(merged) => Ok(merged),
        // Re-wrap the library error as an anyhow error built from its message.
        Err(merge_err) => Err(anyhow::anyhow!("{}", merge_err)),
    }
}

/// CLI entry point: parses arguments, sets up logging, and dispatches the chosen subcommand.
///
/// Logging goes to stderr. The filter is built from `cli.log_level` when it is
/// set, otherwise from the environment via `EnvFilter::try_from_default_env`;
/// if either source fails to parse, the filter silently falls back to `"info"`.
///
/// # Errors
///
/// Propagates the first error raised by input validation, config loading,
/// override application, serialization, or the subcommand handler itself.
fn main() -> Result<()> {
    let cli = Cli::parse();

    // NOTE(review): an unparsable log level is not reported to the user; it just becomes "info".
    let env_filter = if let Some(ref level) = cli.log_level {
        EnvFilter::try_new(level).unwrap_or_else(|_| EnvFilter::new("info"))
    } else {
        EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info"))
    };

    // The result of `try_init` is deliberately discarded so that a failure to
    // install the subscriber never aborts the CLI.
    let _ = tracing_subscriber::fmt()
        .with_env_filter(env_filter)
        .with_writer(std::io::stderr)
        .try_init();

    match cli.command {
        // Single-file extraction.
        Commands::Extract {
            path,
            config: config_path,
            config_json,
            config_json_base64,
            mime_type,
            format,
            overrides,
        } => {
            // Check the input file and the override flags before loading any config.
            validate_file_exists(&path)?;
            overrides.validate()?;

            // Config sources are layered in this order: config file, then
            // inline/base64 JSON, then the CLI override flags.
            let mut config = load_config(config_path)?;
            apply_json_overrides(&mut config, config_json, config_json_base64)?;
            overrides.apply(&mut config);

            extract_command(path, config, mime_type, format)?;
        }

        // Structured extraction: both the document and the schema file must exist.
        Commands::ExtractStructured {
            path,
            schema,
            model,
            api_key,
            prompt,
            schema_name,
            strict,
            config,
            format,
        } => {
            validate_file_exists(&path)?;
            validate_file_exists(&schema)?;
            // The config path is handed over unloaded; loading is left to the handler.
            extract_structured_command(ExtractStructuredArgs {
                path,
                schema_path: schema,
                model,
                api_key,
                prompt,
                schema_name,
                strict,
                config_path: config,
                format,
            })?;
        }

        // Multi-file extraction with the same config layering as `Extract`.
        Commands::Batch {
            paths,
            config: config_path,
            config_json,
            config_json_base64,
            format,
            overrides,
            file_configs,
        } => {
            validate_batch_paths(&paths)?;
            overrides.validate()?;

            let mut config = load_config(config_path)?;
            apply_json_overrides(&mut config, config_json, config_json_base64)?;
            overrides.apply(&mut config);

            // Optional `file_configs` file: parsed as a JSON object mapping string
            // keys to raw JSON values. How the keys are interpreted is up to
            // `batch_command`.
            let file_configs_map = if let Some(file_configs_path) = file_configs {
                let file_configs_json = std::fs::read_to_string(&file_configs_path)
                    .with_context(|| format!("Failed to read file configs from '{}'", file_configs_path.display()))?;
                let map: std::collections::HashMap<String, serde_json::Value> =
                    serde_json::from_str(&file_configs_json).with_context(|| {
                        format!(
                            "Failed to parse file configs JSON from '{}'",
                            file_configs_path.display()
                        )
                    })?;
                Some(map)
            } else {
                None
            };
            batch_command(paths, file_configs_map, config, format)?;
        }

        // MIME type detection for a single file.
        Commands::Detect { path, format } => {
            validate_file_exists(&path)?;

            // Lossy conversion: invalid UTF-8 in the path is replaced. The same
            // string is used both for detection and in the JSON/TOON output.
            let path_str = path.to_string_lossy().to_string();
            let mime_type = detect_mime_type(&path_str, true).with_context(|| {
                format!(
                    "Failed to detect MIME type for file '{}'. Ensure the file is readable.",
                    path.display()
                )
            })?;

            match format {
                // Text output is just the styled MIME type.
                WireFormat::Text => {
                    println!("{}", style::success(&mime_type));
                }
                WireFormat::Json => {
                    let output = json!({
                        "path": path_str,
                        "mime_type": mime_type,
                    });
                    println!(
                        "{}",
                        serde_json::to_string_pretty(&output)
                            .context("Failed to serialize MIME type detection result to JSON")?
                    );
                }
                // Same payload as the JSON arm, serialized as TOON.
                WireFormat::Toon => {
                    let output = json!({
                        "path": path_str,
                        "mime_type": mime_type,
                    });
                    println!(
                        "{}",
                        serde_toon::to_string(&output)
                            .context("Failed to serialize MIME type detection result to TOON")?
                    );
                }
            }
        }

        // List the formats reported by the library.
        Commands::Formats { format } => {
            let formats = kreuzberg::list_supported_formats();
            match format {
                // Two-column table: extension (left-aligned, width 15) and MIME type.
                WireFormat::Text => {
                    println!("{:<15} {}", style::label("EXTENSION"), style::label("MIME TYPE"));
                    println!("{}", style::dim(&format!("{:<15} ---------", "---------")));
                    for f in &formats {
                        println!("{:<15} {}", style::success(&format!(".{}", f.extension)), f.mime_type);
                    }
                }
                WireFormat::Json => {
                    println!(
                        "{}",
                        serde_json::to_string_pretty(&formats).context("Failed to serialize formats to JSON")?
                    );
                }
                WireFormat::Toon => {
                    println!(
                        "{}",
                        serde_toon::to_string(&formats).context("Failed to serialize formats to TOON")?
                    );
                }
            }
        }

        // Print the package name and version.
        Commands::Version { format } => {
            // Both values are resolved at compile time from Cargo's environment.
            let version = env!("CARGO_PKG_VERSION");
            let name = env!("CARGO_PKG_NAME");

            match format {
                WireFormat::Text => {
                    println!("{} {}", style::label(name), style::success(version));
                }
                WireFormat::Json => {
                    let output = json!({
                        "name": name,
                        "version": version,
                    });
                    println!(
                        "{}",
                        serde_json::to_string_pretty(&output)
                            .context("Failed to serialize version information to JSON")?
                    );
                }
                // Same payload as the JSON arm, serialized as TOON.
                WireFormat::Toon => {
                    let output = json!({
                        "name": name,
                        "version": version,
                    });
                    println!(
                        "{}",
                        serde_toon::to_string(&output).context("Failed to serialize version information to TOON")?
                    );
                }
            }
        }

        // Only compiled when the `api` feature is enabled.
        #[cfg(feature = "api")]
        Commands::Serve {
            host: cli_host,
            port: cli_port,
            config: config_path,
        } => {
            // `config_path` is cloned because it is also passed on to `serve_command`.
            let mut extraction_config = load_config(config_path.clone())?;
            // Environment overrides are applied on top of the loaded config.
            extraction_config.apply_env_overrides()?;
            serve_command(cli_host, cli_port, extraction_config, config_path)?;
        }

        // Only compiled when the `mcp` feature is enabled.
        // NOTE(review): `host` and `port` are bound under both `mcp-http` and
        // `not(mcp-http)`, so they are in scope in every build; the cfg pairs look
        // equivalent to plain bindings. Confirm before simplifying.
        #[cfg(feature = "mcp")]
        Commands::Mcp {
            config: config_path,
            transport,
            #[cfg(feature = "mcp-http")]
            host,
            #[cfg(feature = "mcp-http")]
            port,
            #[cfg(not(feature = "mcp-http"))]
            host,
            #[cfg(not(feature = "mcp-http"))]
            port,
        } => {
            let mut config = load_config(config_path)?;
            config.apply_env_overrides()?;
            mcp_command(config, transport, host, port)?;
        }

        // Cache maintenance: each subcommand forwards its arguments unchanged.
        Commands::Cache { command } => match command {
            CacheCommands::Stats { cache_dir, format } => {
                stats_command(cache_dir, format)?;
            }
            CacheCommands::Clear { cache_dir, format } => {
                clear_command(cache_dir, format)?;
            }
            CacheCommands::Manifest { format } => {
                manifest_command(format)?;
            }
            CacheCommands::Warm {
                cache_dir,
                format,
                all_embeddings,
                embedding_model,
                all_table_models,
                all_grammars,
                grammar_groups,
                grammars,
            } => {
                warm_command(
                    cache_dir,
                    format,
                    all_embeddings,
                    embedding_model,
                    all_table_models,
                    all_grammars,
                    grammar_groups,
                    grammars,
                )?;
            }
        },

        // Only compiled when the `api` feature is enabled.
        #[cfg(feature = "api")]
        Commands::Api { command } => match command {
            // Prints whatever `openapi_json()` returns to stdout.
            ApiCommands::Schema => {
                println!("{}", kreuzberg::api::openapi::openapi_json());
            }
        },

        // Only compiled when the `embeddings` feature is enabled.
        #[cfg(feature = "embeddings")]
        Commands::Embed {
            text,
            preset,
            provider,
            model,
            api_key,
            format,
        } => {
            // With no `text` values, fall back to a single input read from stdin.
            let texts = if text.is_empty() {
                vec![commands::read_stdin()?]
            } else {
                text
            };
            embed_command(texts, &preset, &provider, model, api_key, format)?;
        }

        // Chunk text supplied via `text` or stdin.
        Commands::Chunk {
            text,
            config: config_path,
            chunk_size,
            chunk_overlap,
            chunker_type,
            chunking_tokenizer,
            topic_threshold,
            format,
        } => {
            // NOTE(review): the input (possibly stdin) is read before the numeric
            // parameters are validated, so bad parameters are only reported after
            // the read completes.
            let input = match text {
                Some(t) => t,
                None => commands::read_stdin().context("No --text provided and failed to read from stdin")?,
            };

            validate_chunk_params(chunk_size, chunk_overlap)?;

            // Start from the loaded config's chunking section, or its default when absent.
            let base_config = load_config(config_path)?;
            let mut chunking_config = base_config.chunking.unwrap_or_default();

            if let Some(size) = chunk_size {
                chunking_config.max_characters = size;
                // If user set chunk_size but not overlap, clamp overlap to fit
                if chunk_overlap.is_none() && chunking_config.overlap >= size {
                    chunking_config.overlap = size / 4;
                }
            }
            // An explicit overlap always replaces the configured or clamped value.
            if let Some(overlap) = chunk_overlap {
                chunking_config.overlap = overlap;
            }
            // Any unrecognized chunker name falls back to the plain-text chunker.
            match chunker_type.as_str() {
                "markdown" => chunking_config.chunker_type = kreuzberg::ChunkerType::Markdown,
                "yaml" => chunking_config.chunker_type = kreuzberg::ChunkerType::Yaml,
                "semantic" => chunking_config.chunker_type = kreuzberg::ChunkerType::Semantic,
                _ => chunking_config.chunker_type = kreuzberg::ChunkerType::Text,
            }
            // Supplying a tokenizer switches sizing to the tokenizer variant, with no cache dir set.
            if let Some(ref tokenizer) = chunking_tokenizer {
                chunking_config.sizing = kreuzberg::ChunkSizing::Tokenizer {
                    model: tokenizer.clone(),
                    cache_dir: None,
                };
            }
            // Only overwrite when given, so a value from the loaded config is kept otherwise.
            if topic_threshold.is_some() {
                chunking_config.topic_threshold = topic_threshold;
            }

            chunk_command(input, chunking_config, format)?;
        }

        // Write a completion script for the requested shell to stdout, using the
        // binary name "kreuzberg".
        Commands::Completions { shell } => {
            let mut cmd = Cli::command();
            clap_complete::generate(shell, &mut cmd, "kreuzberg", &mut std::io::stdout());
        }
    }

    Ok(())
}