// kreuzberg-cli 4.9.8
//
// Command-line interface for Kreuzberg document intelligence
//! CLI contract tests - verify CLI config parsing matches Rust core
//!
//! This test suite validates that the CLI's configuration parsing produces
//! identical results to the Rust core library. It ensures that users get
//! consistent behavior whether using the CLI, SDK, or MCP interfaces.

use kreuzberg::core::config::ExtractionConfig;
use kreuzberg::core::config::OutputFormat;
use serde_json::json;

/// Deserializing a `--config-json` payload through an intermediate
/// `serde_json::Value` must yield the same `use_cache` and `output_format`
/// as deserializing the raw string directly.
#[test]
fn test_cli_config_json_flag_basic_parsing() {
    let raw = r#"{"use_cache": true, "output_format": "plain"}"#;

    // Path 1: string -> ExtractionConfig in one step.
    let direct = serde_json::from_str::<ExtractionConfig>(raw).expect("Failed to deserialize config string");

    // Path 2: string -> generic JSON value -> ExtractionConfig.
    let intermediate = serde_json::from_str::<serde_json::Value>(raw).expect("Failed to parse JSON string");
    let via_value =
        serde_json::from_value::<ExtractionConfig>(intermediate).expect("Failed to deserialize from JSON value");

    // Both paths have to agree field by field.
    assert_eq!(direct.use_cache, via_value.use_cache, "use_cache should be identical");
    assert_eq!(
        direct.output_format, via_value.output_format,
        "output_format should be identical"
    );
}

/// Verifies that nested `chunking` and `ocr` sections deserialize into their
/// optional sub-configs carrying the values given in the JSON.
#[test]
fn test_cli_nested_config_deserialization() {
    let config_str = r#"{
        "chunking": {
            "max_characters": 1000,
            "overlap": 200
        },
        "ocr": {
            "backend": "tesseract"
        }
    }"#;

    let config: ExtractionConfig = serde_json::from_str(config_str).expect("Failed to deserialize nested config");

    // `expect` checks presence and yields the inner value in one step, so there
    // is no separate `is_some()` assertion followed by a bare `unwrap()`.
    let chunking = config.chunking.expect("Chunking config should be present");
    let ocr = config.ocr.expect("OCR config should be present");

    // Failure messages name the actual struct fields so they point at the right key.
    assert_eq!(chunking.max_characters, 1000, "max_characters should be 1000");
    assert_eq!(chunking.overlap, 200, "overlap should be 200");
    assert_eq!(ocr.backend, "tesseract", "backend should be tesseract");
}

/// A payload containing only `force_ocr` sets that flag while `use_cache`
/// still reads as `true`.
#[test]
fn test_cli_force_ocr_flag_parsing() {
    let parsed = serde_json::from_str::<ExtractionConfig>(r#"{"force_ocr": true}"#)
        .expect("Failed to deserialize force_ocr config");

    assert!(parsed.force_ocr, "force_ocr should be true");
    // Keys that are absent from the JSON must keep their default values.
    assert!(parsed.use_cache, "use_cache should still be true by default");
}

/// A numeric `max_concurrent_extractions` is deserialized into `Some(n)`.
#[test]
fn test_cli_max_concurrent_extractions_parsing() {
    let parsed = serde_json::from_str::<ExtractionConfig>(r#"{"max_concurrent_extractions": 8}"#)
        .expect("Failed to deserialize concurrent extractions");

    let limit = parsed.max_concurrent_extractions;
    assert_eq!(limit, Some(8), "max_concurrent_extractions should be 8");
}

/// Deserializes a payload that mixes top-level flags with nested `ocr` and
/// `chunking` sections, then checks the flags and a subset of the nested values.
#[test]
fn test_cli_complex_config_deserialization() {
    let payload = r#"{
        "use_cache": false,
        "enable_quality_processing": true,
        "force_ocr": true,
        "output_format": "markdown",
        "result_format": "unified",
        "max_concurrent_extractions": 16,
        "ocr": {
            "backend": "tesseract",
            "language": "eng"
        },
        "chunking": {
            "max_characters": 2000,
            "overlap": 400,
            "strategy": "sliding_window"
        }
    }"#;

    let parsed = serde_json::from_str::<ExtractionConfig>(payload).expect("Failed to deserialize complex config");

    // Scalar settings at the top level.
    assert!(!parsed.use_cache);
    assert!(parsed.enable_quality_processing);
    assert!(parsed.force_ocr);
    assert_eq!(parsed.max_concurrent_extractions, Some(16));

    // Both nested sections must exist before their contents are inspected.
    assert!(parsed.ocr.is_some());
    assert!(parsed.chunking.is_some());

    let ocr_section = parsed.ocr.unwrap();
    assert_eq!(ocr_section.backend, "tesseract");
    assert_eq!(ocr_section.language, "eng");

    let chunking_section = parsed.chunking.unwrap();
    assert_eq!(chunking_section.max_characters, 2000);
    assert_eq!(chunking_section.overlap, 400);
}

/// An empty JSON object deserializes successfully and every checked field
/// carries its default value.
#[test]
fn test_cli_empty_config_uses_defaults() {
    let defaults = serde_json::from_str::<ExtractionConfig>(r#"{}"#).expect("Failed to deserialize empty config");

    // Boolean defaults.
    assert!(defaults.use_cache, "Default use_cache should be true");
    assert!(
        defaults.enable_quality_processing,
        "Default enable_quality_processing should be true"
    );
    assert!(!defaults.force_ocr, "Default force_ocr should be false");

    // No concurrency limit unless one is supplied.
    let limit = defaults.max_concurrent_extractions;
    assert_eq!(limit, None, "Default max_concurrent_extractions should be None");
}

/// Parse -> serialize -> parse must leave the explicitly set fields untouched.
#[test]
fn test_cli_roundtrip_preserves_all_fields() {
    let source = r#"{
        "use_cache": false,
        "force_ocr": true,
        "max_concurrent_extractions": 12
    }"#;

    // First pass: text into a typed config.
    let first_pass = serde_json::from_str::<ExtractionConfig>(source).expect("Failed to deserialize");

    // Turn the typed config back into a generic JSON value...
    let as_value = serde_json::to_value(&first_pass).expect("Failed to serialize");

    // ...and read that value into a typed config once more.
    let second_pass =
        serde_json::from_value::<ExtractionConfig>(as_value).expect("Failed to deserialize roundtripped config");

    // The values set in `source` must have survived the trip.
    assert!(!second_pass.use_cache);
    assert!(second_pass.force_ocr);
    assert_eq!(second_pass.max_concurrent_extractions, Some(12));
}

/// Each lowercase `output_format` string maps onto the matching `OutputFormat` variant.
#[test]
fn test_cli_output_format_enum_parsing() {
    // (JSON payload, variant it must deserialize to)
    let expectations = [
        (r#"{"output_format": "plain"}"#, OutputFormat::Plain),
        (r#"{"output_format": "markdown"}"#, OutputFormat::Markdown),
        (r#"{"output_format": "html"}"#, OutputFormat::Html),
    ];

    for (payload, wanted) in expectations {
        let parsed = match serde_json::from_str::<ExtractionConfig>(payload) {
            Ok(parsed) => parsed,
            Err(_) => panic!("Failed to deserialize {}", payload),
        };

        assert_eq!(parsed.output_format, wanted, "output_format should match expected value");
    }
}

/// Both `result_format` spellings used by the CLI must be accepted by the deserializer.
///
/// Only successful deserialization is checked; the resulting value is not inspected.
#[test]
fn test_cli_result_format_enum_parsing() {
    let payloads = [
        r#"{"result_format": "unified"}"#,
        r#"{"result_format": "element_based"}"#,
    ];

    for payload in payloads {
        let outcome: Result<ExtractionConfig, _> = serde_json::from_str(payload);
        assert!(outcome.is_ok(), "Should deserialize result_format from {}", payload);
    }
}

/// Simulates the `--config-json-base64` flag: a JSON payload is base64-encoded,
/// decoded again, and the decoded text is deserialized into a config.
#[test]
fn test_cli_base64_encoded_config_simulation() {
    // `Engine` provides the `encode` / `decode` methods on `STANDARD`.
    use base64::engine::general_purpose::STANDARD;
    use base64::Engine;

    let payload = json!({
        "force_ocr": true,
        "output_format": "markdown"
    })
    .to_string();

    // Encode the way a caller would before handing the value to the CLI.
    let encoded = STANDARD.encode(&payload);

    // Decode the way the CLI would: base64 -> bytes -> UTF-8 text.
    let decoded_bytes = STANDARD.decode(&encoded).expect("Failed to decode base64");
    let decoded_text = String::from_utf8(decoded_bytes).expect("Failed to convert bytes to string");

    // The decoded text must deserialize like any other JSON config.
    let parsed = serde_json::from_str::<ExtractionConfig>(&decoded_text)
        .expect("Failed to deserialize base64-decoded config");

    assert!(parsed.force_ocr);
    assert_eq!(parsed.output_format, OutputFormat::Markdown);
}

/// A partial override object, merged key by key over the serialized default
/// config, changes only the keys it names.
#[test]
fn test_cli_partial_override_merging() {
    let defaults = ExtractionConfig::default();
    let overrides = json!({"force_ocr": true, "use_cache": false});

    // Simulated CLI merge: defaults -> JSON, overlay the overrides, JSON -> config.
    let mut merged_json = serde_json::to_value(&defaults).expect("Failed to serialize base config");

    // Only merge when both sides are JSON objects; override keys replace default keys.
    if let (Some(target), serde_json::Value::Object(patch)) = (merged_json.as_object_mut(), overrides) {
        target.extend(patch);
    }

    let merged = serde_json::from_value::<ExtractionConfig>(merged_json).expect("Failed to deserialize merged config");

    assert!(merged.force_ocr, "Override should apply force_ocr");
    assert!(!merged.use_cache, "Override should apply use_cache");
    assert!(
        merged.enable_quality_processing,
        "Unoverridden field should retain default"
    );
}

/// Pins how an unrecognized key in otherwise valid JSON is handled.
///
/// Whether `ExtractionConfig` rejects unknown keys cannot be determined from this
/// file, so both outcomes are accepted. Each outcome is checked explicitly, so the
/// test can no longer pass without asserting anything:
///
/// * `Ok`  — unknown keys are ignored and the known `force_ocr` key is still applied.
/// * `Err` — the failure must be about the unknown key, not some unrelated parse error.
#[test]
fn test_cli_invalid_json_error_handling() {
    let invalid_json_str = r#"{"force_ocr": true, "invalid_field": "value"}"#;

    match serde_json::from_str::<ExtractionConfig>(invalid_json_str) {
        Ok(config) => assert!(
            config.force_ocr,
            "known fields must still be applied when unknown fields are ignored"
        ),
        Err(err) => {
            // serde reports rejected keys with an "unknown field `...`" message.
            let message = err.to_string();
            assert!(
                message.contains("unknown field"),
                "rejection should be caused by the unknown field, got: {}",
                message
            );
        }
    }
}

/// Insignificant JSON whitespace (none, extra spaces, newlines) must not affect parsing.
#[test]
fn test_cli_whitespace_handling_in_json() {
    let variants = [
        r#"{"force_ocr":true}"#,     // compact
        r#"{ "force_ocr" : true }"#, // padded with spaces
        r#"{
            "force_ocr": true
        }"#, // multi-line with indentation
    ];

    for variant in variants {
        let parsed = match serde_json::from_str::<ExtractionConfig>(variant) {
            Ok(parsed) => parsed,
            Err(_) => panic!("Failed to parse: {}", variant),
        };

        assert!(parsed.force_ocr);
    }
}

/// Small, large and zero values of `max_concurrent_extractions` are all parsed verbatim.
#[test]
fn test_cli_numeric_boundary_values() {
    // (JSON payload, value the field must hold afterwards)
    let boundaries = [
        (r#"{"max_concurrent_extractions": 1}"#, Some(1)),
        (r#"{"max_concurrent_extractions": 256}"#, Some(256)),
        // Zero is accepted by the deserializer as-is.
        (r#"{"max_concurrent_extractions": 0}"#, Some(0)),
    ];

    for (payload, wanted) in boundaries {
        let parsed = match serde_json::from_str::<ExtractionConfig>(payload) {
            Ok(parsed) => parsed,
            Err(_) => panic!("Failed to parse: {}", payload),
        };

        assert_eq!(
            parsed.max_concurrent_extractions, wanted,
            "Numeric values should be parsed correctly"
        );
    }
}

/// Boolean fields accept only the JSON literals `true` / `false`.
///
/// The first loop checks that real booleans round into the field unchanged. The
/// second loop covers the "strict" half of the contract: truthy/falsy stand-ins
/// (numbers, strings) must be rejected rather than coerced.
#[test]
fn test_cli_boolean_values_strict_parsing() {
    let accepted = [(r#"{"use_cache": true}"#, true), (r#"{"use_cache": false}"#, false)];

    for (config_str, expected_value) in accepted {
        let config: ExtractionConfig =
            serde_json::from_str(config_str).unwrap_or_else(|_| panic!("Failed to parse: {}", config_str));

        assert_eq!(config.use_cache, expected_value);
    }

    // Values that other languages would treat as truthy/falsy are not booleans in JSON.
    let rejected = [
        r#"{"use_cache": 1}"#,
        r#"{"use_cache": 0}"#,
        r#"{"use_cache": "true"}"#,
        r#"{"use_cache": "false"}"#,
    ];

    for config_str in rejected {
        assert!(
            serde_json::from_str::<ExtractionConfig>(config_str).is_err(),
            "Non-boolean value should be rejected: {}",
            config_str
        );
    }
}

/// A config built in code, serialized to a JSON string and parsed back (as the
/// CLI would parse it) must match the original on every field set here.
#[test]
fn test_cli_config_consistency_across_formats() {
    // Reference config constructed directly in Rust.
    let built = ExtractionConfig {
        use_cache: false,
        enable_quality_processing: true,
        force_ocr: true,
        output_format: OutputFormat::Markdown,
        max_concurrent_extractions: Some(4),
        ..Default::default()
    };

    // Config -> JSON value -> JSON text, i.e. what a user would pass on the command line.
    let json_text = serde_json::to_value(&built).expect("Failed to serialize").to_string();

    // JSON text -> config, mirroring the CLI parsing step.
    let parsed = serde_json::from_str::<ExtractionConfig>(&json_text).expect("Failed to deserialize from string");

    // Field-by-field comparison of the round-tripped config against the reference.
    assert_eq!(parsed.use_cache, built.use_cache);
    assert_eq!(parsed.enable_quality_processing, built.enable_quality_processing);
    assert_eq!(parsed.force_ocr, built.force_ocr);
    assert_eq!(parsed.output_format, built.output_format);
    assert_eq!(parsed.max_concurrent_extractions, built.max_concurrent_extractions);
}

// NOTE: `base64::Engine` is brought into scope by a function-local `use` inside
// `test_cli_base64_encoded_config_simulation`; no file-level re-export is needed.