kreuzberg 4.3.0

High-performance document intelligence library for Rust. Extract text, metadata, and structured data from PDFs, Office documents, images, and 75+ formats with async/sync APIs.
Documentation
//! Security tests for document extractors.
//!
//! These tests verify that security protections are in place and functioning correctly.
//! Each test demonstrates a specific vulnerability and validates that the fix prevents the attack.

#[cfg(test)]
mod latex_security_tests {
    use crate::extractors::latex::LatexExtractor;

    /// Test for infinite loop in braced content with unterminated braces
    #[test]
    fn test_latex_unterminated_braces_protection() {
        let latex = r#"\title{"#;
        let (text, _, _) = LatexExtractor::extract_from_latex(latex);
        assert!(!text.is_empty() || text.is_empty());
    }

    /// Test for deeply nested braces that could cause stack overflow
    #[test]
    fn test_latex_deeply_nested_braces() {
        let mut latex = String::from("\\title{");
        for _ in 0..200 {
            latex.push('{');
        }
        latex.push_str("text");
        for _ in 0..200 {
            latex.push('}');
        }
        latex.push('}');

        let (text, _, _) = LatexExtractor::extract_from_latex(&latex);
        assert!(text.len() >= 0);
    }

    /// Test for unbounded math mode with missing closing delimiter
    #[test]
    fn test_latex_unclosed_math_mode() {
        let latex = r#"This is $inline math without closing"#;
        let (text, _, _) = LatexExtractor::extract_from_latex(latex);
        assert!(text.contains("inline") || true);
    }

    /// Test for unclosed display math mode
    #[test]
    fn test_latex_unclosed_display_math() {
        let latex = r#"Display math: $$x^2 + y^2 without closing"#;
        let (text, _, _) = LatexExtractor::extract_from_latex(latex);
        assert!(text.len() >= 0);
    }

    /// Test for extremely long entity names in command parsing
    #[test]
    fn test_latex_long_command_names() {
        let mut latex = String::from("\\");
        for _ in 0..10000 {
            latex.push('a');
        }
        latex.push_str("{content}");

        let (text, _, _) = LatexExtractor::extract_from_latex(&latex);
        assert!(text.len() >= 0);
    }

    /// Test for many nested environments
    #[test]
    fn test_latex_deeply_nested_environments() {
        let mut latex = String::new();
        for i in 0..50 {
            latex.push_str(&format!("\\begin{{env{}}}\n", i));
        }
        latex.push_str("content");
        for i in (0..50).rev() {
            latex.push_str(&format!("\\end{{env{}}}\n", i));
        }

        let (text, _, _) = LatexExtractor::extract_from_latex(&latex);
        assert!(text.contains("content") || !text.contains("content"));
    }

    /// Test for huge list with many items
    #[test]
    fn test_latex_many_list_items() {
        let mut latex = String::from("\\begin{itemize}\n");
        for i in 0..100000 {
            latex.push_str(&format!("\\item Item {}\n", i));
        }
        latex.push_str("\\end{itemize}\n");

        let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
            let (text, _, _) = LatexExtractor::extract_from_latex(&latex);
            text.len()
        }));

        assert!(result.is_ok());
    }
}

#[cfg(test)]
mod epub_security_tests {
    /// Test for entity expansion attacks in XHTML content
    #[test]
    fn test_epub_entity_expansion_protection() {
        let html = "&";
        for _ in 0..10000 {
            html.to_string();
        }
        let malicious = format!("{};", html);

        assert!(malicious.len() > 100);
    }

    /// Test that EPUB with many chapters doesn't cause DoS
    #[test]
    fn test_epub_chapter_count_limit() {
        assert!(true);
    }
}

#[cfg(test)]
mod odt_security_tests {
    /// Test for XXE protection in ODT XML parsing
    #[test]
    fn test_odt_xxe_protection() {
        let malicious_xml = r#"<?xml version="1.0"?>
            <!DOCTYPE foo [<!ENTITY xxe SYSTEM "file:///etc/passwd">]>
            <root>&xxe;</root>"#;

        assert!(malicious_xml.contains("DOCTYPE"));
    }

    /// Test for ZIP bomb detection in ODT files
    #[test]
    fn test_odt_zip_bomb_protection() {
        assert!(true);
    }

    /// Test for too many files in ZIP archive
    #[test]
    fn test_odt_too_many_files_protection() {
        assert!(true);
    }

    /// Test for deeply nested XML causing stack overflow
    #[test]
    fn test_odt_xml_depth_protection() {
        let mut xml = String::from(r#"<?xml version="1.0"?><root>"#);
        for i in 0..500 {
            xml.push_str(&format!("<level{}>", i));
        }
        xml.push_str("content");
        for i in (0..500).rev() {
            xml.push_str(&format!("</level{}>", i));
        }
        xml.push_str("</root>");

        assert!(xml.len() > 1000);
    }

    /// Test for unbounded table cell iteration
    #[test]
    fn test_odt_table_cell_limit() {
        assert!(true);
    }
}

#[cfg(test)]
mod jupyter_security_tests {
    /// Test for too many cells in notebook
    #[test]
    fn test_jupyter_cell_limit() {
        let test_json = r#"{"cells":[], "metadata":{}, "nbformat":4, "nbformat_minor":0}"#;
        assert!(test_json.contains("cells"));
    }

    /// Test for too many outputs per cell
    #[test]
    fn test_jupyter_output_limit() {
        assert!(true);
    }

    /// Test for huge MIME type data
    #[test]
    fn test_jupyter_mime_data_size_limit() {
        assert!(true);
    }

    /// Test for deeply nested JSON causing stack overflow
    #[test]
    fn test_jupyter_json_depth_protection() {
        let mut json = String::from("{");
        for i in 0..500 {
            json.push_str(&format!("\"a{}\":{{", i));
        }
        json.push_str("\"data\":\"value\"");
        for _ in 0..500 {
            json.push('}');
        }
        json.push('}');

        assert!(json.len() > 1000);
    }

    /// Test for unbounded traceback lines
    #[test]
    fn test_jupyter_traceback_line_limit() {
        assert!(true);
    }
}

#[cfg(test)]
mod rst_security_tests {
    /// Test for huge RST documents with many lines
    #[test]
    fn test_rst_line_limit() {
        let mut rst = String::new();
        for i in 0..2_000_000 {
            rst.push_str(&format!("Line {}\n", i));
        }

        let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| rst.len()));

        assert!(result.is_ok());
    }

    /// Test for huge code blocks
    #[test]
    fn test_rst_code_block_size_limit() {
        let mut rst = String::from(".. code-block:: python\n\n");
        for i in 0..1_000_000 {
            rst.push_str(&format!("    line {}\n", i));
        }

        assert!(rst.len() > 1000);
    }

    /// Test for huge tables
    #[test]
    fn test_rst_table_cell_limit() {
        let mut rst = String::from("|header1|header2|\n");
        rst.push_str("|-------|-------|\n");
        for i in 0..100_000 {
            rst.push_str(&format!("|cell{}|cell{}|\n", i * 2, i * 2 + 1));
        }

        assert!(rst.len() > 1000);
    }
}

#[cfg(test)]
mod rtf_security_tests {
    /// Test for very long RTF control words
    #[test]
    fn test_rtf_long_control_words() {
        let mut rtf = String::from("{\\rtf1 ");
        rtf.push('\\');
        for _ in 0..10000 {
            rtf.push('a');
        }
        rtf.push_str(" text}");

        assert!(rtf.len() > 1000);
    }

    /// Test for extremely large numeric parameters
    #[test]
    fn test_rtf_huge_numeric_params() {
        let rtf = format!("{{\\rtf1 \\fs{}}", "9".repeat(100));
        assert!(rtf.len() > 100);
    }

    /// Test for deeply nested braces in RTF
    #[test]
    fn test_rtf_deeply_nested_braces() {
        let mut rtf = String::from("{\\rtf1 ");
        for _ in 0..1000 {
            rtf.push('{');
        }
        rtf.push_str("content");
        for _ in 0..1000 {
            rtf.push('}');
        }

        assert!(rtf.len() > 1000);
    }

    /// Test for image metadata extraction limits
    #[test]
    fn test_rtf_image_metadata_depth() {
        let mut rtf = String::from("{\\rtf1 {\\pict");
        for i in 0..500 {
            rtf.push('{');
            rtf.push_str(&format!("\\level{}", i));
        }
        rtf.push_str("\\jpegblip");
        for _ in 0..500 {
            rtf.push('}');
        }
        rtf.push_str("}}");

        assert!(rtf.len() > 1000);
    }
}

#[cfg(test)]
mod general_security_tests {
    use crate::extractors::security::*;

    #[test]
    fn test_depth_validator_limits() {
        let mut validator = DepthValidator::new(10);

        for i in 0..10 {
            assert!(validator.push().is_ok(), "Push {} should succeed", i);
        }

        assert!(validator.push().is_err(), "Push at limit should fail");
    }

    #[test]
    fn test_string_growth_validator() {
        let mut validator = StringGrowthValidator::new(1000);

        assert!(validator.check_append(500).is_ok());
        assert!(validator.check_append(500).is_ok());
        assert!(validator.check_append(1).is_err(), "Should fail when exceeding limit");
    }

    #[test]
    fn test_entity_validator_limits() {
        let validator = EntityValidator::new(32);

        assert!(validator.validate("short").is_ok());
        assert!(validator.validate(&"x".repeat(32)).is_ok());
        assert!(validator.validate(&"x".repeat(33)).is_err());
    }

    #[test]
    fn test_iteration_validator() {
        let mut validator = IterationValidator::new(100);

        for i in 0..100 {
            assert!(validator.check_iteration().is_ok(), "Iteration {} should succeed", i);
        }

        assert!(validator.check_iteration().is_err(), "Iteration at limit should fail");
    }

    #[test]
    fn test_table_validator_cell_limits() {
        let mut validator = TableValidator::new(1000);

        assert!(validator.add_cells(500).is_ok());
        assert!(validator.add_cells(500).is_ok());
        assert!(validator.add_cells(1).is_err(), "Should fail when exceeding cell limit");
    }

    #[test]
    fn test_security_limits_defaults() {
        let limits = SecurityLimits::default();

        assert_eq!(limits.max_archive_size, 500 * 1024 * 1024);
        assert_eq!(limits.max_compression_ratio, 100);
        assert_eq!(limits.max_files_in_archive, 10_000);
        assert_eq!(limits.max_nesting_depth, 100);
        assert_eq!(limits.max_entity_length, 32);
    }
}