// pmat 3.14.0 - Docs.rs

// ── Tests ──────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;

    /// `is_document_file` accepts the document and image extensions listed here
    /// (including an upper-case `JPEG`) and rejects source and manifest files.
    #[test]
    fn test_is_document_file() {
        let documents = [
            "docs/spec.pdf",
            "diagram.svg",
            "screenshot.png",
            "photo.jpg",
            "photo.JPEG",
            "README.md",
            "notes.txt",
            "doc.rst",
            "doc.adoc",
        ];
        for name in documents {
            assert!(
                is_document_file(Path::new(name)),
                "{name} should be treated as a document"
            );
        }

        let non_documents = ["main.rs", "lib.py", "Cargo.toml"];
        for name in non_documents {
            assert!(
                !is_document_file(Path::new(name)),
                "{name} should not be treated as a document"
            );
        }
    }

    /// An SVG with `<text>` and `<tspan>` elements yields a single chunk
    /// containing the text of both.
    #[test]
    fn test_extract_svg_with_text() {
        let dir = tempfile::tempdir().unwrap();
        let svg_path = dir.path().join("diagram.svg");
        std::fs::write(
            &svg_path,
            r#"<svg><text x="10" y="20">Hello World</text><tspan>Sub text</tspan></svg>"#,
        )
        .unwrap();

        let chunks = extract_svg(&svg_path, "diagram.svg", "abc123").unwrap();
        assert_eq!(chunks.len(), 1);
        assert!(chunks[0].text_content.contains("Hello World"));
        assert!(chunks[0].text_content.contains("Sub text"));
        assert_eq!(chunks[0].doc_type, DocumentType::Svg);
    }

    /// An SVG without any text elements still yields one chunk: a placeholder
    /// mentioning "no text content" with an extraction quality of 0.2.
    #[test]
    fn test_extract_svg_no_text() {
        let dir = tempfile::tempdir().unwrap();
        let svg_path = dir.path().join("empty.svg");
        std::fs::write(&svg_path, r#"<svg><rect width="100" height="100"/></svg>"#).unwrap();

        let chunks = extract_svg(&svg_path, "empty.svg", "def456").unwrap();
        assert_eq!(chunks.len(), 1);
        assert!(chunks[0].text_content.contains("no text content"));
        assert_eq!(chunks[0].extraction_quality, 0.2);
    }

    /// The `<title>` of an SVG is extracted alongside its `<text>` content.
    #[test]
    fn test_extract_svg_with_title() {
        let dir = tempfile::tempdir().unwrap();
        let svg_path = dir.path().join("titled.svg");
        std::fs::write(
            &svg_path,
            r#"<svg><title>Architecture Diagram</title><text>Node A</text></svg>"#,
        )
        .unwrap();

        let chunks = extract_svg(&svg_path, "titled.svg", "ghi789").unwrap();
        assert_eq!(chunks.len(), 1);
        assert!(chunks[0].text_content.contains("Architecture Diagram"));
        assert!(chunks[0].text_content.contains("Node A"));
    }

    /// Image extraction produces one metadata chunk that mentions the file name
    /// and its directory, tagged as an image with an extraction quality of 0.3.
    #[test]
    fn test_extract_image_metadata() {
        let dir = tempfile::tempdir().unwrap();
        let img_path = dir.path().join("screenshot.png");
        // The payload is not a real PNG; the assertions below only cover path-derived metadata.
        std::fs::write(&img_path, b"fake png data").unwrap();

        let chunks =
            extract_image_metadata(&img_path, "docs/screenshots/screenshot.png", "hash1").unwrap();
        assert_eq!(chunks.len(), 1);
        assert!(chunks[0].text_content.contains("screenshot.png"));
        assert!(chunks[0].text_content.contains("docs/screenshots"));
        assert_eq!(chunks[0].extraction_quality, 0.3);
        assert_eq!(chunks[0].doc_type, DocumentType::Image);
    }

    /// Markdown is split at headings: each chunk carries its heading text and
    /// the body that follows it.
    #[test]
    fn test_extract_markdown_with_headings() {
        let dir = tempfile::tempdir().unwrap();
        let md_path = dir.path().join("doc.md");
        std::fs::write(
            &md_path,
            "# Title\n\nIntro paragraph.\n\n## Section A\n\nContent A.\n\n## Section B\n\nContent B.\n",
        )
        .unwrap();

        let chunks = extract_markdown(&md_path, "doc.md", "hash2").unwrap();
        assert!(chunks.len() >= 2);
        // First chunk should be the "Title" section with "Intro paragraph".
        assert_eq!(chunks[0].section_heading.as_deref(), Some("Title"));
        assert!(chunks[0].text_content.contains("Intro paragraph"));
        // Second chunk should be "Section A".
        assert_eq!(chunks[1].section_heading.as_deref(), Some("Section A"));
        assert!(chunks[1].text_content.contains("Content A"));
    }

    /// Markdown without headings becomes a single chunk with no section heading.
    #[test]
    fn test_extract_markdown_no_headings() {
        let dir = tempfile::tempdir().unwrap();
        let md_path = dir.path().join("flat.md");
        std::fs::write(&md_path, "Just some plain text\nwith no headings.\n").unwrap();

        let chunks = extract_markdown(&md_path, "flat.md", "hash3").unwrap();
        assert_eq!(chunks.len(), 1);
        assert!(chunks[0].text_content.contains("plain text"));
        assert!(chunks[0].section_heading.is_none());
    }

    /// An empty markdown file yields no chunks.
    #[test]
    fn test_extract_markdown_empty() {
        let dir = tempfile::tempdir().unwrap();
        let md_path = dir.path().join("empty.md");
        std::fs::write(&md_path, "").unwrap();

        let chunks = extract_markdown(&md_path, "empty.md", "hash4").unwrap();
        assert!(chunks.is_empty());
    }

    /// A short plain-text file becomes a single `PlainText` chunk.
    #[test]
    fn test_extract_plaintext() {
        let dir = tempfile::tempdir().unwrap();
        let txt_path = dir.path().join("notes.txt");
        std::fs::write(&txt_path, "Line 1\nLine 2\nLine 3\n").unwrap();

        let chunks = extract_plaintext(&txt_path, "notes.txt", "hash5").unwrap();
        assert_eq!(chunks.len(), 1);
        assert!(chunks[0].text_content.contains("Line 1"));
        assert_eq!(chunks[0].doc_type, DocumentType::PlainText);
    }

    /// A whitespace-only plain-text file yields no chunks.
    #[test]
    fn test_extract_plaintext_empty() {
        let dir = tempfile::tempdir().unwrap();
        let txt_path = dir.path().join("empty.txt");
        std::fs::write(&txt_path, "  \n  \n").unwrap();

        let chunks = extract_plaintext(&txt_path, "empty.txt", "hash6").unwrap();
        assert!(chunks.is_empty());
    }

    /// Content larger than `MAX_CHUNK_SIZE` is split into several chunks, each
    /// no longer than the limit plus a small allowance.
    #[test]
    fn test_split_into_chunks_large_content() {
        // Create content larger than MAX_CHUNK_SIZE.
        let content: String = (0..500)
            .map(|i| format!("Line {i}: This is a test line with some content.\n"))
            .collect();
        assert!(content.len() > MAX_CHUNK_SIZE);

        let chunks = split_into_chunks(&content, "big.txt", DocumentType::PlainText, "hash7", 1.0);
        assert!(chunks.len() > 1);
        for chunk in &chunks {
            // Allow some slack for the last line of a chunk.
            assert!(chunk.text_content.len() <= MAX_CHUNK_SIZE + 100);
        }
    }

    /// Short input passes through `truncate_to_max_chunk` unchanged; oversized
    /// input comes back no longer than `MAX_CHUNK_SIZE`.
    #[test]
    fn test_truncate_to_max_chunk() {
        let short = "hello world";
        assert_eq!(truncate_to_max_chunk(short), "hello world");

        // Two bytes per repetition, so this is twice the limit.
        let long = "a ".repeat(MAX_CHUNK_SIZE);
        let truncated = truncate_to_max_chunk(&long);
        assert!(truncated.len() <= MAX_CHUNK_SIZE);
    }

    /// `extract_document` succeeds for SVG, markdown, plain-text and image
    /// files and returns an error for an unsupported extension.
    #[test]
    fn test_extract_document_dispatcher() {
        let dir = tempfile::tempdir().unwrap();

        // SVG dispatch
        let svg_path = dir.path().join("test.svg");
        std::fs::write(&svg_path, "<svg><text>Hello</text></svg>").unwrap();
        assert!(extract_document(&svg_path, "test.svg", "h1").is_ok());

        // Markdown dispatch
        let md_path = dir.path().join("test.md");
        std::fs::write(&md_path, "# Hello\nWorld").unwrap();
        assert!(extract_document(&md_path, "test.md", "h2").is_ok());

        // Plaintext dispatch
        let txt_path = dir.path().join("test.txt");
        std::fs::write(&txt_path, "Hello").unwrap();
        assert!(extract_document(&txt_path, "test.txt", "h3").is_ok());

        // Image dispatch
        let img_path = dir.path().join("test.png");
        std::fs::write(&img_path, b"PNG").unwrap();
        assert!(extract_document(&img_path, "test.png", "h4").is_ok());

        // Unsupported
        let rs_path = dir.path().join("test.rs");
        std::fs::write(&rs_path, "fn main() {}").unwrap();
        assert!(extract_document(&rs_path, "test.rs", "h5").is_err());
    }

    /// Without the `doc-indexing` feature, `extract_pdf` returns a single
    /// metadata-only chunk that mentions the feature name, with an extraction
    /// quality of 0.1.
    #[test]
    fn test_pdf_without_feature() {
        let dir = tempfile::tempdir().unwrap();
        let pdf_path = dir.path().join("test.pdf");
        // Written in one call so no write handle is still open while the extractor runs.
        std::fs::write(&pdf_path, b"%PDF-1.4 fake").unwrap();

        let result = extract_pdf(&pdf_path, "test.pdf", "hashpdf");

        // Without the doc-indexing feature, this returns a metadata-only result.
        #[cfg(not(feature = "doc-indexing"))]
        {
            let chunks = result.unwrap();
            assert_eq!(chunks.len(), 1);
            assert!(chunks[0].text_content.contains("doc-indexing"));
            assert_eq!(chunks[0].extraction_quality, 0.1);
        }

        // With the feature, real extraction is attempted and likely fails on this fake
        // payload, so the outcome is deliberately not asserted. The binding is still
        // consumed so feature-enabled builds do not warn about an unused variable.
        #[cfg(feature = "doc-indexing")]
        {
            let _ = result;
        }
    }
}