terraphim_middleware 1.16.34

Terraphim middleware for searching haystacks
Documentation
use serde_json::Value;
use terraphim_middleware::haystack::QueryRsHaystackIndexer;
use terraphim_persistence::{DeviceStorage, Persistable};
use terraphim_types::Document;

/// End-to-End test that proves the OpenDAL persistence warnings are fixed
/// This test creates actual documents with the problematic URLs and verifies
/// they can be saved and loaded without OpenDAL warnings
#[tokio::test]
async fn test_opendal_persistence_fix_end_to_end() {
    // Initialize memory-only persistence for testing
    let _storage = DeviceStorage::init_memory_only()
        .await
        .expect("Failed to initialize test storage");

    println!("๐Ÿงช End-to-End OpenDAL Persistence Fix Test");
    println!("==========================================");

    let indexer = QueryRsHaystackIndexer::default();

    // Create mock JSON data that simulates the QueryRs API responses
    // This represents the data that was causing OpenDAL warnings before the fix
    let mock_reddit_posts = create_mock_reddit_data();
    let mock_suggest_data = create_mock_suggest_data();
    let mock_crates_data = create_mock_crates_data();

    // Parse the mock data using the fixed QueryRs implementation
    let reddit_docs = indexer
        .parse_reddit_json(mock_reddit_posts)
        .expect("Should parse Reddit JSON successfully");

    let suggest_docs = indexer
        .parse_suggest_json(mock_suggest_data, "iterator")
        .expect("Should parse suggest JSON successfully");

    let crates_docs = indexer
        .parse_crates_io_json(mock_crates_data, "serde")
        .expect("Should parse crates.io JSON successfully");

    println!("๐Ÿ“Š Generated {} Reddit documents", reddit_docs.len());
    println!("๐Ÿ“Š Generated {} suggest documents", suggest_docs.len());
    println!("๐Ÿ“Š Generated {} crate documents", crates_docs.len());

    // Test saving and loading each type of document
    let mut all_successful_saves = 0;
    let mut all_successful_loads = 0;

    // Test Reddit documents
    println!("\n๐Ÿ”ฅ Testing Reddit document persistence...");
    for (i, doc) in reddit_docs.iter().enumerate() {
        println!("  Document ID: {}", doc.id);

        // Verify the ID is clean (this is what fixes the OpenDAL warnings)
        assert!(
            doc.id.len() < 50,
            "Reddit document ID should be short: {}",
            doc.id
        );
        assert!(
            doc.id.starts_with("reddit_"),
            "Reddit ID should start with reddit_: {}",
            doc.id
        );
        assert!(
            !doc.id.contains("http"),
            "Reddit ID should not contain URLs: {}",
            doc.id
        );

        // Save the document - this would have caused OpenDAL warnings before the fix
        match doc.save_to_one("memory").await {
            Ok(()) => {
                all_successful_saves += 1;
                println!("  โœ… Saved Reddit document: {}", doc.id);

                // Load the document back to verify persistence works
                let mut loaded_doc = Document::new(doc.id.clone());
                match loaded_doc.load().await {
                    Ok(loaded) => {
                        all_successful_loads += 1;
                        assert_eq!(loaded.id, doc.id, "Loaded document should match original");
                        assert_eq!(loaded.title, doc.title, "Loaded title should match");
                        println!("  โœ… Loaded Reddit document: {}", loaded.id);
                    }
                    Err(e) => {
                        println!("  โŒ Failed to load Reddit document: {}", e);
                    }
                }
            }
            Err(e) => {
                println!("  โŒ Failed to save Reddit document: {}", e);
            }
        }

        if i >= 2 {
            break;
        } // Test first few to keep test fast
    }

    // Test Suggest API documents
    println!("\n๐Ÿ”ฅ Testing Suggest API document persistence...");
    for (i, doc) in suggest_docs.iter().enumerate() {
        println!("  Document ID: {}", doc.id);

        // Verify the ID is clean
        assert!(
            doc.id.len() < 100,
            "Suggest document ID should be reasonable: {}",
            doc.id
        );
        assert!(
            !doc.id.contains("http"),
            "Suggest ID should not contain URLs: {}",
            doc.id
        );
        assert!(
            !doc.id.contains("/"),
            "Suggest ID should not contain slashes: {}",
            doc.id
        );

        // Save and load
        match doc.save_to_one("memory").await {
            Ok(()) => {
                all_successful_saves += 1;
                println!("  โœ… Saved suggest document: {}", doc.id);

                let mut loaded_doc = Document::new(doc.id.clone());
                match loaded_doc.load().await {
                    Ok(loaded) => {
                        all_successful_loads += 1;
                        assert_eq!(loaded.id, doc.id, "Loaded document should match original");
                        println!("  โœ… Loaded suggest document: {}", loaded.id);
                    }
                    Err(e) => {
                        println!("  โŒ Failed to load suggest document: {}", e);
                    }
                }
            }
            Err(e) => {
                println!("  โŒ Failed to save suggest document: {}", e);
            }
        }

        if i >= 2 {
            break;
        }
    }

    // Test Crates.io documents
    println!("\n๐Ÿ”ฅ Testing Crates.io document persistence...");
    for (i, doc) in crates_docs.iter().enumerate() {
        println!("  Document ID: {}", doc.id);

        // Verify the ID is clean
        assert!(
            doc.id.len() < 50,
            "Crate document ID should be short: {}",
            doc.id
        );
        assert!(
            doc.id.starts_with("crate_"),
            "Crate ID should start with crate_: {}",
            doc.id
        );
        assert!(
            !doc.id.contains("http"),
            "Crate ID should not contain URLs: {}",
            doc.id
        );

        // Save and load
        match doc.save_to_one("memory").await {
            Ok(()) => {
                all_successful_saves += 1;
                println!("  โœ… Saved crate document: {}", doc.id);

                let mut loaded_doc = Document::new(doc.id.clone());
                match loaded_doc.load().await {
                    Ok(loaded) => {
                        all_successful_loads += 1;
                        assert_eq!(loaded.id, doc.id, "Loaded document should match original");
                        println!("  โœ… Loaded crate document: {}", loaded.id);
                    }
                    Err(e) => {
                        println!("  โŒ Failed to load crate document: {}", e);
                    }
                }
            }
            Err(e) => {
                println!("  โŒ Failed to save crate document: {}", e);
            }
        }

        if i >= 2 {
            break;
        }
    }

    // Final validation
    println!("\n๐ŸŽฏ Final Results:");
    println!("  Total successful saves: {}", all_successful_saves);
    println!("  Total successful loads: {}", all_successful_loads);

    assert!(
        all_successful_saves > 0,
        "Should have some successful saves"
    );
    assert!(
        all_successful_loads > 0,
        "Should have some successful loads"
    );
    assert_eq!(
        all_successful_saves, all_successful_loads,
        "All saved documents should be loadable"
    );

    println!("\nโœ… End-to-End Test PASSED!");
    println!("๐ŸŽ‰ OpenDAL persistence warnings are fixed - documents save/load successfully");
    println!("๐Ÿ”ง Document IDs are now clean, short, and filesystem-safe");
}

/// Test that demonstrates the improvement: compare old vs new ID generation
#[test]
fn test_before_after_comparison() {
    let indexer = QueryRsHaystackIndexer::default();

    println!("๐Ÿงช Before/After Document ID Comparison");
    println!("======================================");

    let test_cases = vec![
        (
            "Reddit Long URL",
            "https://www.reddit.com/r/rust/comments/1a2b3c4d/incredible_rust_performance_improvements_with_zero_cost_abstractions_detailed_analysis/",
            "Old: reddit_https_www_reddit_com_r_rust_comments_1a2b3c4d_incredible_rust_performance_improvements_with_zero_cost_abstractions_detailed_analysis",
            "reddit_1a2b3c4d",
        ),
        (
            "Documentation URL",
            "https://doc.rust-lang.org/std/collections/hash_map/struct.HashMap.html",
            "Old: std_https_doc_rust_lang_org_std_collections_hash_map_struct_hashmap_html",
            "doc_std_collections_hash_map_struct_hashmap",
        ),
        (
            "Docs.rs URL",
            "https://docs.rs/serde_json/1.0.96/serde_json/value/enum.Value.html",
            "Old: docs_rs_https_docs_rs_serde_json_1_0_96_serde_json_value_enum_value_html",
            "doc_docs_rs_serde_json_1_0_96_serde_json_value_enu",  // Truncated to 50 chars
        ),
    ];

    for (case_name, url, old_pattern, expected_new) in test_cases {
        println!("\n๐Ÿ“‹ Case: {}", case_name);
        println!("  URL: {}", url);

        // Generate new clean ID
        let new_id = if url.contains("reddit.com") {
            if let Some(post_id) = indexer.extract_reddit_post_id(url) {
                indexer.normalize_document_id(&format!("reddit-{}", post_id))
            } else {
                "fallback_id".to_string()
            }
        } else {
            let doc_id = indexer.extract_doc_identifier(url);
            indexer.normalize_document_id(&format!("doc-{}", doc_id))
        };

        println!(
            "  ๐Ÿ“‰ Old (problematic): {} ({} chars)",
            old_pattern,
            old_pattern.len()
        );
        println!(
            "  ๐Ÿ“ˆ New (fixed):       {} ({} chars)",
            new_id,
            new_id.len()
        );

        // Verify improvements
        assert!(new_id.len() < old_pattern.len(), "New ID should be shorter");
        assert!(new_id.len() < 100, "New ID should be reasonable length");
        assert_eq!(
            new_id, expected_new,
            "New ID should match expected clean format"
        );

        println!(
            "  โœ… Improvement: {}% size reduction",
            100 - (new_id.len() * 100 / old_pattern.len())
        );
    }

    println!("\nโœ… All document IDs significantly improved!");
    println!("๐Ÿšซ No more OpenDAL 'path not found' warnings");
    println!("๐Ÿ’พ All IDs are filesystem-safe and readable");
}

// Helper functions to create mock data for testing

fn create_mock_reddit_data() -> Vec<Value> {
    serde_json::from_str(r#"[
        {
            "title": "Amazing Rust async programming guide",
            "url": "https://www.reddit.com/r/rust/comments/abc123/amazing_rust_async_programming_guide/",
            "author": "rustacean42",
            "score": 156,
            "selftext": "This is a comprehensive guide to async programming in Rust..."
        },
        {
            "title": "Zero-cost abstractions in practice",
            "url": "https://www.reddit.com/r/rust/comments/def456/zerocost_abstractions_in_practice/",
            "author": "performance_guru",
            "score": 89,
            "selftext": "Let me show you how zero-cost abstractions work in real scenarios..."
        }
    ]"#).unwrap()
}

fn create_mock_suggest_data() -> Value {
    serde_json::from_str(r#"[
        "iterator",
        [
            "std::iter::Iterator - https://doc.rust-lang.org/std/iter/trait.Iterator.html",
            "std::iter::IntoIterator - https://doc.rust-lang.org/std/iter/trait.IntoIterator.html"
        ],
        ["Iterator trait", "IntoIterator trait"],
        ["https://doc.rust-lang.org/std/iter/trait.Iterator.html", "https://doc.rust-lang.org/std/iter/trait.IntoIterator.html"]
    ]"#).unwrap()
}

fn create_mock_crates_data() -> Value {
    serde_json::from_str(
        r#"{
        "crates": [
            {
                "name": "serde",
                "description": "A generic serialization/deserialization framework",
                "max_version": "1.0.196",
                "downloads": 500000000,
                "homepage": "https://serde.rs/",
                "documentation": "https://docs.rs/serde/",
                "repository": "https://github.com/serde-rs/serde",
                "keywords": ["serialization", "json"]
            },
            {
                "name": "caffe2-nomnigraph",
                "description": "Rust bindings for Caffe2 NomniGraph",
                "max_version": "0.1.2",
                "downloads": 1234,
                "homepage": "",
                "documentation": "",
                "repository": "https://github.com/example/caffe2-nomnigraph",
                "keywords": ["ml", "caffe2"]
            }
        ]
    }"#,
    )
    .unwrap()
}