use terraphim_middleware::haystack::QueryRsHaystackIndexer;
/// Checks that Reddit and Rust-documentation URLs end up as clean document IDs.
///
/// Each case pairs a URL with the exact ID expected after the source-specific
/// identifier is extracted, prefixed (`reddit-` or `std-`) and passed through
/// `normalize_document_id`. On top of the exact match, the resulting ID is
/// checked for length and for characters that are awkward in storage keys.
///
/// This is a plain synchronous test: nothing in the body awaits, so it does
/// not need an async runtime (the other tests in this file are synchronous
/// as well).
#[test]
fn test_document_id_fix_comprehensive() {
    let indexer = QueryRsHaystackIndexer::default();
    println!("🧪 Testing Document ID Generation Fix");
    println!("=====================================");

    // (input URL, expected normalized document ID)
    let problematic_urls = [
        (
            "https://www.reddit.com/r/rust/comments/abc123/some_title/",
            "reddit_abc123",
        ),
        (
            "https://www.reddit.com/r/rust/comments/xyz789/rust_async_programming_guide/?utm_source=share",
            "reddit_xyz789",
        ),
        (
            "https://doc.rust-lang.org/std/iter/trait.Iterator.html",
            "std_std_iter_trait_iterator",
        ),
        (
            "https://doc.rust-lang.org/std/collections/struct.HashMap.html",
            "std_std_collections_struct_hashmap",
        ),
        (
            "https://docs.rs/serde/latest/serde/",
            "std_docs_rs_serde_latest_serde",
        ),
        (
            "https://docs.rs/tokio/1.0.0/tokio/runtime/",
            "std_docs_rs_tokio_1_0_0_tokio_runtime",
        ),
    ];

    for (url, expected_clean_id) in problematic_urls {
        println!("\n🔍 Testing URL: {}", url);

        if url.contains("reddit.com") {
            // A missing post ID is a test failure; panic with the URL for context.
            let post_id = indexer
                .extract_reddit_post_id(url)
                .unwrap_or_else(|| panic!("Should extract Reddit post ID from: {}", url));
            let original_id = format!("reddit-{}", post_id);
            let normalized_id = indexer.normalize_document_id(&original_id);
            assert_eq!(
                normalized_id, expected_clean_id,
                "Reddit ID should be clean for: {}",
                url
            );
            println!(" ✅ Reddit: {} -> {}", url, normalized_id);
            assert!(
                normalized_id.len() < 50,
                "ID should be reasonably short: {}",
                normalized_id
            );
            assert!(
                !normalized_id.contains('/'),
                "ID should not contain slashes: {}",
                normalized_id
            );
            assert!(
                !normalized_id.contains('\\'),
                "ID should not contain backslashes: {}",
                normalized_id
            );
            assert!(
                !normalized_id.contains(':'),
                "ID should not contain colons: {}",
                normalized_id
            );
        }

        if url.contains("doc.rust-lang.org") || url.contains("docs.rs") {
            let doc_identifier = indexer.extract_doc_identifier(url);
            let original_id = format!("std-{}", doc_identifier);
            let normalized_id = indexer.normalize_document_id(&original_id);
            assert_eq!(
                normalized_id, expected_clean_id,
                "Doc ID should be clean for: {}",
                url
            );
            println!(" ✅ Docs: {} -> {}", url, normalized_id);
            assert!(
                normalized_id.len() < 100,
                "Doc ID should be reasonably short: {}",
                normalized_id
            );
            assert!(
                !normalized_id.contains('/'),
                "Doc ID should not contain slashes: {}",
                normalized_id
            );
            assert!(
                !normalized_id.contains('.'),
                "Doc ID should not contain dots: {}",
                normalized_id
            );
        }
    }

    println!("\n✅ All document IDs are now clean and filesystem-safe!");
    println!("🎯 OpenDAL persistence warnings should be eliminated");
}
/// Checks that crate names are normalized into short, storage-safe IDs.
///
/// Every name is prefixed with `crate-` and run through
/// `normalize_document_id`; the result must keep a `crate_` prefix, contain no
/// hyphens, stay under 50 bytes, and consist solely of ASCII letters, ASCII
/// digits and underscores.
#[test]
fn test_crate_name_normalization() {
    let indexer = QueryRsHaystackIndexer::default();
    let problematic_crate_names = [
        "caffe2-nomnigraph",
        "proc-macro2",
        "serde_json",
        "async-trait",
        "regex-lite",
    ];

    println!("🧪 Testing Crate Name Normalization");
    println!("==================================");

    for crate_name in problematic_crate_names {
        let original_id = format!("crate-{}", crate_name);
        let normalized_id = indexer.normalize_document_id(&original_id);
        println!(" Crate: {} -> {}", crate_name, normalized_id);

        assert!(
            !normalized_id.contains('-'),
            "Should not contain hyphens: {}",
            normalized_id
        );
        assert!(
            normalized_id.starts_with("crate_"),
            "Should start with crate_: {}",
            normalized_id
        );
        assert!(
            normalized_id.len() < 50,
            "Should be reasonably short: {}",
            normalized_id
        );

        // ASCII-only on purpose: `char::is_alphanumeric` would also accept
        // non-ASCII letters and digits, which is looser than "safe characters".
        let only_safe_chars = normalized_id
            .chars()
            .all(|c| c.is_ascii_alphanumeric() || c == '_');
        assert!(
            only_safe_chars,
            "Should only contain safe characters: {}",
            normalized_id
        );
    }

    println!("✅ All crate names properly normalized");
}
/// Exercises awkward inputs: a Reddit URL without the usual structure, a very
/// long URL, and a URL containing `@`, `#` and `?`.
///
/// The malformed Reddit URL is only reported, not asserted on. The long URL
/// must yield an identifier shorter than the URL itself, and the identifier
/// for the special-character URL must not contain `@`, `#` or `?`.
#[test]
fn test_edge_cases_and_fallbacks() {
    let indexer = QueryRsHaystackIndexer::default();
    println!("🧪 Testing Edge Cases and Fallbacks");
    println!("==================================");

    // Informational only: the test does not fail if a post ID is returned here.
    let malformed_reddit_url = "https://www.reddit.com/r/rust/malformed/url/structure/";
    let post_id = indexer.extract_reddit_post_id(malformed_reddit_url);
    if post_id.is_none() {
        println!(" ✅ Malformed Reddit URL correctly returns None, will use hash fallback");
    }

    // A long path must not be carried over verbatim into the identifier.
    let very_long_url = format!("https://example.com/{}", "very_long_path".repeat(20));
    let doc_identifier = indexer.extract_doc_identifier(&very_long_url);
    println!(
        " Long URL: {} chars -> {} chars",
        very_long_url.len(),
        doc_identifier.len()
    );
    assert!(
        doc_identifier.len() < very_long_url.len(),
        "Should be shorter than original"
    );

    // URL punctuation must be stripped or replaced by the extractor.
    let special_url = "https://example.com/path/with-special@chars#section?param=value";
    let special_identifier = indexer.extract_doc_identifier(special_url);
    println!(" Special chars: {} -> {}", special_url, special_identifier);
    assert!(
        !special_identifier.contains('@'),
        "Should not contain @ symbols"
    );
    assert!(
        !special_identifier.contains('#'),
        "Should not contain # symbols"
    );
    assert!(
        !special_identifier.contains('?'),
        "Should not contain ? symbols"
    );

    println!("✅ Edge cases handled properly with fallbacks");
}
/// Smoke-tests the speed of document ID generation.
///
/// Runs the extract-and-normalize path over a small fixed set of URLs
/// `ITERATIONS` times and requires the whole run to finish in under one
/// second of wall-clock time.
#[test]
fn test_performance_of_id_generation() {
    /// Number of passes over `test_urls`.
    const ITERATIONS: usize = 100;

    let indexer = QueryRsHaystackIndexer::default();
    let test_urls = [
        "https://www.reddit.com/r/rust/comments/test123/benchmark_post/",
        "https://doc.rust-lang.org/std/collections/HashMap.html",
        "https://docs.rs/serde/latest/serde/",
    ];

    let start_time = std::time::Instant::now();
    for _ in 0..ITERATIONS {
        for url in &test_urls {
            if url.contains("reddit.com") {
                if let Some(post_id) = indexer.extract_reddit_post_id(url) {
                    let original_id = format!("reddit-{}", post_id);
                    let _normalized = indexer.normalize_document_id(&original_id);
                }
            } else {
                let doc_id = indexer.extract_doc_identifier(url);
                let original_id = format!("doc-{}", doc_id);
                let _normalized = indexer.normalize_document_id(&original_id);
            }
        }
    }
    let duration = start_time.elapsed();

    // Derive the reported count from the loop bounds so the message cannot
    // drift when the URL list or the iteration count changes.
    println!(
        "🚀 Generated {} document IDs in {:?}",
        ITERATIONS * test_urls.len(),
        duration
    );
    assert!(
        duration.as_millis() < 1000,
        "ID generation should be fast: {:?}",
        duration
    );
    println!("✅ Document ID generation is performant");
}