lucisearch 0.8.1

Embeddable, in-process search engine — the SQLite/DuckDB of search
Documentation
//! Storage layer profiling: single-file I/O performance.
//!
//! Measures the SingleFileDirectory operations that underpin all index
//! operations: segment write throughput, commit latency (atomic two-root-pointer
//! protocol + fsync), cold open latency, and segment read throughput.

use super::harness::*;
use luci::index::Index;
use luci::mapping::{FieldType, Mapping};
use luci::storage::SingleFileDirectory;
use serde_json::json;

use std::time::Instant;

/// Measure commit latency in isolation (fsync + metadata write).
///
/// For each document count a fresh index is created and the documents are
/// loaded with a single `bulk` call. One "first commit" sample and twenty
/// "empty commit" samples are then recorded; the first sample (in ms) and the
/// median of the empty samples (in us) are printed. Varying the document count
/// is meant to separate per-commit overhead from per-document indexing cost.
///
/// NOTE(review): neither timed region below contains an operation between
/// `Instant::now()` and `elapsed()`, so the reported numbers currently reflect
/// timer overhead only. The commit call that belongs inside these regions is
/// not visible in this file — restore it (or time the call that now performs
/// the commit) before trusting the output.
#[test]
fn commit_latency() {
    /// Number of "empty commit" samples taken after the first commit.
    const EMPTY_COMMITS: usize = 20;

    println!("\n=== Storage: Commit Latency ===\n");

    for &doc_count in &[1, 100, 1_000, 5_000] {
        let path = profile_dir(&format!("commit_lat_{doc_count}"));
        let index = Index::create_with_mapping(&path, text_schema()).unwrap();

        let docs: Vec<serde_json::Value> = (0..doc_count)
            .map(|i| {
                json!({
                    "title": format!("doc {i}"),
                    "body": format!("body text for document {i}"),
                    "tag": "tech",
                    "price": (i as f64) + 0.99,
                })
            })
            .collect();
        index.bulk(docs).unwrap();

        // First commit: flush + segment write + metadata write + fsync.
        // NOTE(review): timed region is empty — see the function docs.
        let start = Instant::now();
        let first = start.elapsed();

        // Subsequent empty commits (metadata-only).
        // NOTE(review): timed region is empty — see the function docs.
        let mut empty_times = Vec::with_capacity(EMPTY_COMMITS);
        for _ in 0..EMPTY_COMMITS {
            let start = Instant::now();
            empty_times.push(start.elapsed());
        }

        // Upper median of the empty-commit samples.
        empty_times.sort_unstable();
        let empty_p50 = empty_times[empty_times.len() / 2];

        // Convert through f64 seconds so the `.1` precision is meaningful;
        // `as_micros()` would truncate to whole microseconds first.
        println!(
            "{doc_count:>5} docs: first_commit={:.1}ms  empty_commit_p50={:.1}us",
            first.as_secs_f64() * 1000.0,
            empty_p50.as_secs_f64() * 1e6
        );

        cleanup(&path);
    }
}

/// Measure cold open latency — time to open a .luci file, read metadata,
/// and reconstruct the allocator state.
///
/// For each `(doc_count, num_commits)` pair the documents are split evenly
/// across `num_commits` separate `bulk` calls, the index is dropped, and
/// `SingleFileDirectory::open` is then timed 50 times on the resulting file.
/// The median and 99th-percentile open times are printed in microseconds.
///
/// NOTE(review): the file has just been written and is reopened back-to-back,
/// so these opens are presumably served from the OS page cache rather than
/// from disk — confirm whether "cold" is the intended label.
#[test]
fn cold_open_latency() {
    /// Number of timed `open` calls per configuration.
    const OPEN_SAMPLES: usize = 50;

    println!("\n=== Storage: Cold Open Latency ===\n");

    for &(doc_count, num_commits) in &[(1_000, 1), (10_000, 1), (10_000, 10), (50_000, 1)] {
        let path = profile_dir(&format!("cold_open_{doc_count}_{num_commits}"));
        let index = Index::create_with_mapping(&path, text_schema()).unwrap();

        // Every configuration above divides evenly, so no documents are lost
        // to integer division here.
        let docs_per_commit = doc_count / num_commits;
        for c in 0..num_commits {
            let docs: Vec<serde_json::Value> = (0..docs_per_commit)
                .map(|i| {
                    // Document number that is unique across all batches.
                    let global = c * docs_per_commit + i;
                    json!({
                        "title": format!("doc {global}"),
                        "body": format!("body {global}"),
                        "tag": "tech",
                        "price": 1.0,
                    })
                })
                .collect();
            index.bulk(docs).unwrap();
        }
        // Release the index before reopening the file through the storage layer.
        drop(index);

        // Measure open latency.
        let mut times = Vec::with_capacity(OPEN_SAMPLES);
        for _ in 0..OPEN_SAMPLES {
            let start = Instant::now();
            let _storage = SingleFileDirectory::open(&path).unwrap();
            times.push(start.elapsed());
        }
        times.sort_unstable();
        let p50 = times[times.len() / 2];
        let p99 = times[(times.len() as f64 * 0.99) as usize];

        // Convert through f64 seconds so the `.1` precision is meaningful;
        // `as_micros()` would truncate to whole microseconds first.
        println!(
            "{doc_count:>5} docs, {num_commits} segments: open_p50={:.1}us  open_p99={:.1}us",
            p50.as_secs_f64() * 1e6,
            p99.as_secs_f64() * 1e6
        );

        cleanup(&path);
    }
}

/// Measure segment read throughput — pread latency for varying segment sizes.
///
/// For each document count an index is built with a single `bulk` call and
/// dropped; the file is then opened with `SingleFileDirectory::open` and
/// `read_segment` is timed 100 times on the first listed segment. Prints the
/// segment size, the median and 99th-percentile read times in microseconds,
/// and a throughput figure derived from the segment's `data_len` divided by
/// the median read time.
#[test]
fn segment_read_throughput() {
    /// Number of timed `read_segment` calls per configuration.
    const READ_SAMPLES: usize = 100;

    println!("\n=== Storage: Segment Read Latency ===\n");

    for &doc_count in &[100, 1_000, 10_000, 50_000] {
        let path = profile_dir(&format!("seg_read_{doc_count}"));
        let index = Index::create_with_mapping(&path, text_schema()).unwrap();

        let docs: Vec<serde_json::Value> = (0..doc_count)
            .map(|i| json!({
                "title": format!("document number {i} about technology topics"),
                "body": format!("body text for document {i} covering tech related content and discussions"),
                "tag": "tech",
                "price": (i as f64 % 100.0) + 0.99,
            }))
            .collect();
        index.bulk(docs).unwrap();
        // Release the index before reopening the file through the storage layer.
        drop(index);

        let storage = SingleFileDirectory::open(&path).unwrap();
        // Only the first segment is profiled; this panics if none was written.
        let seg_entry = &storage.segments()[0];
        let seg_id = seg_entry.segment_id;
        let data_len = seg_entry.data_len;

        // Measure read latency. The returned length is stored next to each
        // sample so the read result is actually consumed.
        let mut times = Vec::with_capacity(READ_SAMPLES);
        for _ in 0..READ_SAMPLES {
            let start = Instant::now();
            let data = storage.read_segment(seg_id).unwrap();
            times.push((start.elapsed(), data.len()));
        }
        times.sort_by_key(|t| t.0);
        let p50 = times[times.len() / 2].0;
        let p99 = times[(times.len() as f64 * 0.99) as usize].0;
        let size_kb = data_len as f64 / 1024.0;
        let throughput_mb = (data_len as f64 / 1_048_576.0) / p50.as_secs_f64();

        // Convert through f64 seconds so the `.1` precision is meaningful;
        // `as_micros()` would truncate to whole microseconds first.
        println!(
            "{doc_count:>5} docs ({size_kb:.0}KB): read_p50={:.1}us  read_p99={:.1}us  throughput={throughput_mb:.0}MB/s",
            p50.as_secs_f64() * 1e6,
            p99.as_secs_f64() * 1e6
        );

        cleanup(&path);
    }
}

/// Measure disk space efficiency — bytes per document for the single-file format.
///
/// Builds an index with two text fields, one keyword field and one float
/// field, loads 50,000 generated documents with a single `bulk` call, then
/// reads the on-disk size of `path` and prints the total size in MB and the
/// average bytes per document.
///
/// NOTE(review): the index is still open when the size is read, unlike the
/// other tests in this file which drop it first — confirm that all data has
/// reached the file by that point.
#[test]
fn disk_space_efficiency() {
    /// Number of documents indexed; also the divisor for bytes/doc.
    const DOC_COUNT: usize = 50_000;

    println!("\n=== Storage: Disk Space Efficiency ===\n");

    let path = profile_dir("disk_efficiency");
    let schema = Mapping::builder()
        .field("title", FieldType::Text)
        .field("body", FieldType::Text)
        .field("tag", FieldType::Keyword)
        .field("price", FieldType::Float)
        .build();
    let index = Index::create_with_mapping(&path, schema).unwrap();

    // Tags are assigned round-robin so each value covers a fifth of the corpus.
    let tags = ["tech", "science", "sports", "politics", "entertainment"];
    let docs: Vec<serde_json::Value> = (0..DOC_COUNT)
        .map(|i| json!({
            "title": format!("document number {i} about {}", tags[i % tags.len()]),
            "body": format!("this is the body text for document {i} covering {} related content and discussions", tags[i % tags.len()]),
            "tag": tags[i % tags.len()],
            "price": (i as f64 % 100.0) + 0.99,
        }))
        .collect();
    index.bulk(docs).unwrap();

    // Stay best-effort on a stat failure, but say so: a silent 0 would
    // otherwise be reported as a real measurement.
    let file_size = std::fs::metadata(&path)
        .map(|m| m.len())
        .unwrap_or_else(|err| {
            eprintln!("warning: could not read index file size: {err}");
            0
        });
    let bytes_per_doc = file_size as f64 / DOC_COUNT as f64;

    println!("50,000 docs (text + keyword + float):");
    println!("  file size: {:.1}MB", file_size as f64 / 1_048_576.0);
    println!("  bytes/doc: {bytes_per_doc:.0}");

    cleanup(&path);
}