// lucisearch 0.8.1
//
// Embeddable, in-process search engine — the SQLite/DuckDB of search
// Documentation
//! Integration tests for search highlighting.
//!
//! The engine-side API is `hit.highlight(field) -> Option<Vec<Highlight>>`
//! returning structured match spans. No XML/HTML tag wrapping — the
//! consumer decides presentation. See [[feature-search-highlight]] and
//! [[architecture-scoring-materialization-separation]].

use luci::index::Index;
use luci::mapping::{FieldType, Mapping};
use luci::search::expression::parse_search;
use luci::search::highlight::{Highlight, HighlightConfig, HighlightFieldConfig, HighlightOrder};
use serde_json::json;

/// Returns the scratch directory path for the test called `name`.
///
/// The path sits under the system temp directory and embeds the current
/// process id. Anything already present at that path is removed on a
/// best-effort basis (errors are ignored); the directory itself is not
/// created here.
fn test_dir(name: &str) -> std::path::PathBuf {
    let pid = std::process::id();
    let mut dir = std::env::temp_dir();
    dir.push(format!("luci_highlight_{pid}_{name}"));
    // Wipe leftovers from an earlier run; a missing directory is not an error.
    std::fs::remove_dir_all(&dir).ok();
    dir
}

/// Best-effort removal of the scratch directory at `path`.
///
/// Any error from the removal (for example, the directory not existing)
/// is deliberately discarded.
fn cleanup(path: &std::path::Path) {
    std::fs::remove_dir_all(path).ok();
}

/// Creates a fresh index at the scratch path derived from `name` and loads
/// the three-document fixture corpus used throughout this file.
///
/// The mapping declares `title` and `body` as text fields. Returns the index
/// together with its on-disk path so the caller can hand it to `cleanup`.
/// Panics if index creation or the bulk insert fails.
fn setup_index(name: &str) -> (Index, std::path::PathBuf) {
    // (title, body) pairs, in insertion order.
    const FIXTURE: [(&str, &str); 3] = [
        (
            "Search Engine Architecture",
            "A search engine indexes documents for fast retrieval.",
        ),
        (
            "Database Design",
            "Databases store data in tables. Search is done via SQL queries.",
        ),
        (
            "Information Retrieval",
            "Modern search engines use inverted indexes and BM25 scoring.",
        ),
    ];

    let path = test_dir(name);

    let mapping = Mapping::builder()
        .field("title", FieldType::Text)
        .field("body", FieldType::Text)
        .build();
    let index = Index::create_with_mapping(&path, mapping).unwrap();

    let docs: Vec<_> = FIXTURE
        .iter()
        .map(|&(title, body)| json!({ "title": title, "body": body }))
        .collect();
    index.bulk(docs).unwrap();

    (index, path)
}

/// Helper: parse and run `query`, then return the highlight spans that the
/// first hit reports for `field`.
///
/// Yields `None` when there is no first hit or when that hit returns no
/// highlight for `field`. Panics if parsing or searching fails.
fn spans_for(index: &Index, query: serde_json::Value, field: &str) -> Option<Vec<Highlight>> {
    let parsed = parse_search(query, 10).unwrap();
    let results = index.search(&parsed).unwrap();
    results.hit(0).and_then(|first| first.highlight(field))
}

/// A plain `match` query on `body` must produce at least one span, and each
/// span must cover the word "search" (any ASCII case) over a non-empty range.
#[test]
fn basic_match_returns_spans() {
    let (index, path) = setup_index("basic");
    let query = json!({"query": {"match": {"body": "search"}}});
    let spans = spans_for(&index, query, "body").expect("expected spans");

    assert!(!spans.is_empty());
    for span in &spans {
        assert!(
            span.text.eq_ignore_ascii_case("search"),
            "unexpected match text: {:?}",
            span.text
        );
        assert!(span.end > span.start);
    }

    cleanup(&path);
}

/// A two-word `match` query must yield a separate span for each word.
#[test]
fn multi_term_highlights_each_term_independently() {
    let (index, path) = setup_index("multi_terms");
    let query = json!({"query": {"match": {"body": "search engine"}}});
    let spans = spans_for(&index, query, "body").unwrap();

    let texts: Vec<&str> = spans.iter().map(|span| span.text.as_str()).collect();
    // Same checks, same order: first "search", then "engine".
    for needle in ["search", "engine"] {
        assert!(
            texts.iter().any(|t| t.eq_ignore_ascii_case(needle)),
            "expected a span for '{needle}', got {texts:?}"
        );
    }

    cleanup(&path);
}

/// Spans for a field must come back sorted by ascending start offset.
///
/// The query contains two words that both occur in the top hit's body, so at
/// least two spans are required; with fewer, the ordering comparison below
/// would pass without checking anything.
#[test]
fn spans_are_in_positional_order() {
    let (index, path) = setup_index("ordered");
    let spans = spans_for(
        &index,
        json!({"query": {"match": {"body": "search engine"}}}),
        "body",
    )
    .unwrap();

    // Guard against a vacuous pass: zero or one span is trivially "sorted".
    assert!(
        spans.len() >= 2,
        "need at least two spans to check ordering, got {}",
        spans.len()
    );

    let starts: Vec<usize> = spans.iter().map(|h| h.start).collect();
    let mut sorted = starts.clone();
    sorted.sort_unstable();
    assert_eq!(starts, sorted, "spans must be positionally ordered");
    cleanup(&path);
}

/// Each span's `start..end` range must slice the original field text to
/// exactly the span's `text`.
///
/// `body` below is the first fixture document's body; the test relies on that
/// document being the first hit for this query.
#[test]
fn span_offsets_locate_the_match_in_field_text() {
    let (index, path) = setup_index("offsets");
    let spans = spans_for(
        &index,
        json!({"query": {"match": {"body": "search"}}}),
        "body",
    )
    .unwrap();

    // Without this the loop below would pass on an empty span list.
    assert!(
        !spans.is_empty(),
        "expected at least one span to verify offsets against"
    );

    let body = "A search engine indexes documents for fast retrieval.";
    for hl in &spans {
        // Checked slicing: an out-of-range or non-char-boundary offset becomes
        // a readable assertion failure instead of a bare slicing panic.
        assert_eq!(
            body.get(hl.start..hl.end),
            Some(hl.text.as_str()),
            "span {}..{} does not locate {:?} in the field text",
            hl.start,
            hl.end,
            hl.text
        );
    }
    cleanup(&path);
}

/// A `match_phrase` query must highlight each word of the phrase.
#[test]
fn phrase_query_highlights_constituent_terms() {
    let (index, path) = setup_index("phrase");
    let query = json!({"query": {"match_phrase": {"body": "search engine"}}});
    let spans = spans_for(&index, query, "body").unwrap();

    // True when some span's lowercased text equals `term`.
    let has_term = |term: &str| spans.iter().any(|span| span.text.to_lowercase() == term);
    assert!(has_term("search"));
    assert!(has_term("engine"));

    cleanup(&path);
}

/// A `term` query must highlight the queried value and nothing else.
///
/// The span list must be non-empty: `Iterator::all` is true for an empty
/// iterator, so without the guard this test would pass with no highlights.
#[test]
fn term_query_highlights_exact_value() {
    let (index, path) = setup_index("term");
    let spans = spans_for(
        &index,
        json!({"query": {"term": {"body": "search"}}}),
        "body",
    )
    .unwrap();

    assert!(
        !spans.is_empty(),
        "term query produced no highlight spans"
    );
    for h in &spans {
        assert!(
            h.text.eq_ignore_ascii_case("search"),
            "unexpected match text: {:?}",
            h.text
        );
    }
    cleanup(&path);
}

/// Highlighting a field the query did not target must yield no spans
/// (either `None` or an empty list).
#[test]
fn require_field_match_restricts_to_query_fields() {
    let (index, path) = setup_index("rfm_true");
    // The query matches on body, so asking for title spans should come back empty.
    let query = json!({"query": {"match": {"body": "search"}}});
    let title_spans = spans_for(&index, query, "title").unwrap_or_default();
    assert!(title_spans.is_empty());
    cleanup(&path);
}

/// With `require_field_match` set to `false`, a field the query did not
/// target (`title`) must still receive spans for the queried word.
#[test]
fn require_field_match_false_highlights_any_field() {
    let (index, path) = setup_index("rfm_false");

    let query = json!({"query": {"match": {"body": "search"}}});
    let parsed = parse_search(query, 10).unwrap();
    let results = index.search(&parsed).unwrap();
    let first = results.hit(0).unwrap();

    let title_only = HighlightFieldConfig {
        field: String::from("title"),
        fragment_size: 0,
        number_of_fragments: 0,
    };
    let config = HighlightConfig {
        fields: vec![title_only],
        require_field_match: false,
        order: HighlightOrder::None,
    };

    let by_field = first.highlight_with_config(&config).unwrap();
    let title_spans = by_field.get("title").expect("title should have spans");
    let found_search = title_spans
        .iter()
        .any(|span| span.text.to_lowercase() == "search");
    assert!(found_search);

    cleanup(&path);
}

/// In a `bool` query with one `must` clause per field, each field must be
/// highlighted with the word from its own clause.
#[test]
fn bool_query_highlights_each_clauses_field() {
    let (index, path) = setup_index("bool");

    let query = json!({
        "query": {
            "bool": {
                "must": [
                    {"match": {"body": "search"}},
                    {"match": {"title": "architecture"}}
                ]
            }
        }
    });
    let parsed = parse_search(query, 10).unwrap();
    let results = index.search(&parsed).unwrap();
    let first = results.hit(0).unwrap();

    // A missing highlight is treated as an empty span list.
    let body_spans = first.highlight("body").unwrap_or_default();
    let title_spans = first.highlight("title").unwrap_or_default();

    let body_ok = body_spans
        .iter()
        .any(|span| span.text.to_lowercase() == "search");
    assert!(body_ok);

    let title_ok = title_spans
        .iter()
        .any(|span| span.text.to_lowercase() == "architecture");
    assert!(title_ok);

    cleanup(&path);
}

/// Span text must keep the original casing of the field value, and span
/// offsets must index into that original text.
///
/// The query is lowercase ("search") while the only title containing the
/// word spells it "Search", so the span text must be exactly "Search".
#[test]
fn preserves_case_of_matched_text() {
    let (index, path) = setup_index("case");
    // "Search" appears capitalized in title.
    let spans = spans_for(
        &index,
        json!({"query": {"match": {"title": "search"}}}),
        "title",
    )
    .unwrap();

    // Require the capitalized form explicitly. Accepting either 'S' or 's'
    // would also pass for lowercased text and so would not test case
    // preservation at all.
    assert!(
        spans.iter().any(|h| h.text == "Search"),
        "expected a span with the original casing \"Search\""
    );

    // And offsets must point at the original case-preserved text.
    let title = "Search Engine Architecture";
    for hl in &spans {
        // Checked slicing turns a bad offset into a readable failure.
        assert_eq!(
            title.get(hl.start..hl.end),
            Some(hl.text.as_str()),
            "span {}..{} does not locate {:?} in the title",
            hl.start,
            hl.end,
            hl.text
        );
    }
    cleanup(&path);
}

/// Asking for highlights on a field name that is not in the mapping must
/// return `None`.
#[test]
fn missing_field_returns_none() {
    let (index, path) = setup_index("missing");
    let query = json!({"query": {"match": {"body": "search"}}});
    let outcome = spans_for(&index, query, "nonexistent_field");
    assert!(outcome.is_none());
    cleanup(&path);
}

/// When the highlighted field has nothing to highlight, the result may be
/// either `None` or an empty list; both are accepted.
#[test]
fn no_matches_returns_none_or_empty() {
    let (index, path) = setup_index("no_matches");
    // The query targets `title` and produces a hit, but spans are requested
    // for `body`, a field the query never referenced.
    let query = json!({"query": {"match": {"title": "search"}}});
    let spans = spans_for(&index, query, "body");

    // `None` counts as zero spans.
    let span_count = spans.map_or(0, |found| found.len());
    assert!(span_count == 0);

    cleanup(&path);
}

/// Highlighting `body` with `number_of_fragments: 1` and `fragment_size: 50`
/// must still return at least one span.
///
/// NOTE(review): despite the test name, only non-emptiness is asserted here;
/// the cap on the span count is not checked. Confirm the intended
/// fragment semantics before tightening.
#[test]
fn number_of_fragments_caps_span_count() {
    let (index, path) = setup_index("cap");

    let query = json!({"query": {"match": {"body": "search"}}});
    let parsed = parse_search(query, 10).unwrap();
    let results = index.search(&parsed).unwrap();
    let first = results.hit(0).unwrap();

    let body_capped = HighlightFieldConfig {
        field: String::from("body"),
        fragment_size: 50,
        number_of_fragments: 1,
    };
    let config = HighlightConfig {
        fields: vec![body_capped],
        require_field_match: true,
        order: HighlightOrder::None,
    };

    // No result at all, or no entry for "body", both count as "no spans".
    let spans = match first.highlight_with_config(&config) {
        Some(by_field) => by_field.get("body").cloned().unwrap_or_default(),
        None => Default::default(),
    };

    // With number_of_fragments=1, we should get only spans from one fragment window.
    assert!(!spans.is_empty());
    cleanup(&path);
}