// lucisearch 0.8.0

//! Integration tests for the analysis pipeline feature.
//!
//! Tests custom analyzer configuration, search_analyzer resolution,
//! and end-to-end indexing+search with custom analysis chains.

use luci::analysis::config::AnalysisConfig;
use luci::index::Index;
use luci::mapping::Mapping;
use serde_json::json;

/// Test helper: parses `query` (forwarding `size` to the parser) and runs the
/// resulting expression against `index`.
///
/// Both the parse step and the search itself are unwrapped, so any error
/// panics and fails the calling test.
fn search(
    index: &mut Index,
    query: serde_json::Value,
    size: usize,
) -> luci::search::results::SearchResults {
    let parsed = luci::search::expression::parse_search(query, size).unwrap();
    let outcome = index.search(&parsed);
    outcome.unwrap()
}

/// Returns the path of a scratch file called `name` inside the
/// `luci_analysis_tests` directory under the system temp dir.
///
/// The directory is created on a best-effort basis: a failure to create it is
/// deliberately ignored here.
fn temp_path(name: &str) -> std::path::PathBuf {
    let mut target = std::env::temp_dir();
    target.push("luci_analysis_tests");
    // Result intentionally discarded; the path is returned regardless.
    let _ = std::fs::create_dir_all(&target);
    target.push(name);
    target
}

/// Best-effort removal of the file at `path`.
///
/// Any error from the removal (for example, the file not existing) is
/// discarded, so this is safe to call both before and after a test.
fn cleanup(path: &std::path::Path) {
    std::fs::remove_file(path).ok();
}

/// A custom analyzer (standard tokenizer + lowercase + asciifolding) assigned
/// to a text field lets unaccented query terms match accented indexed text.
#[test]
fn custom_analyzer_basic() {
    let index_file = temp_path("custom_analyzer_basic.luci");
    cleanup(&index_file);

    let analysis_config = AnalysisConfig::from_json(&json!({
        "analyzer": {
            "my_analyzer": {
                "type": "custom",
                "tokenizer": "standard",
                "filter": ["lowercase", "asciifolding"]
            }
        }
    }))
    .unwrap();
    let field_mapping = Mapping::from_json(&json!({
        "properties": {
            "title": {"type": "text", "analyzer": "my_analyzer"}
        }
    }))
    .unwrap();

    let mut index =
        Index::create_with_settings(&index_file, field_mapping, Some(analysis_config)).unwrap();
    index.add(json!({"title": "Café résumé"})).unwrap();

    // Each unaccented term must find the single indexed document, because
    // asciifolding normalizes "café" → "cafe" and "résumé" → "resume".
    let cases = [
        ("cafe", "asciifolding should match 'cafe' to 'café'"),
        ("resume", "asciifolding should match 'resume' to 'résumé'"),
    ];
    for &(term, why) in &cases {
        let hits = search(
            &mut index,
            json!({"query": {"match": {"title": term}}}),
            10,
        );
        assert_eq!(hits.len(), 1, "{}", why);
    }

    cleanup(&index_file);
}

/// The analysis settings supplied at creation time must still be in effect
/// after the index is dropped and opened again from the same path.
#[test]
fn custom_analyzer_persisted_on_reopen() {
    let index_file = temp_path("custom_analyzer_persist.luci");
    cleanup(&index_file);

    let analysis_config = AnalysisConfig::from_json(&json!({
        "analyzer": {
            "folding": {
                "tokenizer": "standard",
                "filter": ["lowercase", "asciifolding"]
            }
        }
    }))
    .unwrap();
    let field_mapping = Mapping::from_json(&json!({
        "properties": {
            "title": {"type": "text", "analyzer": "folding"}
        }
    }))
    .unwrap();

    // Phase 1: create the index, add one document, then release the handle.
    let writer =
        Index::create_with_settings(&index_file, field_mapping, Some(analysis_config)).unwrap();
    writer.add(json!({"title": "Über straße"})).unwrap();
    drop(writer);

    // Phase 2: reopen from disk and query with folded (ASCII) terms.
    let mut reopened = Index::open(&index_file).unwrap();
    let cases = [
        ("uber", "custom analyzer should survive reopen"),
        ("strasse", "asciifolding should fold ß → ss on reopen"),
    ];
    for &(term, why) in &cases {
        let hits = search(
            &mut reopened,
            json!({"query": {"match": {"title": term}}}),
            10,
        );
        assert_eq!(hits.len(), 1, "{}", why);
    }
    // Release the reopened handle before the file is removed.
    drop(reopened);

    cleanup(&index_file);
}

/// Autocomplete setup: the field is indexed through a custom `edge_ngram`
/// tokenizer and queried through a separate standard-tokenizer analyzer, so a
/// typed prefix is looked up as a single term.
#[test]
fn edge_ngram_autocomplete() {
    let index_file = temp_path("edge_ngram_autocomplete.luci");
    cleanup(&index_file);

    let analysis_config = AnalysisConfig::from_json(&json!({
        "tokenizer": {
            "autocomplete_tok": {
                "type": "edge_ngram",
                "min_gram": 2,
                "max_gram": 10,
                "token_chars": ["letter", "digit"]
            }
        },
        "analyzer": {
            "autocomplete": {
                "tokenizer": "autocomplete_tok",
                "filter": ["lowercase"]
            },
            "autocomplete_search": {
                "tokenizer": "standard",
                "filter": ["lowercase"]
            }
        }
    }))
    .unwrap();
    let field_mapping = Mapping::from_json(&json!({
        "properties": {
            "name": {
                "type": "text",
                "analyzer": "autocomplete",
                "search_analyzer": "autocomplete_search"
            }
        }
    }))
    .unwrap();

    let mut index =
        Index::create_with_settings(&index_file, field_mapping, Some(analysis_config)).unwrap();
    for &name in &["Elasticsearch", "Elastic Cloud", "Python"] {
        index.add(json!({"name": name})).unwrap();
    }

    // (query prefix, expected hit count, failure message)
    // "elast" hits both Elastic* docs via the indexed edge n-grams
    // ("el", "ela", "elas", "elast", ...); "py" hits only Python; "xyz" hits
    // nothing.
    let cases = [
        ("elast", 2, "prefix 'elast' should match both Elastic* docs"),
        ("py", 1, "prefix 'py' should match Python"),
        ("xyz", 0, "'xyz' should match nothing"),
    ];
    for &(prefix, expected, why) in &cases {
        let hits = search(
            &mut index,
            json!({"query": {"match": {"name": prefix}}}),
            10,
        );
        assert_eq!(hits.len(), expected, "{}", why);
    }

    cleanup(&index_file);
}

/// A custom `synonym` token filter is wired into the field's analyzer; the
/// test queries with a term from an equivalence group ("quick, fast, speedy")
/// and with the target of an explicit mapping ("big => large").
#[test]
fn synonym_expansion() {
    let index_file = temp_path("synonym_expansion.luci");
    cleanup(&index_file);

    let analysis_config = AnalysisConfig::from_json(&json!({
        "filter": {
            "my_synonyms": {
                "type": "synonym",
                "synonyms": [
                    "quick, fast, speedy",
                    "big => large"
                ]
            }
        },
        "analyzer": {
            "syn_analyzer": {
                "tokenizer": "standard",
                "filter": ["lowercase", "my_synonyms"]
            }
        }
    }))
    .unwrap();
    let field_mapping = Mapping::from_json(&json!({
        "properties": {
            "description": {"type": "text", "analyzer": "syn_analyzer"}
        }
    }))
    .unwrap();

    let mut index =
        Index::create_with_settings(&index_file, field_mapping, Some(analysis_config)).unwrap();
    index
        .add(json!({"description": "The quick brown fox"}))
        .unwrap();
    index.add(json!({"description": "A big red car"})).unwrap();

    // Each query term never appears literally in the documents; it can only
    // match through the synonym rules above.
    let cases = [
        ("fast", "'fast' should match via synonym of 'quick'"),
        ("large", "'large' should match via synonym mapping from 'big'"),
    ];
    for &(term, why) in &cases {
        let hits = search(
            &mut index,
            json!({"query": {"match": {"description": term}}}),
            10,
        );
        assert_eq!(hits.len(), 1, "{}", why);
    }

    cleanup(&index_file);
}

/// An analyzer with the `html_strip` char filter is applied to a field holding
/// HTML markup; a plain-text query must still find the document.
#[test]
fn html_strip_char_filter() {
    let index_file = temp_path("html_strip.luci");
    cleanup(&index_file);

    let analysis_config = AnalysisConfig::from_json(&json!({
        "analyzer": {
            "html_analyzer": {
                "char_filter": ["html_strip"],
                "tokenizer": "standard",
                "filter": ["lowercase"]
            }
        }
    }))
    .unwrap();
    let field_mapping = Mapping::from_json(&json!({
        "properties": {
            "body": {"type": "text", "analyzer": "html_analyzer"}
        }
    }))
    .unwrap();

    let mut index =
        Index::create_with_settings(&index_file, field_mapping, Some(analysis_config)).unwrap();
    let html_doc = json!({"body": "<p>Hello <b>World</b> &amp; friends</p>"});
    index.add(html_doc).unwrap();

    // Query with the visible words only — no tags, no entities.
    let plain_query = json!({"query": {"match": {"body": "hello world"}}});
    let hits = search(&mut index, plain_query, 10);
    assert_eq!(
        hits.len(),
        1,
        "HTML tags should be stripped before indexing"
    );

    cleanup(&index_file);
}

/// A field may name one analyzer for indexing and a different one for
/// searching; a basic match query must still resolve and find the document.
#[test]
fn search_analyzer_divergence() {
    let path = temp_path("search_analyzer_div.luci");
    cleanup(&path);

    // Two separately named analyzers are registered and wired to the field as
    // `analyzer` (index time) and `search_analyzer` (query time).
    //
    // NOTE: both are currently configured identically (standard tokenizer +
    // lowercase). This test therefore only verifies that a distinct
    // `search_analyzer` name is resolved and usable; it does not exercise a
    // real behavioral difference between the two chains (e.g. a stop filter on
    // one side only).
    let analysis = json!({
        "analyzer": {
            "index_analyzer": {
                "tokenizer": "standard",
                "filter": ["lowercase"]
            },
            "search_analyzer": {
                "tokenizer": "standard",
                "filter": ["lowercase"]
            }
        }
    });
    let config = AnalysisConfig::from_json(&analysis).unwrap();
    let mapping = Mapping::from_json(&json!({
        "properties": {
            "title": {
                "type": "text",
                "analyzer": "index_analyzer",
                "search_analyzer": "search_analyzer"
            }
        }
    }))
    .unwrap();

    let mut index = Index::create_with_settings(&path, mapping, Some(config)).unwrap();
    index.add(json!({"title": "hello world"})).unwrap();

    let results = search(
        &mut index,
        json!({"query": {"match": {"title": "hello"}}}),
        10,
    );
    assert_eq!(
        results.len(),
        1,
        "field with a separately named search_analyzer should still match"
    );

    cleanup(&path);
}

/// Regression test for [[investigation-20260405-06-match-bool-prefix-analyzer]].
///
/// match_bool_prefix used a fresh AnalyzerRegistry::new() instead of the
/// searcher's, so custom analyzers configured on the index were invisible.
/// It also ignored field-configured analyzers.
#[test]
fn match_bool_prefix_uses_field_analyzer() {
    let index_file = temp_path("match_bool_prefix_analyzer.luci");
    cleanup(&index_file);

    // "folding" applies asciifolding: "café" → "cafe", "résumé" → "resume".
    let analysis_config = AnalysisConfig::from_json(&json!({
        "analyzer": {
            "folding": {
                "tokenizer": "standard",
                "filter": ["lowercase", "asciifolding"]
            }
        }
    }))
    .unwrap();
    let field_mapping = Mapping::from_json(&json!({
        "properties": {
            "title": {"type": "text", "analyzer": "folding"}
        }
    }))
    .unwrap();

    let mut index =
        Index::create_with_settings(&index_file, field_mapping, Some(analysis_config)).unwrap();
    // Stored through the folding analyzer, so "Résumé" is indexed as "resume".
    index.add(json!({"title": "Résumé Writer"})).unwrap();

    // A single accented term: as the last (and only) token it is turned into
    // a prefix query.
    //
    // Buggy behavior: the standard analyzer leaves "résumé" untouched, giving
    //   prefix("résumé"); the index only holds "resume" → 0 hits.
    // Fixed behavior: the field's folding analyzer yields "resume", giving
    //   prefix("resume"), which matches the indexed term → 1 hit.
    let accented_query = json!({"query": {"match_bool_prefix": {"title": "résumé"}}});
    let hits = search(&mut index, accented_query, 10);
    assert_eq!(
        hits.len(),
        1,
        "match_bool_prefix must use field's folding analyzer to match accented input against folded index"
    );

    cleanup(&index_file);
}

/// An index created from a mapping alone, with no analysis settings, must
/// still index and match text as before.
#[test]
fn no_settings_backward_compatible() {
    let index_file = temp_path("no_settings.luci");
    cleanup(&index_file);

    // Plain text field, no analyzer named.
    let field_mapping = Mapping::from_json(&json!({
        "properties": {
            "title": {"type": "text"}
        }
    }))
    .unwrap();

    let mut index = Index::create_with_mapping(&index_file, field_mapping).unwrap();
    index.add(json!({"title": "Hello World"})).unwrap();

    // Lowercase query against mixed-case indexed text.
    let lowercase_query = json!({"query": {"match": {"title": "hello"}}});
    let hits = search(&mut index, lowercase_query, 10);
    assert_eq!(hits.len(), 1);

    cleanup(&index_file);
}