dakera-inference 0.11.76

Embedded inference engine for Dakera - generates embeddings locally via ONNX Runtime
Documentation
//! Integration tests for `crates/inference` extraction and NER pipelines.
//!
//! Covers four categories of regression, plus checks for other entity types and `to_tag`:
//!
//! **Tier-1a — ISO date extraction** (`rule_based_extract`, iso_date regex)
//! Tests that explicit YYYY-MM-DD patterns are reliably extracted as "date" entities.
//!
//! **Tier-1b — Natural language date extraction** (natural_date regex)
//! Tests that "Month Day, Year" and "Month Day" patterns are extracted.
//!
//! **Relative-time exclusion (Tier-1c/2)**
//! Verifies that relative time expressions ("last month", "2 years ago", etc.) are
//! NOT extracted by the rule-based pass. CE-93 context: bare years like "in 2023"
//! without a month are also excluded — the iso_date pattern requires YYYY-MM-DD.
//!
//! **Extractor config & provider factory**
//! Tests that API keys are redacted in Debug output and that `build_provider` routes
//! to the correct implementation without network calls.

use inference::extraction::{build_provider, ExtractionOpts, ExtractorConfig, NoneExtractor};
use inference::{rule_based_extract, ExtractedEntity, ExtractionProvider};

// ─────────────────────────────────────────────────────────────
// Tier-1a: ISO date extraction
// ─────────────────────────────────────────────────────────────

/// An ISO `YYYY-MM-DD` date embedded in a sentence yields exactly one "date" entity whose
/// value is the matched text and whose score is 1.0.
#[test]
fn rule_extract_iso_date_tier1a_standard() {
    let extracted = rule_based_extract("Meeting scheduled on 2023-06-15 at the office.");

    // Keep only the entities tagged as dates; other entity types are irrelevant here.
    let mut dates: Vec<&ExtractedEntity> = Vec::new();
    for entity in extracted.iter() {
        if entity.entity_type == "date" {
            dates.push(entity);
        }
    }

    assert_eq!(dates.len(), 1, "should extract exactly one ISO date");
    let only = dates[0];
    assert_eq!(only.value, "2023-06-15");
    assert_eq!(only.score, 1.0, "rule-based entities always score 1.0");
}

/// The first day of a year (`2024-01-01`) followed by a full stop is extracted as a single
/// "date" entity, with the trailing punctuation left out of the value.
#[test]
fn rule_extract_iso_date_tier1a_start_of_year() {
    let is_date = |entity: &&ExtractedEntity| entity.entity_type == "date";

    let extracted = rule_based_extract("Event on 2024-01-01.");
    let dates: Vec<&ExtractedEntity> = extracted.iter().filter(is_date).collect();

    assert_eq!(dates.len(), 1);
    assert_eq!(dates[0].value, "2024-01-01");
}

/// The last day of a year (`2022-12-31`) at the very end of the input is extracted as a
/// single "date" entity.
#[test]
fn rule_extract_iso_date_tier1a_end_of_year() {
    let extracted = rule_based_extract("Archive date: 2022-12-31");
    let iso_dates: Vec<&ExtractedEntity> = extracted
        .iter()
        .filter(|entity| entity.entity_type.as_str() == "date")
        .collect();

    assert_eq!(iso_dates.len(), 1);
    let found = iso_dates[0];
    assert_eq!(found.value, "2022-12-31");
}

/// Two ISO dates in one sentence must both be extracted as "date" entities.
///
/// Besides the count, the extracted values are checked (order-independently) so the test
/// cannot pass when one date is reported twice while the other is missed.
#[test]
fn rule_extract_multiple_iso_dates() {
    let entities = rule_based_extract("Contract starts 2024-03-01 and ends 2025-02-28.");
    let dates: Vec<&ExtractedEntity> = entities
        .iter()
        .filter(|e| e.entity_type == "date")
        .collect();
    assert_eq!(dates.len(), 2, "should extract both ISO dates");

    // The count alone does not prove that each distinct date was found.
    for expected in ["2024-03-01", "2025-02-28"] {
        assert!(
            dates.iter().any(|d| d.value == expected),
            "ISO date {expected} should be extracted, got: {:?}",
            dates.iter().map(|e| &e.value).collect::<Vec<_>>()
        );
    }
}

// ─────────────────────────────────────────────────────────────
// Tier-1b: Natural language date extraction
// ─────────────────────────────────────────────────────────────

/// A full "Month Day, Year" date is picked up as a "date" entity whose value keeps the
/// month name.
#[test]
fn rule_extract_natural_date_tier1b_full() {
    let extracted = rule_based_extract("The event was held on January 15, 2023.");

    let mut dates: Vec<&ExtractedEntity> = Vec::new();
    for entity in extracted.iter() {
        if entity.entity_type == "date" {
            dates.push(entity);
        }
    }

    assert!(!dates.is_empty(), "should extract natural language date");
    let first = dates[0];
    assert!(
        first.value.contains("January"),
        "date value should contain month name, got: {}",
        first.value
    );
}

/// An abbreviated month name ("Dec 25, 2020") is extracted as a "date" entity and the
/// abbreviation is kept in the value.
#[test]
fn rule_extract_natural_date_tier1b_abbreviated_month() {
    let is_date = |entity: &&ExtractedEntity| entity.entity_type == "date";

    let extracted = rule_based_extract("Published Dec 25, 2020.");
    let dates: Vec<&ExtractedEntity> = extracted.iter().filter(is_date).collect();

    assert!(!dates.is_empty(), "abbreviated month should be extracted");
    let first = dates[0];
    assert!(first.value.contains("Dec"));
}

/// A month and day with no year ("March 5") still counts as a natural-language date.
#[test]
fn rule_extract_natural_date_tier1b_no_year() {
    // The year is optional for the natural_date pattern, so "March 5" alone must match.
    let extracted = rule_based_extract("See you March 5 at the conference.");
    let found_dates: Vec<&ExtractedEntity> = extracted
        .iter()
        .filter(|entity| entity.entity_type.as_str() == "date")
        .collect();

    let any_date = !found_dates.is_empty();
    assert!(any_date, "month+day without year should be extracted");
}

// ─────────────────────────────────────────────────────────────
// Relative-time exclusion (Tier-1c / Tier-2)
// The rule-based pass intentionally does NOT extract relative time expressions.
// This prevents false positives in temporal scoring (CE-93 context).
// ─────────────────────────────────────────────────────────────

/// "last month" is a relative time expression and must not yield any "date" entity.
#[test]
fn rule_extract_relative_time_last_month_excluded() {
    let extracted = rule_based_extract("I visited last month and it was great.");

    let mut dates: Vec<&ExtractedEntity> = Vec::new();
    for entity in extracted.iter() {
        if entity.entity_type == "date" {
            dates.push(entity);
        }
    }

    // Collected up front so a failure shows exactly what was wrongly extracted.
    let offending: Vec<&String> = dates.iter().map(|d| &d.value).collect();
    assert!(
        dates.is_empty(),
        "relative time 'last month' must not be extracted, got: {:?}",
        offending
    );
}

/// "3 years ago" is a relative time expression and must not yield any "date" entity.
#[test]
fn rule_extract_relative_time_years_ago_excluded() {
    let is_date = |entity: &&ExtractedEntity| entity.entity_type == "date";

    let extracted = rule_based_extract("That happened 3 years ago when we started.");
    let dates: Vec<&ExtractedEntity> = extracted.iter().filter(is_date).collect();

    // Collected up front so a failure shows exactly what was wrongly extracted.
    let offending: Vec<&String> = dates.iter().map(|d| &d.value).collect();
    assert!(
        dates.is_empty(),
        "'3 years ago' must not be extracted as a date, got: {:?}",
        offending
    );
}

/// A year on its own ("in 2023") must not be reported as a "date" entity.
#[test]
fn rule_extract_bare_year_not_extracted() {
    // CE-93 invariant: the iso_date regex only accepts a full YYYY-MM-DD, so a bare year
    // with no month/day is left alone by extraction. Its different temporal-proximity
    // behavior is handled in the scoring layer by BARE_YEAR_PROXIMITY_FLOOR (CE-93,
    // unmerged), not in the extraction layer.
    let extracted = rule_based_extract("John moved to Seattle in 2023 and started a new job.");
    let found_dates: Vec<&ExtractedEntity> = extracted
        .iter()
        .filter(|entity| entity.entity_type.as_str() == "date")
        .collect();

    let offending: Vec<&String> = found_dates.iter().map(|d| &d.value).collect();
    assert!(
        found_dates.is_empty(),
        "bare year '2023' must not be extracted — CE-93: temporal proximity applies in scoring, \
         not extraction; got: {:?}",
        offending
    );
}

/// "next week" is a relative time expression and must not produce a "date" entity.
///
/// On failure the wrongly extracted values are printed, matching the diagnostics of the
/// other relative-time exclusion tests in this file.
#[test]
fn rule_extract_next_week_excluded() {
    let entities = rule_based_extract("The report is due next week.");
    let dates: Vec<&ExtractedEntity> = entities
        .iter()
        .filter(|e| e.entity_type == "date")
        .collect();
    assert!(
        dates.is_empty(),
        "'next week' must not be extracted as a date, got: {:?}",
        dates.iter().map(|e| &e.value).collect::<Vec<_>>()
    );
}

// ─────────────────────────────────────────────────────────────
// Other entity types extracted by rule_based_extract
// ─────────────────────────────────────────────────────────────

/// An email address inside a sentence is extracted as exactly one "email" entity whose
/// value is the address itself.
#[test]
fn rule_extract_email_found() {
    let is_email = |entity: &&ExtractedEntity| entity.entity_type == "email";

    let extracted = rule_based_extract("Contact us at support@dakera.ai for help.");
    let emails: Vec<&ExtractedEntity> = extracted.iter().filter(is_email).collect();

    assert_eq!(emails.len(), 1, "should find one email");
    let only = emails[0];
    assert_eq!(only.value, "support@dakera.ai");
}

/// An email address must be reported once as "email" and never again as a "url" entity.
#[test]
fn rule_extract_email_not_also_extracted_as_url() {
    // A loose URL pattern could also match text containing '@'; the rule-based pass must
    // not count the same address under both entity types.
    let extracted = rule_based_extract("Email: info@example.com");

    let mut emails: Vec<&ExtractedEntity> = Vec::new();
    let mut urls: Vec<&ExtractedEntity> = Vec::new();
    for entity in extracted.iter() {
        if entity.entity_type == "email" {
            emails.push(entity);
        } else if entity.entity_type == "url" {
            urls.push(entity);
        }
    }

    assert_eq!(emails.len(), 1, "email should appear once");
    let duplicated_as_url = urls.iter().any(|u| u.value.contains("info@example.com"));
    assert!(!duplicated_as_url, "email must not also be extracted as URL");
}

/// An `https://` link is extracted as exactly one "url" entity that keeps its scheme.
#[test]
fn rule_extract_url_found() {
    let extracted = rule_based_extract("Visit https://dakera.ai/docs for documentation.");

    let mut urls: Vec<&ExtractedEntity> = Vec::new();
    for entity in extracted.iter() {
        if entity.entity_type == "url" {
            urls.push(entity);
        }
    }

    assert_eq!(urls.len(), 1);
    let only = urls[0];
    assert!(only.value.starts_with("https://"));
}

/// A canonical hyphenated UUID is extracted as exactly one "uuid" entity with the full
/// identifier as its value.
#[test]
fn rule_extract_uuid_found() {
    let is_uuid = |entity: &&ExtractedEntity| entity.entity_type == "uuid";

    let extracted = rule_based_extract("Session ID: 550e8400-e29b-41d4-a716-446655440000 logged.");
    let uuids: Vec<&ExtractedEntity> = extracted.iter().filter(is_uuid).collect();

    assert_eq!(uuids.len(), 1);
    let only = uuids[0];
    assert_eq!(only.value, "550e8400-e29b-41d4-a716-446655440000");
}

/// A dotted-quad IPv4 address is extracted as exactly one "ip" entity.
#[test]
fn rule_extract_ipv4_found() {
    let extracted = rule_based_extract("Server at 192.168.1.100 is unreachable.");
    let addresses: Vec<&ExtractedEntity> = extracted
        .iter()
        .filter(|entity| entity.entity_type.as_str() == "ip")
        .collect();

    assert_eq!(addresses.len(), 1);
    let only = addresses[0];
    assert_eq!(only.value, "192.168.1.100");
}

/// Empty input produces no entities at all.
#[test]
fn rule_extract_empty_text_returns_empty() {
    let extracted = rule_based_extract("");
    assert!(extracted.is_empty());
}

// ─────────────────────────────────────────────────────────────
// ExtractedEntity::to_tag
// ─────────────────────────────────────────────────────────────

/// `to_tag` renders an entity as `entity:<type>:<value>` with the value lowercased.
#[test]
fn extracted_entity_to_tag_format() {
    let alice = ExtractedEntity {
        entity_type: String::from("person"),
        value: String::from("Alice"),
        score: 0.95,
        start: 0,
        end: 5,
    };

    // Lowercasing keeps dedup consistent: "Alice" and "alice" map to the same tag.
    let tag = alice.to_tag();
    assert_eq!(tag, "entity:person:alice");
}

/// A capitalized organization name comes out lowercased in the tag produced by `to_tag`.
#[test]
fn extracted_entity_to_tag_lowercases_value() {
    let organization = ExtractedEntity {
        entity_type: String::from("organization"),
        value: String::from("Anthropic"),
        score: 0.9,
        start: 0,
        end: 9,
    };

    assert_eq!(organization.to_tag(), "entity:organization:anthropic");
}

/// Colons inside an entity value are sanitized so the tag keeps its three-part
/// `entity:<type>:<value>` layout.
#[test]
fn extracted_entity_to_tag_sanitizes_colons_in_value() {
    // A URL carries a ':' of its own, which would break the tag format unless it is
    // replaced (the expected replacement is '_'; this test only checks that none remain).
    let url_entity = ExtractedEntity {
        entity_type: String::from("url"),
        value: String::from("https://example.com"),
        score: 1.0,
        start: 0,
        end: 19,
    };
    let tag = url_entity.to_tag();

    // Split on the first two (structural) colons only; the remainder is the value segment.
    let segments: Vec<&str> = tag.splitn(3, ':').collect();
    assert_eq!(segments[0], "entity");
    assert_eq!(segments[1], "url");
    let value_segment = segments[2];
    assert!(
        !value_segment.contains(':'),
        "value part of tag must not contain colons, got: {tag}"
    );
}

// ─────────────────────────────────────────────────────────────
// ExtractorConfig
// ─────────────────────────────────────────────────────────────

/// `ExtractorConfig::none()` names the "none" provider and leaves every optional field
/// unset.
#[test]
fn extractor_config_none_has_correct_provider() {
    let disabled = ExtractorConfig::none();

    assert_eq!(disabled.provider, "none");
    // No model, endpoint or credentials are configured for the disabled extractor.
    assert!(disabled.model.is_none());
    assert!(disabled.base_url.is_none());
    assert!(disabled.api_key.is_none());
}

/// `ExtractorConfig::gliner()` names the "gliner" provider.
#[test]
fn extractor_config_gliner_has_correct_provider() {
    let gliner_cfg = ExtractorConfig::gliner();
    assert_eq!(gliner_cfg.provider, "gliner");
}

/// The `Debug` output of `ExtractorConfig` must never leak the API key.
///
/// Two things are checked: the secret itself is absent, and a redaction marker is present.
#[test]
fn extractor_config_debug_redacts_api_key() {
    // API keys must NEVER appear in log output. The Debug impl must show [REDACTED].
    let cfg = ExtractorConfig {
        provider: "openai".to_string(),
        model: None,
        base_url: None,
        api_key: Some("sk-supersecret-key-1234".to_string()),
    };
    let debug_output = format!("{cfg:?}");
    assert!(
        !debug_output.contains("sk-supersecret-key-1234"),
        "API key must not appear in Debug output, got: {debug_output}"
    );
    // Any output containing "[REDACTED]" also contains "REDACTED", so one substring check
    // accepts the marker with or without brackets.
    assert!(
        debug_output.contains("REDACTED"),
        "Debug output should show [REDACTED] for api_key, got: {debug_output}"
    );
}

// ─────────────────────────────────────────────────────────────
// build_provider factory
// ─────────────────────────────────────────────────────────────

/// Building a provider from the "none" config yields a provider that reports the name
/// "none".
#[test]
fn build_provider_none_returns_none_provider() {
    let disabled = ExtractorConfig::none();
    let resolved = build_provider(&disabled, None);

    assert_eq!(resolved.provider_name(), "none");
}

/// An unrecognized provider name must not fail; the factory returns a provider that
/// reports the name "none" instead.
#[test]
fn build_provider_unknown_falls_back_to_none() {
    let bogus = ExtractorConfig {
        provider: String::from("nonexistent_provider"),
        model: None,
        base_url: None,
        api_key: None,
    };

    let resolved = build_provider(&bogus, None);
    let name = resolved.provider_name();
    assert_eq!(
        name, "none",
        "unknown provider must fall back to NoneExtractor"
    );
}

/// With no NER engine supplied, the gliner config still yields a provider that reports
/// the name "gliner".
#[test]
fn build_provider_gliner_without_ner_engine_returns_gliner() {
    // Without an engine, gliner falls back to rule-based extraction but keeps its name.
    let gliner_cfg = ExtractorConfig::gliner();
    let resolved = build_provider(&gliner_cfg, None);

    assert_eq!(resolved.provider_name(), "gliner");
}

// ─────────────────────────────────────────────────────────────
// NoneExtractor async behaviour
// ─────────────────────────────────────────────────────────────

/// `NoneExtractor::extract` succeeds and returns an empty result attributed to the "none"
/// provider: no entities, topics, key phrases or summary.
#[tokio::test]
async fn none_extractor_returns_empty_extraction_result() {
    use inference::ExtractionResult;
    let extractor = NoneExtractor;
    let opts = ExtractionOpts::default();
    // `expect` replaces the separate `is_ok()` assertion + `unwrap()`: one step that both
    // checks for success and reports the error value if the call fails.
    let r: ExtractionResult = extractor
        .extract("Some text with entities.", &opts)
        .await
        .expect("NoneExtractor should never error");
    assert_eq!(r.provider, "none");
    assert!(
        r.entities.is_empty(),
        "NoneExtractor should return zero entities"
    );
    assert!(r.topics.is_empty());
    assert!(r.key_phrases.is_empty());
    assert!(r.summary.is_none());
}