// lantern 0.2.4 - Docs.rs

//! Provenance-preserving entity layer.
//!
//! Extracts deterministic signals from chunk text — URLs, email addresses,
//! backtick-wrapped file paths, and @-mentions — and stores them in a shape
//! that does not require another migration for each new kind. The schema
//! lives in `store::SCHEMA_V10`: `entities` is keyed by `UNIQUE(kind, value)`
//! and `chunk_entities` joins each entity back to the chunks it appeared in.
//!
//! Extraction stays regex-free and dependency-free. URLs scan for the
//! `http(s)://` prefix, take everything up to the next whitespace character,
//! and trim a small set of trailing punctuation. Emails scan around `@`, use
//! simple ASCII boundary rules, and validate a sane domain/TLD shape. File
//! paths are intentionally narrow: only backtick-wrapped tokens with no
//! whitespace and no `://` scheme are accepted, and only when they either
//! contain a `/` or look like a bare `name.ext` file name. That keeps prose
//! mentions like `` `src/main.rs` `` and `` `Cargo.toml` `` while rejecting
//! the much larger pool of backtick-wrapped identifiers (`Vec`, `String`,
//! …). @-mentions look for `@`
//! that is NOT preceded by an email-local character (so `me@example.com` stays
//! exclusively an email), require at least two body characters with at least
//! one ASCII letter (so dates and pure digit runs do not slip through), and
//! store the captured handle with the leading `@` retained for readability.
//! Wrong calls here are recoverable — entities can be re-extracted on reindex
//! without touching the authoritative chunk text.

use anyhow::{Context, Result};
use rusqlite::{Connection, params};
use serde::Serialize;
use sha2::{Digest, Sha256};
use std::collections::HashSet;

/// Discriminator for an extracted entity. Stored as the `kind` column in the
/// `entities` table; the string form is the on-disk representation.
///
/// With `rename_all = "lowercase"` the serialized names are the lowercased
/// variant names, which is the same spelling `as_str` returns (`FilePath`
/// becomes `"filepath"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)]
#[serde(rename_all = "lowercase")]
pub enum EntityKind {
    /// An `http://` or `https://` URL.
    Url,
    /// An email address such as `alice@example.com`.
    Email,
    /// A backtick-wrapped file path or file name.
    FilePath,
    /// An `@`-prefixed handle; the stored value keeps the leading `@`.
    Mention,
}

impl EntityKind {
    pub fn as_str(self) -> &'static str {
        match self {
            EntityKind::Url => "url",
            EntityKind::Email => "email",
            EntityKind::FilePath => "filepath",
            EntityKind::Mention => "mention",
        }
    }
}

/// A single (kind, value) extracted from chunk text.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub struct Entity {
    /// Which kind of signal this is (URL, email, file path, or mention).
    pub kind: EntityKind,
    /// The extracted text after trimming; mentions keep their leading `@`.
    pub value: String,
}

/// Derives the stable identifier for a `(kind, value)` pair.
///
/// The id is the first 16 bytes of `SHA-256("<kind>:<value>")`, hex-encoded
/// to 32 characters. Only the pair itself feeds the hash, so the same
/// (kind, value) yields the same id on any machine; that lines up with the
/// `UNIQUE(kind, value)` constraint and lets `INSERT OR IGNORE` dedup without
/// a follow-up SELECT.
pub fn entity_id(kind: EntityKind, value: &str) -> String {
    let mut hasher = Sha256::new();
    // Feed the preimage piecewise; the digest equals hashing "<kind>:<value>".
    for part in [kind.as_str().as_bytes(), &b":"[..], value.as_bytes()] {
        hasher.update(part);
    }
    let digest = hasher.finalize();
    hex::encode(&digest[..16])
}

/// Characters stripped from the tail of a URL after the greedy,
/// whitespace-bounded capture.
///
/// All of them are punctuation that almost always belongs to the surrounding
/// prose rather than to the URL. Parentheses are deliberately NOT balanced:
/// a Wikipedia-style link loses its trailing `)`, which is the price of
/// keeping extraction simple and predictable.
const URL_TRAILING_TRIM: &[char] = &[
    // Sentence punctuation.
    ',', '.', ';', ':', '!', '?',
    // Closing quotes and code ticks.
    '\'', '"', '`',
    // Closing brackets.
    ')', ']', '}', '>',
];

/// Characters stripped from the tail of an email candidate before it is
/// validated. The set is the same sentence punctuation trimmed from URLs.
const EMAIL_TRAILING_TRIM: &[char] = &[
    // Sentence punctuation.
    ',', '.', ';', ':', '!', '?',
    // Closing quotes and code ticks.
    '\'', '"', '`',
    // Closing brackets.
    ')', ']', '}', '>',
];

/// Appends `(kind, value)` to `out` unless this exact pair was already seen.
///
/// `seen` stores `"<kind>:<value>"` keys, so identical text extracted under
/// two different kinds is kept once per kind.
fn push_entity(out: &mut Vec<Entity>, seen: &mut HashSet<String>, kind: EntityKind, value: String) {
    let dedup_key = [kind.as_str(), value.as_str()].join(":");
    let first_sighting = seen.insert(dedup_key);
    if !first_sighting {
        return;
    }
    out.push(Entity { kind, value });
}

/// Decides whether a backtick-wrapped token should be recorded as a file path.
///
/// Rejected outright: empty tokens, anything containing whitespace, and
/// anything containing `://` (those are URLs). Of what remains, a token is
/// accepted when it contains a `/`, or when it has the shape `stem.ext` where
/// - `stem` uses only ASCII alphanumerics, `.`, `_`, `-` and has at least one
///   alphanumeric, and
/// - `ext` is ASCII alphanumeric with at least one letter.
fn looks_like_file_path_literal(candidate: &str) -> bool {
    let has_whitespace = candidate.chars().any(char::is_whitespace);
    if candidate.is_empty() || has_whitespace || candidate.contains("://") {
        return false;
    }
    if candidate.contains('/') {
        return true;
    }

    // No directory separator: fall back to the bare `name.ext` shape, split
    // at the LAST dot so `foo.tar.gz` is judged by its `gz` extension.
    let Some((stem, ext)) = candidate.rsplit_once('.') else {
        return false;
    };
    // `any(..)` also rules out an empty stem / empty extension.
    let stem_ok = stem.chars().any(|c| c.is_ascii_alphanumeric())
        && stem
            .chars()
            .all(|c| c.is_ascii_alphanumeric() || matches!(c, '.' | '_' | '-'));
    let ext_ok = ext.chars().any(|c| c.is_ascii_alphabetic())
        && ext.chars().all(|c| c.is_ascii_alphanumeric());
    stem_ok && ext_ok
}

/// Records every backtick-wrapped token that looks like a file path.
///
/// Backticks are paired left to right (1st with 2nd, 3rd with 4th, …); an
/// unpaired trailing backtick opens nothing. The text between a pair is
/// trimmed and kept only if `looks_like_file_path_literal` accepts it.
fn push_file_path_entities(text: &str, out: &mut Vec<Entity>, seen: &mut HashSet<String>) {
    // Splitting on '`' alternates outside / inside pieces, starting outside.
    let mut pieces = text.split('`').peekable();
    let mut inside_ticks = false;
    while let Some(piece) = pieces.next() {
        // An "inside" piece is only closed if another backtick (and therefore
        // another piece) follows it.
        let closed = pieces.peek().is_some();
        if inside_ticks && closed {
            let candidate = piece.trim();
            if looks_like_file_path_literal(candidate) {
                push_entity(out, seen, EntityKind::FilePath, candidate.to_string());
            }
        }
        inside_ticks = !inside_ticks;
    }
}

/// Returns true for bytes accepted in the local part (before `@`) of an
/// email: ASCII alphanumerics plus `.`, `_`, `%`, `+` and `-`.
fn is_email_local_char(byte: u8) -> bool {
    match byte {
        b'.' | b'_' | b'%' | b'+' | b'-' => true,
        other => other.is_ascii_alphanumeric(),
    }
}

/// Returns true for bytes accepted in the domain part (after `@`) of an
/// email: ASCII alphanumerics plus `.` and `-`.
fn is_email_domain_char(byte: u8) -> bool {
    match byte {
        b'.' | b'-' => true,
        other => other.is_ascii_alphanumeric(),
    }
}

/// Checks that `candidate` has a plausible `local@domain.tld` shape.
///
/// Requirements, all of which must hold:
/// - exactly the text before the first `@` is the local part, the rest the
///   domain, and neither is empty nor starts/ends with `.`;
/// - every byte passes `is_email_local_char` / `is_email_domain_char`;
/// - the domain has at least two non-empty labels;
/// - no label before the TLD starts or ends with `-`;
/// - the TLD is at least two ASCII letters.
fn is_valid_email(candidate: &str) -> bool {
    let Some((local, domain)) = candidate.split_once('@') else {
        return false;
    };

    let dot_at_edge = |part: &str| part.starts_with('.') || part.ends_with('.');
    if local.is_empty() || domain.is_empty() || dot_at_edge(local) || dot_at_edge(domain) {
        return false;
    }

    let charset_ok =
        local.bytes().all(is_email_local_char) && domain.bytes().all(is_email_domain_char);
    if !charset_ok {
        return false;
    }

    // Splitting at the last dot separates the TLD from the labels before it;
    // a domain with no dot has fewer than two labels and is rejected.
    let Some((host, tld)) = domain.rsplit_once('.') else {
        return false;
    };
    let host_ok = host
        .split('.')
        .all(|label| !label.is_empty() && !label.starts_with('-') && !label.ends_with('-'));

    host_ok && tld.len() >= 2 && tld.bytes().all(|b| b.is_ascii_alphabetic())
}

/// Collects `http://` and `https://` URLs from `text` in order of appearance.
///
/// A URL runs from its scheme up to the next whitespace character and then
/// loses any trailing `URL_TRAILING_TRIM` punctuation. A bare scheme with
/// nothing after it is ignored. A URL nested inside another one (for example
/// an archive link that embeds the original address) is recorded as well.
///
/// Both schemes are handled in one left-to-right pass so that mixed-scheme
/// text is reported in document order rather than grouped by scheme.
fn push_url_entities(text: &str, out: &mut Vec<Entity>, seen: &mut HashSet<String>) {
    // Both schemes start with "http", and "http" cannot overlap itself, so
    // visiting every "http" occurrence visits every candidate exactly once.
    for (start, _) in text.match_indices("http") {
        let rest = &text[start..];
        let scheme = if rest.starts_with("https://") {
            "https://"
        } else if rest.starts_with("http://") {
            "http://"
        } else {
            continue;
        };
        let end = rest.find(char::is_whitespace).unwrap_or(rest.len());
        let trimmed = rest[..end].trim_end_matches(URL_TRAILING_TRIM);
        // Require at least one character past the scheme; "http://" alone
        // is not a URL worth indexing.
        if trimmed.len() > scheme.len() {
            push_entity(out, seen, EntityKind::Url, trimmed.to_string());
        }
    }
}

// Returns true for bytes that may appear inside an @-mention handle.
// Deliberately conservative: ASCII alphanumerics plus the separators seen in
// real handles (`first.last`, `first-last`, `first_last`). Any other byte
// stops the scan, so trailing punctuation such as `?`, `,` or `)` is never
// folded into the handle.
fn is_mention_body_char(byte: u8) -> bool {
    match byte {
        b'_' | b'-' | b'.' => true,
        other => other.is_ascii_alphanumeric(),
    }
}

// Characters trimmed off the end of a mention body once the greedy scan
// stops. These are the same separators `is_mention_body_char` allows inside a
// handle: handles do not end on `.`/`-`/`_` in any of the conventions we want
// to support, but a stray `bob.` is common at sentence ends.
const MENTION_TRAILING_TRIM: &[char] = &['.', '-', '_'];

fn push_mention_entities(text: &str, out: &mut Vec<Entity>, seen: &mut HashSet<String>) {
    let bytes = text.as_bytes();
    for (idx, byte) in bytes.iter().enumerate() {
        if *byte != b'@' {
            continue;
        }
        // If the `@` is preceded by an email-local character, the email
        // extractor already owns this match. Skipping here prevents a single
        // `me@example.com` from being recorded as both an email and a mention.
        if idx > 0 && is_email_local_char(bytes[idx - 1]) {
            continue;
        }
        let body_start = idx + 1;
        let mut end = body_start;
        while end < bytes.len() && is_mention_body_char(bytes[end]) {
            end += 1;
        }
        let raw = &text[body_start..end];
        let trimmed = raw.trim_end_matches(MENTION_TRAILING_TRIM);
        if trimmed.len() < 2 {
            continue;
        }
        let first = trimmed.as_bytes()[0];
        if !first.is_ascii_alphanumeric() && first != b'_' {
            continue;
        }
        // Conservative: require at least one ASCII letter so we do not pull in
        // pure digit/date-shaped runs like `@2024-01-15`.
        if !trimmed.bytes().any(|b| b.is_ascii_alphabetic()) {
            continue;
        }
        push_entity(out, seen, EntityKind::Mention, format!("@{trimmed}"));
    }
}

/// Records email addresses found in `text`.
///
/// For every `@`, the candidate is grown leftwards over email-local bytes and
/// rightwards over email-domain bytes. Candidates with an empty side are
/// dropped; the rest lose trailing `EMAIL_TRAILING_TRIM` punctuation and are
/// kept only if `is_valid_email` accepts them.
fn push_email_entities(text: &str, out: &mut Vec<Entity>, seen: &mut HashSet<String>) {
    let bytes = text.as_bytes();
    for (at, _) in text.match_indices('@') {
        let local_len = bytes[..at]
            .iter()
            .rev()
            .take_while(|&&b| is_email_local_char(b))
            .count();
        let domain_len = bytes[at + 1..]
            .iter()
            .take_while(|&&b| is_email_domain_char(b))
            .count();
        if local_len == 0 || domain_len == 0 {
            continue;
        }

        // Both ends sit next to ASCII bytes, so the slice is on char
        // boundaries even when the surrounding text is not ASCII.
        let span = &text[at - local_len..at + 1 + domain_len];
        let candidate = span.trim_end_matches(EMAIL_TRAILING_TRIM);
        if is_valid_email(candidate) {
            push_entity(out, seen, EntityKind::Email, candidate.to_string());
        }
    }
}

/// Extract distinct URL, email, mention, and path-literal entities from
/// `text`. Order is the order of first appearance within each kind;
/// duplicates within the same chunk collapse to one entry so the caller does
/// not need to dedup before inserting. Email extraction runs before mentions
/// so the email-local-char guard inside `push_mention_entities` only needs to
/// inspect the byte directly preceding `@` to know an email already claims
/// that match.
pub fn extract_entities(text: &str) -> Vec<Entity> {
    let mut out: Vec<Entity> = Vec::new();
    let mut seen: HashSet<String> = HashSet::new();
    push_url_entities(text, &mut out, &mut seen);
    push_email_entities(text, &mut out, &mut seen);
    push_mention_entities(text, &mut out, &mut seen);
    push_file_path_entities(text, &mut out, &mut seen);
    out
}

/// Parse a CLI/MCP-supplied kind string back into [`EntityKind`].
///
/// Mirrors the `as_str` mapping above; matching is ASCII case-insensitive so
/// `URL` and `url` both parse.
///
/// # Errors
///
/// Returns `Err` for any other input instead of silently coercing it. The
/// message quotes `raw` exactly as supplied so the caller can surface the bad
/// input verbatim.
pub fn entity_kind_from_str(raw: &str) -> Result<EntityKind> {
    match raw.to_ascii_lowercase().as_str() {
        "url" => Ok(EntityKind::Url),
        "email" => Ok(EntityKind::Email),
        "filepath" => Ok(EntityKind::FilePath),
        "mention" => Ok(EntityKind::Mention),
        // Report `raw`, not the lowercased copy used for matching, so the
        // user sees exactly what they typed.
        _ => {
            anyhow::bail!("unknown entity kind '{raw}' (expected: url, email, filepath, mention)")
        }
    }
}

/// Filter / paging options for [`list_entities`].
///
/// `value_contains` is a plain substring match, intended to be case-sensitive,
/// so URLs and file paths stay grep-able without escaping. `limit` caps how
/// many rows are returned, but `EntityListReport::total_matched` always
/// reflects the unbounded match count so callers can tell when results were
/// truncated.
///
/// NOTE(review): whether matching really is case-sensitive depends on the SQL
/// operator `list_entities` uses (SQLite `LIKE` folds ASCII case by default) —
/// confirm against the query.
#[derive(Debug, Clone, Default)]
pub struct EntityListOptions {
    /// Restrict results to one entity kind; `None` lists every kind.
    pub kind: Option<EntityKind>,
    /// Keep only entities whose value contains this text; `None` disables the
    /// filter.
    pub value_contains: Option<String>,
    /// Maximum number of rows to return; `None` leaves the listing
    /// effectively uncapped.
    pub limit: Option<usize>,
}

/// One row in the entity listing — the entity plus its chunk-reference count.
#[derive(Debug, Clone, Serialize, PartialEq, Eq)]
pub struct EntityListEntry {
    /// The entity's `id` column as stored in the `entities` table.
    pub id: String,
    /// The entity kind, parsed back from the stored `kind` string.
    pub kind: EntityKind,
    /// The entity's stored value.
    pub value: String,
    /// Number of `chunk_entities` rows that reference this entity.
    pub chunk_count: i64,
    /// The stored `created_at` column — presumably a Unix timestamp supplied
    /// by whoever recorded the entity; TODO confirm the unit.
    pub created_at: i64,
}

/// Listing result: the (possibly truncated) entries and the total matched.
#[derive(Debug, Clone, Serialize)]
pub struct EntityListReport {
    /// The rows returned, at most `limit` of them.
    pub entries: Vec<EntityListEntry>,
    /// How many entities matched the filters, ignoring `limit`.
    pub total_matched: i64,
    /// Echo of the kind filter the listing was run with.
    pub kind_filter: Option<EntityKind>,
    /// Echo of the value-substring filter the listing was run with.
    pub value_contains: Option<String>,
}

/// List entities, optionally filtered by kind and value substring.
///
/// Rows are ordered by chunk count (descending), then value (ascending), then
/// id, so the most-referenced entries surface first while the order stays
/// deterministic for tests. `total_matched` is counted with the same filters
/// but without the limit.
///
/// The value filter uses SQLite's `instr()` rather than `LIKE`. `LIKE` folds
/// ASCII case by default, which would break the case-sensitive plain-substring
/// contract of `EntityListOptions::value_contains`, and it needs `%` / `_`
/// escaping that `instr()` does not: every character of the needle is matched
/// literally.
///
/// # Errors
///
/// Returns an error if either query fails to prepare or run, or if a stored
/// `kind` string is not a known [`EntityKind`].
pub fn list_entities(conn: &Connection, opts: &EntityListOptions) -> Result<EntityListReport> {
    use rusqlite::types::Value;

    // Build the WHERE fragments and their positional parameters together so
    // the two can never fall out of step.
    let mut where_clauses: Vec<&str> = Vec::new();
    let mut filter_params: Vec<Value> = Vec::new();
    if let Some(kind) = opts.kind {
        where_clauses.push("e.kind = ?");
        filter_params.push(kind.as_str().to_string().into());
    }
    if let Some(needle) = opts.value_contains.as_deref() {
        // instr() returns the 1-based position of the needle, or 0 when it is
        // absent; an empty needle matches every row.
        where_clauses.push("instr(e.value, ?) > 0");
        filter_params.push(needle.to_string().into());
    }
    let where_sql = if where_clauses.is_empty() {
        String::new()
    } else {
        format!("WHERE {}", where_clauses.join(" AND "))
    };

    let total_sql = format!("SELECT COUNT(*) FROM entities e {where_sql}");
    let list_sql = format!(
        "SELECT e.id, e.kind, e.value, e.created_at,
                (SELECT COUNT(*) FROM chunk_entities ce WHERE ce.entity_id = e.id) AS chunk_count
         FROM entities e
         {where_sql}
         ORDER BY chunk_count DESC, e.value ASC, e.id ASC
         LIMIT ?"
    );

    let total_param_refs: Vec<&dyn rusqlite::ToSql> = filter_params
        .iter()
        .map(|v| v as &dyn rusqlite::ToSql)
        .collect();
    let total_matched: i64 = conn
        .query_row(&total_sql, total_param_refs.as_slice(), |row| row.get(0))
        .with_context(|| format!("counting entities matching {opts:?}"))?;

    // SQLite limits are signed 64-bit; saturate instead of wrapping, and
    // treat "no limit" as the largest representable cap.
    let limit = opts
        .limit
        .map_or(i64::MAX, |n| i64::try_from(n).unwrap_or(i64::MAX));
    // The listing query takes the same filter parameters plus the limit.
    let mut list_params = filter_params;
    list_params.push(limit.into());
    let list_param_refs: Vec<&dyn rusqlite::ToSql> = list_params
        .iter()
        .map(|v| v as &dyn rusqlite::ToSql)
        .collect();

    let mut stmt = conn
        .prepare(&list_sql)
        .context("preparing list_entities query")?;
    let rows = stmt
        .query_map(list_param_refs.as_slice(), |row| {
            let kind_raw: String = row.get(1)?;
            // An unknown kind string means the row cannot be represented;
            // surface it as a column-conversion failure on column 1.
            let kind = entity_kind_from_str(&kind_raw).map_err(|e| {
                rusqlite::Error::FromSqlConversionFailure(
                    1,
                    rusqlite::types::Type::Text,
                    Box::new(std::io::Error::other(e.to_string())),
                )
            })?;
            Ok(EntityListEntry {
                id: row.get(0)?,
                kind,
                value: row.get(2)?,
                created_at: row.get(3)?,
                chunk_count: row.get(4)?,
            })
        })
        .context("running list_entities query")?;
    let entries = rows
        .collect::<Result<Vec<_>, _>>()
        .context("reading list_entities rows")?;

    Ok(EntityListReport {
        entries,
        total_matched,
        kind_filter: opts.kind,
        value_contains: opts.value_contains.clone(),
    })
}

/// Prints a human-readable listing of `report` to stdout.
///
/// An empty report prints "no entities matched" followed by whichever
/// filters were active. Otherwise a summary line (rows shown and total
/// matched) is followed by one line per entry with its kind, chunk count and
/// value.
pub fn print_text(report: &EntityListReport) {
    let shown = report.entries.len();
    if shown == 0 {
        println!("no entities matched");
        if let Some(kind) = report.kind_filter {
            println!("  kind:  {}", kind.as_str());
        }
        if let Some(v) = report.value_contains.as_deref() {
            println!("  value contains: {v}");
        }
        return;
    }

    let plural_suffix = match shown {
        1 => "y",
        _ => "ies",
    };
    println!(
        "{} entit{} ({} total matched)",
        shown, plural_suffix, report.total_matched
    );
    report.entries.iter().for_each(|entry| {
        println!(
            "  {kind:<8} chunks={chunks:<4} {value}",
            kind = entry.kind.as_str(),
            chunks = entry.chunk_count,
            value = entry.value,
        );
    });
}

/// Prints `report` to stdout as pretty-printed JSON.
///
/// Returns an error if serialization fails.
pub fn print_json(report: &EntityListReport) -> Result<()> {
    let rendered = serde_json::to_string_pretty(report)?;
    println!("{}", rendered);
    Ok(())
}

/// Upsert each entity and link it to `chunk_id`. Designed to be called inside
/// the same transaction that inserted the chunk, so a failed extraction does
/// not leave the entity tables out of sync with `chunks`.
///
/// Returns the number of new chunk -> entity edges actually created (existing
/// links are silently ignored, so re-running on the same chunk is a no-op).
pub fn record_chunk_entities(
    conn: &Connection,
    chunk_id: &str,
    entities: &[Entity],
    now: i64,
) -> Result<usize> {
    if entities.is_empty() {
        return Ok(0);
    }
    let mut up_stmt = conn
        .prepare_cached(
            "INSERT OR IGNORE INTO entities (id, kind, value, created_at) VALUES (?1, ?2, ?3, ?4)",
        )
        .context("preparing entity upsert")?;
    let mut link_stmt = conn
        .prepare_cached(
            "INSERT OR IGNORE INTO chunk_entities (chunk_id, entity_id) VALUES (?1, ?2)",
        )
        .context("preparing chunk_entities link")?;
    let mut linked = 0usize;
    for entity in entities {
        let id = entity_id(entity.kind, &entity.value);
        up_stmt
            .execute(params![id, entity.kind.as_str(), entity.value, now])
            .with_context(|| {
                format!("upserting entity {}={}", entity.kind.as_str(), entity.value)
            })?;
        let inserted = link_stmt
            .execute(params![chunk_id, id])
            .with_context(|| format!("linking chunk {chunk_id} to entity {id}"))?;
        linked += inserted;
    }
    Ok(linked)
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::store::Store;
    use tempfile::tempdir;

    // A lone https URL inside prose is the only entity produced.
    #[test]
    fn extracts_basic_https_url() {
        let found = extract_entities("see https://example.com for details");
        let expected = vec![Entity {
            kind: EntityKind::Url,
            value: "https://example.com".to_string(),
        }];
        assert_eq!(found, expected);
    }

    // Both `http://` and `https://` URLs are picked up; order is not pinned.
    #[test]
    fn extracts_both_schemes() {
        let found = extract_entities("http://a.test and https://b.test/path");
        for expected in ["http://a.test", "https://b.test/path"] {
            assert!(found.iter().any(|e| e.value == expected));
        }
    }

    // A trailing `,` or `.` belongs to the sentence, not to the URL.
    #[test]
    fn trims_trailing_sentence_punctuation() {
        let urls: Vec<String> =
            extract_entities("visit https://example.com/page, or https://other.test.")
                .into_iter()
                .map(|e| e.value)
                .collect();
        assert_eq!(urls, ["https://example.com/page", "https://other.test"]);
    }

    // A URL repeated in one text is reported once, at its first position.
    #[test]
    fn dedups_within_same_text() {
        let found =
            extract_entities("https://x.test and again https://x.test plus https://y.test");
        let values: Vec<&str> = found.iter().map(|e| e.value.as_str()).collect();
        assert_eq!(values, ["https://x.test", "https://y.test"]);
    }

    // A scheme with nothing after it is not worth indexing.
    #[test]
    fn ignores_bare_scheme_without_host() {
        let found = extract_entities("not a url: http:// or https:// alone");
        assert_eq!(found, Vec::<Entity>::new());
    }

    // Emails are found in order, and trailing `,` / `.` are not part of them.
    #[test]
    fn extracts_emails_and_trims_punctuation() {
        let found =
            extract_entities("reach me at alice@example.com, or bob.smith+test@sub.example.co.uk.");
        let expected: Vec<Entity> = ["alice@example.com", "bob.smith+test@sub.example.co.uk"]
            .into_iter()
            .map(|address| Entity {
                kind: EntityKind::Email,
                value: address.to_string(),
            })
            .collect();
        assert_eq!(found, expected);
    }

    // Paths and bare file names are kept; identifiers and URLs in backticks
    // are not.
    #[test]
    fn extracts_backtick_wrapped_file_paths() {
        let paths: Vec<String> = extract_entities(
            "look at `src/main.rs`, `Cargo.toml`, and `Vec` while ignoring `https://example.com`",
        )
        .into_iter()
        .filter(|e| matches!(e.kind, EntityKind::FilePath))
        .map(|e| e.value)
        .collect();
        assert_eq!(paths, ["src/main.rs", "Cargo.toml"]);
    }

    // The id is a pure function of (kind, value): repeatable, different per
    // kind for the same value, and 32 hex characters long.
    #[test]
    fn entity_id_is_deterministic_and_kind_scoped() {
        const VALUE: &str = "https://example.com";
        let [url, email, path, mention] = [
            EntityKind::Url,
            EntityKind::Email,
            EntityKind::FilePath,
            EntityKind::Mention,
        ]
        .map(|kind| entity_id(kind, VALUE));

        assert_eq!(url, entity_id(EntityKind::Url, VALUE));
        assert_ne!(url, email);
        assert_ne!(url, path);
        assert_ne!(url, mention);
        assert_ne!(email, mention);
        assert_eq!(url.len(), 32);
    }

    // Plain handles, handles with `_`, and a handle before a full stop.
    #[test]
    fn extracts_basic_mentions() {
        let mentions: Vec<String> =
            extract_entities("Hi @alice and @bob_smith — see also @charlie.")
                .into_iter()
                .filter(|e| matches!(e.kind, EntityKind::Mention))
                .map(|e| e.value)
                .collect();
        assert_eq!(mentions, ["@alice", "@bob_smith", "@charlie"]);
    }

    // An `@` at byte 0 has no preceding byte and must still be a mention.
    #[test]
    fn mention_at_start_of_text() {
        let mentions: Vec<String> = extract_entities("@alice says hi")
            .into_iter()
            .filter(|e| matches!(e.kind, EntityKind::Mention))
            .map(|e| e.value)
            .collect();
        assert_eq!(mentions, ["@alice"]);
    }

    // The `@` inside an email address must not also yield a mention.
    #[test]
    fn email_does_not_double_extract_as_mention() {
        let found = extract_entities("ping me@example.com");
        let expected = vec![Entity {
            kind: EntityKind::Email,
            value: "me@example.com".to_string(),
        }];
        assert_eq!(found, expected);
    }

    // A mention and an unrelated email in the same text are both reported.
    #[test]
    fn mention_and_email_can_coexist() {
        let found = extract_entities("@alice emailed bob@example.com today");
        let values_of = |kind: EntityKind| {
            found
                .iter()
                .filter(|e| e.kind == kind)
                .map(|e| e.value.as_str())
                .collect::<Vec<_>>()
        };
        assert_eq!(values_of(EntityKind::Mention), ["@alice"]);
        assert_eq!(values_of(EntityKind::Email), ["bob@example.com"]);
    }

    // A handle mentioned twice is reported once, in first-seen order.
    #[test]
    fn mention_dedupes_within_text() {
        let mentions: Vec<String> =
            extract_entities("@bob said hi, then @bob left, finally @carol")
                .into_iter()
                .filter(|e| matches!(e.kind, EntityKind::Mention))
                .map(|e| e.value)
                .collect();
        assert_eq!(mentions, ["@bob", "@carol"]);
    }

    // `!` is outside the body-char set and simply ends the scan; a trailing
    // `.` or `-` is scanned as part of the body and then removed by
    // `MENTION_TRAILING_TRIM`.
    #[test]
    fn mention_trims_trailing_separators() {
        let mentions: Vec<String> = extract_entities("Ping @charlie! Then @dana. And @eve-")
            .into_iter()
            .filter(|e| matches!(e.kind, EntityKind::Mention))
            .map(|e| e.value)
            .collect();
        assert_eq!(mentions, ["@charlie", "@dana", "@eve"]);
    }

    // A bare `@` and a one-character handle are ignored; two characters is
    // the minimum.
    #[test]
    fn mention_skips_short_or_empty_handles() {
        let mentions: Vec<String> = extract_entities("@ alone, @x too short, but @ab is fine")
            .into_iter()
            .filter(|e| matches!(e.kind, EntityKind::Mention))
            .map(|e| e.value)
            .collect();
        assert_eq!(mentions, ["@ab"]);
    }

    // Conservative: a body without any ASCII letter is left alone, so dates
    // and bare version numbers do not become mentions, while `@v1.2.3` (which
    // has a letter) does.
    #[test]
    fn mention_skips_pure_digit_or_date_handles() {
        let mentions: Vec<String> = extract_entities("see @2024-01-15 or @1.2.3 vs @v1.2.3")
            .into_iter()
            .filter(|e| matches!(e.kind, EntityKind::Mention))
            .map(|e| e.value)
            .collect();
        assert_eq!(mentions, ["@v1.2.3"]);
    }

    // Recording the same URL for two different chunks must create a single
    // `entities` row and two `chunk_entities` links.
    #[test]
    fn record_chunk_entities_dedups_across_chunks() {
        let dir = tempdir().unwrap();
        let mut store = Store::initialize(&dir.path().join("store")).unwrap();
        // Seed a fake source + two chunks so the FK targets exist.
        let conn = store.conn_mut();
        let tx = conn.transaction().unwrap();
        tx.execute(
            "INSERT INTO sources (id, uri, path, kind, bytes, content_sha256, mtime_unix, ingested_at)
             VALUES ('src1', 'mem://t', NULL, 'text/plain', 0, 'deadbeef', NULL, 0)",
            [],
        )
        .unwrap();
        for (id, ord) in [("c1", 0), ("c2", 1)] {
            tx.execute(
                "INSERT INTO chunks (id, source_id, ordinal, byte_start, byte_end, char_count, text, sha256, created_at)
                 VALUES (?1, 'src1', ?2, 0, 0, 0, '', '', 0)",
                params![id, ord],
            )
            .unwrap();
        }
        let shared = vec![Entity {
            kind: EntityKind::Url,
            value: "https://shared.test".into(),
        }];
        // Same entity, two chunks, two different `now` values.
        record_chunk_entities(&tx, "c1", &shared, 100).unwrap();
        record_chunk_entities(&tx, "c2", &shared, 200).unwrap();
        tx.commit().unwrap();

        let entity_count: i64 = store
            .conn()
            .query_row("SELECT COUNT(*) FROM entities", [], |row| row.get(0))
            .unwrap();
        assert_eq!(
            entity_count, 1,
            "shared URL must collapse to a single entity"
        );

        let edges: i64 = store
            .conn()
            .query_row("SELECT COUNT(*) FROM chunk_entities", [], |row| row.get(0))
            .unwrap();
        assert_eq!(edges, 2, "both chunks should link to the shared entity");
    }

    // Test helper: inserts one source row (`list_src`) and one empty chunk per
    // id in `ids`, with ordinals following slice order, in a single
    // transaction. Gives `record_chunk_entities` chunk rows to link against.
    fn seed_chunks(store: &mut Store, ids: &[&str]) {
        let conn = store.conn_mut();
        let tx = conn.transaction().unwrap();
        tx.execute(
            "INSERT INTO sources (id, uri, path, kind, bytes, content_sha256, mtime_unix, ingested_at)
             VALUES ('list_src', 'mem://list', NULL, 'text/plain', 0, 'deadbeef', NULL, 0)",
            [],
        )
        .unwrap();
        for (ord, id) in ids.iter().enumerate() {
            tx.execute(
                "INSERT INTO chunks (id, source_id, ordinal, byte_start, byte_end, char_count, text, sha256, created_at)
                 VALUES (?1, 'list_src', ?2, 0, 0, 0, '', '', 0)",
                params![id, ord as i64],
            )
            .unwrap();
        }
        tx.commit().unwrap();
    }

    // Test helper: records `entities` for `chunk_id` inside its own committed
    // transaction, panicking on any failure.
    fn link(store: &mut Store, chunk_id: &str, entities: &[Entity], now: i64) {
        let conn = store.conn_mut();
        let tx = conn.transaction().unwrap();
        record_chunk_entities(&tx, chunk_id, entities, now).unwrap();
        tx.commit().unwrap();
    }

    // The entity linked from three chunks must be listed before the entity
    // linked from one, even though "lonely" sorts before "popular" by value.
    #[test]
    fn list_entities_orders_by_chunk_count_desc() {
        let dir = tempdir().unwrap();
        let mut store = Store::initialize(&dir.path().join("store")).unwrap();
        seed_chunks(&mut store, &["c1", "c2", "c3"]);
        let popular = Entity {
            kind: EntityKind::Url,
            value: "https://popular.test".into(),
        };
        let lonely = Entity {
            kind: EntityKind::Url,
            value: "https://lonely.test".into(),
        };
        // `popular` is linked from all three chunks, `lonely` only from c1.
        link(&mut store, "c1", &[popular.clone(), lonely.clone()], 100);
        link(&mut store, "c2", &[popular.clone()], 200);
        link(&mut store, "c3", &[popular.clone()], 300);

        let report = list_entities(store.conn(), &EntityListOptions::default()).unwrap();
        assert_eq!(report.total_matched, 2);
        assert_eq!(report.entries.len(), 2);
        assert_eq!(report.entries[0].value, "https://popular.test");
        assert_eq!(report.entries[0].chunk_count, 3);
        assert_eq!(report.entries[1].value, "https://lonely.test");
        assert_eq!(report.entries[1].chunk_count, 1);
    }

    // With one entity of each of three kinds stored, filtering by `Email`
    // must return only the email, in both the rows and the total.
    #[test]
    fn list_entities_filters_by_kind() {
        let dir = tempdir().unwrap();
        let mut store = Store::initialize(&dir.path().join("store")).unwrap();
        seed_chunks(&mut store, &["c1"]);
        link(
            &mut store,
            "c1",
            &[
                Entity {
                    kind: EntityKind::Url,
                    value: "https://x.test".into(),
                },
                Entity {
                    kind: EntityKind::Email,
                    value: "alice@x.test".into(),
                },
                Entity {
                    kind: EntityKind::Mention,
                    value: "@bob".into(),
                },
            ],
            100,
        );

        let opts = EntityListOptions {
            kind: Some(EntityKind::Email),
            ..Default::default()
        };
        let report = list_entities(store.conn(), &opts).unwrap();
        assert_eq!(report.total_matched, 1);
        assert_eq!(report.entries.len(), 1);
        assert_eq!(report.entries[0].kind, EntityKind::Email);
        assert_eq!(report.entries[0].value, "alice@x.test");
    }

    #[test]
    fn list_entities_value_contains_treats_percent_as_literal() {
        // The value filter must treat `%` as an ordinary character so users
        // searching for a literal percent sign do not accidentally turn it
        // into a wildcard.
        let dir = tempdir().unwrap();
        let mut store = Store::initialize(&dir.path().join("store")).unwrap();
        seed_chunks(&mut store, &["c1", "c2"]);
        link(
            &mut store,
            "c1",
            &[Entity {
                kind: EntityKind::Url,
                value: "https://example.test/a%20b".into(),
            }],
            100,
        );
        link(
            &mut store,
            "c2",
            &[Entity {
                kind: EntityKind::Url,
                value: "https://other.test/plain".into(),
            }],
            200,
        );

        // A literal `%` substring should match only the first URL — not act
        // as a wildcard that would also match the second one.
        let opts = EntityListOptions {
            value_contains: Some("%20".into()),
            ..Default::default()
        };
        let report = list_entities(store.conn(), &opts).unwrap();
        assert_eq!(report.total_matched, 1);
        assert_eq!(report.entries[0].value, "https://example.test/a%20b");
    }

    // `limit` caps the returned rows, while `total_matched` still counts
    // every matching entity.
    #[test]
    fn list_entities_limit_truncates_but_total_matched_is_full() {
        let dir = tempdir().unwrap();
        let mut store = Store::initialize(&dir.path().join("store")).unwrap();
        seed_chunks(&mut store, &["c1"]);
        // Five distinct URLs, all linked to the same chunk.
        let many = (0..5)
            .map(|i| Entity {
                kind: EntityKind::Url,
                value: format!("https://e{i}.test"),
            })
            .collect::<Vec<_>>();
        link(&mut store, "c1", &many, 100);

        let opts = EntityListOptions {
            limit: Some(2),
            ..Default::default()
        };
        let report = list_entities(store.conn(), &opts).unwrap();
        assert_eq!(report.entries.len(), 2);
        assert_eq!(report.total_matched, 5);
    }

    // Listing a freshly initialized store succeeds and reports nothing.
    #[test]
    fn list_entities_empty_store_returns_empty_report() {
        let dir = tempdir().unwrap();
        let store = Store::initialize(&dir.path().join("store")).unwrap();
        let EntityListReport {
            entries,
            total_matched,
            ..
        } = list_entities(store.conn(), &EntityListOptions::default()).unwrap();
        assert!(entries.is_empty());
        assert_eq!(total_matched, 0);
    }

    // Every kind survives `as_str` → `entity_kind_from_str`; parsing ignores
    // ASCII case and rejects unknown names.
    #[test]
    fn entity_kind_from_str_round_trips_known_kinds() {
        let all_kinds = [
            EntityKind::Url,
            EntityKind::Email,
            EntityKind::FilePath,
            EntityKind::Mention,
        ];
        for kind in all_kinds {
            let parsed = entity_kind_from_str(kind.as_str()).unwrap();
            assert_eq!(parsed, kind);
        }
        // Case insensitive.
        let upper = entity_kind_from_str("URL").unwrap();
        assert_eq!(upper, EntityKind::Url);
        assert!(entity_kind_from_str("nope").is_err());
    }
}