veles-core 0.1.0

//! Query-type boosting — definition boosts, multi-chunk file boosts, stem matches.
//!
//! All scoring is index-keyed: a slice `scores[idx]` holds the running score
//! for `chunks[idx]`. A score of `0.0` means "not in the candidate pool" — the
//! invariant is preserved because RRF scores are strictly positive for ranked
//! entries.

use ahash::AHashMap;
use ahash::AHashSet;
use regex::Regex;
use std::path::Path;
use std::sync::LazyLock;

use crate::tokenizer::split_identifier;
use crate::types::Chunk;

// ── Symbol query detection ────────────────────────────────────────────────

static SYMBOL_QUERY_RE: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r"^(?:\
        [A-Za-z_][A-Za-z0-9_]*(?:(?:::|\\|->|\.)[A-Za-z_][A-Za-z0-9_]*)+\
        |_[A-Za-z0-9_]*\
        |[A-Za-z][A-Za-z0-9]*[A-Z_][A-Za-z0-9_]*\
        |[A-Z][A-Za-z0-9]*\
        )$",
    )
    .unwrap()
});

/// embedded CamelCase identifiers in NL queries.
static EMBEDDED_SYMBOL_RE: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r"\b(?:\
        [A-Z][a-z][a-zA-Z0-9]*[A-Z][a-zA-Z0-9]*\
        |[a-z][a-zA-Z0-9]*[A-Z][a-zA-Z0-9]+\
        )\b",
    )
    .unwrap()
});

/// Token regex for natural-language stem boosts. Unicode-aware (matches
/// Cyrillic / CJK / Greek / Arabic etc. in addition to ASCII).
static TOKEN_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"[\p{L}_][\p{L}\p{N}_]*").unwrap());

/// Minimum stem length for prefix-based non-candidate scan.
const EMBEDDED_STEM_MIN_LEN: usize = 4;
/// Half-strength for embedded symbols.
const EMBEDDED_SYMBOL_BOOST_SCALE: f64 = 0.5;

/// Definition keywords across common languages.
const DEFINITION_KEYWORDS: &[&str] = &[
    "class", "module", "defmodule", "def", "interface", "struct", "enum",
    "trait", "type", "func", "function", "object", "abstract class",
    "data class", "fn", "fun", "package", "namespace", "protocol",
    "record", "typedef",
];

/// SQL DDL keywords.
const SQL_DEFINITION_KEYWORDS: &[&str] = &[
    "CREATE TABLE", "CREATE VIEW", "CREATE PROCEDURE", "CREATE FUNCTION",
];

/// Additive boost multiplier for chunks that define a queried symbol.
const DEFINITION_BOOST_MULTIPLIER: f64 = 3.0;
/// Additive boost multiplier for NL queries when file stems match.
const STEM_BOOST_MULTIPLIER: f64 = 1.0;
/// Fraction of max_score added for file coherence.
const FILE_COHERENCE_BOOST_FRAC: f64 = 0.2;

/// Common English stopwords.
static STOPWORDS: LazyLock<AHashSet<&'static str>> = LazyLock::new(|| {
    "a an and are as at be by do does for from has have how if in is it not of on or the to was \
     what when where which who why with"
        .split_whitespace()
        .collect()
});

// ── Public API ────────────────────────────────────────────────────────────

/// Return true if the query looks like a bare symbol or namespace-qualified identifier.
pub fn is_symbol_query(query: &str) -> bool {
    SYMBOL_QUERY_RE.is_match(query.trim())
}

/// Resolve the blending weight for semantic scores, auto-detecting from query type.
pub fn resolve_alpha(query: &str, alpha: Option<f64>) -> f64 {
    match alpha {
        Some(a) => a,
        None => {
            if is_symbol_query(query) {
                0.3 // lean BM25 for exact keyword matching
            } else {
                0.5 // balanced semantic + BM25
            }
        }
    }
}

/// Apply query-type boosts to candidate scores in place.
///
/// `scores[i]` is the running score for `chunks[i]`; entries with score `0.0`
/// are treated as non-candidates (boosts may still add them).
pub fn apply_query_boost(scores: &mut [f64], query: &str, chunks: &[Chunk]) {
    if scores.is_empty() {
        return;
    }
    let max_score = current_max(scores);
    if !(max_score > 0.0) {
        return;
    }

    if is_symbol_query(query) {
        boost_symbol_definitions(scores, query, max_score, chunks);
    } else {
        boost_stem_matches(scores, query, max_score, chunks);
        boost_embedded_symbols(scores, query, max_score, chunks);
    }
}

/// Promote files with multiple high-scoring chunks by boosting their top chunk in place.
pub fn boost_multi_chunk_files(scores: &mut [f64], chunks: &[Chunk]) {
    let max_score = current_max(scores);
    if !(max_score > 0.0) {
        return;
    }

    // Aggregate per-file totals and best chunk index in a single pass over candidates.
    let mut file_sum: AHashMap<&str, f64> = AHashMap::new();
    let mut best_idx: AHashMap<&str, (usize, f64)> = AHashMap::new();

    for (i, &score) in scores.iter().enumerate() {
        if !(score > 0.0) {
            continue;
        }
        let fp = chunks[i].file_path.as_str();
        *file_sum.entry(fp).or_insert(0.0) += score;
        let entry = best_idx.entry(fp).or_insert((i, f64::NEG_INFINITY));
        if score > entry.1 {
            *entry = (i, score);
        }
    }

    let max_file_sum = file_sum.values().copied().fold(f64::NEG_INFINITY, f64::max);
    if !(max_file_sum > 0.0) {
        return;
    }
    let boost_unit = max_score * FILE_COHERENCE_BOOST_FRAC;

    for (fp, (idx, _)) in &best_idx {
        let sum = *file_sum.get(*fp).unwrap_or(&0.0);
        scores[*idx] += boost_unit * sum / max_file_sum;
    }
}

// ── Internal helpers ──────────────────────────────────────────────────────

fn current_max(scores: &[f64]) -> f64 {
    scores.iter().copied().fold(f64::NEG_INFINITY, f64::max)
}

fn extract_symbol_name(query: &str) -> String {
    let query = query.trim();
    for separator in ["::", "\\", "->", "."] {
        if query.contains(separator) {
            return query.rsplit(separator).next().unwrap_or(query).to_string();
        }
    }
    query.to_string()
}

fn definition_pattern(symbol_name: &str) -> (Regex, Regex) {
    let escaped = regex::escape(symbol_name);
    let ns_prefix = r"(?:[A-Za-z_][A-Za-z0-9_]*(?:\.|::))*";
    let suffix = format!(r")\s+{ns_prefix}{escaped}(?:\s|[<({{\[:;]|$)");

    let kw_body: String = DEFINITION_KEYWORDS
        .iter()
        .map(|k| regex::escape(k))
        .collect::<Vec<_>>()
        .join("|");
    let sql_body: String = SQL_DEFINITION_KEYWORDS
        .iter()
        .map(|k| regex::escape(k))
        .collect::<Vec<_>>()
        .join("|");

    let general = Regex::new(&format!(r"(?:^|(?<=\s))(?:{kw_body}{suffix}")).unwrap();
    let sql = Regex::new(&format!(r"(?i)(?:^|(?<=\s))(?:{sql_body}{suffix}")).unwrap();
    (general, sql)
}

/// A bundle of definition patterns for a set of symbol names.
struct DefinitionMatchers {
    patterns: Vec<(Regex, Regex)>,
    names: Vec<String>,
}

impl DefinitionMatchers {
    fn for_names<I: IntoIterator<Item = String>>(names: I) -> Self {
        let names: Vec<String> = names.into_iter().collect();
        let patterns = names.iter().map(|n| definition_pattern(n)).collect();
        Self { patterns, names }
    }

    fn defines_any(&self, content: &str) -> bool {
        self.patterns
            .iter()
            .any(|(g, s)| g.is_match(content) || s.is_match(content))
    }
}

fn stem_matches(stem: &str, name: &str) -> bool {
    let stem_norm = stem.replace('_', "");
    stem == name
        || stem_norm == name
        || stem.trim_end_matches('s') == name
        || stem_norm.trim_end_matches('s') == name
}

fn definition_tier(chunk: &Chunk, matchers: &DefinitionMatchers, boost_unit: f64) -> f64 {
    if !matchers.defines_any(&chunk.content) {
        return 0.0;
    }
    let stem = file_stem_lower(&chunk.file_path);
    if matchers
        .names
        .iter()
        .any(|n| stem_matches(&stem, &n.to_lowercase()))
    {
        boost_unit * 1.5
    } else {
        boost_unit
    }
}

fn file_stem_lower(file_path: &str) -> String {
    Path::new(file_path)
        .file_stem()
        .and_then(|s| s.to_str())
        .unwrap_or("")
        .to_lowercase()
}

fn boost_symbol_definitions(
    scores: &mut [f64],
    query: &str,
    max_score: f64,
    chunks: &[Chunk],
) {
    let symbol_name = extract_symbol_name(query);
    let trimmed = query.trim().to_string();

    let mut names = vec![symbol_name.clone()];
    if symbol_name != trimmed {
        names.push(trimmed);
    }
    let matchers = DefinitionMatchers::for_names(names);

    let boost_unit = max_score * DEFINITION_BOOST_MULTIPLIER;
    let symbol_lower = symbol_name.to_lowercase();

    for (i, chunk) in chunks.iter().enumerate() {
        let in_pool = scores[i] > 0.0;
        if in_pool {
            let tier = definition_tier(chunk, &matchers, boost_unit);
            if tier > 0.0 {
                scores[i] += tier;
            }
        } else {
            // Non-candidate scan: only consider chunks whose file stem matches the symbol.
            let stem = file_stem_lower(&chunk.file_path);
            if !stem_matches(&stem, &symbol_lower) {
                continue;
            }
            let tier = definition_tier(chunk, &matchers, boost_unit);
            if tier > 0.0 {
                scores[i] += tier;
            }
        }
    }
}

fn boost_embedded_symbols(
    scores: &mut [f64],
    query: &str,
    max_score: f64,
    chunks: &[Chunk],
) {
    let names: Vec<String> = EMBEDDED_SYMBOL_RE
        .find_iter(query)
        .map(|m| m.as_str().to_string())
        .collect();
    if names.is_empty() {
        return;
    }

    let matchers = DefinitionMatchers::for_names(names.clone());
    let boost_unit = max_score * DEFINITION_BOOST_MULTIPLIER * EMBEDDED_SYMBOL_BOOST_SCALE;
    let symbols_lower: Vec<String> = names.iter().map(|s| s.to_lowercase()).collect();

    for (i, chunk) in chunks.iter().enumerate() {
        let in_pool = scores[i] > 0.0;
        if in_pool {
            let tier = definition_tier(chunk, &matchers, boost_unit);
            if tier > 0.0 {
                scores[i] += tier;
            }
        } else {
            // Non-candidate scan.
            let stem = file_stem_lower(&chunk.file_path);
            let stem_norm = stem.replace('_', "");
            let matches = symbols_lower.iter().any(|sym_lower| {
                stem == *sym_lower
                    || stem_norm == *sym_lower
                    || (stem.len() >= EMBEDDED_STEM_MIN_LEN && sym_lower.starts_with(&stem))
                    || (stem_norm.len() >= EMBEDDED_STEM_MIN_LEN
                        && sym_lower.starts_with(&stem_norm))
            });
            if !matches {
                continue;
            }
            let tier = definition_tier(chunk, &matchers, boost_unit);
            if tier > 0.0 {
                scores[i] += tier;
            }
        }
    }
}

fn boost_stem_matches(scores: &mut [f64], query: &str, max_score: f64, chunks: &[Chunk]) {
    let keywords: AHashSet<String> = TOKEN_RE
        .find_iter(query)
        .map(|m| m.as_str().to_lowercase())
        // Use char count (not byte length) so a 2-letter Cyrillic word like
        // "ии" is correctly treated as too short, while a 3-letter Cyrillic
        // word like "код" passes. STOPWORDS only catches English fillers; for
        // non-English words it's a no-op (none of them match).
        .filter(|w| w.chars().count() > 2 && !STOPWORDS.contains(w.as_str()))
        .collect();

    if keywords.is_empty() {
        return;
    }

    let boost = max_score * STEM_BOOST_MULTIPLIER;
    // Cache path-parts per file_path so we don't re-split for every chunk in the same file.
    let mut path_cache: AHashMap<&str, AHashSet<String>> = AHashMap::new();

    for (i, chunk) in chunks.iter().enumerate() {
        if !(scores[i] > 0.0) {
            continue;
        }
        let parts = path_cache
            .entry(chunk.file_path.as_str())
            .or_insert_with(|| build_path_parts(&chunk.file_path));

        let n_matches = count_keyword_matches(&keywords, parts);
        if n_matches > 0 {
            let match_ratio = n_matches as f64 / keywords.len() as f64;
            if match_ratio >= 0.10 {
                scores[i] += boost * match_ratio;
            }
        }
    }
}

fn build_path_parts(file_path: &str) -> AHashSet<String> {
    let path = Path::new(file_path);
    let mut parts: AHashSet<String> = AHashSet::new();
    if let Some(stem) = path.file_stem().and_then(|s| s.to_str()) {
        parts.extend(split_identifier(stem));
    }
    if let Some(parent_name) = path
        .parent()
        .and_then(|p| p.file_name())
        .and_then(|n| n.to_str())
    {
        if ![".", "/", ".."].contains(&parent_name) {
            parts.extend(split_identifier(parent_name));
        }
    }
    parts
}

fn count_keyword_matches(keywords: &AHashSet<String>, parts: &AHashSet<String>) -> usize {
    let mut n_matches = 0usize;
    let mut residual: Vec<&String> = Vec::with_capacity(keywords.len());
    for kw in keywords {
        if parts.contains(kw) {
            n_matches += 1;
        } else {
            residual.push(kw);
        }
    }
    if residual.is_empty() {
        return n_matches;
    }
    for keyword in residual {
        for part in parts {
            let (shorter, longer) = if keyword.len() <= part.len() {
                (keyword.as_str(), part.as_str())
            } else {
                (part.as_str(), keyword.as_str())
            };
            if shorter.len() >= 3 && longer.starts_with(shorter) {
                n_matches += 1;
                break;
            }
        }
    }
    n_matches
}