sdivi-patterns 0.2.23

Pattern fingerprinting and catalog for sdivi-rust
Documentation
//! [`PatternCatalog`] — per-category pattern fingerprint aggregation.

use std::collections::BTreeMap;
use std::path::PathBuf;

#[cfg(feature = "pipeline-records")]
use globset::{Glob, GlobSet, GlobSetBuilder};
#[cfg(feature = "pipeline-records")]
use sdivi_config::PatternsConfig;
use serde::{Deserialize, Serialize};

#[cfg(feature = "pipeline-records")]
use crate::fingerprint::fingerprint_node_kind;
use crate::fingerprint::PatternFingerprint;
#[cfg(feature = "pipeline-records")]
use crate::queries;

/// The file path and source position of a single pattern instance.
///
/// `build_catalog` creates one value per pattern hint that maps to at least
/// one category, copying the owning record's path and the hint's
/// `start_row` / `start_col` unchanged, and stores a clone under every
/// category the hint was classified into.
///
/// # Examples
///
/// ```rust
/// use std::path::PathBuf;
/// use sdivi_patterns::catalog::PatternLocation;
///
/// let loc = PatternLocation { file: PathBuf::from("src/lib.rs"), start_row: 10, start_col: 4 };
/// assert_eq!(loc.start_row, 10);
/// ```
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct PatternLocation {
    /// Source file path relative to the repository root.
    pub file: PathBuf,
    /// Zero-indexed source row (line) of the pattern instance.
    pub start_row: usize,
    /// Zero-indexed source column of the pattern instance.
    pub start_col: usize,
}

/// Aggregated statistics for a single pattern fingerprint within one category.
///
/// `build_catalog` bumps `count` and appends to `locations` in the same step,
/// so catalogs it produces satisfy `count as usize == locations.len()`. The
/// type itself does not enforce that relationship: the fields are public and
/// may be set independently, as the example below does.
///
/// # Examples
///
/// ```rust
/// use sdivi_patterns::catalog::PatternStats;
///
/// let stats = PatternStats { count: 3, locations: vec![] };
/// assert_eq!(stats.count, 3);
/// ```
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct PatternStats {
    /// Total number of instances across all non-excluded files.
    pub count: u32,
    /// All source locations where this fingerprint was observed.
    pub locations: Vec<PatternLocation>,
}

/// Per-category pattern catalog keyed by [`PatternFingerprint`].
///
/// `BTreeMap` ordering guarantees deterministic serialization.
/// Catalogs produced by `build_catalog` never contain an empty category:
/// any category whose fingerprints were all filtered out is removed before
/// the catalog is returned. The type itself does not enforce this, so a
/// hand-built or deserialized catalog may still hold empty inner maps.
///
/// # Examples
///
/// ```rust
/// use sdivi_patterns::PatternCatalog;
///
/// let catalog = PatternCatalog::default();
/// assert!(catalog.entries.is_empty());
/// ```
#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq)]
pub struct PatternCatalog {
    /// Outer key: category name; inner key: pattern fingerprint.
    pub entries: BTreeMap<String, BTreeMap<PatternFingerprint, PatternStats>>,
}

/// Aggregates the pattern hints of parsed source records into a [`PatternCatalog`].
///
/// Records whose path matches any glob in `config.scope_exclude` contribute
/// nothing to the catalog; the exclusion only affects pattern collection, and
/// `records` itself is left untouched. Each remaining hint is classified by
/// node kind and callee text, and hints that map to no category are ignored.
/// Once every record has been processed, fingerprints observed fewer than
/// `config.min_pattern_nodes` times are discarded, followed by any category
/// that ends up with no fingerprints at all.
///
/// Only available with the `pipeline-records` feature (default ON).
///
/// # Examples
///
/// ```rust
/// use sdivi_config::Config;
/// use sdivi_patterns::catalog::build_catalog;
///
/// let catalog = build_catalog(&[], &Config::default().patterns);
/// assert!(catalog.entries.is_empty());
/// ```
#[cfg(feature = "pipeline-records")]
pub fn build_catalog(
    records: &[sdivi_parsing::feature_record::FeatureRecord],
    config: &PatternsConfig,
) -> PatternCatalog {
    let excluded = build_globset(&config.scope_exclude);
    let mut catalog = PatternCatalog::default();

    let in_scope = records
        .iter()
        .filter(|record| !is_excluded(&record.path, &excluded));

    for record in in_scope {
        for hint in &record.pattern_hints {
            // M33: classification looks at node kind *and* callee text. An
            // empty result means the callee is unrecognised, and the hint is
            // dropped without further processing.
            let input = crate::hint_input::PatternHintInput {
                node_kind: hint.node_kind.clone(),
                text: hint.text.clone(),
            };
            let categories = queries::classify_hint(&input, &record.language);
            if categories.is_empty() {
                continue;
            }

            let fingerprint = fingerprint_node_kind(&hint.node_kind);
            let site = PatternLocation {
                file: record.path.clone(),
                start_row: hint.start_row,
                start_col: hint.start_col,
            };

            for category in categories {
                let stats = catalog
                    .entries
                    .entry(category.to_string())
                    .or_default()
                    .entry(fingerprint)
                    .or_insert_with(|| PatternStats {
                        count: 0,
                        locations: Vec::new(),
                    });
                stats.count += 1;
                // One clone per (hint, category) pair. The v0 regex tables are
                // disjoint, so this loop body runs at most once per hint today;
                // cloning keeps the code correct if categories ever overlap.
                stats.locations.push(site.clone());
            }
        }
    }

    // Prune in one pass: first drop under-threshold fingerprints inside each
    // category, then drop the category itself if nothing survived.
    let threshold = config.min_pattern_nodes;
    catalog.entries.retain(|_, fingerprints| {
        fingerprints.retain(|_, stats| stats.count >= threshold);
        !fingerprints.is_empty()
    });

    catalog
}

/// Compiles the exclusion glob patterns into a single matcher.
///
/// Returns `None` when `patterns` is empty, or when the builder fails to
/// produce a set. Patterns that do not parse as a glob are skipped rather than
/// treated as an error, so the remaining valid patterns still take effect.
#[cfg(feature = "pipeline-records")]
fn build_globset(patterns: &[String]) -> Option<GlobSet> {
    if patterns.is_empty() {
        return None;
    }

    let mut set_builder = GlobSetBuilder::new();
    let parsed = patterns.iter().filter_map(|raw| Glob::new(raw).ok());
    for glob in parsed {
        set_builder.add(glob);
    }
    set_builder.build().ok()
}

/// Reports whether `path` matches the exclusion glob set.
///
/// Always `false` when `exclude_set` is `None`, i.e. no exclusion set is in
/// effect.
#[cfg(feature = "pipeline-records")]
fn is_excluded(path: &std::path::Path, exclude_set: &Option<GlobSet>) -> bool {
    matches!(exclude_set, Some(globs) if globs.is_match(path))
}