Skip to main content

sdivi_patterns/
catalog.rs

1//! [`PatternCatalog`] — per-category pattern fingerprint aggregation.
2
3use std::collections::BTreeMap;
4use std::path::PathBuf;
5
6#[cfg(feature = "pipeline-records")]
7use globset::{Glob, GlobSet, GlobSetBuilder};
8#[cfg(feature = "pipeline-records")]
9use sdivi_config::PatternsConfig;
10use serde::{Deserialize, Serialize};
11
12#[cfg(feature = "pipeline-records")]
13use crate::fingerprint::fingerprint_node_kind;
14use crate::fingerprint::PatternFingerprint;
15#[cfg(feature = "pipeline-records")]
16use crate::queries;
17
18/// The file path and source position of a single pattern instance.
19///
20/// # Examples
21///
22/// ```rust
23/// use std::path::PathBuf;
24/// use sdivi_patterns::catalog::PatternLocation;
25///
26/// let loc = PatternLocation { file: PathBuf::from("src/lib.rs"), start_row: 10, start_col: 4 };
27/// assert_eq!(loc.start_row, 10);
28/// ```
29#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
30pub struct PatternLocation {
31    /// Source file path relative to the repository root.
32    pub file: PathBuf,
33    /// Zero-indexed source row (line) of the pattern instance.
34    pub start_row: usize,
35    /// Zero-indexed source column of the pattern instance.
36    pub start_col: usize,
37}
38
39/// Aggregated statistics for a single pattern fingerprint within one category.
40///
41/// # Examples
42///
43/// ```rust
44/// use sdivi_patterns::catalog::PatternStats;
45///
46/// let stats = PatternStats { count: 3, locations: vec![] };
47/// assert_eq!(stats.count, 3);
48/// ```
49#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
50pub struct PatternStats {
51    /// Total number of instances across all non-excluded files.
52    pub count: u32,
53    /// All source locations where this fingerprint was observed.
54    pub locations: Vec<PatternLocation>,
55}
56
57/// Per-category pattern catalog keyed by [`PatternFingerprint`].
58///
59/// `BTreeMap` ordering guarantees deterministic serialization.
60/// Empty categories are omitted from the map.
61///
62/// # Examples
63///
64/// ```rust
65/// use sdivi_patterns::PatternCatalog;
66///
67/// let catalog = PatternCatalog::default();
68/// assert!(catalog.entries.is_empty());
69/// ```
70#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq)]
71pub struct PatternCatalog {
72    /// Outer key: category name; inner key: pattern fingerprint.
73    pub entries: BTreeMap<String, BTreeMap<PatternFingerprint, PatternStats>>,
74}
75
/// Aggregates the pattern hints of parsed source records into a
/// [`PatternCatalog`], honouring the exclusion globs and the minimum-count
/// threshold from the patterns config.
///
/// A record whose path matches one of the `config.scope_exclude` globs
/// contributes nothing to the catalog (it is only skipped here; the graph and
/// partition stages still see it). Once every record has been processed, any
/// fingerprint whose total instance count is lower than
/// `config.min_pattern_nodes` is discarded, and categories left without
/// fingerprints are dropped as well.
///
/// Only available with the `pipeline-records` feature (default ON).
///
/// # Examples
///
/// ```rust
/// use sdivi_config::Config;
/// use sdivi_patterns::catalog::build_catalog;
///
/// let catalog = build_catalog(&[], &Config::default().patterns);
/// assert!(catalog.entries.is_empty());
/// ```
#[cfg(feature = "pipeline-records")]
pub fn build_catalog(
    records: &[sdivi_parsing::feature_record::FeatureRecord],
    config: &PatternsConfig,
) -> PatternCatalog {
    let excluded_paths = build_globset(&config.scope_exclude);
    let mut by_category = BTreeMap::<String, BTreeMap<PatternFingerprint, PatternStats>>::new();

    // Excluded files are filtered out up front so the loop body only ever
    // sees records that take part in pattern collection.
    let in_scope = records
        .iter()
        .filter(|record| !is_excluded(&record.path, &excluded_paths));

    for record in in_scope {
        for hint in &record.pattern_hints {
            // M33: classification goes through classify_hint, which looks at
            // both the node kind and the callee text, rather than the older
            // node-kind-only category_for_node_kind. An unrecognised callee
            // yields no categories, and such a hint is dropped silently —
            // the same outcome as the former `None` path.
            let classify_input = crate::hint_input::PatternHintInput {
                node_kind: hint.node_kind.clone(),
                text: hint.text.clone(),
            };
            let matched = queries::classify_hint(&classify_input, &record.language);
            if matched.is_empty() {
                continue;
            }

            let fingerprint = fingerprint_node_kind(&hint.node_kind);
            let seen_at = PatternLocation {
                file: record.path.clone(),
                start_row: hint.start_row,
                start_col: hint.start_col,
            };

            for category in matched {
                let stats = by_category
                    .entry(category.to_string())
                    .or_default()
                    .entry(fingerprint)
                    .or_insert_with(|| PatternStats {
                        count: 0,
                        locations: Vec::new(),
                    });
                stats.count += 1;
                // The location is cloned once per matched category. The v0
                // regex tables are disjoint, so in practice this loop runs at
                // most once per hint and the clone is on the cold path;
                // keeping it avoids an optimization that would have to be
                // reverted as soon as two categories can co-occur.
                stats.locations.push(seen_at.clone());
            }
        }
    }

    // Prune in a single pass: drop fingerprints under the threshold, then
    // drop the category itself if nothing survived.
    let threshold = config.min_pattern_nodes;
    by_category.retain(|_, fingerprints| {
        fingerprints.retain(|_, stats| stats.count >= threshold);
        !fingerprints.is_empty()
    });

    PatternCatalog {
        entries: by_category,
    }
}
148
/// Compiles the given glob pattern strings into a single [`GlobSet`].
///
/// Returns `None` when `patterns` is empty, or when the final
/// `GlobSetBuilder::build` call fails. A pattern that `Glob::new` rejects is
/// skipped and the remaining patterns are still compiled.
///
/// NOTE(review): rejected patterns and build failures are not reported
/// anywhere, so a typo in an exclude glob silently excludes nothing.
#[cfg(feature = "pipeline-records")]
fn build_globset(patterns: &[String]) -> Option<GlobSet> {
    if patterns.is_empty() {
        return None;
    }

    let mut set_builder = GlobSetBuilder::new();
    // `filter_map` + `ok()` keeps only the patterns that parsed successfully.
    for compiled in patterns.iter().filter_map(|raw| Glob::new(raw).ok()) {
        set_builder.add(compiled);
    }
    set_builder.build().ok()
}
162
/// Reports whether `path` matches the exclusion set.
///
/// With no set at all (`None`) nothing is excluded and the result is `false`;
/// otherwise the result is whatever `GlobSet::is_match` returns for `path`.
#[cfg(feature = "pipeline-records")]
fn is_excluded(path: &std::path::Path, exclude_set: &Option<GlobSet>) -> bool {
    matches!(exclude_set, Some(globs) if globs.is_match(path))
}
169}