sdivi_patterns/catalog.rs
1//! [`PatternCatalog`] — per-category pattern fingerprint aggregation.
2
3use std::collections::BTreeMap;
4use std::path::PathBuf;
5
6#[cfg(feature = "pipeline-records")]
7use globset::{Glob, GlobSet, GlobSetBuilder};
8#[cfg(feature = "pipeline-records")]
9use sdivi_config::PatternsConfig;
10use serde::{Deserialize, Serialize};
11
12#[cfg(feature = "pipeline-records")]
13use crate::fingerprint::fingerprint_node_kind;
14use crate::fingerprint::PatternFingerprint;
15#[cfg(feature = "pipeline-records")]
16use crate::queries;
17
18/// The file path and source position of a single pattern instance.
19///
20/// # Examples
21///
22/// ```rust
23/// use std::path::PathBuf;
24/// use sdivi_patterns::catalog::PatternLocation;
25///
26/// let loc = PatternLocation { file: PathBuf::from("src/lib.rs"), start_row: 10, start_col: 4 };
27/// assert_eq!(loc.start_row, 10);
28/// ```
29#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
30pub struct PatternLocation {
31 /// Source file path relative to the repository root.
32 pub file: PathBuf,
33 /// Zero-indexed source row (line) of the pattern instance.
34 pub start_row: usize,
35 /// Zero-indexed source column of the pattern instance.
36 pub start_col: usize,
37}
38
39/// Aggregated statistics for a single pattern fingerprint within one category.
40///
41/// # Examples
42///
43/// ```rust
44/// use sdivi_patterns::catalog::PatternStats;
45///
46/// let stats = PatternStats { count: 3, locations: vec![] };
47/// assert_eq!(stats.count, 3);
48/// ```
49#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
50pub struct PatternStats {
51 /// Total number of instances across all non-excluded files.
52 pub count: u32,
53 /// All source locations where this fingerprint was observed.
54 pub locations: Vec<PatternLocation>,
55}
56
57/// Per-category pattern catalog keyed by [`PatternFingerprint`].
58///
59/// `BTreeMap` ordering guarantees deterministic serialization.
60/// Empty categories are omitted from the map.
61///
62/// # Examples
63///
64/// ```rust
65/// use sdivi_patterns::PatternCatalog;
66///
67/// let catalog = PatternCatalog::default();
68/// assert!(catalog.entries.is_empty());
69/// ```
70#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq)]
71pub struct PatternCatalog {
72 /// Outer key: category name; inner key: pattern fingerprint.
73 pub entries: BTreeMap<String, BTreeMap<PatternFingerprint, PatternStats>>,
74}
75
/// Aggregates the pattern hints of `records` into a [`PatternCatalog`].
///
/// Processing happens in two phases:
///
/// 1. **Collection** — every record whose path does *not* match a glob in
///    `config.scope_exclude` has each of its pattern hints classified. A hint
///    that maps to no category is dropped; otherwise its fingerprint and source
///    location are recorded under every category it was classified into.
///    Excluded files are skipped for pattern collection only; they remain in
///    the graph and partition stages.
/// 2. **Pruning** — fingerprints whose total instance count is below
///    `config.min_pattern_nodes` are removed, and categories left without any
///    fingerprint are removed as well.
///
/// Only available with the `pipeline-records` feature (default ON).
///
/// # Examples
///
/// ```rust
/// use sdivi_config::Config;
/// use sdivi_patterns::catalog::build_catalog;
///
/// let catalog = build_catalog(&[], &Config::default().patterns);
/// assert!(catalog.entries.is_empty());
/// ```
#[cfg(feature = "pipeline-records")]
pub fn build_catalog(
    records: &[sdivi_parsing::feature_record::FeatureRecord],
    config: &PatternsConfig,
) -> PatternCatalog {
    let excluded_paths = build_globset(&config.scope_exclude);
    let mut by_category: BTreeMap<String, BTreeMap<PatternFingerprint, PatternStats>> =
        BTreeMap::new();

    // Phase 1: collect hints from every in-scope record.
    let in_scope = records
        .iter()
        .filter(|record| !is_excluded(&record.path, &excluded_paths));

    for record in in_scope {
        for hint in &record.pattern_hints {
            // M33: classification goes through classify_hint (node kind plus
            // callee text) rather than the node-kind-only lookup. An empty
            // result means the callee is unrecognised, and the hint is
            // silently dropped, matching the previous `None` path.
            let hint_input = crate::hint_input::PatternHintInput {
                node_kind: hint.node_kind.clone(),
                text: hint.text.clone(),
            };
            let categories = queries::classify_hint(&hint_input, &record.language);
            if categories.is_empty() {
                continue;
            }

            let fp = fingerprint_node_kind(&hint.node_kind);
            let location = PatternLocation {
                file: record.path.clone(),
                start_row: hint.start_row,
                start_col: hint.start_col,
            };

            for category in categories {
                let stats = by_category
                    .entry(category.to_string())
                    .or_default()
                    .entry(fp)
                    .or_insert_with(|| PatternStats {
                        count: 0,
                        locations: Vec::new(),
                    });
                stats.count += 1;
                // The location is cloned once per category. The v0 regex
                // tables are disjoint, so this loop runs at most once per hint
                // today; cloning keeps the code correct without rework should
                // a co-occurring category be introduced later.
                stats.locations.push(location.clone());
            }
        }
    }

    // Phase 2: drop under-represented fingerprints, then any category that
    // ends up empty as a result.
    let threshold = config.min_pattern_nodes;
    by_category.retain(|_, fingerprints| {
        fingerprints.retain(|_, stats| stats.count >= threshold);
        !fingerprints.is_empty()
    });

    PatternCatalog {
        entries: by_category,
    }
}
148
/// Compiles the exclusion glob patterns into a single [`GlobSet`].
///
/// Returns `None` when `patterns` is empty or when the set itself fails to
/// build. Individual patterns that do not parse as a valid glob are skipped
/// without any error being reported; the remaining patterns still apply.
#[cfg(feature = "pipeline-records")]
fn build_globset(patterns: &[String]) -> Option<GlobSet> {
    if patterns.is_empty() {
        return None;
    }

    let mut set_builder = GlobSetBuilder::new();
    patterns
        .iter()
        // Best-effort: a malformed pattern is ignored rather than aborting.
        .filter_map(|pattern| Glob::new(pattern).ok())
        .for_each(|glob| {
            set_builder.add(glob);
        });

    set_builder.build().ok()
}
162
/// Reports whether `path` matches the exclusion set.
///
/// With no exclusion set (`None`) nothing is excluded and the result is
/// always `false`.
#[cfg(feature = "pipeline-records")]
fn is_excluded(path: &std::path::Path, exclude_set: &Option<GlobSet>) -> bool {
    matches!(exclude_set, Some(globs) if globs.is_match(path))
}