Skip to main content

sdivi_patterns/
catalog.rs

//! [`PatternCatalog`] — per-category pattern fingerprint aggregation.
2
3use std::collections::BTreeMap;
4use std::path::PathBuf;
5
6use globset::{Glob, GlobSet, GlobSetBuilder};
7use sdivi_config::PatternsConfig;
8use serde::{Deserialize, Serialize};
9
10use crate::fingerprint::{fingerprint_node_kind, PatternFingerprint};
11use crate::queries;
12
13/// The file path and source position of a single pattern instance.
14///
15/// # Examples
16///
17/// ```rust
18/// use std::path::PathBuf;
19/// use sdivi_patterns::catalog::PatternLocation;
20///
21/// let loc = PatternLocation { file: PathBuf::from("src/lib.rs"), start_row: 10, start_col: 4 };
22/// assert_eq!(loc.start_row, 10);
23/// ```
24#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
25pub struct PatternLocation {
26    /// Source file path relative to the repository root.
27    pub file: PathBuf,
28    /// Zero-indexed source row (line) of the pattern instance.
29    pub start_row: usize,
30    /// Zero-indexed source column of the pattern instance.
31    pub start_col: usize,
32}
33
34/// Aggregated statistics for a single pattern fingerprint within one category.
35///
36/// # Examples
37///
38/// ```rust
39/// use sdivi_patterns::catalog::PatternStats;
40///
41/// let stats = PatternStats { count: 3, locations: vec![] };
42/// assert_eq!(stats.count, 3);
43/// ```
44#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
45pub struct PatternStats {
46    /// Total number of instances across all non-excluded files.
47    pub count: u32,
48    /// All source locations where this fingerprint was observed.
49    pub locations: Vec<PatternLocation>,
50}
51
52/// Per-category pattern catalog keyed by [`PatternFingerprint`].
53///
54/// `BTreeMap` ordering guarantees deterministic serialization.
55/// Empty categories are omitted from the map.
56///
57/// # Examples
58///
59/// ```rust
60/// use sdivi_patterns::PatternCatalog;
61///
62/// let catalog = PatternCatalog::default();
63/// assert!(catalog.entries.is_empty());
64/// ```
65#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq)]
66pub struct PatternCatalog {
67    /// Outer key: category name; inner key: pattern fingerprint.
68    pub entries: BTreeMap<String, BTreeMap<PatternFingerprint, PatternStats>>,
69}
70
/// Aggregates the pattern hints of parsed source records into a [`PatternCatalog`].
///
/// Records whose path matches a glob in `config.scope_exclude` contribute
/// nothing to the catalog (they are only skipped here, for pattern collection;
/// they remain in the graph and partition stages). Hints whose node kind has no
/// category for the record's language are ignored. Once every record has been
/// processed, fingerprints with fewer than `config.min_pattern_nodes` instances
/// are dropped, and so is any category left without fingerprints.
///
/// Only available with the `pipeline-records` feature (default ON).
///
/// # Examples
///
/// ```rust
/// use sdivi_config::Config;
/// use sdivi_patterns::catalog::build_catalog;
///
/// let catalog = build_catalog(&[], &Config::default().patterns);
/// assert!(catalog.entries.is_empty());
/// ```
#[cfg(feature = "pipeline-records")]
pub fn build_catalog(
    records: &[sdivi_parsing::feature_record::FeatureRecord],
    config: &PatternsConfig,
) -> PatternCatalog {
    let excluded_paths = build_globset(&config.scope_exclude);
    let mut entries = BTreeMap::<String, BTreeMap<PatternFingerprint, PatternStats>>::new();

    let in_scope = records
        .iter()
        .filter(|record| !is_excluded(&record.path, &excluded_paths));

    for record in in_scope {
        for hint in &record.pattern_hints {
            let category =
                match queries::category_for_node_kind(&hint.node_kind, &record.language) {
                    Some(category) => category,
                    // No category for this node kind in this language: not a tracked pattern.
                    None => continue,
                };
            let fingerprint = fingerprint_node_kind(&hint.node_kind);

            // Find or create the per-category map, then the per-fingerprint stats.
            let stats = entries
                .entry(category.to_string())
                .or_default()
                .entry(fingerprint)
                .or_insert_with(|| PatternStats {
                    count: 0,
                    locations: Vec::new(),
                });
            stats.count += 1;
            stats.locations.push(PatternLocation {
                file: record.path.clone(),
                start_row: hint.start_row,
                start_col: hint.start_col,
            });
        }
    }

    // Drop fingerprints below the threshold, then any category left with none.
    let min_instances = config.min_pattern_nodes;
    entries.retain(|_, by_fingerprint| {
        by_fingerprint.retain(|_, stats| stats.count >= min_instances);
        !by_fingerprint.is_empty()
    });

    PatternCatalog { entries }
}
130
/// Compiles the exclusion glob patterns into a [`GlobSet`].
///
/// Returns `None` when `patterns` is empty or when the set fails to build.
/// Patterns that do not parse as a glob are skipped without reporting an error,
/// so a list consisting only of invalid patterns yields a set with no globs.
#[cfg(feature = "pipeline-records")]
fn build_globset(patterns: &[String]) -> Option<GlobSet> {
    if patterns.is_empty() {
        return None;
    }

    let mut builder = GlobSetBuilder::new();
    patterns
        .iter()
        .filter_map(|pattern| Glob::new(pattern).ok())
        .for_each(|glob| {
            builder.add(glob);
        });
    builder.build().ok()
}
144
/// Reports whether `path` matches the optional exclusion set.
///
/// When `exclude_set` is `None`, no path is excluded.
#[cfg(feature = "pipeline-records")]
fn is_excluded(path: &std::path::Path, exclude_set: &Option<GlobSet>) -> bool {
    matches!(exclude_set, Some(globs) if globs.is_match(path))
}
151}