Skip to main content

bv_builder/
popularity.rs

1use std::collections::HashMap;
2use std::path::Path;
3
4use anyhow::Context;
5use serde::{Deserialize, Serialize};
6use walkdir::WalkDir;
7
8/// Co-occurrence scores for conda packages across the full tool registry.
9///
10/// Key: package name. Value: how many tools in the registry include it.
11/// Higher score = more popular = deserves its own OCI layer.
12///
13/// Scores are computed by `bv-builder pack` from the registry's `specs/` tree
14/// and committed as `popularity.json`. Each per-tool build reads this file and
15/// uses it to decide which packages get solo layers vs. the long-tail layer.
16///
17/// Stability guarantee: scores are keyed by package NAME only, not version.
18/// A new version of an already-popular package (e.g. Python 3.11.6 replacing
19/// 3.11.5) inherits the same popularity score and therefore the same layer
20/// priority, which means it still gets a solo layer, just with a different
21/// digest. This bounds layer-order churn when popular packages are upgraded.
22#[derive(Debug, Clone, Serialize, Deserialize, Default)]
23pub struct PopularityMap {
24    pub version: u32,
25    /// Package name → co-occurrence count (number of tools that list it).
26    pub packages: HashMap<String, u64>,
27}
28
29impl PopularityMap {
30    pub fn new() -> Self {
31        Self {
32            version: 1,
33            packages: HashMap::new(),
34        }
35    }
36
37    pub fn load(path: &Path) -> anyhow::Result<Self> {
38        let s = std::fs::read_to_string(path)
39            .with_context(|| format!("read popularity map '{}'", path.display()))?;
40        serde_json::from_str(&s)
41            .with_context(|| format!("parse popularity map '{}'", path.display()))
42    }
43
44    pub fn save(&self, path: &Path) -> anyhow::Result<()> {
45        let json = serde_json::to_string_pretty(self)?;
46        std::fs::write(path, &json)?;
47        Ok(())
48    }
49
50    /// Popularity score for a package, defaulting to 0 for unknowns.
51    pub fn score(&self, package_name: &str) -> u64 {
52        self.packages.get(package_name).copied().unwrap_or(0)
53    }
54
55    /// Record that all `package_names` appear together in one tool.
56    pub fn record_tool(&mut self, package_names: &[String]) {
57        for name in package_names {
58            *self.packages.entry(name.clone()).or_insert(0) += 1;
59        }
60    }
61}
62
63/// Compute a popularity map from all tool spec directories under `specs_root`.
64///
65/// Walks `specs_root/**/*.toml`, parses each as a `BuildSpec`, and counts
66/// how many specs declare each package name. The resulting map is sorted
67/// deterministically (by name) inside `save()` via serde_json's map ordering.
68pub fn compute_from_spec_dir(specs_root: &Path) -> anyhow::Result<PopularityMap> {
69    let mut map = PopularityMap::new();
70
71    for entry in WalkDir::new(specs_root).into_iter().filter_map(Result::ok) {
72        let path = entry.path();
73        if !entry.file_type().is_file() || path.extension().and_then(|e| e.to_str()) != Some("toml")
74        {
75            continue;
76        }
77
78        let s = std::fs::read_to_string(path)
79            .with_context(|| format!("read spec '{}'", path.display()))?;
80
81        let raw: toml::Value =
82            toml::from_str(&s).with_context(|| format!("parse spec '{}'", path.display()))?;
83
84        let names = extract_package_names(&raw);
85        map.record_tool(&names);
86    }
87
88    Ok(map)
89}
90
91/// Extract the bare package names from a parsed spec TOML value.
92fn extract_package_names(spec: &toml::Value) -> Vec<String> {
93    let Some(pkgs) = spec.get("packages").and_then(|v| v.as_array()) else {
94        return vec![];
95    };
96
97    pkgs.iter()
98        .filter_map(|v| v.as_str())
99        .map(|s| {
100            // Strip version constraint: "samtools ==1.19.2" -> "samtools"
101            s.split_whitespace().next().unwrap_or(s).to_string()
102        })
103        .collect()
104}
105
#[cfg(test)]
mod tests {
    use super::*;

    /// Two tools sharing one package: the shared package scores 2, the
    /// others 1, and a name never recorded scores 0.
    #[test]
    fn record_tool_increments_counts() {
        let tools: [&[&str]; 2] = [&["openssl", "zlib"], &["openssl", "samtools"]];

        let mut popularity = PopularityMap::new();
        for tool in tools {
            let names: Vec<String> = tool.iter().map(|name| name.to_string()).collect();
            popularity.record_tool(&names);
        }

        let expected = [("openssl", 2), ("zlib", 1), ("samtools", 1), ("unknown", 0)];
        for (name, count) in expected {
            assert_eq!(popularity.score(name), count);
        }
    }

    /// Whitespace-separated version constraints are dropped; bare names pass through.
    #[test]
    fn extract_names_strips_version_constraints() {
        let parsed: toml::Value =
            toml::from_str(r#"packages = ["samtools ==1.19.2", "openssl", "bwa >=0.7"]"#)
                .unwrap();

        assert_eq!(
            extract_package_names(&parsed),
            ["samtools", "openssl", "bwa"]
        );
    }

    /// A map written with `save` is read back by `load` with the same scores.
    #[test]
    fn save_and_load_round_trips() {
        let tmp = tempfile::tempdir().unwrap();
        let file = tmp.path().join("popularity.json");

        let mut written = PopularityMap::new();
        written.record_tool(&["openssl".to_string(), "zlib".to_string()]);
        written.save(&file).unwrap();

        let restored = PopularityMap::load(&file).unwrap();
        for name in ["openssl", "zlib"] {
            assert_eq!(restored.score(name), 1);
        }
    }
}