Skip to main content

bv_builder/
popularity.rs

1use std::collections::HashMap;
2use std::path::Path;
3
4use anyhow::Context;
5use serde::{Deserialize, Serialize};
6
7/// Co-occurrence scores for conda packages across the full tool registry.
8///
9/// Key: package name. Value: how many tools in the registry include it.
10/// Higher score = more popular = deserves its own OCI layer.
11///
12/// Scores are computed by `bv-builder pack` from the registry's `specs/` tree
13/// and committed as `popularity.json`. Each per-tool build reads this file and
14/// uses it to decide which packages get solo layers vs. the long-tail layer.
15///
16/// Stability guarantee: scores are keyed by package NAME only, not version.
17/// A new version of an already-popular package (e.g. Python 3.11.6 replacing
18/// 3.11.5) inherits the same popularity score and therefore the same layer
19/// priority — which means it still gets a solo layer, just with a different
20/// digest. This bounds layer-order churn when popular packages are upgraded.
21#[derive(Debug, Clone, Serialize, Deserialize, Default)]
22pub struct PopularityMap {
23    pub version: u32,
24    /// Package name → co-occurrence count (number of tools that list it).
25    pub packages: HashMap<String, u64>,
26}
27
28impl PopularityMap {
29    pub fn new() -> Self {
30        Self {
31            version: 1,
32            packages: HashMap::new(),
33        }
34    }
35
36    pub fn load(path: &Path) -> anyhow::Result<Self> {
37        let s = std::fs::read_to_string(path)
38            .with_context(|| format!("read popularity map '{}'", path.display()))?;
39        serde_json::from_str(&s)
40            .with_context(|| format!("parse popularity map '{}'", path.display()))
41    }
42
43    pub fn save(&self, path: &Path) -> anyhow::Result<()> {
44        let json = serde_json::to_string_pretty(self)?;
45        std::fs::write(path, &json)?;
46        Ok(())
47    }
48
49    /// Popularity score for a package, defaulting to 0 for unknowns.
50    pub fn score(&self, package_name: &str) -> u64 {
51        self.packages.get(package_name).copied().unwrap_or(0)
52    }
53
54    /// Record that all `package_names` appear together in one tool.
55    pub fn record_tool(&mut self, package_names: &[String]) {
56        for name in package_names {
57            *self.packages.entry(name.clone()).or_insert(0) += 1;
58        }
59    }
60}
61
62/// Compute a popularity map from all tool spec directories under `specs_root`.
63///
64/// Walks `specs_root/**/*.yaml`, parses each as a `BuildSpec`, and counts
65/// how many specs declare each package name. The resulting map is sorted
66/// deterministically (by name) inside `save()` via serde_json's map ordering.
67pub fn compute_from_spec_dir(specs_root: &Path) -> anyhow::Result<PopularityMap> {
68    let mut map = PopularityMap::new();
69
70    for entry in walkdir(specs_root)? {
71        let path = entry?;
72        if path.extension().and_then(|e| e.to_str()) != Some("yaml") {
73            continue;
74        }
75
76        let s = std::fs::read_to_string(&path)
77            .with_context(|| format!("read spec '{}'", path.display()))?;
78
79        // Parse only the `packages` field; skip unrelated YAML.
80        let raw: serde_yaml::Value = serde_yaml::from_str(&s)
81            .with_context(|| format!("parse spec '{}'", path.display()))?;
82
83        let names = extract_package_names(&raw);
84        map.record_tool(&names);
85    }
86
87    Ok(map)
88}
89
90/// Extract the bare package names from a parsed spec YAML value.
91fn extract_package_names(yaml: &serde_yaml::Value) -> Vec<String> {
92    let Some(pkgs) = yaml.get("packages").and_then(|v| v.as_sequence()) else {
93        return vec![];
94    };
95
96    pkgs.iter()
97        .filter_map(|v| v.as_str())
98        .map(|s| {
99            // Strip version constraint: "samtools ==1.19.2" -> "samtools"
100            s.split_whitespace().next().unwrap_or(s).to_string()
101        })
102        .collect()
103}
104
105fn walkdir(root: &Path) -> anyhow::Result<impl Iterator<Item = anyhow::Result<std::path::PathBuf>>> {
106    let entries = walkdir_inner(root);
107    Ok(entries.into_iter())
108}
109
110fn walkdir_inner(root: &Path) -> Vec<anyhow::Result<std::path::PathBuf>> {
111    let Ok(read) = std::fs::read_dir(root) else {
112        return vec![];
113    };
114    let mut results = vec![];
115    for entry in read {
116        let Ok(e) = entry else { continue };
117        let path = e.path();
118        if path.is_dir() {
119            results.extend(walkdir_inner(&path));
120        } else {
121            results.push(Ok(path));
122        }
123    }
124    results
125}
126
#[cfg(test)]
mod tests {
    use super::*;

    /// Two tools sharing one package: the shared package scores 2, the
    /// others 1, and a never-recorded name scores 0.
    #[test]
    fn record_tool_increments_counts() {
        let mut popularity = PopularityMap::new();
        let first_tool: [String; 2] = ["openssl".into(), "zlib".into()];
        let second_tool: [String; 2] = ["openssl".into(), "samtools".into()];
        popularity.record_tool(&first_tool);
        popularity.record_tool(&second_tool);

        let expectations: &[(&str, u64)] = &[
            ("openssl", 2),
            ("zlib", 1),
            ("samtools", 1),
            ("unknown", 0),
        ];
        for &(package, expected) in expectations {
            assert_eq!(popularity.score(package), expected);
        }
    }

    /// Version constraints after the name are dropped; bare names pass
    /// through unchanged and list order is preserved.
    #[test]
    fn extract_names_strips_version_constraints() {
        let source = "packages:\n  - samtools ==1.19.2\n  - openssl\n  - bwa >=0.7";
        let document: serde_yaml::Value = serde_yaml::from_str(source).unwrap();

        let extracted = extract_package_names(&document);

        let expected = ["samtools", "openssl", "bwa"];
        assert_eq!(extracted, expected);
    }

    /// A map written with `save` is read back by `load` with equal scores.
    #[test]
    fn save_and_load_round_trips() {
        let mut original = PopularityMap::new();
        let tool: [String; 2] = ["openssl".into(), "zlib".into()];
        original.record_tool(&tool);

        let scratch = tempfile::tempdir().unwrap();
        let json_path = scratch.path().join("popularity.json");
        original.save(&json_path).unwrap();

        let reloaded = PopularityMap::load(&json_path).unwrap();
        assert_eq!(reloaded.score("openssl"), 1);
        assert_eq!(reloaded.score("zlib"), 1);
    }
}