Skip to main content

bv_builder/
layering.rs

1use bv_core::lockfile::{CondaPackagePin, LayerDescriptor};
2
3use crate::popularity::PopularityMap;
4use crate::spec::ResolvedPackage;
5
/// Strategy for grouping packages into OCI layers.
///
/// Layer order: most-stable (lowest in dependency graph) at index 0,
/// most-volatile (entrypoint) at the top. Docker pulls layers in manifest
/// order so stable-first minimises re-downloads across tool upgrades.
///
/// The default strategy is [`PackingStrategy::OnePerPackage`]; it is derived
/// through the `#[default]` variant marker rather than a hand-written
/// `impl Default`.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub enum PackingStrategy {
    /// Each package gets its own layer (default for small tool sets).
    #[default]
    OnePerPackage,
    /// Popularity-based packing when `max_layers` is exceeded.
    ///
    /// `max_layers` is the layer budget handed to the packer.
    PopularityBased { max_layers: usize },
}
24
25/// A group of packages that will be combined into a single OCI layer.
26#[derive(Debug, Clone)]
27pub struct LayerGroup {
28    pub packages: Vec<ResolvedPackage>,
29}
30
31/// Group `packages` into layer groups according to `strategy`.
32///
33/// When `popularity` is provided and `strategy` is `PopularityBased`, packages
34/// are sorted by their co-occurrence score (descending) before splitting into
35/// solo vs. long-tail groups. Without scores the sort falls back to package
36/// name for determinism, which is correct but not optimal.
37///
38/// The caller is responsible for appending the meta layer and entrypoint layer
39/// after the returned groups.
40pub fn pack(
41    packages: &[ResolvedPackage],
42    strategy: &PackingStrategy,
43    popularity: Option<&PopularityMap>,
44) -> Vec<LayerGroup> {
45    match strategy {
46        PackingStrategy::OnePerPackage => packages
47            .iter()
48            .map(|p| LayerGroup {
49                packages: vec![p.clone()],
50            })
51            .collect(),
52
53        PackingStrategy::PopularityBased { max_layers } => {
54            pack_by_popularity(packages, *max_layers, popularity)
55        }
56    }
57}
58
59/// Sort `packages` by popularity score descending, then by name for
60/// determinism.  The `max_layers - 2` most popular packages each get their own
61/// layer; the remaining packages are packed into a single "long-tail" layer.
62/// The last two layer slots are reserved for the meta and entrypoint layers
63/// added by the caller.
64///
65/// **Stability invariant**: because scores are keyed by package *name* (not
66/// version+build), upgrading an existing popular package (e.g. `openssl`
67/// 3.2.1 → 3.3.0) preserves its high score and keeps it in a solo layer,
68/// just with a new digest.  Only the solo/long-tail boundary changes when the
69/// registry grows beyond `max_layers - 2` unique popular packages, which
70/// happens at most `O(1)` times per new tool added.
71fn pack_by_popularity(
72    packages: &[ResolvedPackage],
73    max_layers: usize,
74    popularity: Option<&PopularityMap>,
75) -> Vec<LayerGroup> {
76    if max_layers < 3 || packages.is_empty() {
77        return vec![LayerGroup {
78            packages: packages.to_vec(),
79        }];
80    }
81
82    // Sort by score desc, then name asc for determinism within ties.
83    let mut sorted = packages.to_vec();
84    sorted.sort_by(|a, b| {
85        let sa = popularity.map(|p| p.score(&a.name)).unwrap_or(0);
86        let sb = popularity.map(|p| p.score(&b.name)).unwrap_or(0);
87        sb.cmp(&sa).then(a.name.cmp(&b.name))
88    });
89
90    let solo_count = max_layers.saturating_sub(2).min(sorted.len());
91    let (solo, tail) = sorted.split_at(solo_count);
92
93    let mut groups: Vec<LayerGroup> = solo
94        .iter()
95        .map(|p| LayerGroup {
96            packages: vec![p.clone()],
97        })
98        .collect();
99
100    if !tail.is_empty() {
101        groups.push(LayerGroup {
102            packages: tail.to_vec(),
103        });
104    }
105    groups
106}
107
108/// Convert a `ResolvedPackage` into a `LayerDescriptor` placeholder.
109/// The actual `digest` and `size` are filled in by `build::build_layer` after
110/// the layer blob has been created.
111pub fn placeholder_descriptor(pkg: &ResolvedPackage) -> LayerDescriptor {
112    LayerDescriptor {
113        digest: String::new(),
114        size: 0,
115        media_type: "application/vnd.oci.image.layer.v1.tar+zstd".into(),
116        conda_package: Some(CondaPackagePin {
117            name: pkg.name.clone(),
118            version: pkg.version.clone(),
119            build: pkg.build.clone(),
120            channel: pkg.channel.clone(),
121            sha256: pkg.sha256.clone(),
122        }),
123    }
124}
125
#[cfg(test)]
mod tests {
    use super::*;

    /// Fixture: conda-forge package `name` at version 1.0.0 / build `h0_0`
    /// with an explicit `sha256`.
    fn pkg_with_sha(name: &str, sha256: String) -> ResolvedPackage {
        ResolvedPackage {
            name: name.into(),
            version: "1.0.0".into(),
            build: "h0_0".into(),
            channel: "conda-forge".into(),
            url: format!("https://example.com/{name}.conda"),
            sha256,
            filename: format!("{name}-1.0.0-h0_0.conda"),
        }
    }

    /// Fixture: like `pkg_with_sha`, using the fixed checksum `"abc"`.
    fn pkg(name: &str) -> ResolvedPackage {
        pkg_with_sha(name, "abc".into())
    }

    /// Shorthand for the popularity-based strategy with the given budget.
    fn popularity_based(max_layers: usize) -> PackingStrategy {
        PackingStrategy::PopularityBased { max_layers }
    }

    /// Name of the first package of every group, in group order.
    fn lead_names(groups: &[LayerGroup]) -> Vec<&str> {
        groups.iter().map(|g| g.packages[0].name.as_str()).collect()
    }

    #[test]
    fn one_per_package_gives_n_groups() {
        let pkgs = [pkg("openssl"), pkg("zlib"), pkg("samtools")];

        let groups = pack(&pkgs, &PackingStrategy::OnePerPackage, None);

        assert_eq!(groups.len(), 3);
        assert_eq!(groups[0].packages[0].name, "openssl");
    }

    #[test]
    fn popularity_packing_respects_max_layers() {
        let names: Vec<String> = (0..10).map(|i| format!("pkg{i:02}")).collect();
        let pkgs: Vec<_> = names.iter().map(|n| pkg(n)).collect();

        let groups = pack(&pkgs, &popularity_based(5), None);

        // 3 solo groups + 1 long-tail group; the meta and entrypoint layers
        // are appended afterwards by the caller.
        assert_eq!(groups.len(), 4);
        assert_eq!(groups.last().unwrap().packages.len(), 7); // 10 - 3
    }

    #[test]
    fn popularity_packing_degenerate_small_input() {
        let pkgs = [pkg("samtools")];

        let groups = pack(&pkgs, &popularity_based(64), None);

        assert_eq!(groups.len(), 1);
        assert_eq!(groups[0].packages[0].name, "samtools");
    }

    #[test]
    fn popular_packages_placed_before_rare_ones() {
        // openssl appears in 10 tools, zlib in 3, rare in 1.
        let mut pop = PopularityMap::new();
        for (name, tools) in [("openssl", 10), ("zlib", 3), ("rare", 1)] {
            for _ in 0..tools {
                pop.record_tool(&[name.into()]);
            }
        }

        let pkgs = [pkg("rare"), pkg("zlib"), pkg("openssl")];
        let groups = pack(&pkgs, &popularity_based(64), Some(&pop));

        // All three fit in solo layers (64 - 2 = 62 solo slots).
        assert_eq!(groups.len(), 3);
        assert_eq!(groups[0].packages[0].name, "openssl");
        assert_eq!(groups[1].packages[0].name, "zlib");
        assert_eq!(groups[2].packages[0].name, "rare");
    }

    #[test]
    fn rare_packages_land_in_long_tail() {
        let mut pop = PopularityMap::new();
        pop.record_tool(&["openssl".into(), "zlib".into()]);
        pop.record_tool(&["openssl".into(), "bz2".into()]);

        // max_layers = 5 leaves 5 - 2 = 3 solo slots, followed by one long-tail group.
        let pkgs = [
            pkg("openssl"),
            pkg("zlib"),
            pkg("bz2"),
            pkg("rare1"),
            pkg("rare2"),
        ];
        let groups = pack(&pkgs, &popularity_based(5), Some(&pop));

        // Exactly 4 groups: three solo groups (openssl first), then the
        // long-tail group holding rare1 + rare2.
        assert_eq!(groups.len(), 4);
        assert_eq!(groups[0].packages[0].name, "openssl");
        // The rare packages are in the last group.
        let tail = groups.last().unwrap();
        assert_eq!(tail.packages.len(), 2);
    }

    #[test]
    fn packing_is_deterministic_for_same_scores() {
        let mut pop = PopularityMap::new();
        pop.record_tool(&["aa".into(), "bb".into(), "cc".into()]);

        let pkgs = [pkg("cc"), pkg("aa"), pkg("bb")];
        let run = || pack(&pkgs, &popularity_based(64), Some(&pop));
        let (groups1, groups2) = (run(), run());

        let names1 = lead_names(&groups1);
        let names2 = lead_names(&groups2);
        assert_eq!(names1, names2, "packing must be deterministic");
        // Tie-broken by name: aa < bb < cc
        assert_eq!(names1, vec!["aa", "bb", "cc"]);
    }

    /// M5.4: Synthesize 100 fake tool specs with overlapping deps.
    /// Assert that for any two specs sharing a popular package, that package
    /// lands in a solo LayerGroup in both specs — guaranteeing identical
    /// layer digests when the same package+version+build is built reproducibly.
    #[test]
    fn shared_popular_packages_get_solo_layers_across_tools() {
        const NUM_TOOLS: usize = 100;
        const MAX_LAYERS: usize = 64;
        const SHARED_PKGS: &[&str] = &[
            "openssl", "zlib", "libgcc", "libstdcxx", "ncurses", "xz", "bzip2",
        ];
        const UNIQUE_SUFFIX: &str = "tool-specific-pkg";

        // Fake registry: every tool uses all shared packages plus one unique package.
        let registry: Vec<Vec<String>> = (0..NUM_TOOLS)
            .map(|i| {
                SHARED_PKGS
                    .iter()
                    .map(|s| s.to_string())
                    .chain(std::iter::once(format!("{UNIQUE_SUFFIX}-{i}")))
                    .collect()
            })
            .collect();

        // Popularity is computed over the whole registry.
        let mut pop = PopularityMap::new();
        for tool_pkgs in &registry {
            pop.record_tool(tool_pkgs);
        }

        // Pack three representative tools (first, middle, last) and check
        // that every shared package ends up in a solo layer.
        for tool_idx in [0usize, 42, 99] {
            let pkgs: Vec<_> = registry[tool_idx]
                .iter()
                .map(|name| pkg_with_sha(name, format!("sha256-{name}")))
                .collect();

            let groups = pack(&pkgs, &popularity_based(MAX_LAYERS), Some(&pop));

            // Every shared package must appear in a solo group (one package per group).
            for shared in SHARED_PKGS {
                let solo = groups
                    .iter()
                    .filter(|g| g.packages.len() == 1)
                    .any(|g| g.packages[0].name == *shared);
                assert!(
                    solo,
                    "shared package '{}' must get its own layer in tool-{tool_idx}",
                    shared
                );
            }
        }
    }

    /// Same shared package in two different tools must produce the same
    /// LayerGroup structure (same single package), confirming digest identity.
    #[test]
    fn shared_package_has_same_solo_group_across_tools() {
        let mut pop = PopularityMap::new();
        pop.record_tool(&["openssl".into(), "samtools".into()]);
        pop.record_tool(&["openssl".into(), "bwa".into()]);

        // Each tool consists of openssl plus its own tool-specific package.
        let pack_tool =
            |tool: &str| pack(&[pkg("openssl"), pkg(tool)], &popularity_based(64), Some(&pop));
        let groups_s = pack_tool("samtools");
        let groups_b = pack_tool("bwa");

        // openssl is the first group in both (highest score = 2).
        assert_eq!(groups_s[0].packages[0].name, "openssl");
        assert_eq!(groups_b[0].packages[0].name, "openssl");

        // Both openssl groups lead with a package of the same identity.
        // A deterministic build on those groups would yield identical layer digests.
        assert_eq!(
            groups_s[0].packages[0].sha256,
            groups_b[0].packages[0].sha256,
        );
    }
}