Skip to main content

bv_builder/
layering.rs

1use bv_core::lockfile::{CondaPackagePin, LayerDescriptor};
2
3use crate::catalog::LayerCatalog;
4use crate::popularity::PopularityMap;
5use crate::spec::ResolvedPackage;
6
/// How resolved packages are distributed across OCI layers.
///
/// Design intent: the most stable packages (lowest in the dependency graph)
/// sit at index 0 and the most volatile layer (the entrypoint) sits on top.
/// Docker pulls layers in manifest order, so putting stable layers first
/// minimises re-downloads across tool upgrades.
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub enum PackingStrategy {
    /// One layer per package. This is the default and suits small tool sets.
    #[default]
    OnePerPackage,
    /// Rank packages by popularity and cap the number of layers.
    ///
    /// Intended for use with a pre-computed `PopularityMap` produced by
    /// `bv-builder pack`. The most popular packages (by co-occurrence across
    /// all registry specs) each receive a solo layer, while the long tail is
    /// bundled together into a single layer.
    PopularityBased { max_layers: usize },
    /// Greedy packing driven by the `LayerCatalog`.
    ///
    /// A package with a `LayerCatalog` entry already has a matching blob on
    /// the registry from an earlier build. Such packages are ordered by
    /// catalog count (highest first) and are first in line for solo layers.
    /// Packages without an entry take whatever solo slots remain, and
    /// anything left over is placed in one long-tail layer.
    ///
    /// Preferred for `bv publish --spec`: no pre-computation is needed
    /// because the catalog grows incrementally and each publish greedily
    /// populates it.
    CatalogAware { max_layers: usize },
}
36
37/// A group of packages that will be combined into a single OCI layer.
38#[derive(Debug, Clone)]
39pub struct LayerGroup {
40    pub packages: Vec<ResolvedPackage>,
41}
42
43/// Group `packages` into layer groups according to `strategy`.
44///
45/// The caller is responsible for appending the meta layer and entrypoint layer
46/// after the returned groups.
47pub fn pack(
48    packages: &[ResolvedPackage],
49    strategy: &PackingStrategy,
50    popularity: Option<&PopularityMap>,
51    catalog: Option<&LayerCatalog>,
52) -> Vec<LayerGroup> {
53    match strategy {
54        PackingStrategy::OnePerPackage => packages
55            .iter()
56            .map(|p| LayerGroup {
57                packages: vec![p.clone()],
58            })
59            .collect(),
60
61        PackingStrategy::PopularityBased { max_layers } => {
62            pack_by_popularity(packages, *max_layers, popularity)
63        }
64
65        PackingStrategy::CatalogAware { max_layers } => {
66            pack_by_catalog(packages, *max_layers, catalog)
67        }
68    }
69}
70
71/// Sort `packages` by popularity score descending, then by name for
72/// determinism.  The `max_layers - 2` most popular packages each get their own
73/// layer; the remaining packages are packed into a single "long-tail" layer.
74/// The last two layer slots are reserved for the meta and entrypoint layers
75/// added by the caller.
76///
77/// **Stability invariant**: because scores are keyed by package *name* (not
78/// version+build), upgrading an existing popular package (e.g. `openssl`
79/// 3.2.1 → 3.3.0) preserves its high score and keeps it in a solo layer,
80/// just with a new digest.  Only the solo/long-tail boundary changes when the
81/// registry grows beyond `max_layers - 2` unique popular packages, which
82/// happens at most `O(1)` times per new tool added.
83fn pack_by_popularity(
84    packages: &[ResolvedPackage],
85    max_layers: usize,
86    popularity: Option<&PopularityMap>,
87) -> Vec<LayerGroup> {
88    if max_layers < 3 || packages.is_empty() {
89        return vec![LayerGroup {
90            packages: packages.to_vec(),
91        }];
92    }
93
94    // Sort by score desc, then name asc for determinism within ties.
95    let mut sorted = packages.to_vec();
96    sorted.sort_by(|a, b| {
97        let sa = popularity.map(|p| p.score(&a.name)).unwrap_or(0);
98        let sb = popularity.map(|p| p.score(&b.name)).unwrap_or(0);
99        sb.cmp(&sa).then(a.name.cmp(&b.name))
100    });
101
102    let solo_count = max_layers.saturating_sub(2).min(sorted.len());
103    let (solo, tail) = sorted.split_at(solo_count);
104
105    let mut groups: Vec<LayerGroup> = solo
106        .iter()
107        .map(|p| LayerGroup {
108            packages: vec![p.clone()],
109        })
110        .collect();
111
112    if !tail.is_empty() {
113        groups.push(LayerGroup {
114            packages: tail.to_vec(),
115        });
116    }
117    groups
118}
119
120/// Sort packages so catalog hits come first (by count desc, then name asc for
121/// determinism within ties). Cache misses follow in name order. Assign the
122/// first `max_layers - 2` packages their own layer; bundle the rest into a
123/// single long-tail layer. The last two slots are reserved for the meta and
124/// entrypoint layers the caller appends.
125///
126/// Packages already in the catalog already have a matching blob on the
127/// registry from a previous build. Giving them solo-layer priority maximises
128/// cross-image layer deduplication without requiring any pre-computed global
129/// popularity file.
130fn pack_by_catalog(
131    packages: &[ResolvedPackage],
132    max_layers: usize,
133    catalog: Option<&LayerCatalog>,
134) -> Vec<LayerGroup> {
135    if max_layers < 3 || packages.is_empty() {
136        return vec![LayerGroup {
137            packages: packages.to_vec(),
138        }];
139    }
140
141    let mut sorted = packages.to_vec();
142    sorted.sort_by(|a, b| {
143        let ca = catalog
144            .and_then(|c| c.get(&a.name, &a.version, &a.build))
145            .map(|e| e.count)
146            .unwrap_or(0);
147        let cb = catalog
148            .and_then(|c| c.get(&b.name, &b.version, &b.build))
149            .map(|e| e.count)
150            .unwrap_or(0);
151        // Higher count first; break ties by name for determinism.
152        cb.cmp(&ca).then(a.name.cmp(&b.name))
153    });
154
155    let solo_count = max_layers.saturating_sub(2).min(sorted.len());
156    let (solo, tail) = sorted.split_at(solo_count);
157
158    let mut groups: Vec<LayerGroup> = solo
159        .iter()
160        .map(|p| LayerGroup {
161            packages: vec![p.clone()],
162        })
163        .collect();
164
165    if !tail.is_empty() {
166        groups.push(LayerGroup {
167            packages: tail.to_vec(),
168        });
169    }
170    groups
171}
172
173/// Convert a `ResolvedPackage` into a `LayerDescriptor` placeholder.
174/// The actual `digest` and `size` are filled in by `build::build_layer` after
175/// the layer blob has been created.
176pub fn placeholder_descriptor(pkg: &ResolvedPackage) -> LayerDescriptor {
177    LayerDescriptor {
178        digest: String::new(),
179        size: 0,
180        media_type: "application/vnd.oci.image.layer.v1.tar+zstd".into(),
181        conda_package: Some(CondaPackagePin {
182            name: pkg.name.clone(),
183            version: pkg.version.clone(),
184            build: pkg.build.clone(),
185            channel: pkg.channel.clone(),
186            sha256: pkg.sha256.clone(),
187        }),
188    }
189}
190
/// Unit tests for the packing strategies. They pin group counts, group order
/// and the solo/long-tail split for each strategy.
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a `ResolvedPackage` called `name` with fixed version `1.0.0`,
    /// build `h0_0`, channel `conda-forge` and a dummy sha256.
    fn pkg(name: &str) -> ResolvedPackage {
        crate::spec::ResolvedPackage {
            name: name.into(),
            version: "1.0.0".into(),
            build: "h0_0".into(),
            channel: "conda-forge".into(),
            url: format!("https://example.com/{name}.conda"),
            sha256: "abc".into(),
            filename: format!("{name}-1.0.0-h0_0.conda"),
            depends: vec![],
        }
    }

    /// `OnePerPackage` yields one group per package, in input order.
    #[test]
    fn one_per_package_gives_n_groups() {
        let pkgs = vec![pkg("openssl"), pkg("zlib"), pkg("samtools")];
        let groups = pack(&pkgs, &PackingStrategy::OnePerPackage, None, None);
        assert_eq!(groups.len(), 3);
        assert_eq!(groups[0].packages[0].name, "openssl");
    }

    /// With no popularity map every score is 0, so ranking falls back to name
    /// order; the split into solo layers and long tail is still applied.
    #[test]
    fn popularity_packing_respects_max_layers() {
        let pkgs: Vec<_> = (0..10).map(|i| pkg(&format!("pkg{i:02}"))).collect();
        let groups = pack(
            &pkgs,
            &PackingStrategy::PopularityBased { max_layers: 5 },
            None,
            None,
        );
        // 3 solo layers (5 minus the 2 slots reserved for meta+entrypoint)
        // + 1 long-tail group holding the remainder.
        assert_eq!(groups.len(), 4);
        assert_eq!(groups.last().unwrap().packages.len(), 7); // 10 - 3
    }

    /// A single package with a generous budget gets exactly one solo group
    /// and no long-tail group.
    #[test]
    fn popularity_packing_degenerate_small_input() {
        let pkgs = vec![pkg("samtools")];
        let groups = pack(
            &pkgs,
            &PackingStrategy::PopularityBased { max_layers: 64 },
            None,
            None,
        );
        assert_eq!(groups.len(), 1);
        assert_eq!(groups[0].packages[0].name, "samtools");
    }

    /// Groups come out ordered by popularity score, highest first.
    #[test]
    fn popular_packages_placed_before_rare_ones() {
        let mut pop = PopularityMap::new();
        // openssl appears in 10 tools, zlib in 3, rare in 1
        for _ in 0..10 {
            pop.record_tool(&["openssl".into()]);
        }
        for _ in 0..3 {
            pop.record_tool(&["zlib".into()]);
        }
        pop.record_tool(&["rare".into()]);

        let pkgs = vec![pkg("rare"), pkg("zlib"), pkg("openssl")];
        let groups = pack(
            &pkgs,
            &PackingStrategy::PopularityBased { max_layers: 64 },
            Some(&pop),
            None,
        );

        // All three fit in solo layers (64 - 2 = 62 solo slots).
        assert_eq!(groups.len(), 3);
        assert_eq!(groups[0].packages[0].name, "openssl");
        assert_eq!(groups[1].packages[0].name, "zlib");
        assert_eq!(groups[2].packages[0].name, "rare");
    }

    /// Packages that do not make the solo cut are bundled into the final
    /// group.
    #[test]
    fn rare_packages_land_in_long_tail() {
        let mut pop = PopularityMap::new();
        pop.record_tool(&["openssl".into(), "zlib".into()]);
        pop.record_tool(&["openssl".into(), "bz2".into()]);

        // 3 solo slots: max_layers=5, 5-2=3 solo, 1 long-tail
        let pkgs = vec![
            pkg("openssl"),
            pkg("zlib"),
            pkg("bz2"),
            pkg("rare1"),
            pkg("rare2"),
        ];
        let groups = pack(
            &pkgs,
            &PackingStrategy::PopularityBased { max_layers: 5 },
            Some(&pop),
            None,
        );

        // Exactly 4 groups: openssl solo, zlib solo, bz2 solo, long-tail (rare1+rare2).
        assert_eq!(groups.len(), 4);
        assert_eq!(groups[0].packages[0].name, "openssl");
        // rare packages are in the last group
        let tail = groups.last().unwrap();
        assert_eq!(tail.packages.len(), 2);
    }

    /// Equal scores are tie-broken by name, so repeated runs agree and the
    /// input order does not leak into the output.
    #[test]
    fn packing_is_deterministic_for_same_scores() {
        let mut pop = PopularityMap::new();
        pop.record_tool(&["aa".into(), "bb".into(), "cc".into()]);

        let pkgs = vec![pkg("cc"), pkg("aa"), pkg("bb")];
        let groups1 = pack(
            &pkgs,
            &PackingStrategy::PopularityBased { max_layers: 64 },
            Some(&pop),
            None,
        );
        let groups2 = pack(
            &pkgs,
            &PackingStrategy::PopularityBased { max_layers: 64 },
            Some(&pop),
            None,
        );

        let names1: Vec<_> = groups1
            .iter()
            .map(|g| g.packages[0].name.as_str())
            .collect();
        let names2: Vec<_> = groups2
            .iter()
            .map(|g| g.packages[0].name.as_str())
            .collect();
        assert_eq!(names1, names2, "packing must be deterministic");
        // Tie-broken by name: aa < bb < cc
        assert_eq!(names1, vec!["aa", "bb", "cc"]);
    }

    /// M5.4: Synthesize 100 fake tool specs with overlapping deps.
    /// Assert that for any two specs sharing a popular package, that package
    /// lands in a solo LayerGroup in both specs, guaranteeing identical
    /// layer digests when the same package+version+build is built reproducibly.
    #[test]
    fn shared_popular_packages_get_solo_layers_across_tools() {
        const NUM_TOOLS: usize = 100;
        const MAX_LAYERS: usize = 64;
        const SHARED_PKGS: &[&str] = &[
            "openssl",
            "zlib",
            "libgcc",
            "libstdcxx",
            "ncurses",
            "xz",
            "bzip2",
        ];
        const UNIQUE_SUFFIX: &str = "tool-specific-pkg";

        // Build a fake registry: each tool uses all shared packages + one unique package.
        let all_tool_packages: Vec<Vec<String>> = (0..NUM_TOOLS)
            .map(|i| {
                let mut pkgs: Vec<String> = SHARED_PKGS.iter().map(|s| s.to_string()).collect();
                pkgs.push(format!("{UNIQUE_SUFFIX}-{i}"));
                pkgs
            })
            .collect();

        // Compute popularity from all tools.
        let mut pop = PopularityMap::new();
        for tool_pkgs in &all_tool_packages {
            pop.record_tool(tool_pkgs);
        }

        // Pack three representative tools (first, middle, last) and assert
        // shared packages get solo layers in each.
        for tool_idx in [0usize, 42, 99] {
            let pkgs: Vec<_> = all_tool_packages[tool_idx]
                .iter()
                .map(|name| crate::spec::ResolvedPackage {
                    name: name.clone(),
                    version: "1.0.0".into(),
                    build: "h0_0".into(),
                    channel: "conda-forge".into(),
                    url: format!("https://example.com/{name}.conda"),
                    sha256: format!("sha256-{name}"),
                    filename: format!("{name}-1.0.0-h0_0.conda"),
                    depends: vec![],
                })
                .collect();

            let groups = pack(
                &pkgs,
                &PackingStrategy::PopularityBased {
                    max_layers: MAX_LAYERS,
                },
                Some(&pop),
                None,
            );

            // Every shared package must appear in a solo group (one package per group).
            for shared in SHARED_PKGS {
                let solo = groups
                    .iter()
                    .any(|g| g.packages.len() == 1 && g.packages[0].name == *shared);
                assert!(
                    solo,
                    "shared package '{}' must get its own layer in tool-{tool_idx}",
                    shared
                );
            }
        }
    }

    /// Same shared package in two different tools must produce the same
    /// LayerGroup structure (same single package), confirming digest identity.
    #[test]
    fn shared_package_has_same_solo_group_across_tools() {
        let mut pop = PopularityMap::new();
        pop.record_tool(&["openssl".into(), "samtools".into()]);
        pop.record_tool(&["openssl".into(), "bwa".into()]);

        let samtools_pkgs = vec![pkg("openssl"), pkg("samtools")];
        let bwa_pkgs = vec![pkg("openssl"), pkg("bwa")];

        let groups_s = pack(
            &samtools_pkgs,
            &PackingStrategy::PopularityBased { max_layers: 64 },
            Some(&pop),
            None,
        );
        let groups_b = pack(
            &bwa_pkgs,
            &PackingStrategy::PopularityBased { max_layers: 64 },
            Some(&pop),
            None,
        );

        // openssl is the first group in both (highest score = 2).
        assert_eq!(groups_s[0].packages[0].name, "openssl");
        assert_eq!(groups_b[0].packages[0].name, "openssl");

        // Both openssl groups contain exactly one package with the same identity.
        // A deterministic build on those groups would yield identical layer digests.
        assert_eq!(
            groups_s[0].packages[0].sha256,
            groups_b[0].packages[0].sha256,
        );
    }

    /// Like `pkg`, but with caller-chosen `version` and `build` so the
    /// package can be matched against catalog entries recorded under the
    /// same (name, version, build) triple.
    fn pkg_versioned(name: &str, version: &str, build: &str) -> ResolvedPackage {
        ResolvedPackage {
            name: name.into(),
            version: version.into(),
            build: build.into(),
            channel: "conda-forge".into(),
            url: format!("https://example.com/{name}.conda"),
            sha256: "abc".into(),
            filename: format!("{name}-{version}-{build}.conda"),
            depends: vec![],
        }
    }

    /// Catalog hits are ordered by count descending; a package absent from
    /// the catalog ranks last.
    #[test]
    fn catalog_aware_prioritizes_known_packages() {
        let mut cat = LayerCatalog::new();
        // openssl seen in 2 builds, zlib in 1, "rare" not in catalog
        cat.record("openssl", "1.0.0", "h0_0", "sha256:aaa");
        cat.record("openssl", "1.0.0", "h0_0", "sha256:aaa");
        cat.record("zlib", "1.0.0", "h0_0", "sha256:bbb");

        let pkgs = vec![
            pkg_versioned("rare", "1.0.0", "h0_0"),
            pkg_versioned("zlib", "1.0.0", "h0_0"),
            pkg_versioned("openssl", "1.0.0", "h0_0"),
        ];
        let groups = pack(
            &pkgs,
            &PackingStrategy::CatalogAware { max_layers: 64 },
            None,
            Some(&cat),
        );

        // All three fit in solo layers (64 - 2 = 62 slots).
        assert_eq!(groups.len(), 3);
        assert_eq!(groups[0].packages[0].name, "openssl"); // count=2
        assert_eq!(groups[1].packages[0].name, "zlib"); // count=1
        assert_eq!(groups[2].packages[0].name, "rare"); // count=0
    }

    /// When solo slots run out, catalog hits keep their solo layers and the
    /// cache misses are bundled into the long-tail group.
    #[test]
    fn catalog_aware_pushes_unknown_to_long_tail_when_budget_tight() {
        let mut cat = LayerCatalog::new();
        cat.record("openssl", "1.0.0", "h0_0", "sha256:aaa");
        cat.record("zlib", "1.0.0", "h0_0", "sha256:bbb");
        cat.record("libgcc", "1.0.0", "h0_0", "sha256:ccc");

        // 5 packages, max_layers=5 → 3 solo slots, 2 unknowns go to long-tail
        let pkgs = vec![
            pkg_versioned("rare1", "1.0.0", "h0_0"),
            pkg_versioned("rare2", "1.0.0", "h0_0"),
            pkg_versioned("openssl", "1.0.0", "h0_0"),
            pkg_versioned("zlib", "1.0.0", "h0_0"),
            pkg_versioned("libgcc", "1.0.0", "h0_0"),
        ];
        let groups = pack(
            &pkgs,
            &PackingStrategy::CatalogAware { max_layers: 5 },
            None,
            Some(&cat),
        );

        // 3 solo (catalog hits) + 1 long-tail (2 cache misses)
        assert_eq!(groups.len(), 4);
        let solo_names: Vec<_> = groups[..3]
            .iter()
            .map(|g| g.packages[0].name.as_str())
            .collect();
        assert!(solo_names.contains(&"openssl"));
        assert!(solo_names.contains(&"zlib"));
        assert!(solo_names.contains(&"libgcc"));
        let tail = groups.last().unwrap();
        assert_eq!(tail.packages.len(), 2);
        let tail_names: Vec<_> = tail.packages.iter().map(|p| p.name.as_str()).collect();
        assert!(tail_names.contains(&"rare1"));
        assert!(tail_names.contains(&"rare2"));
    }
}