Skip to main content

bv_builder/
layering.rs

1use bv_core::lockfile::{CondaPackagePin, LayerDescriptor};
2
3use crate::popularity::PopularityMap;
4use crate::spec::ResolvedPackage;
5
/// How resolved packages are distributed across OCI layers.
///
/// Groups are ordered from most stable (index 0, lowest in the dependency
/// graph) to most volatile (the entrypoint, on top). Docker pulls layers in
/// manifest order, so putting stable content first minimises re-downloads
/// when a tool is upgraded.
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub enum PackingStrategy {
    /// Give every package a layer of its own. This is the `Default` variant
    /// and is intended for small tool sets.
    #[default]
    OnePerPackage,
    /// Rank packages by popularity and pack them against a budget of
    /// `max_layers` layers; used when one-layer-per-package would exceed it.
    PopularityBased { max_layers: usize },
}
19
20/// A group of packages that will be combined into a single OCI layer.
21#[derive(Debug, Clone)]
22pub struct LayerGroup {
23    pub packages: Vec<ResolvedPackage>,
24}
25
26/// Group `packages` into layer groups according to `strategy`.
27///
28/// When `popularity` is provided and `strategy` is `PopularityBased`, packages
29/// are sorted by their co-occurrence score (descending) before splitting into
30/// solo vs. long-tail groups. Without scores the sort falls back to package
31/// name for determinism, which is correct but not optimal.
32///
33/// The caller is responsible for appending the meta layer and entrypoint layer
34/// after the returned groups.
35pub fn pack(
36    packages: &[ResolvedPackage],
37    strategy: &PackingStrategy,
38    popularity: Option<&PopularityMap>,
39) -> Vec<LayerGroup> {
40    match strategy {
41        PackingStrategy::OnePerPackage => packages
42            .iter()
43            .map(|p| LayerGroup {
44                packages: vec![p.clone()],
45            })
46            .collect(),
47
48        PackingStrategy::PopularityBased { max_layers } => {
49            pack_by_popularity(packages, *max_layers, popularity)
50        }
51    }
52}
53
54/// Sort `packages` by popularity score descending, then by name for
55/// determinism.  The `max_layers - 2` most popular packages each get their own
56/// layer; the remaining packages are packed into a single "long-tail" layer.
57/// The last two layer slots are reserved for the meta and entrypoint layers
58/// added by the caller.
59///
60/// **Stability invariant**: because scores are keyed by package *name* (not
61/// version+build), upgrading an existing popular package (e.g. `openssl`
62/// 3.2.1 → 3.3.0) preserves its high score and keeps it in a solo layer,
63/// just with a new digest.  Only the solo/long-tail boundary changes when the
64/// registry grows beyond `max_layers - 2` unique popular packages, which
65/// happens at most `O(1)` times per new tool added.
66fn pack_by_popularity(
67    packages: &[ResolvedPackage],
68    max_layers: usize,
69    popularity: Option<&PopularityMap>,
70) -> Vec<LayerGroup> {
71    if max_layers < 3 || packages.is_empty() {
72        return vec![LayerGroup {
73            packages: packages.to_vec(),
74        }];
75    }
76
77    // Sort by score desc, then name asc for determinism within ties.
78    let mut sorted = packages.to_vec();
79    sorted.sort_by(|a, b| {
80        let sa = popularity.map(|p| p.score(&a.name)).unwrap_or(0);
81        let sb = popularity.map(|p| p.score(&b.name)).unwrap_or(0);
82        sb.cmp(&sa).then(a.name.cmp(&b.name))
83    });
84
85    let solo_count = max_layers.saturating_sub(2).min(sorted.len());
86    let (solo, tail) = sorted.split_at(solo_count);
87
88    let mut groups: Vec<LayerGroup> = solo
89        .iter()
90        .map(|p| LayerGroup {
91            packages: vec![p.clone()],
92        })
93        .collect();
94
95    if !tail.is_empty() {
96        groups.push(LayerGroup {
97            packages: tail.to_vec(),
98        });
99    }
100    groups
101}
102
103/// Convert a `ResolvedPackage` into a `LayerDescriptor` placeholder.
104/// The actual `digest` and `size` are filled in by `build::build_layer` after
105/// the layer blob has been created.
106pub fn placeholder_descriptor(pkg: &ResolvedPackage) -> LayerDescriptor {
107    LayerDescriptor {
108        digest: String::new(),
109        size: 0,
110        media_type: "application/vnd.oci.image.layer.v1.tar+zstd".into(),
111        conda_package: Some(CondaPackagePin {
112            name: pkg.name.clone(),
113            version: pkg.version.clone(),
114            build: pkg.build.clone(),
115            channel: pkg.channel.clone(),
116            sha256: pkg.sha256.clone(),
117        }),
118    }
119}
120
#[cfg(test)]
mod tests {
    use super::*;

    /// Fixture package with an explicit `sha256`; every other field is fixed
    /// or derived from `name`.
    fn pkg_with_sha256(name: &str, sha256: String) -> ResolvedPackage {
        ResolvedPackage {
            name: name.into(),
            version: "1.0.0".into(),
            build: "h0_0".into(),
            channel: "conda-forge".into(),
            url: format!("https://example.com/{name}.conda"),
            sha256,
            filename: format!("{name}-1.0.0-h0_0.conda"),
            depends: vec![],
        }
    }

    /// Fixture package with the default `"abc"` checksum.
    fn pkg(name: &str) -> ResolvedPackage {
        pkg_with_sha256(name, "abc".into())
    }

    /// Shorthand for the popularity-based strategy.
    fn popularity_based(max_layers: usize) -> PackingStrategy {
        PackingStrategy::PopularityBased { max_layers }
    }

    /// Register one tool that uses exactly the packages in `names`.
    fn record(pop: &mut PopularityMap, names: &[&str]) {
        let owned: Vec<String> = names.iter().map(|n| n.to_string()).collect();
        pop.record_tool(&owned);
    }

    /// Name of the first package of every group, in group order.
    fn leading_names(groups: &[LayerGroup]) -> Vec<&str> {
        groups
            .iter()
            .map(|g| g.packages[0].name.as_str())
            .collect()
    }

    #[test]
    fn one_per_package_gives_n_groups() {
        let input = vec![pkg("openssl"), pkg("zlib"), pkg("samtools")];

        let groups = pack(&input, &PackingStrategy::OnePerPackage, None);

        assert_eq!(groups.len(), 3);
        // Input order is preserved.
        assert_eq!(groups[0].packages[0].name, "openssl");
    }

    #[test]
    fn popularity_packing_respects_max_layers() {
        let mut input = Vec::new();
        for i in 0..10 {
            input.push(pkg(&format!("pkg{i:02}")));
        }

        let groups = pack(&input, &popularity_based(5), None);

        // max_layers = 5 leaves 3 solo slots (two are reserved for the meta
        // and entrypoint layers); the other 7 packages share the long tail.
        assert_eq!(groups.len(), 4);
        let long_tail = groups.last().unwrap();
        assert_eq!(long_tail.packages.len(), 7); // 10 - 3
    }

    #[test]
    fn popularity_packing_degenerate_small_input() {
        let input = vec![pkg("samtools")];

        let groups = pack(&input, &popularity_based(64), None);

        assert_eq!(groups.len(), 1);
        assert_eq!(groups[0].packages[0].name, "samtools");
    }

    #[test]
    fn popular_packages_placed_before_rare_ones() {
        // openssl is used by 10 tools, zlib by 3, rare by 1.
        let mut pop = PopularityMap::new();
        (0..10).for_each(|_| record(&mut pop, &["openssl"]));
        (0..3).for_each(|_| record(&mut pop, &["zlib"]));
        record(&mut pop, &["rare"]);

        let input = vec![pkg("rare"), pkg("zlib"), pkg("openssl")];
        let groups = pack(&input, &popularity_based(64), Some(&pop));

        // 64 - 2 = 62 solo slots, so each of the three gets its own group,
        // ordered by descending score.
        assert_eq!(groups.len(), 3);
        assert_eq!(groups[0].packages[0].name, "openssl");
        assert_eq!(groups[1].packages[0].name, "zlib");
        assert_eq!(groups[2].packages[0].name, "rare");
    }

    #[test]
    fn rare_packages_land_in_long_tail() {
        let mut pop = PopularityMap::new();
        record(&mut pop, &["openssl", "zlib"]);
        record(&mut pop, &["openssl", "bz2"]);

        // max_layers = 5 -> 3 solo slots followed by one long-tail group.
        let input: Vec<_> = ["openssl", "zlib", "bz2", "rare1", "rare2"]
            .iter()
            .map(|name| pkg(name))
            .collect();
        let groups = pack(&input, &popularity_based(5), Some(&pop));

        // openssl, zlib and bz2 are solo; rare1 + rare2 share the last group.
        assert_eq!(groups.len(), 4);
        assert_eq!(groups[0].packages[0].name, "openssl");
        let long_tail = groups.last().unwrap();
        assert_eq!(long_tail.packages.len(), 2);
    }

    #[test]
    fn packing_is_deterministic_for_same_scores() {
        let mut pop = PopularityMap::new();
        record(&mut pop, &["aa", "bb", "cc"]);

        let input = vec![pkg("cc"), pkg("aa"), pkg("bb")];
        let strategy = popularity_based(64);
        let first_run = pack(&input, &strategy, Some(&pop));
        let second_run = pack(&input, &strategy, Some(&pop));

        let names1 = leading_names(&first_run);
        let names2 = leading_names(&second_run);
        assert_eq!(names1, names2, "packing must be deterministic");
        // Equal scores fall back to name order: aa < bb < cc.
        assert_eq!(names1, vec!["aa", "bb", "cc"]);
    }

    /// M5.4: synthesize 100 fake tool specs with overlapping dependencies.
    /// For specs that share a popular package, that package must land in a
    /// solo `LayerGroup` in each of them, which is what allows identical
    /// layer digests when the same package+version+build is built
    /// reproducibly.
    #[test]
    fn shared_popular_packages_get_solo_layers_across_tools() {
        const NUM_TOOLS: usize = 100;
        const MAX_LAYERS: usize = 64;
        const SHARED_PKGS: &[&str] = &[
            "openssl",
            "zlib",
            "libgcc",
            "libstdcxx",
            "ncurses",
            "xz",
            "bzip2",
        ];
        const UNIQUE_SUFFIX: &str = "tool-specific-pkg";

        // Fake registry: tool `i` depends on every shared package plus one
        // package that only it uses.
        let package_names_of = |tool_idx: usize| -> Vec<String> {
            SHARED_PKGS
                .iter()
                .map(|s| s.to_string())
                .chain(std::iter::once(format!("{UNIQUE_SUFFIX}-{tool_idx}")))
                .collect()
        };

        // Popularity is computed over the whole registry.
        let mut pop = PopularityMap::new();
        for tool_idx in 0..NUM_TOOLS {
            pop.record_tool(&package_names_of(tool_idx));
        }

        // Pack three representative tools and check the shared packages.
        for tool_idx in [0usize, 42, 99] {
            let input: Vec<_> = package_names_of(tool_idx)
                .iter()
                .map(|name| pkg_with_sha256(name, format!("sha256-{name}")))
                .collect();

            let groups = pack(&input, &popularity_based(MAX_LAYERS), Some(&pop));

            // Each shared package must be the sole member of some group.
            for shared in SHARED_PKGS {
                let solo = groups.iter().any(|g| {
                    matches!(g.packages.as_slice(), [only] if only.name == *shared)
                });
                assert!(
                    solo,
                    "shared package '{}' must get its own layer in tool-{tool_idx}",
                    shared
                );
            }
        }
    }

    /// A package shared by two different tools must come out as the same
    /// single-package group in both, confirming digest identity.
    #[test]
    fn shared_package_has_same_solo_group_across_tools() {
        let mut pop = PopularityMap::new();
        record(&mut pop, &["openssl", "samtools"]);
        record(&mut pop, &["openssl", "bwa"]);

        let strategy = popularity_based(64);
        let groups_s = pack(
            &[pkg("openssl"), pkg("samtools")],
            &strategy,
            Some(&pop),
        );
        let groups_b = pack(&[pkg("openssl"), pkg("bwa")], &strategy, Some(&pop));

        // openssl has the top score (2), so it leads in both packings.
        let openssl_s = &groups_s[0].packages[0];
        let openssl_b = &groups_b[0].packages[0];
        assert_eq!(openssl_s.name, "openssl");
        assert_eq!(openssl_b.name, "openssl");

        // Same package identity on both sides; a deterministic build of
        // these groups would therefore yield identical layer digests.
        assert_eq!(openssl_s.sha256, openssl_b.sha256);
    }
}