Skip to main content

bv_builder/
build.rs

1use std::io::Write;
2use std::path::Path;
3
4use anyhow::{Context, Result};
5use bv_core::lockfile::{CondaPackagePin, LayerDescriptor};
6use futures_util::StreamExt as _;
7use oci_client::{
8    Reference,
9    client::{Client, ClientConfig, ClientProtocol},
10    secrets::RegistryAuth,
11};
12use sha2::{Digest, Sha256};
13
14use crate::layering::{LayerGroup, PackingStrategy, pack};
15use crate::popularity::PopularityMap;
16use crate::spec::ResolvedSpec;
17
/// Modification time, in seconds since the Unix epoch, written into every tar
/// header produced by this module: 0, i.e. 1970-01-01T00:00:00Z.
///
/// Reproducibility rule: all file mtimes are set to this value so that two
/// builds of the same packages produce bit-identical compressed layer blobs.
/// Reference: <https://reproducible-builds.org/docs/source-date-epoch/>
const SOURCE_DATE_EPOCH: u64 = 0;
23
/// An in-memory OCI image ready to be pushed or saved.
pub struct OciImage {
    /// Image name; `build` copies it from the resolved spec's `name`.
    pub name: String,
    /// Image version; `build` copies it from the resolved spec's `version`.
    pub version: String,
    /// Layers in manifest order. `build` emits base image layers first, then
    /// package layers, then the conda-meta layer, then the entrypoint layer.
    pub layers: Vec<OciLayer>,
    /// OCI image config JSON bytes (sha256 needed for manifest).
    pub config: Vec<u8>,
}
32
/// A single image layer: the compressed blob plus the metadata needed to
/// reference it from the manifest and the image config.
pub struct OciLayer {
    /// Compressed layer blob exactly as it is stored in / pulled from a registry.
    pub compressed: Vec<u8>,
    /// Manifest descriptor (digest, size, media type, optional conda pin) for
    /// `compressed`.
    pub descriptor: LayerDescriptor,
    /// sha256 of the uncompressed tarball; used for the OCI config DiffID.
    /// Layers built in this module store it with a `sha256:` prefix.
    pub uncompressed_digest: String,
}
39
40impl OciImage {
41    /// Compute the OCI image manifest JSON (image manifest v2/OCI schema).
42    pub fn manifest_json(&self) -> Result<Vec<u8>> {
43        let config_digest = sha256_hex(&self.config);
44        let config_size = self.config.len() as u64;
45
46        let mut layers_json = String::from("[\n");
47        for (i, layer) in self.layers.iter().enumerate() {
48            let comma = if i + 1 == self.layers.len() { "" } else { "," };
49            layers_json.push_str(&format!(
50                "    {{\"mediaType\":\"{}\",\"digest\":\"{}\",\"size\":{}}}{}\n",
51                layer.descriptor.media_type, layer.descriptor.digest, layer.descriptor.size, comma,
52            ));
53        }
54        layers_json.push(']');
55
56        let manifest = format!(
57            r#"{{
58  "schemaVersion": 2,
59  "mediaType": "application/vnd.oci.image.manifest.v1+json",
60  "config": {{
61    "mediaType": "application/vnd.oci.image.config.v1+json",
62    "digest": "sha256:{config_digest}",
63    "size": {config_size}
64  }},
65  "layers": {layers_json}
66}}"#
67        );
68        Ok(manifest.into_bytes())
69    }
70}
71
72/// Build an `OciImage` from a `ResolvedSpec`.
73///
74/// Each package in the spec becomes one OCI layer (or a group when packing
75/// is enabled). A base OS layer (defaults to debian:12-slim) is prepended so
76/// the container has the dynamic linker and glibc that conda binaries require.
77pub async fn build(
78    resolved: &ResolvedSpec,
79    strategy: &PackingStrategy,
80    popularity: Option<&PopularityMap>,
81) -> Result<OciImage> {
82    let groups = pack(&resolved.packages, strategy, popularity);
83
84    let http = reqwest::Client::builder()
85        .user_agent("bv-builder/0.1")
86        .timeout(std::time::Duration::from_secs(600))
87        .build()?;
88
89    // Pull base image layers first so the container has glibc + dynamic linker.
90    let base_ref = resolved
91        .base
92        .as_deref()
93        .unwrap_or("docker.io/library/debian:12-slim");
94    let mut layers = fetch_base_layers(base_ref)
95        .await
96        .with_context(|| format!("fetch base image '{base_ref}'"))?;
97
98    // buffered (not buffer_unordered) preserves layer input order, which is
99    // required for deterministic manifest digests across rebuilds.
100    // build_group_layer returns None for packages with no extractable files.
101    let concurrency = std::thread::available_parallelism()
102        .map(|n| n.get())
103        .unwrap_or(1)
104        .min(8);
105    let mut pkg_layers: Vec<OciLayer> = futures_util::stream::iter(groups.iter())
106        .map(|g| build_group_layer(&http, g))
107        .buffered(concurrency)
108        .collect::<Vec<_>>()
109        .await
110        .into_iter()
111        .collect::<Result<Vec<Option<OciLayer>>>>()?
112        .into_iter()
113        .flatten()
114        .collect();
115    layers.append(&mut pkg_layers);
116
117    // Meta layer: conda-meta JSON for all packages.
118    let meta_layer = build_meta_layer(resolved)?;
119    layers.push(meta_layer);
120
121    // Entrypoint layer.
122    let entrypoint_layer = build_entrypoint_layer(resolved)?;
123    layers.push(entrypoint_layer);
124
125    let config = build_config(resolved, &layers)?;
126
127    Ok(OciImage {
128        name: resolved.name.clone(),
129        version: resolved.version.clone(),
130        layers,
131        config,
132    })
133}
134
135/// Pull a base OCI image from a registry and return its layers.
136///
137/// The base image (typically `debian:12-slim`) provides glibc and the dynamic
138/// linker that conda binaries depend on. Its layers are prepended before the
139/// conda package layers so the container root FS is complete.
140async fn fetch_base_layers(base_ref: &str) -> Result<Vec<OciLayer>> {
141    use futures_util::StreamExt;
142
143    let reference: Reference = base_ref
144        .parse()
145        .with_context(|| format!("parse base OCI reference '{base_ref}'"))?;
146
147    let oci_config = ClientConfig {
148        protocol: ClientProtocol::HttpsExcept(vec!["localhost".into(), "127.0.0.1".into()]),
149        ..Default::default()
150    };
151    let client = Client::new(oci_config);
152    let auth = if base_ref.contains("ghcr.io") {
153        if let Ok(token) = std::env::var("GITHUB_TOKEN") {
154            RegistryAuth::Basic("token".into(), token)
155        } else {
156            RegistryAuth::Anonymous
157        }
158    } else {
159        RegistryAuth::Anonymous
160    };
161
162    let (manifest, _digest, config_json) = client
163        .pull_manifest_and_config(&reference, &auth)
164        .await
165        .with_context(|| format!("pull manifest+config for '{base_ref}'"))?;
166
167    let base_config: serde_json::Value =
168        serde_json::from_str(&config_json).context("parse base image config")?;
169    let base_diff_ids = base_config["rootfs"]["diff_ids"]
170        .as_array()
171        .cloned()
172        .unwrap_or_default();
173
174    let mut result = Vec::new();
175    for (i, layer_desc) in manifest.layers.iter().enumerate() {
176        let digest = &layer_desc.digest;
177        let media_type = &layer_desc.media_type;
178        let size = layer_desc.size as u64;
179
180        let mut compressed = Vec::new();
181        let mut stream = client
182            .pull_blob_stream(&reference, layer_desc)
183            .await
184            .with_context(|| format!("pull base layer blob {digest}"))?;
185        while let Some(chunk) = stream.next().await {
186            compressed.extend_from_slice(&chunk?);
187        }
188
189        let uncompressed_digest = base_diff_ids
190            .get(i)
191            .and_then(|v| v.as_str())
192            .unwrap_or(digest)
193            .to_string();
194
195        result.push(OciLayer {
196            compressed,
197            uncompressed_digest,
198            descriptor: LayerDescriptor {
199                digest: digest.clone(),
200                size,
201                media_type: media_type.clone(),
202                conda_package: None,
203            },
204        });
205    }
206
207    Ok(result)
208}
209
210/// Download and layer a single package group.
211///
212/// Downloads are async; extraction and zstd compression are CPU-bound and
213/// run in spawn_blocking so they don't starve the async executor's I/O threads.
214/// Returns None if the package(s) contain no extractable files (e.g. pure
215/// Python namespace packages whose pkg- archive is empty after info- is skipped).
216async fn build_group_layer(
217    client: &reqwest::Client,
218    group: &LayerGroup,
219) -> Result<Option<OciLayer>> {
220    // Phase 1: download all packages in this group concurrently.
221    let downloaded: Vec<(crate::spec::ResolvedPackage, Vec<u8>)> =
222        futures_util::future::try_join_all(
223            group
224                .packages
225                .iter()
226                .map(|pkg| download_package(client, pkg)),
227        )
228        .await?;
229
230    let conda_package = if group.packages.len() == 1 {
231        let pkg = &group.packages[0];
232        Some(CondaPackagePin {
233            name: pkg.name.clone(),
234            version: pkg.version.clone(),
235            build: pkg.build.clone(),
236            channel: pkg.channel.clone(),
237            sha256: pkg.sha256.clone(),
238        })
239    } else {
240        None
241    };
242
243    // Phase 2: extract + compress on a blocking thread.
244    tokio::task::spawn_blocking(move || -> Result<Option<OciLayer>> {
245        let work_dir = tempfile::tempdir().context("create temp dir for layer build")?;
246        let prefix = work_dir.path().join("opt").join("conda");
247        std::fs::create_dir_all(&prefix).context("create conda prefix dir")?;
248
249        for (pkg, bytes) in &downloaded {
250            extract_package_bytes(pkg, bytes, &prefix)
251                .with_context(|| format!("extract {}", pkg.filename))?;
252        }
253
254        // Skip packages that extracted no files; only directory scaffolding
255        // (opt/conda/) would produce a deterministic empty layer shared by all
256        // such packages, causing duplicate digest collisions in the manifest.
257        if !prefix_has_files(&prefix) {
258            return Ok(None);
259        }
260
261        let (compressed, uncompressed_digest) = create_reproducible_layer(work_dir.path())?;
262        let digest = format!("sha256:{}", sha256_hex(&compressed));
263        let size = compressed.len() as u64;
264
265        Ok(Some(OciLayer {
266            compressed,
267            uncompressed_digest: format!("sha256:{uncompressed_digest}"),
268            descriptor: LayerDescriptor {
269                digest,
270                size,
271                media_type: "application/vnd.oci.image.layer.v1.tar+zstd".into(),
272                conda_package,
273            },
274        }))
275    })
276    .await
277    .context("layer build task panicked")?
278}
279
/// Report whether `dir` contains at least one regular file at any depth.
///
/// Unreadable directories and entries whose metadata cannot be read are
/// treated as containing nothing. Entry metadata is read without following
/// symlinks, so a symlink counts neither as a file nor as a directory.
fn prefix_has_files(dir: &Path) -> bool {
    let listing = match std::fs::read_dir(dir) {
        Ok(listing) => listing,
        Err(_) => return false,
    };
    listing.flatten().any(|child| match child.metadata() {
        Ok(meta) if meta.is_file() => true,
        Ok(meta) if meta.is_dir() => prefix_has_files(&child.path()),
        _ => false,
    })
}
295
296/// Download a conda package and return its raw bytes.
297async fn download_package(
298    client: &reqwest::Client,
299    pkg: &crate::spec::ResolvedPackage,
300) -> Result<(crate::spec::ResolvedPackage, Vec<u8>)> {
301    use futures_util::StreamExt;
302
303    let resp = client
304        .get(&pkg.url)
305        .send()
306        .await
307        .with_context(|| format!("download {}", pkg.url))?;
308
309    if !resp.status().is_success() {
310        anyhow::bail!("HTTP {} fetching {}", resp.status(), pkg.url);
311    }
312
313    let mut bytes = Vec::new();
314    let mut stream = resp.bytes_stream();
315    while let Some(chunk) = stream.next().await {
316        bytes.extend_from_slice(&chunk?);
317    }
318
319    if !pkg.sha256.is_empty() {
320        let actual = sha256_hex(&bytes);
321        if actual != pkg.sha256 {
322            anyhow::bail!(
323                "sha256 mismatch for {} ({}): expected {} got {}",
324                pkg.name,
325                pkg.filename,
326                pkg.sha256,
327                actual
328            );
329        }
330    }
331
332    Ok((pkg.clone(), bytes))
333}
334
335/// Extract a downloaded conda package into `dest`.
336fn extract_package_bytes(
337    pkg: &crate::spec::ResolvedPackage,
338    bytes: &[u8],
339    dest: &Path,
340) -> Result<()> {
341    if pkg.filename.ends_with(".conda") {
342        extract_conda_archive(bytes, dest)
343    } else if pkg.filename.ends_with(".tar.bz2") {
344        extract_tar_bz2(bytes, dest)
345    } else {
346        Ok(())
347    }
348}
349
350fn extract_conda_archive(data: &[u8], dest: &Path) -> Result<()> {
351    use std::io::Read;
352    let cursor = std::io::Cursor::new(data);
353    let mut zip = zip::ZipArchive::new(cursor).context("open .conda zip")?;
354
355    for i in 0..zip.len() {
356        let mut entry = zip.by_index(i)?;
357        // Only extract pkg- (binaries/libs); skip info- (conda metadata not
358        // needed at container runtime).
359        if entry.name().starts_with("pkg-") && entry.name().ends_with(".tar.zst") {
360            let mut zstd_bytes = Vec::new();
361            entry.read_to_end(&mut zstd_bytes)?;
362            let decompressed = zstd::decode_all(std::io::Cursor::new(zstd_bytes))
363                .context("decompress pkg- zstd")?;
364            extract_tar_bytes(&decompressed, dest)?;
365        }
366    }
367    Ok(())
368}
369
370fn extract_tar_bz2(data: &[u8], dest: &Path) -> Result<()> {
371    let decompressed = bzip2::read::BzDecoder::new(data);
372    let mut archive = tar::Archive::new(decompressed);
373    archive.unpack(dest).context("unpack tar.bz2")?;
374    Ok(())
375}
376
377fn extract_tar_bytes(data: &[u8], dest: &Path) -> Result<()> {
378    let mut archive = tar::Archive::new(std::io::Cursor::new(data));
379    archive.unpack(dest).context("unpack tar")?;
380    Ok(())
381}
382
383/// Create a reproducible, sorted, zstd-compressed OCI layer tarball from `dir`.
384///
385/// Reproducibility rules (https://reproducible-builds.org/docs/archives/):
386/// - PAX tar format
387/// - All mtimes set to SOURCE_DATE_EPOCH
388/// - All uid/gid set to 0
389/// - Entries sorted by path
390/// - zstd level 19 compression
391fn create_reproducible_layer(dir: &Path) -> Result<(Vec<u8>, String)> {
392    use std::fs;
393
394    let mut entries: Vec<std::path::PathBuf> = Vec::new();
395    collect_files(dir, &mut entries)?;
396    entries.sort();
397
398    let mut uncompressed: Vec<u8> = Vec::new();
399    {
400        let mut builder = tar::Builder::new(&mut uncompressed);
401        builder.follow_symlinks(false);
402
403        for entry_path in &entries {
404            let rel = entry_path.strip_prefix(dir).unwrap();
405            let meta = fs::symlink_metadata(entry_path)?;
406
407            let mut header = tar::Header::new_ustar();
408            header.set_metadata(&meta);
409            header.set_mtime(SOURCE_DATE_EPOCH);
410            header.set_uid(0);
411            header.set_gid(0);
412            header.set_username("")?;
413            header.set_groupname("")?;
414
415            if meta.file_type().is_symlink() {
416                let target = fs::read_link(entry_path)?;
417                header.set_size(0);
418                header.set_entry_type(tar::EntryType::Symlink);
419                header.set_path(rel)?;
420                header.set_link_name(&target)?;
421                header.set_cksum();
422                builder.append(&header, std::io::empty())?;
423            } else if meta.is_file() {
424                let data = fs::read(entry_path)?;
425                header.set_size(data.len() as u64);
426                header.set_cksum();
427                builder.append_data(&mut header, rel, data.as_slice())?;
428            } else if meta.is_dir() {
429                header.set_size(0);
430                header.set_cksum();
431                builder.append_data(&mut header, rel, std::io::empty())?;
432            }
433        }
434        builder.finish()?;
435    }
436
437    let uncompressed_digest = sha256_hex(&uncompressed);
438
439    // zstd level 19 for maximum compression density.
440    let compressed =
441        zstd::encode_all(std::io::Cursor::new(&uncompressed), 19).context("zstd compress layer")?;
442
443    Ok((compressed, uncompressed_digest))
444}
445
446fn collect_files(dir: &Path, out: &mut Vec<std::path::PathBuf>) -> Result<()> {
447    for entry in std::fs::read_dir(dir)? {
448        let entry = entry?;
449        let path = entry.path();
450        let meta = std::fs::symlink_metadata(&path)?;
451        if meta.file_type().is_symlink() {
452            out.push(path);
453        } else if meta.is_dir() {
454            out.push(path.clone());
455            collect_files(&path, out)?;
456        } else {
457            out.push(path);
458        }
459    }
460    Ok(())
461}
462
463/// Build a thin layer containing `/opt/conda/conda-meta/<pkg>.json` for every package.
464fn build_meta_layer(resolved: &ResolvedSpec) -> Result<OciLayer> {
465    let work_dir = tempfile::tempdir().context("create temp dir for meta layer")?;
466    let conda_meta = work_dir.path().join("opt").join("conda").join("conda-meta");
467    std::fs::create_dir_all(&conda_meta)?;
468
469    for pkg in &resolved.packages {
470        let meta = serde_json::json!({
471            "name": pkg.name,
472            "version": pkg.version,
473            "build": pkg.build,
474            "channel": pkg.channel,
475            "url": pkg.url,
476            "sha256": pkg.sha256,
477        });
478        let filename = format!("{}-{}-{}.json", pkg.name, pkg.version, pkg.build);
479        let path = conda_meta.join(filename);
480        std::fs::write(&path, serde_json::to_string_pretty(&meta)?)?;
481    }
482
483    let (compressed, uncompressed_digest) = create_reproducible_layer(work_dir.path())?;
484    let digest = format!("sha256:{}", sha256_hex(&compressed));
485    let size = compressed.len() as u64;
486
487    Ok(OciLayer {
488        compressed,
489        uncompressed_digest: format!("sha256:{uncompressed_digest}"),
490        descriptor: LayerDescriptor {
491            digest,
492            size,
493            media_type: "application/vnd.oci.image.layer.v1.tar+zstd".into(),
494            conda_package: None,
495        },
496    })
497}
498
499/// Build the entrypoint layer: a `/bv-entrypoint.sh` script that exec's the
500/// tool's declared command.
501fn build_entrypoint_layer(_resolved: &ResolvedSpec) -> Result<OciLayer> {
502    let work_dir = tempfile::tempdir().context("create temp dir for entrypoint layer")?;
503    let script_path = work_dir.path().join("bv-entrypoint.sh");
504    {
505        let mut f = std::fs::File::create(&script_path)?;
506        writeln!(f, "#!/bin/sh")?;
507        writeln!(f, "# Generated by bv-builder; do not edit")?;
508        writeln!(f, "exec \"$@\"")?;
509    }
510    // Make executable (755).
511    #[cfg(unix)]
512    {
513        use std::os::unix::fs::PermissionsExt;
514        let mut perms = std::fs::metadata(&script_path)?.permissions();
515        perms.set_mode(0o755);
516        std::fs::set_permissions(&script_path, perms)?;
517    }
518
519    let (compressed, uncompressed_digest) = create_reproducible_layer(work_dir.path())?;
520    let digest = format!("sha256:{}", sha256_hex(&compressed));
521    let size = compressed.len() as u64;
522
523    Ok(OciLayer {
524        compressed,
525        uncompressed_digest: format!("sha256:{uncompressed_digest}"),
526        descriptor: LayerDescriptor {
527            digest,
528            size,
529            media_type: "application/vnd.oci.image.layer.v1.tar+zstd".into(),
530            conda_package: None,
531        },
532    })
533}
534
535/// Build the OCI image config JSON.
536fn build_config(resolved: &ResolvedSpec, layers: &[OciLayer]) -> Result<Vec<u8>> {
537    let diff_ids: Vec<String> = layers
538        .iter()
539        .map(|l| l.uncompressed_digest.clone())
540        .collect();
541
542    let config = serde_json::json!({
543        "architecture": resolved.platform.to_string().split('/').nth(1).unwrap_or("amd64"),
544        "os": "linux",
545        "created": "1970-01-01T00:00:00Z",
546        "author": "bv-builder",
547        "config": {
548            "Env": [
549                "PATH=/opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
550                "LD_LIBRARY_PATH=/opt/conda/lib",
551            ],
552            "Labels": {
553                "org.opencontainers.image.title": &resolved.name,
554                "org.opencontainers.image.version": &resolved.version,
555            }
556        },
557        "rootfs": {
558            "type": "layers",
559            "diff_ids": diff_ids,
560        },
561        "history": []
562    });
563
564    Ok(serde_json::to_vec_pretty(&config)?)
565}
566
567pub fn sha256_hex(data: &[u8]) -> String {
568    let mut hasher = Sha256::new();
569    hasher.update(data);
570    hex::encode(hasher.finalize())
571}
572
#[cfg(test)]
mod tests {
    use super::*;

    /// `sha256_hex` must match the well-known SHA-256 of `"hello"`.
    #[test]
    fn sha256_hex_is_correct() {
        let expected = "2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824";
        assert_eq!(sha256_hex(b"hello"), expected);
    }

    /// Archiving the same directory twice must give identical blobs and digests.
    #[test]
    fn create_reproducible_layer_is_deterministic() {
        let dir = tempfile::tempdir().unwrap();
        std::fs::write(dir.path().join("file.txt"), b"content").unwrap();

        let first = create_reproducible_layer(dir.path()).unwrap();
        let second = create_reproducible_layer(dir.path()).unwrap();

        assert_eq!(first.0, second.0, "compressed bytes differ between two runs");
        assert_eq!(first.1, second.1, "digests differ between two runs");
    }
}