convert_genome 0.1.10

Convert DTC, VCF, or BCF genome files to VCF, BCF, or PLINK 1.9
Documentation
use std::{
    fs::File,
    io::{self, Write},
    path::{Path, PathBuf},
};

use anyhow::{Context, Result, anyhow, bail};
use curl::easy::{Easy, WriteError};
use flate2::read::MultiGzDecoder;
use tempfile::TempDir;
use url::Url;
use zip::ZipArchive;

const FASTA_EXTENSIONS: &[&str] = &["fa", "fasta", "fna", "fa.gz", "fasta.gz", "fna.gz"];

pub struct RemoteResource {
    temp_dir: TempDir,
    local_path: PathBuf,
}

impl RemoteResource {
    pub fn local_path(&self) -> &Path {
        &self.local_path
    }

    pub fn temp_dir(&self) -> &TempDir {
        &self.temp_dir
    }
}

pub fn fetch_remote_resource(url: &Url) -> Result<RemoteResource> {
    tracing::info!(target: "convert_genome", source = %url, "downloading remote resource");

    let temp_dir = TempDir::new().context("failed to create temporary directory for download")?;
    let filename = url
        .path_segments()
        .and_then(|mut segments| segments.next_back())
        .filter(|segment| !segment.is_empty())
        .map(|segment| segment.to_string())
        .unwrap_or_else(|| String::from("download"));

    let download_path = temp_dir.path().join(filename);
    download_to_path(url, &download_path).with_context(|| format!("failed to download {url}"))?;

    let prepared = prepare_downloaded_file(temp_dir.path(), &download_path)?;

    Ok(RemoteResource {
        temp_dir,
        local_path: prepared,
    })
}

fn download_to_path(url: &Url, destination: &Path) -> Result<()> {
    let mut easy = Easy::new();
    easy.url(url.as_str())?;
    easy.follow_location(true)?;
    easy.accept_encoding("identity")?;
    easy.fail_on_error(true)?;

    let mut file = File::create(destination)
        .with_context(|| format!("failed to create {}", destination.display()))?;
    let mut write_error: Option<io::Error> = None;

    {
        let mut transfer = easy.transfer();
        transfer.write_function(|data| {
            if let Err(err) = file.write_all(data) {
                write_error = Some(err);
                return Err(WriteError::Pause);
            }
            Ok(data.len())
        })?;
        transfer.perform()?;
    }

    if let Some(err) = write_error {
        return Err(anyhow!(
            "failed to write downloaded data to {}: {err}",
            destination.display()
        ));
    }

    Ok(())
}

fn prepare_downloaded_file(base_dir: &Path, path: &Path) -> Result<PathBuf> {
    if let Some(extension) = path.extension().and_then(|ext| ext.to_str()) {
        match extension.to_ascii_lowercase().as_str() {
            "gz" => return decompress_gzip(path),
            "zip" => return extract_zip_archive(base_dir, path),
            _ => {}
        }
    }

    Ok(path.to_path_buf())
}

fn decompress_gzip(path: &Path) -> Result<PathBuf> {
    let target = path.with_extension("");
    let input = File::open(path).with_context(|| format!("failed to open {}", path.display()))?;
    let mut decoder = MultiGzDecoder::new(input);
    let mut output =
        File::create(&target).with_context(|| format!("failed to create {}", target.display()))?;
    io::copy(&mut decoder, &mut output)
        .with_context(|| format!("failed to decompress {}", path.display()))?;
    Ok(target)
}

fn extract_zip_archive(base_dir: &Path, path: &Path) -> Result<PathBuf> {
    let file = File::open(path).with_context(|| format!("failed to open {}", path.display()))?;
    let mut archive = ZipArchive::new(file)
        .with_context(|| format!("failed to read zip archive {}", path.display()))?;

    let mut selected_index = None;
    let mut selected_name = None;

    for i in 0..archive.len() {
        let entry = archive.by_index(i)?;
        if entry.is_dir() {
            continue;
        }

        let Some(name) = entry.enclosed_name() else {
            continue;
        };

        let name_str = name.to_string_lossy();
        let name_lower = name_str.to_ascii_lowercase();
        let is_fasta = FASTA_EXTENSIONS.iter().any(|ext| name_lower.ends_with(ext));
        if selected_index.is_none() || is_fasta {
            selected_index = Some(i);
            selected_name = Some(name.to_owned());
        }

        if is_fasta {
            break;
        }
    }

    let Some(index) = selected_index else {
        bail!("zip archive {} does not contain files", path.display());
    };

    let mut entry = archive.by_index(index)?;
    let output_name = selected_name
        .and_then(|name| {
            if name.components().count() > 1 {
                tracing::warn!(
                    target: "convert_genome",
                    "zip entry path '{}' flattened to '{}'",
                    name.display(),
                    name.file_name().map(|n| n.to_string_lossy()).unwrap_or_default()
                );
            }
            name.file_name().map(|os| os.to_owned())
        })
        .unwrap_or_else(|| entry.name().into());
    let mut output_path = base_dir.to_path_buf();
    output_path.push(output_name);

    {
        let mut output = File::create(&output_path)
            .with_context(|| format!("failed to create {}", output_path.display()))?;
        io::copy(&mut entry, &mut output)
            .with_context(|| format!("failed to extract {}", entry.name()))?;
    }

    prepare_downloaded_file(base_dir, &output_path)
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use std::io::Write;
    use zip::ZipWriter;

    #[test]
    fn decompresses_gzip_files() {
        let dir = TempDir::new().unwrap();
        let source_path = dir.path().join("test.fa.gz");
        let data = b"ACGT\n";
        {
            let file = File::create(&source_path).unwrap();
            let mut encoder = flate2::write::GzEncoder::new(file, flate2::Compression::default());
            encoder.write_all(data).unwrap();
            encoder.finish().unwrap();
        }

        let decompressed = prepare_downloaded_file(dir.path(), &source_path).unwrap();
        let contents = fs::read_to_string(decompressed).unwrap();
        assert_eq!(contents, "ACGT\n");
    }

    #[test]
    fn extracts_zip_archives() {
        let dir = TempDir::new().unwrap();
        let zip_path = dir.path().join("archive.zip");
        {
            let file = File::create(&zip_path).unwrap();
            let mut zip = ZipWriter::new(file);
            let options = zip::write::FileOptions::<()>::default();
            zip.start_file("test.fa", options).unwrap();
            zip.write_all(b">chr1\nACGT\n").unwrap();
            zip.finish().unwrap();
        }

        let extracted = prepare_downloaded_file(dir.path(), &zip_path).unwrap();
        let contents = fs::read_to_string(extracted).unwrap();
        assert!(contents.contains("ACGT"));
    }

    #[test]
    fn extracts_nested_compressed_files() {
        let dir = TempDir::new().unwrap();
        let nested_zip = dir.path().join("nested.zip");

        let inner_gz_path = dir.path().join("inner.fa.gz");
        {
            let file = File::create(&inner_gz_path).unwrap();
            let mut encoder = flate2::write::GzEncoder::new(file, flate2::Compression::default());
            encoder.write_all(b">chr1\nNNNN\n").unwrap();
            encoder.finish().unwrap();
        }

        {
            let file = File::create(&nested_zip).unwrap();
            let mut zip = ZipWriter::new(file);
            let options = zip::write::FileOptions::<()>::default();
            let data = fs::read(&inner_gz_path).unwrap();
            zip.start_file("inner.fa.gz", options).unwrap();
            zip.write_all(&data).unwrap();
            zip.finish().unwrap();
        }

        let extracted = prepare_downloaded_file(dir.path(), &nested_zip).unwrap();
        let contents = fs::read_to_string(extracted).unwrap();
        assert!(contents.contains("NNNN"));
    }
}