sourmash 0.11.0

MinHash sketches for genomic data
Documentation
use getset::{CopyGetters, Getters, Setters};
use typed_builder::TypedBuilder;

use crate::encodings::HashFunctions;
use crate::index::MHBT;
use crate::signature::Signature;
use crate::sketch::minhash::{max_hash_for_scaled, KmerMinHashBTree};
use crate::sketch::Sketch;
use crate::Error;

pub fn prepare(index_path: &str) -> Result<(), Error> {
    let mut index = MHBT::from_path(index_path)?;

    // TODO equivalent to fill_internal in python
    //unimplemented!();

    index.save_file(index_path, None)?;

    Ok(())
}

impl Signature {
    pub fn from_params(params: &ComputeParameters) -> Signature {
        let template = build_template(params);

        Signature::builder()
            .hash_function("0.murmur64")
            .name(params.merge.clone())
            .filename(None)
            .signatures(template)
            .build()
    }
}

#[allow(dead_code)]
#[derive(TypedBuilder, CopyGetters, Getters, Setters)]
pub struct ComputeParameters {
    #[getset(get = "pub", set = "pub")]
    #[builder(default = vec![21, 31, 51])]
    ksizes: Vec<u32>,

    #[getset(get_copy = "pub", set = "pub")]
    #[builder(default = false)]
    check_sequence: bool,

    #[getset(get_copy = "pub", set = "pub")]
    #[builder(default = true)]
    dna: bool,

    #[getset(get_copy = "pub", set = "pub")]
    #[builder(default = false)]
    dayhoff: bool,

    #[getset(get_copy = "pub", set = "pub")]
    #[builder(default = false)]
    hp: bool,

    #[getset(get_copy = "pub", set = "pub")]
    #[builder(default = false)]
    singleton: bool,

    #[getset(get_copy = "pub", set = "pub")]
    #[builder(default = 0u64)]
    scaled: u64,

    #[getset(get_copy = "pub", set = "pub")]
    #[builder(default = false)]
    force: bool,

    #[getset(get = "pub", set = "pub")]
    #[builder(default = None)]
    output: Option<String>, // TODO: check

    #[getset(get_copy = "pub", set = "pub")]
    #[builder(default = 500u32)]
    num_hashes: u32,

    #[getset(get_copy = "pub", set = "pub")]
    #[builder(default = false)]
    protein: bool,

    #[getset(get_copy = "pub", set = "pub")]
    #[builder(default = false)]
    name_from_first: bool,

    #[getset(get_copy = "pub", set = "pub")]
    #[builder(default = 42u64)]
    seed: u64,

    #[getset(get_copy = "pub", set = "pub")]
    #[builder(default = false)]
    input_is_protein: bool,

    #[getset(get = "pub", set = "pub")]
    #[builder(default = None)]
    merge: Option<String>,

    #[getset(get_copy = "pub", set = "pub")]
    #[builder(default = false)]
    track_abundance: bool,

    #[getset(get_copy = "pub", set = "pub")]
    #[builder(default = false)]
    randomize: bool,

    #[getset(get = "pub", set = "pub")]
    #[builder(default = "CC0".into())]
    license: String,

    #[getset(get_copy = "pub", set = "pub")]
    #[builder(default = 2usize)]
    processes: usize,
}

impl Default for ComputeParameters {
    fn default() -> Self {
        Self::builder().build()
    }
}

pub fn build_template(params: &ComputeParameters) -> Vec<Sketch> {
    let max_hash = max_hash_for_scaled(params.scaled);

    params
        .ksizes
        .iter()
        .flat_map(|k| {
            let mut ksigs = vec![];

            if params.protein {
                ksigs.push(Sketch::LargeMinHash(
                    KmerMinHashBTree::builder()
                        .num(params.num_hashes)
                        .ksize(*k)
                        .hash_function(HashFunctions::murmur64_protein)
                        .max_hash(max_hash)
                        .seed(params.seed)
                        .abunds(if params.track_abundance {
                            Some(Default::default())
                        } else {
                            None
                        })
                        .build(),
                ));
            }

            if params.dayhoff {
                ksigs.push(Sketch::LargeMinHash(
                    KmerMinHashBTree::builder()
                        .num(params.num_hashes)
                        .ksize(*k)
                        .hash_function(HashFunctions::murmur64_dayhoff)
                        .max_hash(max_hash)
                        .seed(params.seed)
                        .abunds(if params.track_abundance {
                            Some(Default::default())
                        } else {
                            None
                        })
                        .build(),
                ));
            }

            if params.hp {
                ksigs.push(Sketch::LargeMinHash(
                    KmerMinHashBTree::builder()
                        .num(params.num_hashes)
                        .ksize(*k)
                        .hash_function(HashFunctions::murmur64_hp)
                        .max_hash(max_hash)
                        .seed(params.seed)
                        .abunds(if params.track_abundance {
                            Some(Default::default())
                        } else {
                            None
                        })
                        .build(),
                ));
            }

            if params.dna {
                ksigs.push(Sketch::LargeMinHash(
                    KmerMinHashBTree::builder()
                        .num(params.num_hashes)
                        .ksize(*k)
                        .hash_function(HashFunctions::murmur64_DNA)
                        .max_hash(max_hash)
                        .seed(params.seed)
                        .abunds(if params.track_abundance {
                            Some(Default::default())
                        } else {
                            None
                        })
                        .build(),
                ));
            }

            ksigs
        })
        .collect()
}