simd-sketch 0.3.0

A SIMD-accelerated library to compute a b-bit bottom-h sketch
use std::path::PathBuf;

use clap::Parser;
use itertools::Itertools;
use log::{info, trace};
use packed_seq::{PackedSeqVec, SeqVec};
use simd_sketch::SketchParams;
use std::io::Write;

#[derive(clap::Parser, Debug, Clone)]
struct Args {
    #[command(flatten)]
    params: SketchParams,

    paths: Vec<PathBuf>,

    #[arg(long)]
    stats: Option<PathBuf>,
}

fn main() {
    env_logger::init();

    let args = Args::parse();
    let paths = collect_paths(&args.paths);
    let q = paths.len();

    let k = args.params.k;
    let s = args.params.s;
    let b = args.params.b;

    let sketcher = SketchParams {
        alg: args.params.alg,
        rc: true,
        k,
        s,
        b,
        filter_empty: true,
        filter_out_n: false,
    }
    .build();

    let mut sketches = vec![];
    let start = std::time::Instant::now();

    for path in paths {
        trace!("Sketching {path:?}");
        let mut reader = needletail::parse_fastx_file(path).unwrap();
        let start = std::time::Instant::now();
        let mut seqs = vec![];
        while let Some(r) = reader.next() {
            seqs.push(PackedSeqVec::from_ascii(&r.unwrap().seq()));
        }
        trace!("Reading & filtering took {:?}", start.elapsed());
        let start = std::time::Instant::now();
        let seqs = seqs.iter().map(|s| s.as_slice()).collect_vec();
        sketches.push(sketcher.sketch_seqs(&seqs));
        trace!("sketching itself took {:?}", start.elapsed());
    }
    let t_sketch = start.elapsed();
    info!(
        "Sketching {q} seqs took {t_sketch:?} ({:?} avg)",
        t_sketch / q as u32
    );

    let start = std::time::Instant::now();
    let dists = sketches
        .iter()
        .tuple_combinations()
        .map(|(s1, s2)| s1.jaccard_similarity(s2))
        .collect_vec();
    let t_dist = start.elapsed();
    let cnt = q * (q - 1) / 2;
    info!(
        "Computing {cnt} dists took {t_dist:?} ({:?} avg)",
        t_dist / cnt.max(1) as u32
    );
    info!(
        "Params {:?}",
        Args {
            paths: vec![],
            ..args.clone()
        }
    );

    if let Some(stats) = &args.stats {
        let mut writer = std::fs::File::options()
            .create(true)
            .append(true)
            .write(true)
            .open(stats)
            .unwrap();
        writeln!(
            writer,
            "SimdSketch {:?} {q} {k} {s} {b} {} {}",
            args.params.alg,
            t_sketch.as_secs_f32(),
            t_dist.as_secs_f32()
        )
        .unwrap();
    }

    for dist in dists {
        println!("{dist}");
    }
}

fn collect_paths(paths: &Vec<PathBuf>) -> Vec<PathBuf> {
    let mut res = vec![];
    for path in paths {
        if path.is_dir() {
            res.extend(path.read_dir().unwrap().map(|entry| entry.unwrap().path()));
        } else {
            res.push(path.clone());
        }
    }
    res.sort();
    res
}