Skip to main content

cf1_rs/
lib.rs

1pub mod budget;
2pub mod classify;
3pub mod directed_kmer;
4pub mod dna;
5pub mod kmer;
6pub mod minimizer;
7pub mod mphf;
8pub mod output;
9pub mod params;
10pub mod pipeline;
11pub mod state;
12pub mod state_vector;
13pub mod superkmer;
14
15use std::path::PathBuf;
16
17use crate::kmer::{Kmer, KmerBits};
18use crate::mphf::RadixSortDedup;
19use crate::output::UnipathsMeta;
20use crate::params::Params;
21use crate::pipeline::run_pipeline;
22
/// How to specify input sequences for `cf_build`.
///
/// Each variant carries only paths, so the type can be cheaply cloned,
/// compared and debug-printed by callers.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CfInput {
    /// Pre-resolved file paths (FASTA/FASTQ, optionally gzipped).
    Files(Vec<PathBuf>),
    /// Path to a file listing input paths, one per line.
    ListFile(PathBuf),
    /// Directory containing FASTA/FASTQ files.
    Directory(PathBuf),
}
32
/// Result of a successful `cf_build` invocation.
///
/// Holds the locations of the three output files together with summary
/// statistics about the constructed graph.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CfBuildResult {
    /// Path to the segment file (`{prefix}.cf_seg`).
    pub seg_file: PathBuf,
    /// Path to the sequence/tiling file (`{prefix}.cf_seq`).
    pub seq_file: PathBuf,
    /// Path to the JSON metadata file (`{prefix}.json`).
    pub json_file: PathBuf,
    /// Number of distinct k-mers (vertices).
    pub vertex_count: u64,
    /// Number of maximal unitigs.
    pub unitig_count: u64,
    /// Length of the longest unitig (in bases).
    pub max_unitig_len: usize,
    /// Length of the shortest unitig (in bases).
    pub min_unitig_len: usize,
    /// Sum of all unitig lengths (in bases).
    pub sum_unitig_len: u64,
    /// Input sequences shorter than k (name, length).
    pub short_seqs: Vec<(String, usize)>,
}
54
55/// Build a compacted de Bruijn graph from input sequences.
56///
57/// Uses a scoped rayon thread pool internally, so it is safe to call from
58/// a program that already has its own rayon global pool.
59///
60/// # Required parameters
61/// - `input` — input sequence specification
62/// - `output_prefix` — prefix for output files (`.cf_seg`, `.cf_seq`, `.json`)
63///
64/// # Optional parameters (with defaults)
65/// - `k` — k-mer length, must be odd and in \[1, 63\] (default: 31)
66/// - `threads` — number of worker threads (default: 1)
67/// - `work_dir` — directory for temporary files (default: parent of `output_prefix`)
68/// - `num_bins` — number of minimizer bins (default: 128)
69/// - `memory_budget_gb` — memory budget for MPHF construction in GB (default: 4.0)
70#[bon::builder]
71pub fn cf_build(
72    input: CfInput,
73    output_prefix: PathBuf,
74    #[builder(default = 31)] k: usize,
75    #[builder(default = 1)] threads: usize,
76    work_dir: Option<PathBuf>,
77    #[builder(default = 128)] num_bins: usize,
78    #[builder(default = 4.0)] memory_budget_gb: f64,
79) -> anyhow::Result<CfBuildResult> {
80    let input_files = resolve_input_files(&input)?;
81
82    let params = Params::from_resolved(
83        input_files,
84        k,
85        threads,
86        output_prefix,
87        3, // format=3 (GFA-reduced)
88        work_dir,
89        true,  // track_short_seqs
90        true,  // poly_n_stretch
91        true,  // collate_in_mem
92        num_bins,
93        memory_budget_gb,
94    )?;
95
96    let pool = rayon::ThreadPoolBuilder::new()
97        .num_threads(threads)
98        .build()?;
99
100    pool.install(|| dispatch_k!(k, run_and_collect, &params))
101}
102
103/// Resolve input files from a `CfInput` specification.
104fn resolve_input_files(input: &CfInput) -> anyhow::Result<Vec<PathBuf>> {
105    let mut files = Vec::new();
106    match input {
107        CfInput::Files(paths) => {
108            files.extend(paths.iter().cloned());
109        }
110        CfInput::ListFile(list_path) => {
111            let content = std::fs::read_to_string(list_path)?;
112            for line in content.lines() {
113                let line = line.trim();
114                if !line.is_empty() {
115                    files.push(PathBuf::from(line));
116                }
117            }
118        }
119        CfInput::Directory(dir) => {
120            for entry in std::fs::read_dir(dir)? {
121                let entry = entry?;
122                let path = entry.path();
123                if path.is_file() {
124                    let ext = path
125                        .extension()
126                        .and_then(|e| e.to_str())
127                        .unwrap_or("");
128                    if matches!(ext, "fa" | "fasta" | "fna" | "gz" | "fq" | "fastq") {
129                        files.push(path);
130                    }
131                }
132            }
133        }
134    }
135    anyhow::ensure!(!files.is_empty(), "No input files found");
136    Ok(files)
137}
138
139/// Internal: run pipeline and collect results into `CfBuildResult`.
140fn run_and_collect<const K: usize>(params: &Params) -> anyhow::Result<CfBuildResult>
141where
142    Kmer<K>: KmerBits,
143    <Kmer<K> as KmerBits>::Storage: RadixSortDedup,
144{
145    let (meta, short_seqs) = run_pipeline::<K>(params)?;
146    Ok(build_result(params, &meta, short_seqs))
147}
148
149/// Map pipeline outputs to `CfBuildResult`.
150fn build_result(params: &Params, meta: &UnipathsMeta, short_seqs: Vec<(String, usize)>) -> CfBuildResult {
151    CfBuildResult {
152        seg_file: params.segment_file_path(),
153        seq_file: params.sequence_file_path(),
154        json_file: params.json_file_path(),
155        vertex_count: meta.kmer_count,
156        unitig_count: meta.unipath_count,
157        max_unitig_len: meta.max_len,
158        min_unitig_len: if meta.min_len == usize::MAX { 0 } else { meta.min_len },
159        sum_unitig_len: meta.sum_len,
160        short_seqs,
161    }
162}