ragc_core/
lib.rs

1#![allow(clippy::all)]
2#![allow(dead_code)]
3#![allow(unused_variables)]
4#![allow(unused_imports)]
5#![allow(private_interfaces)]
6#![allow(unexpected_cfgs)]
7//! Core compression and decompression algorithms for the AGC genome compression format.
8//!
9//! This crate implements the complete AGC compression pipeline with full C++ AGC
10//! format compatibility. Archives created by this library can be read by the C++
11//! implementation and vice versa.
12//!
13//! # Features
14//!
15//! - **Compression** - Create AGC archives from FASTA files
16//! - **Decompression** - Extract genomes from AGC archives
17//! - **C++ Compatibility** - Bidirectional format interoperability
18//! - **Multi-sample support** - Handle multiple genomes in one archive
19//! - **LZ differential encoding** - Efficient encoding against reference sequences
20//! - **ZSTD compression** - High-ratio compression of segments
21//!
22//! # Examples
23//!
24//! ## Compressing genomes
25//!
26//! ```ignore
27//! use ragc_core::{Compressor, CompressorConfig};
28//! use std::path::Path;
29//!
30//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
31//! // Create a compressor
32//! let config = CompressorConfig::default();
33//! let mut compressor = Compressor::new("output.agc", config)?;
34//!
35//! // Add FASTA files
36//! compressor.add_fasta_file("sample1", Path::new("genome1.fasta"))?;
37//! compressor.add_fasta_file("sample2", Path::new("genome2.fasta"))?;
38//!
39//! // Finalize the archive
40//! compressor.finalize()?;
41//! # Ok(())
42//! # }
43//! ```
44//!
45//! ## Decompressing genomes
46//!
47//! ```no_run
48//! use ragc_core::{Decompressor, DecompressorConfig};
49//!
50//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
51//! // Open an archive
52//! let config = DecompressorConfig::default();
53//! let mut decompressor = Decompressor::open("archive.agc", config)?;
54//!
55//! // List available samples
56//! let samples = decompressor.list_samples();
57//! println!("Found {} samples", samples.len());
58//!
59//! // Extract a sample
60//! let contigs = decompressor.get_sample("sample1")?;
61//! for (name, sequence) in contigs {
62//!     println!(">{}",  name);
63//!     // sequence is Vec<u8> with numeric encoding (A=0, C=1, G=2, T=3)
64//! }
65//! # Ok(())
66//! # }
67//! ```
68//!
69//! ## Working with k-mers
70//!
71//! ```
72//! use ragc_core::{Kmer, KmerMode};
73//!
74//! // Create a canonical k-mer
75//! let mut kmer = Kmer::new(21, KmerMode::Canonical);
76//!
77//! // Insert bases (0=A, 1=C, 2=G, 3=T)
78//! kmer.insert(0); // A
79//! kmer.insert(1); // C
80//! kmer.insert(2); // G
81//!
82//! if kmer.is_full() {
83//!     let value = kmer.data();
84//!     println!("K-mer value: {}", value);
85//! }
86//! ```
87//!
88//! ## Custom compression settings
89//!
90//! ```ignore
91//! use ragc_core::CompressorConfig;
92//!
93//! let config = CompressorConfig {
94//!     kmer_length: 25,        // Use 25-mers instead of default 21
95//!     segment_size: 2000,     // Larger segments
96//!     min_match_len: 20,      // Minimum LZ match length
97//!     verbosity: 2,           // More verbose output
98//! };
99//! ```
100//!
101//! # Archive Format
102//!
103//! The AGC format organizes data into streams:
104//!
105//! - **file_type_info** - Version and producer metadata
106//! - **params** - Compression parameters (k-mer length, segment size)
107//! - **splitters** - Singleton k-mers used for segmentation (future)
108//! - **seg-NN** or **seg_dNN** - Compressed genome segments
109//! - **collection** - Sample and contig metadata
110//!
111//! # Compatibility
112//!
113//! This implementation is tested for compatibility with C++ AGC:
114//!
115//! - Archives created by ragc can be read by C++ AGC
116//! - Archives created by C++ AGC can be read by ragc
117//! - Format version 3.0 support
118//! - SHA256-verified roundtrip testing
119
120pub mod agc_compressor;
121pub mod bloom_filter;
122pub mod contig_compression;
123pub mod contig_iterator;
124pub mod decompressor;
125pub mod env_cache;
126pub mod genome_io;
127pub mod kmer;
128pub mod kmer_extract;
129pub mod lz_diff;
130pub mod lz_matcher;
131pub mod memory_bounded_queue;
132pub mod preprocessing;
133pub mod priority_queue;
134pub mod segment;
135pub mod segment_buffer;
136pub mod segment_compression;
137pub mod splitters;
138pub mod task;
139pub mod tuple_packing;
140pub mod worker;
141pub mod zstd_pool;
142
143// C++ AGC FFI (always available for byte-identical archives)
144#[path = "ffi/agc_index.rs"]
145pub mod agc_index_ffi;
146
147#[path = "ffi/agc_compress.rs"]
148pub mod agc_compress_ffi;
149
150#[path = "ffi/splitters.rs"]
151pub mod splitters_ffi;
152
153#[path = "ffi/segment_helpers.rs"]
154pub mod segment_helpers_ffi;
155
156#[path = "ffi/kmer_helpers.rs"]
157pub mod kmer_helpers_ffi;
158
159#[path = "ffi/splitter_check.rs"]
160pub mod splitter_check_ffi;
161
162#[path = "ffi/segment_boundary.rs"]
163pub mod segment_boundary_ffi;
164
165#[path = "ffi/base_validation.rs"]
166pub mod base_validation_ffi;
167
168#[path = "ffi/reverse_complement.rs"]
169pub mod reverse_complement_ffi;
170
171#[path = "ffi/segment_split.rs"]
172pub mod segment_split_ffi;
173
174#[path = "ffi/kmer_pair.rs"]
175pub mod kmer_pair_ffi;
176
177#[path = "ffi/preprocessing.rs"]
178pub mod preprocessing_ffi;
179
180#[path = "ffi/find_splitters_in_contig.rs"]
181pub mod find_splitters_in_contig_ffi;
182
#[cfg(feature = "cpp_agc")]
pub mod ragc_ffi {
    //! Bindings to the C++ AGC routines exported over the C ABI, plus safe Rust wrappers.
    //!
    //! The raw `agc_*` declarations are `unsafe` to call. The free functions and
    //! [`GroupingEngine`] below wrap them so callers only deal with slices and
    //! plain values. Sentinel return values used by the C side (`0` status codes,
    //! `u32::MAX`) are translated into `Option` by the wrappers.

    use std::ffi::c_void;
    use std::ptr::NonNull;

    extern "C" {
        /// Fills `out_costs` with per-position costs of encoding `text` against `ref`.
        ///
        /// NOTE(review): `out_costs` is assumed to need room for `text_len` entries and the
        /// return value is presumably the number of entries written; confirm against the
        /// C++ side.
        pub fn agc_cost_vector(
            prefix: i32,
            ref_ptr: *const u8,
            ref_len: usize,
            text_ptr: *const u8,
            text_len: usize,
            min_match_len: u32,
            out_costs: *mut u32,
        ) -> usize;

        /// Searches for the best split position of `text` between two references.
        ///
        /// Returns non-zero when the three `out_*` values were produced.
        pub fn agc_best_split(
            left_ref: *const u8,
            left_len: usize,
            right_ref: *const u8,
            right_len: usize,
            text_ptr: *const u8,
            text_len: usize,
            min_match_len: u32,
            k: u32,
            front_lt_mid: i32,
            mid_lt_back: i32,
            should_reverse: i32,
            out_best_pos: *mut u32,
            out_seg2_start: *mut u32,
            out_should_split: *mut i32,
        ) -> i32;

        /// Looks for a middle k-mer given the front and back neighbour lists.
        ///
        /// Returns non-zero when `out_middle` was set.
        pub fn agc_find_middle(
            front_list: *const u64,
            n_front: usize,
            back_list: *const u64,
            n_back: usize,
            out_middle: *mut u64,
        ) -> i32;

        /// Combined middle-k-mer lookup and split decision.
        ///
        /// Returns non-zero when the `out_*` values were produced.
        pub fn agc_decide_split(
            front_list: *const u64,
            n_front: usize,
            back_list: *const u64,
            n_back: usize,
            left_ref: *const u8,
            left_len: usize,
            right_ref: *const u8,
            right_len: usize,
            text_ptr: *const u8,
            text_len: usize,
            front_kmer: u64,
            back_kmer: u64,
            min_match_len: u32,
            k: u32,
            should_reverse: i32,
            out_has_middle: *mut i32,
            out_middle: *mut u64,
            out_best_pos: *mut u32,
            out_seg2_start: *mut u32,
            out_should_split: *mut i32,
        ) -> i32;

        // Grouping Engine FFI: an opaque C++ object addressed through a `c_void` handle.
        pub fn agc_grouping_engine_create(k: u32, start_group_id: u32) -> *mut c_void;
        pub fn agc_grouping_engine_destroy(engine: *mut c_void);
        pub fn agc_grouping_engine_register(
            engine: *mut c_void,
            kmer_front: u64,
            kmer_back: u64,
            group_id: u32,
        );
        pub fn agc_grouping_engine_find_middle(
            engine: *mut c_void,
            front: u64,
            back: u64,
            out_middle: *mut u64,
        ) -> i32;
        pub fn agc_grouping_engine_group_exists(
            engine: *mut c_void,
            kmer_front: u64,
            kmer_back: u64,
        ) -> i32;
        pub fn agc_grouping_engine_get_group_id(
            engine: *mut c_void,
            kmer_front: u64,
            kmer_back: u64,
        ) -> u32;
        pub fn agc_grouping_engine_alloc_id(engine: *mut c_void) -> u32;

        /// Estimate function for comparing with RAGC's own `estimate()`.
        pub fn agc_estimate(
            ref_ptr: *const u8,
            ref_len: usize,
            text_ptr: *const u8,
            text_len: usize,
            min_match_len: u32,
            bound: u32,
        ) -> u32;

        /// The real `CLZDiff_V2::Estimate` from `agc_compress.cpp` (always linked).
        ///
        /// This is the exact function used by C++ AGC's
        /// `find_cand_segment_with_one_splitter`.
        pub fn agc_lzdiff_v2_estimate(
            ref_ptr: *const u8,
            ref_len: usize,
            text_ptr: *const u8,
            text_len: usize,
            min_match_len: u32,
            bound: u32,
        ) -> u32;

        /// The real `CLZDiff_V2::Encode` from `agc_compress.cpp`.
        ///
        /// Writes into `out_buf` (capacity `out_buf_len`) and returns the encoded length;
        /// the Rust wrapper treats `u32::MAX` as the failure sentinel.
        pub fn agc_lzdiff_v2_encode(
            ref_ptr: *const u8,
            ref_len: usize,
            text_ptr: *const u8,
            text_len: usize,
            min_match_len: u32,
            out_buf: *mut u8,
            out_buf_len: usize,
        ) -> u32;
    }

    /// Computes the estimate (total encoding cost) of `text` against `reference`
    /// using the C++ FFI, for comparison with the Rust implementation.
    pub fn estimate(reference: &[u8], text: &[u8], min_match_len: u32, bound: u32) -> u32 {
        // SAFETY: every pointer is taken from a live slice and passed together with that
        // slice's exact length; both slices outlive the call. The C++ side is trusted to
        // stay within those bounds.
        unsafe {
            agc_estimate(
                reference.as_ptr(),
                reference.len(),
                text.as_ptr(),
                text.len(),
                min_match_len,
                bound,
            )
        }
    }

    /// Computes the estimate using the real `CLZDiff_V2::Estimate` from C++ AGC.
    ///
    /// This is the exact algorithm used by `find_cand_segment_with_one_splitter`.
    pub fn lzdiff_v2_estimate(
        reference: &[u8],
        text: &[u8],
        min_match_len: u32,
        bound: u32,
    ) -> u32 {
        // SAFETY: pointers and lengths come from the same live slices, which outlive the
        // call; the C++ side is trusted to read only within those bounds.
        unsafe {
            agc_lzdiff_v2_estimate(
                reference.as_ptr(),
                reference.len(),
                text.as_ptr(),
                text.len(),
                min_match_len,
                bound,
            )
        }
    }

    /// Encodes `text` against `reference` using the real `CLZDiff_V2::Encode` from C++ AGC.
    ///
    /// Returns the encoded bytes, or `None` when the C++ side signals failure
    /// (`u32::MAX`) or reports a length that does not fit the buffer handed to it.
    pub fn lzdiff_v2_encode(reference: &[u8], text: &[u8], min_match_len: u32) -> Option<Vec<u8>> {
        // Output buffer: twice the text size plus a fixed 1 KiB margin.
        let mut out_buf = vec![0u8; text.len() * 2 + 1024];

        // SAFETY: input pointers/lengths come from live slices; `out_buf` is a live,
        // initialised allocation whose exact capacity is passed as `out_buf_len`.
        let result = unsafe {
            agc_lzdiff_v2_encode(
                reference.as_ptr(),
                reference.len(),
                text.as_ptr(),
                text.len(),
                min_match_len,
                out_buf.as_mut_ptr(),
                out_buf.len(),
            )
        };

        if result == u32::MAX {
            return None;
        }

        // A reported length beyond the buffer cannot describe data that was actually
        // written. `truncate` would silently keep the whole zero-padded buffer in that
        // case, so treat it as a failure instead of returning bogus bytes.
        let written = usize::try_from(result).ok()?;
        if written > out_buf.len() {
            return None;
        }

        out_buf.truncate(written);
        Some(out_buf)
    }

    /// Returns the cost vector of `text` against `reference` as computed by C++ AGC.
    ///
    /// The result always has `text.len()` entries; `prefix` is forwarded to the C++
    /// side as `1`/`0`.
    pub fn cost_vector(
        prefix: bool,
        reference: &[u8],
        text: &[u8],
        min_match_len: u32,
    ) -> Vec<u32> {
        let mut costs = vec![0u32; text.len()];

        // SAFETY: input pointers/lengths come from live slices and `costs` is a live
        // buffer of `text.len()` entries.
        // NOTE(review): this assumes the C++ side writes at most `text_len` costs; confirm.
        // The returned count is intentionally not used: the vector keeps its full
        // `text.len()` length regardless of what the C++ side reports.
        let _reported = unsafe {
            agc_cost_vector(
                i32::from(prefix),
                reference.as_ptr(),
                reference.len(),
                text.as_ptr(),
                text.len(),
                min_match_len,
                costs.as_mut_ptr(),
            )
        };

        costs
    }

    /// Asks C++ AGC for the best split of `text` between `left_ref` and `right_ref`.
    ///
    /// Returns `Some((best_pos, seg2_start, should_split))` when the C++ call reports
    /// success (non-zero status), `None` otherwise.
    pub fn best_split(
        left_ref: &[u8],
        right_ref: &[u8],
        text: &[u8],
        min_match_len: u32,
        k: u32,
        front_lt_mid: bool,
        mid_lt_back: bool,
        should_reverse: bool,
    ) -> Option<(usize, usize, bool)> {
        let mut best_pos: u32 = 0;
        let mut seg2_start: u32 = 0;
        let mut should_split: i32 = 0;

        // SAFETY: slice pointers are paired with their own lengths and the out-pointers
        // refer to initialised locals that live for the whole call.
        let status = unsafe {
            agc_best_split(
                left_ref.as_ptr(),
                left_ref.len(),
                right_ref.as_ptr(),
                right_ref.len(),
                text.as_ptr(),
                text.len(),
                min_match_len,
                k,
                i32::from(front_lt_mid),
                i32::from(mid_lt_back),
                i32::from(should_reverse),
                &mut best_pos,
                &mut seg2_start,
                &mut should_split,
            )
        };

        // `u32 -> usize` is a widening conversion on the supported targets.
        (status != 0).then_some((best_pos as usize, seg2_start as usize, should_split != 0))
    }

    /// Finds the middle k-mer for the given neighbour lists via C++ AGC.
    ///
    /// Returns `None` when the C++ call reports that no middle was found (zero status).
    pub fn find_middle(front_neighbors: &[u64], back_neighbors: &[u64]) -> Option<u64> {
        let mut middle: u64 = 0;

        // SAFETY: slice pointers are paired with their own lengths; `middle` is an
        // initialised local that outlives the call.
        let status = unsafe {
            agc_find_middle(
                front_neighbors.as_ptr(),
                front_neighbors.len(),
                back_neighbors.as_ptr(),
                back_neighbors.len(),
                &mut middle,
            )
        };

        (status != 0).then_some(middle)
    }

    /// Runs the combined middle-lookup and split decision in C++ AGC.
    ///
    /// Returns `Some((has_middle, middle, best_pos, seg2_start, should_split))` when the
    /// C++ call reports success (non-zero status), `None` otherwise.
    pub fn decide_split(
        front_neighbors: &[u64],
        back_neighbors: &[u64],
        left_ref: &[u8],
        right_ref: &[u8],
        text: &[u8],
        front_kmer: u64,
        back_kmer: u64,
        min_match_len: u32,
        k: u32,
        should_reverse: bool,
    ) -> Option<(bool, u64, usize, usize, bool)> {
        let mut has_middle: i32 = 0;
        let mut middle: u64 = 0;
        let mut best_pos: u32 = 0;
        let mut seg2_start: u32 = 0;
        let mut should_split: i32 = 0;

        // SAFETY: slice pointers are paired with their own lengths and every out-pointer
        // refers to an initialised local that lives for the whole call.
        let status = unsafe {
            agc_decide_split(
                front_neighbors.as_ptr(),
                front_neighbors.len(),
                back_neighbors.as_ptr(),
                back_neighbors.len(),
                left_ref.as_ptr(),
                left_ref.len(),
                right_ref.as_ptr(),
                right_ref.len(),
                text.as_ptr(),
                text.len(),
                front_kmer,
                back_kmer,
                min_match_len,
                k,
                i32::from(should_reverse),
                &mut has_middle,
                &mut middle,
                &mut best_pos,
                &mut seg2_start,
                &mut should_split,
            )
        };

        (status != 0).then_some((
            has_middle != 0,
            middle,
            best_pos as usize,
            seg2_start as usize,
            should_split != 0,
        ))
    }

    /// Safe Rust wrapper for the C++ `GroupingEngine`.
    ///
    /// Owns the C++ object: it is created in [`GroupingEngine::new`] and destroyed
    /// exactly once when the wrapper is dropped.
    pub struct GroupingEngine {
        /// Handle returned by `agc_grouping_engine_create`; never null.
        ptr: NonNull<c_void>,
    }

    impl GroupingEngine {
        /// Creates a new engine for k-mer length `k`, starting at `start_group_id`.
        ///
        /// # Panics
        ///
        /// Panics if the C++ side returns a null handle. Every other method hands the
        /// handle straight back to C++, so continuing with null would not be sound.
        pub fn new(k: u32, start_group_id: u32) -> Self {
            // SAFETY: the call takes only plain values and has no pointer preconditions.
            let raw = unsafe { agc_grouping_engine_create(k, start_group_id) };
            let ptr = NonNull::new(raw).expect("agc_grouping_engine_create returned a null engine");
            Self { ptr }
        }

        /// Registers the group `group_id` under the `(kmer_front, kmer_back)` pair.
        pub fn register_group(&mut self, kmer_front: u64, kmer_back: u64, group_id: u32) {
            // SAFETY: `self.ptr` is the non-null handle obtained in `new` and is only
            // destroyed in `drop`.
            unsafe {
                agc_grouping_engine_register(self.ptr.as_ptr(), kmer_front, kmer_back, group_id);
            }
        }

        /// Looks up a middle k-mer between `front` and `back`.
        ///
        /// Returns `None` when the C++ call reports a zero status.
        pub fn find_middle(&self, front: u64, back: u64) -> Option<u64> {
            let mut middle: u64 = 0;
            // SAFETY: `self.ptr` is a live engine handle; `middle` is an initialised
            // local that outlives the call.
            let status = unsafe {
                agc_grouping_engine_find_middle(self.ptr.as_ptr(), front, back, &mut middle)
            };
            (status != 0).then_some(middle)
        }

        /// Returns `true` when a group is registered for `(kmer_front, kmer_back)`.
        pub fn group_exists(&self, kmer_front: u64, kmer_back: u64) -> bool {
            // SAFETY: `self.ptr` is a live engine handle.
            unsafe { agc_grouping_engine_group_exists(self.ptr.as_ptr(), kmer_front, kmer_back) != 0 }
        }

        /// Returns the group id for `(kmer_front, kmer_back)`.
        ///
        /// The C++ sentinel `u32::MAX` is mapped to `None`.
        pub fn get_group_id(&self, kmer_front: u64, kmer_back: u64) -> Option<u32> {
            // SAFETY: `self.ptr` is a live engine handle.
            let gid =
                unsafe { agc_grouping_engine_get_group_id(self.ptr.as_ptr(), kmer_front, kmer_back) };
            (gid != u32::MAX).then_some(gid)
        }

        /// Allocates and returns a fresh group id from the engine.
        pub fn alloc_group_id(&mut self) -> u32 {
            // SAFETY: `self.ptr` is a live engine handle.
            unsafe { agc_grouping_engine_alloc_id(self.ptr.as_ptr()) }
        }
    }

    impl Drop for GroupingEngine {
        fn drop(&mut self) {
            // SAFETY: `self.ptr` came from `agc_grouping_engine_create`, is non-null, and
            // `drop` runs at most once, so the engine is destroyed exactly once.
            unsafe {
                agc_grouping_engine_destroy(self.ptr.as_ptr());
            }
        }
    }

    // GroupingEngine is Send + Sync because the C++ implementation is thread-safe.
    // NOTE(review): that guarantee lives entirely on the C++ side and is not visible from
    // this file; re-confirm it whenever the engine implementation changes.
    // SAFETY: relies on the thread-safety guarantee stated above.
    unsafe impl Send for GroupingEngine {}
    // SAFETY: relies on the thread-safety guarantee stated above.
    unsafe impl Sync for GroupingEngine {}
}
558
559// Re-export commonly used types
560pub use agc_compressor::{QueueStats, StreamingQueueCompressor, StreamingQueueConfig};
561pub use contig_iterator::{MultiFileIterator, PansnFileIterator};
562pub use decompressor::{Decompressor, DecompressorConfig};
563pub use genome_io::{GenomeIO, GenomeWriter};
564pub use kmer::{
565    canonical_kmer, decode_base, encode_base, reverse_complement, reverse_complement_kmer,
566};
567pub use kmer::{Kmer, KmerMode};
568pub use kmer_extract::{enumerate_kmers, find_candidate_kmers, remove_non_singletons};
569pub use lz_diff::LZDiff;
570pub use memory_bounded_queue::MemoryBoundedQueue;
571pub use segment::{split_at_splitters, split_at_splitters_with_size, Segment};
572pub use segment_compression::{
573    compress_reference_segment, compress_segment, compress_segment_configured, decompress_segment,
574    decompress_segment_with_marker,
575};
576pub use splitters::{
577    determine_splitters, determine_splitters_streaming, determine_splitters_streaming_first_sample,
578    find_candidate_kmers_multi, is_hard_contig, is_splitter, two_pass_splitter_discovery,
579};
580pub use worker::create_agc_archive;