ragc_core/
lib.rs

// NOTE(review): the crate-wide `allow`s below carry no stated rationale. `clippy::all`
// turns off every Clippy lint group that is enabled by default, and the remaining
// attributes hide dead-code, unused-variable, unused-import and private-interface
// warnings for the whole crate — confirm each is still needed or narrow its scope.
1#![allow(clippy::all)]
2#![allow(dead_code)]
3#![allow(unused_variables)]
4#![allow(unused_imports)]
5#![allow(private_interfaces)]
6//! Core compression and decompression algorithms for the AGC genome compression format.
7//!
8//! This crate implements the complete AGC compression pipeline with full C++ AGC
9//! format compatibility. Archives created by this library can be read by the C++
10//! implementation and vice versa.
11//!
12//! # Features
13//!
14//! - **Compression** - Create AGC archives from FASTA files
15//! - **Decompression** - Extract genomes from AGC archives
16//! - **C++ Compatibility** - Bidirectional format interoperability
17//! - **Multi-sample support** - Handle multiple genomes in one archive
18//! - **LZ differential encoding** - Efficient encoding against reference sequences
19//! - **ZSTD compression** - High-ratio compression of segments
20//!
21//! # Examples
22//!
23//! ## Compressing genomes
24//!
25//! ```no_run
26//! use ragc_core::{Compressor, CompressorConfig};
27//! use std::path::Path;
28//!
29//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
30//! // Create a compressor
31//! let config = CompressorConfig::default();
32//! let mut compressor = Compressor::new("output.agc", config)?;
33//!
34//! // Add FASTA files
35//! compressor.add_fasta_file("sample1", Path::new("genome1.fasta"))?;
36//! compressor.add_fasta_file("sample2", Path::new("genome2.fasta"))?;
37//!
38//! // Finalize the archive
39//! compressor.finalize()?;
40//! # Ok(())
41//! # }
42//! ```
43//!
44//! ## Decompressing genomes
45//!
46//! ```no_run
47//! use ragc_core::{Decompressor, DecompressorConfig};
48//!
49//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
50//! // Open an archive
51//! let config = DecompressorConfig::default();
52//! let mut decompressor = Decompressor::open("archive.agc", config)?;
53//!
54//! // List available samples
55//! let samples = decompressor.list_samples();
56//! println!("Found {} samples", samples.len());
57//!
58//! // Extract a sample
59//! let contigs = decompressor.get_sample("sample1")?;
60//! for (name, sequence) in contigs {
61//!     println!(">{}", name);
62//!     // sequence is Vec<u8> with numeric encoding (A=0, C=1, G=2, T=3)
63//! }
64//! # Ok(())
65//! # }
66//! ```
67//!
68//! ## Working with k-mers
69//!
70//! ```
71//! use ragc_core::{Kmer, KmerMode};
72//!
73//! // Create a canonical k-mer
74//! let mut kmer = Kmer::new(21, KmerMode::Canonical);
75//!
76//! // Insert bases (0=A, 1=C, 2=G, 3=T); only 3 of the 21 bases are added here
77//! kmer.insert(0); // A
78//! kmer.insert(1); // C
79//! kmer.insert(2); // G
80//!
81//! if kmer.is_full() {
82//!     let value = kmer.data();
83//!     println!("K-mer value: {}", value);
84//! }
85//! ```
86//!
87//! ## Custom compression settings
88//!
89//! ```no_run
90//! use ragc_core::CompressorConfig;
91//!
92//! let config = CompressorConfig {
93//!     kmer_length: 25,        // Use 25-mers instead of default 21
94//!     segment_size: 2000,     // Larger segments
95//!     min_match_len: 20,      // Minimum LZ match length
96//!     verbosity: 2,           // More verbose output
97//! };
98//! ```
99//!
100//! # Archive Format
101//!
102//! The AGC format organizes data into streams:
103//!
104//! - **file_type_info** - Version and producer metadata
105//! - **params** - Compression parameters (k-mer length, segment size)
106//! - **splitters** - Singleton k-mers used for segmentation (future)
107//! - **seg-NN** or **seg_dNN** - Compressed genome segments
108//! - **collection** - Sample and contig metadata
109//!
110//! # Compatibility
111//!
112//! This implementation is tested for compatibility with C++ AGC:
113//!
114//! - Archives created by ragc can be read by C++ AGC
115//! - Archives created by C++ AGC can be read by ragc
116//! - Format version 3.0 support
117//! - SHA256-verified roundtrip testing
118
// Crate modules, all declared `pub`.
// NOTE(review): `_compressor_streaming_old` is public and its `StreamingCompressor` /
// `StreamingCompressorConfig` types are re-exported from the crate root; the leading
// underscore and `_old` suffix suggest legacy code — confirm it should stay in the public API.
119pub mod _compressor_streaming_old;
120pub mod bloom_filter;
121pub mod compressor;
122pub mod contig_compression;
123pub mod contig_iterator;
124pub mod decompressor;
125pub mod genome_io;
126pub mod kmer;
127pub mod kmer_extract;
128pub mod lz_diff;
129pub mod lz_matcher;
130pub mod memory_bounded_queue;
131pub mod priority_queue;
132pub mod segment;
133pub mod segment_buffer;
134pub mod segment_compression;
135pub mod splitters;
136pub mod streaming_compressor_queue;
137pub mod task;
138pub mod tuple_packing;
139pub mod worker;
140pub mod zstd_pool;
141
142// Re-export commonly used items at the crate root
143pub use _compressor_streaming_old::{StreamingCompressor, StreamingCompressorConfig};
144pub use compressor::{Compressor, CompressorConfig};
145pub use contig_iterator::{MultiFileIterator, PansnFileIterator};
146pub use decompressor::{Decompressor, DecompressorConfig};
147pub use genome_io::{GenomeIO, GenomeWriter};
148pub use kmer::{
149    canonical_kmer, decode_base, encode_base, reverse_complement, reverse_complement_kmer,
150};
151pub use kmer::{Kmer, KmerMode};
152pub use kmer_extract::{enumerate_kmers, find_candidate_kmers, remove_non_singletons};
153pub use lz_diff::LZDiff;
154pub use memory_bounded_queue::MemoryBoundedQueue;
155pub use segment::{split_at_splitters, split_at_splitters_with_size, Segment};
156pub use segment_compression::{
157    compress_reference_segment, compress_segment, compress_segment_configured, decompress_segment,
158    decompress_segment_with_marker,
159};
160pub use splitters::{
161    determine_splitters, determine_splitters_streaming, find_candidate_kmers_multi, is_splitter,
162};
163pub use streaming_compressor_queue::{QueueStats, StreamingQueueCompressor, StreamingQueueConfig};
164pub use worker::create_agc_archive;