ragc-core 0.1.1

Core compression and decompression algorithms for the AGC genome compression format
Documentation
#![allow(clippy::all)]
// Minimal test to reproduce StreamingCompressor data corruption
use ragc_core::{Decompressor, DecompressorConfig};
use ragc_core::{StreamingCompressor, StreamingCompressorConfig};
use sha2::{Digest, Sha256};
use std::fs;

#[test]
fn test_streaming_compressor_extraction_correctness() {
    let archive_path = "/tmp/test_streaming_corruption.agc";
    let _ = fs::remove_file(archive_path);

    // Use yeast10 first sample
    let input_path = "/home/erik/scrapy/yeast10_test/AEL#2.fa";

    if !std::path::Path::new(input_path).exists() {
        eprintln!("Skipping test: {} not found", input_path);
        return;
    }

    // Read original sequence (normalize to uppercase for comparison)
    let original_content = fs::read_to_string(input_path).unwrap();
    let original_sequence: String = original_content
        .lines()
        .filter(|l| !l.starts_with('>'))
        .collect::<Vec<_>>()
        .join("")
        .to_uppercase()
        .chars()
        .filter(|c| !c.is_whitespace())
        .collect();

    let mut original_hasher = Sha256::new();
    original_hasher.update(original_sequence.as_bytes());
    let original_hash = format!("{:x}", original_hasher.finalize());

    println!("Original hash: {}", original_hash);

    // Compress with StreamingCompressor (the CLI path)
    {
        let config = StreamingCompressorConfig {
            kmer_length: 21,
            segment_size: 10000,
            min_match_len: 20,
            compression_level: 11,
            verbosity: 0, // Disable verbosity for cleaner output
            group_flush_threshold: 0,
            concatenated_genomes: false,
            periodic_flush_interval: 0,
            num_threads: 15, // Test multi-threaded determinism
            adaptive_mode: false,
        };

        let mut compressor = StreamingCompressor::new(archive_path, config).unwrap();
        compressor
            .add_fasta_files_with_splitters(&[(
                "AEL#2".to_string(),
                std::path::Path::new(input_path),
            )])
            .unwrap();
        compressor.finalize().unwrap();
    }

    // Extract and verify
    {
        let config = DecompressorConfig { verbosity: 0 };
        let mut decompressor = Decompressor::open(archive_path, config).unwrap();

        // Get the sample
        let samples = decompressor.list_samples();
        println!("Samples in archive: {:?}", samples);
        assert!(!samples.is_empty(), "No samples in archive");

        let sample_name = &samples[0];
        let contigs = decompressor.get_sample(sample_name).unwrap();

        // Extract sequence from result
        let extracted_sequence: String = contigs
            .iter()
            .flat_map(|(_, contig)| {
                contig.iter().map(|&b| match b {
                    0 => 'A',
                    1 => 'C',
                    2 => 'G',
                    3 => 'T',
                    _ => 'N',
                })
            })
            .collect();

        let mut extracted_hasher = Sha256::new();
        extracted_hasher.update(extracted_sequence.as_bytes());
        let extracted_hash = format!("{:x}", extracted_hasher.finalize());

        println!("Extracted hash: {}", extracted_hash);
        println!("Original length: {}", original_sequence.len());
        println!("Extracted length: {}", extracted_sequence.len());

        // Find where sequences diverge
        if original_hash != extracted_hash {
            let mut first_diff: Option<usize> = None;
            let mut diff_count = 0;

            for (i, (o, e)) in original_sequence
                .chars()
                .zip(extracted_sequence.chars())
                .enumerate()
            {
                if o != e {
                    if first_diff.is_none() {
                        first_diff = Some(i);
                        println!("\nFirst difference at position {}", i);
                        let start = i.saturating_sub(50);
                        let end = (i + 50).min(original_sequence.len());
                        println!(
                            "Original  [{}..{}]: {}",
                            start,
                            end,
                            &original_sequence[start..end]
                        );
                        println!(
                            "Extracted [{}..{}]: {}",
                            start,
                            end,
                            &extracted_sequence[start..end]
                        );
                    }
                    diff_count += 1;
                    if diff_count >= 100 {
                        break;
                    }
                }
            }

            println!("\nTotal differences found: {} (stopped at 100)", diff_count);
        }

        assert_eq!(
            original_hash, extracted_hash,
            "Data corruption detected! Sequences don't match.\nOriginal:  {}\nExtracted: {}",
            original_hash, extracted_hash
        );
    }
}