ragc-core 0.1.1

Core compression and decompression algorithms for the AGC genome compression format
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
#![allow(clippy::all)]
//! Integration tests for C++ compatibility
//!
//! These tests verify that ragc produces archives that are bit-compatible
//! with C++ AGC and can read C++ AGC archives correctly.

use ragc_core::{Compressor, CompressorConfig, Decompressor, DecompressorConfig};
use sha2::{Digest, Sha256};
use std::fs;
use std::path::{Path, PathBuf};
use std::process::Command;

/// Returns `<crate manifest dir>/../test-data` as an owned path.
///
/// The manifest directory is baked in at compile time through
/// `CARGO_MANIFEST_DIR`; the path is not checked for existence.
// No test visible in this file calls this helper, hence the lint allowance.
#[allow(dead_code)]
fn get_test_data_dir() -> PathBuf {
    let mut data_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    data_dir.push("../test-data");
    data_dir
}

/// Reads the whole file at `path` and returns its SHA-256 digest as a
/// lowercase hexadecimal string.
///
/// # Panics
///
/// Panics with "Failed to read file" if the file cannot be read.
fn compute_file_hash(path: &Path) -> String {
    let bytes = fs::read(path).expect("Failed to read file");

    // Feed the contents through an incremental hasher; for a single buffer this
    // yields the same digest as the one-shot helper.
    let mut hasher = Sha256::new();
    hasher.update(&bytes);
    format!("{:x}", hasher.finalize())
}

/// Decodes a numerically encoded contig into nucleotide text.
///
/// Codes `0`, `1`, `2` and `3` become `A`, `C`, `G` and `T`; every other byte
/// value is rendered as `N`.
fn contig_to_string(contig: &[u8]) -> String {
    const BASES: [char; 4] = ['A', 'C', 'G', 'T'];

    let mut text = String::with_capacity(contig.len());
    for &code in contig {
        // Anything outside the four-letter table falls back to `N`.
        let base = BASES.get(usize::from(code)).copied().unwrap_or('N');
        text.push(base);
    }
    text
}

/// Writes a small two-record FASTA fixture (`seq1` and `seq2`) to `path`.
///
/// `seq2` carries the same bases as `seq1` followed by a trailing run of `N`s.
///
/// # Panics
///
/// Panics with "Failed to write test FASTA" if the file cannot be written.
fn create_test_fasta(path: &Path) {
    // One entry per line of the fixture; each line ends with a single `\n`.
    const FIXTURE: &str = concat!(
        ">seq1\n",
        "ACGTACGTACGTACGTACGTACGTACGTACGT\n",
        ">seq2\n",
        "ACGTACGTACGTACGTACGTACGTACGTACGTNNNNNNNNNNNN\n",
    );

    fs::write(path, FIXTURE).expect("Failed to write test FASTA");
}

/// Compresses the FASTA fixture into a fresh archive with ragc, then reopens the
/// archive and checks that it lists exactly the one sample that was added.
#[test]
fn test_ragc_creates_valid_archive() {
    let tmp = std::env::temp_dir();
    let archive_path = tmp.join("test_compat_ragc.agc");
    let fasta_path = tmp.join("test_compat.fasta");

    // Input fixture.
    create_test_fasta(&fasta_path);

    // Build the archive with ragc.
    let archive_str = archive_path.to_str().unwrap();
    let mut writer = Compressor::new(archive_str, CompressorConfig::default())
        .expect("Failed to create compressor");
    writer
        .add_fasta_file("test_sample", &fasta_path)
        .expect("Failed to add FASTA");
    writer.finalize().expect("Failed to finalize archive");

    assert!(archive_path.exists(), "Archive was not created");

    // Reopen the archive and inspect its sample list.
    let reader = Decompressor::open(archive_str, DecompressorConfig::default())
        .expect("Failed to open archive");
    let sample_names = reader.list_samples();

    assert_eq!(sample_names.len(), 1);
    assert_eq!(sample_names[0], "test_sample");

    // Best-effort cleanup; removal errors are deliberately ignored.
    for leftover in [&fasta_path, &archive_path] {
        let _ = fs::remove_file(leftover);
    }
}

/// Round-trips the FASTA fixture through ragc compression and decompression,
/// re-renders the extracted contigs as FASTA, and requires the SHA-256 of the
/// result to equal that of the original file.
#[test]
fn test_ragc_rust_roundtrip() {
    let tmp = std::env::temp_dir();
    let fasta_path = tmp.join("test_roundtrip.fasta");
    let archive_path = tmp.join("test_roundtrip.agc");
    let output_path = tmp.join("test_roundtrip_out.fasta");

    // Input fixture and its reference digest.
    create_test_fasta(&fasta_path);
    let original_hash = compute_file_hash(&fasta_path);

    // Compress with ragc.
    let archive_str = archive_path.to_str().unwrap();
    let mut writer = Compressor::new(archive_str, CompressorConfig::default())
        .expect("Failed to create compressor");
    writer
        .add_fasta_file("test_sample", &fasta_path)
        .expect("Failed to add FASTA");
    writer.finalize().expect("Failed to finalize archive");

    // Decompress with ragc.
    let mut reader = Decompressor::open(archive_str, DecompressorConfig::default())
        .expect("Failed to open archive");
    let records = reader
        .get_sample("test_sample")
        .expect("Failed to extract sample");

    // Render every record as a header line followed by one sequence line.
    let rendered: String = records
        .into_iter()
        .map(|(name, contig)| format!(">{name}\n{}\n", contig_to_string(&contig)))
        .collect();
    fs::write(&output_path, rendered).expect("Failed to write output");

    let output_hash = compute_file_hash(&output_path);

    assert_eq!(
        original_hash, output_hash,
        "Roundtrip produced different data! Original: {original_hash}, Output: {output_hash}"
    );

    // Best-effort cleanup; removal errors are deliberately ignored.
    for leftover in [&fasta_path, &archive_path, &output_path] {
        let _ = fs::remove_file(leftover);
    }
}

/// Compresses the same fixture twice with the same configuration and requires
/// the two archives to have identical SHA-256 digests.
///
/// Ignored by default; see the note on the `#[ignore]` attribute.
#[test]
#[ignore] // HashMap iteration order is non-deterministic for security reasons
fn test_deterministic_compression() {
    // Identical inputs should produce identical archives.
    // NOTE: This test is expected to fail due to HashMap randomization
    let tmp = std::env::temp_dir();
    let fasta_path = tmp.join("test_deterministic.fasta");
    let first_archive = tmp.join("test_deterministic_1.agc");
    let second_archive = tmp.join("test_deterministic_2.agc");

    create_test_fasta(&fasta_path);

    let config = CompressorConfig::default();

    // First archive.
    let mut first = Compressor::new(first_archive.to_str().unwrap(), config.clone())
        .expect("Failed to create compressor 1");
    first
        .add_fasta_file("test_sample", &fasta_path)
        .expect("Failed to add FASTA 1");
    first.finalize().expect("Failed to finalize 1");

    // Second archive, built from the same input and configuration.
    let mut second = Compressor::new(second_archive.to_str().unwrap(), config)
        .expect("Failed to create compressor 2");
    second
        .add_fasta_file("test_sample", &fasta_path)
        .expect("Failed to add FASTA 2");
    second.finalize().expect("Failed to finalize 2");

    // Byte-identical archives have identical digests.
    let hash1 = compute_file_hash(&first_archive);
    let hash2 = compute_file_hash(&second_archive);

    assert_eq!(
        hash1, hash2,
        "Archives differ! This means compression is non-deterministic.\nArchive 1: {hash1}\nArchive 2: {hash2}"
    );

    // Best-effort cleanup; removal errors are deliberately ignored.
    for leftover in [&fasta_path, &first_archive, &second_archive] {
        let _ = fs::remove_file(leftover);
    }
}

/// Interoperability tests between ragc and the C++ `agc` command-line tool.
///
/// Tests that shell out to `agc` return early (printing a note to stderr) when
/// the executable cannot be launched. `test_n_bases_round_trip` uses ragc only
/// and always runs.
#[cfg(test)]
mod with_cpp_agc {
    use super::*;

    /// Returns `true` when an `agc` executable can be launched.
    ///
    /// Only the ability to spawn the process is checked; the exit status of
    /// `agc --version` is ignored.
    fn cpp_agc_available() -> bool {
        Command::new("agc").arg("--version").output().is_ok()
    }

    /// Builds an archive with ragc, extracts it with `agc getset`, and checks
    /// that the extracted FASTA hashes identically to the original fixture.
    #[test]
    fn test_cpp_can_read_ragc_archives() {
        if !cpp_agc_available() {
            eprintln!("Skipping C++ compatibility test: C++ agc not found");
            return;
        }

        let test_dir = std::env::temp_dir();
        let fasta_path = test_dir.join("test_cpp_read.fasta");
        let archive_path = test_dir.join("test_cpp_read.agc");
        let output_path = test_dir.join("test_cpp_read_out.fasta");

        // Create test FASTA
        create_test_fasta(&fasta_path);
        let original_hash = compute_file_hash(&fasta_path);

        // Create archive with ragc
        let config = CompressorConfig::default();
        let mut compressor = Compressor::new(archive_path.to_str().unwrap(), config)
            .expect("Failed to create compressor");
        compressor
            .add_fasta_file("test_sample", &fasta_path)
            .expect("Failed to add FASTA");
        compressor.finalize().expect("Failed to finalize archive");

        // Extract with C++ agc; the FASTA is captured from the tool's stdout.
        let output = Command::new("agc")
            .arg("getset")
            .arg(archive_path.to_str().unwrap())
            .arg("test_sample")
            .output()
            .expect("Failed to run C++ agc");

        assert!(
            output.status.success(),
            "C++ agc failed to extract: {}",
            String::from_utf8_lossy(&output.stderr)
        );

        fs::write(&output_path, &output.stdout).expect("Failed to write output");
        let output_hash = compute_file_hash(&output_path);

        assert_eq!(
            original_hash, output_hash,
            "C++ extracted different data!\nOriginal: {original_hash}\nC++ Output: {output_hash}"
        );

        // Clean up (best effort; removal errors are ignored)
        let _ = fs::remove_file(&fasta_path);
        let _ = fs::remove_file(&archive_path);
        let _ = fs::remove_file(&output_path);
    }

    /// Builds an archive with `agc create`, extracts it with ragc, re-renders
    /// the contigs as FASTA, and checks that the result hashes identically to
    /// the original fixture.
    #[test]
    fn test_ragc_can_read_cpp_archives() {
        if !cpp_agc_available() {
            eprintln!("Skipping C++ compatibility test: C++ agc not found");
            return;
        }

        let test_dir = std::env::temp_dir();
        let fasta_path = test_dir.join("test_ragc_read.fasta");
        let archive_path = test_dir.join("test_ragc_read.agc");
        let output_path = test_dir.join("test_ragc_read_out.fasta");

        // Create test FASTA
        create_test_fasta(&fasta_path);
        let original_hash = compute_file_hash(&fasta_path);

        // Create archive with C++ agc
        let status = Command::new("agc")
            .arg("create")
            .arg("-o")
            .arg(archive_path.to_str().unwrap())
            .arg(fasta_path.to_str().unwrap())
            .status()
            .expect("Failed to run C++ agc create");

        assert!(status.success(), "C++ agc failed to create archive");

        // Extract with ragc
        let config = DecompressorConfig::default();
        let mut decompressor = Decompressor::open(archive_path.to_str().unwrap(), config)
            .expect("Failed to open C++ archive");

        // NOTE(review): the sample name matches the FASTA file stem; presumably
        // `agc create` derives the sample name from the input file name.
        let sequences = decompressor
            .get_sample("test_ragc_read")
            .expect("Failed to extract sample");

        // Re-render each contig as a header line followed by one sequence line.
        let mut output_content = String::new();
        for (name, contig) in sequences {
            output_content.push_str(&format!(">{name}\n"));
            output_content.push_str(&contig_to_string(&contig));
            output_content.push('\n');
        }
        fs::write(&output_path, output_content).expect("Failed to write output");

        let output_hash = compute_file_hash(&output_path);

        assert_eq!(
            original_hash, output_hash,
            "ragc extracted different data from C++ archive!\nOriginal: {original_hash}\nragc Output: {output_hash}"
        );

        // Clean up (best effort; removal errors are ignored)
        let _ = fs::remove_file(&fasta_path);
        let _ = fs::remove_file(&archive_path);
        let _ = fs::remove_file(&output_path);
    }

    /// Test N-base handling in round-trip compression/decompression
    ///
    /// Each pattern is stored as a single contig in its own archive (the same
    /// archive path is reused and removed after every case), read back with
    /// ragc, and compared both as numeric codes and as text.
    #[test]
    fn test_n_bases_round_trip() {
        let test_dir = std::env::temp_dir();
        let archive_path = test_dir.join("test_n_bases.agc");

        // Test various N-base patterns
        let test_cases = [
            ("single_n", "ACGTACGTNACGTACGT"),  // Single N
            ("short_n_run", "ACGTNNACGT"),      // 2 N's (< MIN_NRUN_LEN)
            ("medium_n_run", "ACGTNNNACGT"),    // 3 N's (< MIN_NRUN_LEN)
            ("long_n_run", "ACGTNNNNNNNNACGT"), // 8 N's (triggers run encoding)
            ("very_long_n_run", "ACGTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNACGT"), // 32 N's
            ("mixed", "ACGTNACGTNNNACGTNNNNNNNNACGT"), // Mixed single + runs
            ("all_n", "NNNNNNNNNNNN"),          // All N's
            ("start_n", "NNNACGTACGT"),         // N's at start
            ("end_n", "ACGTACGTNNN"),           // N's at end
            ("alternating", "NANANANANANA"),    // Alternating N/A
        ];

        for (name, sequence) in test_cases {
            eprintln!("Testing N-base pattern: {name} ({sequence})");

            // Create archive
            let config = CompressorConfig::default();
            let mut compressor = Compressor::new(archive_path.to_str().unwrap(), config)
                .expect("Failed to create compressor");

            // Convert sequence to numeric encoding (A=0, C=1, G=2, T=3, N=4)
            let numeric: Vec<u8> = sequence
                .bytes()
                .map(|b| match b {
                    b'A' => 0,
                    b'C' => 1,
                    b'G' => 2,
                    b'T' => 3,
                    b'N' => 4,
                    _ => panic!("Invalid base: {}", b as char),
                })
                .collect();

            // `numeric` is compared against the extracted contig below, so the
            // compressor receives a copy.
            compressor
                .add_contig("test_sample", name, numeric.clone())
                .expect("Failed to add contig");
            compressor.finalize().expect("Failed to finalize");

            // Extract and verify
            let config = DecompressorConfig::default();
            let mut decompressor = Decompressor::open(archive_path.to_str().unwrap(), config)
                .expect("Failed to open archive");

            let extracted = decompressor
                .get_contig("test_sample", name)
                .expect("Failed to extract contig");

            assert_eq!(
                numeric, extracted,
                "N-base round-trip failed for pattern '{name}'\nExpected: {numeric:?}\nGot: {extracted:?}"
            );

            // Also verify as string
            let extracted_str = contig_to_string(&extracted);
            assert_eq!(
                sequence, extracted_str,
                "N-base string round-trip failed for pattern '{name}'"
            );

            decompressor.close().expect("Failed to close decompressor");
            let _ = fs::remove_file(&archive_path);
        }
    }

    /// Test N-base compatibility with C++ AGC
    ///
    /// Part 1 creates an archive with `agc create` and reads it with ragc,
    /// checking that every contig contains only A/C/G/T/N codes and still has
    /// its `N`s. Part 2 creates an archive with ragc and reads it with
    /// `agc getset`, checking that the `N` run survives.
    #[test]
    fn test_n_bases_cpp_compat() {
        if !cpp_agc_available() {
            eprintln!("Skipping C++ N-base compatibility test: C++ agc not found");
            return;
        }

        let test_dir = std::env::temp_dir();
        let fasta_path = test_dir.join("test_n_compat.fasta");
        let cpp_archive = test_dir.join("test_n_cpp.agc");
        let ragc_archive = test_dir.join("test_n_ragc.agc");

        // Create test FASTA with various N patterns
        let content = r#">test_n_sample#1#single_n
ACGTACGTNACGTACGT
>test_n_sample#1#short_n_run
ACGTNNACGT
>test_n_sample#1#long_n_run
ACGTNNNNNNNNACGT
>test_n_sample#1#mixed
ACGTNACGTNNNACGTNNNNNNNNACGT
>test_n_sample#1#very_long_n_run
ACGTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNACGT
"#;
        fs::write(&fasta_path, content).expect("Failed to write test FASTA");

        // Test 1: C++ AGC → RAGC
        let status = Command::new("agc")
            .arg("create")
            .arg("-o")
            .arg(cpp_archive.to_str().unwrap())
            .arg(fasta_path.to_str().unwrap())
            .status()
            .expect("Failed to run C++ agc create");
        assert!(status.success(), "C++ agc failed to create archive");

        let config = DecompressorConfig::default();
        let mut decompressor = Decompressor::open(cpp_archive.to_str().unwrap(), config)
            .expect("Failed to open C++ archive");

        // NOTE(review): the sample name matches the FASTA file stem; presumably
        // `agc create` derives the sample name from the input file name.
        let sequences = decompressor
            .get_sample("test_n_compat")
            .expect("Failed to extract sample from C++ archive");

        for (name, contig) in sequences {
            let contig_str = contig_to_string(&contig);
            eprintln!("C++ AGC → RAGC: {name} = {contig_str}");
            // Inspect the raw codes rather than the rendered text:
            // `contig_to_string` renders every unknown code as 'N', so a check
            // on the text could never detect an invalid base. The input FASTA
            // holds only A/C/G/T/N, i.e. codes 0..=4.
            assert!(
                contig.iter().all(|&code| code <= 4),
                "Invalid base found in {name} extracted from C++ AGC"
            );
            // Verify N's are present where expected
            if name.contains("_n") {
                assert!(
                    contig_str.contains('N'),
                    "N-bases missing from {name} extracted from C++ AGC"
                );
            }
        }

        // Test 2: RAGC → C++ AGC
        let config = CompressorConfig::default();
        let mut compressor = Compressor::new(ragc_archive.to_str().unwrap(), config)
            .expect("Failed to create RAGC compressor");

        // Add test sequences with N's
        let test_seq: Vec<u8> = vec![0, 1, 2, 3, 4, 4, 4, 4, 0, 1, 2, 3]; // ACGTNNNNACGT
        compressor
            .add_contig("test_ragc", "n_test", test_seq)
            .expect("Failed to add contig");
        compressor
            .finalize()
            .expect("Failed to finalize RAGC archive");

        // Extract with C++ AGC
        let output = Command::new("agc")
            .arg("getset")
            .arg(ragc_archive.to_str().unwrap())
            .arg("test_ragc")
            .output()
            .expect("Failed to run C++ agc getset");

        // Dump the tool's output first so a failing assertion below is diagnosable.
        eprintln!(
            "C++ AGC stdout: {}",
            String::from_utf8_lossy(&output.stdout)
        );
        eprintln!(
            "C++ AGC stderr: {}",
            String::from_utf8_lossy(&output.stderr)
        );
        eprintln!("C++ AGC exit status: {}", output.status);

        assert!(
            output.status.success(),
            "C++ agc failed to extract from RAGC archive"
        );

        let extracted = String::from_utf8_lossy(&output.stdout);
        eprintln!("RAGC → C++ AGC extraction:\n{extracted}");

        // Verify N's are present
        assert!(
            !extracted.is_empty(),
            "C++ AGC extraction produced empty output"
        );
        assert!(
            extracted.contains('N'),
            "N-bases missing from C++ AGC extraction of RAGC archive"
        );
        assert!(
            extracted.contains("NNNN"),
            "N-run missing from C++ AGC extraction"
        );

        // Clean up (best effort; removal errors are ignored)
        let _ = fs::remove_file(&fasta_path);
        let _ = fs::remove_file(&cpp_archive);
        let _ = fs::remove_file(&ragc_archive);
    }
}