use std::io::{BufWriter, Write};
use std::path::Path;
use crate::cli::args::Args;
use crate::database::format::{DatabaseHeader, DATABASE_MAGIC};
use crate::error::{KmerError, ProcessingResult};
/// Entry point for the `dump` subcommand.
///
/// Forwards the database path and optional output path from a
/// `Commands::Dump` invocation to [`dump_database`].
///
/// # Errors
///
/// Returns a `KmerError::ProcessingError` when `args.command` is any other
/// variant, and otherwise propagates whatever [`dump_database`] returns.
pub fn execute_dump(args: &Args) -> ProcessingResult<()> {
    if let crate::cli::args::Commands::Dump { database, output } = &args.command {
        return dump_database(database, output.as_deref());
    }

    // Any other subcommand reaching this function is a dispatch mistake.
    Err(KmerError::ProcessingError("Invalid command for execute_dump".to_string()).into())
}
/// Dumps the database at `database_path` as text.
///
/// The on-disk format is detected first and the matching dumper is invoked.
/// Progress and summary messages go to stderr; the dump itself goes to
/// `output_path` when given, otherwise to stdout.
///
/// # Errors
///
/// Returns `KmerError::FileNotFound` when the path does not exist,
/// `KmerError::ProcessingError` for an unknown format, and propagates any
/// error from format detection or from the format-specific dumper.
fn dump_database(database_path: &str, output_path: Option<&str>) -> ProcessingResult<()> {
    let db_path = Path::new(database_path);
    if !db_path.exists() {
        return Err(KmerError::FileNotFound(database_path.to_string()).into());
    }

    let detected = detect_database_format(db_path)?;
    eprintln!("Detected database format: {:?}", detected);
    eprintln!("Dumping database: {}", database_path);

    match detected {
        DatabaseFormat::Unknown => {
            let message = "Unknown database format. Supported formats: Rkdb, bincode".to_string();
            return Err(KmerError::ProcessingError(message).into());
        }
        DatabaseFormat::Rkdb => dump_rkdb_database(db_path, output_path)?,
        DatabaseFormat::Bincode => dump_bincode_database(db_path, output_path)?,
    }

    match output_path {
        Some(destination) => eprintln!("Database dumped to: {}", destination),
        None => eprintln!("Database dumped to stdout"),
    }
    Ok(())
}
/// On-disk database layouts that the dump command distinguishes between.
#[derive(Debug, Clone, PartialEq)]
enum DatabaseFormat {
    /// File whose first four bytes equal `DATABASE_MAGIC`.
    Rkdb,
    /// Fallback chosen by `detect_database_format` whenever the magic bytes
    /// are not found; handled by the legacy bincode dumper.
    Bincode,
    /// Not produced by `detect_database_format` in this file (hence the
    /// `dead_code` allowance); `dump_database` still rejects it explicitly.
    #[allow(dead_code)]
    Unknown,
}
/// Determines which on-disk format the file at `path` uses.
///
/// The first four bytes are compared against `DATABASE_MAGIC`. A match yields
/// [`DatabaseFormat::Rkdb`]; anything else, including a file shorter than four
/// bytes, falls back to [`DatabaseFormat::Bincode`].
///
/// # Errors
///
/// Returns `KmerError::Io` when the file cannot be opened or when reading the
/// leading bytes fails for any reason other than hitting end of file early.
fn detect_database_format(path: &Path) -> ProcessingResult<DatabaseFormat> {
    use std::fs::File;
    use std::io::{ErrorKind, Read};

    let mut file = File::open(path).map_err(KmerError::Io)?;
    let mut magic = [0u8; 4];

    match file.read_exact(&mut magic) {
        Ok(()) if magic == *DATABASE_MAGIC => Ok(DatabaseFormat::Rkdb),
        Ok(()) => Ok(DatabaseFormat::Bincode),
        // A file too short to contain the magic cannot be RKDB; let the
        // legacy parser decide whether it is a valid bincode database.
        Err(e) if e.kind() == ErrorKind::UnexpectedEof => Ok(DatabaseFormat::Bincode),
        // Genuine I/O failures must not be misreported as a format guess.
        Err(e) => Err(KmerError::Io(e).into()),
    }
}
/// Dumps an RKDB-format database as tab-separated `sequence<TAB>count` lines.
///
/// The header is read and validated, summarised on stderr, and echoed as `#`
/// comment lines at the top of the output. Entries are then read one by one
/// and decoded to nucleotide strings. Output goes to `output_path` when given,
/// otherwise to stdout.
///
/// # Errors
///
/// Returns an error when the database cannot be opened, the header cannot be
/// read or fails validation, the header's data offset lies beyond the end of
/// the file, the output cannot be created, written or flushed, or the very
/// first entry cannot be read.
fn dump_rkdb_database(path: &Path, output_path: Option<&str>) -> ProcessingResult<()> {
    use crate::database::format::KmerEntry;
    use std::fs::File;
    use std::io::{BufReader, BufWriter};

    let file = File::open(path).map_err(KmerError::Io)?;
    let mut reader = BufReader::new(file);

    let header = DatabaseHeader::read_from(&mut reader).map_err(|e| {
        KmerError::ProcessingError(format!("Failed to read database header: {}", e))
    })?;
    header
        .validate()
        .map_err(|e| KmerError::ProcessingError(format!("Invalid database header: {}", e)))?;

    eprintln!("Database info:");
    eprintln!(" K-mer size: {}", header.kmer_size);
    eprintln!(" Total k-mers: {}", header.total_kmers);
    eprintln!(" Sorted: {}", header.sorted);
    eprintln!(" Canonical: {}", header.canonical);
    eprintln!(" Format version: {}", header.version);

    let mut writer: Box<dyn Write> = match output_path {
        Some(output_path) => {
            let file = std::fs::File::create(output_path).map_err(|e| {
                KmerError::FileWriteError(format!("Failed to create output file: {}", e))
            })?;
            Box::new(BufWriter::new(file))
        }
        None => Box::new(BufWriter::new(std::io::stdout())),
    };

    writeln!(writer, "# rustkmer database dump")?;
    writeln!(writer, "# k: {}", header.kmer_size)?;
    writeln!(writer, "# total_kmers: {}", header.total_kmers)?;
    writeln!(writer, "# sorted: {}", header.sorted)?;
    writeln!(writer, "# canonical: {}", header.canonical)?;
    writeln!(writer, "# format: RKDB")?;
    writeln!(writer, "#")?;

    // Version-2 databases with k > 32 use the wide entry layout.
    // NOTE(review): 20 vs 12 bytes presumably means a 16- or 8-byte k-mer
    // plus a 4-byte count -- confirm against `KmerEntry`.
    let wide_kmers = header.version == 2 && header.kmer_size > 32;
    let entry_size: u64 = if wide_kmers { 20 } else { 12 };
    let kmer_len = header.kmer_size as usize;

    // A corrupt header may claim a data offset past the end of the file;
    // report that instead of underflowing the subtraction.
    let file_len = path.metadata()?.len();
    let data_len = file_len.checked_sub(header.data_offset).ok_or_else(|| {
        KmerError::ProcessingError(format!(
            "Database data offset {} exceeds file size {}",
            header.data_offset, file_len
        ))
    })?;
    let num_entries = data_len / entry_size;
    eprintln!("Calculated database entries: {}", num_entries);

    let mut processed = 0u64;
    while processed < num_entries {
        match KmerEntry::read_from(&mut reader) {
            Ok(entry) => {
                let sequence = if wide_kmers {
                    decode_kmer_to_sequence_u128(entry.kmer, kmer_len)
                } else {
                    // Narrow layouts only use the low 64 bits of the k-mer.
                    decode_kmer_to_sequence(entry.kmer as u64, kmer_len)
                };
                writeln!(writer, "{}\t{}", sequence, entry.count).map_err(|e| {
                    KmerError::FileWriteError(format!("Failed to write k-mer entry: {}", e))
                })?;
                processed += 1;
                if processed.is_multiple_of(100_000) {
                    eprintln!("Processed {} k-mers...", processed);
                }
            }
            Err(e) => {
                // Once at least one entry has been dumped, a read failure is
                // treated as the end of the data rather than a hard error.
                if processed > 0 {
                    eprintln!("Reached end of file at {} k-mers", processed);
                    break;
                } else {
                    return Err(KmerError::ProcessingError(format!(
                        "Failed to read k-mer entry at position {}: {}",
                        processed, e
                    ))
                    .into());
                }
            }
        }
    }

    // Dropping a BufWriter discards flush errors, so flush explicitly to
    // avoid reporting success on a truncated dump.
    writer
        .flush()
        .map_err(|e| KmerError::FileWriteError(format!("Failed to flush output: {}", e)))?;

    eprintln!("Successfully dumped {} k-mers", processed);
    Ok(())
}
/// Dumps a legacy bincode-format database as tab-separated
/// `sequence<TAB>count` lines.
///
/// The whole file is deserialized as `(k, Vec<(encoded_kmer, count)>)`,
/// summarised on stderr, and written with a `#` comment header to
/// `output_path` when given, otherwise to stdout.
///
/// # Errors
///
/// Returns an error when the database cannot be opened or deserialized, or
/// when the output cannot be created, written or flushed.
fn dump_bincode_database(path: &Path, output_path: Option<&str>) -> ProcessingResult<()> {
    use std::fs::File;
    use std::io::BufReader;

    let file = File::open(path).map_err(KmerError::Io)?;
    let mut reader = BufReader::new(file);

    let bincode_data: (usize, Vec<(u64, u32)>) =
        bincode::deserialize_from(&mut reader).map_err(|_| {
            KmerError::ProcessingError(
                "Failed to parse as bincode format. This might not be a valid rustkmer database."
                    .to_string(),
            )
        })?;
    let (kmer_length, kmers) = bincode_data;

    eprintln!("Database info:");
    eprintln!(" K-mer size: {}", kmer_length);
    eprintln!(" Total k-mers: {}", kmers.len());
    eprintln!(" Format: bincode (legacy)");

    let mut writer: Box<dyn Write> = match output_path {
        Some(out_path) => {
            let file = std::fs::File::create(out_path).map_err(|e| {
                KmerError::FileWriteError(format!("Failed to create output file: {}", e))
            })?;
            Box::new(BufWriter::new(file))
        }
        None => Box::new(BufWriter::new(std::io::stdout())),
    };

    writeln!(writer, "# rustkmer database dump")?;
    writeln!(writer, "# k: {}", kmer_length)?;
    writeln!(writer, "# total_kmers: {}", kmers.len())?;
    writeln!(writer, "# format: bincode (legacy)")?;
    writeln!(writer, "#")?;

    // Capture the count before the vector is consumed by the loop.
    let kmer_count = kmers.len();
    for (kmer, count) in kmers {
        let sequence = decode_kmer_to_sequence(kmer, kmer_length);
        writeln!(writer, "{}\t{}", sequence, count).map_err(|e| {
            KmerError::FileWriteError(format!("Failed to write k-mer entry: {}", e))
        })?;
    }

    // Dropping a BufWriter discards flush errors, so flush explicitly to
    // avoid reporting success on a truncated dump.
    writer
        .flush()
        .map_err(|e| KmerError::FileWriteError(format!("Failed to flush output: {}", e)))?;

    eprintln!("Successfully dumped {} k-mers", kmer_count);
    Ok(())
}
/// Decodes a 2-bit-packed k-mer into a nucleotide string of length `k`.
///
/// The lowest two bits of `kmer` hold the last base of the sequence and each
/// higher bit pair holds the base before it. Codes map as `0 -> A`, `1 -> C`,
/// `2 -> G`, `3 -> T`. Positions beyond the 64 encoded bits decode as `A`.
fn decode_kmer_to_sequence(kmer: u64, k: usize) -> String {
    const ALPHABET: [char; 4] = ['A', 'C', 'G', 'T'];

    // Fill the buffer from the last position backwards so that no separate
    // reversal pass is needed.
    let mut bases = vec!['A'; k];
    let mut remaining = kmer;
    for slot in bases.iter_mut().rev() {
        // Masking with 0b11 keeps the index within 0..=3.
        *slot = ALPHABET[(remaining & 0b11) as usize];
        remaining >>= 2;
    }
    bases.into_iter().collect()
}
/// Decodes a 2-bit-packed k-mer held in a `u128` into a nucleotide string of
/// length `k`.
///
/// The lowest two bits of `kmer` hold the last base of the sequence and each
/// higher bit pair holds the base before it. Codes map as `0 -> A`, `1 -> C`,
/// `2 -> G`, `3 -> T`. Positions beyond the 128 encoded bits decode as `A`.
fn decode_kmer_to_sequence_u128(kmer: u128, k: usize) -> String {
    const ALPHABET: [char; 4] = ['A', 'C', 'G', 'T'];

    // Fill the buffer from the last position backwards so that no separate
    // reversal pass is needed.
    let mut bases = vec!['A'; k];
    let mut remaining = kmer;
    for slot in bases.iter_mut().rev() {
        // Masking with 0b11 keeps the index within 0..=3.
        *slot = ALPHABET[(remaining & 0b11) as usize];
        remaining >>= 2;
    }
    bases.into_iter().collect()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Writes `contents` to a uniquely named file in the system temp
    /// directory, runs format detection on it, and removes the file again.
    /// Returns `None` when detection itself reports an error.
    fn detect_for_bytes(tag: &str, contents: &[u8]) -> Option<DatabaseFormat> {
        let path = std::env::temp_dir().join(format!(
            "rustkmer_dump_test_{}_{}.db",
            std::process::id(),
            tag
        ));
        std::fs::write(&path, contents).expect("temp file should be writable");
        let detected = detect_database_format(&path).ok();
        // Best-effort cleanup; a leftover temp file must not fail the test.
        let _ = std::fs::remove_file(&path);
        detected
    }

    #[test]
    fn test_decode_kmer_to_sequence() {
        // Bit pairs, most significant first: 00=A, 01=C, 10=G, 11=T.
        let encoded = 0b00_01_10_11;
        assert_eq!(decode_kmer_to_sequence(encoded, 4), "ACGT");
        assert_eq!(decode_kmer_to_sequence(0, 0), "");
        assert_eq!(decode_kmer_to_sequence(0b11, 3), "AAT");
    }

    #[test]
    fn test_decode_kmer_to_sequence_u128() {
        assert_eq!(decode_kmer_to_sequence_u128(0b00_01_10_11, 4), "ACGT");
        assert_eq!(decode_kmer_to_sequence_u128(0, 0), "");
        // A base stored above bit 63 is only reachable through the wide decoder.
        assert_eq!(
            decode_kmer_to_sequence_u128(1u128 << 64, 33),
            format!("C{}", "A".repeat(32))
        );
    }

    #[test]
    fn test_database_format_detection() {
        // A file that starts with the magic bytes is RKDB.
        assert_eq!(
            detect_for_bytes("magic", &DATABASE_MAGIC[..]),
            Some(DatabaseFormat::Rkdb)
        );

        // Same length but different leading byte falls back to bincode.
        let mut other = DATABASE_MAGIC.to_vec();
        other[0] ^= 0xFF;
        assert_eq!(
            detect_for_bytes("other", &other),
            Some(DatabaseFormat::Bincode)
        );

        // Files too short to hold the magic also fall back to bincode.
        assert_eq!(
            detect_for_bytes("short", &DATABASE_MAGIC[..2]),
            Some(DatabaseFormat::Bincode)
        );
        assert_eq!(detect_for_bytes("empty", &[]), Some(DatabaseFormat::Bincode));
    }
}