use anyhow::Result;
use std::fs::File;
use std::io::{self, BufReader, BufWriter, Read, Write};
use std::path::{Path, PathBuf};
use crate::format::{ENTRY_SIZE, Entry};
/// Expands user-supplied input paths into a sorted list of sequence files.
///
/// Each input is handled according to what it is on disk:
///
/// - a directory contributes its direct children (no recursion) whose names
///   pass [`is_sequence_file`];
/// - a regular file whose name passes [`is_sequence_file`] is taken as is;
/// - any other regular file is read as UTF-8 text holding one path per line
///   (surrounding whitespace trimmed); listed paths that exist and pass
///   [`is_sequence_file`] are kept, all other lines are skipped silently;
/// - an input that is neither a directory nor a regular file is ignored.
///
/// Directories are never returned, even when their name carries a sequence
/// extension (for example a folder called `assembly.fasta`).
///
/// # Errors
///
/// Returns an error when a directory cannot be listed, when a list file
/// cannot be read as UTF-8 text, or when no sequence file was found at all.
pub fn expand_input_paths(input_paths: &[PathBuf]) -> Result<Vec<PathBuf>> {
    let mut expanded_paths = Vec::new();

    for path in input_paths {
        if path.is_dir() {
            for entry in std::fs::read_dir(path)? {
                let file_path = entry?.path();
                // The name check alone would also accept sub-directories
                // named like sequence files; those are not readable inputs.
                if is_sequence_file(&file_path) && !file_path.is_dir() {
                    expanded_paths.push(file_path);
                }
            }
        } else if path.is_file() {
            if is_sequence_file(path) {
                expanded_paths.push(path.clone());
            } else {
                // Not a sequence file itself: treat it as a list of paths.
                let content = std::fs::read_to_string(path)?;
                expanded_paths.extend(
                    content
                        .lines()
                        .map(|line| PathBuf::from(line.trim()))
                        .filter(|listed| {
                            // Cheap name test first, then one metadata lookup
                            // that checks both existence and "not a directory".
                            is_sequence_file(listed)
                                && std::fs::metadata(listed).is_ok_and(|meta| !meta.is_dir())
                        }),
                );
            }
        }
    }

    if expanded_paths.is_empty() {
        return Err(anyhow::anyhow!(
            "No valid sequence files found in input paths"
        ));
    }

    expanded_paths.sort();
    Ok(expanded_paths)
}
/// Returns `true` when `path` is named like a FASTA/FASTQ file.
///
/// Only the file name is inspected; the file system is never touched. The
/// extension must be one of `fasta`, `fa`, `fas`, `fna`, `fastq` or `fq`,
/// compared case-insensitively. A single trailing `.gz` is looked through, so
/// `reads.fastq.gz` is accepted while `reads.gz` and `reads.tar.gz` are not.
pub fn is_sequence_file(path: &Path) -> bool {
    /// Recognised extensions, lowercase and without the leading dot.
    const SEQUENCE_EXTENSIONS: [&str; 6] = ["fasta", "fa", "fas", "fna", "fastq", "fq"];

    /// Case-insensitive membership test against `SEQUENCE_EXTENSIONS`.
    fn is_sequence_ext(ext: &std::ffi::OsStr) -> bool {
        SEQUENCE_EXTENSIONS
            .iter()
            .any(|known| ext.eq_ignore_ascii_case(known))
    }

    let Some(ext) = path.extension() else {
        return false;
    };

    if ext.eq_ignore_ascii_case("gz") {
        // Look at the extension hidden underneath the `.gz` suffix.
        if let Some(inner) = path
            .file_stem()
            .and_then(|stem| Path::new(stem).extension())
        {
            return is_sequence_ext(inner);
        }
    }

    // Plain extension, or a bare `.gz` with nothing underneath (never matches).
    is_sequence_ext(ext)
}
/// Reads every complete entry record from the binary file at `path`.
///
/// The file is consumed in records of `ENTRY_SIZE` bytes. Within a record,
/// bytes `0..8` are decoded as a little-endian `u64` hash and bytes `8..12`
/// as a little-endian `u32` sample id; any further bytes of the record are
/// not interpreted.
///
/// Trailing bytes that do not make up a whole record are ignored.
///
/// # Errors
///
/// Returns an error when the file cannot be opened, its metadata cannot be
/// queried, or a read fails for any reason other than reaching end of file.
pub fn read_entries<P: AsRef<Path>>(path: P) -> io::Result<Vec<Entry>> {
    let file = File::open(path)?;

    // Used only as a capacity hint, so a truncating cast on targets with a
    // narrow `usize` merely yields a smaller initial allocation.
    let file_size = file.metadata()?.len() as usize;
    let entry_count = file_size / ENTRY_SIZE;

    let mut reader = BufReader::with_capacity(8 * 1024 * 1024, file);
    let mut entries = Vec::with_capacity(entry_count);

    let mut buf = [0u8; ENTRY_SIZE];
    let mut hash_bytes = [0u8; 8];
    let mut sample_bytes = [0u8; 4];

    loop {
        match reader.read_exact(&mut buf) {
            Ok(()) => {}
            // End of data, possibly in the middle of a partial trailing record.
            Err(err) if err.kind() == io::ErrorKind::UnexpectedEof => break,
            // A genuine I/O failure must not be mistaken for end of file.
            Err(err) => return Err(err),
        }

        hash_bytes.copy_from_slice(&buf[0..8]);
        sample_bytes.copy_from_slice(&buf[8..12]);
        entries.push(Entry::new(
            u64::from_le_bytes(hash_bytes),
            u32::from_le_bytes(sample_bytes),
        ));
    }

    Ok(entries)
}
/// Writes `entries` to the file at `path`, creating it or truncating an
/// existing one.
///
/// The bytes written are the in-memory representation of the slice as
/// exposed by `bytemuck::cast_slice`. The data goes through an 8 MiB buffered
/// writer which is flushed before returning.
///
/// NOTE(review): `read_entries` decodes fields as little-endian, whereas this
/// writes native memory layout; the two agree only on little-endian targets.
/// Confirm whether big-endian targets need to be supported.
///
/// # Errors
///
/// Returns any error raised while creating the file, writing or flushing.
pub fn write_entries<P: AsRef<Path>>(path: P, entries: &[Entry]) -> io::Result<()> {
    let mut out = BufWriter::with_capacity(8 * 1024 * 1024, File::create(path)?);
    let raw: &[u8] = bytemuck::cast_slice(entries);
    out.write_all(raw)?;
    out.flush()?;
    Ok(())
}
/// Buffered, append-only writer of `Entry` records that keeps a running
/// count of the records written so far.
pub struct EntryWriter {
    /// Buffered handle to the output file.
    writer: BufWriter<File>,
    /// Number of entries successfully handed to `writer`.
    count: u64,
}

impl EntryWriter {
    /// Creates (or truncates) the file at `path` and wraps it in a buffer of
    /// `buffer_size` bytes.
    ///
    /// # Errors
    ///
    /// Returns the error from creating the file.
    pub fn new<P: AsRef<Path>>(path: P, buffer_size: usize) -> io::Result<Self> {
        File::create(path).map(|file| Self {
            writer: BufWriter::with_capacity(buffer_size, file),
            count: 0,
        })
    }

    /// Appends a single entry as its raw bytes and bumps the count by one.
    pub fn write(&mut self, entry: &Entry) -> io::Result<()> {
        self.append(bytemuck::bytes_of(entry), 1)
    }

    /// Appends all of `entries` as raw bytes and bumps the count by their
    /// number.
    pub fn write_batch(&mut self, entries: &[Entry]) -> io::Result<()> {
        self.append(bytemuck::cast_slice(entries), entries.len() as u64)
    }

    /// Number of entries written through this writer so far.
    pub fn count(&self) -> u64 {
        self.count
    }

    /// Flushes buffered bytes to the underlying file.
    pub fn flush(&mut self) -> io::Result<()> {
        self.writer.flush()
    }

    /// Writes `raw` and, only if that succeeded, adds `records` to the count.
    fn append(&mut self, raw: &[u8], records: u64) -> io::Result<()> {
        self.writer.write_all(raw)?;
        self.count += records;
        Ok(())
    }
}
/// Collects the hash of each entry, collapsing runs of equal consecutive
/// hashes into a single value.
///
/// Only neighbouring entries are compared, so the result contains globally
/// unique hashes only when equal hashes are adjacent in `entries` (for
/// example when the slice is sorted by hash). Order of first appearance is
/// preserved.
pub fn extract_unique_hashes(entries: &[Entry]) -> Vec<u64> {
    // Initial capacity is a heuristic guess of one distinct hash per ten entries.
    let mut hashes: Vec<u64> = Vec::with_capacity(entries.len() / 10);

    entries.iter().map(|entry| entry.hash).for_each(|hash| {
        // The most recently pushed value is exactly the previous entry's hash.
        if hashes.last() != Some(&hash) {
            hashes.push(hash);
        }
    });

    hashes
}
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    /// Entries written by `write_entries` are read back unchanged by
    /// `read_entries`.
    #[test]
    fn test_entry_roundtrip() {
        let scratch = tempdir().unwrap();
        let bin_path = scratch.path().join("test.bin");
        let written: Vec<Entry> = [(100, 1), (200, 2), (300, 3)]
            .iter()
            .map(|&(hash, sample_id)| Entry::new(hash, sample_id))
            .collect();

        write_entries(&bin_path, &written).unwrap();
        let read_back = read_entries(&bin_path).unwrap();

        assert_eq!(written, read_back);
    }

    /// Runs of identical hashes collapse to a single value each.
    #[test]
    fn test_extract_unique_hashes() {
        let grouped: Vec<Entry> = [(100, 1), (100, 2), (100, 3), (200, 1), (300, 1), (300, 2)]
            .iter()
            .map(|&(hash, sample_id)| Entry::new(hash, sample_id))
            .collect();

        let distinct = extract_unique_hashes(&grouped);

        assert_eq!(distinct, vec![100, 200, 300]);
    }

    /// `EntryWriter` counts what it writes and produces a file that
    /// `read_entries` can decode.
    #[test]
    fn test_entry_writer() {
        let scratch = tempdir().unwrap();
        let bin_path = scratch.path().join("test.bin");
        let first = Entry::new(10, 1);
        let second = Entry::new(20, 2);

        let mut sink = EntryWriter::new(&bin_path, 4096).unwrap();
        sink.write(&first).unwrap();
        sink.write(&second).unwrap();
        sink.flush().unwrap();
        assert_eq!(sink.count(), 2);

        let read_back = read_entries(&bin_path).unwrap();
        assert_eq!(read_back, vec![Entry::new(10, 1), Entry::new(20, 2)]);
    }
}