use crate::data::db::BioVaultDb;
use anyhow::{Context, Result};
use csv::Writer;
use regex::Regex;
use std::collections::{HashMap, HashSet};
use std::fs;
use std::path::{Path, PathBuf};
use tracing::{debug, info, warn};
pub struct ExtractedFields {
participant_id: Option<String>,
date: Option<String>,
other_fields: HashMap<String, String>,
}
pub async fn create(
input_dir: String,
output_file: String,
file_filter: Option<String>,
extract_cols: Option<String>,
ignore_errors: bool,
) -> Result<()> {
let input_path = Path::new(&input_dir);
let output_path = Path::new(&output_file);
if !input_path.exists() {
anyhow::bail!("Input directory does not exist: {}", input_dir);
}
if !input_path.is_dir() {
anyhow::bail!("Input path is not a directory: {}", input_dir);
}
if output_path.exists() {
anyhow::bail!(
"Output file already exists: {}. Please choose a different name.",
output_file
);
}
let filter_pattern = file_filter.unwrap_or_else(|| "*".to_string());
let extract_pattern = extract_cols.unwrap_or_default();
info!("Creating sample sheet from: {}", input_dir);
info!("Output file: {}", output_file);
info!("File filter: {}", filter_pattern);
if !extract_pattern.is_empty() {
info!("Extract pattern: {}", extract_pattern);
}
let entries = fs::read_dir(input_path)?
.filter_map(|entry| entry.ok())
.filter(|entry| {
if let Ok(metadata) = entry.metadata() {
metadata.is_file()
} else {
false
}
})
.map(|entry| entry.path())
.collect::<Vec<_>>();
let filtered_files = filter_files(&entries, &filter_pattern)?;
info!("Found {} files matching filter", filtered_files.len());
for file in &filtered_files {
println!(" - {}", file.display());
}
println!("Total files matching filter: {}", filtered_files.len());
let mut csv_rows = Vec::new();
let mut matched_count = 0;
let mut failed_files = Vec::new();
if extract_pattern.is_empty() {
for file_path in &filtered_files {
if let Some(file_name) = file_path.file_name() {
if let Some(file_name_str) = file_name.to_str() {
let participant_id = file_name_str
.strip_suffix(".txt")
.or_else(|| file_name_str.strip_suffix(".csv"))
.unwrap_or(file_name_str)
.to_string();
let absolute_path = file_path
.canonicalize()
.unwrap_or_else(|_| file_path.clone());
csv_rows.push((participant_id, absolute_path));
matched_count += 1;
}
}
}
} else {
let pattern_regex = create_extraction_regex(&extract_pattern)?;
for file_path in &filtered_files {
if let Some(file_name) = file_path.file_name() {
if let Some(file_name_str) = file_name.to_str() {
match extract_fields(file_name_str, &pattern_regex, &extract_pattern) {
Some(fields) => {
if let Some(participant_id) = fields.participant_id {
let absolute_path = file_path
.canonicalize()
.unwrap_or_else(|_| file_path.clone());
csv_rows.push((participant_id, absolute_path));
matched_count += 1;
}
}
None => {
failed_files.push(file_name_str.to_string());
}
}
}
}
}
}
if !failed_files.is_empty() {
warn!("Files that didn't match the extraction pattern:");
for file in &failed_files {
println!(" ✗ {}", file);
}
if !ignore_errors && !failed_files.is_empty() {
anyhow::bail!(
"{} files didn't match the extraction pattern. Use --ignore to add them anyway.",
failed_files.len()
);
}
}
if !extract_pattern.is_empty() {
csv_rows.sort_by(|a, b| a.0.cmp(&b.0));
info!("Sorted {} participants by participant_id", csv_rows.len());
}
let mut csv_content = String::from("participant_id,genotype_file_path\n");
for (participant_id, file_path) in &csv_rows {
csv_content.push_str(&format!("{},{}\n", participant_id, file_path.display()));
}
fs::write(output_path, csv_content)
.with_context(|| format!("Failed to write CSV file: {}", output_file))?;
println!("\n✓ Sample sheet created: {}", output_file);
println!(" Added {} participants to CSV", matched_count);
if !failed_files.is_empty() && ignore_errors {
println!(
" Ignored {} files that didn't match pattern",
failed_files.len()
);
}
Ok(())
}
pub async fn export_catalog(
output_file: String,
data_type: String,
participants: Option<String>,
path_column: String,
) -> Result<()> {
let db = BioVaultDb::new()?;
let mut stmt = db.conn.prepare(
"SELECT f.file_path, f.data_type, p.participant_id
FROM files f
LEFT JOIN participants p ON f.participant_id = p.id
ORDER BY f.file_path",
)?;
let mut rows = stmt.query([])?;
if let Some(parent) = Path::new(&output_file).parent() {
if !parent.as_os_str().is_empty() {
fs::create_dir_all(parent)?;
}
}
let mut writer = Writer::from_path(&output_file)?;
let path_header = if path_column.trim().is_empty() {
"genotype_file"
} else {
path_column.trim()
};
writer.write_record(["participant_id", path_header])?;
let dt_filter = if data_type.eq_ignore_ascii_case("all") {
None
} else {
Some(data_type.to_lowercase())
};
let participant_filter: Option<HashSet<String>> = participants.as_ref().and_then(|raw| {
if raw.trim().is_empty() || raw.eq_ignore_ascii_case("all") {
None
} else {
Some(
raw.split(',')
.filter_map(|p| {
let trimmed = p.trim();
if trimmed.is_empty() {
None
} else {
Some(trimmed.to_lowercase())
}
})
.collect::<HashSet<_>>(),
)
}
});
let mut written = 0usize;
while let Some(row) = rows.next()? {
let file_path: String = row.get(0)?;
if file_path.trim().is_empty() {
continue;
}
if let Some(ref dt) = dt_filter {
let row_type: Option<String> = row.get(1)?;
let matches = row_type
.as_ref()
.map(|value| value.to_lowercase() == *dt)
.unwrap_or(false);
if !matches {
continue;
}
}
let participant: Option<String> = row.get(2)?;
let mut participant_id = participant.unwrap_or_default();
if participant_id.trim().is_empty() {
participant_id = Path::new(&file_path)
.file_stem()
.and_then(|s| s.to_str())
.unwrap_or("unknown")
.to_string();
}
let participant_id = participant_id.trim().to_string();
if let Some(ref filter) = participant_filter {
if !filter.contains(&participant_id.to_lowercase()) {
continue;
}
}
writer.write_record([participant_id, file_path])?;
written += 1;
}
writer.flush()?;
if written == 0 {
anyhow::bail!(
"No cataloged files matched the requested criteria. Import data before exporting."
);
}
println!("✓ Exported {} catalog entries to {}", written, output_file);
Ok(())
}
fn filter_files(files: &[PathBuf], pattern: &str) -> Result<Vec<PathBuf>> {
if pattern == "*" {
return Ok(files.to_vec());
}
let glob = globset::GlobBuilder::new(pattern)
.literal_separator(false)
.build()
.context("Invalid file filter pattern")?;
let matcher = glob.compile_matcher();
Ok(files
.iter()
.filter(|path| {
if let Some(file_name) = path.file_name() {
if let Some(file_name_str) = file_name.to_str() {
return matcher.is_match(file_name_str);
}
}
false
})
.cloned()
.collect())
}
fn create_extraction_regex(pattern: &str) -> Result<Regex> {
let mut regex_pattern = pattern.to_string();
let field_regex = Regex::new(r"\{([^}]+)\}")?;
let mut replacements = Vec::new();
for cap in field_regex.captures_iter(pattern) {
if let Some(field_match) = cap.get(0) {
if let Some(field_name) = cap.get(1) {
let field_str = field_match.as_str();
let name_str = field_name.as_str();
let capture_group = format!("(?P<{}>[^_]+)", name_str.replace('-', "_"));
replacements.push((field_str.to_string(), capture_group));
}
}
}
for (field_str, capture_group) in replacements {
regex_pattern = regex_pattern.replace(&field_str, &capture_group);
}
regex_pattern = regex_pattern.replace(".", "\\.");
debug!("Generated regex pattern: {}", regex_pattern);
Regex::new(®ex_pattern).context("Failed to create extraction regex")
}
fn extract_fields(
filename: &str,
pattern_regex: &Regex,
_original_pattern: &str,
) -> Option<ExtractedFields> {
if let Some(captures) = pattern_regex.captures(filename) {
let mut fields = ExtractedFields {
participant_id: None,
date: None,
other_fields: HashMap::new(),
};
if let Some(participant_id) = captures.name("participant_id") {
fields.participant_id = Some(participant_id.as_str().to_string());
}
if let Some(date) = captures.name("date") {
fields.date = Some(date.as_str().to_string());
}
for (idx, mat) in captures.iter().enumerate() {
if idx == 0 {
continue;
}
if let Some(m) = mat {
if let Some(name) = pattern_regex.capture_names().nth(idx).flatten() {
if name != "participant_id" && name != "date" {
fields
.other_fields
.insert(name.to_string(), m.as_str().to_string());
}
}
}
}
Some(fields)
} else {
None
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::fs;
use tempfile::TempDir;
#[test]
fn filter_files_respects_glob_patterns() {
let base = TempDir::new().unwrap();
let file_a = base.path().join("sample_A.txt");
let file_b = base.path().join("control.csv");
fs::write(&file_a, "data").unwrap();
fs::write(&file_b, "data").unwrap();
let inputs = vec![file_a.clone(), file_b.clone()];
let filtered = filter_files(&inputs, "sample_*.txt").unwrap();
assert_eq!(filtered.len(), 1);
assert_eq!(filtered[0], file_a);
let all = filter_files(&inputs, "*").unwrap();
assert_eq!(all.len(), 2);
}
#[test]
fn create_extraction_regex_builds_named_groups() {
let regex = create_extraction_regex("participant_{participant_id}_{date}.txt").unwrap();
let caps = regex.captures("participant_1234_20240101.txt").unwrap();
assert_eq!(caps.name("participant_id").unwrap().as_str(), "1234");
assert_eq!(caps.name("date").unwrap().as_str(), "20240101");
}
#[test]
fn extract_fields_returns_named_and_other_values() {
let regex =
create_extraction_regex("participant_{participant_id}_{date}_{batch_id}.txt").unwrap();
let fields = extract_fields(
"participant_9876_20231201_batch42.txt",
®ex,
"participant_{participant_id}_{date}_{batch_id}.txt",
)
.expect("expected captures");
assert_eq!(fields.participant_id.as_deref(), Some("9876"));
assert_eq!(fields.date.as_deref(), Some("20231201"));
assert_eq!(
fields.other_fields.get("batch_id"),
Some(&"batch42".to_string())
);
}
#[tokio::test]
async fn create_generates_csv_without_extract_pattern() {
let input = TempDir::new().unwrap();
let output = input.path().join("sheet.csv");
let file_a = input.path().join("alpha.txt");
let file_b = input.path().join("beta.csv");
fs::write(&file_a, "A").unwrap();
fs::write(&file_b, "B").unwrap();
create(
input.path().to_string_lossy().to_string(),
output.to_string_lossy().to_string(),
Some("*.txt".to_string()),
None,
false,
)
.await
.expect("csv creation succeeds");
let csv = fs::read_to_string(&output).unwrap();
assert!(csv.contains("participant_id,genotype_file_path"));
assert!(csv.contains("alpha"));
assert!(!csv.contains("beta"));
}
#[tokio::test]
async fn create_errors_when_pattern_does_not_match_and_ignore_off() {
let input = TempDir::new().unwrap();
let output = input.path().join("sheet.csv");
let file = input.path().join("sample_foo.txt");
fs::write(&file, "data").unwrap();
let result = create(
input.path().to_string_lossy().to_string(),
output.to_string_lossy().to_string(),
Some("*.txt".to_string()),
Some("sample_{participant_id}_{date}.txt".to_string()),
false,
)
.await;
assert!(result.is_err());
assert!(!output.exists());
}
}