use anyhow::{anyhow, bail, Result};
use chrono::{DateTime, Utc};
use regex::Regex;
use rusqlite::params;
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};
use std::fs;
use std::path::{Path, PathBuf};
use walkdir::WalkDir;
#[cfg(test)]
use sha2::{Digest, Sha256};
#[cfg(test)]
use std::io::Read;
use super::{BioVaultDb, GenotypeMetadata};
/// Aggregate statistics for one file extension, as produced by `scan`.
#[derive(Debug, Serialize, Deserialize)]
pub struct ExtensionInfo {
/// Extension including the leading dot (e.g. `.txt`).
pub extension: String,
/// Number of scanned files carrying this extension.
pub count: usize,
/// Sum of the sizes, in bytes, of those files.
pub total_size: u64,
}
/// A single file discovered by `scan`.
#[derive(Debug, Serialize, Deserialize)]
pub struct FileInfo {
/// File path, converted lossily to a string.
pub path: String,
/// File size in bytes; `scan` stores 0 when the metadata cannot be read.
pub size: u64,
/// Last-modified time as an RFC 3339 UTC timestamp, when available.
pub modified: Option<String>,
}
/// Outcome of a directory scan performed by `scan`.
#[derive(Debug, Serialize, Deserialize)]
pub struct ScanResult {
/// The path that was scanned, exactly as passed in.
pub path: String,
/// RFC 3339 UTC timestamp taken when the scan finished.
pub scanned_at: String,
/// Per-extension statistics, sorted by file count in descending order.
pub extensions: Vec<ExtensionInfo>,
/// Every file that passed the extension filter.
pub files: Vec<FileInfo>,
/// Length of `files`.
pub total_files: usize,
}
/// A row of the `files` table joined with its participant and genotype
/// metadata, as loaded by `get_file_by_id` and `list_files`.
#[derive(Debug, Serialize, Deserialize)]
pub struct FileRecord {
/// Primary key in the `files` table.
pub id: i64,
pub file_path: String,
/// Content hash stored for the file at import time.
pub file_hash: String,
/// File extension including the leading dot, when the file has one.
pub file_type: Option<String>,
/// Size in bytes.
pub file_size: Option<u64>,
pub data_type: Option<String>,
// The next five fields are read from the joined `genotype_metadata` row
// and are therefore `None` when no such row exists.
pub source: Option<String>,
pub grch_version: Option<String>,
pub row_count: Option<i64>,
pub chromosome_count: Option<i64>,
pub inferred_sex: Option<String>,
pub status: Option<String>,
pub processing_error: Option<String>,
/// External participant identifier (`participants.participant_id`).
pub participant_id: Option<String>,
/// NOTE(review): the queries in this file fill this from
/// `participants.participant_id` as well, so it mirrors `participant_id`.
pub participant_name: Option<String>,
pub created_at: String,
pub updated_at: String,
}
/// Summary returned by `import` and `import_from_csv`.
#[derive(Debug, Serialize, Deserialize)]
pub struct ImportResult {
/// Number of files for which the per-file import returned a file id.
pub imported: usize,
/// Number of files for which the per-file import returned `None`.
pub skipped: usize,
/// One `"<path>: <error>"` message per file whose import failed.
pub errors: Vec<String>,
/// Database records for the imported files that could be read back.
pub files: Vec<FileRecord>,
}
/// One file to import together with caller-supplied metadata; the input
/// element type of `import_from_csv`.
#[derive(Debug, Clone)]
pub struct CsvFileImport {
/// Path of the file on disk; the import fails if it does not exist.
pub file_path: String,
/// External participant identifier; the participant row is created on
/// demand when it does not exist yet.
pub participant_id: Option<String>,
pub data_type: Option<String>,
pub source: Option<String>,
pub grch_version: Option<String>,
// The three values below are only filled in by file analysis when they are
// `None` here and analysis is requested.
pub row_count: Option<i64>,
pub chromosome_count: Option<i64>,
pub inferred_sex: Option<String>,
}
/// A participant-ID extraction pattern proposed by `suggest_patterns`.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct PatternSuggestion {
/// Template form of the pattern (e.g. `{parent:{id}}`).
pub pattern: String,
/// Regular-expression form with a named `id` capture group.
pub regex_pattern: String,
/// Human-readable explanation of what the pattern matches.
pub description: String,
/// One file name (or `parent/file`) the pattern was derived from.
pub example: String,
/// `(source, extracted id)` pairs illustrating the pattern.
pub sample_extractions: Vec<(String, String)>, }
/// Result of `suggest_patterns`.
#[derive(Debug, Serialize, Deserialize)]
pub struct SuggestPatternsResult {
/// Candidate patterns, highest coverage first.
pub suggestions: Vec<PatternSuggestion>,
/// Up to five file names taken from the scanned files.
pub sample_files: Vec<String>,
}
/// Walks `path` and lists every regular file, with per-extension statistics.
///
/// # Arguments
/// * `path` - file or directory to scan.
/// * `extension` - optional comma-separated list of extensions to keep
///   (`"txt,.csv"`); the leading dot is optional and surrounding whitespace
///   is ignored. Files without an extension never pass a filter.
/// * `recursive` - when `false`, only the direct children of `path` are
///   visited.
///
/// # Errors
/// Fails when `path` does not exist. Entries that cannot be read while
/// walking are skipped silently.
pub fn scan(path: &str, extension: Option<&str>, recursive: bool) -> Result<ScanResult> {
    let path_buf = PathBuf::from(path);
    if !path_buf.exists() {
        anyhow::bail!("Path does not exist: {}", path);
    }

    // Normalise the filter once instead of re-splitting it for every file.
    let filters: Option<Vec<String>> = extension.map(|filter| {
        filter
            .split(',')
            .map(|raw| {
                let raw = raw.trim();
                if raw.starts_with('.') {
                    raw.to_string()
                } else {
                    format!(".{}", raw)
                }
            })
            .collect()
    });

    let mut extension_counts: HashMap<String, (usize, u64)> = HashMap::new();
    let mut files = Vec::new();
    let walker = if recursive {
        WalkDir::new(path).follow_links(false)
    } else {
        WalkDir::new(path).max_depth(1).follow_links(false)
    };

    for entry in walker.into_iter().filter_map(|e| e.ok()) {
        if !entry.file_type().is_file() {
            continue;
        }
        let file_path = entry.path();
        let file_ext = file_path
            .extension()
            .and_then(|e| e.to_str())
            .map(|e| format!(".{}", e));

        if let Some(filters) = &filters {
            match &file_ext {
                Some(ext) if filters.iter().any(|f| f == ext) => {}
                _ => continue,
            }
        }

        // A single metadata lookup serves both the size and the timestamp.
        let metadata = entry.metadata().ok();
        let size = metadata.as_ref().map(|m| m.len()).unwrap_or(0);
        let modified = metadata
            .as_ref()
            .and_then(|m| m.modified().ok())
            .map(|t| {
                let datetime: DateTime<Utc> = t.into();
                datetime.to_rfc3339()
            });

        if let Some(ext) = &file_ext {
            let stats = extension_counts.entry(ext.clone()).or_insert((0, 0));
            stats.0 += 1;
            stats.1 += size;
        }
        files.push(FileInfo {
            path: file_path.to_string_lossy().to_string(),
            size,
            modified,
        });
    }

    let mut extensions: Vec<ExtensionInfo> = extension_counts
        .into_iter()
        .map(|(extension, (count, total_size))| ExtensionInfo {
            extension,
            count,
            total_size,
        })
        .collect();
    // Most frequent first; ties are broken by name so the output does not
    // depend on HashMap iteration order.
    extensions.sort_by(|a, b| {
        b.count
            .cmp(&a.count)
            .then_with(|| a.extension.cmp(&b.extension))
    });

    Ok(ScanResult {
        path: path.to_string(),
        scanned_at: Utc::now().to_rfc3339(),
        extensions,
        total_files: files.len(),
        files,
    })
}
/// Scans `path` and registers every matching file in the database.
///
/// When `pattern` is given it is compiled once and applied to each file path
/// to derive a participant ID. Per-file failures are collected in
/// `ImportResult::errors` rather than aborting the run.
///
/// # Errors
/// Fails when the scan fails or `pattern` cannot be compiled.
pub fn import(
    db: &BioVaultDb,
    path: &str,
    extension: Option<&str>,
    recursive: bool,
    pattern: Option<&str>,
) -> Result<ImportResult> {
    let scan_result = scan(path, extension, recursive)?;
    let matcher = pattern.map(compile_pattern).transpose()?;

    let mut summary = ImportResult {
        imported: 0,
        skipped: 0,
        errors: Vec::new(),
        files: Vec::new(),
    };
    let mut new_ids = Vec::new();

    for file_info in &scan_result.files {
        let participant = matcher.as_ref().and_then(|m| m.extract(&file_info.path));
        match import_file_with_participant(db, file_info, participant.as_deref()) {
            Ok(Some(file_id)) => {
                summary.imported += 1;
                new_ids.push(file_id);
            }
            Ok(None) => summary.skipped += 1,
            Err(e) => summary.errors.push(format!("{}: {}", file_info.path, e)),
        }
    }

    // Read the stored rows back; ids that cannot be loaded are left out.
    summary.files = new_ids
        .into_iter()
        .filter_map(|file_id| get_file_by_id(db, file_id).ok().flatten())
        .collect();

    Ok(summary)
}
/// Imports a batch of files described by CSV rows.
///
/// Each row is handed to `import_file_with_metadata`; `run_analysis` is
/// forwarded unchanged. Per-row failures are collected in
/// `ImportResult::errors` rather than aborting the batch.
pub fn import_from_csv(
    db: &BioVaultDb,
    csv_imports: Vec<CsvFileImport>,
    run_analysis: bool,
) -> Result<ImportResult> {
    let mut summary = ImportResult {
        imported: 0,
        skipped: 0,
        errors: Vec::new(),
        files: Vec::new(),
    };
    let mut new_ids = Vec::new();

    for csv_row in &csv_imports {
        match import_file_with_metadata(db, csv_row, run_analysis) {
            Ok(Some(file_id)) => {
                summary.imported += 1;
                new_ids.push(file_id);
            }
            Ok(None) => summary.skipped += 1,
            Err(e) => summary.errors.push(format!("{}: {}", csv_row.file_path, e)),
        }
    }

    // Read the stored rows back; ids that cannot be loaded are left out.
    summary.files = new_ids
        .into_iter()
        .filter_map(|file_id| get_file_by_id(db, file_id).ok().flatten())
        .collect();

    Ok(summary)
}
fn import_file_with_metadata(
db: &BioVaultDb,
csv_row: &CsvFileImport,
run_analysis: bool,
) -> Result<Option<i64>> {
use crate::cli::download_cache::calculate_blake3;
use std::path::Path;
let file_path = &csv_row.file_path;
let path = Path::new(file_path);
if !path.exists() {
anyhow::bail!("File not found: {}", file_path);
}
let metadata = std::fs::metadata(path)?;
let file_size = metadata.len();
let file_hash = calculate_blake3(path).unwrap_or_else(|e| {
eprintln!("Warning: Failed to hash {}: {}", file_path, e);
format!("error_{}", file_size)
});
let file_type = path
.extension()
.and_then(|e| e.to_str())
.map(|e| format!(".{}", e));
let db_participant_id: Option<i64> = if let Some(pid) = &csv_row.participant_id {
Some(get_or_create_participant(db, pid)?)
} else {
None
};
let existing: Option<(String, i64)> = db
.conn
.query_row(
"SELECT file_hash, id FROM files WHERE file_path = ?1",
params![file_path],
|row| Ok((row.get(0)?, row.get(1)?)),
)
.ok();
let file_id = if let Some((existing_hash, existing_id)) = existing {
if existing_hash == file_hash {
db.conn.execute(
"UPDATE files SET participant_id = ?1, data_type = ?2, source = ?3, grch_version = ?4, updated_at = CURRENT_TIMESTAMP WHERE id = ?5",
params![
db_participant_id,
csv_row.data_type,
csv_row.source,
csv_row.grch_version,
existing_id
],
)?;
existing_id
} else {
db.conn.execute(
"UPDATE files SET file_hash = ?1, file_size = ?2, participant_id = ?3, data_type = ?4, source = ?5, grch_version = ?6, updated_at = CURRENT_TIMESTAMP WHERE id = ?7",
params![
file_hash,
file_size as i64,
db_participant_id,
csv_row.data_type,
csv_row.source,
csv_row.grch_version,
existing_id
],
)?;
existing_id
}
} else {
db.conn.execute(
"INSERT INTO files (participant_id, file_path, file_hash, file_type, file_size, data_type, source, grch_version) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8)",
params![
db_participant_id,
file_path,
file_hash,
file_type,
file_size as i64,
csv_row.data_type,
csv_row.source,
csv_row.grch_version
],
)?;
db.conn.last_insert_rowid()
};
let mut metadata = GenotypeMetadata {
data_type: csv_row
.data_type
.clone()
.unwrap_or_else(|| "Unknown".to_string()),
source: csv_row.source.clone(),
grch_version: csv_row.grch_version.clone(),
row_count: csv_row.row_count,
chromosome_count: csv_row.chromosome_count,
inferred_sex: csv_row.inferred_sex.clone(),
};
if run_analysis {
match crate::data::detect_genotype_metadata(file_path) {
Ok(detected) => {
if metadata.data_type == "Unknown" {
metadata.data_type = detected.data_type;
}
if metadata.source.is_none() {
metadata.source = detected.source;
}
if metadata.grch_version.is_none() {
metadata.grch_version = detected.grch_version;
}
}
Err(e) => {
eprintln!(
"Warning: Failed to detect metadata for {}: {}",
file_path, e
);
}
}
if metadata.data_type == "Genotype"
&& (metadata.row_count.is_none()
|| metadata.chromosome_count.is_none()
|| metadata.inferred_sex.is_none())
{
match crate::data::analyze_genotype_file(file_path) {
Ok(analysis) => {
if metadata.row_count.is_none() {
metadata.row_count = analysis.row_count;
}
if metadata.chromosome_count.is_none() {
metadata.chromosome_count = analysis.chromosome_count;
}
if metadata.inferred_sex.is_none() {
metadata.inferred_sex = analysis.inferred_sex;
}
}
Err(e) => {
eprintln!("Warning: Failed to analyze {}: {}", file_path, e);
}
}
}
}
update_file_from_queue(db, file_id, &file_hash, Some(&metadata))?;
Ok(Some(file_id))
}
fn import_file_with_participant(
db: &BioVaultDb,
file_info: &FileInfo,
participant_id: Option<&str>,
) -> Result<Option<i64>> {
use crate::cli::download_cache::calculate_blake3;
use std::path::Path;
let file_path = &file_info.path;
let file_hash = calculate_blake3(Path::new(file_path)).unwrap_or_else(|e| {
eprintln!("Warning: Failed to hash {}: {}", file_path, e);
format!("error_{}", file_info.size)
});
let file_type = Path::new(file_path)
.extension()
.and_then(|e| e.to_str())
.map(|e| format!(".{}", e));
let db_participant_id: Option<i64> = if let Some(pid) = participant_id {
Some(get_or_create_participant(db, pid)?)
} else {
None
};
let existing: Option<(String, i64)> = db
.conn
.query_row(
"SELECT file_hash, id FROM files WHERE file_path = ?1",
params![file_path],
|row| Ok((row.get(0)?, row.get(1)?)),
)
.ok();
if let Some((existing_hash, existing_id)) = existing {
if existing_hash == file_hash {
if db_participant_id.is_some() {
db.conn.execute(
"UPDATE files SET participant_id = ?1, updated_at = CURRENT_TIMESTAMP WHERE id = ?2",
params![db_participant_id, existing_id],
)?;
}
return Ok(None); } else {
db.conn.execute(
"UPDATE files SET file_hash = ?1, file_size = ?2, participant_id = ?3, updated_at = CURRENT_TIMESTAMP WHERE id = ?4",
params![file_hash, file_info.size as i64, db_participant_id, existing_id],
)?;
return Ok(Some(existing_id));
}
}
db.conn.execute(
"INSERT INTO files (participant_id, file_path, file_hash, file_type, file_size) VALUES (?1, ?2, ?3, ?4, ?5)",
params![db_participant_id, file_path, file_hash, file_type, file_info.size as i64],
)?;
let file_id = db.conn.last_insert_rowid();
Ok(Some(file_id))
}
fn get_or_create_participant(db: &BioVaultDb, participant_id: &str) -> Result<i64> {
match db.conn.query_row(
"SELECT id FROM participants WHERE participant_id = ?1",
params![participant_id],
|row| row.get(0),
) {
Ok(id) => Ok(id),
Err(rusqlite::Error::QueryReturnedNoRows) => {
db.conn.execute(
"INSERT INTO participants (participant_id) VALUES (?1)",
params![participant_id],
)?;
Ok(db.conn.last_insert_rowid())
}
Err(e) => Err(e.into()),
}
}
pub fn get_file_by_id(db: &BioVaultDb, file_id: i64) -> Result<Option<FileRecord>> {
let result = db.conn.query_row(
"SELECT f.id, f.file_path, f.file_hash, f.file_type, f.file_size,
f.data_type, g.source, g.grch_version,
g.row_count, g.chromosome_count, g.inferred_sex,
f.status, f.processing_error,
p.participant_id, p.participant_id, f.created_at, f.updated_at
FROM files f
LEFT JOIN participants p ON f.participant_id = p.id
LEFT JOIN genotype_metadata g ON f.id = g.file_id
WHERE f.id = ?1",
params![file_id],
|row| {
Ok(FileRecord {
id: row.get(0)?,
file_path: row.get(1)?,
file_hash: row.get(2)?,
file_type: row.get(3)?,
file_size: row.get::<_, Option<i64>>(4)?.map(|s| s as u64),
data_type: row.get(5)?,
source: row.get(6)?,
grch_version: row.get(7)?,
row_count: row.get(8)?,
chromosome_count: row.get(9)?,
inferred_sex: row.get(10)?,
status: row.get(11)?,
processing_error: row.get(12)?,
participant_id: row.get(13)?,
participant_name: row.get(14)?,
created_at: row.get(15)?,
updated_at: row.get(16)?,
})
},
);
match result {
Ok(record) => Ok(Some(record)),
Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None),
Err(e) => Err(e.into()),
}
}
/// Lists stored files, newest first, with optional filters.
///
/// # Arguments
/// * `extension` - keep only rows whose `file_type` equals this value.
/// * `participant` - keep only rows linked to this external participant id.
/// * `unassigned` - keep only rows without a participant link.
/// * `limit` - maximum number of rows to return.
///
/// # Errors
/// Fails when the statement cannot be prepared or a row cannot be read.
pub fn list_files(
    db: &BioVaultDb,
    extension: Option<&str>,
    participant: Option<&str>,
    unassigned: bool,
    limit: Option<usize>,
) -> Result<Vec<FileRecord>> {
    let mut sql = String::from(
        "SELECT f.id, f.file_path, f.file_hash, f.file_type, f.file_size,
f.data_type, g.source, g.grch_version,
g.row_count, g.chromosome_count, g.inferred_sex,
f.status, f.processing_error,
p.participant_id, p.participant_id, f.created_at, f.updated_at
FROM files f
LEFT JOIN participants p ON f.participant_id = p.id
LEFT JOIN genotype_metadata g ON f.id = g.file_id
WHERE 1=1",
    );

    // Each optional filter appends one clause and, where needed, one bound
    // value; positional `?` placeholders keep the two in step.
    let mut bound: Vec<Box<dyn rusqlite::ToSql>> = Vec::new();
    if let Some(ext) = extension {
        sql += " AND f.file_type = ?";
        bound.push(Box::new(ext.to_string()));
    }
    if let Some(pid) = participant {
        sql += " AND p.participant_id = ?";
        bound.push(Box::new(pid.to_string()));
    }
    if unassigned {
        sql += " AND f.participant_id IS NULL";
    }
    sql += " ORDER BY f.created_at DESC";
    if let Some(max_rows) = limit {
        sql += &format!(" LIMIT {}", max_rows);
    }

    let bound_refs: Vec<&dyn rusqlite::ToSql> = bound.iter().map(|b| b.as_ref()).collect();
    let mut stmt = db.conn.prepare(&sql)?;
    let records = stmt
        .query_map(bound_refs.as_slice(), |row| {
            // SQLite stores the size as a signed integer.
            let stored_size: Option<i64> = row.get(4)?;
            Ok(FileRecord {
                id: row.get(0)?,
                file_path: row.get(1)?,
                file_hash: row.get(2)?,
                file_type: row.get(3)?,
                file_size: stored_size.map(|s| s as u64),
                data_type: row.get(5)?,
                source: row.get(6)?,
                grch_version: row.get(7)?,
                row_count: row.get(8)?,
                chromosome_count: row.get(9)?,
                inferred_sex: row.get(10)?,
                status: row.get(11)?,
                processing_error: row.get(12)?,
                participant_id: row.get(13)?,
                participant_name: row.get(14)?,
                created_at: row.get(15)?,
                updated_at: row.get(16)?,
            })
        })?
        .collect::<rusqlite::Result<Vec<FileRecord>>>()?;

    Ok(records)
}
/// Scans `path` and proposes participant-ID extraction patterns based on the
/// directory and file names found.
///
/// Hidden files (names starting with `.`) are ignored. The result is empty
/// when no usable file is found.
///
/// # Errors
/// Fails when the underlying scan fails.
pub fn suggest_patterns(
    path: &str,
    extension: Option<&str>,
    recursive: bool,
) -> Result<SuggestPatternsResult> {
    let scan_result = scan(path, extension, recursive)?;

    let entries = if scan_result.files.is_empty() {
        Vec::new()
    } else {
        collect_file_entries(&scan_result.files)
    };
    if entries.is_empty() {
        return Ok(SuggestPatternsResult {
            suggestions: vec![],
            sample_files: vec![],
        });
    }

    // Run every heuristic, then merge duplicates and rank by coverage.
    let mut candidates = suggest_directory_candidates(&entries);
    candidates.extend(suggest_leading_numeric_candidates(&entries));
    candidates.extend(suggest_alpha_prefix_numeric_candidates(&entries));
    candidates.extend(suggest_generic_numeric_candidates(&entries));

    let mut suggestions = Vec::new();
    for candidate in coalesce_candidates(candidates) {
        suggestions.push(PatternSuggestion {
            pattern: candidate.pattern,
            regex_pattern: candidate.regex_pattern,
            description: candidate.description,
            example: candidate.example,
            sample_extractions: candidate.sample_extractions,
        });
    }

    let sample_files = entries
        .iter()
        .map(|entry| &entry.file_name)
        .filter(|name| !name.is_empty())
        .take(5)
        .cloned()
        .collect();

    Ok(SuggestPatternsResult {
        suggestions,
        sample_files,
    })
}
/// Name components of one scanned file, used by the pattern heuristics.
#[derive(Clone, Debug)]
struct FileEntry {
/// Final path component, including any extension.
file_name: String,
/// File name without its last extension, when it is valid UTF-8.
stem: Option<String>,
/// Name of the immediate parent directory, when there is one.
parent: Option<String>,
}
/// Internal form of a pattern suggestion before ranking; mirrors
/// `PatternSuggestion` plus the `coverage` score used for sorting.
#[derive(Clone, Debug)]
struct PatternCandidate {
/// Template form of the pattern; also the key used to merge duplicates.
pattern: String,
/// Regular-expression form with a named `id` capture group.
regex_pattern: String,
description: String,
example: String,
/// `(source, extracted id)` pairs illustrating the pattern.
sample_extractions: Vec<(String, String)>,
/// Number of files the heuristic counted as matching this candidate.
coverage: usize,
}
/// Which part of a file path a compiled pattern is matched against.
#[derive(Clone, Copy, Debug)]
enum PatternScope {
/// The complete path string as given.
Path,
/// The name of the immediate parent directory.
Parent,
/// The file name including its extension.
Basename,
/// The file name without its last extension.
Stem,
}
/// A participant-ID pattern ready for repeated use: a regular expression
/// plus the part of the path it applies to.
#[derive(Clone, Debug)]
pub struct CompiledPattern {
// Selects which path component `extract` feeds to the regex.
scope: PatternScope,
// Expression whose `id` group (or first participating group) is the ID.
regex: Regex,
}
impl CompiledPattern {
fn new(scope: PatternScope, regex: Regex) -> Self {
Self { scope, regex }
}
pub fn extract(&self, filepath: &str) -> Option<String> {
let target = match self.scope {
PatternScope::Path => Some(filepath.to_string()),
PatternScope::Parent => Path::new(filepath)
.parent()
.and_then(|p| p.file_name())
.and_then(|n| n.to_str())
.map(|s| s.to_string()),
PatternScope::Basename => Path::new(filepath)
.file_name()
.and_then(|n| n.to_str())
.map(|s| s.to_string()),
PatternScope::Stem => Path::new(filepath)
.file_stem()
.and_then(|n| n.to_str())
.map(|s| s.to_string()),
}?;
let caps = self.regex.captures(&target)?;
if let Some(id) = caps.name("id") {
return Some(id.as_str().to_string());
}
for idx in 1..caps.len() {
if let Some(mat) = caps.get(idx) {
return Some(mat.as_str().to_string());
}
}
None
}
}
/// Compiles a participant-ID pattern (shortcut keyword, `{scope:template}`,
/// `{id}` template, or raw regular expression) into a reusable matcher.
///
/// # Errors
/// Returns whatever error `compile_pattern_internal` reports, e.g. for an
/// empty pattern or an invalid regular expression.
pub fn compile_pattern(pattern: &str) -> Result<CompiledPattern> {
compile_pattern_internal(pattern)
}
/// One-shot convenience: compiles `pattern` and applies it to `filepath`.
///
/// Returns `Ok(None)` when the pattern is valid but yields no ID for this
/// path.
///
/// # Errors
/// Fails when `pattern` cannot be compiled.
pub fn extract_id_from_pattern(filepath: &str, pattern: &str) -> Result<Option<String>> {
    compile_pattern_internal(pattern).map(|matcher| matcher.extract(filepath))
}
fn compile_pattern_internal(pattern: &str) -> Result<CompiledPattern> {
let trimmed = pattern.trim();
if trimmed.is_empty() {
bail!("Pattern cannot be empty");
}
match trimmed {
"{parent}" | "{dirname}" | "{dir}" | "{id}/*" => {
let regex = Regex::new(r"^(?P<id>[A-Za-z0-9._-]+)$").unwrap();
return Ok(CompiledPattern::new(PatternScope::Parent, regex));
}
"{filename}" => {
let regex = Regex::new(r"^(?P<id>.+)$").unwrap();
return Ok(CompiledPattern::new(PatternScope::Stem, regex));
}
"{basename}" => {
let regex = Regex::new(r"^(?P<id>.+)$").unwrap();
return Ok(CompiledPattern::new(PatternScope::Basename, regex));
}
_ => {}
}
if let Some((scope_label, template)) = parse_structured_template(trimmed) {
let scope = parse_scope(&scope_label)?;
let regex_source = build_regex_from_template(&template, scope)?;
let regex = Regex::new(®ex_source).map_err(|err| {
anyhow!(
"Invalid regex derived from template '{}': {}",
template,
err
)
})?;
ensure_has_capture(®ex)?;
return Ok(CompiledPattern::new(scope, regex));
}
if trimmed.contains("{id}") {
let scope = if trimmed.contains('/') {
PatternScope::Path
} else {
PatternScope::Basename
};
let regex_source = build_regex_from_template(trimmed, scope)?;
let regex = Regex::new(®ex_source)
.map_err(|err| anyhow!("Invalid regex derived from pattern '{}': {}", trimmed, err))?;
ensure_has_capture(®ex)?;
return Ok(CompiledPattern::new(scope, regex));
}
let regex =
Regex::new(trimmed).map_err(|err| anyhow!("Invalid regex '{}': {}", trimmed, err))?;
ensure_has_capture(®ex)?;
Ok(CompiledPattern::new(PatternScope::Path, regex))
}
/// Verifies that `regex` can yield an ID: it must define a group named `id`
/// or at least one numbered capture group.
///
/// # Errors
/// Fails when the expression has no capture group at all.
fn ensure_has_capture(regex: &Regex) -> Result<()> {
    let has_named_id = regex.capture_names().flatten().any(|name| name == "id");
    // `captures_len` counts the implicit whole-match group, hence `> 1`.
    let has_numbered_group = regex.captures_len() > 1;

    if !(has_named_id || has_numbered_group) {
        bail!("Regex must contain at least one capture group (use (?P<id>...))");
    }
    Ok(())
}
/// Splits a `{scope:template}` pattern into its trimmed scope label and
/// template.
///
/// Returns `None` when the pattern is not wrapped in braces, contains no
/// `:`, or has an empty scope or template. Only the first `:` separates the
/// two parts, so the template itself may contain colons.
fn parse_structured_template(pattern: &str) -> Option<(String, String)> {
    let inner = pattern.strip_prefix('{')?.strip_suffix('}')?;
    let (raw_scope, raw_template) = inner.split_once(':')?;

    let scope = raw_scope.trim();
    let template = raw_template.trim();
    if scope.is_empty() || template.is_empty() {
        None
    } else {
        Some((scope.to_string(), template.to_string()))
    }
}
/// Maps a scope label (case-insensitive) to a `PatternScope`.
///
/// # Errors
/// Fails for labels outside the accepted aliases; the message reports the
/// lower-cased label.
fn parse_scope(label: &str) -> Result<PatternScope> {
    let normalized = label.to_lowercase();
    let scope = match normalized.as_str() {
        "path" | "full" => PatternScope::Path,
        "parent" | "dir" | "dirname" | "folder" | "directory" => PatternScope::Parent,
        "filename" | "basename" => PatternScope::Basename,
        "stem" | "name" => PatternScope::Stem,
        other => bail!(
            "Unknown pattern scope '{}'. Expected one of path, parent, filename, stem",
            other
        ),
    };
    Ok(scope)
}
/// Translates a glob-like template containing exactly one `{id}` placeholder
/// into an anchored regular-expression source string.
///
/// Translation rules:
/// * `**` matches anything; `*` matches anything except `/` under
///   `PatternScope::Path` and anything under the other scopes.
/// * `?` matches one character.
/// * `{id}` becomes the named capture group `id`.
/// * Any other `{...}` is kept as literal text, braces included.
/// * An opening brace that is never closed is kept as literal text without
///   inventing a closing brace.
/// * The characters `. + ^ $ ( ) [ ] | \` are escaped.
///
/// # Errors
/// Fails when the template has no `{id}` placeholder or more than one.
fn build_regex_from_template(template: &str, scope: PatternScope) -> Result<String> {
    if !template.contains("{id}") {
        bail!("Template must include {{id}} placeholder");
    }

    let mut regex = String::from("^");
    let mut chars = template.chars().peekable();
    let mut found_id = false;

    while let Some(ch) = chars.next() {
        match ch {
            '*' => {
                if chars.peek() == Some(&'*') {
                    chars.next();
                    regex.push_str(".*");
                } else {
                    match scope {
                        // A single star must not cross directory boundaries.
                        PatternScope::Path => regex.push_str("[^/]*"),
                        _ => regex.push_str(".*"),
                    }
                }
            }
            '?' => regex.push('.'),
            '{' => {
                // Read up to the matching '}', remembering whether it exists.
                let mut name = String::new();
                let mut closed = false;
                for next_ch in chars.by_ref() {
                    if next_ch == '}' {
                        closed = true;
                        break;
                    }
                    name.push(next_ch);
                }

                if !closed {
                    // Unterminated brace: match exactly what was typed.
                    regex.push_str(&escape_literal(&format!("{{{}", name)));
                } else if name == "id" {
                    if found_id {
                        bail!("Template may contain only one {{id}} placeholder");
                    }
                    found_id = true;
                    let capture = match scope {
                        PatternScope::Path => r"(?P<id>[^/]+)",
                        _ => r"(?P<id>[A-Za-z0-9._-]+)",
                    };
                    regex.push_str(capture);
                } else {
                    regex.push_str(&escape_literal(&format!("{{{}}}", name)));
                }
            }
            '.' | '+' | '^' | '$' | '(' | ')' | '[' | ']' | '|' | '\\' => {
                regex.push('\\');
                regex.push(ch);
            }
            _ => regex.push(ch),
        }
    }

    if !found_id {
        bail!("Template must include {{id}} placeholder");
    }
    regex.push('$');
    Ok(regex)
}
/// Escapes `text` so it matches literally when embedded in a regular
/// expression.
///
/// Every character in `. + * ? ^ $ ( ) [ ] { } | \` is prefixed with a
/// backslash; everything else is copied unchanged. `*` and `?` are included
/// because callers pass arbitrary file-name fragments, and an unescaped
/// quantifier would silently change what the generated expression matches.
fn escape_literal(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for ch in text.chars() {
        if matches!(
            ch,
            '.' | '+' | '*' | '?' | '^' | '$' | '(' | ')' | '[' | ']' | '{' | '}' | '|' | '\\'
        ) {
            escaped.push('\\');
        }
        escaped.push(ch);
    }
    escaped
}
/// Character-class category of a name fragment, used by the pattern
/// heuristics to decide whether a token looks like an identifier.
///
/// NOTE(review): the classifier that assigns these values (`classify_token`)
/// is not part of this view; the per-variant notes below follow the regex
/// character classes that `suggest_directory_candidates` emits for them.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
enum TokenClass {
/// Emitted as `\d`.
Numeric,
/// Emitted as `[A-Fa-f0-9]`.
Hex,
/// Emitted as `[A-Za-z]`.
Alpha,
/// Emitted as `[A-Za-z0-9]`.
Alphanumeric,
/// Emitted as `[^/]`.
Other,
}
/// Breaks each scanned path into file name, stem and parent directory name.
///
/// Files are dropped when their name is not valid UTF-8 or starts with `.`
/// (hidden files).
fn collect_file_entries(files: &[FileInfo]) -> Vec<FileEntry> {
    let mut entries = Vec::new();
    for info in files {
        let path = Path::new(&info.path);

        let file_name = match path.file_name().and_then(|name| name.to_str()) {
            Some(name) if !name.starts_with('.') => name.to_string(),
            _ => continue,
        };
        let stem = path
            .file_stem()
            .and_then(|s| s.to_str())
            .map(String::from);
        let parent = path
            .parent()
            .and_then(|p| p.file_name())
            .and_then(|s| s.to_str())
            .map(String::from);

        entries.push(FileEntry {
            file_name,
            stem,
            parent,
        });
    }
    entries
}
fn coalesce_candidates(mut candidates: Vec<PatternCandidate>) -> Vec<PatternCandidate> {
if candidates.is_empty() {
return candidates;
}
let mut map: HashMap<String, PatternCandidate> = HashMap::new();
for candidate in candidates.drain(..) {
map.entry(candidate.pattern.clone())
.and_modify(|existing| {
if candidate.coverage > existing.coverage {
existing.coverage = candidate.coverage;
existing.description = candidate.description.clone();
existing.example = candidate.example.clone();
existing.regex_pattern = candidate.regex_pattern.clone();
}
for sample in &candidate.sample_extractions {
if !existing.sample_extractions.contains(sample)
&& existing.sample_extractions.len() < 5
{
existing.sample_extractions.push(sample.clone());
}
}
})
.or_insert(candidate);
}
let mut deduped: Vec<PatternCandidate> = map.into_values().collect();
deduped.sort_by(|a, b| {
b.coverage
.cmp(&a.coverage)
.then_with(|| a.pattern.cmp(&b.pattern))
});
deduped
}
/// Proposes patterns that take the participant ID from the name of the
/// directory containing each file.
///
/// Two candidates may be produced:
/// 1. `{parent:{id}}` - the whole directory name is the ID. Requires at
///    least two ID-like directory names and, when four or more directories
///    exist, at least 60% of them to be ID-like.
/// 2. `{parent:PREFIX{id}}` - emitted in addition when all ID-like
///    directories share a prefix of at least two bytes followed by
///    equal-length numeric, hexadecimal or alphanumeric remainders. The
///    remainder is the ID in the template, the samples and the regex alike.
///
/// Directories whose names start with `.` are ignored. Directories are
/// processed in name order so examples are reproducible between runs.
fn suggest_directory_candidates(entries: &[FileEntry]) -> Vec<PatternCandidate> {
    // Group files by the name of their immediate parent directory.
    let mut parent_map: HashMap<&str, Vec<&FileEntry>> = HashMap::new();
    for entry in entries {
        if let Some(parent) = entry.parent.as_deref() {
            if parent.starts_with('.') {
                continue;
            }
            parent_map.entry(parent).or_default().push(entry);
        }
    }
    if parent_map.len() < 2 {
        return Vec::new();
    }

    let mut id_like_parents: Vec<(&str, &Vec<&FileEntry>)> = parent_map
        .iter()
        .filter_map(|(name, files)| {
            if is_id_like_parent(name) {
                Some((*name, files))
            } else {
                None
            }
        })
        .collect();
    // HashMap iteration order is unspecified; sort for stable examples.
    id_like_parents.sort_by(|a, b| a.0.cmp(b.0));
    if id_like_parents.len() < 2 {
        return Vec::new();
    }

    let id_like_parent_ratio = id_like_parents.len() as f64 / parent_map.len() as f64;
    if parent_map.len() >= 4 && id_like_parent_ratio < 0.6 {
        return Vec::new();
    }
    let id_like_coverage: usize = id_like_parents.iter().map(|(_, files)| files.len()).sum();
    if id_like_coverage == 0 {
        return Vec::new();
    }

    let mut candidates = Vec::new();

    // Candidate 1: the whole directory name is the ID.
    let mut sample_extractions = Vec::new();
    let mut example = String::new();
    for (parent, files) in &id_like_parents {
        if let Some(entry) = files.first() {
            let source = format!("{}/{}", parent, entry.file_name);
            if example.is_empty() {
                example = source.clone();
            }
            sample_extractions.push((source, (*parent).to_string()));
            if sample_extractions.len() >= 3 {
                break;
            }
        }
    }
    if !sample_extractions.is_empty() {
        candidates.push(PatternCandidate {
            pattern: "{parent:{id}}".to_string(),
            regex_pattern: r".*/(?P<id>[A-Za-z0-9._-]+)/[^/]+$".to_string(),
            description: "Use parent directory names as participant IDs".to_string(),
            example,
            sample_extractions,
            coverage: id_like_coverage,
        });
    }

    // Candidate 2: a shared prefix followed by a fixed-width ID.
    let parent_names: Vec<&str> = id_like_parents.iter().map(|(name, _)| *name).collect();
    let prefix = longest_common_prefix(&parent_names);
    if prefix.len() < 2 {
        return candidates;
    }

    // Every directory must leave a non-empty remainder after the prefix.
    let mut remainders: Vec<&str> = Vec::with_capacity(parent_names.len());
    for parent in &parent_names {
        match parent.strip_prefix(prefix.as_str()) {
            Some(rest) if !rest.is_empty() => remainders.push(rest),
            _ => return candidates,
        }
    }

    let class = classify_token_slice(&remainders);
    let unique_remainders: HashSet<&str> = remainders.iter().copied().collect();
    let remainder_len = remainders.first().map(|r| r.len()).unwrap_or(1);
    let consistent_length = remainders.iter().all(|r| r.len() == remainder_len);
    let id_like_class = matches!(
        class,
        TokenClass::Numeric | TokenClass::Hex | TokenClass::Alphanumeric
    );
    if !(id_like_class && unique_remainders.len() >= 2 && consistent_length) {
        return candidates;
    }

    // `remainders` is parallel to `id_like_parents` (both in name order).
    let mut sample_extractions = Vec::new();
    let mut example = String::new();
    for ((parent, files), remainder) in id_like_parents.iter().zip(&remainders) {
        if let Some(entry) = files.first() {
            let source = format!("{}/{}", parent, entry.file_name);
            if example.is_empty() {
                example = source.clone();
            }
            sample_extractions.push((source, (*remainder).to_string()));
            if sample_extractions.len() >= 3 {
                break;
            }
        }
    }
    if sample_extractions.is_empty() {
        return candidates;
    }

    let descriptor = match class {
        TokenClass::Numeric => "digits",
        TokenClass::Hex => "hexadecimal IDs",
        TokenClass::Alpha => "letters",
        TokenClass::Alphanumeric => "alphanumeric IDs",
        TokenClass::Other => "IDs",
    };
    let capture_pattern = match class {
        TokenClass::Numeric => format!("\\d{{{}}}", remainder_len),
        TokenClass::Hex => format!("[A-Fa-f0-9]{{{}}}", remainder_len),
        TokenClass::Alphanumeric => format!("[A-Za-z0-9]{{{}}}", remainder_len),
        TokenClass::Alpha => format!("[A-Za-z]{{{}}}", remainder_len),
        TokenClass::Other => format!("[^/]{{{}}}", remainder_len),
    };
    // The prefix sits outside the capture group so the regex extracts the
    // same ID as the template and the sample extractions.
    let regex_pattern = format!(
        r".*/{}(?P<id>{})/[^/]+$",
        escape_literal(&prefix),
        capture_pattern
    );

    candidates.push(PatternCandidate {
        pattern: format!("{{parent:{}{{id}}}}", sanitize_glob_fragment(&prefix)),
        regex_pattern,
        description: format!(
            "Directories with prefix '{}' followed by {}",
            prefix, descriptor
        ),
        example,
        sample_extractions,
        // Every ID-like directory shares the prefix, so coverage is the same
        // as for the first candidate.
        coverage: id_like_coverage,
    });

    candidates
}
/// Proposes a pattern for file stems that start with a run of three or more
/// digits, optionally followed by `_`, `-` or `.`.
///
/// A suggestion is produced only when at least three stems qualify, at least
/// 70% of them share the same digit count, and those contain at least two
/// distinct IDs. Digit runs are measured in characters, not bytes, so the
/// emitted `\d{N}` count is also correct for non-ASCII digits matched by
/// `\d`.
fn suggest_leading_numeric_candidates(entries: &[FileEntry]) -> Vec<PatternCandidate> {
    let re = Regex::new(r"^(\d{3,})").unwrap();

    // (entry, digit run, number of digit characters, delimiter after the run)
    let mut records: Vec<(&FileEntry, &str, usize, Option<char>)> = Vec::new();
    for entry in entries {
        if let Some(stem) = entry.stem.as_deref() {
            if let Some(found) = re.find(stem) {
                let digits = found.as_str();
                // Slice by byte offset, then take one char: the byte length
                // of the run is not a valid character index.
                let delimiter = stem[found.end()..]
                    .chars()
                    .next()
                    .filter(|c| matches!(c, '_' | '-' | '.'));
                records.push((entry, digits, digits.chars().count(), delimiter));
            }
        }
    }
    if records.len() < 3 {
        return Vec::new();
    }

    let mut length_counts: HashMap<usize, usize> = HashMap::new();
    for (_, _, digit_count, _) in &records {
        *length_counts.entry(*digit_count).or_insert(0) += 1;
    }
    let (dominant_len, dominant_count) = match length_counts
        .into_iter()
        .max_by_key(|(_, count)| *count)
    {
        Some(dominant) => dominant,
        None => return Vec::new(),
    };
    if dominant_count < 2 {
        return Vec::new();
    }
    if dominant_count * 100 / records.len() < 70 {
        return Vec::new();
    }

    records.retain(|(_, _, digit_count, _)| *digit_count == dominant_len);
    let unique_ids: HashSet<&str> = records.iter().map(|(_, id, _, _)| *id).collect();
    if unique_ids.len() < 2 {
        return Vec::new();
    }

    let mut delimiter_counts: HashMap<char, usize> = HashMap::new();
    for (_, _, _, delimiter) in &records {
        if let Some(d) = delimiter {
            *delimiter_counts.entry(*d).or_insert(0) += 1;
        }
    }
    // Including the character in the key makes ties deterministic.
    let chosen_delimiter = delimiter_counts
        .into_iter()
        .max_by_key(|&(delim, count)| (count, delim))
        .map(|(delim, _)| delim);

    let mut sample_extractions = Vec::new();
    let mut example = String::new();
    for (entry, id, _, _) in &records {
        sample_extractions.push((entry.file_name.clone(), (*id).to_string()));
        if example.is_empty() {
            example = entry.file_name.clone();
        }
        if sample_extractions.len() >= 3 {
            break;
        }
    }

    let template = match chosen_delimiter {
        Some(d) => format!("{{id}}{}*", d),
        None => "{id}*".to_string(),
    };
    let pattern = format!("{{stem:{}}}", template);

    let delimiter_desc = match chosen_delimiter {
        Some('_') => "underscore",
        Some('-') => "dash",
        Some('.') => "dot",
        None => "",
        Some(_) => "delimiter",
    };
    let base_description = format!("Leading numeric ID ({} digits)", dominant_len);
    let description = if delimiter_desc.is_empty() {
        base_description
    } else {
        format!("{} before {}", base_description, delimiter_desc)
    };

    let regex_pattern = match chosen_delimiter {
        Some(d) => format!(
            "^(?P<id>\\d{{{}}}){}.*$",
            dominant_len,
            escape_literal(&d.to_string())
        ),
        None => format!("^(?P<id>\\d{{{}}}).*$", dominant_len),
    };

    vec![PatternCandidate {
        pattern,
        regex_pattern,
        description,
        example,
        sample_extractions,
        coverage: records.len(),
    }]
}
/// Proposes patterns for stems of the form `<letters><_ or -><digits>...`,
/// one candidate per distinct letter prefix.
///
/// A prefix qualifies when at least three stems use it and they carry at
/// least two distinct IDs.
fn suggest_alpha_prefix_numeric_candidates(entries: &[FileEntry]) -> Vec<PatternCandidate> {
    let prefixed_id = Regex::new(r"^([A-Za-z]{2,}[_-])(\d{3,})").unwrap();

    // Bucket matching files by their "letters + delimiter" prefix.
    let mut by_prefix: HashMap<String, Vec<(FileEntry, String)>> = HashMap::new();
    for entry in entries {
        let caps = match entry
            .stem
            .as_deref()
            .and_then(|stem| prefixed_id.captures(stem))
        {
            Some(caps) => caps,
            None => continue,
        };
        // Both groups are mandatory in the expression, so indexing is safe.
        let prefix = caps[1].to_string();
        let id = caps[2].to_string();
        by_prefix
            .entry(prefix)
            .or_default()
            .push((entry.clone(), id));
    }

    by_prefix
        .into_iter()
        .filter(|(_, group)| group.len() >= 3)
        .filter(|(_, group)| {
            let distinct: HashSet<&str> = group.iter().map(|(_, id)| id.as_str()).collect();
            distinct.len() >= 2
        })
        .map(|(prefix, group)| {
            let sample_extractions: Vec<(String, String)> = group
                .iter()
                .take(3)
                .map(|(entry, id)| (entry.file_name.clone(), id.clone()))
                .collect();
            let example = sample_extractions
                .iter()
                .map(|(name, _)| name)
                .find(|name| !name.is_empty())
                .cloned()
                .unwrap_or_default();

            // The digit quantifier spans the shortest to the longest ID seen.
            let min_len = group.iter().map(|(_, id)| id.len()).min().unwrap_or(1);
            let max_len = group.iter().map(|(_, id)| id.len()).max().unwrap_or(min_len);
            let capture = if min_len == max_len {
                format!("\\d{{{}}}", min_len)
            } else {
                format!("\\d{{{},{}}}", min_len, max_len)
            };

            PatternCandidate {
                pattern: format!("{{stem:{}}}", format!("{}{{id}}*", prefix)),
                regex_pattern: format!("^{}(?P<id>{}).*$", escape_literal(&prefix), capture),
                description: format!(
                    "Prefix '{}' followed by digits",
                    prefix.trim_end_matches(&['-', '_'][..])
                ),
                example,
                sample_extractions,
                coverage: group.len(),
            }
        })
        .collect()
}
/// Proposes patterns for stems in which the first run of three or more
/// digits is surrounded by identical text in several files.
///
/// Files are grouped by the exact text before and after the digit run; a
/// group qualifies with at least three files and two distinct IDs.
fn suggest_generic_numeric_candidates(entries: &[FileEntry]) -> Vec<PatternCandidate> {
    let digit_run = Regex::new(r"(\d{3,})").unwrap();

    // Key: (text before the digits, text after the digits).
    let mut by_context: HashMap<(String, String), Vec<(FileEntry, String)>> = HashMap::new();
    for entry in entries {
        let stem = match &entry.stem {
            Some(stem) => stem,
            None => continue,
        };
        if let Some(found) = digit_run.find(stem) {
            let key = (
                stem[..found.start()].to_string(),
                stem[found.end()..].to_string(),
            );
            by_context
                .entry(key)
                .or_default()
                .push((entry.clone(), found.as_str().to_string()));
        }
    }

    let mut candidates = Vec::new();
    for ((before, after), group) in by_context {
        let distinct_ids = group
            .iter()
            .map(|(_, id)| id.as_str())
            .collect::<HashSet<_>>()
            .len();
        if group.len() < 3 || distinct_ids < 2 {
            continue;
        }

        let sample_extractions: Vec<(String, String)> = group
            .iter()
            .take(3)
            .map(|(entry, id)| (entry.file_name.clone(), id.clone()))
            .collect();
        let example = sample_extractions
            .iter()
            .map(|(name, _)| name)
            .find(|name| !name.is_empty())
            .cloned()
            .unwrap_or_default();

        let pattern = format!(
            "{{stem:{}}}",
            format!(
                "{}{{id}}{}",
                sanitize_glob_fragment(&before),
                sanitize_glob_fragment(&after)
            )
        );

        // The digit quantifier spans the shortest to the longest ID seen.
        let min_len = group.iter().map(|(_, id)| id.len()).min().unwrap_or(3);
        let max_len = group.iter().map(|(_, id)| id.len()).max().unwrap_or(min_len);
        let capture = if min_len == max_len {
            format!("\\d{{{}}}", min_len)
        } else {
            format!("\\d{{{},{}}}", min_len, max_len)
        };
        let regex_pattern = format!(
            "^{}(?P<id>{}){}$",
            escape_literal(&before),
            capture,
            escape_literal(&after)
        );

        candidates.push(PatternCandidate {
            pattern,
            regex_pattern,
            description: "Repeated numeric sequence".to_string(),
            example,
            sample_extractions,
            coverage: group.len(),
        });
    }
    candidates
}
/// Returns `true` when a directory name could plausibly serve as an ID: it is at least
/// three bytes long and `classify_token` reports it as numeric, hex, or alphanumeric.
fn is_id_like_parent(name: &str) -> bool {
    let long_enough = name.len() >= 3;
    long_enough
        && match classify_token(name) {
            TokenClass::Numeric | TokenClass::Hex | TokenClass::Alphanumeric => true,
            _ => false,
        }
}
/// Escapes glob metacharacters in `fragment` by wrapping each of `*`, `?`, `{`, `}`,
/// `[` and `]` in square brackets; every other character is copied unchanged.
fn sanitize_glob_fragment(fragment: &str) -> String {
    fragment
        .chars()
        .fold(String::with_capacity(fragment.len()), |mut escaped, symbol| {
            let is_meta = matches!(symbol, '*' | '?' | '{' | '}' | '[' | ']');
            if is_meta {
                escaped.push('[');
            }
            escaped.push(symbol);
            if is_meta {
                escaped.push(']');
            }
            escaped
        })
}
/// Returns the longest prefix shared by every string in `strings`.
///
/// An empty slice yields an empty string. The comparison is done on bytes, and the
/// result is then trimmed back to a UTF-8 character boundary so that strings which
/// diverge in the middle of a multi-byte character never cause a slicing panic.
fn longest_common_prefix(strings: &[&str]) -> String {
    let (first, rest) = match strings.split_first() {
        Some((&first, rest)) => (first, rest),
        None => return String::new(),
    };

    let mut end = first.len();
    for other in rest {
        end = first
            .bytes()
            .zip(other.bytes())
            .take(end)
            .take_while(|(a, b)| a == b)
            .count();
        if end == 0 {
            break;
        }
    }

    // Two different characters can share leading bytes (e.g. "é" and "è"); back off
    // until `end` sits on a character boundary. Index 0 is always a boundary.
    while !first.is_char_boundary(end) {
        end -= 1;
    }
    first[..end].to_string()
}
/// Classifies a single token by the kinds of ASCII characters it contains.
///
/// `-` and `_` are permitted but do not count towards any class. Any other character
/// outside ASCII letters and digits makes the token `Other`, as does an empty token or
/// one made only of separators. A token with both letters and digits is `Hex` when
/// every character is a hexadecimal digit and `Alphanumeric` otherwise.
fn classify_token(token: &str) -> TokenClass {
    let is_permitted = |c: char| c.is_ascii_alphanumeric() || c == '-' || c == '_';
    if token.is_empty() || !token.chars().all(is_permitted) {
        return TokenClass::Other;
    }

    let contains_letter = token.chars().any(|c| c.is_ascii_alphabetic());
    let contains_digit = token.chars().any(|c| c.is_ascii_digit());
    match (contains_letter, contains_digit) {
        (true, true) => {
            if token.chars().all(|c| c.is_ascii_hexdigit()) {
                TokenClass::Hex
            } else {
                TokenClass::Alphanumeric
            }
        }
        (false, true) => TokenClass::Numeric,
        (true, false) => TokenClass::Alpha,
        (false, false) => TokenClass::Other,
    }
}
/// Folds the classes of several tokens into one class.
///
/// If every token has the same class, that class is returned. Otherwise any `Other`
/// token makes the result `Other`; a mixture involving `Alphanumeric`, `Hex`, or both
/// `Alpha` and `Numeric` becomes `Alphanumeric`. An empty slice yields `Other`.
fn classify_token_slice(tokens: &[&str]) -> TokenClass {
    let seen: HashSet<TokenClass> = tokens
        .iter()
        .map(|token| classify_token(token))
        .collect();

    if seen.len() == 1 {
        if let Some(only) = seen.iter().next() {
            return *only;
        }
    }

    let has = |class: TokenClass| seen.contains(&class);
    if has(TokenClass::Other) {
        return TokenClass::Other;
    }

    let mixed = has(TokenClass::Alphanumeric)
        || has(TokenClass::Hex)
        || (has(TokenClass::Alpha) && has(TokenClass::Numeric));
    if mixed {
        TokenClass::Alphanumeric
    } else if has(TokenClass::Alpha) {
        TokenClass::Alpha
    } else if has(TokenClass::Numeric) {
        TokenClass::Numeric
    } else {
        TokenClass::Other
    }
}
/// One row of the `participants` table together with the number of files linked to it,
/// as produced by `list_participants`.
#[derive(Debug, Serialize, Deserialize)]
pub struct ParticipantRecord {
/// Database row id (`participants.id`); this is the value `files.participant_id` is joined on.
pub id: i64,
/// The `participants.participant_id` column.
pub participant_id: String,
/// The `participants.created_at` column, returned as text.
pub created_at: String,
/// Number of `files` rows joined to this participant (`COUNT(f.id)`).
pub file_count: i64,
}
/// Lists every participant with the number of files linked to it, newest first.
///
/// # Errors
/// Returns an error if the statement cannot be prepared or a row cannot be read.
pub fn list_participants(db: &BioVaultDb) -> Result<Vec<ParticipantRecord>> {
    let sql = "SELECT p.id, p.participant_id, p.created_at, COUNT(f.id) as file_count
FROM participants p
LEFT JOIN files f ON f.participant_id = p.id
GROUP BY p.id, p.participant_id, p.created_at
ORDER BY p.created_at DESC";
    let mut stmt = db.conn.prepare(sql)?;
    let rows = stmt.query_map([], |row| {
        let id = row.get(0)?;
        let participant_id = row.get(1)?;
        let created_at = row.get(2)?;
        let file_count = row.get(3)?;
        Ok(ParticipantRecord {
            id,
            participant_id,
            created_at,
            file_count,
        })
    })?;

    let mut participants = Vec::new();
    for record in rows {
        participants.push(record?);
    }
    Ok(participants)
}
/// Deletes a participant and every file row linked to it.
///
/// Both deletions run inside one transaction, so either the participant and its files
/// are removed together or nothing changes.
///
/// Returns the number of file rows that were deleted.
///
/// # Errors
/// Returns an error if no participant has the given `id` (in which case nothing is
/// deleted) or if any database operation fails.
pub fn delete_participant(db: &BioVaultDb, id: i64) -> Result<usize> {
    let tx = db.conn.unchecked_transaction()?;
    let files_deleted =
        tx.execute("DELETE FROM files WHERE participant_id = ?1", params![id])?;
    let rows = tx.execute("DELETE FROM participants WHERE id = ?1", params![id])?;
    if rows == 0 {
        // Returning without commit drops the transaction, which rolls it back.
        anyhow::bail!("Participant with id {} not found", id);
    }
    tx.commit()?;
    Ok(files_deleted)
}
/// Deletes several participants, and the file rows linked to them, in one transaction.
///
/// Returns the number of participant rows that were deleted; ids that match no
/// participant are ignored. An empty `ids` slice is a no-op returning `0`.
///
/// # Errors
/// Returns an error if any database operation fails; in that case no rows are removed.
pub fn delete_participants_bulk(db: &BioVaultDb, ids: &[i64]) -> Result<usize> {
    if ids.is_empty() {
        return Ok(0);
    }
    // One positional placeholder per id; the values themselves are always bound.
    let placeholders = ids.iter().map(|_| "?").collect::<Vec<_>>().join(",");
    let delete_files_query = format!(
        "DELETE FROM files WHERE participant_id IN ({})",
        placeholders
    );
    let delete_participants_query =
        format!("DELETE FROM participants WHERE id IN ({})", placeholders);

    // Run both statements atomically so a failure in the second one cannot leave
    // participants behind whose files are already gone.
    let tx = db.conn.unchecked_transaction()?;
    tx.execute(&delete_files_query, rusqlite::params_from_iter(ids.iter()))?;
    let rows = tx.execute(
        &delete_participants_query,
        rusqlite::params_from_iter(ids.iter()),
    )?;
    tx.commit()?;
    Ok(rows)
}
pub fn delete_file(db: &BioVaultDb, file_id: i64) -> Result<()> {
let rows = db
.conn
.execute("DELETE FROM files WHERE id = ?1", params![file_id])?;
if rows == 0 {
anyhow::bail!("File with id {} not found", file_id);
}
Ok(())
}
/// Deletes every file row whose id appears in `ids` and returns how many rows were
/// removed. An empty slice is a no-op returning `0`.
pub fn delete_files_bulk(db: &BioVaultDb, ids: &[i64]) -> Result<usize> {
    if ids.is_empty() {
        return Ok(0);
    }
    // One bound placeholder per id.
    let placeholders = vec!["?"; ids.len()].join(",");
    let sql = format!("DELETE FROM files WHERE id IN ({})", placeholders);
    let removed = db
        .conn
        .execute(&sql, rusqlite::params_from_iter(ids.iter()))?;
    Ok(removed)
}
pub fn link_file_to_participant(
db: &BioVaultDb,
file_id: i64,
participant_id: &str,
) -> Result<FileRecord> {
let db_participant_id = get_or_create_participant(db, participant_id)?;
let rows = db.conn.execute(
"UPDATE files SET participant_id = ?1, updated_at = CURRENT_TIMESTAMP WHERE id = ?2",
params![db_participant_id, file_id],
)?;
if rows == 0 {
anyhow::bail!("File with id {} not found", file_id);
}
get_file_by_id(db, file_id)?.ok_or_else(|| anyhow::anyhow!("File not found after update"))
}
/// Links many files to participants in a single transaction.
///
/// `file_participant_map` maps a file path to a participant identifier. A participant
/// that does not exist yet is inserted. Returns the total number of file rows updated;
/// paths that match no file row contribute nothing.
///
/// # Errors
/// Returns an error if any database operation fails; the transaction is then dropped
/// without being committed.
pub fn link_files_bulk(
    db: &BioVaultDb,
    file_participant_map: &std::collections::HashMap<String, String>,
) -> Result<usize> {
    if file_participant_map.is_empty() {
        return Ok(0);
    }

    let tx = db.conn.unchecked_transaction()?;
    let mut updated = 0;
    for (file_path, participant_id) in file_participant_map {
        let lookup = tx.query_row(
            "SELECT id FROM participants WHERE participant_id = ?1",
            params![participant_id],
            |row| row.get::<_, i64>(0),
        );
        let db_participant_id = match lookup {
            Ok(existing) => existing,
            Err(rusqlite::Error::QueryReturnedNoRows) => {
                // Unknown participant: create it and use the new row id.
                tx.execute(
                    "INSERT INTO participants (participant_id) VALUES (?1)",
                    params![participant_id],
                )?;
                tx.last_insert_rowid()
            }
            Err(other) => return Err(other.into()),
        };

        updated += tx.execute(
            "UPDATE files SET participant_id = ?1, updated_at = CURRENT_TIMESTAMP WHERE file_path = ?2",
            params![db_participant_id, file_path],
        )?;
    }
    tx.commit()?;
    Ok(updated)
}
pub fn unlink_file(db: &BioVaultDb, file_id: i64) -> Result<FileRecord> {
let rows = db.conn.execute(
"UPDATE files SET participant_id = NULL, updated_at = CURRENT_TIMESTAMP WHERE id = ?1",
params![file_id],
)?;
if rows == 0 {
anyhow::bail!("File with id {} not found", file_id);
}
get_file_by_id(db, file_id)?.ok_or_else(|| anyhow::anyhow!("File not found after update"))
}
/// Computes the BLAKE3 digest of the file at `path` and returns it as lowercase hex.
///
/// The file is streamed through the hasher in fixed-size chunks, so memory use stays
/// constant regardless of file size.
///
/// # Errors
/// Returns an error if the file cannot be opened or read.
pub fn hash_file(path: &str) -> Result<String> {
    let mut file = fs::File::open(path)?;
    let mut hasher = blake3::Hasher::new();
    let mut buffer = [0u8; 64 * 1024];
    loop {
        // Fully-qualified call so this does not rely on `std::io::Read` being imported.
        let bytes_read = match std::io::Read::read(&mut file, &mut buffer) {
            Ok(0) => break,
            Ok(n) => n,
            // A read interrupted by a signal is retried rather than reported.
            Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e.into()),
        };
        hasher.update(&buffer[..bytes_read]);
    }
    Ok(hasher.finalize().to_hex().to_string())
}
pub fn import_files_as_pending(db: &BioVaultDb, files: Vec<CsvFileImport>) -> Result<ImportResult> {
let conn = db.connection();
let mut imported = 0;
let mut skipped = 0;
let mut errors = Vec::new();
for file_info in files {
let existing: Option<i64> = conn
.query_row(
"SELECT id FROM files WHERE file_path = ?1",
[&file_info.file_path],
|row| row.get(0),
)
.ok();
if existing.is_some() {
skipped += 1;
continue;
}
let participant_id = if let Some(pid) = &file_info.participant_id {
match get_or_create_participant(db, pid) {
Ok(id) => Some(id),
Err(e) => {
errors.push(format!("Failed to create participant {}: {}", pid, e));
continue;
}
}
} else {
None
};
let file_size = match std::fs::metadata(&file_info.file_path) {
Ok(meta) => Some(meta.len() as i64),
Err(_) => None,
};
let file_type = Path::new(&file_info.file_path)
.extension()
.and_then(|e| e.to_str())
.map(|e| format!(".{}", e));
let result = conn.execute(
"INSERT INTO files (participant_id, file_path, file_hash, file_type, file_size, data_type, status, queue_added_at, created_at, updated_at)
VALUES (?1, ?2, ?3, ?4, ?5, ?6, 'pending', CURRENT_TIMESTAMP, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)",
rusqlite::params![
participant_id,
&file_info.file_path,
"pending", file_type,
file_size,
file_info.data_type.as_deref().unwrap_or("Unknown"),
],
);
match result {
Ok(_) => {
let data_type = file_info.data_type.as_deref().unwrap_or("Unknown");
if data_type == "Genotype"
&& (file_info.source.is_some() || file_info.grch_version.is_some())
{
let file_id = conn.last_insert_rowid();
let meta_result = conn.execute(
"INSERT INTO genotype_metadata (file_id, source, grch_version, created_at, updated_at)
VALUES (?1, ?2, ?3, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)",
rusqlite::params![
file_id,
file_info.source.as_deref(),
file_info.grch_version.as_deref(),
],
);
if let Err(e) = meta_result {
errors.push(format!(
"Failed to create metadata for {}: {}",
file_info.file_path, e
));
}
}
imported += 1;
}
Err(e) => errors.push(format!("Failed to import {}: {}", file_info.file_path, e)),
}
}
Ok(ImportResult {
imported,
skipped,
errors,
files: Vec::new(), })
}
/// A row of the `files` table that is still waiting to be processed, as returned by
/// `get_pending_files`.
#[derive(Debug, Clone)]
pub struct PendingFile {
/// The `files.id` column.
pub id: i64,
/// The `files.file_path` column.
pub file_path: String,
/// The `files.data_type` column; `None` when the column is NULL.
pub data_type: Option<String>,
/// The `files.participant_id` column; `None` when the file is not linked to a participant.
pub participant_id: Option<i64>,
}
/// Returns up to `limit` files whose status is `pending`, ordered by the time they
/// were added to the queue (oldest first).
pub fn get_pending_files(db: &BioVaultDb, limit: usize) -> Result<Vec<PendingFile>> {
    let conn = db.connection();
    let sql = "SELECT id, file_path, data_type, participant_id
FROM files
WHERE status = 'pending'
ORDER BY queue_added_at ASC
LIMIT ?1";
    let mut stmt = conn.prepare(sql)?;
    let rows = stmt.query_map([limit], |row| {
        let id = row.get(0)?;
        let file_path = row.get(1)?;
        let data_type = row.get(2)?;
        let participant_id = row.get(3)?;
        Ok(PendingFile {
            id,
            file_path,
            data_type,
            participant_id,
        })
    })?;

    let mut pending = Vec::new();
    for row in rows {
        pending.push(row?);
    }
    Ok(pending)
}
pub fn update_file_status(
db: &BioVaultDb,
file_id: i64,
status: &str,
error: Option<&str>,
) -> Result<()> {
let conn = db.connection();
conn.execute(
"UPDATE files
SET status = ?1,
processing_error = ?2,
updated_at = CURRENT_TIMESTAMP
WHERE id = ?3",
rusqlite::params![status, error, file_id],
)?;
Ok(())
}
pub fn get_genotype_metadata(db: &BioVaultDb, file_id: i64) -> Result<Option<GenotypeMetadata>> {
let conn = db.connection();
let result = conn.query_row(
"SELECT source, grch_version, row_count, chromosome_count, inferred_sex
FROM genotype_metadata
WHERE file_id = ?1",
params![file_id],
|row| {
Ok(GenotypeMetadata {
data_type: "Genotype".to_string(),
source: row.get(0)?,
grch_version: row.get(1)?,
row_count: row.get(2)?,
chromosome_count: row.get(3)?,
inferred_sex: row.get(4)?,
})
},
);
match result {
Ok(meta) => Ok(Some(meta)),
Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None),
Err(e) => Err(e.into()),
}
}
/// Marks a queued file as `complete`, storing its hash and, when provided, its metadata.
///
/// Without metadata only the hash and status are updated. With metadata the file's
/// data type is updated too; for `Genotype` files the `genotype_metadata` row is
/// inserted or replaced and, if a sex was inferred, it is written to the linked
/// participant.
pub fn update_file_from_queue(
    db: &BioVaultDb,
    file_id: i64,
    hash: &str,
    metadata: Option<&GenotypeMetadata>,
) -> Result<()> {
    let conn = db.connection();

    let meta = match metadata {
        Some(meta) => meta,
        None => {
            conn.execute(
                "UPDATE files
SET file_hash = ?1,
status = 'complete',
updated_at = CURRENT_TIMESTAMP
WHERE id = ?2",
                rusqlite::params![hash, file_id],
            )?;
            return Ok(());
        }
    };

    conn.execute(
        "UPDATE files
SET file_hash = ?1,
data_type = ?2,
status = 'complete',
updated_at = CURRENT_TIMESTAMP
WHERE id = ?3",
        rusqlite::params![hash, meta.data_type, file_id],
    )?;

    if meta.data_type != "Genotype" {
        return Ok(());
    }

    conn.execute(
        "INSERT OR REPLACE INTO genotype_metadata
(file_id, source, grch_version, row_count, chromosome_count, inferred_sex, created_at, updated_at)
VALUES (?1, ?2, ?3, ?4, ?5, ?6, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)",
        rusqlite::params![
            file_id,
            meta.source,
            meta.grch_version,
            meta.row_count,
            meta.chromosome_count,
            meta.inferred_sex,
        ],
    )?;

    // Propagate the inferred sex to the participant that owns this file, if any.
    if let Some(ref sex) = meta.inferred_sex {
        conn.execute(
            "UPDATE participants
SET inferred_sex = ?1
WHERE id = (SELECT participant_id FROM files WHERE id = ?2)
AND participant_id IS NOT NULL",
            rusqlite::params![sex, file_id],
        )?;
    }
    Ok(())
}
/// Test-only helper: streams a file through SHA-256 in 8 KiB chunks and returns the
/// digest as lowercase hex.
#[cfg(test)]
fn calculate_file_hash(path: &str) -> Result<String> {
    let mut source = std::fs::File::open(path)?;
    let mut digest = Sha256::new();
    let mut chunk = [0u8; 8192];
    loop {
        match source.read(&mut chunk)? {
            0 => break,
            filled => digest.update(&chunk[..filled]),
        }
    }
    let hex = format!("{:x}", digest.finalize());
    Ok(hex)
}
#[cfg(test)]
mod tests {
use super::*;
use std::fs;
use tempfile::TempDir;
#[test]
fn test_calculate_file_hash() {
let temp = TempDir::new().unwrap();
let file_path = temp.path().join("test.txt");
fs::write(&file_path, b"hello world").unwrap();
let hash = calculate_file_hash(file_path.to_str().unwrap()).unwrap();
assert_eq!(
hash,
"b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9"
);
}
#[test]
fn test_scan_directory() {
let temp = TempDir::new().unwrap();
fs::write(temp.path().join("file1.txt"), b"content1").unwrap();
fs::write(temp.path().join("file2.txt"), b"content2").unwrap();
fs::write(temp.path().join("file3.vcf"), b"content3").unwrap();
let result = scan(temp.path().to_str().unwrap(), None, false).unwrap();
assert_eq!(result.total_files, 3);
assert_eq!(result.extensions.len(), 2);
let result = scan(temp.path().to_str().unwrap(), Some(".txt"), false).unwrap();
assert_eq!(result.total_files, 2);
}
#[test]
fn test_suggest_patterns_leading_numeric() {
let temp = TempDir::new().unwrap();
let filenames = [
"000000_carika.txt",
"103704_X_X_GSAv3-DTC_GRCh38-07-01-2025.txt",
"111442_X_X_GSAv3-DTC_GRCh38-07-01-2025.txt",
"117292_X_X_GSAv3-DTC_GRCh38-07-01-2025.txt",
"123364_X_X_GSAv3-DTC_GRCh38-07-01-2025.txt",
"256789_Combined_Genome.txt",
"356789_Eric_Uhden_Full_20110718111059.txt",
];
for filename in filenames.iter() {
fs::write(temp.path().join(filename), b"content").unwrap();
}
let result = suggest_patterns(temp.path().to_str().unwrap(), Some(".txt"), false).unwrap();
assert!(!result
.suggestions
.iter()
.any(|suggestion| suggestion.pattern == "{parent:{id}}"));
assert!(result
.suggestions
.iter()
.any(|suggestion| suggestion.pattern == "{stem:{id}_*}"));
let lead_numeric = result
.suggestions
.iter()
.find(|suggestion| suggestion.pattern == "{stem:{id}_*}")
.unwrap();
assert!(lead_numeric
.sample_extractions
.iter()
.any(|(_, id)| id.len() == 6));
assert_eq!(lead_numeric.regex_pattern, "^(?P<id>\\d{6})_.*$");
}
#[test]
fn test_suggest_patterns_directory_prefix() {
let temp = TempDir::new().unwrap();
let dirs = [
("hu17DFDB", vec!["23andMe_Genotyping.txt"]),
("hu44DCFF", vec!["JKP001_genotypes.txt"]),
("hu2D53F2", vec!["hu2D53F2_20120421013417.txt"]),
(
"hu836D0A",
vec!["genome_Maureen_Markov_Full_20100823192336.txt"],
),
("huB714CA", vec!["huB714CA_20110726215545.txt"]),
];
for (dir, files) in dirs.iter() {
let dir_path = temp.path().join(dir);
fs::create_dir_all(&dir_path).unwrap();
for file in files {
fs::write(dir_path.join(file), b"content").unwrap();
}
}
let result = suggest_patterns(temp.path().to_str().unwrap(), None, true).unwrap();
assert!(result
.suggestions
.iter()
.any(|suggestion| suggestion.pattern == "{parent:{id}}"));
let parent_pattern = result
.suggestions
.iter()
.find(|suggestion| suggestion.pattern == "{parent:{id}}")
.unwrap();
assert_eq!(
parent_pattern.regex_pattern,
r".*/(?P<id>[A-Za-z0-9._-]+)/[^/]+$"
);
assert!(result
.suggestions
.iter()
.any(|suggestion| suggestion.pattern == "{parent:hu{id}}"));
let dir_prefix = result
.suggestions
.iter()
.find(|suggestion| suggestion.pattern == "{parent:hu{id}}")
.unwrap();
assert!(dir_prefix
.sample_extractions
.iter()
.all(|(_, id)| id.len() == 6));
assert_eq!(
dir_prefix.regex_pattern,
r".*/(?P<id>hu[A-Fa-f0-9]{6})/[^/]+$"
);
}
#[test]
fn test_extract_id_from_pattern_parent() {
let file_path = "/data/genotype_files/huE922FC/AncestryDNA.txt";
let id = extract_id_from_pattern(file_path, "{parent:{id}}")
.unwrap()
.unwrap();
assert_eq!(id, "huE922FC");
let id = extract_id_from_pattern(file_path, "{dirname}")
.unwrap()
.unwrap();
assert_eq!(id, "huE922FC");
let id = extract_id_from_pattern(file_path, "{dir}")
.unwrap()
.unwrap();
assert_eq!(id, "huE922FC");
}
#[test]
fn test_extract_id_from_pattern_filename() {
let file_path = "/data/genotype_files/huE922FC/AncestryDNA.txt";
let id = extract_id_from_pattern(file_path, "{filename}")
.unwrap()
.unwrap();
assert_eq!(id, "AncestryDNA");
let id = extract_id_from_pattern(file_path, "{basename}")
.unwrap()
.unwrap();
assert_eq!(id, "AncestryDNA.txt");
}
#[test]
fn test_extract_id_from_pattern_legacy() {
let file_path = "/data/genotype_files/huE922FC/AncestryDNA.txt";
let id = extract_id_from_pattern(file_path, "{id}/*")
.unwrap()
.unwrap();
assert_eq!(id, "huE922FC");
}
#[test]
fn test_extract_id_from_pattern_filename_pattern() {
let file_path = "/data/files/123456_sample.txt";
let id = extract_id_from_pattern(file_path, "{stem:{id}_*}")
.unwrap()
.unwrap();
assert_eq!(id, "123456");
let file_path = "/data/files/ABC123_sample.txt";
let id = extract_id_from_pattern(file_path, "{stem:{id}_*}")
.unwrap()
.unwrap();
assert_eq!(id, "ABC123");
}
#[test]
fn test_extract_id_with_structured_template() {
let file_path = "/data/files/genome_Full_20120106210128.txt";
let id = extract_id_from_pattern(file_path, "{basename:genome_Full_{id}.txt}")
.unwrap()
.unwrap();
assert_eq!(id, "20120106210128");
}
#[test]
fn test_extract_id_with_regex() {
let file_path = "/data/genotype_files/huE922FC/AncestryDNA.txt";
let id = extract_id_from_pattern(file_path, r"(?P<id>hu[0-9A-F]{6})")
.unwrap()
.unwrap();
assert_eq!(id, "huE922FC");
}
}