// Install jemalloc as the process-wide allocator on every target except
// Windows; on Windows no `#[global_allocator]` is declared here, so the
// default allocator stays in effect.
#[cfg(not(windows))]
use tikv_jemallocator::Jemalloc;
#[cfg(not(windows))]
#[global_allocator]
static GLOBAL: Jemalloc = Jemalloc;
use std::io::{self, Write};
use std::path::PathBuf;
use anyhow::Result;
use clap::{Parser, ValueEnum};
use rand::Rng;
use seqtui::controller::{run_app_with_loading, run_app_with_file_browser, run_app_with_file_browser_at};
use seqtui::formats::{parse_file_with_options, FileFormat};
use seqtui::genetic_code::GeneticCodes;
use seqtui::model::{Alignment, Sequence, SequenceType};
/// Build a log-file path carrying a random 6-character lowercase alphanumeric tag.
///
/// * When `output` is `None` or `"-"` (stdout), the log is named
///   `seqtui_<suffix_hint>_<tag>.log` with no directory component.
/// * Otherwise the log is placed next to `output` and named `<stem>_<tag>.log`,
///   where `<stem>` is the output file stem (`"seqtui"` if it has none or is
///   not valid UTF-8). `suffix_hint` is not used in this case.
fn generate_log_path(output: Option<&str>, suffix_hint: &str) -> PathBuf {
    // Draw the random tag one character at a time.
    let mut tag = String::with_capacity(6);
    for sampled in rand::rng().sample_iter(&rand::distr::Alphanumeric).take(6) {
        tag.push((sampled as char).to_ascii_lowercase());
    }

    // Stdout / no output: fall back to a name derived from the hint.
    let target = match output {
        Some(path) if path != "-" => PathBuf::from(path),
        _ => return PathBuf::from(format!("seqtui_{}_{}.log", suffix_hint, tag)),
    };

    let stem = target
        .file_stem()
        .and_then(|s| s.to_str())
        .unwrap_or("seqtui");
    let log_name = format!("{}_{}.log", stem, tag);

    match target.parent() {
        Some(dir) => dir.join(log_name),
        None => std::path::Path::new(".").join(log_name),
    }
}
fn run_cli_mode(
file_path: &PathBuf,
forced_format: Option<FileFormat>,
output: &str,
translate: bool,
genetic_code: u8,
reading_frame: u8,
force: bool,
) -> Result<()> {
let alignment = parse_file_with_options(file_path, forced_format)?;
let output_alignment = if translate {
if alignment.sequence_type.is_likely_not_nucleotide() && !force {
anyhow::bail!(
"Cannot translate: file appears to be amino acids (only {:.0}% nucleotide characters).\n\
Use --force to proceed anyway.",
alignment.sequence_type.nt_ratio * 100.0
);
}
if !alignment.sequence_type.is_nucleotide() {
eprintln!(
"Warning: file has only {:.0}% nucleotide characters, translation may produce unexpected results.",
alignment.sequence_type.nt_ratio * 100.0
);
}
let codes = GeneticCodes::new();
let code = codes.get(genetic_code).ok_or_else(|| {
anyhow::anyhow!("Unknown genetic code: {}", genetic_code)
})?;
let frame = if reading_frame <= 3 {
(reading_frame - 1) as usize
} else {
anyhow::bail!("Reverse complement frames (4-6) not yet implemented");
};
let mut translated_seqs: Vec<Sequence> = alignment
.sequences
.iter()
.map(|seq| {
let aa_data = code.translate_sequence(seq.as_bytes(), frame);
Sequence::from_bytes(seq.id.clone(), aa_data)
})
.collect();
translated_seqs.shrink_to_fit();
let mut translated = Alignment::new(translated_seqs);
translated.sequence_type = SequenceType::AMINO_ACID;
translated
} else {
alignment
};
if output == "-" {
let stdout = io::stdout();
let mut handle = stdout.lock();
for seq in &output_alignment.sequences {
writeln!(handle, ">{}", seq.id)?;
writeln!(handle, "{}", seq.as_str())?;
}
} else {
let mut file = std::fs::File::create(output)?;
for seq in &output_alignment.sequences {
writeln!(file, ">{}", seq.id)?;
writeln!(file, "{}", seq.as_str())?;
}
eprintln!(
"Wrote {} sequences to {}",
output_alignment.sequence_count(),
output
);
}
Ok(()
)
}
/// Derive the matching key for a sequence from its identifier.
///
/// * With no `delimiter`, the whole `id` is the key.
/// * With a delimiter, `id` is split on it and the 1-based `fields`
///   (default: field 1) are picked in the order given — repeats allowed —
///   and re-joined with the same delimiter.
///
/// `file_name` is only used to make error messages traceable.
///
/// # Errors
/// Fails when a field number is 0 or exceeds the number of pieces obtained
/// by splitting `id`.
fn extract_key(
    id: &str,
    delimiter: Option<&str>,
    fields: Option<&[usize]>,
    file_name: &str,
) -> Result<String> {
    let delim = match delimiter {
        Some(d) => d,
        None => return Ok(id.to_owned()),
    };

    let pieces: Vec<&str> = id.split(delim).collect();
    let wanted = fields.unwrap_or(&[1]);

    // Resolve every requested field, stopping at the first invalid one.
    let selected = wanted
        .iter()
        .map(|&field_num| {
            if field_num == 0 {
                return Err(anyhow::anyhow!(
                    "Invalid field number 0 (fields are 1-based)\n\
                     File: {}\n\
                     Sequence: '{}'",
                    file_name, id
                ));
            }
            pieces.get(field_num - 1).copied().ok_or_else(|| {
                anyhow::anyhow!(
                    "Cannot extract field {} from sequence '{}' in file '{}'\n\
                     Split result: {:?} ({} fields available)\n\
                     Requested fields: {:?}",
                    field_num, id, file_name,
                    pieces, pieces.len(),
                    wanted
                )
            })
        })
        .collect::<Result<Vec<&str>>>()?;

    Ok(selected.join(delim))
}
/// Chromosome name used for VCF records: the file stem of `file_path`,
/// or `"unknown"` when the path has no stem or the stem is not valid UTF-8.
fn get_chrom_name(file_path: &PathBuf) -> String {
    match file_path.file_stem().and_then(|stem| stem.to_str()) {
        Some(name) => name.to_owned(),
        None => String::from("unknown"),
    }
}
/// Extract isolated biallelic SNPs from one or more alignments into a VCF.
///
/// Works in two passes over `files`:
///
/// 1. Collect the (sorted, de-duplicated) sample keys via `extract_key`,
///    check that every file is a valid alignment, and record files that look
///    like amino acids.
/// 2. For every file, scan each column. A column is kept when it contains
///    only A/C/G/T/N/? (case-insensitive), shows exactly two distinct bases,
///    and has at least `min_dist` consecutive non-polymorphic, clean columns
///    on both sides. The reference allele is the allele of the first sample
///    (in sorted key order) carrying one of the two bases. Samples missing
///    from the file, or carrying N/?, get genotype `.`.
///
/// The chromosome name of each record is the file stem of the input file.
/// With more than 100 input files, progress dots are printed and per-file
/// counts go to a log file instead of stderr.
///
/// # Parameters
/// * `output` – VCF destination, or `"-"` for stdout.
/// * `min_dist` – minimum flanking distance on each side of a SNP.
/// * `delimiter` / `fields` – forwarded to `extract_key`.
/// * `force` – proceed even if some files look like amino acids.
///
/// # Errors
/// Fails on parse or I/O errors, on a non-aligned file, on suspected
/// amino-acid input without `force`, or when no sequence is found.
fn run_vcf_mode(
    files: &[PathBuf],
    forced_format: Option<FileFormat>,
    output: &str,
    min_dist: usize,
    delimiter: Option<&str>,
    fields: Option<&[usize]>,
    force: bool,
) -> Result<()> {
    use std::collections::{HashMap, HashSet};

    /// Write the header and every record line, then flush the sink.
    fn write_vcf<W: Write>(mut sink: W, header: &str, records: &[String]) -> io::Result<()> {
        writeln!(sink, "{}", header)?;
        for record in records {
            writeln!(sink, "{}", record)?;
        }
        sink.flush()
    }

    /// Bit assigned to each base in the per-column "seen" mask.
    const BASE_BITS: [(u8, u8); 4] = [(b'A', 1), (b'C', 2), (b'G', 4), (b'T', 8)];

    // Progress dots (one per ~1% of the files) only for large batches.
    let show_progress = files.len() > 100;
    let progress_step = if show_progress { files.len() / 100 } else { 1 };

    // ---------------------------------------------------------------
    // Pass 1: collect sample keys and validate the inputs.
    // ---------------------------------------------------------------
    let mut all_keys: Vec<String> = Vec::new();
    let mut seen_keys: HashSet<String> = HashSet::new();
    let mut suspect_files: Vec<(String, f64)> = Vec::new();

    eprint!("Pass 1: Scanning {} file(s) for sequence IDs", files.len());
    if show_progress {
        eprint!(" (each dot = ~1%)");
    }
    eprintln!("...");

    for (file_idx, file_path) in files.iter().enumerate() {
        if show_progress && file_idx > 0 && file_idx % progress_step == 0 {
            eprint!(".");
            let _ = std::io::stderr().flush();
        }
        let alignment = parse_file_with_options(file_path, forced_format)?;
        if !alignment.is_valid_alignment {
            anyhow::bail!(
                "File {} is not a valid alignment (sequences have different lengths). \
                 VCF mode requires aligned sequences.",
                file_path.display()
            );
        }
        if alignment.sequence_type.is_likely_not_nucleotide() {
            suspect_files.push((
                file_path.display().to_string(),
                alignment.sequence_type.nt_ratio,
            ));
        }
        let file_name = file_path.display().to_string();
        for seq in &alignment.sequences {
            let key = extract_key(&seq.id, delimiter, fields, &file_name)?;
            // `insert` reports whether the key is new, so one lookup suffices.
            if seen_keys.insert(key.clone()) {
                all_keys.push(key);
            }
        }
    }

    if !suspect_files.is_empty() && !force {
        let log_path = generate_log_path(Some(output), "nt_check");
        let mut log_file = std::fs::File::create(&log_path)?;
        writeln!(log_file, "# SeqTUI - Nucleotide content check for VCF mode")?;
        writeln!(log_file, "# Files with <50% ACGT characters (likely amino acid sequences)")?;
        writeln!(log_file, "#")?;
        writeln!(log_file, "# File\tACGT_ratio")?;
        for (path, ratio) in &suspect_files {
            writeln!(log_file, "{}\t{:.1}%", path, ratio * 100.0)?;
        }
        anyhow::bail!(
            "VCF mode requires nucleotide sequences, but {} file(s) appear to be amino acids:\n\
             - Less than 50% of characters are ACGT (excluding gaps/N/?)\n\
             - Details written to: {}\n\
             - Use --force to proceed anyway",
            suspect_files.len(),
            log_path.display()
        );
    }
    if all_keys.is_empty() {
        anyhow::bail!("No sequences found in input files");
    }
    all_keys.sort();
    if show_progress {
        // Terminate the line of progress dots.
        eprintln!();
    }
    eprintln!("Found {} sequences", all_keys.len());

    // ---------------------------------------------------------------
    // Pass 2: scan every alignment for isolated biallelic SNPs.
    // ---------------------------------------------------------------
    let mut vcf_lines: Vec<String> = Vec::new();
    let log_path = generate_log_path(Some(output), "vcf");
    let mut log_file = if show_progress {
        Some(io::BufWriter::new(std::fs::File::create(&log_path)?))
    } else {
        None
    };
    if let Some(log) = log_file.as_mut() {
        writeln!(log, "# SeqTUI VCF mode - per-file SNP counts")?;
        writeln!(log, "# Min flanking distance: {}", min_dist)?;
        writeln!(log, "# Chrom\tSites\tSNPs")?;
    }

    eprint!("Pass 2: Scanning for biallelic SNPs (min flanking distance: {})", min_dist);
    if show_progress {
        eprint!(" (each dot = ~1%)");
    }
    eprintln!("...");

    for (file_idx, file_path) in files.iter().enumerate() {
        if show_progress && file_idx > 0 && file_idx % progress_step == 0 {
            eprint!(".");
            let _ = std::io::stderr().flush();
        }
        let alignment = parse_file_with_options(file_path, forced_format)?;
        let chrom = get_chrom_name(file_path);
        let aln_len = alignment.alignment_length();
        let file_name = file_path.display().to_string();

        // Key -> residues for this file. A later duplicate key replaces an
        // earlier one.
        let mut seq_map: HashMap<String, &[u8]> =
            HashMap::with_capacity(alignment.sequences.len());
        for seq in &alignment.sequences {
            let key = extract_key(&seq.id, delimiter, fields, &file_name)?;
            seq_map.insert(key, seq.as_bytes());
        }

        // Per column: is it free of anything but ACGT/N/? and which bases occur.
        let mut real_nt_only: Vec<bool> = vec![true; aln_len];
        let mut seen_nt: Vec<u8> = vec![0; aln_len];
        for seq in &alignment.sequences {
            let seq_bytes: &[u8] = seq.as_bytes();
            for pos in 0..aln_len {
                if !real_nt_only[pos] {
                    continue;
                }
                match seq_bytes[pos].to_ascii_uppercase() {
                    b'A' => seen_nt[pos] |= 1,
                    b'C' => seen_nt[pos] |= 2,
                    b'G' => seen_nt[pos] |= 4,
                    b'T' => seen_nt[pos] |= 8,
                    // Missing data neither adds an allele nor spoils the column.
                    b'N' | b'?' => {}
                    _ => real_nt_only[pos] = false,
                }
            }
        }

        // A column resets the flanking distance when it is unclean or polymorphic.
        let reset: Vec<bool> = real_nt_only
            .iter()
            .zip(&seen_nt)
            .map(|(&clean, bits)| !clean || bits.count_ones() > 1)
            .collect();

        let mut dist_left: Vec<usize> = vec![0; aln_len];
        for i in 1..aln_len {
            dist_left[i] = if reset[i - 1] { 0 } else { dist_left[i - 1] + 1 };
        }
        let mut dist_right: Vec<usize> = vec![0; aln_len];
        // `saturating_sub` keeps a zero-length alignment from underflowing.
        for i in (0..aln_len.saturating_sub(1)).rev() {
            dist_right[i] = if reset[i + 1] { 0 } else { dist_right[i + 1] + 1 };
        }

        let mut snp_count: usize = 0;
        for pos in 0..aln_len {
            let isolated_biallelic = real_nt_only[pos]
                && seen_nt[pos].count_ones() == 2
                && dist_left[pos] >= min_dist
                && dist_right[pos] >= min_dist;
            if !isolated_biallelic {
                continue;
            }

            let allele_bits = seen_nt[pos];
            let alleles: Vec<u8> = BASE_BITS
                .iter()
                .filter(|(_, bit)| allele_bits & bit != 0)
                .map(|(base, _)| *base)
                .collect();

            // Reference allele: the one carried by the first sample in key order.
            let first_called = all_keys
                .iter()
                .filter_map(|key| seq_map.get(key))
                .map(|seq| seq[pos].to_ascii_uppercase())
                .find(|b| *b == alleles[0] || *b == alleles[1]);
            let ref_base = match first_called {
                Some(base) => base,
                // Duplicate keys can drop every sequence that carried an
                // allele; such a site cannot be genotyped, so skip it
                // rather than panic.
                None => continue,
            };
            let alt_base = if alleles[0] == ref_base { alleles[1] } else { alleles[0] };

            let mut vcf_line = format!(
                "{}\t{}\t.\t{}\t{}\t.\tPASS\tDL={};DR={}\tGT",
                chrom,
                pos + 1, // VCF positions are 1-based
                ref_base as char,
                alt_base as char,
                dist_left[pos],
                dist_right[pos],
            );
            // One tab-separated genotype column per sample, in key order.
            for key in &all_keys {
                let genotype = match seq_map.get(key).map(|seq| seq[pos].to_ascii_uppercase()) {
                    Some(b) if b == ref_base => '0',
                    Some(b) if b == alt_base => '1',
                    _ => '.',
                };
                vcf_line.push('\t');
                vcf_line.push(genotype);
            }
            vcf_lines.push(vcf_line);
            snp_count += 1;
        }

        if let Some(log) = log_file.as_mut() {
            writeln!(log, "{}\t{}\t{}", chrom, aln_len, snp_count)?;
        } else {
            eprintln!(" {} : {} sites, {} isolated biallelic SNPs selected", chrom, aln_len, snp_count);
        }
    }

    if let Some(log) = log_file.as_mut() {
        // Surface write errors instead of losing them when the buffer drops.
        log.flush()?;
    }
    if show_progress {
        eprintln!();
        eprintln!("Per-file details written to: {}", log_path.display());
    }

    let vcf_header = format!(
        "##fileformat=VCFv4.2\n\
         ##INFO=<ID=DL,Number=1,Type=Integer,Description=\"Distance to nearest polymorphic site on the left\">\n\
         ##INFO=<ID=DR,Number=1,Type=Integer,Description=\"Distance to nearest polymorphic site on the right\">\n\
         ##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n\
         #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{}",
        all_keys.join("\t")
    );

    if output == "-" {
        let stdout = io::stdout();
        write_vcf(stdout.lock(), &vcf_header, &vcf_lines)?;
    } else {
        let file = std::fs::File::create(output)?;
        write_vcf(io::BufWriter::new(file), &vcf_header, &vcf_lines)?;
        eprintln!("Wrote {} SNPs to {}", vcf_lines.len(), output);
    }
    Ok(())
}
fn run_concatenation_mode(
files: &[PathBuf],
forced_format: Option<FileFormat>,
output: &str,
translate: bool,
genetic_code: u8,
reading_frame: u8,
delimiter: Option<&str>,
fields: Option<&[usize]>,
gap_char: Option<char>, partitions_file: Option<&str>,
force: bool,
) -> Result<()> {
use std::collections::{HashMap, HashSet};
let codes = GeneticCodes::new();
let code = codes.get(genetic_code).ok_or_else(|| {
anyhow::anyhow!("Unknown genetic code: {}", genetic_code)
})?;
let frame = if reading_frame <= 3 {
(reading_frame - 1) as usize
} else {
anyhow::bail!("Reverse complement frames (4-6) not yet implemented");
};
let mut all_keys: Vec<String> = Vec::new();
let mut seen_keys: HashSet<String> = HashSet::new();
let mut key_file_count: HashMap<String, usize> = HashMap::new(); let mut file_lengths: Vec<usize> = Vec::new();
eprintln!("Pass 1: Scanning {} files...", files.len());
for file_path in files {
let alignment = parse_file_with_options(file_path, forced_format)?;
if gap_char.is_some() && !alignment.is_valid_alignment {
anyhow::bail!(
"File {} is not a valid alignment (sequences have different lengths). \
Supermatrix mode requires aligned sequences.",
file_path.display()
);
}
let aln_len = if translate {
if alignment.sequence_type.is_likely_not_nucleotide() && !force {
anyhow::bail!(
"Cannot translate {}: appears to be amino acids ({:.0}% NT).\n\
Use --force to proceed anyway.",
file_path.display(),
alignment.sequence_type.nt_ratio * 100.0
);
}
(alignment.alignment_length().saturating_sub(frame)) / 3
} else {
alignment.alignment_length()
};
file_lengths.push(aln_len);
let file_name = file_path.display().to_string();
let mut keys_in_this_file: HashSet<String> = HashSet::new();
for seq in &alignment.sequences {
let key = extract_key(&seq.id, delimiter, fields, &file_name)?;
if !seen_keys.contains(&key) {
seen_keys.insert(key.clone());
all_keys.push(key.clone());
}
keys_in_this_file.insert(key);
}
for key in keys_in_this_file {
*key_file_count.entry(key).or_insert(0) += 1;
}
}
all_keys.sort();
let output_count = all_keys.len();
let orphan_count = key_file_count.values().filter(|&&c| c == 1).count();
eprintln!("Found {} output sequence IDs ({} appear in only one file)",
output_count, orphan_count);
let orphan_ratio = orphan_count as f64 / output_count as f64;
if orphan_ratio > 0.30 && !force {
let log_path = generate_log_path(Some(output), "ids");
let mut log_file = std::fs::File::create(&log_path)?;
use std::io::Write;
writeln!(log_file, "# SeqTUI - Output sequence IDs from concatenation")?;
writeln!(log_file, "# {} IDs total, {} appear in only one file ({:.1}%)",
output_count, orphan_count, orphan_ratio * 100.0)?;
writeln!(log_file, "# IDs marked with * appear in only one file")?;
writeln!(log_file, "#")?;
for key in &all_keys {
let count = key_file_count.get(key).unwrap_or(&0);
if *count == 1 {
writeln!(log_file, "{}*", key)?;
} else {
writeln!(log_file, "{}", key)?;
}
}
anyhow::bail!(
"Suspicious ID matching: {:.0}% of output IDs appear in only one file ({} / {}).\n\
This often means sequence names don't match across files.\n\
- Check if you need -d/--delimiter to extract a common prefix\n\
- List of output IDs written to: {} (orphans marked with *)\n\
- Use --force to proceed anyway",
orphan_ratio * 100.0, orphan_count, output_count, log_path.display()
);
}
let mut seq_data: HashMap<String, Vec<u8>> = HashMap::new();
for key in &all_keys {
seq_data.insert(key.clone(), Vec::new());
}
let mut partitions: Vec<(String, usize, usize)> = Vec::new(); let mut current_pos: usize = 1;
let mut file_stats: Vec<(String, usize, usize)> = Vec::new(); let mut warnings: Vec<String> = Vec::new();
let log_path = generate_log_path(Some(output), "concat");
eprintln!("Pass 2: Concatenating sequences...");
for (file_idx, file_path) in files.iter().enumerate() {
let alignment = parse_file_with_options(file_path, forced_format)?;
let expected_len = file_lengths[file_idx];
let file_name = file_path.display().to_string();
let mut file_seqs: HashMap<String, Vec<u8>> = HashMap::new();
for seq in &alignment.sequences {
let key = extract_key(&seq.id, delimiter, fields, &file_name)?;
let seq_data = if translate {
code.translate_sequence(seq.as_bytes(), frame)
} else {
seq.as_bytes().to_vec()
};
if file_seqs.contains_key(&key) {
let warning = format!(
"Duplicate key '{}' in file {} (using first occurrence)",
key,
file_path.display()
);
eprintln!("Warning: {}", warning);
warnings.push(warning);
continue;
}
file_seqs.insert(key, seq_data);
}
for key in &all_keys {
if let Some(seq_bytes) = file_seqs.get(key) {
seq_data.get_mut(key).unwrap().extend_from_slice(seq_bytes);
} else if let Some(fill_char) = gap_char {
let gaps = vec![fill_char as u8; expected_len];
seq_data.get_mut(key).unwrap().extend_from_slice(&gaps);
}
}
let gene_name = file_path
.file_stem()
.and_then(|s| s.to_str())
.unwrap_or("unknown");
let end_pos = current_pos + expected_len - 1;
partitions.push((gene_name.to_string(), current_pos, end_pos));
current_pos = end_pos + 1;
file_stats.push((file_path.display().to_string(), file_seqs.len(), expected_len));
}
if gap_char.is_none() {
let total_len: usize = file_lengths.iter().sum();
for (key, data) in &seq_data {
if data.len() != total_len {
let warning = format!(
"Sequence '{}' has length {} (expected {}). \
Use -s/--supermatrix to fill missing with gaps.",
key,
data.len(),
total_len
);
eprintln!("Warning: {}", warning);
warnings.push(warning);
}
}
}
{
let mut log_file = std::fs::File::create(&log_path)?;
writeln!(log_file, "# SeqTUI concatenation log")?;
writeln!(log_file, "# Output: {}", output)?;
if translate {
writeln!(log_file, "# Translation: code {}, frame +{}", genetic_code, frame + 1)?;
}
if let Some(fill_char) = gap_char {
writeln!(log_file, "# Mode: supermatrix (missing sequences filled with '{}')", fill_char)?;
} else {
writeln!(log_file, "# Mode: concatenation (no gap filling)")?;
}
writeln!(log_file, "#")?;
writeln!(log_file, "# Per-file statistics:")?;
writeln!(log_file, "# File\tSequences\tSites")?;
for (file, seqs, sites) in &file_stats {
writeln!(log_file, "{}\t{}\t{}", file, seqs, sites)?;
}
if !warnings.is_empty() {
writeln!(log_file, "#")?;
writeln!(log_file, "# Warnings ({}):", warnings.len())?;
for warning in &warnings {
writeln!(log_file, "# WARNING: {}", warning)?;
}
}
writeln!(log_file, "#")?;
let total_sites: usize = file_stats.iter().map(|(_, _, s)| s).sum();
writeln!(log_file, "# Summary: {} files, {} output sequences, {} total sites",
file_stats.len(), all_keys.len(), total_sites)?;
}
eprintln!("Log written to: {}", log_path.display());
let seq_count = all_keys.len();
if output == "-" {
let stdout = io::stdout();
let mut handle = stdout.lock();
for key in &all_keys {
let data = seq_data.get(key).unwrap();
writeln!(handle, ">{}", key)?;
writeln!(handle, "{}", unsafe { std::str::from_utf8_unchecked(data) })?;
}
} else {
let mut file = std::fs::File::create(output)?;
for key in &all_keys {
let data = seq_data.get(key).unwrap();
writeln!(file, ">{}", key)?;
writeln!(file, "{}", unsafe { std::str::from_utf8_unchecked(data) })?;
}
eprintln!("Wrote {} sequences to {}", seq_count, output);
}
if let Some(part_file) = partitions_file {
let mut file = std::fs::File::create(part_file)?;
writeln!(file, "#nexus")?;
writeln!(file, "begin sets;")?;
for (name, start, end) in &partitions {
let sanitized_name = name.replace(' ', "_");
writeln!(file, " charset {} = {}-{};", sanitized_name, start, end)?;
}
writeln!(file, "end;")?;
eprintln!("Wrote NEXUS partition file with {} partitions to {}", partitions.len(), part_file);
}
Ok(())
}
// Values accepted by the `--format` command-line option.
// `Fasta`, `Nexus` and `Phylip` map to the `FileFormat` variant of the same
// name; `Auto` maps to `None` (see the `From<FormatArg>` impl), i.e. no
// format is forced on the parser.
// NOTE: plain `//` comments are used on purpose; `///` doc comments on a
// `ValueEnum` would be picked up by clap as help text.
#[derive(Debug, Clone, Copy, ValueEnum)]
enum FormatArg {
Fasta,
Nexus,
Phylip,
Auto,
}
/// Translate the command-line format choice into the parser's optional
/// forced format: `Auto` becomes `None`, every other variant becomes the
/// `FileFormat` of the same name.
impl From<FormatArg> for Option<FileFormat> {
    fn from(arg: FormatArg) -> Self {
        let forced = match arg {
            FormatArg::Auto => return None,
            FormatArg::Fasta => FileFormat::Fasta,
            FormatArg::Nexus => FileFormat::Nexus,
            FormatArg::Phylip => FileFormat::Phylip,
        };
        Some(forced)
    }
}
// Command-line interface definition (parsed by clap's derive API).
// NOTE: plain `//` comments are used on purpose; `///` doc comments on
// fields would be picked up by clap and change the `--help` output.
#[derive(Parser, Debug)]
#[command(
author = "V. Ranwez",
version,
about = "View, translate, convert (to FASTA), and combine sequences — aligned or not.",
long_about = None,
after_help = "Documentation & examples: https://github.com/ranwez-search/SeqTUI"
)]
struct Args {
// Positional input paths. `main` opens the file browser when this is empty,
// and treats a single directory (without -o) as the browser start location.
files: Vec<PathBuf>,
// -o/--output: destination for the CLI modes; the run_* functions treat "-"
// as stdout. Without it, `main` starts the interactive viewer.
#[arg(short = 'o', long = "output", help_heading = "Input/Output")]
output: Option<String>,
// --format: input format; defaults to "auto", which forces no format.
#[arg(long = "format", value_enum, default_value = "auto", hide_default_value = true, help_heading = "Input/Output")]
format: FormatArg,
// --force: passed to the run_* functions, where it bypasses their safety
// checks (amino-acid detection, suspicious ID matching).
#[arg(long = "force", help_heading = "Input/Output")]
force: bool,
// -d/--delimiter: string on which sequence IDs are split by `extract_key`.
// `main` falls back to "_" when only -f/--fields is given.
#[arg(short = 'd', long = "delimiter", help_heading = "ID Extraction")]
delimiter: Option<String>,
// -f/--fields: comma-separated 1-based field numbers, parsed in `main`.
#[arg(short = 'f', long = "fields", help_heading = "ID Extraction")]
fields: Option<String>,
// -s/--supermatrix [CHAR]: fill character for sequences missing from a
// file; "-" when the flag is given without a value.
#[arg(short = 's', long = "supermatrix", value_name = "CHAR", default_missing_value = "-", num_args = 0..=1, help_heading = "Multi-file Concatenation")]
supermatrix: Option<String>,
// -p/--partitions: path of the NEXUS partition file written by
// `run_concatenation_mode`.
#[arg(short = 'p', long = "partitions", help_heading = "Multi-file Concatenation")]
partitions: Option<String>,
// -t/--translate: translate sequences using the genetic code and frame below.
#[arg(short = 't', long = "translate", help_heading = "Translation")]
translate: bool,
// -g/--genetic-code: defaults to 1; `main` accepts 1-33.
#[arg(short = 'g', long = "genetic-code", default_value = "1", hide_default_value = true, help_heading = "Translation")]
genetic_code: u8,
// -r/--reading-frame: defaults to 1; `main` accepts 1-3.
#[arg(short = 'r', long = "reading-frame", default_value = "1", hide_default_value = true, help_heading = "Translation")]
reading_frame: u8,
// -v/--vcf MIN_DIST: selects VCF mode; the value is the minimum flanking
// distance passed to `run_vcf_mode`.
#[arg(short = 'v', long = "vcf", value_name = "MIN_DIST", help_heading = "SNP Extraction")]
vcf: Option<usize>,
// --fancy: forwarded unchanged to the `run_app_*` viewer entry points.
// NOTE(review): its visual effect is defined in `seqtui::controller`, not here.
#[arg(long = "fancy", help_heading = "Display")]
fancy: bool,
}
fn main() -> Result<()> {
let args = Args::parse();
let forced_format: Option<FileFormat> = args.format.into();
if args.reading_frame < 1 || args.reading_frame > 3 {
anyhow::bail!("Reading frame must be 1-3 (got {})", args.reading_frame);
}
if args.genetic_code < 1 || args.genetic_code > 33 {
anyhow::bail!("Genetic code must be 1-33 (got {})", args.genetic_code);
}
let gap_char: Option<char> = match &args.supermatrix {
None => None,
Some(s) => {
if s.len() != 1 {
anyhow::bail!(
"-s/--supermatrix requires a single character (got '{}', {} chars)",
s, s.len()
);
}
let c = s.chars().next().unwrap();
if !c.is_ascii() {
anyhow::bail!("-s/--supermatrix character must be ASCII (got '{}')", c);
}
Some(c)
}
};
let fields: Option<Vec<usize>> = match &args.fields {
None => None,
Some(s) => {
let mut field_nums: Vec<usize> = Vec::new();
for part in s.split(',') {
let part = part.trim();
if part.is_empty() {
continue;
}
let num: usize = part.parse().map_err(|_| {
anyhow::anyhow!(
"-f/--fields requires comma-separated positive integers (got '{}')",
part
)
})?;
if num == 0 {
anyhow::bail!("-f/--fields uses 1-based indexing (field 0 is invalid)");
}
field_nums.push(num);
}
if field_nums.is_empty() {
anyhow::bail!("-f/--fields requires at least one field number");
}
Some(field_nums)
}
};
let effective_delimiter: Option<String> = if args.delimiter.is_some() {
args.delimiter.clone()
} else if args.fields.is_some() {
Some("_".to_string()) } else {
None
};
if args.output.is_none() {
if args.supermatrix.is_some() {
anyhow::bail!("-s/--supermatrix requires -o/--output");
}
if args.partitions.is_some() {
anyhow::bail!("-p/--partitions requires -o/--output");
}
if args.vcf.is_some() {
anyhow::bail!("-v/--vcf requires -o/--output");
}
if (args.delimiter.is_some() || args.fields.is_some()) && args.files.len() > 1 {
anyhow::bail!("-d/--delimiter or -f/--fields with multiple files requires -o/--output");
}
}
if args.files.len() == 1 {
if args.supermatrix.is_some() {
anyhow::bail!("-s/--supermatrix requires multiple input files");
}
if args.partitions.is_some() {
anyhow::bail!("-p/--partitions requires multiple input files");
}
}
if args.vcf.is_some() {
if args.translate {
anyhow::bail!("-v/--vcf is incompatible with -t/--translate");
}
if args.supermatrix.is_some() {
anyhow::bail!("-v/--vcf is incompatible with -s/--supermatrix");
}
if args.partitions.is_some() {
anyhow::bail!("-v/--vcf is incompatible with -p/--partitions");
}
}
let fancy_ui = args.fancy;
if args.files.is_empty() {
if args.output.is_some() {
anyhow::bail!("CLI mode (-o/--output) requires at least one input file");
}
return run_app_with_file_browser(fancy_ui);
}
if let Some(min_dist) = args.vcf {
let output = args.output.as_ref().unwrap(); return run_vcf_mode(
&args.files,
forced_format,
output,
min_dist,
effective_delimiter.as_deref(),
fields.as_deref(),
args.force,
);
}
if args.files.len() > 1 {
let output = args.output.ok_or_else(|| {
anyhow::anyhow!("Multiple input files require -o/--output for concatenation")
})?;
run_concatenation_mode(
&args.files,
forced_format,
&output,
args.translate,
args.genetic_code,
args.reading_frame,
effective_delimiter.as_deref(),
fields.as_deref(),
gap_char,
args.partitions.as_deref(),
args.force,
)?;
} else {
let file_path = &args.files[0];
if let Some(output) = args.output {
run_cli_mode(
file_path,
forced_format,
&output,
args.translate,
args.genetic_code,
args.reading_frame,
args.force,
)?;
} else {
if file_path.is_dir() {
return run_app_with_file_browser_at(file_path.clone(), fancy_ui);
}
run_app_with_loading(
file_path.clone(),
forced_format,
if args.translate || args.genetic_code != 1 || args.reading_frame != 1 {
Some((args.genetic_code, args.reading_frame))
} else {
None
},
fancy_ui,
)?;
}
}
Ok(())
}
// Unit and fixture-based tests for the CLI modes defined in this file.
// The VCF and concatenation tests read fixture files through relative paths
// (`test_data/...`, `examples/...`) and write temporary files under `/tmp`.
#[cfg(test)]
mod tests {
use super::*;
use std::io::BufRead;
use std::sync::atomic::{AtomicUsize, Ordering};
// Counter used to give every test its own temporary output file name.
static TEST_COUNTER: AtomicUsize = AtomicUsize::new(0);
// Runs `run_vcf_mode` on the named fixtures from `test_data/vcf_tests/`,
// then returns the lines of the produced VCF that do not start with '#'.
// The temporary VCF is removed afterwards.
fn run_vcf_test(files: &[&str], min_dist: usize) -> Vec<String> {
let file_paths: Vec<PathBuf> = files.iter()
.map(|f| PathBuf::from(format!("test_data/vcf_tests/{}", f)))
.collect();
let test_id = TEST_COUNTER.fetch_add(1, Ordering::SeqCst);
let tmp_output = format!("/tmp/seqtui_test_vcf_{}.vcf", test_id);
run_vcf_mode(&file_paths, None, &tmp_output, min_dist, None, None, false).unwrap();
let file = std::fs::File::open(&tmp_output).unwrap();
let reader = std::io::BufReader::new(file);
let lines: Vec<String> = reader.lines()
.filter_map(|l| l.ok())
.filter(|l| !l.starts_with('#'))
.collect();
let _ = std::fs::remove_file(&tmp_output);
lines
}
// Splits one VCF record on tabs and returns
// (chrom, pos, ref, alt, DL, DR, genotypes). DL/DR are read from the INFO
// column (index 7) and default to 0 when absent; genotypes are columns 9+.
fn parse_vcf_line(line: &str) -> (String, usize, char, char, usize, usize, Vec<String>) {
let parts: Vec<&str> = line.split('\t').collect();
let chrom = parts[0].to_string();
let pos: usize = parts[1].parse().unwrap();
let ref_base = parts[3].chars().next().unwrap();
let alt_base = parts[4].chars().next().unwrap();
let info = parts[7];
let mut dl = 0;
let mut dr = 0;
for field in info.split(';') {
if field.starts_with("DL=") {
dl = field[3..].parse().unwrap();
} else if field.starts_with("DR=") {
dr = field[3..].parse().unwrap();
}
}
let genotypes: Vec<String> = parts[9..].iter().map(|s| s.to_string()).collect();
(chrom, pos, ref_base, alt_base, dl, dr, genotypes)
}
// A single isolated biallelic site is reported with the expected position,
// alleles, flanking distances and genotypes.
#[test]
fn test_vcf_biallelic_snp() {
let lines = run_vcf_test(&["biallelic_snp.fa"], 30);
assert_eq!(lines.len(), 1, "Should find exactly 1 SNP");
let (chrom, pos, ref_base, alt_base, dl, dr, genotypes) = parse_vcf_line(&lines[0]);
assert_eq!(chrom, "biallelic_snp");
assert_eq!(pos, 31);
assert_eq!(ref_base, 'G');
assert_eq!(alt_base, 'T');
assert_eq!(dl, 30);
assert_eq!(dr, 31);
assert_eq!(genotypes, vec!["0", "1", "0"]);
}
// Sites with three alleles are never reported.
#[test]
fn test_vcf_triallelic_excluded() {
let lines = run_vcf_test(&["triallelic_snp.fa"], 30);
assert_eq!(lines.len(), 0, "Triallelic site should be excluded");
}
// Two SNPs closer than the minimum flanking distance exclude each other.
#[test]
fn test_vcf_two_snps_close_excluded() {
let lines = run_vcf_test(&["two_snps_close.fa"], 30);
assert_eq!(lines.len(), 0, "Close SNPs should both be excluded");
}
// The `two_snps_far.fa` fixture still yields no SNP at min_dist = 30.
#[test]
fn test_vcf_two_snps_far_excluded() {
let lines = run_vcf_test(&["two_snps_far.fa"], 30);
assert_eq!(lines.len(), 0, "SNPs with insufficient spacing should be excluded");
}
// With a small minimum distance all three SNPs of the fixture are kept;
// positions and DL/DR values are checked for each.
#[test]
fn test_vcf_three_snps_small_dist() {
let lines = run_vcf_test(&["three_snps.fa"], 5);
assert_eq!(lines.len(), 3, "All 3 SNPs should be selected with min_dist=5");
let (_, pos1, _, _, dl1, dr1, _) = parse_vcf_line(&lines[0]);
let (_, pos2, _, _, dl2, dr2, _) = parse_vcf_line(&lines[1]);
let (_, pos3, _, _, dl3, dr3, _) = parse_vcf_line(&lines[2]);
assert_eq!(pos1, 15);
assert_eq!(pos2, 31);
assert_eq!(pos3, 50);
assert_eq!(dl1, 14); assert_eq!(dr1, 15); assert_eq!(dl2, 15); assert_eq!(dr2, 18); assert_eq!(dl3, 18); assert_eq!(dr3, 12); }
// An `N` at a SNP column keeps the site and yields a missing genotype (".").
#[test]
fn test_vcf_snp_with_n_missing_genotype() {
let lines = run_vcf_test(&["snp_with_n.fa"], 30);
assert_eq!(lines.len(), 1, "SNP with N should still be selected");
let (_, _, _, _, _, _, genotypes) = parse_vcf_line(&lines[0]);
assert_eq!(genotypes, vec!["0", "1", "."], "Sample with N should have missing genotype");
}
// A gap at a SNP column disqualifies the site.
#[test]
fn test_vcf_snp_with_gap_excluded() {
let lines = run_vcf_test(&["snp_with_gap.fa"], 30);
assert_eq!(lines.len(), 0, "SNP with gap should be excluded");
}
// For a plain output file name the log is `<stem>_<6 alphanumerics>.log`.
#[test]
fn test_generate_log_path_with_output_file() {
let log_path = generate_log_path(Some("output.fasta"), "concat");
let log_str = log_path.to_string_lossy();
assert!(log_str.starts_with("output_"), "Log should start with output prefix");
assert!(log_str.ends_with(".log"), "Log should end with .log");
assert!(log_str.contains("_"), "Log should contain underscore separator");
let without_ext = log_str.trim_end_matches(".log");
let suffix = without_ext.strip_prefix("output_").unwrap();
assert_eq!(suffix.len(), 6, "Random suffix should be 6 characters");
assert!(suffix.chars().all(|c| c.is_ascii_alphanumeric()),
"Suffix should be alphanumeric");
}
// For stdout output ("-") the log name is built from the suffix hint.
#[test]
fn test_generate_log_path_with_stdout() {
let log_path = generate_log_path(Some("-"), "vcf");
let log_str = log_path.to_string_lossy();
assert!(log_str.starts_with("seqtui_vcf_"), "Log should start with seqtui_vcf_ prefix");
assert!(log_str.ends_with(".log"), "Log should end with .log");
}
// The log is placed in the same directory as the output file.
#[test]
fn test_generate_log_path_with_directory() {
let log_path = generate_log_path(Some("/tmp/subdir/result.fa"), "concat");
let log_str = log_path.to_string_lossy();
assert!(log_str.starts_with("/tmp/subdir/result_"),
"Log should be in same directory as output");
assert!(log_str.ends_with(".log"), "Log should end with .log");
}
// Two calls with the same arguments are expected to differ in their random
// suffix.
#[test]
fn test_generate_log_path_uniqueness() {
let path1 = generate_log_path(Some("test.fa"), "concat");
let path2 = generate_log_path(Some("test.fa"), "concat");
assert_ne!(path1, path2, "Each call should generate a unique log path");
}
// Concatenating two fixtures (supermatrix mode, `force` set) must leave a
// `.log` file next to the output; the log is located by scanning `/tmp` for
// the output stem, checked for its header, then removed.
#[test]
fn test_concatenation_creates_log_file() {
let test_id = TEST_COUNTER.fetch_add(1, Ordering::SeqCst);
let tmp_output = format!("/tmp/seqtui_test_concat_{}.fasta", test_id);
let files = vec![
PathBuf::from("test_data/alignment.fasta"),
PathBuf::from("test_data/LOC_01790.nex"),
];
let result = run_concatenation_mode(
&files,
None, &tmp_output,
false, 1, 1, None, None, Some('-'), None, true, );
assert!(result.is_ok(), "Concatenation should succeed");
let tmp_dir = std::path::Path::new("/tmp");
let output_stem = format!("seqtui_test_concat_{}", test_id);
let mut found_log = false;
if let Ok(entries) = std::fs::read_dir(tmp_dir) {
for entry in entries.flatten() {
let name = entry.file_name().to_string_lossy().to_string();
if name.starts_with(&output_stem) && name.ends_with(".log") {
found_log = true;
if let Ok(content) = std::fs::read_to_string(entry.path()) {
assert!(content.contains("# SeqTUI"), "Log should have SeqTUI header");
assert!(content.contains("alignment.fasta") || content.contains("LOC_01790"),
"Log should list input files");
}
let _ = std::fs::remove_file(entry.path());
break;
}
}
}
let _ = std::fs::remove_file(&tmp_output);
assert!(found_log, "Concatenation should create a log file");
}
// Without a delimiter the whole ID is the key.
#[test]
fn test_extract_key_no_delimiter() {
let result = extract_key("Ae_bicornis_contig257", None, None, "test.fa");
assert!(result.is_ok());
assert_eq!(result.unwrap(), "Ae_bicornis_contig257");
}
// With a delimiter but no field list, field 1 is used.
#[test]
fn test_extract_key_default_first_field() {
let result = extract_key("Ae_bicornis_contig257", Some("_"), None, "test.fa");
assert!(result.is_ok());
assert_eq!(result.unwrap(), "Ae");
}
// A single explicit field is returned as is.
#[test]
fn test_extract_key_single_field() {
let result = extract_key("Ae_bicornis_contig257", Some("_"), Some(&[2]), "test.fa");
assert!(result.is_ok());
assert_eq!(result.unwrap(), "bicornis");
}
// Several fields are re-joined with the delimiter.
#[test]
fn test_extract_key_multiple_fields() {
let result = extract_key("Ae_bicornis_contig257", Some("_"), Some(&[1, 2]), "test.fa");
assert!(result.is_ok());
assert_eq!(result.unwrap(), "Ae_bicornis");
}
// Same as above with three fields out of four.
#[test]
fn test_extract_key_three_fields() {
let result = extract_key("Ae_bicornis_indiv2_contig257", Some("_"), Some(&[1, 2, 3]), "test.fa");
assert!(result.is_ok());
assert_eq!(result.unwrap(), "Ae_bicornis_indiv2");
}
// Fields are emitted in the requested order, not in ID order.
#[test]
fn test_extract_key_reverse_order() {
let result = extract_key("Ae_bicornis_contig257", Some("_"), Some(&[2, 1]), "test.fa");
assert!(result.is_ok());
assert_eq!(result.unwrap(), "bicornis_Ae");
}
// A field may be requested more than once.
#[test]
fn test_extract_key_duplicate_fields() {
let result = extract_key("Ae_bicornis_contig257", Some("_"), Some(&[1, 1, 2]), "test.fa");
assert!(result.is_ok());
assert_eq!(result.unwrap(), "Ae_Ae_bicornis");
}
// Requesting a field past the end fails with a message naming the field,
// the sequence, the file and the number of fields available.
#[test]
fn test_extract_key_field_out_of_range() {
let result = extract_key("Ae_bicornis_contig257", Some("_"), Some(&[1, 2, 4]), "test.fa");
assert!(result.is_err());
let err = result.unwrap_err().to_string();
assert!(err.contains("Cannot extract field 4"), "Error should mention field 4");
assert!(err.contains("Ae_bicornis_contig257"), "Error should include full sequence name");
assert!(err.contains("test.fa"), "Error should include file name");
assert!(err.contains("3 fields available"), "Error should mention available field count");
}
// Field 0 is rejected because fields are 1-based.
#[test]
fn test_extract_key_field_zero_invalid() {
let result = extract_key("Ae_bicornis", Some("_"), Some(&[0, 1]), "test.fa");
assert!(result.is_err());
let err = result.unwrap_err().to_string();
assert!(err.contains("1-based"), "Error should mention 1-based indexing");
}
// The delimiter is not limited to "_".
#[test]
fn test_extract_key_different_delimiter() {
let result = extract_key("species.gene.variant", Some("."), Some(&[1, 2]), "test.fa");
assert!(result.is_ok());
assert_eq!(result.unwrap(), "species.gene");
}
// Concatenating three example files with translation and a partition file
// must produce the exact NEXUS `sets` block below. The expected text is a
// multi-line string literal, so its line breaks and leading whitespace are
// significant. Temporary output, partition and log files are removed at the end.
#[test]
fn test_partition_nexus_format() {
let test_id = TEST_COUNTER.fetch_add(1, Ordering::SeqCst);
let tmp_output = format!("/tmp/seqtui_test_part_{}.fasta", test_id);
let tmp_partition = format!("/tmp/seqtui_test_part_{}.nex", test_id);
let files = vec![
PathBuf::from("examples/LOC_01790.nex"),
PathBuf::from("examples/LOC_11070.fasta"),
PathBuf::from("examples/LOC_39310.fasta"),
];
let result = run_concatenation_mode(
&files,
None, &tmp_output,
true, 1, 1, None, None, Some('-'), Some(&tmp_partition), false, );
assert!(result.is_ok(), "Concatenation with partitions should succeed");
let content = std::fs::read_to_string(&tmp_partition)
.expect("Partition file should exist");
let expected = "#nexus
begin sets;
charset LOC_01790 = 1-286;
charset LOC_11070 = 287-636;
charset LOC_39310 = 637-951;
end;
";
assert_eq!(content, expected, "Partition file content should match expected format");
let _ = std::fs::remove_file(&tmp_output);
let _ = std::fs::remove_file(&tmp_partition);
if let Ok(entries) = std::fs::read_dir("/tmp") {
for entry in entries.flatten() {
let name = entry.file_name().to_string_lossy().to_string();
if name.starts_with(&format!("seqtui_test_part_{}_concat_", test_id)) {
let _ = std::fs::remove_file(entry.path());
}
}
}
}
// Checks the space-to-underscore replacement applied to charset names.
// Note: this exercises `str::replace` directly, not the production function.
#[test]
fn test_partition_name_sanitization() {
let test_name = "gene with spaces";
let sanitized = test_name.replace(' ', "_");
assert_eq!(sanitized, "gene_with_spaces", "Spaces should be replaced with underscores");
}
}