use fastqrab_config::tpd_adapt_u8_from_byte_or_char;
use fastqrab_dna::dna::{self, IupacExpander, iupac_overlapping};
use anyhow::{Context, Result, bail};
use bstr::BString;
use fastqrab_io::io::{apply_to_read_names_and_sequences, input::open_text_file};
use indexmap::IndexMap;
use schemars::JsonSchema;
use std::{
collections::{BTreeMap, HashSet},
io::BufReader,
path::{Path, PathBuf},
sync::Arc,
};
use toml_pretty_deser::prelude::*;
use crate::{config::PartialConfig, no_barcode_infix};
#[derive(Clone, Debug, JsonSchema)]
#[tpd]
pub struct Barcodes {
#[tpd(nested)]
pub from_file: Option<BarcodesFromFile>,
#[schemars(with = "BTreeMap<String, String>")]
#[tpd(absorb_remaining)]
pub barcode_to_name: IndexMap<BString, String>,
#[tpd(skip, default)]
#[schemars(skip)]
pub seq_to_name: Arc<IndexMap<BString, String>>,
}
#[tpd(no_verify)]
#[derive(Clone, Debug, JsonSchema)]
pub struct BarcodesFromFile {
#[expect(
dead_code,
reason = "only used in PartialBarcodesFromFile. But necessary for logging"
)]
filename: String,
#[tpd(with = "tpd_adapt_u8_from_byte_or_char", alias = "read_comment_char")]
pub read_comment_character: Option<u8>,
}
fn read_barcodes_from_txt(filename: &Path, entries: &mut IndexMap<BString, String>) -> Result<()> {
use bstr::io::BufReadExt;
let file = open_text_file(filename)?;
let buffered = BufReader::new(file);
let mut target_len = None;
for line in buffered.byte_lines() {
let line: BString = line?.into();
match target_len {
None => {
target_len = Some(line.len());
}
Some(old_len) => {
if old_len != line.len() {
bail!("Unequal line lengths detected in {}", filename.display());
}
}
}
let str_line: String = String::from_utf8(line.to_vec())
.context("Expected DNA, found something non-utf8-encoded?")?;
entries.insert(line, str_line);
}
Ok(())
}
impl PartialBarcodesFromFile {
fn load_from_file(
&self,
use_rapidgzip: bool,
) -> Result<IndexMap<BString, String>, ValidationFailure> {
let mut entries: IndexMap<BString, String> = IndexMap::new();
let filename = self.filename.as_ref().expect("Should be checked before");
let comment_char = self
.read_comment_character
.as_ref()
.and_then(|x| x.as_ref());
if let Err(err) = apply_to_read_names_and_sequences(
filename,
&mut |name: &[u8], seq: &[u8]| {
let name_str = if let Some(comment_char) = comment_char
&& let Some(pos) = name.iter().position(|&c| c == *comment_char)
{
let name_without_comment = &name[..pos];
String::from_utf8_lossy(name_without_comment).into_owned()
} else {
String::from_utf8_lossy(name).into_owned()
};
if entries.insert(BString::from(seq), name_str).is_some() {
bail!("Duplicate sequence in {filename}: '{}'", BString::from(seq));
}
Ok(())
},
use_rapidgzip,
) {
if entries.is_empty() && filename.contains(".txt") {
if let Err(second_err) =
read_barcodes_from_txt(&PathBuf::from(filename), &mut entries)
{
return Err(ValidationFailure::new(
format!("Error reading from {filename}"),
Some(format!(
"Tried as sequence file:\n\t{err:?}\n\n\
Tried as text file of one barcode-per-line:\n\t{second_err:?}"
)),
));
}
} else {
return Err(ValidationFailure::new(
format!("Error reading from {filename}"),
Some(format!("{err:?}")),
));
}
}
if entries.is_empty() {
Err(ValidationFailure::new(
format!("Barcodes: reference file '{filename}' contains no sequences.",),
Some("Check your input file.".to_string()),
))
} else {
Ok(entries)
}
}
}
impl VerifyIn<PartialConfig> for PartialBarcodes {
#[expect(clippy::too_many_lines, reason = "needed")]
fn verify(
&mut self,
parent: &PartialConfig,
_options: &VerifyOptions,
) -> std::result::Result<(), ValidationFailure>
where
Self: Sized,
{
self.barcode_to_name.verify_keys(|key| {
if let Some(None) = self.from_file.as_ref() {
if key == "filename" {
return Err(ValidationFailure::new(
"Missing from_file.",
Some("Use from_file = {filename = '...'} to read barcodes from a file")));
} else if key == "read_comment_character" {
return Err(ValidationFailure::new(
"Missing from_file.",
Some("Use from_file = {filename = '...', read_comment_character = 'x'} to read barcodes from a file"))); }
if dna::all_iupac_or_underscore(key.as_bytes()) {
Ok(())
} else {
Err(ValidationFailure::new(
"Invalid IUPAC (uppercase only)",
Some("See https://en.wikipedia.org/wiki/International_Union_of_Pure_and_Applied_Chemistry#Amino_acid_and_nucleotide_base_codes")
))}
} else {
Ok(())
}
});
if let Some(values) = self.barcode_to_name.as_mut() {
if let Some(bad_value) = values
.map
.values_mut()
.find(|v| v.as_ref().map(String::as_str) == Some(no_barcode_infix()))
{
bad_value.state = TomlValueState::new_validation_failed(format!(
"Must not be '{}'",
no_barcode_infix()
));
bad_value.help = Some(
"Choose a different name for your barcode, \
this one is reserved for not-matched reads."
.to_string(),
);
return Ok(());
}
if values.map.is_empty()
&& (self.from_file.as_ref().is_none()
|| matches!(self.from_file.as_ref(), Some(None)))
{
return Err(ValidationFailure::new(
"Empty barcode section",
Some(
"Add at least one barcode mapping (IUPAC='name') under this section,\n\
or read them from a file using from_file={filename='...'}",
),
));
}
}
let iupac_to_name: Option<IndexMap<BString, String>> =
if let Some(Some(from_file)) = self.from_file.as_ref() {
if let Some(barcode_to_name) = self.barcode_to_name.as_ref()
&& !barcode_to_name.map.is_empty()
{
let spans = vec![
(
self.from_file.span.clone(),
"Conflicts with inline barcodes".to_string(),
),
(
self.barcode_to_name.span.clone(),
"Conflicts with from_file".to_string(),
),
];
self.from_file.state = TomlValueState::Custom { spans };
self.from_file.help = Some(
"Remove either the manual barcode definition or the from_file section."
.to_string(),
);
None
} else {
let barcode_to_name = from_file.load_from_file(
parent
.input
.as_ref()
.and_then(|x| x.options.as_ref())
.and_then(|x| x.use_rapidgzip.as_ref())
.copied()
.unwrap_or(false),
)?;
Some(barcode_to_name)
}
} else if let Some(barcode_to_name) = self.barcode_to_name.as_mut()
&& matches!(self.from_file.as_ref(), Some(None))
{
if validate_barcode_disjointness(barcode_to_name) {
Some(
barcode_to_name
.map
.iter()
.map(|(k, v)| (k.clone(), v.value.clone().expect("parent was ok")))
.collect(),
)
} else {
None
}
} else {
None
};
if let Some(iupac_to_name) = iupac_to_name {
let mut dna_to_name: IndexMap<BString, String> = IndexMap::new();
if !iupac_to_name.is_empty()
&& let first_len = iupac_to_name
.keys()
.next()
.map(|k| k.len())
.expect("just checked")
&& iupac_to_name
.iter()
.any(|(iupac, _)| iupac.len() != first_len)
{
let lengths: HashSet<_> = iupac_to_name.keys().map(|k| k.len()).collect();
let mut lengths: Vec<_> = lengths.into_iter().collect();
lengths.sort_unstable();
return Err(ValidationFailure::new(
"Barcodes of different lengths".to_string(),
Some(format!(
"All barcodes in one section must have the same length. Observed lengths: {lengths:?}"
)),
));
}
for (iupac, name) in &iupac_to_name {
for dna in IupacExpander::new(iupac.as_ref()) {
if let Some(old) = dna_to_name.insert(dna.clone(), name.clone())
&& old != *name
{
return Err(ValidationFailure::new(
format!(
"Overlapping sequences after iupac expansion: {dna} {name} -> {old}"
),
Some("Verify your input barcodes are disjoint.".to_string()),
));
}
}
}
self.barcode_to_name.value = Some(MapAndKeys {
keys: vec![],
map: IndexMap::new(),
});
self.barcode_to_name.state = TomlValueState::Ok;
self.seq_to_name = Some(Arc::new(dna_to_name));
}
Ok(())
}
}
#[mutants::skip] fn validate_barcode_disjointness(barcodes: &mut MapAndKeys<BString, String>) -> bool {
struct OverlapPair {
index_i: usize,
index_j: usize,
spans: Vec<(std::ops::Range<usize>, String)>,
}
let mut overlapping_pairs: Vec<OverlapPair> = Vec::new();
for index_i in 0..barcodes.keys.len() {
for index_j in (index_i + 1)..barcodes.keys.len() {
if let Some(dna_a) = barcodes.keys[index_i].value.as_ref()
&& let Some(dna_b) = barcodes.keys[index_j].value.as_ref()
&& let Some(barcode_name_a) = barcodes
.map
.get(bstr::BStr::new(dna_a))
.and_then(|x| x.as_ref())
&& let Some(barcode_name_b) = barcodes
.map
.get(bstr::BStr::new(dna_b))
.and_then(|x| x.as_ref())
&& barcode_name_a != barcode_name_b
&& iupac_overlapping(dna_a.as_bytes(), dna_b.as_bytes())
{
let spans = vec![
(
barcodes.keys[index_i].span(),
format!("Overlaps with {dna_b}"),
),
(
barcodes.keys[index_j].span(),
format!("Overlaps with {dna_a}"),
),
];
overlapping_pairs.push(OverlapPair {
index_i,
index_j,
spans,
});
}
}
}
let mut assigned: std::collections::HashSet<usize> = std::collections::HashSet::new();
let mut err = false;
for op in overlapping_pairs {
let error_idx = if assigned.contains(&op.index_i) {
op.index_j
} else {
op.index_i
};
barcodes.keys[error_idx].state = TomlValueState::Custom { spans: op.spans };
barcodes.keys[error_idx].help =
Some("IUPAC patterns overlap, but lead to different barcodes.".to_string());
assigned.insert(error_idx);
err = true;
}
!err
}