tsalign 2.1.2

A sequence-to-sequence aligner that accounts for template switches
Documentation
use compact_genome::interface::{
    alphabet::{Alphabet, AlphabetCharacter},
    sequence::GenomeSequence,
};
use lib_ts_chainalign::{
    chain_align::performance_parameters::AlignmentPerformanceParameters,
    chaining_lower_bounds::ChainingLowerBounds, costs::AlignmentCosts,
};
use lib_tsalign::a_star_aligner::alignment_geometry::AlignmentRange;
use log::{debug, info, warn};

use std::{fmt::Debug, fs::File, path::PathBuf};

use crate::{
    align::Cli,
    util::{infer_tschain_k, infer_tschain_max_n, load_tsa_config, tschain_preprocess_cache_file},
};

pub fn align_a_star_chain_ts<
    AlphabetType: Alphabet + Debug + Clone + Eq,
    SubsequenceType: GenomeSequence<AlphabetType, SubsequenceType> + ?Sized,
>(
    cli: Cli,
    reference: &SubsequenceType,
    query: &SubsequenceType,
    range: AlignmentRange,
    reference_name: &str,
    query_name: &str,
) {
    let alignment_costs = load_tsa_config::<AlphabetType, _>(&cli.configuration_directory).unwrap();
    let alignment_costs: AlignmentCosts<_> = alignment_costs.into();

    let cache_directory = cli.cache_directory.clone().unwrap_or_else(|| {
        warn!("No cache directory specified, dropping files into current working directory.");
        PathBuf::new()
    });
    let max_n = infer_tschain_max_n(reference.len(), query.len());
    let k = if let Some(k) = cli.k {
        k
    } else {
        infer_tschain_k(reference.len(), query.len()).unwrap()
    };
    debug!("Using max_n = {max_n}");
    info!("Using k = {k}");
    let max_match_run = k - 1;
    let cache_file =
        tschain_preprocess_cache_file(&alignment_costs, &cache_directory, k, max_n).unwrap();

    let chaining_lower_bounds = if let Ok(mut file) = File::open(&cache_file) {
        info!("Loading preprocessed data from cache at {cache_file:?}");
        let chaining_lower_bounds = ChainingLowerBounds::read(&mut file).unwrap();
        assert_eq!(chaining_lower_bounds.alignment_costs(), &alignment_costs);
        assert_eq!(chaining_lower_bounds.max_match_run(), max_match_run);
        chaining_lower_bounds
    } else {
        assert!(
            !cli.force_no_preprocessing,
            "Forcing no preprocessing, but no cache file found."
        );

        info!("Preprocessing...");
        let chaining_lower_bounds =
            lib_ts_chainalign::preprocess(max_n, max_match_run, alignment_costs);

        info!("Storing preprocessed data into cache at {cache_file:?}");
        let mut file = File::create(&cache_file).unwrap();
        chaining_lower_bounds.write(&mut file).unwrap();
        chaining_lower_bounds
    };

    let reference = reference.clone_as_vec();
    let query = query.clone_as_vec();
    let performance_parameters = AlignmentPerformanceParameters {
        max_successors: cli.max_chaining_successors,
        max_exact_cost_function_cost: cli.max_exact_cost_function_cost.into(),
        closed_list: cli.chaining_closed_list,
        open_list: cli.chaining_open_list,
    };

    let alignment = lib_ts_chainalign::align::<AlphabetType>(
        reference,
        query,
        range,
        &performance_parameters,
        &|c| {
            AlphabetType::character_to_ascii(
                AlphabetType::ascii_to_character(c).unwrap().complement(),
            )
        },
        reference_name,
        query_name,
        &chaining_lower_bounds,
    );
    info!("Finished aligning");

    if let Some(output) = cli.output {
        info!("Outputting alignment statistics to {output:?}");
        use std::io::Write;
        let mut output = std::io::BufWriter::new(std::fs::File::create(output).unwrap());
        write!(output, "{}", toml::to_string(&alignment).unwrap()).unwrap();
    }

    println!("{alignment}");
}