sassy 0.2.1

Approximate string matching using SIMD
Documentation
//! # Sassy: fast approximate string matching
//!
//! Sassy is a library for searching approximate matches of short patterns/queries in longer texts.
//! It supports ASCII and DNA, and works best for patterns of length up to 1000.
//!
//! The main entrypoint is the [`Searcher`] object.
//! This can be configured with the alphabet ([`profiles::Ascii`], [`profiles::Dna`] (may panic on `N`), or [`profiles::Iupac`]),
//! whether to search the reverse complement ([`Searcher::new_fwd`], [`Searcher::new_rc`]),
//! and optionally with an _overhang cost_ for IUPAC profiles ([`Searcher::new_fwd_with_overhang`]).
//!
//! Given a [`Searcher`], you can search call [`Searcher::search`] with a pattern, text, and maximum edit distance `k`.
//! This will return a vector of [`Match`] objects, that each contain the substring of text they match, the corresponding `cost`,
//! and the `cigar` string that describes the alignment.
//!
//! ## `search` vs `search_all`
//!
//! Sassy faithfully implements 'Approximate String Matching' as defined by Navarro (2001):
//! it finds all positions in the text where an alignment of the pattern with cost <= `k` ends,
//! and returns those positions, with a traceback/alignment for each. This is covered by the [`Searcher::search_all`] mode.
//!
//! In practice, this can return a lot of overlapping matches. For example,
//! searching `ABC` in `XXXABCXXX` aligns with cost `333210123` after every
//! character. If we have `k=1`, `search_all` will return 3 matches with `AB` (cost 1), `ABC` (cost 0), and `ABCX` (cost 1).
//! [`Searcher::search`] only returns a match at the _rightmost_ position of each local minimum, which in this case is only `ABC`.
//!
//! See the paper (linked on [GitHub](https://github.com/RagnarGrootKoerkamp/sassy)) for details.
//!
//! ## Reverse complements
//!
//! When searching reverse complements, by default Sassy searches the pattern in both the forward and reverse-complement text.
//! Only Sassy2 ([`Searcher::search_encoded_patterns`]) instead searches the reverse complement of the pattern in the forward text.
//!
//! This has implications for the set of matches that is returned. By default,
//! there is at most 1 match per text _end_ position of each RC match, but in v2,
//! there is at most 1 match per text _start_ position of each RC match.
//!
//! ## Traceback
//!
//! Sassy uses a traceback that greedily prefers matches and substitutions, followed by deletions, followed by insertions.
//! Note that this does not guarantee to find an alignment with the minimal number of substitutions.
//!
//! In case more precise alignments are needed, e.g. via affine-cost alignments, it is recommended to run sassy with [`Searcher::without_trace`].
//! Then, each match only contains the end position in the text and pattern, and the user can manually run a backwards extension-alignment from that point.
//! In case of a reverse-complement match, in v1, the match in against `RC(text[text_start..])`, while in v2 (`search_encoded_patterns`),
//! the match is against `RC(pattern[pattern_start..])`.
//!
//! ## Example
//!
//! Usage example:
//! ```
//! use sassy::{
//!     CachedRev, Match, Searcher, Strand,
//!     profiles::{Dna, Iupac},
//! };
//!
//! // --- Test data ---
//! let pattern = b"ATCG";
//! let text = b"CCCATCACCC";
//! let k = 1;
//!
//! // --- FWD only search ---
//! /*
//!     CCCATCACCC (text)
//!        |||x
//!        ATCG    (pattern)
//! */
//! let mut searcher = Searcher::<Dna>::new_fwd();
//! let matches = searcher.search(pattern, &text, k);
//!
//! assert_eq!(matches.len(), 1);
//! let fwd_match = matches[0].clone();
//! assert_eq!(fwd_match.pattern_start, 0);
//! assert_eq!(fwd_match.pattern_end, 4);
//! assert_eq!(fwd_match.text_start, 3);
//! assert_eq!(fwd_match.text_end, 7);
//! assert_eq!(fwd_match.cost, 1);
//! assert_eq!(fwd_match.strand, Strand::Fwd);
//! assert_eq!(fwd_match.cigar.to_string(), "3=1X");
//!
//! // --- FWD + RC search ---
//! /*
//!     CCCATCACCC (text)       GGGTGATGGG (text - rc)
//!        |||x                      ||x|
//!        ATCG    (pattern)         ATCG
//! */
//! let mut searcher = Searcher::<Dna>::new_rc();
//! // We can cache the reverse text if we do multiple pattern
//! // searches in `rc` mode
//! let cached_text = CachedRev::new(text, true);
//! let matches = searcher.search(pattern, &cached_text, k);
//!
//! // Gives two matches, of which one the previous forward match
//! assert_eq!(matches.len(), 2);
//! assert_eq!(matches[0], fwd_match);
//! let rc_match = matches[1].clone();
//! assert_eq!(rc_match.pattern_start, 0);
//! assert_eq!(rc_match.pattern_end, 4);
//! assert_eq!(rc_match.text_start, 1);
//! assert_eq!(rc_match.text_end, 5);
//! assert_eq!(rc_match.cost, 1);
//! assert_eq!(rc_match.strand, Strand::Rc);
//! assert_eq!(rc_match.cigar.to_string(), "2=1X1=");
//!
//! // --- FWD + RC search with overhang ---
//! /*
//!               GTXXXNNN     (text)
//!               ||   |||
//!             ACGT   ACGT    (pattern)
//! */
//! let pattern = b"ACGT";
//! let text = b"GTXXXNNN";
//! let alpha = 0.5;
//! let k = 1;
//! let mut searcher = Searcher::<Iupac>::new_fwd_with_overhang(alpha);
//! let matches = searcher.search(pattern, &text, k);
//!
//! assert_eq!(matches[0].pattern_start, 2);
//! assert_eq!(matches[0].pattern_end, 4);
//! assert_eq!(matches[0].text_start, 0);
//! assert_eq!(matches[0].text_end, 2);
//! assert_eq!(matches[0].cost, 1);
//! assert_eq!(matches[0].strand, Strand::Fwd);
//! assert_eq!(matches[0].cigar.to_string(), "2=");
//!
//! assert_eq!(matches[1].pattern_start, 0);
//! assert_eq!(matches[1].pattern_end, 3);
//! assert_eq!(matches[1].text_start, 5);
//! assert_eq!(matches[1].text_end, 8);
//! assert_eq!(matches[1].cost, 0);
//! assert_eq!(matches[1].strand, Strand::Fwd);
//! assert_eq!(matches[1].cigar.to_string(), "3=");
//! ```

// INTERNAL MODS
mod bitpacking;
mod delta_encoding;
mod minima;
#[cfg(feature = "cli")]
pub mod pretty_print;
mod search;
mod trace;

// (PARTIALLY) PUBLIC MODS

mod pattern_tiling;
pub mod profiles;
pub use pattern_tiling::general::EncodedPatterns;
pub use search::CachedRev;
pub use search::Match;
pub use search::RcSearchAble;
pub use search::Searcher;
pub use search::Strand;

#[doc(hidden)]
pub mod private {
    pub use crate::trace::{CostMatrix, fill, get_trace, simd_fill};
}

// BINDINGS

#[cfg(feature = "python")]
mod python;

#[cfg(feature = "c")]
mod c;

// TYPEDEFS

#[cfg(not(feature = "avx512"))]
const LANES: usize = 4;
#[cfg(not(feature = "avx512"))]
type S = wide::u64x4;

#[cfg(feature = "avx512")]
const LANES: usize = 8;
#[cfg(feature = "avx512")]
type S = wide::u64x8;

/// Print info on CPU features and speed of searching.
#[cfg(feature = "diagnostics")]
pub fn test_cpu_features() {
    eprintln!("CPU features during compilation and runtime:");
    #[cfg(target_arch = "x86_64")]
    {
        eprintln!("Target architecture: x86_64");

        let sse = if is_x86_feature_detected!("sse") {
            "+"
        } else {
            "-"
        };
        #[cfg(target_feature = "sse")]
        eprintln!("SSE  + {sse}");
        #[cfg(not(target_feature = "sse"))]
        eprintln!("SSE  - {sse}");

        let avx2 = if is_x86_feature_detected!("avx2") {
            "+"
        } else {
            "-"
        };
        #[cfg(target_feature = "avx")]
        eprintln!("AVX2 + {avx2}");
        #[cfg(not(target_feature = "avx"))]
        eprintln!("AVX2 - {avx2}");

        let bmi2 = if is_x86_feature_detected!("bmi2") {
            "+"
        } else {
            "-"
        };
        #[cfg(target_feature = "bmi2")]
        eprintln!("BMI2 + {bmi2}");
        #[cfg(not(target_feature = "bmi2"))]
        eprintln!("BMI2 - {bmi2}");
    }
    #[cfg(target_arch = "aarch64")]
    {
        use std::arch::is_aarch64_feature_detected;

        eprintln!("Target architecture: aarch64");

        let neon = if is_aarch64_feature_detected!("neon") {
            "+"
        } else {
            "-"
        };
        #[cfg(target_feature = "neon")]
        eprintln!("NEON + {neon}");
        #[cfg(not(target_feature = "neon"))]
        eprintln!("NEON - {neon}");
    }
    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
    {
        eprintln!("Unsupported target architecture");
    }
}

/// Print throughput in GB/s for random pattern and text.
#[cfg(feature = "diagnostics")]
pub fn test_throughput() {
    eprintln!("Running a little test: aligning a 23bp pattern against 100kb text, with 1 error.");
    eprintln!("With AVX2, this is typically around 2GB/s. Without, closer to 1.3GB/s.");
    eprintln!("If you see 0.02GB/s, that means you're on a debug rather than release build.");

    use rand::Rng;
    let n = 100000;
    let m = 23;
    let k = 1;

    let mut rng = rand::rng();
    let text: Vec<u8> = (0..n).map(|_| b"ACGT"[rng.random_range(0..4)]).collect();
    let pattern: Vec<u8> = (0..m).map(|_| b"ACGT"[rng.random_range(0..4)]).collect();

    let mut searcher = Searcher::<profiles::Iupac>::new_fwd();
    let start = std::time::Instant::now();
    let _matches = searcher.search(&pattern, &text, k);
    let duration = start.elapsed();
    eprintln!(
        "Search throughput in GB/s: {}",
        text.len() as f32 / duration.as_secs_f32() / 1_000_000_000.0
    );
}

#[cfg(test)]
#[cfg(feature = "diagnostics")]
mod test {
    #[test]
    fn test_cpu_features() {
        super::test_cpu_features();
    }
    #[test]
    fn test_throughput() {
        super::test_throughput();
    }
}