extern crate bytepack;
mod error;
pub mod hashmatch;
pub mod treematch;
use error::AlgoSpecError;
use hashmatch::HashMatchIterator;
use treematch::TreeMatchIterator;
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct Match {
pub first_pos: usize,
pub second_pos: usize,
pub length: usize,
}
impl PartialOrd for Match {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
impl Ord for Match {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
(self.length, self.first_pos, self.second_pos).cmp(&(
other.length,
other.first_pos,
other.second_pos,
))
}
}
impl Match {
pub fn new(first_pos: usize, second_pos: usize, length: usize) -> Match {
Match {
first_pos,
second_pos,
length,
}
}
pub fn first_end(&self) -> usize {
self.first_pos + self.length
}
pub fn second_end(&self) -> usize {
self.second_pos + self.length
}
}
#[derive(Clone, Copy, Debug)]
pub enum AlgoSpec {
HashMatch(usize),
TreeMatch(usize),
}
pub struct MatchIterator<'a> {
iter: Box<dyn Iterator<Item = Match> + 'a>,
}
impl<'a> MatchIterator<'a> {
pub fn new(first: &'a [u8], second: &'a [u8], algo_spec: AlgoSpec) -> Result<MatchIterator<'a>, AlgoSpecError> {
Ok(MatchIterator {
iter: match algo_spec {
AlgoSpec::TreeMatch(mml) => Box::new(TreeMatchIterator::new(first, second, mml)),
AlgoSpec::HashMatch(1) => Box::new(HashMatchIterator::<u8>::new(first, second)),
AlgoSpec::HashMatch(2) => Box::new(HashMatchIterator::<u16>::new(first, second)),
AlgoSpec::HashMatch(3) => {
Box::new(HashMatchIterator::<[u8; 3]>::new(first, second))
}
AlgoSpec::HashMatch(4) => Box::new(HashMatchIterator::<u32>::new(first, second)),
AlgoSpec::HashMatch(5) => {
Box::new(HashMatchIterator::<[u8; 5]>::new(first, second))
}
AlgoSpec::HashMatch(6) => {
Box::new(HashMatchIterator::<[u16; 3]>::new(first, second))
}
AlgoSpec::HashMatch(7) => {
Box::new(HashMatchIterator::<[u8; 7]>::new(first, second))
}
AlgoSpec::HashMatch(8) => Box::new(HashMatchIterator::<u64>::new(first, second)),
AlgoSpec::HashMatch(10) => {
Box::new(HashMatchIterator::<[u16; 5]>::new(first, second))
}
AlgoSpec::HashMatch(12) => {
Box::new(HashMatchIterator::<[u32; 3]>::new(first, second))
}
AlgoSpec::HashMatch(14) => {
Box::new(HashMatchIterator::<[u16; 7]>::new(first, second))
}
AlgoSpec::HashMatch(16) => {
Box::new(HashMatchIterator::<[u64; 2]>::new(first, second))
}
AlgoSpec::HashMatch(20) => {
Box::new(HashMatchIterator::<[u32; 5]>::new(first, second))
}
AlgoSpec::HashMatch(24) => {
Box::new(HashMatchIterator::<[u64; 3]>::new(first, second))
}
AlgoSpec::HashMatch(28) => {
Box::new(HashMatchIterator::<[u32; 7]>::new(first, second))
}
AlgoSpec::HashMatch(32) => {
Box::new(HashMatchIterator::<[u64; 4]>::new(first, second))
}
AlgoSpec::HashMatch(40) => {
Box::new(HashMatchIterator::<[u64; 5]>::new(first, second))
}
AlgoSpec::HashMatch(48) => {
Box::new(HashMatchIterator::<[u64; 6]>::new(first, second))
}
AlgoSpec::HashMatch(56) => {
Box::new(HashMatchIterator::<[u64; 7]>::new(first, second))
}
AlgoSpec::HashMatch(64) => {
Box::new(HashMatchIterator::<[u64; 8]>::new(first, second))
}
AlgoSpec::HashMatch(mml) => return Err(AlgoSpecError::UnsupportedSize(mml)),
},
})
}
}
impl<'a> Iterator for MatchIterator<'a> {
type Item = Match;
#[inline]
fn next(&mut self) -> Option<Match> {
self.iter.next()
}
}
pub fn longest_common_substring<'a>(first: &'a [u8], second: &'a [u8], algo_spec: AlgoSpec) -> Result<Option<&'a [u8]>, AlgoSpecError> {
let match_iter = MatchIterator::new(first, second, algo_spec)?;
let longest = match_iter.max();
if let Some(longest) = longest {
return Ok(Some(&first[longest.first_pos..longest.first_end()]))
} else {
return Ok(None)
}
}
pub fn longest_common_substrings(
first: &[u8],
second: &[u8],
algo_spec: AlgoSpec,
number: usize,
) -> Result<Vec<Match>, AlgoSpecError> {
let match_iter = MatchIterator::new(first, second, algo_spec)?;
let mut top = Vec::<Match>::with_capacity(number + 1);
let mut threshold = 0;
for m in match_iter {
if m.length > threshold {
let mut insert_pos = 0;
while insert_pos < top.len() && top[insert_pos].length > m.length {
insert_pos += 1;
}
top.insert(insert_pos, m);
if top.len() > number {
top.truncate(number);
threshold = top.last().unwrap().length;
}
}
}
return Ok(top);
}
pub fn patch_set(first: &[u8], second: &[u8], algo_spec: AlgoSpec) -> Result<Vec<Match>, AlgoSpecError> {
let mut match_iter = MatchIterator::new(first, second, algo_spec)?;
let mut patches = Vec::<Match>::new();
if let Some(m) = match_iter.next() {
patches.push(m);
}
for mut m in match_iter {
let last = patches.len() - 1;
if m.second_end() > patches[last].second_end() {
if m.second_pos == patches[last].second_pos {
patches[last] = m;
}
else if m.second_pos < patches[last].second_pos {
let overlap = patches[last].second_pos - m.second_pos;
m.first_pos += overlap;
m.second_pos += overlap;
m.length -= overlap;
patches[last] = m;
}
else if m.second_pos > patches[last].second_pos
&& m.second_pos < patches[last].second_end()
{
let overlap = patches[last].second_end() - m.second_pos;
m.first_pos += overlap;
m.second_pos += overlap;
m.length -= overlap;
patches.push(m);
}
else {
patches.push(m);
}
}
}
return Ok(patches);
}
pub fn unique_strings(first: &[u8], second: &[u8], algo_spec: AlgoSpec) -> Result<Vec<(usize, usize)>, AlgoSpecError> {
let match_iter = MatchIterator::new(first, second, algo_spec)?;
let mut uniques = Vec::<(usize, usize)>::new();
let mut covered = 0;
for m in match_iter {
if m.second_pos > covered {
uniques.push((covered, m.second_pos));
}
if m.second_end() > covered {
covered = m.second_end();
}
}
if covered < second.len() {
uniques.push((covered, second.len()));
}
return Ok(uniques);
}
#[cfg(test)]
mod test {
extern crate rand;
use longest_common_substring;
use longest_common_substrings;
use patch_set;
use treematch::SuffixTree;
use unique_strings;
use AlgoSpec;
const ALGO_SPECS_4: &'static [AlgoSpec] = &[
AlgoSpec::HashMatch(1),
AlgoSpec::HashMatch(2),
AlgoSpec::HashMatch(3),
AlgoSpec::HashMatch(4),
AlgoSpec::TreeMatch(1),
AlgoSpec::TreeMatch(2),
AlgoSpec::TreeMatch(3),
AlgoSpec::TreeMatch(4),
];
const ALGO_SPECS_8: &'static [AlgoSpec] = &[
AlgoSpec::HashMatch(1),
AlgoSpec::HashMatch(2),
AlgoSpec::HashMatch(4),
AlgoSpec::HashMatch(8),
AlgoSpec::TreeMatch(1),
AlgoSpec::TreeMatch(2),
AlgoSpec::TreeMatch(4),
AlgoSpec::TreeMatch(8),
];
#[test]
fn lcs() {
let a = "abcdefghijklmnopqrstuvwxyz";
let b = "rstufghijklmnopqvwxyzabcde";
for algo_spec in ALGO_SPECS_8 {
let m = longest_common_substring(a.as_bytes(), b.as_bytes(), *algo_spec).unwrap().unwrap();
assert_eq!(m, "fghijklmnopq".as_bytes());
}
}
#[test]
fn lcss() {
let a = "abcdefghijklmnopqrstuvwxyz";
let b = "rstufghijklmnopqvwxyzabcde";
for algo_spec in ALGO_SPECS_4 {
let ms = longest_common_substrings(a.as_bytes(), b.as_bytes(), *algo_spec, 10).unwrap();
assert!(ms.len() == 4);
assert!(ms[0].first_pos == 5);
assert!(ms[0].second_pos == 4);
assert!(ms[0].length == 12);
assert!(ms[1].first_pos == 0);
assert!(ms[1].second_pos == 21);
assert!(ms[1].length == 5);
assert!(ms[2].first_pos == 21);
assert!(ms[2].second_pos == 16);
assert!(ms[2].length == 5);
assert!(ms[3].first_pos == 17);
assert!(ms[3].second_pos == 0);
assert!(ms[3].length == 4);
}
}
#[test]
fn ps1() {
let a = "abcdefghijqrstuvwxyzfghijklmnopqr";
let b = "abcdefghijklmnopqrstuvwxyz";
for algo_spec in ALGO_SPECS_8 {
let ps = patch_set(a.as_bytes(), b.as_bytes(), *algo_spec).unwrap();
assert!(ps.len() == 3);
assert!(ps[0].first_pos == 0);
assert!(ps[0].second_pos == 0);
assert!(ps[0].length == 10);
assert!(ps[1].first_pos == 25);
assert!(ps[1].second_pos == 10);
assert!(ps[1].length == 8);
assert!(ps[2].first_pos == 12);
assert!(ps[2].second_pos == 18);
assert!(ps[2].length == 8);
}
}
#[test]
fn ps2() {
let a = "abcdefghijklmnhijklmnopqrstuopqrstuvwxyz";
let b = "abcdefghijklmnopqrstuvwxyz";
for algo_spec in ALGO_SPECS_8 {
let ps = patch_set(a.as_bytes(), b.as_bytes(), *algo_spec).unwrap();
assert!(ps.len() == 2);
assert!(ps[0].first_pos == 0);
assert!(ps[0].second_pos == 0);
assert!(ps[0].length == 14);
assert!(ps[1].first_pos == 28);
assert!(ps[1].second_pos == 14);
assert!(ps[1].length == 12);
}
}
#[test]
fn ps3() {
let a = "abcdefghijklhijklmnhijklmnopqrstuqrstuvwxyz";
let b = "abcdefghijklmnopqrstuvwxyz";
for algo_spec in ALGO_SPECS_4 {
let ps = patch_set(a.as_bytes(), b.as_bytes(), *algo_spec).unwrap();
assert!(ps.len() == 3);
assert!(ps[0].first_pos == 0);
assert!(ps[0].second_pos == 0);
assert!(ps[0].length == 12);
assert!(ps[1].first_pos == 24);
assert!(ps[1].second_pos == 12);
assert!(ps[1].length == 9);
assert!(ps[2].first_pos == 38);
assert!(ps[2].second_pos == 21);
assert!(ps[2].length == 5);
}
}
#[test]
fn us1() {
let a = "abcdefghijklmnopqrstuvwxyz";
let b = "abcdef01ghijklmnop3456qrstuvwxyz";
for algo_spec in ALGO_SPECS_4 {
let us = unique_strings(a.as_bytes(), b.as_bytes(), *algo_spec).unwrap();
assert!(us.len() == 2);
assert!(us[0].0 == 6);
assert!(us[0].1 == 8);
assert!(us[1].0 == 18);
assert!(us[1].1 == 22);
}
}
#[test]
fn us2() {
let a = "abcdefghijklmnopqrstuvwxyz";
let b = "01234";
for algo_spec in ALGO_SPECS_4 {
let us = unique_strings(a.as_bytes(), b.as_bytes(), *algo_spec).unwrap();
assert!(us.len() == 1);
assert!(us[0].0 == 0);
assert!(us[0].1 == 5);
}
}
#[test]
fn us3() {
let a = "abcdefghijklmnopqrstuvwxyz";
let b = "1234abcd5678";
for algo_spec in ALGO_SPECS_4 {
let us = unique_strings(a.as_bytes(), b.as_bytes(), *algo_spec).unwrap();
assert!(us.len() == 2);
assert!(us[0].0 == 0);
assert!(us[0].1 == 4);
assert!(us[1].0 == 8);
assert!(us[1].1 == 12);
}
}
#[test]
fn us4() {
let a = "abcdefghijklmnopqrstuvwxyz";
let b = "abcdefghaxelionijklmnopqrstuvwxyz";
let us = unique_strings(a.as_bytes(), b.as_bytes(), AlgoSpec::HashMatch(4)).unwrap();
assert!(us.len() == 1);
assert!(us[0].0 == 8);
assert!(us[0].1 == 15);
let us = unique_strings(a.as_bytes(), b.as_bytes(), AlgoSpec::HashMatch(1)).unwrap();
assert!(us.len() == 0);
}
#[test]
fn stree() {
let a = "ABABABC";
let stree = SuffixTree::new(a.as_bytes());
println!("{}", stree.to_graphviz(a.as_bytes()));
}
#[test]
fn random_compare() {
let a: Vec<u8> = (0..1000)
.map(|_| (rand::random::<u8>() % 2) + b'a')
.collect();
let b: Vec<u8> = (0..1000)
.map(|_| (rand::random::<u8>() % 2) + b'a')
.collect();
for mml in [2, 4, 8].iter() {
let ms1 = longest_common_substrings(&a, &b, AlgoSpec::HashMatch(*mml), 100).unwrap();
let ms2 = longest_common_substrings(&a, &b, AlgoSpec::TreeMatch(*mml), 100).unwrap();
assert!(ms1.len() == ms2.len());
for i in 0..ms1.len() {
assert!(ms1[i].second_pos == ms2[i].second_pos);
assert!(ms1[i].length == ms2[i].length);
}
}
}
#[test]
fn motif_compare() {
let mut motif = Vec::<u8>::new();
for i in 1..20 {
for _ in 0..i {
motif.push(0);
}
motif.push(i as u8);
}
for mml in [2, 4, 8].iter() {
let ms1 = longest_common_substrings(&motif, &motif, AlgoSpec::HashMatch(*mml), 100).unwrap();
let ms2 = longest_common_substrings(&motif, &motif, AlgoSpec::TreeMatch(*mml), 100).unwrap();
assert!(ms1.len() == ms2.len());
for i in 0..ms1.len() {
assert!(ms1[i].second_pos == ms2[i].second_pos);
assert!(ms1[i].length == ms2[i].length);
}
}
}
}