suffix 0.1.1

Suffix arrays and suffix trees.
use {SuffixArray, SuffixTree, Node, Rawlink};

pub fn to_suffix_tree<'s>(sa: &'s SuffixArray) -> SuffixTree<'s> {
    let mut st = SuffixTree::init(&*sa.text);
    let mut last: *mut Node = &mut *st.root;
    for (i, &sufstart) in sa.indices.iter().enumerate() {
        let lcp_len = sa.lcp_lens[i];
        let vins = ancestor_lcp_len(unsafe { &mut *last }, lcp_len);
        let dv = vins.root_concat_len();
        if dv == lcp_len {
            // The concatenation of the labels from root-to-vins equals
            // the longest common prefix of SA[i-1] and SA[i].
            // This means that the suffix we're adding contains the
            // entirety of `vins`, which in turn means we can simply
            // add it as a new leaf.

            let mut node = box Node::new(sf!(sa.text, sufstart + lcp_len));
            let first_char = node.key();

            // TODO: I don't yet understand why this invariant is true,
            // but it has to be---otherwise the algorithm is flawed. ---AG
            assert!(!vins.children.contains_key(&first_char));

            last = &mut *node;
            node.parent = Rawlink::some(vins);
            node.terminal = true;
            vins.children.insert(first_char, node);
        } else if dv < lcp_len {
            // In this case, `vins`'s right-most child overlaps with the
            // suffix we're trying to insert. So we need to:
            //   1) Cut the right-most edge (but keep the node).
            //   2) Create a new internal node whose path-label is the LCP.
            //   3) Attach the old node to the new internal node. Its
            //      label should be what it was before, but with the LCP
            //      removed.
            //   4) Add a new leaf to this internal node containing the
            //      current suffix with the LCP removed.

            // Why is this invariant true?
            // Well, the only way this can't be true is if `vins` is
            // `last`, which is the last leaf that was added. (If `vins`
            // isn't `last`, then `vins` must be a parent of `last`, which
            // implies it has a child.)
            // But we also know that the current suffix is > than the last
            // suffix inserted, which means the lcp of the last suffix and
            // this suffix can be at *most* len(last). Therefore, when
            // `vins` is `last`, we have that `len(last) >= len(lcp)`,
            // which implies that `len(last)` (== `dv`) can never be less
            // than `len(lcp)` (== `lcp_len`).
            assert!(vins.children.len() > 0);
            // Thus, we can pick the right-most child with impunity.
            let rkey = *vins.children.keys().next_back().unwrap();

            // 1) cut the right-most edge
            let mut rnode = vins.children.remove(&rkey).unwrap();

            // 2) create new internal node (full path label == LCP)
            let mut int_node = box Node::new(
                s!(sa.text, sa.indices[i-1] + dv, sa.indices[i-1] + lcp_len));
            int_node.parent = Rawlink::some(vins);
            int_node.terminal = false;

            // 3) Attach old node to new internal node and update
            // the label.
            rnode.label =
                s!(sa.text,
                   sa.indices[i-1] + lcp_len,
                   sa.indices[i-1] + rnode.root_concat_len());
            rnode.parent = Rawlink::some(&mut *int_node);

            // 4) Create new leaf node with the current suffix, but with
            // the lcp trimmed.
            let mut leaf = box Node::new(sf!(sa.text, sufstart + lcp_len));
            last = &mut *leaf;
            leaf.parent = Rawlink::some(&mut *int_node);
            leaf.terminal = true;

            // Finally, attach all of the nodes together.
            assert!(rnode.key() != leaf.key()); // why is this true? ---AG
            int_node.children.insert(rnode.key(), rnode);
            int_node.children.insert(leaf.key(), leaf);
            vins.children.insert(int_node.key(), int_node);
        } else {
            unreachable!()
        }
    }
    st
}

fn ancestor_lcp_len<'a, 's>(start_node: &'a mut Node<'s>, lcp_len: u32)
                           -> &'a mut Node<'s> {
    let mut cur = start_node;
    loop {
        if cur.root_concat_len() <= lcp_len {
            return cur;
        }
        match cur.parent.resolve() {
            // We've reached the root, so we have no choice but to use it.
            None => break,
            Some(p) => { cur = p; }
        }
    }
    cur
}

#[cfg(test)]
mod tests {
    use array_naive;

    #[test]
    fn basic() {
        let sa = array_naive(format!("banana"));
        debug!("{}", sa);
        let st = sa.to_suffix_tree();
        debug!("{}", st);
    }

    #[test]
    fn basic2() {
        let sa = array_naive(format!("apple"));
        debug!("{}", sa);
        let st = sa.to_suffix_tree();
        debug!("{}", st);
    }

    #[test]
    fn basic3() {
        let sa = array_naive(format!("mississippi"));
        debug!("{}", sa);
        let st = sa.to_suffix_tree();
        debug!("{}", st);
    }
}