general-sam 1.0.3

A general suffix automaton implementation in Rust
Documentation
//! A general suffix automaton implementation.

mod state;
pub use state::GeneralSamState;

use std::convert::Infallible;

use crate::{
    ConstructiveTransitionTable, IterAsChain, TransitionTable, TravelEvent, TrieNodeAlike,
};

pub type GeneralSamNodeID = usize;
pub const SAM_NIL_NODE_ID: GeneralSamNodeID = 0;
pub const SAM_ROOT_NODE_ID: GeneralSamNodeID = 1;

#[derive(Clone, Debug)]
pub struct GeneralSamNode<TransTable: TransitionTable> {
    trans: TransTable,
    len: usize,
    link: GeneralSamNodeID,
    accept: bool,
}

/// A general suffix automaton.
#[derive(Clone, Debug)]
pub struct GeneralSam<TransTable: TransitionTable> {
    node_pool: Vec<GeneralSamNode<TransTable>>,
    topo_and_suf_len_sorted_order: Vec<GeneralSamNodeID>,
}

impl<TransTable: ConstructiveTransitionTable> GeneralSamNode<TransTable> {
    fn new(accept: bool, len: usize, link: GeneralSamNodeID) -> Self {
        Self {
            trans: Default::default(),
            accept,
            len,
            link,
        }
    }
}

impl<TransTable: TransitionTable> GeneralSamNode<TransTable> {
    pub fn is_accepting(&self) -> bool {
        self.accept
    }

    pub fn max_suffix_len(&self) -> usize {
        self.len
    }

    pub fn get_suffix_parent_id(&self) -> GeneralSamNodeID {
        self.link
    }

    pub fn get_trans(&self) -> &TransTable {
        &self.trans
    }

    fn alter_trans_table<NewTableType: TransitionTable<KeyType = TransTable::KeyType>>(
        &self,
    ) -> GeneralSamNode<NewTableType> {
        GeneralSamNode {
            trans: NewTableType::from_kv_iter(self.trans.iter()),
            accept: self.accept,
            len: self.len,
            link: self.link,
        }
    }
}

impl<TransTable: ConstructiveTransitionTable<KeyType = u8>> GeneralSam<TransTable> {
    pub fn from_bytes<S: AsRef<[u8]>>(s: S) -> Self {
        let iter = IterAsChain::from(s.as_ref().iter().copied());
        Self::from_trie(iter)
    }
}

impl<TransTable: ConstructiveTransitionTable<KeyType = u32>> GeneralSam<TransTable> {
    pub fn from_utf32<S: AsRef<[u32]>>(s: S) -> Self {
        let iter = IterAsChain::from(s.as_ref().iter().copied());
        Self::from_trie(iter)
    }
}

impl<TransTable: ConstructiveTransitionTable<KeyType = char>> GeneralSam<TransTable> {
    pub fn from_chars<S: AsRef<str>>(s: S) -> Self {
        let iter = IterAsChain::from(s.as_ref().chars());
        Self::from_trie(iter)
    }
}

impl<TransTable: ConstructiveTransitionTable> Default for GeneralSam<TransTable> {
    fn default() -> Self {
        Self {
            node_pool: vec![
                GeneralSamNode::new(false, 0, SAM_NIL_NODE_ID),
                GeneralSamNode::new(true, 0, SAM_NIL_NODE_ID),
            ],
            topo_and_suf_len_sorted_order: Default::default(),
        }
    }
}

impl<TransTable: TransitionTable> GeneralSam<TransTable> {
    pub fn num_of_nodes(&self) -> usize {
        self.node_pool.len()
    }

    pub fn get_root_node(&self) -> &GeneralSamNode<TransTable> {
        self.get_node(SAM_ROOT_NODE_ID).unwrap()
    }

    pub fn get_node(&self, node_id: GeneralSamNodeID) -> Option<&GeneralSamNode<TransTable>> {
        self.node_pool.get(node_id)
    }

    pub fn get_root_state(&self) -> GeneralSamState<TransTable, &GeneralSam<TransTable>> {
        self.get_state(SAM_ROOT_NODE_ID)
    }

    pub fn get_state(
        &self,
        node_id: GeneralSamNodeID,
    ) -> GeneralSamState<TransTable, &GeneralSam<TransTable>> {
        if node_id < self.node_pool.len() {
            GeneralSamState::new(self, node_id)
        } else {
            GeneralSamState::new(self, SAM_NIL_NODE_ID)
        }
    }

    /// Returns topological sorted, maximum suffix length sorted
    /// and suffix parent depth sorted node id sequence,
    /// which is generated by topological sorting with a queue.
    pub fn get_topo_and_suf_len_sorted_node_ids(&self) -> &Vec<GeneralSamNodeID> {
        &self.topo_and_suf_len_sorted_order
    }

    pub fn alter_trans_table<NewTableType: TransitionTable<KeyType = TransTable::KeyType>>(
        &self,
    ) -> GeneralSam<NewTableType> {
        GeneralSam {
            node_pool: self
                .node_pool
                .iter()
                .map(|x| x.alter_trans_table())
                .collect(),
            topo_and_suf_len_sorted_order: self.topo_and_suf_len_sorted_order.clone(),
        }
    }

    pub fn alter_trans_table_into<NewTableType: TransitionTable<KeyType = TransTable::KeyType>>(
        self,
    ) -> GeneralSam<NewTableType> {
        GeneralSam {
            node_pool: self
                .node_pool
                .iter()
                .map(|x| x.alter_trans_table())
                .collect(),
            topo_and_suf_len_sorted_order: self.topo_and_suf_len_sorted_order,
        }
    }
}

impl<TransTable: ConstructiveTransitionTable> GeneralSam<TransTable> {
    pub fn from_trie<TN: TrieNodeAlike>(node: TN) -> Self
    where
        TN::InnerType: Into<TransTable::KeyType>,
    {
        let mut sam = Self::default();

        let accept_empty_string = node.is_accepting();

        sam.build_with_trie(node);
        sam.topo_sort_with_queue();
        sam.update_accepting();

        sam.node_pool[SAM_ROOT_NODE_ID].accept = accept_empty_string;

        sam
    }

    fn build_with_trie<TN: TrieNodeAlike>(&mut self, node: TN)
    where
        TN::InnerType: Into<TransTable::KeyType>,
    {
        node.bfs_travel(|event| -> Result<GeneralSamNodeID, Infallible> {
            match event {
                TravelEvent::PushRoot(_) => Ok(SAM_ROOT_NODE_ID),
                TravelEvent::Push(cur_tn, cur_node_id, key) => {
                    Ok(self.insert_node_trans(*cur_node_id, key, cur_tn.is_accepting()))
                }
                TravelEvent::Pop(_, cur_node_id) => Ok(cur_node_id),
            }
        })
        .unwrap();
    }

    fn topo_sort_with_queue(&mut self) {
        let mut in_degree: Vec<usize> = vec![0; self.num_of_nodes()];

        self.node_pool.iter().for_each(|node| {
            node.trans.transitions().for_each(|v| {
                in_degree[*v] += 1;
            });
        });
        assert!(in_degree[SAM_ROOT_NODE_ID] == 0);

        self.topo_and_suf_len_sorted_order
            .reserve(self.node_pool.len());

        self.topo_and_suf_len_sorted_order.push(SAM_ROOT_NODE_ID);
        let mut head = 0;
        while head < self.topo_and_suf_len_sorted_order.len() {
            let u_id = self.topo_and_suf_len_sorted_order[head];
            head += 1;
            self.node_pool[u_id].trans.transitions().for_each(|v_id| {
                in_degree[*v_id] -= 1;
                if in_degree[*v_id] == 0 {
                    self.topo_and_suf_len_sorted_order.push(*v_id);
                }
            });
        }
    }

    fn update_accepting(&mut self) {
        self.topo_and_suf_len_sorted_order
            .iter()
            .rev()
            .for_each(|node_id| {
                let link_id = self.node_pool[*node_id].link;
                self.node_pool[link_id].accept |= self.node_pool[*node_id].accept;
            });
        self.node_pool[SAM_NIL_NODE_ID].accept = false;
    }

    fn alloc_node(&mut self, node: GeneralSamNode<TransTable>) -> GeneralSamNodeID {
        let id = self.node_pool.len();
        self.node_pool.push(node);
        id
    }

    fn insert_node_trans<Key: Into<TransTable::KeyType>>(
        &mut self,
        last_node_id: GeneralSamNodeID,
        key: Key,
        accept: bool,
    ) -> GeneralSamNodeID {
        let key: TransTable::KeyType = key.into();

        let new_node_id = {
            let last_node = &self.node_pool[last_node_id];
            self.alloc_node(GeneralSamNode::new(
                accept,
                last_node.len + 1,
                SAM_NIL_NODE_ID,
            ))
        };

        let mut p_node_id = last_node_id;
        while p_node_id != SAM_NIL_NODE_ID {
            let p_node = &mut self.node_pool[p_node_id];
            if p_node.trans.contains_key(&key) {
                break;
            }
            p_node.trans.insert(key.clone(), new_node_id);
            p_node_id = p_node.link;
        }

        if p_node_id == SAM_NIL_NODE_ID {
            self.node_pool[new_node_id].link = SAM_ROOT_NODE_ID;
            return new_node_id;
        }

        let q_node_id = *self.node_pool[p_node_id].trans.get(&key).unwrap();
        let q_node = &self.node_pool[q_node_id];
        if q_node.len == self.node_pool[p_node_id].len + 1 {
            self.node_pool[new_node_id].link = q_node_id;
            return new_node_id;
        }

        let clone_node_id = self.alloc_node(q_node.clone());
        self.node_pool[clone_node_id].len = self.node_pool[p_node_id].len + 1;
        while p_node_id != SAM_NIL_NODE_ID {
            let p_node = &mut self.node_pool[p_node_id];
            if let Some(t_node_id) = p_node.trans.get_mut(&key)
                && *t_node_id == q_node_id
            {
                *t_node_id = clone_node_id;
                p_node_id = p_node.link;
                continue;
            }
            break;
        }

        self.node_pool[new_node_id].link = clone_node_id;
        self.node_pool[q_node_id].link = clone_node_id;

        new_node_id
    }
}