1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
use std::cmp::Ordering;
use std::io::{Read, Write};

use anyhow::Result;

use crate::trie_array::TrieArray;

/// Spece-efficient implementation of [`TrieArray`] with Elias-Fano encording.
#[derive(Default)]
pub struct EliasFanoTrieArray {
    token_ids: sucds::EliasFano,
    pointers: sucds::EliasFano,
}

impl TrieArray for EliasFanoTrieArray {
    fn build(token_ids: Vec<usize>, pointers: Vec<usize>) -> Box<Self> {
        if token_ids.is_empty() {
            return Box::new(Self::default());
        }

        let token_ids = Self::build_token_sequence(token_ids, &pointers);
        let pointers = Self::build_pointers(pointers);

        Box::new(Self {
            token_ids,
            pointers,
        })
    }

    fn serialize_into<W>(&self, mut writer: W) -> Result<usize>
    where
        W: Write,
    {
        Ok(self.token_ids.serialize_into(&mut writer)?
            + self.pointers.serialize_into(&mut writer)?)
    }

    fn deserialize_from<R>(mut reader: R) -> Result<Box<Self>>
    where
        R: Read,
    {
        let token_ids = sucds::EliasFano::deserialize_from(&mut reader)?;
        let pointers = sucds::EliasFano::deserialize_from(&mut reader)?;
        Ok(Box::new(Self {
            token_ids,
            pointers,
        }))
    }

    fn size_in_bytes(&self) -> usize {
        self.token_ids.size_in_bytes() + self.pointers.size_in_bytes()
    }

    fn memory_statistics(&self) -> serde_json::Value {
        let token_ids = self.token_ids.size_in_bytes();
        let pointers = self.pointers.size_in_bytes();
        serde_json::json!({
            "token_ids": token_ids,
            "pointers": pointers,
        })
    }

    /// Gets the token id with a given index.
    fn token_id(&self, i: usize) -> usize {
        let pos = self.pointers.rank(i + 1) - 1;
        let (b, _) = self.range(pos);
        let base = if b == 0 {
            0
        } else {
            self.token_ids.select(b - 1)
        };
        self.token_ids.select(i) - base
    }

    #[inline(always)]
    fn range(&self, pos: usize) -> (usize, usize) {
        (self.pointers.select(pos), self.pointers.select(pos + 1))
    }

    /// Searches for an element within a given range, returning its index.
    /// TODO: Make faster
    #[inline(always)]
    fn find_token(&self, pos: usize, id: usize) -> Option<usize> {
        let (b, e) = self.range(pos);
        let base = if b == 0 {
            0
        } else {
            self.token_ids.select(b - 1)
        };
        for i in b..e {
            let token_id = self.token_ids.select(i) - base;
            match token_id.cmp(&id) {
                Ordering::Equal => return Some(i),
                Ordering::Greater => break,
                _ => {}
            }
        }
        None
    }

    fn num_tokens(&self) -> usize {
        self.token_ids.len()
    }

    fn num_pointers(&self) -> usize {
        self.pointers.len()
    }
}

impl EliasFanoTrieArray {
    fn build_token_sequence(mut token_ids: Vec<usize>, pointers: &[usize]) -> sucds::EliasFano {
        assert_eq!(token_ids.len(), *pointers.last().unwrap());

        let mut sampled_id = 0;
        for i in 0..pointers.len() - 1 {
            let (b, e) = (pointers[i], pointers[i + 1]);
            debug_assert!(b <= e);

            for token_id in token_ids.iter_mut().take(e).skip(b) {
                *token_id += sampled_id;
            }
            if e != 0 {
                sampled_id = token_ids[e - 1];
            }
        }

        let mut token_efb = sucds::EliasFanoBuilder::new(sampled_id + 1, token_ids.len()).unwrap();
        token_efb.append(&token_ids).unwrap();
        sucds::EliasFano::new(token_efb, false)
    }

    fn build_pointers(pointers: Vec<usize>) -> sucds::EliasFano {
        let mut pointer_efb =
            sucds::EliasFanoBuilder::new(pointers.last().unwrap() + 1, pointers.len()).unwrap();
        pointer_efb.append(&pointers).unwrap();
        sucds::EliasFano::new(pointer_efb, true)
    }
}