tantivy_columnar/column/
dictionary_encoded.rs

1use std::ops::Deref;
2use std::sync::Arc;
3use std::{fmt, io};
4
5use sstable::{Dictionary, VoidSSTable};
6
7use crate::column::Column;
8use crate::RowId;
9
/// Dictionary encoded column.
///
/// The column simply gives access to a regular u64-column in which the
/// values are term ordinals.
///
/// These ordinals are ids that uniquely identify the bytes stored in the
/// column; [`BytesColumn::ord_to_bytes`] resolves an ordinal back to its
/// bytes through the dictionary. The ordinals are small.
///
/// NOTE(review): the previous wording said the ordinals are "sorted in the
/// same order as the term_ord_column"; presumably the intent is that
/// ordinals are ordered like the terms they identify — confirm.
#[derive(Clone)]
pub struct BytesColumn {
    /// Dictionary used to resolve a term ordinal to the term's bytes.
    pub(crate) dictionary: Arc<Dictionary<VoidSSTable>>,
    /// Column holding the term ordinals attached to each row.
    pub(crate) term_ord_column: Column<u64>,
}
23
24impl fmt::Debug for BytesColumn {
25    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
26        f.debug_struct("BytesColumn")
27            .field("term_ord_column", &self.term_ord_column)
28            .finish()
29    }
30}
31
32impl BytesColumn {
33    pub fn empty(num_docs: u32) -> BytesColumn {
34        BytesColumn {
35            dictionary: Arc::new(Dictionary::empty()),
36            term_ord_column: Column::build_empty_column(num_docs),
37        }
38    }
39
40    /// Fills the given `output` buffer with the term associated to the ordinal `ord`.
41    ///
42    /// Returns `false` if the term does not exist (e.g. `term_ord` is greater or equal to the
43    /// overll number of terms).
44    pub fn ord_to_bytes(&self, ord: u64, output: &mut Vec<u8>) -> io::Result<bool> {
45        self.dictionary.ord_to_term(ord, output)
46    }
47
48    /// Returns the number of rows in the column.
49    pub fn num_rows(&self) -> RowId {
50        self.term_ord_column.num_docs()
51    }
52
53    pub fn term_ords(&self, row_id: RowId) -> impl Iterator<Item = u64> + '_ {
54        self.term_ord_column.values_for_doc(row_id)
55    }
56
57    /// Returns the column of ordinals
58    pub fn ords(&self) -> &Column<u64> {
59        &self.term_ord_column
60    }
61
62    pub fn num_terms(&self) -> usize {
63        self.dictionary.num_terms()
64    }
65
66    pub fn dictionary(&self) -> &Dictionary<VoidSSTable> {
67        self.dictionary.as_ref()
68    }
69}
70
/// Newtype around a [`BytesColumn`] whose terms are read as strings.
///
/// Wrapping performs no check on the terms; UTF-8 validity is verified when a
/// term is read through [`StrColumn::ord_to_str`]. The wrapper derefs to
/// [`BytesColumn`], so the byte-level accessors remain available.
#[derive(Clone)]
pub struct StrColumn(BytesColumn);
73
74impl fmt::Debug for StrColumn {
75    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
76        write!(f, "{:?}", self.term_ord_column)
77    }
78}
79
80impl From<StrColumn> for BytesColumn {
81    fn from(str_column: StrColumn) -> BytesColumn {
82        str_column.0
83    }
84}
85
86impl StrColumn {
87    pub fn wrap(bytes_column: BytesColumn) -> StrColumn {
88        StrColumn(bytes_column)
89    }
90
91    pub fn dictionary(&self) -> &Dictionary<VoidSSTable> {
92        self.0.dictionary.as_ref()
93    }
94
95    /// Fills the buffer
96    pub fn ord_to_str(&self, term_ord: u64, output: &mut String) -> io::Result<bool> {
97        unsafe {
98            let buf = output.as_mut_vec();
99            if !self.0.dictionary.ord_to_term(term_ord, buf)? {
100                return Ok(false);
101            }
102            // TODO consider remove checks if it hurts performance.
103            if std::str::from_utf8(buf.as_slice()).is_err() {
104                buf.clear();
105                return Err(io::Error::new(
106                    io::ErrorKind::InvalidData,
107                    "Not valid utf-8",
108                ));
109            }
110        }
111        Ok(true)
112    }
113}
114
115impl Deref for StrColumn {
116    type Target = BytesColumn;
117
118    fn deref(&self) -> &Self::Target {
119        &self.0
120    }
121}