textdb/
lib.rs

1#![doc = include_str!("../README.md")]
2
3pub mod maps;
4pub mod accessor;
5
6use accessor::{Accessor, TsvText};
7use maps::{MemoryMap, SafeMemoryMap};
8
9
10
11/// A table of a memory mapped text database.
12pub struct Table<Map: MemoryMap, Access : Accessor> {
13    accessor: Access,
14    map: Map,
15}
16
17/// A line from a memory mapped text database.
18pub struct Line<'a, Map: MemoryMap, Access : Accessor> {
19    textdb: &'a Table<Map, Access>,
20    line: &'a [u8],
21}
22
23impl Table<SafeMemoryMap, TsvText> {
24    // Make a table from an owned string.
25    pub fn text_tsv_from_string(text: String) -> Self {
26        let accessor = accessor::TsvText::<0>::default();
27        let map = maps::SafeMemoryMap::from_string(text);
28        Table::new(map, accessor)
29    }
30
31    // Make a table from a string reference.
32    pub fn text_tsv_from_str<S : AsRef<str>>(text: S) -> Self {
33        let accessor = accessor::TsvText::<0>::default();
34        let map = maps::SafeMemoryMap::from_str(text.as_ref());
35        Table::new(map, accessor)
36    }
37}
38
39
40impl<Access : Accessor, Map: MemoryMap> Table<Map, Access> {
41    /// Make a new memory mapped text database.
42    pub fn new(map: Map, accessor: Access) -> Self {
43        Self {
44            map,
45            accessor
46        }
47    }
48
49    /// Return true if the database is sorted.
50    /// Note: On large files (> 1TB) this may take some time to run.
51    pub fn is_sorted(&self) -> anyhow::Result<bool> {
52        let bytes = self.map.bytes();
53        let mut iter = bytes.split(|b| *b == b'\n');
54        let mut prev_line = iter.next().unwrap_or_default();
55        for line in iter {
56            if self.accessor.compare_lines(prev_line, line) == std::cmp::Ordering::Greater {
57                return Ok(false);
58            }
59            prev_line = line;
60        }
61        Ok(true)
62    }
63
64    /// Get all the keys as strings.
65    /// Note: On large files (> 1TB) this may take some time to run.
66    pub fn keys(&self) -> impl Iterator<Item=Result<&str, std::str::Utf8Error>> {
67        self.map.bytes().split(|b| *b == b'\n').map(|line| {
68            std::str::from_utf8(self.accessor.key(line))
69        })
70    }
71
72    /// Get one column as strings.
73    pub fn cols(&self, i: usize) -> impl Iterator<Item=Result<&str, std::str::Utf8Error>> {
74        self.map.bytes().split(|b| *b == b'\n').map(move |line| {
75            std::str::from_utf8(self.accessor.col(line, i))
76        })
77    }
78
79    /// Get a whole line between min and max which contains pos.
80    fn find_line_at(bytes: &[u8], min: usize, max: usize, pos: usize) -> (usize, usize, &[u8]) {
81        let start = bytes[min..pos].iter().rposition(|b| *b == b'\n').map(|p| min + p + 1).unwrap_or(min);
82        let end = bytes[pos..max].iter().position(|b| *b == b'\n').map(|p| pos + p + 1).unwrap_or(max);
83        assert!(start >= min);
84        assert!(end <= max);
85        assert!(end >= start);
86
87        // Trim the newline.
88        let line_end = if end != 0 && bytes[end-1] == b'\n' { end-1 } else { end };
89        let line = &bytes[start..line_end];
90        (start, end, line)
91    }
92
93    /// Return an iterator over all matching lines for a certain key.
94    pub fn get_matching_lines(&self, key: &Access::KeyType) -> impl Iterator<Item=Line<Map, Access>> {
95        let bytes = self.map.bytes();
96
97        // Always the start of a line.
98        let mut min = 0;
99
100        // Always the end of a line (not counting the newline).
101        let mut max = bytes.len();
102        loop {
103            let mid = min + (max - min) / 2;
104            let (start, end, line) = Self::find_line_at(bytes, min, max, mid);
105
106            #[cfg(test)]
107            {
108                let cmp = self.accessor.compare_key(line, key);
109                let range = std::str::from_utf8(&bytes[min..max]).unwrap();
110                println!("min={min} mid={mid} max={max} line ={:?} cmp={cmp:?} r={range:?}", std::str::from_utf8(line).unwrap());
111            }
112
113            match self.accessor.compare_key(line, key) {
114                // line < key: 
115                std::cmp::Ordering::Less => {
116                    // Ensure forward progress by moving min up one line.
117                    assert!(min != end);
118                    min = end;
119                }
120                std::cmp::Ordering::Equal => {
121                    let (_start, end, line) = Self::find_line_at(bytes, min, max, min);
122                    #[cfg(test)]
123                    {
124                        assert_eq!(start, min);
125                        let range = std::str::from_utf8(&bytes[min..max]).unwrap();
126                        let cmp = self.accessor.compare_key(line, key);
127                        println!("=min min={min} mid={mid} max={max} line ={:?} cmp={cmp:?} r={range:?}", std::str::from_utf8(line).unwrap());
128                    }
129                    let mut min_is_equal = false;
130                    match self.accessor.compare_key(line, key) {
131                        std::cmp::Ordering::Less => {
132                            assert!(min != end);
133                            min = end;
134                        }
135                        std::cmp::Ordering::Equal => {
136                            min_is_equal = true;
137                        }
138                        std::cmp::Ordering::Greater => {
139                            // Not sorted!
140                            max = min;
141                            break;
142                        }
143                    }
144
145                    let (start, _end, line) = Self::find_line_at(bytes, min, max, max-1);
146                    #[cfg(test)]
147                    {
148                        assert_eq!(_end, max);
149                        let range = std::str::from_utf8(&bytes[min..max]).unwrap();
150                        let cmp = self.accessor.compare_key(line, key);
151                        println!("=max min={min} mid={mid} max={max} line ={:?} cmp={cmp:?} r={range:?}", std::str::from_utf8(line).unwrap());
152                    }
153                    match self.accessor.compare_key(line, key) {
154                        std::cmp::Ordering::Less => {
155                            // Not sorted!
156                            max = min;
157                            break;
158                        }
159                        std::cmp::Ordering::Equal => {
160                            if min_is_equal {
161                                // Sucess, both min and max are equal.
162                                // Trim the range.
163                                max = if max != 0 && bytes[max-1] == b'\n' { max-1 } else { max };
164                                break;
165                            }
166                        }
167                        std::cmp::Ordering::Greater => {
168                            // Ensure forward progress by moving max down one.
169                            assert!(max != start);
170                            max = start;
171                        }
172                    }
173                }
174                std::cmp::Ordering::Greater => {
175                    assert!(max != start);
176                    max = start;
177                }
178            }
179        }
180
181        bytes[min..max].split(|b| *b == b'\n').map(|line| {
182            Line {
183                textdb: self,
184                line,
185            }
186        })
187    }
188}
189
190impl<'a, Access : Accessor, Map: MemoryMap> Line<'a, Map, Access> {
191    /// Get the key of this line as a string.
192    pub fn key(&self) -> Result<&str, std::str::Utf8Error> {
193        std::str::from_utf8(self.textdb.accessor.key(self.line))
194    }
195
196    /// Get a column of this line as a string.
197    pub fn col(&self, i: usize) -> Result<&str, std::str::Utf8Error> {
198        std::str::from_utf8(self.textdb.accessor.col(self.line, i))
199    }
200
201    pub fn line(&self) ->  Result<&str, std::str::Utf8Error> {
202        std::str::from_utf8(self.line)
203    }
204}
205
206