sqlite_decoder/
btree.rs

1//! https://www.sqlite.org/fileformat.html
2
3use crate::util;
4use crate::IResult;
5use crate::ParserError;
6use nom::bytes::complete::take;
7use sqlite_types::TextEncoding;
8
9type BoxError = Box<dyn std::error::Error>;
10
11#[derive(Debug)]
12pub enum Record {
13    Null,
14    Int8(i8),
15    Int16(i16),
16    Blob(Vec<u8>),
17    Text(String),
18}
19
20impl Record {
21    pub fn as_string(&self) -> String {
22        match self {
23            Self::Text(v) => v.clone(),
24            Self::Null => "NULL".to_owned(),
25            v => unreachable!("expected string, given {:?}", v),
26        }
27    }
28
29    pub fn as_int(&self) -> usize {
30        match self {
31            Self::Int8(v) => *v as usize,
32            Self::Int16(v) => *v as usize,
33            _ => unreachable!(),
34        }
35    }
36}
37
38#[derive(Clone)]
39pub struct InputContext<'a> {
40    pub(crate) input: &'a [u8],
41    pub(crate) original_input: Vec<u8>,
42}
43
44#[derive(Debug)]
45pub enum Cell {
46    // FIXME: Remove BTree in those names
47    TableBTreeLeafCell(TableBTreeLeafCell),
48    TableBTreeInteriorCell(TableBTreeInteriorCell),
49    IndexBTreeLeafCell(IndexBTreeLeafCell),
50    IndexBTreeInteriorCell(IndexBTreeInteriorCell),
51}
52
53#[derive(Debug)]
54pub struct TableBTreeLeafCell {
55    /// A varint which is the integer key, a.k.a. "rowid"
56    rowid: u64,
57    pub records: Vec<Record>,
58    page_first_overflow: Option<u32>,
59}
60
61#[derive(Debug)]
62pub struct TableBTreeInteriorCell {
63    pub left_child_page: u32,
64    pub rowid: u64,
65}
66
67#[derive(Debug)]
68pub struct IndexBTreeLeafCell {}
69
70#[derive(Debug)]
71pub struct IndexBTreeInteriorCell {}
72
73#[derive(Debug)]
74pub struct BtreeHeader {
75    page_type: PageType,
76    start_first_freeblock: u16,
77    cell_count: u16,
78    start_cell_content_area: u16,
79    fragmented_free_bytes_count: u8,
80    right_most_pointer: Option<u32>,
81}
82
83#[derive(Debug)]
84pub struct Btree {
85    pub header: BtreeHeader,
86    pub cells: Vec<Cell>,
87}
88
89#[derive(Debug)]
90enum PageContent {
91    Index,
92    Table,
93}
94
95#[derive(Debug)]
96enum PageType {
97    Leaf(PageContent),
98    Interior(PageContent),
99}
100impl PageType {
101    fn header_size(&self) -> u8 {
102        use PageType::*;
103        match self {
104            Leaf(_) => 8,
105            Interior(_) => 12,
106        }
107    }
108
109    fn is_interior(&self) -> bool {
110        matches!(self, PageType::Interior(_))
111    }
112}
113
114fn decode_page_type<'a>(input: InputContext<'a>) -> IResult<InputContext<'a>, PageType> {
115    let (input, byte) = input.read_u8()?;
116    let t = match byte {
117        0x02 => PageType::Interior(PageContent::Index),
118        0x05 => PageType::Interior(PageContent::Table),
119        0x0a => PageType::Leaf(PageContent::Index),
120        0x0d => PageType::Leaf(PageContent::Table),
121        e => {
122            return Err(nom::Err::Failure(ParserError(format!(
123                "unsupported page type: {}",
124                e
125            ))))
126        }
127    };
128    Ok((input, t))
129}
130
131fn decode_header<'a>(input: InputContext<'a>) -> IResult<InputContext<'a>, BtreeHeader> {
132    let (input, page_type) = decode_page_type(input)?;
133    let (input, start_first_freeblock) = input.read_u16()?;
134    let (input, cell_count) = input.read_u16()?;
135    let (input, start_cell_content_area) = input.read_u16()?;
136    let (input, fragmented_free_bytes_count) = input.read_u8()?;
137
138    let (input, right_most_pointer) = if page_type.is_interior() {
139        let (input, right_most_pointer) = input.read_u32()?;
140        (input, Some(right_most_pointer))
141    } else {
142        (input, None)
143    };
144
145    let header = BtreeHeader {
146        page_type,
147        start_first_freeblock,
148        cell_count,
149        start_cell_content_area,
150        fragmented_free_bytes_count,
151        right_most_pointer,
152    };
153    Ok((input, header))
154}
155
156fn decode_cell_pointers<'a>(
157    header: &BtreeHeader,
158    mut input: InputContext<'a>,
159) -> IResult<InputContext<'a>, Vec<u16>> {
160    let mut cell_pointers = Vec::with_capacity(header.cell_count as usize);
161
162    for _ in 0..header.cell_count {
163        let res = input.read_u16()?;
164        input = res.0;
165        cell_pointers.push(res.1);
166    }
167
168    Ok((input, cell_pointers))
169}
170
171fn decode_cell<'a>(
172    enc: &TextEncoding,
173    parent: &PageType,
174    input: InputContext<'a>,
175) -> IResult<InputContext<'a>, Cell> {
176    let (input, cell) = match parent {
177        PageType::Leaf(PageContent::Table) => {
178            let (input, cell) = decode_table_leaf_cell(enc, input)?;
179            (input, Cell::TableBTreeLeafCell(cell))
180        }
181        PageType::Interior(PageContent::Table) => {
182            let (input, cell) = decode_table_interior_cell(input)?;
183            (input, Cell::TableBTreeInteriorCell(cell))
184        }
185        e => {
186            return Err(nom::Err::Failure(ParserError(format!(
187                "unsupported cell with parent: {:?}",
188                e
189            ))))
190        }
191    };
192
193    Ok((input, cell))
194}
195
196fn decode_table_interior_cell<'a>(
197    input: InputContext<'a>,
198) -> IResult<InputContext<'a>, TableBTreeInteriorCell> {
199    let (input, left_child_page) = input.read_u32()?;
200    let (input, rowid) = input.read_varint()?;
201
202    Ok((
203        input,
204        TableBTreeInteriorCell {
205            left_child_page,
206            rowid,
207        },
208    ))
209}
210
211fn decode_table_leaf_cell<'a>(
212    enc: &TextEncoding,
213    input: InputContext<'a>,
214) -> IResult<InputContext<'a>, TableBTreeLeafCell> {
215    let (input, total_payload_size) = input.read_varint()?;
216    let (input, rowid) = input.read_varint()?;
217    // FIXME: using the total_payload_size to read only the part that doesn't
218    // overflow?
219    let (input, raw_payload) = input.read_bytes(total_payload_size as usize)?;
220    let (_input, records) = decode_records(enc, raw_payload)?;
221
222    // FIXME: implement overflow handling. How to detect if it overflowed?
223    // FIXME: it's written in the spec
224    // let (input, page_first_overflow) = input.read_u32()?;
225    let page_first_overflow = None;
226
227    Ok((
228        input,
229        TableBTreeLeafCell {
230            rowid,
231            records,
232            page_first_overflow,
233        },
234    ))
235}
236
237fn decode_records<'a>(enc: &TextEncoding, input: &'a [u8]) -> IResult<&'a [u8], Vec<Record>> {
238    let (input, (header_size, took)) = read_varint(input)?;
239
240    // Header without the header size varint
241    let header_input = &input[..header_size as usize - took];
242    let (_input, columns) = decode_record_columns(header_input)?;
243
244    let mut input = &input[header_size as usize - took..];
245
246    let records = {
247        let mut values = Vec::with_capacity(columns.len());
248
249        for serial_type in columns {
250            let res = decode_record_value(enc, serial_type, input)?;
251            input = res.0;
252            values.push(res.1);
253        }
254
255        values
256    };
257
258    Ok((input, records))
259}
260
261fn decode_record_columns<'a>(mut input: &'a [u8]) -> IResult<&'a [u8], Vec<u64>> {
262    let mut columns = Vec::new();
263    loop {
264        let res = read_varint(input)?;
265        input = res.0;
266        columns.push(res.1 .0);
267
268        if input.is_empty() {
269            break;
270        }
271    }
272
273    Ok((input, columns.to_owned()))
274}
275
276fn decode_record_value<'a>(
277    enc: &TextEncoding,
278    serial_type: u64,
279    input: &'a [u8],
280) -> IResult<&'a [u8], Record> {
281    use Record::*;
282    let (input, serial_type) = match serial_type {
283        0 => (input, Null),
284        1 => {
285            let (input, value) = take(1usize)(input)?;
286            (input, Int8(i8::from_be_bytes(value.try_into().unwrap())))
287        }
288        2 => {
289            let (input, value) = take(2usize)(input)?;
290            (input, Int16(i16::from_be_bytes(value.try_into().unwrap())))
291        }
292        v if v > 12 && v % 2 == 0 => {
293            let size = (v - 12) / 2;
294            let (input, bytes) = take(size)(input)?;
295
296            (input, Blob(bytes.to_owned()))
297        }
298        v if v > 13 && v % 2 != 0 => {
299            let size = (v - 13) / 2;
300
301            let (input, bytes) = take(size)(input)?;
302
303            use TextEncoding::*;
304            let bytes = bytes.to_vec();
305            let value = match enc {
306                UTF8 => String::from_utf8(bytes).unwrap(),
307                UTF16le | UTF16be => unimplemented!(),
308            };
309
310            (input, Text(value))
311        }
312        e => {
313            return Err(nom::Err::Failure(ParserError(format!(
314                "unsupported serial type: {}",
315                e
316            ))))
317        }
318    };
319
320    Ok((input, serial_type))
321}
322
323/// Decode the B-Tree on the first page
324pub fn decode_first_page<'a>(enc: &'a TextEncoding, page: &'a [u8]) -> Result<Btree, BoxError> {
325    // first 100 of the first page are for the database header but preserve the
326    // original input for the absolute offset seek.
327    let input = &page[100..];
328    let input = InputContext {
329        input,
330        original_input: page.to_owned(),
331    };
332    match decode_btree(enc, input) {
333        Ok((_, btree)) => Ok(btree),
334        Err(err) => Err(format!("failed to decode: {}", err).into()),
335    }
336}
337
338/// Decode the B-Tree on a page
339pub fn decode<'a>(enc: &'a TextEncoding, input: &'a [u8]) -> Result<Btree, BoxError> {
340    let input = InputContext {
341        input,
342        original_input: input.to_owned(),
343    };
344    match decode_btree(enc, input) {
345        Ok((_, btree)) => Ok(btree),
346        Err(err) => Err(format!("failed to decode: {}", err).into()),
347    }
348}
349
350fn decode_btree<'a>(
351    enc: &TextEncoding,
352    input: InputContext<'a>,
353) -> IResult<InputContext<'a>, Btree> {
354    let (input, header) = decode_header(input)?;
355    let (input, cell_pointers) = decode_cell_pointers(&header, input)?;
356    let (input, cells) = {
357        let mut cells = Vec::with_capacity(cell_pointers.len());
358
359        let prev_input = input.clone();
360
361        for cell_pointer in cell_pointers {
362            let input = input.seek_at(cell_pointer as usize);
363
364            let res = decode_cell(enc, &header.page_type, input)?;
365            cells.push(res.1);
366        }
367
368        (prev_input, cells)
369    };
370
371    // FIXME: consume the input ??? the cell are always at the end of the page...
372
373    let btree = Btree { header, cells };
374    Ok((input, btree))
375}
376
377impl<'a> InputContext<'a> {
378    fn seek_at(&'a self, offset: usize) -> InputContext<'a> {
379        let input = &self.original_input[offset..];
380        Self {
381            input,
382            original_input: self.original_input.clone(),
383        }
384    }
385
386    fn read_u32(self) -> IResult<InputContext<'a>, u32> {
387        let (input, v) = util::read_u32(&self.input)?;
388        Ok((
389            Self {
390                input,
391                original_input: self.original_input,
392            },
393            v,
394        ))
395    }
396
397    fn read_u16(self) -> IResult<InputContext<'a>, u16> {
398        let (input, v) = util::read_u16(&self.input)?;
399        Ok((
400            Self {
401                input,
402                original_input: self.original_input,
403            },
404            v,
405        ))
406    }
407
408    fn read_u8(self) -> IResult<InputContext<'a>, u8> {
409        let (input, v) = util::read_u8(&self.input)?;
410        Ok((
411            Self {
412                input,
413                original_input: self.original_input,
414            },
415            v,
416        ))
417    }
418
419    fn read_varint(self) -> IResult<InputContext<'a>, u64> {
420        let (input, (v, _)) = read_varint(&self.input)?;
421
422        Ok((
423            Self {
424                input,
425                original_input: self.original_input,
426            },
427            v,
428        ))
429    }
430
431    fn read_bytes(self, n: usize) -> IResult<InputContext<'a>, &'a [u8]> {
432        let (input, bytes) = take(n)(self.input)?;
433
434        Ok((
435            Self {
436                input,
437                original_input: self.original_input,
438            },
439            bytes,
440        ))
441    }
442}
443
444/// Returns (value, variable size)
445fn read_varint<'a>(input: &'a [u8]) -> IResult<&'a [u8], (u64, usize)> {
446    let mut v = 0u64;
447    let mut i = 0usize;
448
449    loop {
450        if i >= 8 {
451            break;
452        }
453
454        v = (v << 7) + (input[i] & 0x7f) as u64;
455        if (input[i] & 0x80) == 0 {
456            return Ok((&input[i + 1..], (v, i + 1)));
457        }
458
459        i += 1;
460    }
461
462    v = (v << 8) + (input[i] & 0xff) as u64;
463
464    let input = &input[9..];
465    Ok((input, (v, 9)))
466}