Skip to main content

anomalyx_normalize/parsers/
sqlite.rs

1//! SQLite database parser — app/telemetry data that lives in `.db` files.
2//!
3//! The first user table (alphabetically, excluding the `sqlite_*` internals)
4//! is scanned straight into a RecordSet: `SELECT *` in rowid order, each SQLite
5//! value mapped to the closed [`Value`] set — so the full detector taxonomy
6//! applies. The database is loaded from the byte buffer via SQLite's deserialize
7//! API (no temp file). `REAL`→`Float`, `INTEGER`→`Int`, `TEXT`→`Str`, `BLOB`→hex
8//! `Str`, `NULL`/non-finite→`Null`.
9//!
10//! Detected by the `SQLite format 3\0` magic; extensions `.db`/`.sqlite`/
11//! `.sqlite3`/`.db3`. Behind the default-on `sqlite` feature (binary format).
12
13use crate::parser::{Confidence, FormatParser, MAGIC};
14use crate::table::TableBuilder;
15use ax_core::{AxError, Column, Value};
16use rusqlite::types::ValueRef;
17use rusqlite::{Connection, MAIN_DB};
18use std::collections::BTreeMap;
19use std::io::Cursor;
20
21/// The 16-byte SQLite database file header.
22const SQLITE_MAGIC: &[u8] = b"SQLite format 3\x00";
23
24#[derive(Debug, Default, Clone)]
25pub struct SqliteParser;
26
27/// Maps a SQLite cell to the closed [`Value`] set. A blob becomes its lowercase
28/// hex (deterministic; structural detectors still see the column exists).
29fn value_ref_to_value(v: ValueRef) -> Value {
30    match v {
31        ValueRef::Null => Value::Null,
32        ValueRef::Integer(i) => Value::Int(i),
33        ValueRef::Real(f) => {
34            if f.is_finite() {
35                Value::Float(f)
36            } else {
37                Value::Null
38            }
39        }
40        ValueRef::Text(bytes) => Value::Str(String::from_utf8_lossy(bytes).into_owned()),
41        ValueRef::Blob(bytes) => {
42            let hex: String = bytes.iter().map(|b| format!("{b:02x}")).collect();
43            Value::Str(hex)
44        }
45    }
46}
47
48impl SqliteParser {
49    fn err(&self, msg: impl std::fmt::Display) -> AxError {
50        AxError::Parse {
51            format: self.id().to_string(),
52            message: msg.to_string(),
53        }
54    }
55}
56
57impl FormatParser for SqliteParser {
58    fn id(&self) -> &'static str {
59        "sqlite"
60    }
61    fn extensions(&self) -> &'static [&'static str] {
62        &["db", "sqlite", "sqlite3", "db3"]
63    }
64    fn sniff(&self, bytes: &[u8]) -> Option<Confidence> {
65        bytes.starts_with(SQLITE_MAGIC).then_some(MAGIC)
66    }
67    fn parse(&self, _source: &str, bytes: &[u8]) -> Result<Vec<Column>, AxError> {
68        // A WAL-mode database sets the file header's *read version* (byte 19) to
69        // 2. We only ever receive the main-file image — the `-wal` companion does
70        // not travel in a byte stream — and SQLite refuses to open a
71        // read-version-2 image read-only, returning SQLITE_CANTOPEN. The main
72        // image of a checkpointed WAL database is itself a complete, valid
73        // database, so we reinterpret it as legacy (read version 1) on a private
74        // copy and read its checkpointed state. This is read-only: we never write
75        // these bytes back. (Byte 18, the write version, does not gate reads.)
76        let patched: Vec<u8>;
77        let data: &[u8] = if bytes.get(19) == Some(&2) {
78            let mut v = bytes.to_vec();
79            v[19] = 1;
80            patched = v;
81            &patched
82        } else {
83            bytes
84        };
85
86        let mut conn = Connection::open_in_memory().map_err(|e| self.err(e))?;
87        conn.deserialize_read_exact(MAIN_DB, Cursor::new(data), data.len(), true)
88            .map_err(|e| self.err(e))?;
89
90        // First user table (deterministic order; internal tables excluded).
91        let table: String = conn
92            .query_row(
93                "SELECT name FROM sqlite_master WHERE type = 'table' \
94                 AND name NOT LIKE 'sqlite_%' ORDER BY name LIMIT 1",
95                [],
96                |row| row.get(0),
97            )
98            .map_err(|e| self.err(format!("no readable table: {e}")))?;
99
100        let sql = format!("SELECT * FROM \"{}\"", table.replace('"', "\"\""));
101        let mut stmt = conn.prepare(&sql).map_err(|e| self.err(e))?;
102        let names: Vec<String> = stmt.column_names().iter().map(|s| s.to_string()).collect();
103
104        let mut rows = stmt.query([]).map_err(|e| self.err(e))?;
105        let mut builder = TableBuilder::new();
106        while let Some(row) = rows.next().map_err(|e| self.err(e))? {
107            let mut record: BTreeMap<String, Value> = BTreeMap::new();
108            for (i, name) in names.iter().enumerate() {
109                let cell = row.get_ref(i).map_err(|e| self.err(e))?;
110                record.insert(name.clone(), value_ref_to_value(cell));
111            }
112            builder.push_row(record);
113        }
114        Ok(builder.finish())
115    }
116}
117
118#[cfg(test)]
119mod tests {
120    use super::*;
121    use ax_core::ColType;
122
123    /// Builds an in-memory SQLite DB and serializes it to bytes (no committed
124    /// binary fixture).
125    fn build_db(setup: &str) -> Vec<u8> {
126        let conn = Connection::open_in_memory().unwrap();
127        conn.execute_batch(setup).unwrap();
128        conn.serialize(MAIN_DB).unwrap().to_vec()
129    }
130
131    const EVENTS: &str = "CREATE TABLE events (id INTEGER, host TEXT, score REAL, ok INTEGER);\
132        INSERT INTO events VALUES (1,'a',9.5,1),(2,'b',3.0,0),(3,'c',7.5,1);";
133
134    fn col<'a>(cols: &'a [Column], name: &str) -> &'a Column {
135        cols.iter()
136            .find(|c| c.name == name)
137            .unwrap_or_else(|| panic!("missing column {name}"))
138    }
139
140    #[test]
141    fn scans_a_table_into_typed_columns() {
142        let cols = SqliteParser.parse("app.db", &build_db(EVENTS)).unwrap();
143        assert_eq!(col(&cols, "id").ty, ColType::Int);
144        assert_eq!(
145            col(&cols, "id").cells,
146            vec![Value::Int(1), Value::Int(2), Value::Int(3)]
147        );
148        assert_eq!(
149            col(&cols, "host").cells,
150            vec![
151                Value::Str("a".into()),
152                Value::Str("b".into()),
153                Value::Str("c".into())
154            ]
155        );
156        assert_eq!(col(&cols, "score").numeric(), vec![9.5, 3.0, 7.5]);
157        assert_eq!(col(&cols, "ok").cells[1], Value::Int(0));
158    }
159
160    #[test]
161    fn picks_the_first_table_alphabetically() {
162        let db = build_db(
163            "CREATE TABLE zeta (z INTEGER); INSERT INTO zeta VALUES (9);\
164             CREATE TABLE alpha (a TEXT); INSERT INTO alpha VALUES ('first');",
165        );
166        let cols = SqliteParser.parse("-", &db).unwrap();
167        // `alpha` sorts before `zeta`, so its column is the one scanned.
168        assert_eq!(col(&cols, "a").cells, vec![Value::Str("first".into())]);
169        assert!(cols.iter().all(|c| c.name != "z"));
170    }
171
172    #[test]
173    fn reads_a_wal_mode_database() {
174        // Production databases (browsers, peewee/yfinance, many apps) default to
175        // WAL journal mode, whose header read-version byte (19) is 2. The
176        // main-file image alone then can't be opened read-only without its `-wal`
177        // companion — SQLite returns CANTOPEN. Flip a valid DB's read-version to
178        // 2 to simulate WAL; the parser must still read it by reinterpreting the
179        // checkpointed image as legacy (read-version 1).
180        let mut db = build_db(EVENTS);
181        assert_eq!(db[19], 1, "serialized fixture should start as legacy");
182        db[19] = 2;
183        let cols = SqliteParser.parse("app.db", &db).unwrap();
184        assert_eq!(
185            col(&cols, "id").cells,
186            vec![Value::Int(1), Value::Int(2), Value::Int(3)]
187        );
188    }
189
190    #[test]
191    fn null_and_blob_cells() {
192        let db = build_db(
193            "CREATE TABLE t (v BLOB, n INTEGER);\
194             INSERT INTO t VALUES (x'01ff', NULL);",
195        );
196        let cols = SqliteParser.parse("-", &db).unwrap();
197        assert_eq!(col(&cols, "v").cells[0], Value::Str("01ff".into())); // blob → hex
198        assert_eq!(col(&cols, "n").cells[0], Value::Null);
199    }
200
201    #[test]
202    fn value_ref_units() {
203        assert_eq!(value_ref_to_value(ValueRef::Null), Value::Null);
204        assert_eq!(value_ref_to_value(ValueRef::Integer(7)), Value::Int(7));
205        assert_eq!(value_ref_to_value(ValueRef::Real(1.5)), Value::Float(1.5));
206        assert_eq!(value_ref_to_value(ValueRef::Real(f64::NAN)), Value::Null);
207        assert_eq!(
208            value_ref_to_value(ValueRef::Text(b"hi")),
209            Value::Str("hi".into())
210        );
211        assert_eq!(
212            value_ref_to_value(ValueRef::Blob(&[0x00, 0xab])),
213            Value::Str("00ab".into())
214        );
215    }
216
217    #[test]
218    fn malformed_input_errors() {
219        assert!(matches!(
220            SqliteParser.parse("app.db", b"this is not a sqlite database"),
221            Err(AxError::Parse { .. })
222        ));
223        // Magic header but otherwise garbage.
224        let mut bad = SQLITE_MAGIC.to_vec();
225        bad.extend_from_slice(b"corrupt");
226        assert!(matches!(
227            SqliteParser.parse("app.db", &bad),
228            Err(AxError::Parse { .. })
229        ));
230    }
231
232    #[test]
233    fn database_with_no_user_tables_errors() {
234        assert!(matches!(
235            SqliteParser.parse("-", &build_db("PRAGMA user_version = 1;")),
236            Err(AxError::Parse { .. })
237        ));
238    }
239
240    #[test]
241    fn sniff_keys_on_magic() {
242        assert_eq!(SqliteParser.sniff(&build_db(EVENTS)), Some(MAGIC));
243        assert_eq!(
244            SqliteParser.sniff(b"SQLite format 3\x00........"),
245            Some(MAGIC)
246        );
247        assert_eq!(SqliteParser.sniff(b"SQLite format 3"), None); // missing the NUL
248        assert_eq!(SqliteParser.sniff(b"PK\x03\x04"), None); // zip, not sqlite
249        assert_eq!(SqliteParser.sniff(b"{\"a\":1}"), None);
250    }
251
252    #[test]
253    fn claims_sqlite_extensions() {
254        assert_eq!(
255            SqliteParser.extensions(),
256            &["db", "sqlite", "sqlite3", "db3"]
257        );
258    }
259
260    #[test]
261    fn resolves_by_extension_and_magic() {
262        let reg = crate::parser::ParserRegistry::default();
263        assert_eq!(reg.resolve("app.db", b"zz").unwrap().id(), "sqlite");
264        assert_eq!(reg.resolve("data.sqlite3", b"zz").unwrap().id(), "sqlite");
265        assert_eq!(reg.resolve("-", &build_db(EVENTS)).unwrap().id(), "sqlite");
266    }
267}