sqlrite-engine 0.5.0

Light version of SQLite developed with Rust. Published as `sqlrite-engine` on crates.io; import as `use sqlrite::…`.
Documentation
//! Schema introspection — turn an open `Connection` (or raw
//! `Database`) into the textual schema dump the LLM uses to ground
//! its SQL generation.
//!
//! ## Why we don't `SELECT FROM sqlrite_master`
//!
//! The `sqlrite_master` catalog persists the original `CREATE TABLE`
//! SQL, so reflecting via SQL would work. But we already have the
//! same information typed in `Database.tables` — going through SQL
//! would mean parsing the round-tripped CREATE statement a second
//! time. Walking the typed structure is cheaper and matches what
//! the engine considers authoritative *right now* (relevant inside
//! an open transaction, where the persisted catalog may already be
//! stale relative to the in-memory snapshot).
//!
//! ## Determinism (matters for prompt caching)
//!
//! Tables are dumped in alphabetical order, columns in declaration
//! order. The output is byte-stable for a fixed schema — that's
//! what lets `cache_control: ephemeral` actually hit on repeat
//! calls. Any change to the format invalidates the cache once for
//! everyone, but steady-state hits are cheap.

use std::fmt::Write;

use crate::Connection;
use crate::sql::db::database::Database;
use crate::sql::db::table::{DataType, Table};

/// Render the schema of every user-visible table on `conn` as a
/// sequence of `CREATE TABLE … (…);` statements, sorted
/// alphabetically by table name.
pub fn dump_schema_for_connection(conn: &Connection) -> String {
    dump_schema_for_database(conn.database())
}

/// Same as [`dump_schema_for_connection`], but takes a `Database`
/// reference directly. Used by the REPL's `.ask` meta-command
/// (which holds `&Database`, not `&Connection`).
pub fn dump_schema_for_database(db: &Database) -> String {
    let mut names: Vec<&str> = db
        .tables
        .keys()
        .filter(|k| k.as_str() != "sqlrite_master")
        .map(String::as_str)
        .collect();
    names.sort_unstable();

    let mut out = String::new();
    for (i, name) in names.iter().enumerate() {
        if i > 0 {
            out.push('\n');
        }
        let table = match db.tables.get(*name) {
            Some(t) => t,
            None => continue,
        };
        format_create_table(table, &mut out);
    }
    out
}

/// Format one `Table` as a single `CREATE TABLE` statement.
///
/// Secondary indexes are deliberately not included — they're a
/// `WHERE col = literal` performance concern, not something the LLM
/// needs to reason about when choosing what columns to project or
/// filter on. (HNSW indexes affect what queries are even *possible*
/// efficiently. A future follow-up can add an `[indexed: hnsw,
/// M=16, ef=200]` annotation on indexed vector columns once the
/// prompt budget can absorb it.)
fn format_create_table(table: &Table, out: &mut String) {
    let _ = writeln!(out, "CREATE TABLE {} (", table.tb_name);
    for (i, col) in table.columns.iter().enumerate() {
        let datatype = render_datatype(&col.datatype);
        let mut clauses: Vec<&'static str> = Vec::new();
        if col.is_pk {
            clauses.push("PRIMARY KEY");
        }
        if col.is_unique && !col.is_pk {
            // PRIMARY KEY already implies UNIQUE; SQLite's reflection
            // doesn't double-print it and neither do we.
            clauses.push("UNIQUE");
        }
        if col.not_null && !col.is_pk {
            clauses.push("NOT NULL");
        }

        let trailing = if i + 1 == table.columns.len() {
            ""
        } else {
            ","
        };
        if clauses.is_empty() {
            let _ = writeln!(out, "  {} {}{}", col.column_name, datatype, trailing);
        } else {
            let _ = writeln!(
                out,
                "  {} {} {}{}",
                col.column_name,
                datatype,
                clauses.join(" "),
                trailing
            );
        }
    }
    out.push_str(");\n");
}

fn render_datatype(dt: &DataType) -> String {
    // Match the canonical SQL the parser accepts on the way in (so
    // round-trip is `dump → CREATE TABLE` → identical schema). The
    // engine's internal `Display` impl is debug-leaning ("Boolean",
    // "Vector(384)") — different enough from the on-the-wire form
    // that we render explicitly here.
    match dt {
        DataType::Integer => "INTEGER".to_string(),
        DataType::Text => "TEXT".to_string(),
        DataType::Real => "REAL".to_string(),
        DataType::Bool => "BOOLEAN".to_string(),
        DataType::Vector(dim) => format!("VECTOR({dim})"),
        DataType::Json => "JSON".to_string(),
        DataType::None => "TEXT".to_string(),
        // Invalid columns shouldn't reach a schema dump in practice
        // (the parser rejects them at CREATE time), but if one slips
        // through we render it as TEXT rather than panicking — the
        // LLM will at worst suggest a TEXT-shaped query.
        DataType::Invalid => "TEXT".to_string(),
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::Connection;

    fn open() -> Connection {
        Connection::open_in_memory().expect("open in-memory db")
    }

    #[test]
    fn empty_schema_returns_empty_string() {
        let conn = open();
        assert_eq!(dump_schema_for_connection(&conn), "");
    }

    #[test]
    fn single_table_round_trips() {
        let mut conn = open();
        conn.execute(
            "CREATE TABLE users (id INTEGER PRIMARY KEY, name TEXT NOT NULL, email TEXT UNIQUE)",
        )
        .unwrap();
        let dump = dump_schema_for_connection(&conn);
        assert!(dump.contains("CREATE TABLE users ("), "got: {dump}");
        assert!(dump.contains("id INTEGER PRIMARY KEY"));
        assert!(dump.contains("name TEXT NOT NULL"));
        assert!(dump.contains("email TEXT UNIQUE"));
    }

    #[test]
    fn vector_and_json_columns_render_with_canonical_keywords() {
        let mut conn = open();
        conn.execute(
            "CREATE TABLE docs (id INTEGER PRIMARY KEY, embedding VECTOR(384), payload JSON)",
        )
        .unwrap();
        let dump = dump_schema_for_connection(&conn);
        assert!(dump.contains("embedding VECTOR(384)"), "got: {dump}");
        assert!(dump.contains("payload JSON"), "got: {dump}");
    }

    #[test]
    fn tables_emitted_in_alphabetical_order() {
        let mut conn = open();
        conn.execute("CREATE TABLE zebra (id INTEGER PRIMARY KEY)")
            .unwrap();
        conn.execute("CREATE TABLE alpha (id INTEGER PRIMARY KEY)")
            .unwrap();
        conn.execute("CREATE TABLE mango (id INTEGER PRIMARY KEY)")
            .unwrap();
        let dump = dump_schema_for_connection(&conn);
        let alpha = dump.find("CREATE TABLE alpha").unwrap();
        let mango = dump.find("CREATE TABLE mango").unwrap();
        let zebra = dump.find("CREATE TABLE zebra").unwrap();
        assert!(
            alpha < mango && mango < zebra,
            "non-deterministic order: {dump}"
        );
    }

    #[test]
    fn sqlrite_master_is_excluded() {
        let mut conn = open();
        conn.execute("CREATE TABLE users (id INTEGER PRIMARY KEY)")
            .unwrap();
        let dump = dump_schema_for_connection(&conn);
        assert!(
            !dump.contains("sqlrite_master"),
            "internal catalog leaked: {dump}"
        );
    }

    #[test]
    fn dump_is_byte_stable_across_calls() {
        // The whole point of putting the schema dump behind a
        // `cache_control: ephemeral` breakpoint is that this is
        // true. If sort order ever becomes non-deterministic
        // (e.g. by walking the HashMap directly), prompt caching
        // silently degrades.
        let mut conn = open();
        conn.execute("CREATE TABLE a (id INTEGER PRIMARY KEY, x TEXT)")
            .unwrap();
        conn.execute("CREATE TABLE b (id INTEGER PRIMARY KEY, y REAL)")
            .unwrap();
        conn.execute("CREATE TABLE c (id INTEGER PRIMARY KEY, z BOOLEAN)")
            .unwrap();
        let first = dump_schema_for_connection(&conn);
        for _ in 0..20 {
            assert_eq!(dump_schema_for_connection(&conn), first);
        }
    }
}