Skip to main content

sqlite_graphrag/commands/
init.rs

1//! Handler for the `init` CLI subcommand.
2
3use crate::errors::AppError;
4use crate::output;
5use crate::paths::AppPaths;
6use crate::pragmas::{apply_init_pragmas, ensure_wal_mode};
7use crate::storage::connection::open_rw;
8use serde::Serialize;
9
/// Embedding model choices exposed through `--model`.
///
/// Legacy flag kept for CLI compatibility only: since v1.0.76 the build is
/// LLM-only and no local model is downloaded. The value is accepted and
/// ignored; `schema_meta.model` records the CLI version (G46).
// The `run` handler in this file never reads `InitArgs::model`; the `model`
// row it writes comes from `SQLITE_GRAPHRAG_VERSION` instead.
#[derive(Copy, Clone, Debug, PartialEq, Eq, clap::ValueEnum)]
pub enum EmbeddingModelChoice {
    // Only accepted value; the explicit `name` pins its command-line spelling
    // to `multilingual-e5-small`.
    #[value(name = "multilingual-e5-small")]
    MultilingualE5Small,
}
20
// Command-line arguments of the `init` subcommand, parsed by clap.
//
// The `///` comments on the fields and the `after_long_help` text are kept
// verbatim on purpose: clap derive turns them into `--help` output, so
// rewording them changes what users see.
#[derive(clap::Args)]
#[command(after_long_help = "EXAMPLES:\n  \
    # Initialize a new database in the current directory\n  \
    sqlite-graphrag init\n\n  \
    # Initialize with a specific namespace\n  \
    sqlite-graphrag init --namespace my-project\n\n  \
    # Initialize at a custom database path\n  \
    sqlite-graphrag init --db /path/to/graphrag.sqlite")]
pub struct InitArgs {
    /// Path to graphrag.sqlite. Defaults to `./graphrag.sqlite` in the current directory.
    /// Resolution precedence (highest to lowest): `--db` flag > `SQLITE_GRAPHRAG_DB_PATH` env >
    /// `SQLITE_GRAPHRAG_HOME` env (used as base directory) > cwd.
    // `run` forwards this value to `AppPaths::resolve`, which performs the
    // actual path resolution.
    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
    pub db: Option<String>,
    /// Legacy embedding model identifier (accepted and ignored since the
    /// v1.0.76 LLM-only build; kept for CLI compatibility). Safe to omit.
    // Not read by `run` in this file.
    #[arg(long, value_enum)]
    pub model: Option<EmbeddingModelChoice>,
    /// Force re-initialization, overwriting any existing schema metadata.
    /// Use only when the schema is corrupted; loses configuration but preserves data.
    // NOTE(review): `run` in this file never reads `force`. Confirm whether it
    // is handled elsewhere or whether the help text above is stale.
    #[arg(long)]
    pub force: bool,
    /// Initial namespace to resolve. Aligned with bilingual docs that mention `init --namespace`.
    /// When provided, overrides `SQLITE_GRAPHRAG_NAMESPACE`; otherwise resolves via env or fallback `global`.
    // `run` forwards this value to `crate::namespace::resolve_namespace`.
    #[arg(long)]
    pub namespace: Option<String>,
    // Hidden from `--help` (`hide = true`) and not read by `run` in this file.
    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
    pub json: bool,
}
50
/// JSON payload that `run` hands to `output::emit_json` once `init` finishes.
#[derive(Serialize)]
struct InitResponse {
    /// Resolved database path, rendered with `Path::display`.
    db_path: String,
    /// Latest applied migration number from `refinery_schema_history`.
    /// Emitted as a JSON number for cross-command consistency with `health` and `stats` (since v1.0.35).
    schema_version: u32,
    /// Set by `run` to `SQLITE_GRAPHRAG_VERSION`, the same value it stores
    /// under the `model` key of `schema_meta`.
    model: String,
    /// Length of the smoke-test embedding when the backend answered, otherwise
    /// the value of `crate::constants::embedding_dim()`.
    dim: usize,
    /// Active namespace resolved during initialisation, aligned with the bilingual docs.
    namespace: String,
    /// `"ok"` when the embedding smoke test succeeded, `"ok_no_embedding"`
    /// when it failed with a non-validation error and init carried on.
    status: String,
    /// Total execution time in milliseconds from handler start to serialisation.
    elapsed_ms: u64,
}
65
66pub fn run(
67    args: InitArgs,
68    llm_backend: crate::cli::LlmBackendChoice,
69    embedding_backend: crate::cli::EmbeddingBackendChoice,
70) -> Result<(), AppError> {
71    let start = std::time::Instant::now();
72    let paths = AppPaths::resolve(args.db.as_deref())?;
73    paths.ensure_dirs()?;
74
75    let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
76
77    let mut conn = open_rw(&paths.db)?;
78
79    apply_init_pragmas(&conn)?;
80
81    crate::migrations::runner()
82        .run(&mut conn)
83        .map_err(|e| AppError::Internal(anyhow::anyhow!("migration failed: {e}")))?;
84
85    conn.execute_batch(&format!(
86        "PRAGMA user_version = {};",
87        crate::constants::SCHEMA_USER_VERSION
88    ))?;
89
90    // Defensive re-assertion: refinery may revert journal_mode during migrations.
91    ensure_wal_mode(&conn)?;
92
93    let schema_version = latest_schema_version(&conn)?;
94
95    conn.execute(
96        "INSERT OR REPLACE INTO schema_meta (key, value) VALUES ('schema_version', ?1)",
97        rusqlite::params![schema_version],
98    )?;
99    conn.execute(
100        "INSERT OR REPLACE INTO schema_meta (key, value) VALUES ('model', ?1)",
101        rusqlite::params![crate::constants::SQLITE_GRAPHRAG_VERSION],
102    )?;
103    // G43: pre-v1.0.79 this hardcoded '384', stamping NEW databases with a
104    // dimensionality that contradicts the active default (64 since G42/S1).
105    // INSERT OR IGNORE preserves the recorded dim on re-init of an existing
106    // database; the active dim (env > database > default) fills new ones.
107    conn.execute(
108        "INSERT OR IGNORE INTO schema_meta (key, value) VALUES ('dim', ?1)",
109        rusqlite::params![crate::constants::embedding_dim().to_string()],
110    )?;
111    conn.execute(
112        "INSERT OR REPLACE INTO schema_meta (key, value) VALUES ('created_at', CAST(unixepoch() AS TEXT))",
113        [],
114    )?;
115    conn.execute(
116        "INSERT OR REPLACE INTO schema_meta (key, value) VALUES ('sqlite-graphrag_version', ?1)",
117        rusqlite::params![crate::constants::SQLITE_GRAPHRAG_VERSION],
118    )?;
119    // Persist the resolved namespace so downstream tools can inspect it without re-resolving.
120    conn.execute(
121        "INSERT OR REPLACE INTO schema_meta (key, value) VALUES ('namespace_initial', ?1)",
122        rusqlite::params![namespace],
123    )?;
124
125    output::emit_progress_i18n(
126        "Validating embedding backend...",
127        "Validando backend de embedding...",
128    );
129
130    // GAP-INIT-EMBEDDING-001 FIX (v1.0.89): init must succeed without LLM.
131    // Schema, tables and FTS5 are created above; the smoke test only validates
132    // that the embedding subprocess is reachable. When it is not (OAuth expired,
133    // CLI missing), init still succeeds with dim from the database or default.
134    // ADR-0011: Validation errors (OAuth-only enforcement) are FATAL — propagate.
135    // v1.0.89 (GAP-EMBED-PROPAGATION): honour --llm-backend via embed_passage_with_choice.
136    let (dim, status) = match crate::embedder::embed_passage_with_embedding_choice(
137        &paths.models,
138        "smoke test",
139        embedding_backend,
140        llm_backend,
141    ) {
142        Ok((v, _backend)) => (v.len(), "ok"),
143        Err(crate::errors::AppError::Validation(msg)) => {
144            return Err(crate::errors::AppError::Validation(msg))
145        }
146        Err(e) => {
147            tracing::warn!(target: "init", error = %e, "embedding smoke test failed; init continues without LLM validation");
148            (crate::constants::embedding_dim(), "ok_no_embedding")
149        }
150    };
151
152    output::emit_json(&InitResponse {
153        db_path: paths.db.display().to_string(),
154        schema_version,
155        model: crate::constants::SQLITE_GRAPHRAG_VERSION.to_string(),
156        dim,
157        namespace,
158        status: status.to_string(),
159        elapsed_ms: start.elapsed().as_millis() as u64,
160    })?;
161
162    Ok(())
163}
164
165fn latest_schema_version(conn: &rusqlite::Connection) -> Result<u32, AppError> {
166    match conn.query_row(
167        "SELECT version FROM refinery_schema_history ORDER BY version DESC LIMIT 1",
168        [],
169        |row| row.get::<_, i64>(0),
170    ) {
171        Ok(version) => Ok(version.max(0) as u32),
172        Err(rusqlite::Error::QueryReturnedNoRows) => Ok(0),
173        Err(err) => Err(AppError::Database(err)),
174    }
175}
176
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an `InitResponse` with fixed values for every field the tests do
    /// not vary. `dim` is an arbitrary number here: serialization does not
    /// depend on the active default dimensionality.
    fn sample_response(db_path: &str, namespace: &str, elapsed_ms: u64) -> InitResponse {
        InitResponse {
            db_path: db_path.to_string(),
            schema_version: 6,
            model: crate::constants::SQLITE_GRAPHRAG_VERSION.to_string(),
            dim: 384,
            namespace: namespace.to_string(),
            status: "ok".to_string(),
            elapsed_ms,
        }
    }

    /// Opens an in-memory database containing a `refinery_schema_history`
    /// table populated with the given versions, in the given order.
    fn history_db(versions: &[i64]) -> rusqlite::Connection {
        let conn = rusqlite::Connection::open_in_memory().expect("failed to open in-memory db");
        conn.execute_batch("CREATE TABLE refinery_schema_history (version INTEGER NOT NULL);")
            .expect("failed to create table");
        for &version in versions {
            conn.execute(
                "INSERT INTO refinery_schema_history VALUES (?1)",
                rusqlite::params![version],
            )
            .expect("failed to populate table");
        }
        conn
    }

    #[test]
    fn init_response_serializes_all_fields() {
        let resp = sample_response("/tmp/test.sqlite", "global", 100);
        let json = serde_json::to_value(&resp).expect("serialization failed");
        assert_eq!(json["db_path"], "/tmp/test.sqlite");
        assert_eq!(json["schema_version"], 6);
        assert_eq!(json["model"], crate::constants::SQLITE_GRAPHRAG_VERSION);
        assert_eq!(json["dim"], 384usize);
        assert_eq!(json["namespace"], "global");
        assert_eq!(json["status"], "ok");
        assert!(json["elapsed_ms"].is_number());
    }

    #[test]
    fn latest_schema_version_returns_zero_for_empty_db() {
        let conn = history_db(&[]);

        let version = latest_schema_version(&conn).expect("latest_schema_version failed");
        assert_eq!(version, 0u32, "empty db must return schema_version 0");
    }

    #[test]
    fn latest_schema_version_returns_max_version() {
        // Inserted out of order to prove the query sorts by version.
        let conn = history_db(&[1, 3, 2]);

        let version = latest_schema_version(&conn).expect("latest_schema_version failed");
        assert_eq!(version, 3u32, "must return the highest version present");
    }

    #[test]
    fn latest_schema_version_clamps_negative_to_zero() {
        let conn = history_db(&[-5]);

        let version = latest_schema_version(&conn).expect("latest_schema_version failed");
        assert_eq!(version, 0u32, "negative versions must be reported as 0");
    }

    #[test]
    fn init_default_dim_is_64() {
        // G42/S1 (v1.0.79): the default dimensionality dropped from 384
        // to 64 (MRL, arXiv 2205.13147). The active dim may differ when
        // an env override or an existing database sets it.
        assert_eq!(
            crate::constants::DEFAULT_EMBEDDING_DIM,
            64,
            "default dim must be 64 in the LLM-only build"
        );
    }

    #[test]
    fn init_response_namespace_aligned_with_schema() {
        // Verify the namespace field is serialized under the `namespace` key
        // with the value it was given.
        let resp = sample_response("/tmp/x.sqlite", "my-project", 0);
        let json = serde_json::to_value(&resp).expect("serialization failed");
        assert_eq!(json["namespace"], "my-project");
    }
}