// archelon_core/cache.rs
1//! Machine-local SQLite cache for fast entry lookups.
2//!
//! The cache lives at `$XDG_CACHE_HOME/archelon/{journal_id}/cache_v{N}.db` — outside
//! the journal directory so it is never synced by git, Syncthing, or Nextcloud.
5//!
6//! # Sync strategy
7//!
8//! On each invocation, all `.md` files are stat()-ed (O(n), syscalls only).
9//! Per-file mtime comparison is used rather than a global `last_synced_at`
10//! timestamp: syncing tools such as Syncthing preserve the original mtime, so a
11//! global watermark would miss files changed or deleted on another machine.
12//!
13//! The sync:
14//! - **New / modified files** (mtime changed or path not in DB): re-parsed and upserted.
15//! - **Deleted files** (path in DB but gone from disk): removed from cache.
16//!   Handles Syncthing/Nextcloud propagated deletions transparently.
17//!
18//! Explicit deletion after `archelon entry remove` is handled by
19//! [`remove_from_cache`], which avoids a full sync round-trip in that hot path.
20//!
21//! # Schema
22//!
23//! - `files`: tracks every `.md` file ever scanned (path + mtime).  Covers both
24//!   managed entries and non-managed files (e.g. `README.md`).  A file whose mtime
25//!   is unchanged is skipped entirely on subsequent syncs — preventing repeated
26//!   parse-failure warnings for unmanaged files.
27//! - `entries`: managed-entry metadata.  `id INTEGER PRIMARY KEY` uses CarettaId as
28//!   i64.  `path` has an FK to `files(path) ON DELETE CASCADE` so removing a row
29//!   from `files` automatically removes the corresponding entry.
30//! - `tags`: many-to-many tag index for efficient tag filtering.
31//! - `entries_fts`: FTS5 virtual table (trigram tokenizer) over `title` + `body`
32//!   for full-text search. Trigram enables substring search and CJK text with no spaces.
33//!
34//! # Schema versioning
35//!
//! [`SCHEMA_VERSION`] is stored in SQLite's `PRAGMA user_version` and is also baked
//! into the cache filename (`cache_v{N}.db`), so each schema version gets its own file.
//! - **DB version = 0** (fresh file): schema is applied and version is set.
//! - **DB version = app version**: the cache is opened as-is.
//! - **Any other version** inside a versioned file indicates tampering or a bug; an
//!   error is returned instructing the user to run `archelon cache rebuild`.
41
42use std::{
43    collections::{HashMap, HashSet},
44    path::{Path, PathBuf},
45};
46
47use caretta_id::CarettaId;
48use rusqlite::{params, Connection, OptionalExtension as _};
49
50use crate::{
51    error::{Error, Result},
52    journal::{DuplicateTitlePolicy, Journal},
53    parser::{read_entry, render_entry},
54};
55
56// ── schema version ────────────────────────────────────────────────────────────
57
/// Stored in `PRAGMA user_version`.  Increment whenever the schema changes.
///
/// The value is also baked into the cache filename (see [`db_path`]), so a
/// bump automatically starts a fresh cache file without touching older ones.
pub const SCHEMA_VERSION: i32 = 2;
60
61// ── schema ────────────────────────────────────────────────────────────────────
62
/// DDL for the base cache schema, applied once to a fresh database by
/// `open_or_init`.  Every statement is `IF NOT EXISTS`, so re-applying the
/// batch against an existing database is harmless.
const SCHEMA: &str = "
CREATE TABLE IF NOT EXISTS files (
    path       TEXT    PRIMARY KEY,
    file_mtime INTEGER NOT NULL
);

CREATE TABLE IF NOT EXISTS entries (
    id              INTEGER PRIMARY KEY,
    parent_id       INTEGER REFERENCES entries(id),
    path            TEXT    NOT NULL UNIQUE REFERENCES files(path) ON DELETE CASCADE,
    title           TEXT    NOT NULL DEFAULT '',
    slug            TEXT    NOT NULL DEFAULT '',
    created_at      TEXT,
    updated_at      TEXT,
    task_status     TEXT,
    task_due        TEXT,
    task_started_at TEXT,
    task_closed_at  TEXT,
    event_start     TEXT,
    event_end       TEXT,
    body            TEXT    NOT NULL DEFAULT ''
);
CREATE INDEX IF NOT EXISTS idx_entries_parent     ON entries(parent_id);
CREATE INDEX IF NOT EXISTS idx_entries_title      ON entries(title);
CREATE INDEX IF NOT EXISTS idx_entries_created_at ON entries(created_at);
CREATE INDEX IF NOT EXISTS idx_entries_updated_at ON entries(updated_at);
CREATE INDEX IF NOT EXISTS idx_entries_task_status ON entries(task_status);
CREATE INDEX IF NOT EXISTS idx_entries_task_due   ON entries(task_due);
CREATE INDEX IF NOT EXISTS idx_entries_event_start ON entries(event_start);

CREATE TABLE IF NOT EXISTS tags (
    entry_id INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
    tag      TEXT    NOT NULL,
    PRIMARY KEY (entry_id, tag)
);
CREATE INDEX IF NOT EXISTS idx_tags_tag ON tags(tag);

CREATE VIRTUAL TABLE IF NOT EXISTS entries_fts USING fts5(
    title,
    body,
    content    = 'entries',
    content_rowid = 'id',
    tokenize   = 'trigram'
);

CREATE TABLE IF NOT EXISTS chunks (
    id          INTEGER PRIMARY KEY,
    entry_id    INTEGER NOT NULL REFERENCES entries(id) ON DELETE CASCADE,
    chunk_index INTEGER NOT NULL,
    text        TEXT    NOT NULL,
    UNIQUE (entry_id, chunk_index)
);
CREATE INDEX IF NOT EXISTS idx_chunks_entry_id ON chunks(entry_id);
";
117
118// ── public API ────────────────────────────────────────────────────────────────
119
/// Compute the path to the current-version SQLite cache file within `cache_dir`.
///
/// Returns `{cache_dir}/cache_v{SCHEMA_VERSION}.db` — the schema version is
/// baked into the filename so each version gets its own cache file.
pub fn db_path(cache_dir: &Path) -> PathBuf {
    cache_dir.join(format!("cache_v{SCHEMA_VERSION}.db"))
}
126
127/// Open (or create) the SQLite cache for `journal`.
128///
129/// - **Fresh DB** (user_version = 0): schema is applied and version is set.
130/// - **DB version = [`SCHEMA_VERSION`]**: opened as-is.
131/// - **DB version ≠ [`SCHEMA_VERSION`]**: returns [`Error::CacheSchemaTooNew`].
132///   This should not normally occur because the DB filename already encodes the
133///   version; it indicates manual tampering.  Run `archelon cache rebuild` to
134///   recreate the current-version DB.
135pub fn open_cache(journal: &Journal) -> Result<Connection> {
136    let cache_dir = journal.cache_dir()?;
137    std::fs::create_dir_all(&cache_dir)?;
138    open_or_init(&db_path(&cache_dir))
139}
140
/// Delete the current-version cache file and create a fresh one.
///
/// Only the current-version file (`cache_v{N}.db` + WAL/SHM) is removed.
/// Older versions are left untouched; use `archelon cache clean` to remove them.
/// After this call the returned connection has an empty, schema-correct DB;
/// call [`sync_cache`] to populate it.
pub fn rebuild_cache(journal: &Journal) -> Result<Connection> {
    let cache_dir = journal.cache_dir()?;
    std::fs::create_dir_all(&cache_dir)?;
    let p = db_path(&cache_dir);
    // Best-effort removal of the DB and its sidecars; errors are ignored.
    wipe_db_files(&p);
    open_or_init(&p)
}
154
/// Summary information about the current cache state.
pub struct CacheInfo {
    /// Path of the current-version SQLite cache file.
    pub db_path: PathBuf,
    /// Value of `PRAGMA user_version` in the open database.
    pub schema_version: i32,
    /// Total `.md` files tracked (managed entries + unmanaged files like README.md).
    pub file_count: u64,
    /// Rows in `entries` (managed entries only).
    pub entry_count: u64,
    /// Number of distinct tag strings across all entries.
    pub unique_tag_count: u64,
}
164
165/// Collect cache statistics for display.
166pub fn cache_info(journal: &Journal, conn: &Connection) -> Result<CacheInfo> {
167    let db_path = db_path(&journal.cache_dir()?);
168    let schema_version =
169        conn.query_row("PRAGMA user_version", [], |row| row.get::<_, i32>(0))?;
170    let file_count =
171        conn.query_row("SELECT COUNT(*) FROM files", [], |row| row.get::<_, u64>(0))?;
172    let entry_count =
173        conn.query_row("SELECT COUNT(*) FROM entries", [], |row| row.get::<_, u64>(0))?;
174    let unique_tag_count =
175        conn.query_row("SELECT COUNT(DISTINCT tag) FROM tags", [], |row| row.get::<_, u64>(0))?;
176    Ok(CacheInfo { db_path, schema_version, file_count, entry_count, unique_tag_count })
177}
178
179// ── open helpers ──────────────────────────────────────────────────────────────
180
181fn open_or_init(db_path: &Path) -> Result<Connection> {
182    let conn = Connection::open(db_path)?;
183    // WAL for better concurrency; foreign keys required for ON DELETE CASCADE.
184    conn.execute_batch("PRAGMA journal_mode=WAL; PRAGMA foreign_keys=ON;")?;
185
186    let db_version: i32 =
187        conn.query_row("PRAGMA user_version", [], |row| row.get(0))?;
188
189    if db_version == 0 {
190        // Fresh DB: apply schema and stamp the version.
191        conn.execute_batch(SCHEMA)?;
192        conn.execute_batch(&format!("PRAGMA user_version = {SCHEMA_VERSION}"))?;
193        return Ok(conn);
194    }
195
196    if db_version == SCHEMA_VERSION {
197        return Ok(conn);
198    }
199
200    // Version mismatch inside a versioned file is unexpected (indicates manual
201    // tampering or a bug).  Return an error; the user should run `cache rebuild`.
202    Err(Error::CacheSchemaTooNew {
203        db_version,
204        app_version: SCHEMA_VERSION,
205    })
206}
207
/// Remove the main DB file plus any WAL/SHM sidecar files.  Errors are ignored
/// (files may not exist or may already be gone).
fn wipe_db_files(db_path: &Path) {
    for suffix in ["", "-wal", "-shm"] {
        // Append to the raw OsStr rather than going through `to_string_lossy`:
        // lossy conversion replaces non-UTF-8 bytes, producing a path that does
        // not exist on disk, so the remove would silently miss the real file.
        let mut target = db_path.as_os_str().to_os_string();
        target.push(suffix);
        let _ = std::fs::remove_file(PathBuf::from(target));
    }
}
216
/// Incrementally sync the cache against the journal's `.md` files.
///
/// Files whose mtime changed or whose path is new are re-parsed and upserted.
/// Files present in the DB but gone from disk are removed (handles Syncthing/
/// Nextcloud deletions propagated with the original mtime).
///
/// FTS5 index is rebuilt in full only when at least one entry changed, avoiding
/// unnecessary work on invocations where nothing has changed.
///
/// NOTE(review): any `?` failure between the BEGIN and COMMIT below propagates
/// without a ROLLBACK, leaving an open transaction on `conn` — confirm that
/// callers drop the connection on error, or consider a rollback guard.
pub fn sync_cache(journal: &Journal, conn: &Connection) -> Result<()> {
    // stat() every .md file (no contents read yet) and snapshot the path set.
    let disk_files = collect_with_mtime(journal)?;
    let disk_paths: HashSet<String> = disk_files
        .iter()
        .map(|(p, _)| p.to_string_lossy().into_owned())
        .collect();

    // `files` table tracks ALL scanned .md files (managed + unmanaged).
    // Using it as the mtime store means non-managed files (e.g. README.md) whose
    // mtime hasn't changed are skipped entirely — no repeated parse-failure warn.
    let db_files = query_all_mtimes(conn)?;

    // Gates the full FTS5 rebuild at the end: set whenever a row in `entries`
    // is inserted, replaced, or deleted.
    let mut entry_changed = false;

    // Defer FK checks to commit time so children can be inserted before their
    // parents (e.g. when syncing a journal from scratch or after a Syncthing
    // propagation that delivers files out of topological order).
    conn.execute_batch("PRAGMA defer_foreign_keys=ON; BEGIN")?;

    // ── delete files removed from disk ───────────────────────────────────────
    // Process deletions first so that renamed files (old path gone, new path
    // present) are cleaned up before the upsert loop runs.  Without this
    // ordering the duplicate-ID check below would fire on the stale cache row
    // that still holds the same ID as the renamed file's new path.
    for db_path in db_files.keys() {
        if !disk_paths.contains(db_path.as_str()) {
            // Only managed-entry deletions should trigger the FTS rebuild;
            // unmanaged files have no `entries` row.
            let was_entry = conn
                .query_row(
                    "SELECT 1 FROM entries WHERE path = ?1",
                    [db_path.as_str()],
                    |_| Ok(()),
                )
                .is_ok();
            // Deleting from `files` cascades to `entries` and then `tags`.
            conn.execute("DELETE FROM files WHERE path = ?1", [db_path])?;
            if was_entry {
                entry_changed = true;
            }
        }
    }

    // ── upsert new / modified files ──────────────────────────────────────────
    for (path, mtime) in &disk_files {
        let path_str = path.to_string_lossy();
        // New path, or known path whose stored mtime differs from disk.
        let needs_update = db_files
            .get(path_str.as_ref())
            .map_or(true, |&stored| stored != *mtime);

        if needs_update {
            match read_entry(path) {
                Ok(entry) => {
                    // ── duplicate ID check ────────────────────────────────
                    // On collision, increment the ID until a free slot is
                    // found, rename the file, and rewrite the frontmatter.
                    let entry = increment_until_free(conn, entry)?;
                    // Re-stat: the collision handling above may have renamed
                    // and rewritten the file, changing both path and mtime.
                    let final_mtime = file_mtime(&entry.path)?;
                    let final_str = entry.path.to_string_lossy();
                    // Record in `files`; `entries.path` has an FK to `files.path`.
                    conn.execute(
                        "INSERT OR REPLACE INTO files (path, file_mtime) VALUES (?1, ?2)",
                        params![final_str.as_ref(), final_mtime],
                    )?;
                    upsert_entry(conn, &entry)?;
                    entry_changed = true;
                }
                Err(e) => {
                    // File changed but is not a valid entry — still track it
                    // so mtime comparison skips it on future syncs.
                    conn.execute(
                        "INSERT OR REPLACE INTO files (path, file_mtime) VALUES (?1, ?2)",
                        params![path_str.as_ref(), mtime],
                    )?;
                    // Remove any stale entry row.
                    conn.execute(
                        "DELETE FROM entries WHERE path = ?1",
                        [path_str.as_ref()],
                    )?;
                    eprintln!("warn: {}: {e}", path.display());
                }
            }
        }
    }

    // ── duplicate title check ─────────────────────────────────────────────────
    // Runs inside the transaction so it reflects all changes made above.
    let dup_policy = journal.config().unwrap_or_default().journal.duplicate_title;
    if dup_policy != DuplicateTitlePolicy::Allow {
        // Only the first duplicate is reported (LIMIT 1).
        let mut stmt = conn.prepare(
            "SELECT title FROM entries WHERE title != '' \
             GROUP BY title HAVING COUNT(*) > 1 LIMIT 1",
        )?;
        let dup: Option<String> = stmt
            .query_map([], |row| row.get::<_, String>(0))?
            .next()
            .transpose()?;

        if let Some(title) = dup {
            match dup_policy {
                DuplicateTitlePolicy::Warn => {
                    eprintln!("warn: duplicate title detected: `{title}`");
                }
                DuplicateTitlePolicy::Error => {
                    // Abandon every change made by this sync before erroring.
                    conn.execute_batch("ROLLBACK")?;
                    return Err(Error::DuplicateTitle(title));
                }
                // Excluded by the surrounding `if`.
                DuplicateTitlePolicy::Allow => unreachable!(),
            }
        }
    }

    conn.execute_batch("COMMIT")?;

    // Rebuild FTS5 only when the entries table actually changed.
    if entry_changed {
        conn.execute_batch("INSERT INTO entries_fts(entries_fts) VALUES('rebuild')")?;
    }

    Ok(())
}
344
345/// Look up an entry by its [`CarettaId`].
346///
347/// If the stored path no longer exists on disk, the stale row is removed and
348/// [`Error::EntryNotFound`] is returned.
349pub fn find_entry_by_id(conn: &Connection, id: CarettaId) -> Result<crate::entry::Entry> {
350    match fetch_full_entry(conn, id) {
351        Ok(entry) => {
352            if !entry.path.exists() {
353                conn.execute("DELETE FROM entries WHERE id = ?1", [id])?;
354                return Err(Error::EntryNotFound(id.to_string()));
355            }
356            Ok(entry)
357        }
358        Err(Error::Cache(rusqlite::Error::QueryReturnedNoRows)) => {
359            Err(Error::EntryNotFound(id.to_string()))
360        }
361        Err(e) => Err(e),
362    }
363}
364
365/// Look up an entry by its exact title.
366///
367/// Returns [`Error::AmbiguousTitle`] if more than one entry matches,
368/// and [`Error::EntryNotFoundByTitle`] if none do.
369///
370/// If the matched path no longer exists on disk the stale row is removed and
371/// [`Error::EntryNotFoundByTitle`] is returned.
372pub fn find_entry_by_title(
373    conn: &Connection,
374    title: &str,
375) -> Result<crate::entry::Entry> {
376    let mut stmt = conn.prepare("SELECT id, path FROM entries WHERE title = ?1")?;
377    let rows: Vec<(CarettaId, String)> = stmt
378        .query_map([title], |row| Ok((row.get::<_, CarettaId>(0)?, row.get::<_, String>(1)?)))?
379        .collect::<rusqlite::Result<Vec<_>>>()?;
380
381    match rows.len() {
382        0 => Err(Error::EntryNotFoundByTitle(title.to_owned())),
383        1 => {
384            let (id, path_str) = rows.into_iter().next().unwrap();
385            if !PathBuf::from(&path_str).exists() {
386                conn.execute("DELETE FROM files WHERE path = ?1", [&path_str])?;
387                return Err(Error::EntryNotFoundByTitle(title.to_owned()));
388            }
389            fetch_full_entry(conn, id)
390        }
391        n => Err(Error::AmbiguousTitle(title.to_owned(), n)),
392    }
393}
394
/// Read all entries from the cache as [`EntryHeader`] structs (no body).
///
/// Both a sync and a cache-open are expected to have been done by the caller.
/// `slug` and unknown frontmatter fields are not stored in the cache; they
/// default to `None`/empty in the returned structs.
pub fn list_entries_from_cache(conn: &Connection) -> Result<Vec<crate::entry::EntryHeader>> {
    use chrono::NaiveDateTime;
    use crate::entry::{EntryHeader, EventMetaView, FrontmatterView, TaskMetaView};

    // Fetch all tags in one query to avoid N+1 queries.
    let mut tag_map: HashMap<CarettaId, Vec<String>> = HashMap::new();
    {
        // ORDER BY entry_id, tag keeps each entry's tag Vec alphabetically sorted.
        let mut stmt = conn.prepare("SELECT entry_id, tag FROM tags ORDER BY entry_id, tag")?;
        let rows = stmt
            .query_map([], |row| {
                Ok((row.get::<_, CarettaId>(0)?, row.get::<_, String>(1)?))
            })?
            .collect::<rusqlite::Result<Vec<_>>>()?;
        for (id, tag) in rows {
            tag_map.entry(id).or_default().push(tag);
        }
    }

    // Required timestamps: an unparsable value falls back to the
    // NaiveDateTime default instead of erroring.
    let parse_dt = |s: &str| {
        NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M").unwrap_or_default()
    };
    // Optional timestamps: NULL or unparsable → None.
    let parse_dt_opt = |s: Option<String>| -> Option<NaiveDateTime> {
        s.as_deref()
            .and_then(|s| NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M").ok())
    };

    let mut stmt = conn.prepare(
        "SELECT id, parent_id, path, title, slug, created_at, updated_at,
                task_status, task_due, task_started_at, task_closed_at,
                event_start, event_end
         FROM entries ORDER BY id",
    )?;

    // One wide tuple per row; positions match the SELECT column order above.
    let rows = stmt
        .query_map([], |row| {
            Ok((
                row.get::<_, CarettaId>(0)?,
                row.get::<_, Option<CarettaId>>(1)?,
                row.get::<_, String>(2)?,
                row.get::<_, String>(3)?,
                row.get::<_, String>(4)?,
                row.get::<_, String>(5)?,
                row.get::<_, String>(6)?,
                row.get::<_, Option<String>>(7)?,
                row.get::<_, Option<String>>(8)?,
                row.get::<_, Option<String>>(9)?,
                row.get::<_, Option<String>>(10)?,
                row.get::<_, Option<String>>(11)?,
                row.get::<_, Option<String>>(12)?,
            ))
        })?
        .collect::<rusqlite::Result<Vec<_>>>()?;

    let mut result = Vec::with_capacity(rows.len());
    for (id, parent_id, path, title, slug, created_at, updated_at,
         task_status, task_due, task_started_at, task_closed_at,
         event_start, event_end) in rows
    {
        // `remove` hands ownership of the tag Vec to this entry; entries
        // without tags get an empty Vec.
        let tags = tag_map.remove(&id).unwrap_or_default();

        // A non-NULL task_status marks the entry as a task.
        let task = task_status.map(|status| TaskMetaView {
            status,
            due: parse_dt_opt(task_due),
            started_at: parse_dt_opt(task_started_at),
            closed_at: parse_dt_opt(task_closed_at),
        });

        // Events require both endpoints; a half-set pair is treated as "no event".
        let event = match (parse_dt_opt(event_start), parse_dt_opt(event_end)) {
            (Some(start), Some(end)) => Some(EventMetaView { start, end }),
            _ => None,
        };

        let frontmatter = FrontmatterView {
            id,
            parent_id,
            title,
            slug,
            tags,
            created_at: parse_dt(&created_at),
            updated_at: parse_dt(&updated_at),
            task,
            event,
        };

        // Derived flags computed by `crate::labels::entry_flags` from the
        // task/event metadata and timestamps.
        let flags = crate::labels::entry_flags(
            frontmatter.task.as_ref(),
            frontmatter.event.as_ref(),
            frontmatter.created_at,
            frontmatter.updated_at,
        );
        result.push(EntryHeader { path, frontmatter, flags });
    }

    Ok(result)
}
495
496/// Remove an entry row from the cache by file path.
497///
498/// Tags are removed automatically via `ON DELETE CASCADE`.
499/// The FTS5 index is updated incrementally (no full rebuild needed).
500/// Call this after `archelon entry remove` to keep the cache consistent.
501pub fn remove_from_cache(conn: &Connection, path: &Path) -> Result<()> {
502    let path_str = path.to_string_lossy();
503
504    // Fetch content before deletion so we can update the FTS5 index.
505    let fts_data = conn
506        .query_row(
507            "SELECT id, title, body FROM entries WHERE path = ?1",
508            [path_str.as_ref()],
509            |row| {
510                Ok((
511                    row.get::<_, CarettaId>(0)?,
512                    row.get::<_, String>(1)?,
513                    row.get::<_, String>(2)?,
514                ))
515            },
516        )
517        .ok();
518
519    // Deleting from `files` cascades to `entries` (and then to `tags`).
520    conn.execute("DELETE FROM files WHERE path = ?1", [path_str.as_ref()])?;
521
522    if let Some((id, title, body)) = fts_data {
523        // Remove the entry's tokens from the FTS5 index.
524        let _ = conn.execute(
525            "INSERT INTO entries_fts(entries_fts, rowid, title, body) \
526             VALUES('delete', ?1, ?2, ?3)",
527            params![id, title, body],
528        );
529    }
530
531    Ok(())
532}
533
534/// Upsert a single entry into the cache by re-reading its file.
535///
536/// Use this after `create_entry` or `update_entry` to keep the cache warm
537/// without a full sync round-trip.
538///
539/// If the entry's ID collides with an existing cache entry, the ID is
540/// incremented until a free slot is found, the file is renamed on disk, and
541/// the frontmatter is rewritten — all silently.
542pub fn upsert_entry_from_path(conn: &Connection, path: &Path) -> Result<()> {
543    let entry = read_entry(path)?;
544    // Resolve ID collision by incrementing until a free slot is found.
545    let entry = increment_until_free(conn, entry)?;
546    let mtime = file_mtime(&entry.path)?;
547    let path_str = entry.path.to_string_lossy();
548    // Insert into `files` first; `entries.path` has an FK to `files.path`.
549    conn.execute(
550        "INSERT OR REPLACE INTO files (path, file_mtime) VALUES (?1, ?2)",
551        params![path_str.as_ref(), mtime],
552    )?;
553    upsert_entry(conn, &entry)?;
554    conn.execute_batch("INSERT INTO entries_fts(entries_fts) VALUES('rebuild')")?;
555    Ok(())
556}
557
558// ── internals ─────────────────────────────────────────────────────────────────
559
/// Fetch a single entry (all columns + tags) from the cache by its numeric ID.
///
/// Returns `Error::Cache(QueryReturnedNoRows)` if no row exists.
///
/// `extra` frontmatter maps are not persisted in the cache, so they come back
/// empty here.
fn fetch_full_entry(
    conn: &Connection,
    id: CarettaId,
) -> Result<crate::entry::Entry> {
    use chrono::NaiveDateTime;
    use indexmap::IndexMap;
    use crate::entry::{Entry, EventMeta, Frontmatter, TaskMeta};

    // One wide tuple; positions match the SELECT column order below.
    let (parent_id, path_str, title, slug, created_at, updated_at,
         task_status, task_due, task_started_at, task_closed_at,
         event_start, event_end, body) = conn.query_row(
        "SELECT parent_id, path, title, slug, created_at, updated_at,
                task_status, task_due, task_started_at, task_closed_at,
                event_start, event_end, body
         FROM entries WHERE id = ?1",
        [id],
        |row| {
            Ok((
                row.get::<_, Option<CarettaId>>(0)?,
                row.get::<_, String>(1)?,
                row.get::<_, String>(2)?,
                row.get::<_, String>(3)?,
                row.get::<_, String>(4)?,
                row.get::<_, String>(5)?,
                row.get::<_, Option<String>>(6)?,
                row.get::<_, Option<String>>(7)?,
                row.get::<_, Option<String>>(8)?,
                row.get::<_, Option<String>>(9)?,
                row.get::<_, Option<String>>(10)?,
                row.get::<_, Option<String>>(11)?,
                row.get::<_, String>(12)?,
            ))
        },
    )?;

    let mut tags_stmt = conn.prepare("SELECT tag FROM tags WHERE entry_id = ?1 ORDER BY tag")?;
    let tags: Vec<String> = tags_stmt
        .query_map([id], |row| row.get(0))?
        .collect::<rusqlite::Result<Vec<_>>>()?;

    // Required timestamps: an unparsable value falls back to the
    // NaiveDateTime default instead of erroring.
    let parse_dt = |s: &str| {
        NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M").unwrap_or_default()
    };
    // Optional timestamps: NULL or unparsable → None.
    let parse_dt_opt = |s: Option<String>| -> Option<NaiveDateTime> {
        s.as_deref().and_then(|s| NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M").ok())
    };

    // A non-NULL task_status marks the entry as a task.
    let task = task_status.map(|status| TaskMeta {
        status,
        due: parse_dt_opt(task_due),
        started_at: parse_dt_opt(task_started_at),
        closed_at: parse_dt_opt(task_closed_at),
        extra: IndexMap::new(),
    });

    // Events require both endpoints; a half-set pair is treated as "no event".
    let event = match (parse_dt_opt(event_start), parse_dt_opt(event_end)) {
        (Some(start), Some(end)) => Some(EventMeta { start, end, extra: IndexMap::new() }),
        _ => None,
    };

    let frontmatter = Frontmatter {
        id,
        parent_id,
        title,
        slug,
        tags,
        created_at: parse_dt(&created_at),
        updated_at: parse_dt(&updated_at),
        task,
        event,
        extra: IndexMap::new(),
    };

    Ok(Entry { path: PathBuf::from(path_str), frontmatter, body })
}
638
639/// Resolves an ID collision by calling `increment()` until a free slot is
640/// found, then renames the file on disk and rewrites its frontmatter.
641/// Returns the entry with its final (non-colliding) ID and path.
642fn increment_until_free(
643    conn: &Connection,
644    mut entry: crate::entry::Entry,
645) -> Result<crate::entry::Entry> {
646    loop {
647        let path_str = entry.path.to_string_lossy();
648        let conflict: Option<String> = conn
649            .query_row(
650                "SELECT path FROM entries WHERE id = ?1 AND path != ?2",
651                params![entry.frontmatter.id, path_str.as_ref()],
652                |row| row.get(0),
653            )
654            .optional()?;
655        if conflict.is_none() {
656            break;
657        }
658        entry.frontmatter.id = entry.frontmatter.id.increment();
659        let new_name = crate::ops::entry_filename_from_frontmatter(
660            entry.frontmatter.id,
661            &entry.frontmatter,
662        );
663        let new_path = entry.path.parent().unwrap_or_else(|| Path::new(".")).join(&new_name);
664        std::fs::rename(&entry.path, &new_path)?;
665        std::fs::write(&new_path, render_entry(&entry))?;
666        entry.path = new_path;
667    }
668    Ok(entry)
669}
670
671fn collect_with_mtime(journal: &Journal) -> Result<Vec<(PathBuf, i64)>> {
672    let paths = journal.collect_entries()?;
673    let mut result = Vec::with_capacity(paths.len());
674    for path in paths {
675        let mtime = file_mtime(&path)?;
676        result.push((path, mtime));
677    }
678    Ok(result)
679}
680
681fn file_mtime(path: &Path) -> Result<i64> {
682    Ok(std::fs::metadata(path)?
683        .modified()?
684        .duration_since(std::time::UNIX_EPOCH)
685        .map(|d| d.as_secs() as i64)
686        .unwrap_or(0))
687}
688
689fn query_all_mtimes(conn: &Connection) -> Result<HashMap<String, i64>> {
690    let mut stmt = conn.prepare("SELECT path, file_mtime FROM files")?;
691    let result = stmt
692        .query_map([], |row| Ok((row.get::<_, String>(0)?, row.get::<_, i64>(1)?)))?
693        .collect::<rusqlite::Result<HashMap<_, _>>>()?;
694    Ok(result)
695}
696
697// ── sqlite-vec / vector search ────────────────────────────────────────────────
698
// Guards the process-wide auto-extension registration so it runs exactly once.
static SQLITE_VEC_INIT: std::sync::Once = std::sync::Once::new();

/// Register sqlite-vec as an SQLite auto-extension so every connection opened
/// afterwards has the `vec0` module available.  Idempotent via `Once`.
fn init_sqlite_vec() {
    SQLITE_VEC_INIT.call_once(|| {
        // SAFETY: `sqlite3_auto_extension` expects an extension entry point of
        // the C init-function signature; the transmute only adapts the pointer
        // type of `sqlite3_vec_init`, which is that entry point.
        // NOTE(review): confirm the signature matches the sqlite_vec crate
        // version in use — a mismatch here would be UB.
        unsafe {
            rusqlite::ffi::sqlite3_auto_extension(Some(std::mem::transmute(
                sqlite_vec::sqlite3_vec_init as *const (),
            )));
        }
    });
}
710
711/// Open (or create) the cache with the sqlite-vec extension loaded.
712///
713/// Behaves identically to [`open_cache`] for the base schema, then additionally
714/// ensures that the `entry_vectors` vec0 virtual table exists with the requested
715/// `embedding_dim`.
716///
717/// If a vector table with a *different* dimension is already present it is
718/// dropped and recreated — all previously stored embeddings are lost.
719pub fn open_cache_vec(journal: &Journal, embedding_dim: u32) -> Result<Connection> {
720    let cache_dir = journal.cache_dir()?;
721    std::fs::create_dir_all(&cache_dir)?;
722    init_sqlite_vec();
723    let conn = open_or_init(&db_path(&cache_dir))?;
724    ensure_vec_tables(&conn, embedding_dim)?;
725    Ok(conn)
726}
727
728fn ensure_vec_tables(conn: &Connection, dim: u32) -> Result<()> {
729    conn.execute_batch(
730        "CREATE TABLE IF NOT EXISTS vec_meta (
731            key   TEXT PRIMARY KEY,
732            value TEXT NOT NULL
733        )",
734    )?;
735
736    let stored_dim: Option<u32> = conn
737        .query_row(
738            "SELECT value FROM vec_meta WHERE key = 'embedding_dim'",
739            [],
740            |row| row.get::<_, String>(0),
741        )
742        .ok()
743        .and_then(|s| s.parse().ok());
744
745    match stored_dim {
746        None => {
747            // First initialisation: create the vector table and record the dimension.
748            conn.execute_batch(&format!(
749                "CREATE VIRTUAL TABLE chunk_vectors USING vec0(\
750                 chunk_id INTEGER PRIMARY KEY, embedding FLOAT[{dim}])",
751            ))?;
752            conn.execute(
753                "INSERT OR REPLACE INTO vec_meta (key, value) VALUES ('embedding_dim', ?1)",
754                [dim.to_string()],
755            )?;
756        }
757        Some(d) if d == dim => {
758            // Dimension unchanged — ensure the table exists in case it was manually dropped.
759            conn.execute_batch(&format!(
760                "CREATE VIRTUAL TABLE IF NOT EXISTS chunk_vectors USING vec0(\
761                 chunk_id INTEGER PRIMARY KEY, embedding FLOAT[{dim}])",
762            ))?;
763        }
764        Some(old) => {
765            // Dimension changed: rebuild the vector table (embeddings are model-specific).
766            eprintln!(
767                "info: embedding dimension changed ({old} → {dim}), \
768                 recreating vector table (all stored embeddings will be lost)..."
769            );
770            conn.execute_batch("DROP TABLE IF EXISTS chunk_vectors")?;
771            conn.execute_batch(&format!(
772                "CREATE VIRTUAL TABLE chunk_vectors USING vec0(\
773                 chunk_id INTEGER PRIMARY KEY, embedding FLOAT[{dim}])",
774            ))?;
775            conn.execute(
776                "INSERT OR REPLACE INTO vec_meta (key, value) VALUES ('embedding_dim', ?1)",
777                [dim.to_string()],
778            )?;
779        }
780    }
781    Ok(())
782}
783
784/// A result returned by [`search_fts_entries`].
785#[derive(serde::Serialize)]
786pub struct SearchResult {
787    /// The entry's numeric ID (CarettaId stored as i64).
788    pub id: i64,
789    pub title: String,
790    pub path: String,
791    /// For FTS5: BM25 rank (negative; more negative = higher relevance).
792    /// For vector search: L2 distance (lower = more similar).
793    pub score: f64,
794}
795
796/// Full-text search using the FTS5 trigram index.
797///
798/// Returns up to `limit` entries matching `query`, ordered by relevance.
799/// The trigram tokenizer supports substring and CJK queries with no
800/// special configuration.
801pub fn search_fts_entries(
802    conn: &Connection,
803    query: &str,
804    limit: usize,
805) -> Result<Vec<SearchResult>> {
806    let mut stmt = conn.prepare(
807        "SELECT e.id, e.title, e.path, fts.rank
808         FROM entries_fts fts
809         JOIN entries e ON e.id = fts.rowid
810         WHERE entries_fts MATCH ?1
811         ORDER BY fts.rank
812         LIMIT ?2",
813    )?;
814    let results = stmt
815        .query_map(params![query, limit as i64], |row| {
816            Ok(SearchResult {
817                id: row.get::<_, i64>(0)?,
818                title: row.get::<_, String>(1)?,
819                path: row.get::<_, String>(2)?,
820                score: row.get::<_, f64>(3).unwrap_or(0.0),
821            })
822        })?
823        .collect::<rusqlite::Result<_>>()?;
824    Ok(results)
825}
826
827fn upsert_entry(conn: &Connection, entry: &crate::entry::Entry) -> Result<()> {
828    let fm = &entry.frontmatter;
829    let path_str = entry.path.to_string_lossy();
830
831    conn.execute(
832        "INSERT OR REPLACE INTO entries (
833            id, parent_id, path,
834            title, slug, created_at, updated_at,
835            task_status, task_due, task_started_at, task_closed_at,
836            event_start, event_end,
837            body
838        ) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12, ?13, ?14)",
839        params![
840            fm.id,
841            fm.parent_id,
842            path_str.as_ref(),
843            fm.title,
844            fm.slug,
845            fm.created_at.format("%Y-%m-%dT%H:%M").to_string(),
846            fm.updated_at.format("%Y-%m-%dT%H:%M").to_string(),
847            fm.task.as_ref().map(|t| t.status.clone()),
848            fm.task.as_ref().and_then(|t| t.due)
849                .map(|d| d.format("%Y-%m-%dT%H:%M").to_string()),
850            fm.task.as_ref().and_then(|t| t.started_at)
851                .map(|d| d.format("%Y-%m-%dT%H:%M").to_string()),
852            fm.task.as_ref().and_then(|t| t.closed_at)
853                .map(|d| d.format("%Y-%m-%dT%H:%M").to_string()),
854            fm.event.as_ref().map(|e| e.start.format("%Y-%m-%dT%H:%M").to_string()),
855            fm.event.as_ref().map(|e| e.end.format("%Y-%m-%dT%H:%M").to_string()),
856            entry.body,
857        ],
858    )?;
859
860    // Sync tags: delete all existing then re-insert.
861    conn.execute("DELETE FROM tags WHERE entry_id = ?1", [fm.id])?;
862    for tag in &fm.tags {
863        conn.execute(
864            "INSERT OR IGNORE INTO tags (entry_id, tag) VALUES (?1, ?2)",
865            params![fm.id, tag],
866        )?;
867    }
868
869    upsert_chunks(conn, entry)?;
870
871    Ok(())
872}
873
/// Re-chunk the entry and upsert all chunks into the `chunks` table.
///
/// Old chunks that no longer exist (e.g. paragraphs deleted from the body) are
/// removed; new paragraphs are inserted; existing paragraphs are updated.
/// When the sqlite-vec `chunk_vectors` table is present on this connection,
/// stale embeddings for removed or changed chunks are deleted too, so only
/// chunks whose text is unchanged keep their stored embedding.
///
/// NOTE: statement order matters here — vector rows are deleted while the
/// corresponding `chunks` rows still exist (the subqueries need their ids),
/// and `conn.changes()` is read immediately after the chunk upsert.
fn upsert_chunks(conn: &Connection, entry: &crate::entry::Entry) -> Result<()> {
    let fm = &entry.frontmatter;
    let chunk_texts = crate::chunker::chunk_entry(&fm.title, &entry.body);
    // Whether the sqlite-vec extension is loaded on this connection.
    // chunk_vectors only exists when opened via open_cache_vec(); plain
    // open_cache() (used by sync/rebuild) does not load the extension.
    // Probed via sqlite_master rather than a flag so the same code path
    // works on both kinds of connection.
    let has_vec = conn
        .query_row(
            "SELECT 1 FROM sqlite_master WHERE type='table' AND name='chunk_vectors'",
            [],
            |_| Ok(()),
        )
        .is_ok();

    // Delete excess chunks (handles shrinking bodies).
    // Vector rows go first, while the chunk rows whose ids the subquery
    // resolves still exist.
    if has_vec {
        conn.execute(
            "DELETE FROM chunk_vectors
             WHERE chunk_id IN (
                 SELECT id FROM chunks WHERE entry_id = ?1 AND chunk_index >= ?2
             )",
            params![fm.id, chunk_texts.len() as i64],
        )?;
    }
    conn.execute(
        "DELETE FROM chunks WHERE entry_id = ?1 AND chunk_index >= ?2",
        params![fm.id, chunk_texts.len() as i64],
    )?;

    for (idx, text) in chunk_texts.iter().enumerate() {
        // Only apply the UPDATE when text actually changed; leaving text unchanged
        // means changes() returns 0 and we keep the existing (still valid) embedding.
        conn.execute(
            "INSERT INTO chunks (entry_id, chunk_index, text) VALUES (?1, ?2, ?3)
             ON CONFLICT(entry_id, chunk_index) DO UPDATE
             SET text = excluded.text
             WHERE text != excluded.text",
            params![fm.id, idx as i64, text],
        )?;
        // Row was inserted (new chunk) or updated (text changed) → stale embedding gone.
        // changes() reflects the most recent statement on this connection, so
        // nothing may run between the upsert above and this check.
        if has_vec && conn.changes() > 0 {
            conn.execute(
                "DELETE FROM chunk_vectors
                 WHERE chunk_id = (
                     SELECT id FROM chunks WHERE entry_id = ?1 AND chunk_index = ?2
                 )",
                params![fm.id, idx as i64],
            )?;
        }
    }

    Ok(())
}
930}