// lexa_obsidian/db.rs
//! `LexaObsidianDb` — an Obsidian-aware wrapper around `lexa_core::LexaDb`.
//!
//! All retrieval (BM25, dense, hybrid, rerank) and base storage live in
//! `lexa-core`. This crate adds:
//!
//! - Pre-index walk that strips frontmatter and parses Obsidian syntax
//!   (`[[wikilinks]]`, `#tags`, `^block-ids`, `![[embeds]]`) into the
//!   sidecar tables defined in [`schema`].
//! - Note-shaped query API: `search_notes`, `find_backlinks`,
//!   `list_tags`, `get_note`, `get_similar`, `vault_status`.

12use std::collections::HashMap;
13use std::fs;
14use std::path::{Path, PathBuf};
15
16use lexa_core::{
17    EmbeddingConfig, IndexStats, LexaDb, LexaError, PreprocessOutput, Preprocessor, SearchOptions,
18    SearchTier, Transaction,
19};
20use rusqlite::{params, OptionalExtension};
21use serde::{Deserialize, Serialize};
22
23use crate::frontmatter::{self, Frontmatter};
24use crate::tags;
25use crate::wikilinks::{self, LinkKind, Wikilink};
26use crate::{schema, Result};
27
/// Wrapper around `LexaDb` with Obsidian-aware indexing and queries.
pub struct LexaObsidianDb {
    // Core handle: storage, chunking, embeddings, and retrieval.
    inner: LexaDb,
    // Root of the Obsidian vault, exactly as passed to `open`.
    vault_root: PathBuf,
}
33
#[derive(Debug, Clone, Serialize)]
pub struct IndexReport {
    /// Markdown documents present in the index at the end of the run.
    pub notes_seen: usize,
    /// Notes actually (re)indexed this run, as reported by the core indexer.
    pub notes_indexed: usize,
    /// Notes that were in the index but no longer exist on disk; their
    /// `documents` rows (and CASCADE-linked sidecar rows) were removed
    /// by `index_vault`. Always reported so callers can surface "your
    /// deleted notes are no longer searchable" feedback.
    pub notes_deleted: usize,
    /// Total wikilinks written to `note_links` this run.
    pub links: usize,
    /// Total tags written to `note_tags` this run.
    pub tags: usize,
    /// Block ids inserted into `note_blocks` across all notes.
    pub blocks: usize,
}
47
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct SearchNotesOptions {
    /// Primary search query text.
    pub query: String,
    /// Retrieval tier forwarded to the core search.
    #[serde(default)]
    pub tier: SearchTier,
    /// Maximum number of hits returned. Deserialization defaults this
    /// to 10; note that `Default::default()` yields 0 instead.
    #[serde(default = "default_limit")]
    pub limit: usize,
    /// Tag filter: a note passes if it carries *any* of these tags.
    #[serde(default)]
    pub tags: Vec<String>,
    /// Path-prefix filters (relative to the vault root).
    #[serde(default)]
    pub folders: Vec<String>,
    /// Extra query strings forwarded to the core search.
    #[serde(default)]
    pub additional_queries: Vec<String>,
}
63
/// Serde default for [`SearchNotesOptions::limit`].
fn default_limit() -> usize {
    10
}
67
#[derive(Debug, Clone, Serialize)]
pub struct NoteHit {
    /// Note path as stored in `documents.path`.
    pub path: String,
    /// Display title from `note_metadata`, else the file stem.
    pub title: String,
    /// Relevance score from the core search.
    pub score: f32,
    /// Snippet of the matching chunk.
    pub excerpt: String,
    /// Heading of the matching chunk, when the core search reports one.
    pub heading: Option<String>,
    /// Start of the hit's line range, as reported by the core search.
    pub line_start: i64,
    /// End of the hit's line range, as reported by the core search.
    pub line_end: i64,
    /// All tags attached to the note.
    pub tags: Vec<String>,
    /// Per-tier score breakdown from lexa-core.
    pub breakdown: lexa_core::TierBreakdown,
}
80
#[derive(Debug, Clone, Serialize)]
pub struct Backlink {
    /// Path of the note containing the link.
    pub src_path: String,
    /// Title of the linking note, when it has a `note_metadata` row.
    pub src_title: Option<String>,
    /// Display alias (`[[target|alias]]`), if any.
    pub alias: Option<String>,
    /// Heading fragment (`[[target#heading]]`), if any.
    pub header: Option<String>,
    /// Block fragment (`[[target#^block]]`), if any.
    pub block_id: Option<String>,
    /// Link discriminator as stored in SQL ("embed" vs. plain link).
    pub kind: String,
}
90
/// One row of the tag-frequency table produced by `list_tags`.
#[derive(Debug, Clone, Serialize)]
pub struct TagCount {
    /// The tag itself.
    pub tag: String,
    /// Number of notes carrying the tag.
    pub count: i64,
}
96
/// One outgoing wikilink from a note.
#[derive(Debug, Clone, Serialize)]
pub struct LinkRef {
    /// Target note name as written inside the wikilink.
    pub target_name: String,
    /// Resolved path of the target, when the target is indexed.
    pub target_path: Option<String>,
    /// Heading fragment (`#heading`), if any.
    pub header: Option<String>,
    /// Block fragment (`#^block`), if any.
    pub block_id: Option<String>,
    /// Display alias (`|alias`), if any.
    pub alias: Option<String>,
    /// Link discriminator as stored ("embed" vs. plain link).
    pub kind: String,
}
106
/// Fully hydrated note as returned by `get_note`.
#[derive(Debug, Clone, Serialize)]
pub struct Note {
    /// Canonical on-disk path of the note.
    pub path: String,
    /// Resolved display title.
    pub title: String,
    /// Frontmatter flattened to JSON (see `frontmatter_to_json`).
    pub frontmatter: serde_json::Value,
    /// Note body with frontmatter stripped; may be narrowed to a single
    /// block when `get_note` was called with a block id.
    pub body: String,
    /// Tags attached to the note.
    pub tags: Vec<String>,
    /// Links this note makes to other notes.
    pub outgoing: Vec<LinkRef>,
    /// Links other notes make to this note.
    pub incoming: Vec<Backlink>,
}
117
/// Snapshot of index health returned by `vault_status`.
#[derive(Debug, Clone, Serialize)]
pub struct VaultStatus {
    /// Core index statistics from `LexaDb::stats`.
    pub stats: IndexStats,
    /// Vault root this database serves.
    pub vault_root: PathBuf,
    /// Count of indexed `.md` documents.
    pub note_count: i64,
    /// Count of distinct tags in `note_tags`.
    pub tag_count: i64,
    /// Total outgoing link rows in `note_links`.
    pub link_count: i64,
    /// True when zero notes are indexed. Staleness of existing notes is
    /// not detected by this flag.
    pub needs_index: bool,
}
127
128impl LexaObsidianDb {
    /// Open or create the SQLite-backed index for `vault_root` at
    /// `db_path`, applying both the lexa-core schema and the Obsidian
    /// sidecar schema.
    ///
    /// # Errors
    /// Propagates failures from `LexaDb::open` and `schema::migrate`.
    pub fn open(
        db_path: impl AsRef<Path>,
        vault_root: impl AsRef<Path>,
        embedding_config: EmbeddingConfig,
    ) -> Result<Self> {
        let inner = LexaDb::open(db_path, embedding_config)?;
        // Sidecar tables live in the same database file as the core schema.
        schema::migrate(inner.conn())?;
        Ok(Self {
            inner,
            vault_root: vault_root.as_ref().to_path_buf(),
        })
    }
144
    /// Root directory of the vault, as passed to [`Self::open`].
    pub fn vault_root(&self) -> &Path {
        &self.vault_root
    }

    /// Borrow the underlying `LexaDb` for direct core operations.
    pub fn inner(&self) -> &LexaDb {
        &self.inner
    }
152
    /// Walk the vault, strip frontmatter before chunking, embed the
    /// **body**, and populate `note_metadata` / `note_tags` /
    /// `note_links` inside the same transaction as the chunk insert.
    /// `note_blocks` is refreshed in a follow-up pass since block-id
    /// extraction needs the persisted `chunks.id`. Idempotent — the
    /// content-hash skip in lexa-core makes re-runs cheap.
    ///
    /// # Errors
    /// Propagates indexing, SQL, and filesystem errors from any pass.
    pub fn index_vault(&mut self) -> Result<IndexReport> {
        let mut report = IndexReport {
            notes_seen: 0,
            notes_indexed: 0,
            notes_deleted: 0,
            links: 0,
            tags: 0,
            blocks: 0,
        };

        let preprocessor = ObsidianPreprocessor;

        // Cell counters: the sidecar hook below is a closure invoked per
        // note, so totals are accumulated through interior mutability.
        let report_links = std::cell::Cell::new(0usize);
        let report_tags = std::cell::Cell::new(0usize);

        let indexed = self.inner.index_path_with_preprocessor::<NoteSidecar>(
            &self.vault_root,
            Some(&preprocessor),
            |tx, doc_id, payload| {
                // Non-markdown files carry a default payload; skip the
                // sidecar writes entirely for them.
                if !payload.is_obsidian_note {
                    return Ok(());
                }
                write_metadata_tx(tx, doc_id, &payload.title, &payload.frontmatter)?;
                replace_tags_tx(tx, doc_id, &payload.tags)?;
                replace_links_tx(tx, doc_id, &payload.links)?;
                report_tags.set(report_tags.get() + payload.tags.len());
                report_links.set(report_links.get() + payload.links.len());
                Ok(())
            },
        )?;
        report.notes_indexed = indexed;
        report.tags = report_tags.get();
        report.links = report_links.get();

        // Sweep for orphans: notes that are still in `documents` but have
        // disappeared from the on-disk vault. Without this, deleted notes
        // stay searchable forever.
        report.notes_deleted = self.purge_orphans()?;

        // Block IDs need chunks.id, which only exists post-commit.
        let docs = self.markdown_documents()?;
        report.notes_seen = docs.len();
        for (doc_id, _abs_path) in &docs {
            report.blocks += self.refresh_blocks(*doc_id)?;
        }

        // Final sweep: resolve any wiki-links whose target wasn't yet
        // indexed when the source note's batch flushed.
        self.resolve_pending_links()?;

        Ok(report)
    }
211
212    /// Remove `documents` rows whose path no longer exists on disk.
213    /// CASCADE on `documents.id` cleans up `chunks`, `chunks_fts`,
214    /// `vectors_bin`, `vectors_bin_preview`, `note_metadata`,
215    /// `note_links`, `note_tags`, and `note_blocks` automatically.
216    fn purge_orphans(&self) -> Result<usize> {
217        let docs = self.markdown_documents()?;
218        let mut orphan_ids = Vec::new();
219        for (doc_id, path) in docs {
220            if !path.exists() {
221                orphan_ids.push(doc_id);
222            }
223        }
224        if orphan_ids.is_empty() {
225            return Ok(0);
226        }
227        let conn = self.inner.conn();
228        for id in &orphan_ids {
229            conn.execute("DELETE FROM documents WHERE id = ?1", params![id])?;
230        }
231        Ok(orphan_ids.len())
232    }
233
    /// Hybrid retrieval restricted to indexed Obsidian notes.
    ///
    /// Over-fetches (2× `limit`) from the core search, then applies the
    /// folder and tag filters and truncates to `limit`. Tag filtering
    /// uses OR semantics: a note passes if it carries any requested tag.
    pub fn search_notes(&self, opts: &SearchNotesOptions) -> Result<Vec<NoteHit>> {
        let hits = self.inner.search(&SearchOptions {
            query: opts.query.clone(),
            tier: opts.tier,
            // Over-fetch so the post-filters below can still fill `limit`.
            limit: opts.limit.saturating_mul(2).max(opts.limit),
            additional_queries: opts.additional_queries.clone(),
        })?;
        let mut out = Vec::with_capacity(hits.len());
        for hit in hits {
            if !self.path_passes_folder_filter(&hit.path, &opts.folders) {
                continue;
            }
            // Hits without a `documents` row (e.g. just purged) are dropped.
            let doc_id = match self.lookup_doc_id(&hit.path)? {
                Some(id) => id,
                None => continue,
            };
            let tags = self.tags_for_doc(doc_id)?;
            if !opts.tags.is_empty() {
                let note_tags: std::collections::HashSet<&String> = tags.iter().collect();
                if !opts.tags.iter().any(|t| note_tags.contains(t)) {
                    continue;
                }
            }
            let title = self
                .title_for_doc(doc_id)?
                .unwrap_or_else(|| file_stem_of(&hit.path));
            out.push(NoteHit {
                path: hit.path.clone(),
                title,
                score: hit.score,
                excerpt: hit.excerpt.clone(),
                heading: hit.heading.clone(),
                line_start: hit.line_start,
                line_end: hit.line_end,
                tags,
                breakdown: hit.breakdown.clone(),
            });
            if out.len() >= opts.limit {
                break;
            }
        }
        Ok(out)
    }
278
    /// All links pointing at `note`, matched either by resolved target
    /// path or (case-insensitively) by target name, ordered by source
    /// path.
    pub fn find_backlinks(&self, note: &str) -> Result<Vec<Backlink>> {
        let conn = self.inner.conn();
        let resolved = self.resolve_note_argument(note)?;

        // `nl.target_path = ?1` is never true when ?1 is NULL (SQL
        // three-valued logic), so unresolved notes match by name only.
        let mut stmt = conn.prepare(
            "SELECT
                d.path,
                m.title,
                nl.alias,
                nl.header,
                nl.block_id,
                nl.kind
             FROM note_links nl
             JOIN documents d ON d.id = nl.src_doc_id
             LEFT JOIN note_metadata m ON m.doc_id = d.id
             WHERE nl.target_path = ?1 OR LOWER(nl.target_name) = LOWER(?2)
             ORDER BY d.path",
        )?;

        let rows = stmt.query_map(
            params![resolved.path.as_deref(), resolved.name.as_str()],
            |row| {
                Ok(Backlink {
                    src_path: row.get(0)?,
                    src_title: row.get::<_, Option<String>>(1)?,
                    alias: row.get::<_, Option<String>>(2)?,
                    header: row.get::<_, Option<String>>(3)?,
                    block_id: row.get::<_, Option<String>>(4)?,
                    kind: row.get(5)?,
                })
            },
        )?;
        rows.collect::<std::result::Result<Vec<_>, _>>()
            .map_err(LexaError::from)
    }
314
315    pub fn list_tags(&self, prefix: Option<&str>, limit: usize) -> Result<Vec<TagCount>> {
316        let conn = self.inner.conn();
317        let limit = limit.max(1) as i64;
318        let rows: Vec<TagCount> = if let Some(prefix) = prefix {
319            let pattern = format!("{}%", prefix.to_ascii_lowercase());
320            let mut stmt = conn.prepare(
321                "SELECT tag, COUNT(*) FROM note_tags
322                 WHERE tag LIKE ?1
323                 GROUP BY tag ORDER BY COUNT(*) DESC, tag ASC LIMIT ?2",
324            )?;
325            let rows: Result<Vec<_>> = stmt
326                .query_map(params![pattern, limit], |row| {
327                    Ok(TagCount {
328                        tag: row.get(0)?,
329                        count: row.get(1)?,
330                    })
331                })?
332                .collect::<std::result::Result<Vec<_>, _>>()
333                .map_err(LexaError::from);
334            rows?
335        } else {
336            let mut stmt = conn.prepare(
337                "SELECT tag, COUNT(*) FROM note_tags
338                 GROUP BY tag ORDER BY COUNT(*) DESC, tag ASC LIMIT ?1",
339            )?;
340            let rows: Result<Vec<_>> = stmt
341                .query_map(params![limit], |row| {
342                    Ok(TagCount {
343                        tag: row.get(0)?,
344                        count: row.get(1)?,
345                    })
346                })?
347                .collect::<std::result::Result<Vec<_>, _>>()
348                .map_err(LexaError::from);
349            rows?
350        };
351        Ok(rows)
352    }
353
    /// Load one note with frontmatter, tags, outgoing links, and
    /// backlinks. When `block` is given (`"^id"` or bare `"id"`), the
    /// body is narrowed to that block's chunk or containing paragraph;
    /// unknown block ids fall back to the full body.
    ///
    /// # Errors
    /// Fails when `note` can't be resolved to an on-disk file, the file
    /// can't be read, or the note isn't present in the index.
    pub fn get_note(&self, note: &str, block: Option<&str>) -> Result<Note> {
        let resolved = self.resolve_note_argument(note)?;
        let doc_path = resolved
            .path
            .clone()
            .ok_or_else(|| LexaError::InvalidPath(note.to_string()))?;
        // Read from disk rather than the index so the body is always fresh.
        let bytes = fs::read(&doc_path)?;
        let text = String::from_utf8_lossy(&bytes).into_owned();
        let stem = Path::new(&doc_path)
            .file_stem()
            .and_then(|s| s.to_str())
            .unwrap_or_default();
        let (fm, body_str, _) = frontmatter::parse(&text);
        let title = frontmatter::resolve_title(&fm, body_str, stem);
        let body = body_str.to_string();

        let conn = self.inner.conn();
        let doc_id = self
            .lookup_doc_id(&doc_path)?
            .ok_or_else(|| LexaError::InvalidPath(format!("note not indexed: {doc_path}")))?;
        let tags = self.tags_for_doc(doc_id)?;

        let mut outgoing_stmt = conn.prepare(
            "SELECT target_name, target_path, header, block_id, alias, kind
             FROM note_links WHERE src_doc_id = ?1",
        )?;
        let outgoing = outgoing_stmt
            .query_map(params![doc_id], |row| {
                Ok(LinkRef {
                    target_name: row.get(0)?,
                    target_path: row.get::<_, Option<String>>(1)?,
                    header: row.get::<_, Option<String>>(2)?,
                    block_id: row.get::<_, Option<String>>(3)?,
                    alias: row.get::<_, Option<String>>(4)?,
                    kind: row.get(5)?,
                })
            })?
            .collect::<std::result::Result<Vec<_>, _>>()?;

        let incoming = self.find_backlinks(&doc_path)?;

        // Narrow to the requested block, if any; a missing block keeps
        // the full body rather than erroring.
        let final_body = if let Some(block_id) = block {
            self.body_for_block(doc_id, &body, block_id)?
                .unwrap_or(body)
        } else {
            body
        };

        Ok(Note {
            path: doc_path,
            title,
            frontmatter: frontmatter_to_json(&fm),
            body: final_body,
            tags,
            outgoing,
            incoming,
        })
    }
412
413    pub fn get_similar(&self, note: &str, limit: usize) -> Result<Vec<NoteHit>> {
414        let resolved = self.resolve_note_argument(note)?;
415        let doc_path = resolved
416            .path
417            .ok_or_else(|| LexaError::InvalidPath(note.to_string()))?;
418        let bytes = fs::read(&doc_path)?;
419        let text = String::from_utf8_lossy(&bytes).into_owned();
420        let (_, body, _) = frontmatter::parse(&text);
421        // Take a representative slice; keep small to fit the embedder.
422        let snippet: String = body.chars().take(2_000).collect();
423        let opts = SearchNotesOptions {
424            query: snippet,
425            tier: SearchTier::Fast,
426            limit: limit.saturating_mul(2).max(limit),
427            tags: Vec::new(),
428            folders: Vec::new(),
429            additional_queries: Vec::new(),
430        };
431        let hits = self.search_notes(&opts)?;
432        Ok(hits
433            .into_iter()
434            .filter(|h| h.path != doc_path)
435            .take(limit)
436            .collect())
437    }
438
    /// Snapshot of index health: core stats plus note/tag/link counts.
    /// The count queries fall back to 0 on error — status reporting is
    /// deliberately best-effort and never fails on a partial schema.
    pub fn vault_status(&self) -> Result<VaultStatus> {
        let stats = self.inner.stats()?;
        let conn = self.inner.conn();
        let note_count: i64 = conn
            .query_row(
                "SELECT COUNT(*) FROM documents WHERE LOWER(path) LIKE '%.md'",
                [],
                |row| row.get(0),
            )
            .unwrap_or(0);
        let tag_count: i64 = conn
            .query_row("SELECT COUNT(DISTINCT tag) FROM note_tags", [], |row| {
                row.get(0)
            })
            .unwrap_or(0);
        let link_count: i64 = conn
            .query_row("SELECT COUNT(*) FROM note_links", [], |row| row.get(0))
            .unwrap_or(0);
        // Zero indexed notes is the only "needs indexing" signal here;
        // staleness of already-indexed notes is not detected.
        let needs_index = note_count == 0;
        Ok(VaultStatus {
            stats,
            vault_root: self.vault_root.clone(),
            note_count,
            tag_count,
            link_count,
            needs_index,
        })
    }
467
    /// Drop every indexed document under the vault root; returns the
    /// count reported by `LexaDb::purge_path`.
    pub fn purge_vault(&mut self) -> Result<usize> {
        // CASCADE on documents.id removes sidecar rows automatically.
        self.inner.purge_path(self.vault_root.clone())
    }
472
473    // -------------------- internals --------------------
474
475    fn markdown_documents(&self) -> Result<Vec<(i64, PathBuf)>> {
476        let mut stmt = self
477            .inner
478            .conn()
479            .prepare("SELECT id, path FROM documents WHERE LOWER(path) LIKE '%.md' ORDER BY id")?;
480        let rows = stmt.query_map([], |row| {
481            let id: i64 = row.get(0)?;
482            let path: String = row.get(1)?;
483            Ok((id, PathBuf::from(path)))
484        })?;
485        rows.collect::<std::result::Result<Vec<_>, _>>()
486            .map_err(LexaError::from)
487    }
488
489    // metadata / tags / links writes happen inside the lexa-core
490    // transaction; see `write_metadata_tx`, `replace_tags_tx`,
491    // `replace_links_tx` below.
492
    /// Rebuild `note_blocks` for one document by scanning each persisted
    /// chunk for a trailing `^block-id`. Runs after commit because it
    /// needs `chunks.id`. Returns the number of inserts attempted —
    /// note this counts rows skipped by `INSERT OR IGNORE` as well.
    fn refresh_blocks(&self, doc_id: i64) -> Result<usize> {
        let conn = self.inner.conn();
        conn.execute("DELETE FROM note_blocks WHERE doc_id = ?1", params![doc_id])?;
        let mut stmt = conn.prepare("SELECT id, text FROM chunks WHERE doc_id = ?1")?;
        let rows = stmt
            .query_map(params![doc_id], |row| {
                let id: i64 = row.get(0)?;
                let text: String = row.get(1)?;
                Ok((id, text))
            })?
            .collect::<std::result::Result<Vec<_>, _>>()?;
        // Materialized above; drop the statement before the insert loop.
        drop(stmt);
        let mut inserted = 0usize;
        for (chunk_id, text) in rows {
            if let Some(block_id) = trailing_block_id(&text) {
                conn.execute(
                    "INSERT OR IGNORE INTO note_blocks(chunk_id, doc_id, block_id)
                     VALUES(?1, ?2, ?3)",
                    params![chunk_id, doc_id, block_id],
                )?;
                inserted += 1;
            }
        }
        Ok(inserted)
    }
518
    /// Fill in `note_links.target_path` for links whose target note was
    /// indexed after the linking note: builds a lowercase-stem → path
    /// map over every markdown document and applies it to rows whose
    /// target is still NULL. On duplicate stems the first path seen
    /// wins (the query has no ORDER BY, so which one is arbitrary).
    fn resolve_pending_links(&self) -> Result<()> {
        let conn = self.inner.conn();
        let mut stmt = conn.prepare("SELECT path FROM documents WHERE LOWER(path) LIKE '%.md'")?;
        let mut by_stem: HashMap<String, String> = HashMap::new();
        for row in stmt.query_map([], |row| row.get::<_, String>(0))? {
            let path = row?;
            let stem = file_stem_of(&path).to_ascii_lowercase();
            by_stem.entry(stem).or_insert(path);
        }
        drop(stmt);

        let mut update_stmt = conn.prepare(
            "UPDATE note_links SET target_path = ?1
             WHERE LOWER(target_name) = ?2 AND target_path IS NULL",
        )?;
        for (stem, path) in &by_stem {
            update_stmt.execute(params![path, stem])?;
        }
        Ok(())
    }
539
540    fn lookup_doc_id(&self, path: &str) -> Result<Option<i64>> {
541        let row: Option<i64> = self
542            .inner
543            .conn()
544            .query_row(
545                "SELECT id FROM documents WHERE path = ?1",
546                params![path],
547                |row| row.get(0),
548            )
549            .optional()?;
550        Ok(row)
551    }
552
553    fn title_for_doc(&self, doc_id: i64) -> Result<Option<String>> {
554        let row: Option<String> = self
555            .inner
556            .conn()
557            .query_row(
558                "SELECT title FROM note_metadata WHERE doc_id = ?1",
559                params![doc_id],
560                |row| row.get(0),
561            )
562            .optional()?;
563        Ok(row)
564    }
565
566    fn tags_for_doc(&self, doc_id: i64) -> Result<Vec<String>> {
567        let mut stmt = self
568            .inner
569            .conn()
570            .prepare("SELECT tag FROM note_tags WHERE doc_id = ?1 ORDER BY tag")?;
571        let rows = stmt.query_map(params![doc_id], |row| row.get::<_, String>(0))?;
572        Ok(rows.collect::<std::result::Result<Vec<_>, _>>()?)
573    }
574
    /// Body text for a `^block-id` within a note: prefers the indexed
    /// chunk from `note_blocks`, falling back to the paragraph of
    /// `full_body` containing the literal `^id` marker. Returns `None`
    /// when neither lookup finds the block.
    fn body_for_block(
        &self,
        doc_id: i64,
        full_body: &str,
        block_id: &str,
    ) -> Result<Option<String>> {
        // Accept both "^id" and bare "id".
        let key = block_id.trim_start_matches('^');
        let mut stmt = self.inner.conn().prepare(
            "SELECT c.text FROM chunks c
             JOIN note_blocks b ON b.chunk_id = c.id
             WHERE b.doc_id = ?1 AND b.block_id = ?2",
        )?;
        let row: Option<String> = stmt
            .query_row(params![doc_id, key], |row| row.get(0))
            .optional()?;
        // If the block isn't in note_blocks (e.g. inline within a chunk),
        // try a substring match against the body.
        if row.is_some() {
            return Ok(row);
        }
        let needle = format!("^{}", key);
        if let Some(idx) = full_body.find(&needle) {
            // Return the paragraph containing the block id: expand to the
            // nearest blank-line boundary on each side.
            let start = full_body[..idx].rfind("\n\n").map(|p| p + 2).unwrap_or(0);
            let end = full_body[idx..]
                .find("\n\n")
                .map(|p| idx + p)
                .unwrap_or(full_body.len());
            return Ok(Some(full_body[start..end].to_string()));
        }
        Ok(None)
    }
607
608    fn resolve_note_argument(&self, note: &str) -> Result<ResolvedNote> {
609        // Treat as a path if it exists as a file.
610        let candidate = if Path::new(note).is_absolute() {
611            PathBuf::from(note)
612        } else {
613            self.vault_root.join(note)
614        };
615        if candidate.exists() {
616            let canonical = fs::canonicalize(&candidate)?;
617            let path = canonical.to_string_lossy().into_owned();
618            let name = canonical
619                .file_stem()
620                .and_then(|s| s.to_str())
621                .unwrap_or("")
622                .to_string();
623            return Ok(ResolvedNote {
624                path: Some(path),
625                name,
626            });
627        }
628        // Fall back to the raw stem so backlink lookups can use
629        // `LOWER(target_name) = LOWER(?)` even if the note itself isn't
630        // indexed (yet).
631        let stem = Path::new(note)
632            .file_stem()
633            .and_then(|s| s.to_str())
634            .unwrap_or(note)
635            .to_string();
636        let mut stmt = self.inner.conn().prepare(
637            "SELECT path FROM documents WHERE LOWER(path) LIKE '%' || LOWER(?1) || '.md'",
638        )?;
639        let path: Option<String> = stmt.query_row(params![stem], |row| row.get(0)).optional()?;
640        Ok(ResolvedNote { path, name: stem })
641    }
642
643    fn path_passes_folder_filter(&self, path: &str, folders: &[String]) -> bool {
644        if folders.is_empty() {
645            return true;
646        }
647        let path_str = match Path::new(path).strip_prefix(&self.vault_root) {
648            Ok(rel) => rel.to_string_lossy().into_owned(),
649            Err(_) => path.to_string(),
650        };
651        folders
652            .iter()
653            .any(|folder| path_str.starts_with(folder.as_str()))
654    }
655}
656
/// Outcome of `resolve_note_argument`: an optional concrete path plus
/// the bare note name (file stem) used for name-based lookups.
struct ResolvedNote {
    // On-disk (or indexed) path when found; None when only the stem
    // could be derived.
    path: Option<String>,
    // File stem, i.e. the note name without extension.
    name: String,
}
661
/// Last path component without its extension; empty string when the
/// path has no stem (or the stem is not valid UTF-8).
fn file_stem_of(path: &str) -> String {
    Path::new(path)
        .file_stem()
        .and_then(|stem| stem.to_str())
        .map(str::to_owned)
        .unwrap_or_default()
}
669
/// Extract a trailing Obsidian block id (`^id`) from the last non-empty
/// line of `text`. Ids may contain only ASCII alphanumerics, `-`, and
/// `_`; anything else (or a bare `^`) yields `None`.
///
/// Fix: the previous version carried a dead `strip_suffix`/`rest`
/// detour kept alive only by a `let _ = rest;` to silence the unused
/// warning — it never affected the result and is removed.
fn trailing_block_id(text: &str) -> Option<String> {
    // Last line with any non-whitespace content.
    let last = text.lines().rev().find(|l| !l.trim().is_empty())?;
    // Block ids always sit at the very end of the line.
    let candidate = last.split_whitespace().last()?;
    let id = candidate.strip_prefix('^')?;
    let valid = !id.is_empty()
        && id
            .chars()
            .all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_');
    if valid {
        Some(id.to_string())
    } else {
        None
    }
}
689
/// Flatten parsed [`Frontmatter`] into a JSON object: the typed fields
/// (`title`, `aliases`, `tags`) first, then every remaining raw YAML
/// entry. If `fm.raw` repeats one of the typed keys, the raw value wins
/// (later `insert` overwrites).
fn frontmatter_to_json(fm: &Frontmatter) -> serde_json::Value {
    let mut map = serde_json::Map::new();
    if let Some(title) = &fm.title {
        map.insert("title".into(), serde_json::Value::String(title.clone()));
    }
    // Empty alias/tag lists are omitted rather than serialized as [].
    if !fm.aliases.is_empty() {
        map.insert(
            "aliases".into(),
            serde_json::Value::Array(
                fm.aliases
                    .iter()
                    .map(|s| serde_json::Value::String(s.clone()))
                    .collect(),
            ),
        );
    }
    if !fm.tags.is_empty() {
        map.insert(
            "tags".into(),
            serde_json::Value::Array(
                fm.tags
                    .iter()
                    .map(|s| serde_json::Value::String(s.clone()))
                    .collect(),
            ),
        );
    }
    for (k, v) in &fm.raw {
        map.insert(k.clone(), serde_yaml_to_json(v));
    }
    serde_json::Value::Object(map)
}
722
723fn serde_yaml_to_json(value: &serde_yaml::Value) -> serde_json::Value {
724    match value {
725        serde_yaml::Value::Null => serde_json::Value::Null,
726        serde_yaml::Value::Bool(b) => serde_json::Value::Bool(*b),
727        serde_yaml::Value::Number(n) => {
728            if let Some(i) = n.as_i64() {
729                serde_json::Value::Number(i.into())
730            } else if let Some(f) = n.as_f64() {
731                serde_json::Number::from_f64(f)
732                    .map(serde_json::Value::Number)
733                    .unwrap_or(serde_json::Value::Null)
734            } else {
735                serde_json::Value::Null
736            }
737        }
738        serde_yaml::Value::String(s) => serde_json::Value::String(s.clone()),
739        serde_yaml::Value::Sequence(seq) => {
740            serde_json::Value::Array(seq.iter().map(serde_yaml_to_json).collect())
741        }
742        serde_yaml::Value::Mapping(m) => {
743            let mut out = serde_json::Map::new();
744            for (k, v) in m {
745                let key = match k {
746                    serde_yaml::Value::String(s) => s.clone(),
747                    other => serde_yaml::to_string(other)
748                        .unwrap_or_default()
749                        .trim()
750                        .to_string(),
751                };
752                out.insert(key, serde_yaml_to_json(v));
753            }
754            serde_json::Value::Object(out)
755        }
756        serde_yaml::Value::Tagged(tagged) => serde_yaml_to_json(&tagged.value),
757    }
758}
759
760// `LinkKind::as_str` gives us the SQL-stored discriminator. The reverse
761// mapping is exposed via `std::str::FromStr` so callers (and tests) can
762// recover a `LinkKind` from a database value without a custom helper.
763impl std::str::FromStr for LinkKind {
764    type Err = std::convert::Infallible;
765
766    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
767        Ok(match s {
768            "embed" => LinkKind::Embed,
769            _ => LinkKind::Link,
770        })
771    }
772}
773
/// Per-note sidecar payload threaded from preprocessor → commit hook.
#[derive(Default)]
struct NoteSidecar {
    // False for non-markdown files (the default payload); the commit
    // hook skips all sidecar writes for those.
    is_obsidian_note: bool,
    // Title from `frontmatter::resolve_title` (frontmatter / body /
    // file stem — see that helper for precedence).
    title: String,
    // Parsed YAML frontmatter, already stripped from the indexed body.
    frontmatter: Frontmatter,
    // Tags extracted from the body and frontmatter via `tags::extract`.
    tags: Vec<String>,
    // `[[wikilinks]]` / `![[embeds]]` found in the body.
    links: Vec<Wikilink>,
}
783
/// Strips frontmatter from `.md` files so it doesn't leak into the
/// embedding, then captures the parsed metadata for the sidecar tables.
/// Stateless unit struct; all work happens in its `Preprocessor` impl.
struct ObsidianPreprocessor;
787
impl Preprocessor for ObsidianPreprocessor {
    type Payload = NoteSidecar;

    /// Normalize CRLF line endings for every file; for `.md` files,
    /// additionally strip the frontmatter (only the body is chunked and
    /// embedded) and capture title, tags, and wikilinks into the
    /// sidecar payload.
    fn preprocess(
        &self,
        path: &Path,
        bytes: &[u8],
    ) -> Result<Option<PreprocessOutput<Self::Payload>>> {
        // Only `.md` (any case) receives the Obsidian treatment.
        let is_md = path
            .extension()
            .and_then(|e| e.to_str())
            .map(|e| e.eq_ignore_ascii_case("md"))
            .unwrap_or(false);
        if !is_md {
            // Non-markdown: pass text through with a default payload,
            // whose `is_obsidian_note == false` skips sidecar writes.
            return Ok(Some(PreprocessOutput {
                text: String::from_utf8_lossy(bytes).replace("\r\n", "\n"),
                payload: NoteSidecar::default(),
            }));
        }
        let text = String::from_utf8_lossy(bytes).replace("\r\n", "\n");
        let stem = path
            .file_stem()
            .and_then(|s| s.to_str())
            .unwrap_or_default();
        let (fm, body, _offset) = frontmatter::parse(&text);
        let title = frontmatter::resolve_title(&fm, body, stem);
        let extracted_tags = tags::extract(body, &fm);
        let extracted_links = wikilinks::extract(body);
        // Index the body only — frontmatter never reaches the embedder.
        Ok(Some(PreprocessOutput {
            text: body.to_string(),
            payload: NoteSidecar {
                is_obsidian_note: true,
                title,
                frontmatter: fm,
                tags: extracted_tags,
                links: extracted_links,
            },
        }))
    }
}
828
/// Upsert one note's `note_metadata` row (title plus JSON-encoded
/// aliases and raw frontmatter) inside the indexing transaction.
///
/// NOTE(review): JSON serialization failures are mapped to
/// `LexaError::Embedding`, which reads as the wrong variant for a
/// serialization error — confirm whether lexa-core offers a better fit.
fn write_metadata_tx(
    tx: &Transaction<'_>,
    doc_id: i64,
    title: &str,
    fm: &Frontmatter,
) -> lexa_core::Result<()> {
    let aliases_json =
        serde_json::to_string(&fm.aliases).map_err(|err| LexaError::Embedding(err.to_string()))?;
    // Re-wrap the raw frontmatter entries as a YAML mapping so the
    // shared YAML→JSON converter can produce the stored `raw_json`.
    let raw_yaml = serde_yaml::Value::Mapping(
        fm.raw
            .iter()
            .map(|(k, v)| (serde_yaml::Value::String(k.clone()), v.clone()))
            .collect(),
    );
    let raw_json = serde_json::to_string(&serde_yaml_to_json(&raw_yaml))
        .map_err(|err| LexaError::Embedding(err.to_string()))?;
    tx.execute(
        "INSERT INTO note_metadata(doc_id, title, aliases_json, raw_json)
         VALUES(?1, ?2, ?3, ?4)
         ON CONFLICT(doc_id) DO UPDATE SET
            title = excluded.title,
            aliases_json = excluded.aliases_json,
            raw_json = excluded.raw_json",
        params![doc_id, title, aliases_json, raw_json],
    )?;
    Ok(())
}
856
857fn replace_tags_tx(tx: &Transaction<'_>, doc_id: i64, tags: &[String]) -> lexa_core::Result<()> {
858    tx.execute("DELETE FROM note_tags WHERE doc_id = ?1", params![doc_id])?;
859    for tag in tags {
860        tx.execute(
861            "INSERT OR IGNORE INTO note_tags(doc_id, tag) VALUES(?1, ?2)",
862            params![doc_id, tag],
863        )?;
864    }
865    Ok(())
866}
867
868fn replace_links_tx(
869    tx: &Transaction<'_>,
870    doc_id: i64,
871    links: &[Wikilink],
872) -> lexa_core::Result<()> {
873    tx.execute(
874        "DELETE FROM note_links WHERE src_doc_id = ?1",
875        params![doc_id],
876    )?;
877    for link in links {
878        tx.execute(
879            "INSERT INTO note_links
880                (src_doc_id, target_name, target_path, header, block_id, alias, kind)
881             VALUES(?1, ?2, NULL, ?3, ?4, ?5, ?6)",
882            params![
883                doc_id,
884                link.target_name,
885                link.header,
886                link.block_id,
887                link.alias,
888                link.kind.as_str(),
889            ],
890        )?;
891    }
892    Ok(())
893}
894
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn trailing_block_id_extracts_basic() {
        assert_eq!(trailing_block_id("paragraph ^abc-1"), Some("abc-1".into()));
        assert_eq!(trailing_block_id("no marker here"), None);
    }

    #[test]
    fn trailing_block_id_edge_cases() {
        // Trailing blank lines are skipped; the last non-empty line wins.
        assert_eq!(trailing_block_id("body ^blk_2\n\n"), Some("blk_2".into()));
        // A bare caret carries no id.
        assert_eq!(trailing_block_id("^"), None);
        // Ids are restricted to ASCII alphanumerics, '-' and '_'.
        assert_eq!(trailing_block_id("ends ^bad!"), None);
        // Empty input has no lines at all.
        assert_eq!(trailing_block_id(""), None);
    }
}