mnem-cli 0.1.7

Command-line interface for mnem - Git for AI Agent Knowledge.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
//! `mnem reindex` - retro-embed nodes that don't yet have a vector.
//!
//! One-shot upgrade path for repos that grew
//! before `mnem add node` learned to auto-embed (C7-4). Walks every
//! node at HEAD, picks the ones missing a `dense_embed` (or all of
//! them when `--force`), embeds them via the configured provider,
//! and commits the result.
//!
//! Mirrors `mnem embed` (the historical spelling) closely; the new
//! verb is the one promoted in the user-facing error message that
//! `mnem add node` now prints when the embedder is unreachable.
//! Functionally:
//!
//! - Source text per node = `node.summary` when set; otherwise a
//! stringified `label + sorted props` so a node without a summary
//! still ends up with *some* vector instead of being silently
//! skipped.
//! - `--since <commit>` (optional) narrows the candidate set to
//! nodes that did not exist (or differ) at the supplied commit,
//! so an operator can re-embed only the tail of recent additions
//! without re-walking the whole graph.
//! - Idempotent: running twice with no flags is a no-op (the second
//! pass sees every node already has a vector for the current
//! model).

use super::*;
use indicatif::{ProgressBar, ProgressStyle};
use mnem_core::id::Cid;
use mnem_core::prolly::Cursor;
use std::collections::HashSet;
use std::time::Instant;

// CLI surface for `mnem reindex`.
//
// The `///` comments on the fields below are picked up by clap's derive and
// shown as `--help` text, so they are user-facing and must stay in sync with
// what `run` and the lift helpers actually do.
#[derive(clap::Args, Debug)]
#[command(after_long_help = "\
Retro-embed nodes that don't have a vector yet. One commit per run.

Examples:
 mnem reindex # embed every node missing a vector
 mnem reindex --label Person # only nodes of this label
 mnem reindex --since <commit> # only nodes added/changed since <commit>
 mnem reindex --force # re-embed even already-embedded nodes
 mnem reindex --dry-run # report count without changing anything
 mnem reindex --lift-legacy-extra # promote v0.3 inline embed bytes to sidecar
 mnem reindex --lift-legacy-sparse # promote pre-G17 inline sparse_embed to sidecar

Source text per node:
 - `summary` (the `-s` argument to `mnem add node`) when set
 - else `label + sorted props` rendered as text (so unsummarised
 nodes still receive a vector instead of being silently skipped)
")]
pub(crate) struct Args {
    /// Re-embed nodes that already have a vector for the current model.
    #[arg(long)]
    pub force: bool,
    /// Restrict to one label (ntype).
    #[arg(long)]
    pub label: Option<String>,
    /// Only re-embed nodes added (or changed) after this commit. The
    /// commit may be a CID, ref name, branch name, or `HEAD`. Nodes
    /// present in `<since>`'s nodes-tree are skipped.
    #[arg(long, value_name = "COMMIT")]
    pub since: Option<String>,
    /// Count and print what would be embedded; don't call the provider.
    #[arg(long)]
    pub dry_run: bool,
    /// Commit message (default: "mnem reindex: N nodes embedded with MODEL";
    /// the `--lift-legacy-*` modes use their own default message).
    #[arg(long, short = 'm')]
    pub message: Option<String>,
    /// Lift legacy v0.3 inline embeddings from `node.extra["embed"]` into the
    /// sidecar (`Commit.embeddings`) without re-deriving from text. Mutually
    /// exclusive with `--force`. Nodes that have no `extra["embed"]` key are
    /// skipped.
    #[arg(long)]
    pub lift_legacy_extra: bool,
    /// Lift pre-G17 inline sparse embeddings from `node.extra["sparse_embed"]`
    /// into the sparse sidecar (`Commit.sparse`) without re-encoding from text.
    /// Mutually exclusive with `--force`. Nodes that have no
    /// `extra["sparse_embed"]` key are silently skipped. Run this once after
    /// upgrading to v0.5 on a repo that was written with an earlier version.
    #[arg(long)]
    pub lift_legacy_sparse: bool,
}

/// Build the embed input for a node that has no usable `summary`.
///
/// The result is the node's label (`ntype`) followed by one `key=value`
/// token per prop, all joined with single spaces. Prop keys are sorted
/// before rendering so the text does not depend on the map's iteration
/// order and stays stable across runs.
fn fallback_text_of(node: &Node) -> String {
    let mut prop_names: Vec<&String> = node.props.keys().collect();
    prop_names.sort();

    // Keys come from the map itself, so the lookup is expected to succeed;
    // `filter_map` simply drops a key should it ever be absent.
    let rendered_props = prop_names.into_iter().filter_map(|name| {
        node.props
            .get(name)
            .map(|value| format!("{name}={}", ipld_to_text(value)))
    });

    std::iter::once(node.ntype.clone())
        .chain(rendered_props)
        .collect::<Vec<String>>()
        .join(" ")
}

/// Render a single `Ipld` value as plain text for the fallback embed input.
///
/// Scalars use their natural string form, `Null` becomes the empty string,
/// and byte strings are summarised as `[<len>b]` rather than dumped. Lists
/// and maps have no clean textual form, so they fall back to their `Debug`
/// rendering: the aim is to produce *some* embeddable text, not a
/// canonical one.
fn ipld_to_text(v: &Ipld) -> String {
    match v {
        Ipld::String(text) => text.to_owned(),
        Ipld::Integer(number) => format!("{number}"),
        Ipld::Float(number) => format!("{number}"),
        Ipld::Bool(flag) => format!("{flag}"),
        Ipld::Link(cid) => format!("{cid}"),
        Ipld::Bytes(raw) => format!("[{}b]", raw.len()),
        Ipld::Null => String::new(),
        // Composite values: Debug output of the whole value.
        Ipld::List(_) | Ipld::Map(_) => format!("{v:?}"),
    }
}

/// Choose the text that gets embedded for `node`.
///
/// Preference order:
/// 1. the node's `summary`, when present and not blank after trimming;
/// 2. whatever `embed_text_of` yields for the node;
/// 3. the label + sorted props rendering from `fallback_text_of`.
///
/// Unlike `embed_text_of`, this always produces a string, so a node
/// without a summary still ends up with a vector.
fn reindex_text_of(node: &Node) -> String {
    let usable_summary = node
        .summary
        .as_ref()
        .filter(|text| !text.trim().is_empty());

    match usable_summary {
        Some(text) => text.clone(),
        None => embed_text_of(node).unwrap_or_else(|| fallback_text_of(node)),
    }
}

/// Collect the CIDs of every node recorded in the nodes-tree of the commit
/// stored under `commit_cid`.
///
/// `--since` uses the returned set to drop candidates that already existed
/// (byte-for-byte, hence same CID) at that commit.
///
/// # Errors
/// Fails when the commit block is absent from the blockstore, when it does
/// not decode as a `Commit`, or when walking the nodes-tree fails.
fn nodes_at(
    bs: &std::sync::Arc<dyn mnem_core::store::Blockstore>,
    commit_cid: &Cid,
) -> Result<HashSet<Cid>> {
    let Some(raw) = bs.get(commit_cid)? else {
        return Err(anyhow!("commit CID {commit_cid} missing from store"));
    };
    let commit: Commit = from_canonical_bytes(&raw)?;

    // Only the value side of each tree entry (the node CID) is kept; the
    // first entry that fails to load aborts the walk.
    let present = Cursor::new(&**bs, &commit.nodes)?
        .into_iter()
        .map(|entry| entry.map(|(_key, node_cid)| node_cid))
        .collect::<std::result::Result<HashSet<Cid>, _>>()?;
    Ok(present)
}

/// Entry point for `mnem reindex`.
///
/// Steps, in order:
/// 1. Reject `--force` combined with either `--lift-legacy-*` flag.
/// 2. Locate the data dir and load its config; on the normal (non-lift)
///    path, bail when no embedder is configured.
/// 3. Open the repo. With no head commit, print a hint and return `Ok(())`.
/// 4. Resolve `--since` into the set of node CIDs present at that commit.
/// 5. Hand off to `run_lift_legacy_extra` / `run_lift_legacy_sparse` when
///    the matching flag is set.
/// 6. Otherwise open the embedder, scan the nodes at head for candidates,
///    embed each candidate, and commit all vectors in one transaction.
///
/// `--dry-run` stops after the scan and only prints the candidate count.
///
/// # Errors
/// Returns an error for the flag conflict, a missing embedder config, an
/// embedder that fails to open outside `--dry-run`, and any failure
/// propagated from the config, repo, blockstore, embedder, or commit calls.
///
/// # Panics
/// The two `expect` calls guard conditions checked earlier in this function
/// (embedder config present; embedder opened on the non-dry-run path). The
/// `unwrap` on the progress-bar template panics only if indicatif rejects
/// the hard-coded template string.
pub(crate) fn run(override_path: Option<&Path>, args: Args) -> Result<()> {
    // Guard: --lift-legacy-extra / --lift-legacy-sparse and --force are mutually exclusive.
    if args.lift_legacy_extra && args.force {
        anyhow::bail!(
            "--lift-legacy-extra and --force are mutually exclusive: \
 --lift-legacy-extra promotes existing inline bytes without re-deriving; \
 --force re-derives from text via the embedder. Pick one."
        );
    }
    if args.lift_legacy_sparse && args.force {
        anyhow::bail!(
            "--lift-legacy-sparse and --force are mutually exclusive: \
 --lift-legacy-sparse promotes existing inline bytes without re-encoding; \
 --force re-derives from text via the embedder. Pick one."
        );
    }

    // When --lift-legacy-extra or --lift-legacy-sparse is set we don't need
    // a configured embedder. For the normal path we do.
    let is_lift_only = args.lift_legacy_extra || args.lift_legacy_sparse;
    let data_dir = repo::locate_data_dir(override_path)?;
    let cfg = config::load(&data_dir)?;

    if !is_lift_only {
        // Presence check only; the resolved value is discarded here.
        let Some(_pc) = config::resolve_embedder(&cfg) else {
            anyhow::bail!(
                "no embedder configured; run `mnem config set embed.provider <openai|ollama>` \
 and `mnem config set embed.model <name>` first"
            );
        };
    }

    // For the normal embed path we need the provider config to open the embedder.
    // NOTE(review): this resolves the embedder config a second time; the
    // `expect` further down relies on it agreeing with the check above.
    let pc_opt = config::resolve_embedder(&cfg);

    let (_dir, r, bs, _ohs) = repo::open_all(Some(data_dir.as_path()))?;
    let Some(head) = r.head_commit() else {
        println!("no nodes in this repo yet (run `mnem add node --summary ...` first)");
        return Ok(());
    };

    // Resolve --since up front so we can fail fast on a bad arg
    // before opening the (potentially live-network) embedder.
    let since_set: Option<HashSet<Cid>> = match &args.since {
        None => None,
        Some(s) => {
            let cid = resolve_commitish(&r, s)?;
            Some(nodes_at(&bs, &cid)?)
        }
    };

    // --lift-legacy-extra path: scan nodes for extra["embed"] and lift
    // them into the sidecar. No embedder needed.
    if args.lift_legacy_extra {
        return run_lift_legacy_extra(&r, &bs, head, since_set, &args, &cfg);
    }

    // --lift-legacy-sparse path: scan nodes for extra["sparse_embed"] and lift
    // them into the sparse sidecar (Commit.sparse). No encoder needed.
    if args.lift_legacy_sparse {
        return run_lift_legacy_sparse(&r, &bs, head, since_set, &args, &cfg);
    }

    // Normal embed path: open the embedder.
    //
    // Strategy: always try to open. If `--dry-run` and the open
    // fails, we can still print the count using a placeholder model
    // string -- the user only cares about *how many* would change.
    // Per spec, Ollama-unreachable on `mnem reindex` is a hard error
    // (the user explicitly asked to embed, unlike `mnem add node`
    // where embedding is incidental); this matches `mnem embed`.
    let pc = pc_opt.expect("embedder config present (checked above)");
    let embedder_result = mnem_embed_providers::open(&pc);
    let (embedder, model_fq) = match (&embedder_result, args.dry_run) {
        (Ok(e), _) => {
            let m = e.model().to_string();
            (Some(e), m)
        }
        // NOTE(review): with this placeholder the `embedding_for` lookup in
        // the scan below is keyed on "<configured-embedder>", so presumably
        // no node counts as already embedded and the dry-run total can be an
        // over-estimate -- confirm.
        (Err(_), true) => (None, String::from("<configured-embedder>")),
        (Err(e), false) => {
            eprintln!("{}", format_embed_failure(e, &pc, "embedding"));
            anyhow::bail!("cannot reindex: embedder open failed (see above)");
        }
    };

    // Walk every node at head; pick candidates per the same rules as
    // `mnem embed`, with the addition of the `--since` filter.
    //
    // Candidates carry their existing node CID alongside the decoded
    // Node so the reindex commit can attach the new vector via
    // `Transaction::set_embedding(node_cid, ...)` without rewriting
    // the node body. Nothing on this path rewrites nodes through the
    // legacy `node.with_embed(emb)` route.
    let mut candidates: Vec<(Cid, Node)> = Vec::new();
    let mut total_nodes: usize = 0;
    let mut matched_label: usize = 0;
    let mut skipped_already_embedded: usize = 0;
    let mut skipped_outside_since: usize = 0;
    let cursor = Cursor::new(&*bs, &head.nodes)?;
    for entry in cursor {
        let (_k, node_cid) = entry?;
        let bytes = bs
            .get(&node_cid)?
            .ok_or_else(|| anyhow!("node CID {node_cid} missing from store"))?;
        let node: Node = from_canonical_bytes(&bytes)?;
        total_nodes += 1;

        // --since filter runs first: a node whose CID already existed at
        // the `--since` commit is skipped before the label check, so it is
        // never counted in `matched_label`.
        if let Some(set) = &since_set
            && set.contains(&node_cid)
        {
            skipped_outside_since += 1;
            continue;
        }

        // --label filter: non-matching nodes are dropped without a counter.
        if let Some(lbl) = &args.label
            && &node.ntype != lbl
        {
            continue;
        }
        matched_label += 1;

        // Embedding lives in the sidecar bucket keyed by NodeCid.
        // "Already embedded under this model" is a sidecar lookup,
        // not a node-body field; `--force` re-embeds regardless.
        let already = if args.force {
            false
        } else {
            r.embedding_for(&node_cid, &model_fq)?.is_some()
        };
        if already {
            skipped_already_embedded += 1;
            continue;
        }
        // Skip the `mnem init` anchor: it carries no summary, no
        // content, and no agent-meaningful text. The fallback path in
        // `reindex_text_of` would otherwise embed it from its
        // ntype/props (the "label+props fallback" docstring), which
        // means every retrieve thereafter surfaces the anchor as
        // low-score noise. `mnem_core::anchor::is_system_node` is the
        // canonical predicate; if more system nodes are added later,
        // the skip extends automatically.
        //
        // Note: a skipped system node has already been counted in
        // `matched_label` but lands in neither skip counter.
        if mnem_core::anchor::is_system_node(&node) {
            continue;
        }
        candidates.push((node_cid, node));
    }

    // Nothing to do: pick the most specific explanation the counters allow.
    if candidates.is_empty() {
        if matched_label == 0 {
            if let Some(lbl) = &args.label {
                // NOTE(review): this branch is also reached when `--since`
                // filtered out every node before the label check ran.
                println!(
                    "no nodes match --label {lbl} ({total_nodes} node(s) scanned; \
 drop --label to reindex across all labels)"
                );
            } else if since_set.is_some() && skipped_outside_since == total_nodes {
                println!(
                    "no nodes added since the supplied commit \
 ({total_nodes} node(s) scanned)"
                );
            } else {
                println!("repo has no nodes to reindex");
            }
        } else if skipped_already_embedded == matched_label {
            println!(
                "every matched node already has a {model_fq} vector \
 ({skipped_already_embedded} node(s)); use --force to re-embed"
            );
        } else {
            println!(
                "nothing to reindex: {matched_label} matched, \
 {skipped_already_embedded} already embedded"
            );
        }
        return Ok(());
    }

    if args.dry_run {
        println!("would reindex {} node(s) via {model_fq}", candidates.len());
        return Ok(());
    }

    // Past the dry-run gate, the embedder must be live -- we proved
    // this above by bailing on Err for the non-dry-run path.
    let embedder = embedder.expect("embedder live for non-dry-run path");

    let total = candidates.len();
    let started = Instant::now();
    eprintln!("reindexing {total} node(s) via {model_fq}");
    let pb = ProgressBar::new(total as u64);
    pb.set_style(
        ProgressStyle::with_template(
            " [{elapsed_precise}] {bar:32.cyan/blue} {pos}/{len} ({percent}%) ETA {eta}",
        )
        .unwrap()
        .progress_chars("=>-"),
    );

    // All vectors are staged in one transaction and committed once at the
    // end; an embed failure returns early via `?` before any commit.
    let mut tx = r.start_transaction();
    for (node_cid, node) in candidates {
        let text = reindex_text_of(&node);
        let v = embedder.embed(&text)?;
        let emb = mnem_embed_providers::to_embedding(&model_fq, &v);
        // Attach to the existing NodeCid via the
        // sidecar instead of rewriting the node body. The Node
        // bytes are unchanged so the CID we read from the cursor
        // is still the canonical key for this node's embeddings.
        tx.set_embedding(node_cid, model_fq.clone(), emb)?;
        pb.inc(1);
    }
    pb.finish_and_clear();

    // `-m/--message` wins; otherwise use the default that names the count and model.
    let msg = args
        .message
        .unwrap_or_else(|| format!("mnem reindex: {total} nodes embedded with {model_fq}"));
    let new_r = tx.commit(&config::author_string(&cfg), &msg)?;
    println!(
        "reindexed {total} node(s) in {:.1}s; committed as op {}",
        started.elapsed().as_secs_f32(),
        new_r.op_id()
    );
    Ok(())
}

/// `--lift-legacy-extra` path: walk HEAD nodes, find any that carry
/// `extra["embed"]`, decode the `Embedding` from that IPLD value, and
/// write it into the sidecar via `tx.set_embedding`.
///
/// A node is left alone when any of these holds:
/// - it is filtered out by `--since` or `--label`;
/// - it is a system node (`mnem_core::anchor::is_system_node`);
/// - it has no `extra["embed"]` key;
/// - its `extra["embed"]` value fails to decode (warned on stderr and
///   counted, but not fatal);
/// - the sidecar already holds a vector for the legacy embedding's model.
///   This keeps a repeated run a no-op (no new commit) and means legacy
///   bytes never overwrite a vector already present in the sidecar,
///   matching the `--lift-legacy-sparse` path.
///
/// With `--dry-run` only the counts are printed. Otherwise all lifted
/// embeddings are committed in a single transaction.
///
/// # Errors
/// Propagates blockstore, node-decoding, sidecar-lookup, and commit failures.
fn run_lift_legacy_extra(
    r: &mnem_core::repo::ReadonlyRepo,
    bs: &std::sync::Arc<dyn mnem_core::store::Blockstore>,
    head: &mnem_core::objects::Commit,
    since_set: Option<HashSet<Cid>>,
    args: &Args,
    cfg: &crate::config::Config,
) -> Result<()> {
    use mnem_core::objects::node::Embedding;

    let mut total_nodes: usize = 0;
    let mut legacy_count: usize = 0;
    let mut decode_errors: usize = 0;
    let mut already_lifted: usize = 0;

    // Collect (node_cid, embedding) pairs to lift.
    let mut to_lift: Vec<(Cid, Embedding)> = Vec::new();

    let cursor = Cursor::new(&**bs, &head.nodes)?;
    for entry in cursor {
        let (_k, node_cid) = entry?;
        let bytes = bs
            .get(&node_cid)?
            .ok_or_else(|| anyhow!("node CID {node_cid} missing from store"))?;
        let node: Node = from_canonical_bytes(&bytes)?;
        total_nodes += 1;

        // --since filter: skip nodes that were already present at that commit.
        if let Some(set) = &since_set
            && set.contains(&node_cid)
        {
            continue;
        }

        // --label filter.
        if let Some(lbl) = &args.label
            && &node.ntype != lbl
        {
            continue;
        }

        // Belt-and-suspenders skip for the system anchor: if an
        // operator ever sets `extra["embed"]` on it during repo
        // surgery, the lift would otherwise propagate that into the
        // sidecar and re-introduce the noise this whole filter
        // chain exists to prevent.
        if mnem_core::anchor::is_system_node(&node) {
            continue;
        }

        let Some(ipld_val) = node.extra.get("embed") else {
            continue;
        };

        // Decode the Ipld value as an Embedding via DAG-CBOR round-trip.
        // The legacy wire format encoded Embedding as a map; we re-encode
        // the Ipld to bytes and decode as Embedding using the same codec.
        let emb: Embedding = match decode_embedding_from_ipld(ipld_val) {
            Ok(e) => e,
            Err(err) => {
                eprintln!(
                    "warning: node {node_cid} has extra[\"embed\"] but decoding failed: \
                     {err}; skipping"
                );
                decode_errors += 1;
                continue;
            }
        };

        // Idempotency: skip nodes whose sidecar already carries a vector for
        // this model. A second `--lift-legacy-extra` run then produces no new
        // commit, and a vector already in the sidecar is never replaced by
        // the legacy inline bytes.
        if r.embedding_for(&node_cid, &emb.model)?.is_some() {
            already_lifted += 1;
            continue;
        }

        legacy_count += 1;
        to_lift.push((node_cid, emb));
    }

    if args.dry_run {
        println!(
            "would lift {legacy_count} legacy inline embedding(s) to sidecar \
             ({total_nodes} node(s) scanned, {decode_errors} decode error(s))"
        );
        return Ok(());
    }

    if to_lift.is_empty() {
        if already_lifted > 0 {
            // Legacy embeddings exist but were all lifted before; saying
            // "no nodes found" here would be wrong.
            println!(
                "all {already_lifted} legacy inline embedding(s) are already in the sidecar \
                 ({total_nodes} scanned); nothing to lift"
            );
        } else {
            println!(
                "no nodes with extra[\"embed\"] found ({total_nodes} scanned); \
                 nothing to lift"
            );
        }
        return Ok(());
    }

    let total = to_lift.len();
    let started = Instant::now();
    let mut tx = r.start_transaction();
    for (node_cid, emb) in to_lift {
        // The sidecar entry is keyed by the model recorded in the legacy embedding.
        let model = emb.model.clone();
        tx.set_embedding(node_cid, model, emb)?;
    }

    let msg = args.message.clone().unwrap_or_else(|| {
        format!("mnem reindex --lift-legacy-extra: {total} embedding(s) promoted to sidecar")
    });
    let new_r = tx.commit(&crate::config::author_string(cfg), &msg)?;
    println!(
        "lifted {total} embedding(s) to sidecar in {:.1}s; committed as op {}",
        started.elapsed().as_secs_f32(),
        new_r.op_id()
    );
    if decode_errors > 0 {
        eprintln!(
            "warning: {decode_errors} node(s) had undecodable extra[\"embed\"] and were skipped"
        );
    }
    Ok(())
}

/// Turn the `Ipld` value stored under `extra["embed"]` back into an
/// [`Embedding`].
///
/// The value is re-encoded to canonical bytes, decoded as an `Embedding`
/// with the same codec, and then checked with `Embedding::validate` so a
/// malformed legacy value is rejected before it can reach the sidecar.
///
/// # Errors
/// Returns an error naming the failing stage: re-encode, decode, or
/// validation.
fn decode_embedding_from_ipld(val: &Ipld) -> Result<mnem_core::objects::node::Embedding> {
    use mnem_core::codec::{from_canonical_bytes, to_canonical_bytes};
    use mnem_core::objects::node::Embedding;

    let encoded = match to_canonical_bytes(val) {
        Ok(buf) => buf,
        Err(e) => return Err(anyhow!("CBOR re-encode of extra[\"embed\"] failed: {e}")),
    };
    let decoded: Embedding = match from_canonical_bytes(&encoded) {
        Ok(embedding) => embedding,
        Err(e) => return Err(anyhow!("decode of extra[\"embed\"] as Embedding failed: {e}")),
    };
    if let Err(e) = decoded.validate() {
        return Err(anyhow!("extra[\"embed\"] Embedding invariant violated: {e:?}"));
    }
    Ok(decoded)
}

/// `--lift-legacy-sparse` path: walk HEAD nodes, find any that carry
/// `extra["sparse_embed"]` (written by pre-G17 versions), decode the
/// `SparseEmbed` from that IPLD value, and write it into the sparse
/// sidecar via `tx.set_sparse_embedding`. Run once after upgrading a repo
/// to v0.5.
///
/// A node is left alone when any of these holds:
/// - it is filtered out by `--since` or `--label`;
/// - it is a system node (`mnem_core::anchor::is_system_node`);
/// - it has no `extra["sparse_embed"]` key;
/// - its value fails to decode (warned on stderr and counted, not fatal);
/// - the sparse sidecar already holds an entry for the same `vocab_id`,
///   which makes a repeated run a no-op with no new commit.
///
/// With `--dry-run` only the counts are printed. Otherwise all lifted
/// sparse embeddings are committed in a single transaction.
///
/// # Errors
/// Propagates blockstore, node-decoding, sidecar-lookup, and commit failures.
fn run_lift_legacy_sparse(
    r: &mnem_core::repo::ReadonlyRepo,
    bs: &std::sync::Arc<dyn mnem_core::store::Blockstore>,
    head: &mnem_core::objects::Commit,
    since_set: Option<HashSet<Cid>>,
    args: &Args,
    cfg: &crate::config::Config,
) -> Result<()> {
    use mnem_core::sparse::SparseEmbed;

    let mut total_nodes: usize = 0;
    let mut legacy_count: usize = 0;
    let mut decode_errors: usize = 0;
    let mut already_lifted: usize = 0;

    // Collect (node_cid, sparse_embed) pairs to lift.
    let mut to_lift: Vec<(Cid, SparseEmbed)> = Vec::new();

    let cursor = Cursor::new(&**bs, &head.nodes)?;
    for entry in cursor {
        let (_k, node_cid) = entry?;
        let bytes = bs
            .get(&node_cid)?
            .ok_or_else(|| anyhow!("node CID {node_cid} missing from store"))?;
        let node: Node = from_canonical_bytes(&bytes)?;
        total_nodes += 1;

        // --since filter: skip nodes present at the earlier commit.
        if let Some(set) = &since_set
            && set.contains(&node_cid)
        {
            continue;
        }

        // --label filter.
        if let Some(lbl) = &args.label
            && &node.ntype != lbl
        {
            continue;
        }

        // Belt-and-suspenders skip for the system anchor: a stray
        // `extra["sparse_embed"]` on it must not be propagated into the
        // sidecar.
        if mnem_core::anchor::is_system_node(&node) {
            continue;
        }

        let Some(ipld_val) = node.extra.get("sparse_embed") else {
            continue;
        };

        // Decode the Ipld value as a SparseEmbed via DAG-CBOR round-trip.
        let se: SparseEmbed = match decode_sparse_embed_from_ipld(ipld_val) {
            Ok(s) => s,
            Err(err) => {
                eprintln!(
                    "warning: node {node_cid} has extra[\"sparse_embed\"] but decoding \
                     failed: {err}; skipping"
                );
                decode_errors += 1;
                continue;
            }
        };

        // Idempotency: skip nodes whose sidecar already carries this vocab_id.
        // This makes a second `--lift-legacy-sparse` run a true no-op (no new
        // commit) rather than writing identical bytes again. The skip is
        // counted so the "nothing to lift" message can say why.
        if r.sparse_for(&node_cid, &se.vocab_id)?.is_some() {
            already_lifted += 1;
            continue;
        }

        legacy_count += 1;
        to_lift.push((node_cid, se));
    }

    if args.dry_run {
        println!(
            "would lift {legacy_count} legacy inline sparse embedding(s) to sidecar \
             ({total_nodes} node(s) scanned, {decode_errors} decode error(s))"
        );
        return Ok(());
    }

    if to_lift.is_empty() {
        if already_lifted > 0 {
            // Legacy sparse embeddings exist but were all lifted before;
            // saying "no nodes found" here would be wrong.
            println!(
                "all {already_lifted} legacy inline sparse embedding(s) are already in the \
                 sidecar ({total_nodes} scanned); nothing to lift"
            );
        } else {
            println!(
                "no nodes with extra[\"sparse_embed\"] found ({total_nodes} scanned); \
                 nothing to lift"
            );
        }
        return Ok(());
    }

    let total = to_lift.len();
    let started = Instant::now();
    let mut tx = r.start_transaction();
    for (node_cid, se) in to_lift {
        // The sparse sidecar entry is keyed by the embed's own vocab_id.
        let vocab_id = se.vocab_id.clone();
        tx.set_sparse_embedding(node_cid, vocab_id, se)?;
    }

    let msg = args.message.clone().unwrap_or_else(|| {
        format!(
            "mnem reindex --lift-legacy-sparse: {total} sparse embedding(s) promoted to sidecar"
        )
    });
    let new_r = tx.commit(&crate::config::author_string(cfg), &msg)?;
    println!(
        "lifted {total} sparse embedding(s) to sidecar in {:.1}s; committed as op {}",
        started.elapsed().as_secs_f32(),
        new_r.op_id()
    );
    if decode_errors > 0 {
        eprintln!(
            "warning: {decode_errors} node(s) had undecodable extra[\"sparse_embed\"] and \
             were skipped"
        );
    }
    Ok(())
}

/// Turn the `Ipld` value stored under `extra["sparse_embed"]` back into a
/// [`SparseEmbed`].
///
/// The value is re-encoded to canonical bytes, decoded as a `SparseEmbed`
/// with the same codec, and then checked with [`SparseEmbed::validate`] so
/// corrupt legacy data is rejected before it can reach the sidecar.
///
/// # Errors
/// Returns an error naming the failing stage: re-encode, decode, or
/// validation.
fn decode_sparse_embed_from_ipld(val: &Ipld) -> Result<mnem_core::sparse::SparseEmbed> {
    use mnem_core::codec::{from_canonical_bytes, to_canonical_bytes};
    use mnem_core::sparse::SparseEmbed;

    let encoded = match to_canonical_bytes(val) {
        Ok(buf) => buf,
        Err(e) => return Err(anyhow!("CBOR re-encode of extra[\"sparse_embed\"] failed: {e}")),
    };
    let decoded: SparseEmbed = match from_canonical_bytes(&encoded) {
        Ok(sparse) => sparse,
        Err(e) => {
            return Err(anyhow!(
                "decode of extra[\"sparse_embed\"] as SparseEmbed failed: {e}"
            ));
        }
    };
    if let Err(e) = decoded.validate() {
        return Err(anyhow!(
            "extra[\"sparse_embed\"] SparseEmbed invariant violated: {e}"
        ));
    }
    Ok(decoded)
}

#[cfg(test)]
mod tests {
    use super::*;
    use mnem_core::id::NodeId;

    /// The fallback text leads with the label and lists props in key order,
    /// regardless of the order they were attached in.
    #[test]
    fn fallback_text_uses_label_and_sorted_props() {
        let node_id = NodeId::from_bytes_raw([1u8; 16]);
        let node = Node::new(node_id, "Person")
            .with_prop("name", Ipld::String("Alice".into()))
            .with_prop("city", Ipld::String("Berlin".into()));

        let text = fallback_text_of(&node);

        // "Person" first; props alphabetised.
        assert!(text.starts_with("Person "), "got: {text}");
        let city_at = text.find("city=").expect("city present");
        let name_at = text.find("name=").expect("name present");
        assert!(city_at < name_at, "props must be sorted: {text}");
    }

    /// A non-empty summary is returned verbatim, ahead of any props.
    #[test]
    fn reindex_text_prefers_summary() {
        let node_id = NodeId::from_bytes_raw([2u8; 16]);
        let node = Node::new(node_id, "Doc")
            .with_summary("Important brief")
            .with_prop("title", Ipld::String("X".into()));

        let text = reindex_text_of(&node);

        assert_eq!(text, "Important brief");
    }

    /// With no summary, the text still mentions the label and the props.
    #[test]
    fn reindex_text_falls_back_when_no_summary_or_content() {
        let node_id = NodeId::from_bytes_raw([3u8; 16]);
        let node = Node::new(node_id, "Person").with_prop("name", Ipld::String("Bob".into()));

        let text = reindex_text_of(&node);

        assert!(text.contains("Person"));
        assert!(text.contains("name=Bob"));
    }
}