Skip to main content

engram/storage/
auto_linker.rs

1//! Auto-linking engine for emergent knowledge graph.
2//!
3//! Automatically creates typed links between memories based on semantic similarity
4//! (via embedding cosine distance) and other signals (temporal proximity, co-occurrence).
5//!
6//! # Tables
7//! - `auto_links` — machine-generated links (schema v18)
8//!
9//! # Usage
10//! ```ignore
11//! let embedder = TfIdfEmbedder::new(384);
12//! let opts = SemanticLinkOptions::default();
13//! let result = run_semantic_linker(&conn, &embedder, &opts)?;
14//! println!("Created {} links over {} memories", result.links_created, result.memories_processed);
15//!
16//! // Temporal proximity linking
17//! let t_opts = TemporalLinkOptions::default();
18//! let t_result = run_temporal_linker(&conn, &t_opts)?;
19//! println!("Created {} temporal links", t_result.links_created);
20//! ```
21
22use rusqlite::{params, Connection};
23use serde::{Deserialize, Serialize};
24
25use crate::embedding::{cosine_similarity, get_embedding, Embedder};
26use crate::error::Result;
27
28// ---------------------------------------------------------------------------
29// Public types
30// ---------------------------------------------------------------------------
31
32/// Summary of a completed auto-linking run.
33#[derive(Debug, Clone)]
34pub struct AutoLinkResult {
35    /// Number of new links inserted into `auto_links`.
36    pub links_created: usize,
37    /// Number of memories examined.
38    pub memories_processed: usize,
39    /// Wall-clock time for the run in milliseconds.
40    pub duration_ms: u64,
41}
42
43/// Configuration for semantic auto-linking.
44#[derive(Debug, Clone)]
45pub struct SemanticLinkOptions {
46    /// Minimum cosine similarity to create a link (0.0 – 1.0). Default: 0.75.
47    pub threshold: f32,
48    /// Maximum links created *per memory* (top-N by score). Default: 5.
49    pub max_links_per_memory: usize,
50    /// Restrict to a single workspace. `None` processes all workspaces.
51    pub workspace: Option<String>,
52    /// How many memories to load per batch (controls memory usage). Default: 100.
53    pub batch_size: usize,
54}
55
56impl Default for SemanticLinkOptions {
57    fn default() -> Self {
58        Self {
59            threshold: 0.75,
60            max_links_per_memory: 5,
61            workspace: None,
62            batch_size: 100,
63        }
64    }
65}
66
67/// Configuration for temporal proximity auto-linking.
68///
69/// Links memories that were created close together in time, reflecting the idea
70/// that co-temporal thoughts are often related (session clustering, task batches, etc.).
71#[derive(Debug, Clone)]
72pub struct TemporalLinkOptions {
73    /// Time window in minutes. Memories created within this window of each other
74    /// will be considered for linking. Default: 30 minutes.
75    pub window_minutes: u64,
76    /// Maximum temporal links created *per memory*. Default: 5.
77    pub max_links_per_memory: usize,
78    /// Minimum temporal overlap in seconds. When set, only memories whose
79    /// creation times overlap within this tighter bound are linked. This can be
80    /// used to restrict linking to memories within the same "session".
81    /// Default: `None` (no additional constraint beyond `window_minutes`).
82    pub min_overlap_secs: Option<u64>,
83    /// Restrict processing to a single workspace. `None` processes all workspaces.
84    pub workspace: Option<String>,
85}
86
87impl Default for TemporalLinkOptions {
88    fn default() -> Self {
89        Self {
90            window_minutes: 30,
91            max_links_per_memory: 5,
92            min_overlap_secs: None,
93            workspace: None,
94        }
95    }
96}
97
98/// A single auto-link record as stored in the `auto_links` table.
99#[derive(Debug, Clone, Serialize, Deserialize)]
100pub struct AutoLink {
101    pub id: i64,
102    pub from_id: i64,
103    pub to_id: i64,
104    pub link_type: String,
105    pub score: f64,
106    pub created_at: String,
107}
108
109// ---------------------------------------------------------------------------
110// Core functions
111// ---------------------------------------------------------------------------
112
113/// Insert a single auto-link, ignoring duplicate (from_id, to_id, link_type) triplets.
114///
115/// Returns `Ok(true)` if the row was inserted and `Ok(false)` if it was already present.
116pub fn insert_auto_link(
117    conn: &Connection,
118    from_id: i64,
119    to_id: i64,
120    link_type: &str,
121    score: f64,
122) -> Result<bool> {
123    let rows = conn.execute(
124        "INSERT OR IGNORE INTO auto_links (from_id, to_id, link_type, score)
125         VALUES (?1, ?2, ?3, ?4)",
126        params![from_id, to_id, link_type, score],
127    )?;
128    Ok(rows > 0)
129}
130
131/// Run the semantic auto-linker over memories that have stored embeddings.
132///
133/// The algorithm:
134/// 1. Fetch IDs of all memories that have embeddings (optionally filtered by workspace).
135/// 2. Load embeddings in batches of `options.batch_size`.
136/// 3. For each memory, compute cosine similarity against all *later* memories in the
137///    current window (avoids duplicate pair computation).
138/// 4. Collect pairs above `options.threshold`, sort by score descending, take the top
139///    `options.max_links_per_memory` for each side.
140/// 5. Insert into `auto_links` with `link_type = "semantic"` using `INSERT OR IGNORE`.
141///
142/// # Complexity
143/// O(n²) pairwise within each batch. For large collections you should lower `batch_size`
144/// or schedule the job during off-peak hours.
145pub fn run_semantic_linker(
146    conn: &Connection,
147    _embedder: &dyn Embedder,
148    options: &SemanticLinkOptions,
149) -> Result<AutoLinkResult> {
150    let start = std::time::Instant::now();
151    let mut links_created = 0usize;
152
153    // 1. Fetch all memory IDs that have stored embeddings, optionally workspace-filtered.
154    let ids: Vec<i64> = if let Some(ws) = &options.workspace {
155        let mut stmt = conn.prepare(
156            "SELECT m.id FROM memories m
157             WHERE m.has_embedding = 1 AND m.valid_to IS NULL
158               AND m.workspace = ?1
159             ORDER BY m.id ASC
160             LIMIT ?2",
161        )?;
162        let rows: Vec<i64> = stmt
163            .query_map(params![ws, options.batch_size as i64], |row| row.get(0))?
164            .filter_map(|r| r.ok())
165            .collect();
166        rows
167    } else {
168        let mut stmt = conn.prepare(
169            "SELECT m.id FROM memories m
170             WHERE m.has_embedding = 1 AND m.valid_to IS NULL
171             ORDER BY m.id ASC
172             LIMIT ?1",
173        )?;
174        let rows: Vec<i64> = stmt
175            .query_map(params![options.batch_size as i64], |row| row.get(0))?
176            .filter_map(|r| r.ok())
177            .collect();
178        rows
179    };
180
181    let memories_processed = ids.len();
182
183    // 2. Load embeddings for those IDs.
184    let mut embeddings: Vec<(i64, Vec<f32>)> = Vec::with_capacity(ids.len());
185    for id in &ids {
186        if let Ok(Some(emb)) = get_embedding(conn, *id) {
187            embeddings.push((*id, emb));
188        }
189    }
190
191    // 3. Pairwise similarity — upper triangle only (i < j avoids double-counting).
192    //    For each memory i we collect all pairs above threshold, then honour max_links_per_memory.
193    //    We also need to honour the limit from memory j's perspective; we track per-id counts.
194    let n = embeddings.len();
195    if n < 2 {
196        return Ok(AutoLinkResult {
197            links_created: 0,
198            memories_processed,
199            duration_ms: start.elapsed().as_millis() as u64,
200        });
201    }
202
203    // Collect all qualifying pairs: (score, from_id, to_id) sorted descending by score.
204    let mut pairs: Vec<(f32, i64, i64)> = Vec::new();
205
206    for i in 0..n {
207        for j in (i + 1)..n {
208            let sim = cosine_similarity(&embeddings[i].1, &embeddings[j].1);
209            if sim >= options.threshold {
210                pairs.push((sim, embeddings[i].0, embeddings[j].0));
211            }
212        }
213    }
214
215    // Sort by score descending so we take the strongest links first.
216    pairs.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
217
218    // Enforce max_links_per_memory: track how many links each memory has received.
219    let mut link_counts: std::collections::HashMap<i64, usize> = std::collections::HashMap::new();
220
221    for (score, from_id, to_id) in pairs {
222        let from_count = link_counts.entry(from_id).or_insert(0);
223        if *from_count >= options.max_links_per_memory {
224            continue;
225        }
226        let to_count = link_counts.entry(to_id).or_insert(0);
227        if *to_count >= options.max_links_per_memory {
228            continue;
229        }
230
231        // 4. Insert the link (INSERT OR IGNORE respects the UNIQUE constraint).
232        let inserted = insert_auto_link(conn, from_id, to_id, "semantic", score as f64)?;
233        if inserted {
234            links_created += 1;
235            *link_counts.entry(from_id).or_insert(0) += 1;
236            *link_counts.entry(to_id).or_insert(0) += 1;
237        }
238    }
239
240    Ok(AutoLinkResult {
241        links_created,
242        memories_processed,
243        duration_ms: start.elapsed().as_millis() as u64,
244    })
245}
246
247/// Run the temporal proximity auto-linker over memories.
248///
249/// The algorithm:
250/// 1. Fetch all active memories ordered by `created_at` (optionally workspace-filtered).
251/// 2. For each memory `m`, walk forward and backward in the sorted list to find
252///    others whose `created_at` lies within `options.window_minutes` of `m`.
253/// 3. Compute a proximity score: `score = 1.0 - (time_diff_minutes / window_minutes)`.
254///    Memories created at exactly the same time receive score 1.0; memories at the
255///    edge of the window receive score approaching 0.0.
256/// 4. Optionally apply `min_overlap_secs` as an additional tighter bound.
257/// 5. Collect candidates per memory, sort descending by score, take top
258///    `max_links_per_memory`, then insert with `link_type = "temporal"` using
259///    `INSERT OR IGNORE` for idempotency.
260///
261/// # Notes
262/// - Links are directional in the table (from_id < to_id is *not* enforced), but the
263///   UNIQUE index on `(from_id, to_id, link_type)` prevents exact duplicates.
264/// - The function uses a linear scan of the sorted result set, so complexity is O(n × k)
265///   where k is the average number of memories per window — efficient in practice.
266pub fn run_temporal_linker(
267    conn: &Connection,
268    options: &TemporalLinkOptions,
269) -> Result<AutoLinkResult> {
270    let start = std::time::Instant::now();
271    let mut links_created = 0usize;
272
273    let window_secs = options.window_minutes as f64 * 60.0;
274
275    // -----------------------------------------------------------------------
276    // 1. Load all active memories with their creation timestamps.
277    //    We store (id, created_secs) tuples for fast arithmetic comparisons.
278    // -----------------------------------------------------------------------
279
280    // Helper: convert raw (id, ts_string) pairs into (id, unix_secs) pairs.
281    fn collect_rows(raw: Vec<(i64, String)>) -> Vec<(i64, f64)> {
282        raw.into_iter()
283            .filter_map(|(id, ts)| parse_timestamp_to_secs(&ts).map(|s| (id, s)))
284            .collect()
285    }
286
287    let rows: Vec<(i64, f64)> = if let Some(ws) = &options.workspace {
288        let mut stmt = conn.prepare(
289            "SELECT id, created_at
290             FROM memories
291             WHERE valid_to IS NULL AND workspace = ?1
292             ORDER BY created_at ASC",
293        )?;
294        let raw: Vec<(i64, String)> = stmt
295            .query_map(params![ws], |row| Ok((row.get(0)?, row.get(1)?)))?
296            .filter_map(|r| r.ok())
297            .collect();
298        collect_rows(raw)
299    } else {
300        let mut stmt = conn.prepare(
301            "SELECT id, created_at
302             FROM memories
303             WHERE valid_to IS NULL
304             ORDER BY created_at ASC",
305        )?;
306        let raw: Vec<(i64, String)> = stmt
307            .query_map([], |row| Ok((row.get(0)?, row.get(1)?)))?
308            .filter_map(|r| r.ok())
309            .collect();
310        collect_rows(raw)
311    };
312
313    let memories_processed = rows.len();
314
315    if memories_processed < 2 {
316        return Ok(AutoLinkResult {
317            links_created: 0,
318            memories_processed,
319            duration_ms: start.elapsed().as_millis() as u64,
320        });
321    }
322
323    // -----------------------------------------------------------------------
324    // 2. Pairwise scan within window — rows are sorted by created_at, so
325    //    for each i we only need to walk forward until the time delta exceeds
326    //    window_secs (early exit).
327    // -----------------------------------------------------------------------
328
329    // Collect all qualifying candidates: (score, from_id, to_id).
330    let mut candidates: Vec<(f64, i64, i64)> = Vec::new();
331
332    for i in 0..memories_processed {
333        for j in (i + 1)..memories_processed {
334            let diff_secs = rows[j].1 - rows[i].1;
335
336            // Sorted ascending → once diff exceeds window, no point continuing.
337            if diff_secs > window_secs {
338                break;
339            }
340
341            // Optional minimum-overlap check: skip if diff is ABOVE min_overlap_secs.
342            // (min_overlap_secs constrains how *close* they must be, acting as a tighter window.)
343            if let Some(min_secs) = options.min_overlap_secs {
344                if diff_secs > min_secs as f64 {
345                    continue;
346                }
347            }
348
349            // Score: 1.0 when diff=0, approaches 0.0 at diff=window_secs.
350            let score = if window_secs > 0.0 {
351                1.0 - (diff_secs / window_secs)
352            } else {
353                1.0
354            };
355
356            candidates.push((score, rows[i].0, rows[j].0));
357        }
358    }
359
360    // Sort by score descending to honour max_links_per_memory with strongest links first.
361    candidates.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
362
363    // -----------------------------------------------------------------------
364    // 3. Enforce per-memory link cap, then insert.
365    // -----------------------------------------------------------------------
366    let mut link_counts: std::collections::HashMap<i64, usize> = std::collections::HashMap::new();
367
368    for (score, from_id, to_id) in candidates {
369        {
370            let from_count = link_counts.entry(from_id).or_insert(0);
371            if *from_count >= options.max_links_per_memory {
372                continue;
373            }
374        }
375        {
376            let to_count = link_counts.entry(to_id).or_insert(0);
377            if *to_count >= options.max_links_per_memory {
378                continue;
379            }
380        }
381
382        let inserted = insert_auto_link(conn, from_id, to_id, "temporal", score)?;
383        if inserted {
384            links_created += 1;
385            *link_counts.entry(from_id).or_insert(0) += 1;
386            *link_counts.entry(to_id).or_insert(0) += 1;
387        }
388    }
389
390    Ok(AutoLinkResult {
391        links_created,
392        memories_processed,
393        duration_ms: start.elapsed().as_millis() as u64,
394    })
395}
396
397/// List auto-links, optionally filtered by `link_type`.
398///
399/// Results are ordered by score descending.  `limit` is capped to 1000 to
400/// prevent accidental full-table scans; pass 1000 explicitly for large queries.
401pub fn list_auto_links(
402    conn: &Connection,
403    link_type: Option<&str>,
404    limit: usize,
405) -> Result<Vec<AutoLink>> {
406    let capped_limit = limit.min(1000);
407
408    let rows: Vec<AutoLink> = if let Some(lt) = link_type {
409        let mut stmt = conn.prepare(
410            "SELECT id, from_id, to_id, link_type, score, created_at
411             FROM auto_links
412             WHERE link_type = ?1
413             ORDER BY score DESC
414             LIMIT ?2",
415        )?;
416        let collected: Vec<AutoLink> = stmt
417            .query_map(params![lt, capped_limit as i64], row_to_auto_link)?
418            .filter_map(|r| r.ok())
419            .collect();
420        collected
421    } else {
422        let mut stmt = conn.prepare(
423            "SELECT id, from_id, to_id, link_type, score, created_at
424             FROM auto_links
425             ORDER BY score DESC
426             LIMIT ?1",
427        )?;
428        let collected: Vec<AutoLink> = stmt
429            .query_map(params![capped_limit as i64], row_to_auto_link)?
430            .filter_map(|r| r.ok())
431            .collect();
432        collected
433    };
434
435    Ok(rows)
436}
437
438/// Return auto-link statistics as a JSON object, grouped by `link_type`.
439///
440/// Example output:
441/// ```json
442/// { "semantic": 42, "temporal": 7 }
443/// ```
444pub fn auto_link_stats(conn: &Connection) -> Result<serde_json::Value> {
445    let mut stmt = conn.prepare(
446        "SELECT link_type, COUNT(*) as cnt
447         FROM auto_links
448         GROUP BY link_type
449         ORDER BY link_type ASC",
450    )?;
451
452    let mut map = serde_json::Map::new();
453    let rows = stmt.query_map([], |row| {
454        let lt: String = row.get(0)?;
455        let cnt: i64 = row.get(1)?;
456        Ok((lt, cnt))
457    })?;
458
459    for row in rows.filter_map(|r| r.ok()) {
460        map.insert(row.0, serde_json::Value::Number(row.1.into()));
461    }
462
463    Ok(serde_json::Value::Object(map))
464}
465
466// ---------------------------------------------------------------------------
467// Private helpers
468// ---------------------------------------------------------------------------
469
470/// Parse a timestamp string (RFC3339 or SQLite CURRENT_TIMESTAMP format) into
471/// UNIX seconds as f64.  Returns `None` if parsing fails — those rows are
472/// silently skipped by the temporal linker.
473fn parse_timestamp_to_secs(ts: &str) -> Option<f64> {
474    // Try RFC 3339 first (the canonical engram format).
475    if let Ok(dt) = chrono::DateTime::parse_from_rfc3339(ts) {
476        return Some(dt.timestamp() as f64 + dt.timestamp_subsec_nanos() as f64 * 1e-9);
477    }
478    // Fall back to SQLite's CURRENT_TIMESTAMP format: "YYYY-MM-DD HH:MM:SS"
479    if let Ok(ndt) = chrono::NaiveDateTime::parse_from_str(ts, "%Y-%m-%d %H:%M:%S") {
480        return Some(ndt.and_utc().timestamp() as f64);
481    }
482    None
483}
484
485fn row_to_auto_link(row: &rusqlite::Row) -> rusqlite::Result<AutoLink> {
486    Ok(AutoLink {
487        id: row.get(0)?,
488        from_id: row.get(1)?,
489        to_id: row.get(2)?,
490        link_type: row.get(3)?,
491        score: row.get(4)?,
492        created_at: row.get(5)?,
493    })
494}
495
496// ---------------------------------------------------------------------------
497// Tests
498// ---------------------------------------------------------------------------
499
500#[cfg(test)]
501mod tests {
502    use super::*;
503    use crate::embedding::TfIdfEmbedder;
504    use crate::storage::migrations::run_migrations;
505    use rusqlite::Connection;
506
507    /// Create an in-memory SQLite database with the full schema applied.
508    fn setup_db() -> Connection {
509        let conn = Connection::open_in_memory().expect("in-memory db");
510        run_migrations(&conn).expect("migrations");
511        conn
512    }
513
514    /// Insert a minimal memory row and optionally a stored embedding, returning the new ID.
515    fn insert_memory_with_embedding(
516        conn: &Connection,
517        content: &str,
518        embedding: Option<&[f32]>,
519    ) -> i64 {
520        conn.execute(
521            "INSERT INTO memories (content, memory_type, has_embedding)
522             VALUES (?1, 'note', ?2)",
523            params![content, embedding.is_some() as i32],
524        )
525        .expect("insert memory");
526        let id = conn.last_insert_rowid();
527
528        if let Some(emb) = embedding {
529            // Serialise f32 slice as little-endian bytes (same format as EmbeddingWorker).
530            let bytes: Vec<u8> = emb.iter().flat_map(|f| f.to_le_bytes()).collect();
531            conn.execute(
532                "INSERT INTO embeddings (memory_id, embedding, model, dimensions)
533                 VALUES (?1, ?2, 'tfidf', ?3)",
534                params![id, bytes, emb.len() as i64],
535            )
536            .expect("insert embedding");
537        }
538
539        id
540    }
541
542    // ------------------------------------------------------------------
543    // insert_auto_link tests
544    // ------------------------------------------------------------------
545
546    #[test]
547    fn test_insert_auto_link_creates_a_link() {
548        let conn = setup_db();
549        let a = insert_memory_with_embedding(&conn, "alpha", None);
550        let b = insert_memory_with_embedding(&conn, "beta", None);
551
552        let inserted = insert_auto_link(&conn, a, b, "semantic", 0.9).expect("insert");
553        assert!(inserted, "first insert should return true");
554
555        let count: i64 = conn
556            .query_row("SELECT COUNT(*) FROM auto_links", [], |r| r.get(0))
557            .unwrap();
558        assert_eq!(count, 1);
559    }
560
561    #[test]
562    fn test_insert_auto_link_is_idempotent() {
563        let conn = setup_db();
564        let a = insert_memory_with_embedding(&conn, "alpha", None);
565        let b = insert_memory_with_embedding(&conn, "beta", None);
566
567        let first = insert_auto_link(&conn, a, b, "semantic", 0.9).expect("first insert");
568        let second = insert_auto_link(&conn, a, b, "semantic", 0.9).expect("second insert");
569
570        assert!(first, "first insert should return true");
571        assert!(!second, "duplicate insert should return false");
572
573        let count: i64 = conn
574            .query_row("SELECT COUNT(*) FROM auto_links", [], |r| r.get(0))
575            .unwrap();
576        assert_eq!(count, 1, "only one row should exist after duplicate insert");
577    }
578
579    #[test]
580    fn test_insert_auto_link_different_type_is_not_duplicate() {
581        let conn = setup_db();
582        let a = insert_memory_with_embedding(&conn, "alpha", None);
583        let b = insert_memory_with_embedding(&conn, "beta", None);
584
585        insert_auto_link(&conn, a, b, "semantic", 0.9).unwrap();
586        let second = insert_auto_link(&conn, a, b, "temporal", 0.5).unwrap();
587
588        assert!(second, "different link_type should be a new row");
589        let count: i64 = conn
590            .query_row("SELECT COUNT(*) FROM auto_links", [], |r| r.get(0))
591            .unwrap();
592        assert_eq!(count, 2);
593    }
594
595    // ------------------------------------------------------------------
596    // run_semantic_linker tests
597    // ------------------------------------------------------------------
598
599    #[test]
600    fn test_run_semantic_linker_processes_memories_and_creates_links() {
601        let conn = setup_db();
602        let embedder = TfIdfEmbedder::new(4);
603
604        // Two memories with identical embeddings → cosine similarity = 1.0
605        let emb = vec![1.0f32, 0.0, 0.0, 0.0];
606        let _a = insert_memory_with_embedding(&conn, "memory A", Some(&emb));
607        let _b = insert_memory_with_embedding(&conn, "memory B", Some(&emb));
608
609        let opts = SemanticLinkOptions {
610            threshold: 0.9,
611            max_links_per_memory: 5,
612            workspace: None,
613            batch_size: 100,
614        };
615
616        let result = run_semantic_linker(&conn, &embedder, &opts).expect("linker");
617
618        assert_eq!(result.memories_processed, 2);
619        assert_eq!(result.links_created, 1, "one link for the identical pair");
620    }
621
622    #[test]
623    fn test_threshold_filtering_lower_threshold_creates_more_links() {
624        let conn = setup_db();
625        let embedder = TfIdfEmbedder::new(4);
626
627        // Three memories: A & B are identical (sim=1.0), A & C are orthogonal (sim=0.0)
628        let emb_a = vec![1.0f32, 0.0, 0.0, 0.0];
629        let emb_b = vec![1.0f32, 0.0, 0.0, 0.0];
630        let emb_c = vec![0.0f32, 1.0, 0.0, 0.0]; // orthogonal to A and B
631
632        insert_memory_with_embedding(&conn, "A", Some(&emb_a));
633        insert_memory_with_embedding(&conn, "B", Some(&emb_b));
634        insert_memory_with_embedding(&conn, "C", Some(&emb_c));
635
636        // High threshold: only A-B should be linked
637        let high_opts = SemanticLinkOptions {
638            threshold: 0.9,
639            max_links_per_memory: 5,
640            workspace: None,
641            batch_size: 100,
642        };
643        let result_high = run_semantic_linker(&conn, &embedder, &high_opts).expect("high");
644        let count_high: i64 = conn
645            .query_row("SELECT COUNT(*) FROM auto_links", [], |r| r.get(0))
646            .unwrap();
647        assert_eq!(result_high.links_created, 1);
648
649        // Delete links so we can rerun with a low threshold.
650        conn.execute("DELETE FROM auto_links", []).unwrap();
651
652        // Low threshold: A-B still linked; A-C and B-C are orthogonal (sim=0), still not linked.
653        // So result should be the same for orthogonal vectors.
654        let low_opts = SemanticLinkOptions {
655            threshold: 0.0,
656            max_links_per_memory: 5,
657            workspace: None,
658            batch_size: 100,
659        };
660        let result_low = run_semantic_linker(&conn, &embedder, &low_opts).expect("low");
661        // With threshold=0.0, zero-similarity pairs are also included (sim >= 0.0).
662        // A-B (1.0), A-C (0.0), B-C (0.0) → 3 links
663        assert!(
664            result_low.links_created >= count_high as usize,
665            "lower threshold should create at least as many links"
666        );
667    }
668
669    #[test]
670    fn test_max_links_per_memory_is_respected() {
671        let conn = setup_db();
672        let embedder = TfIdfEmbedder::new(4);
673
674        // Six memories all with the same embedding → all pairs have similarity = 1.0
675        let emb = vec![1.0f32, 0.0, 0.0, 0.0];
676        for i in 0..6 {
677            insert_memory_with_embedding(&conn, &format!("memory {}", i), Some(&emb));
678        }
679
680        let opts = SemanticLinkOptions {
681            threshold: 0.9,
682            max_links_per_memory: 2, // limit to 2 links per memory
683            workspace: None,
684            batch_size: 100,
685        };
686
687        run_semantic_linker(&conn, &embedder, &opts).expect("linker");
688
689        // Fetch per-memory link counts and ensure none exceed max_links_per_memory.
690        let mut stmt = conn
691            .prepare(
692                "SELECT mem_id, COUNT(*) as cnt FROM (
693                     SELECT from_id AS mem_id FROM auto_links
694                     UNION ALL
695                     SELECT to_id AS mem_id FROM auto_links
696                 ) GROUP BY mem_id",
697            )
698            .unwrap();
699
700        let counts: Vec<i64> = stmt
701            .query_map([], |r| r.get(1))
702            .unwrap()
703            .filter_map(|r| r.ok())
704            .collect();
705
706        for cnt in &counts {
707            assert!(
708                *cnt <= opts.max_links_per_memory as i64,
709                "memory exceeds max_links_per_memory: {} > {}",
710                cnt,
711                opts.max_links_per_memory
712            );
713        }
714    }
715
716    // ------------------------------------------------------------------
717    // list_auto_links tests
718    // ------------------------------------------------------------------
719
720    #[test]
721    fn test_list_auto_links_returns_results() {
722        let conn = setup_db();
723        let a = insert_memory_with_embedding(&conn, "A", None);
724        let b = insert_memory_with_embedding(&conn, "B", None);
725        let c = insert_memory_with_embedding(&conn, "C", None);
726
727        insert_auto_link(&conn, a, b, "semantic", 0.9).unwrap();
728        insert_auto_link(&conn, b, c, "temporal", 0.6).unwrap();
729
730        let all = list_auto_links(&conn, None, 10).expect("list all");
731        assert_eq!(all.len(), 2);
732
733        let semantic = list_auto_links(&conn, Some("semantic"), 10).expect("list semantic");
734        assert_eq!(semantic.len(), 1);
735        assert_eq!(semantic[0].link_type, "semantic");
736
737        let temporal = list_auto_links(&conn, Some("temporal"), 10).expect("list temporal");
738        assert_eq!(temporal.len(), 1);
739        assert_eq!(temporal[0].link_type, "temporal");
740    }
741
742    #[test]
743    fn test_list_auto_links_ordered_by_score_descending() {
744        let conn = setup_db();
745        let a = insert_memory_with_embedding(&conn, "A", None);
746        let b = insert_memory_with_embedding(&conn, "B", None);
747        let c = insert_memory_with_embedding(&conn, "C", None);
748
749        insert_auto_link(&conn, a, b, "semantic", 0.5).unwrap();
750        insert_auto_link(&conn, a, c, "semantic", 0.95).unwrap();
751
752        let links = list_auto_links(&conn, Some("semantic"), 10).unwrap();
753        assert_eq!(links.len(), 2);
754        assert!(
755            links[0].score >= links[1].score,
756            "results should be ordered by score desc"
757        );
758    }
759
760    // ------------------------------------------------------------------
761    // auto_link_stats tests
762    // ------------------------------------------------------------------
763
764    #[test]
765    fn test_auto_link_stats_returns_counts() {
766        let conn = setup_db();
767        let a = insert_memory_with_embedding(&conn, "A", None);
768        let b = insert_memory_with_embedding(&conn, "B", None);
769        let c = insert_memory_with_embedding(&conn, "C", None);
770
771        insert_auto_link(&conn, a, b, "semantic", 0.8).unwrap();
772        insert_auto_link(&conn, a, c, "semantic", 0.7).unwrap();
773        insert_auto_link(&conn, b, c, "temporal", 0.5).unwrap();
774
775        let stats = auto_link_stats(&conn).expect("stats");
776
777        assert_eq!(stats["semantic"], serde_json::json!(2));
778        assert_eq!(stats["temporal"], serde_json::json!(1));
779    }
780
781    #[test]
782    fn test_auto_link_stats_empty_returns_empty_object() {
783        let conn = setup_db();
784        let stats = auto_link_stats(&conn).expect("stats");
785        assert!(stats.as_object().unwrap().is_empty());
786    }
787
788    // ------------------------------------------------------------------
789    // run_temporal_linker tests
790    // ------------------------------------------------------------------
791
792    /// Insert a minimal memory with an explicit created_at timestamp (RFC3339).
793    fn insert_memory_at(conn: &Connection, content: &str, created_at: &str) -> i64 {
794        conn.execute(
795            "INSERT INTO memories (content, memory_type, created_at)
796             VALUES (?1, 'note', ?2)",
797            params![content, created_at],
798        )
799        .expect("insert memory with timestamp");
800        conn.last_insert_rowid()
801    }
802
803    #[test]
804    fn test_temporal_linker_creates_link_for_close_memories() {
805        let conn = setup_db();
806
807        // Two memories 5 minutes apart — within the default 30-minute window.
808        let a = insert_memory_at(&conn, "first thought", "2024-01-01T10:00:00Z");
809        let b = insert_memory_at(&conn, "second thought", "2024-01-01T10:05:00Z");
810
811        let opts = TemporalLinkOptions::default();
812        let result = run_temporal_linker(&conn, &opts).expect("temporal linker");
813
814        assert_eq!(result.memories_processed, 2);
815        assert_eq!(
816            result.links_created, 1,
817            "one temporal link for the nearby pair"
818        );
819
820        // Verify link is stored with the correct type.
821        let links = list_auto_links(&conn, Some("temporal"), 10).expect("list");
822        assert_eq!(links.len(), 1);
823        assert_eq!(links[0].from_id, a);
824        assert_eq!(links[0].to_id, b);
825        assert_eq!(links[0].link_type, "temporal");
826    }
827
828    #[test]
829    fn test_temporal_linker_no_link_outside_window() {
830        let conn = setup_db();
831
832        // Two memories 60 minutes apart — outside the default 30-minute window.
833        let _a = insert_memory_at(&conn, "morning thought", "2024-01-01T08:00:00Z");
834        let _b = insert_memory_at(&conn, "afternoon thought", "2024-01-01T09:00:00Z");
835
836        let opts = TemporalLinkOptions::default(); // window_minutes = 30
837        let result = run_temporal_linker(&conn, &opts).expect("temporal linker");
838
839        assert_eq!(result.memories_processed, 2);
840        assert_eq!(
841            result.links_created, 0,
842            "memories 60m apart should not be linked"
843        );
844    }
845
846    #[test]
847    fn test_temporal_score_formula_closer_means_higher_score() {
848        let conn = setup_db();
849
850        // Three memories: A at T=0, B at T=5min, C at T=25min (within 30-min window).
851        let _a = insert_memory_at(&conn, "A", "2024-01-01T10:00:00Z");
852        let _b = insert_memory_at(&conn, "B", "2024-01-01T10:05:00Z");
853        let _c = insert_memory_at(&conn, "C", "2024-01-01T10:25:00Z");
854
855        let opts = TemporalLinkOptions {
856            window_minutes: 30,
857            max_links_per_memory: 10,
858            min_overlap_secs: None,
859            workspace: None,
860        };
861        run_temporal_linker(&conn, &opts).expect("linker");
862
863        // A-B are 5 min apart → score = 1 - (5/30) ≈ 0.833
864        // A-C are 25 min apart → score = 1 - (25/30) ≈ 0.167
865        // B-C are 20 min apart → score = 1 - (20/30) ≈ 0.333
866        let links = list_auto_links(&conn, Some("temporal"), 10).expect("list");
867        assert_eq!(links.len(), 3, "all three pairs within window");
868
869        // List is sorted by score descending; A-B should be first.
870        assert!(
871            links[0].score > links[1].score,
872            "higher score should come first"
873        );
874        assert!(
875            links[0].score > 0.8,
876            "A-B score should be ~0.833, got {}",
877            links[0].score
878        );
879    }
880
881    #[test]
882    fn test_temporal_linker_idempotent() {
883        let conn = setup_db();
884
885        let _a = insert_memory_at(&conn, "A", "2024-01-01T10:00:00Z");
886        let _b = insert_memory_at(&conn, "B", "2024-01-01T10:10:00Z");
887
888        let opts = TemporalLinkOptions::default();
889
890        let first = run_temporal_linker(&conn, &opts).expect("first run");
891        let second = run_temporal_linker(&conn, &opts).expect("second run");
892
893        assert_eq!(first.links_created, 1);
894        assert_eq!(
895            second.links_created, 0,
896            "second run should create no new links"
897        );
898
899        let count: i64 = conn
900            .query_row(
901                "SELECT COUNT(*) FROM auto_links WHERE link_type = 'temporal'",
902                [],
903                |r| r.get(0),
904            )
905            .unwrap();
906        assert_eq!(count, 1, "exactly one temporal link should exist");
907    }
908
909    #[test]
910    fn test_temporal_linker_max_links_per_memory_respected() {
911        let conn = setup_db();
912
913        // Five memories all within a 10-minute window — all pairs qualify.
914        // With max_links_per_memory=2, each memory gets at most 2 links.
915        for i in 0..5i64 {
916            let ts = format!("2024-01-01T10:{:02}:00Z", i * 2); // 0, 2, 4, 6, 8 minutes
917            insert_memory_at(&conn, &format!("mem {}", i), &ts);
918        }
919
920        let opts = TemporalLinkOptions {
921            window_minutes: 30,
922            max_links_per_memory: 2,
923            min_overlap_secs: None,
924            workspace: None,
925        };
926        run_temporal_linker(&conn, &opts).expect("linker");
927
928        // Verify per-memory link counts.
929        let mut stmt = conn
930            .prepare(
931                "SELECT mem_id, COUNT(*) as cnt FROM (
932                     SELECT from_id AS mem_id FROM auto_links WHERE link_type = 'temporal'
933                     UNION ALL
934                     SELECT to_id AS mem_id FROM auto_links WHERE link_type = 'temporal'
935                 ) GROUP BY mem_id",
936            )
937            .unwrap();
938
939        let counts: Vec<i64> = stmt
940            .query_map([], |r| r.get(1))
941            .unwrap()
942            .filter_map(|r| r.ok())
943            .collect();
944
945        for cnt in &counts {
946            assert!(
947                *cnt <= 2,
948                "a memory has more than max_links_per_memory=2 links: {}",
949                cnt
950            );
951        }
952    }
953
954    #[test]
955    fn test_temporal_linker_workspace_filter() {
956        let conn = setup_db();
957
958        // Memories in two different workspaces, both within the time window.
959        conn.execute(
960            "INSERT INTO memories (content, memory_type, workspace, created_at)
961             VALUES ('ws-alpha-1', 'note', 'alpha', '2024-01-01T10:00:00Z')",
962            [],
963        )
964        .unwrap();
965        conn.execute(
966            "INSERT INTO memories (content, memory_type, workspace, created_at)
967             VALUES ('ws-alpha-2', 'note', 'alpha', '2024-01-01T10:05:00Z')",
968            [],
969        )
970        .unwrap();
971        conn.execute(
972            "INSERT INTO memories (content, memory_type, workspace, created_at)
973             VALUES ('ws-beta-1', 'note', 'beta', '2024-01-01T10:02:00Z')",
974            [],
975        )
976        .unwrap();
977
978        // Only process workspace "alpha".
979        let opts = TemporalLinkOptions {
980            workspace: Some("alpha".to_string()),
981            ..Default::default()
982        };
983        let result = run_temporal_linker(&conn, &opts).expect("linker");
984
985        // Should process 2 alpha memories and create 1 link between them.
986        assert_eq!(result.memories_processed, 2);
987        assert_eq!(result.links_created, 1);
988
989        // The beta memory must not appear in any link.
990        let beta_id: i64 = conn
991            .query_row(
992                "SELECT id FROM memories WHERE workspace = 'beta'",
993                [],
994                |r| r.get(0),
995            )
996            .unwrap();
997        let beta_link_count: i64 = conn
998            .query_row(
999                "SELECT COUNT(*) FROM auto_links
1000                 WHERE (from_id = ?1 OR to_id = ?1) AND link_type = 'temporal'",
1001                params![beta_id],
1002                |r| r.get(0),
1003            )
1004            .unwrap();
1005        assert_eq!(
1006            beta_link_count, 0,
1007            "beta workspace memory should not be linked"
1008        );
1009    }
1010
1011    #[test]
1012    fn test_temporal_linker_min_overlap_secs_restricts_candidates() {
1013        let conn = setup_db();
1014
1015        // Three memories: A at T=0, B at T=2min, C at T=10min.
1016        // window_minutes=30 so all qualify by window, but min_overlap_secs=120 (2 min)
1017        // means only A-B (exactly 120s apart) qualify; A-C and B-C exceed 120s.
1018        let _a = insert_memory_at(&conn, "A", "2024-01-01T10:00:00Z");
1019        let _b = insert_memory_at(&conn, "B", "2024-01-01T10:02:00Z");
1020        let _c = insert_memory_at(&conn, "C", "2024-01-01T10:10:00Z");
1021
1022        let opts = TemporalLinkOptions {
1023            window_minutes: 30,
1024            max_links_per_memory: 5,
1025            min_overlap_secs: Some(120), // 2 minutes
1026            workspace: None,
1027        };
1028        let result = run_temporal_linker(&conn, &opts).expect("linker");
1029
1030        assert_eq!(result.memories_processed, 3);
1031        // A-B: 120s diff = exactly at the boundary (diff <= min_overlap_secs=120) → linked.
1032        // A-C: 600s > 120 → not linked.  B-C: 480s > 120 → not linked.
1033        assert_eq!(
1034            result.links_created, 1,
1035            "only A-B qualifies under min_overlap_secs=120"
1036        );
1037    }
1038
1039    #[test]
1040    fn test_temporal_linker_empty_database_returns_zero() {
1041        let conn = setup_db();
1042        let opts = TemporalLinkOptions::default();
1043        let result = run_temporal_linker(&conn, &opts).expect("linker");
1044        assert_eq!(result.memories_processed, 0);
1045        assert_eq!(result.links_created, 0);
1046    }
1047
1048    #[test]
1049    fn test_parse_timestamp_to_secs_rfc3339() {
1050        let secs = parse_timestamp_to_secs("2024-01-01T10:00:00Z");
1051        assert!(secs.is_some(), "should parse RFC3339 timestamp");
1052
1053        let secs2 = parse_timestamp_to_secs("2024-01-01T10:05:00Z");
1054        assert!(secs2.is_some());
1055
1056        // 5 minutes = 300 seconds difference.
1057        let diff = secs2.unwrap() - secs.unwrap();
1058        assert!(
1059            (diff - 300.0).abs() < 1.0,
1060            "difference should be ~300s, got {}",
1061            diff
1062        );
1063    }
1064
1065    #[test]
1066    fn test_parse_timestamp_to_secs_sqlite_format() {
1067        let secs = parse_timestamp_to_secs("2024-01-01 10:00:00");
1068        assert!(
1069            secs.is_some(),
1070            "should parse SQLite CURRENT_TIMESTAMP format"
1071        );
1072    }
1073
1074    #[test]
1075    fn test_parse_timestamp_to_secs_invalid_returns_none() {
1076        let result = parse_timestamp_to_secs("not-a-date");
1077        assert!(result.is_none(), "invalid timestamp should return None");
1078    }
1079}