mentedb 0.5.0

A purpose-built database engine for AI agent memory
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
//! Realistic and stress tests for MenteDB.
//!
//! These tests go beyond basic CRUD by simulating real-world AI agent workloads:
//! multi-turn conversations with actual code, concurrent multi-agent writes,
//! context assembly under token budgets, and contradiction detection chains.

use std::collections::HashSet;
use std::sync::{Arc, Mutex};
use std::thread;
use std::time::{SystemTime, UNIX_EPOCH};

use mentedb::MenteDb;
use mentedb::prelude::*;

use mentedb_cognitive::{InferredAction, WriteInferenceConfig, WriteInferenceEngine};
use mentedb_consolidation::{DecayConfig, DecayEngine};
use mentedb_context::{AssemblyConfig, ContextAssembler, ScoredMemory};
use mentedb_embedding::{EmbeddingProvider, HashEmbeddingProvider};

use mentedb_core::types::{AgentId, MemoryId};
use tempfile::tempdir;

// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

/// Dimension value passed to `HashEmbeddingProvider::new` by the `embedder` helper.
const EMBEDDING_DIM: usize = 64;

/// Current wall-clock time expressed as microseconds since the Unix epoch.
///
/// Panics if the system clock reports a time earlier than the epoch. The
/// 128-bit microsecond count is narrowed to `u64` with an `as` cast.
fn now_us() -> u64 {
    let now = SystemTime::now();
    let since_epoch = now.duration_since(UNIX_EPOCH).unwrap();
    since_epoch.as_micros() as u64
}

/// Microseconds in one day: 24 h x 60 min x 60 s x 1_000_000 us.
const DAY_US: u64 = 24 * 60 * 60 * 1_000_000;

/// Builds the `HashEmbeddingProvider` shared by these tests, constructed
/// with `EMBEDDING_DIM`.
fn embedder() -> HashEmbeddingProvider {
    HashEmbeddingProvider::new(EMBEDDING_DIM)
}

/// Embeds `text` with `provider` and returns the resulting vector.
///
/// The provider's result is unwrapped, so a failed embedding panics.
fn embed(provider: &HashEmbeddingProvider, text: &str) -> Vec<f32> {
    let outcome = provider.embed(text);
    outcome.unwrap()
}

/// Creates a `MemoryNode` for `agent_id` whose content is `content` and whose
/// embedding is computed from that same text via `embed`.
fn make_memory_from_text(
    agent_id: AgentId,
    content: &str,
    mem_type: MemoryType,
    provider: &HashEmbeddingProvider,
) -> MemoryNode {
    MemoryNode::new(agent_id, mem_type, content.to_owned(), embed(provider, content))
}

fn make_memory_with_tags(
    agent_id: AgentId,
    content: &str,
    mem_type: MemoryType,
    tags: Vec<String>,
    provider: &HashEmbeddingProvider,
) -> MemoryNode {
    let mut node = make_memory_from_text(agent_id, content, mem_type, provider);
    node.tags = tags;
    node
}

/// Creates a `MemoryNode` from `content` and back-dates it: both `created_at`
/// and `accessed_at` are set to `created_at`, and `salience` is overridden
/// with the given value.
fn make_memory_at_time(
    agent_id: AgentId,
    content: &str,
    mem_type: MemoryType,
    created_at: u64,
    salience: f32,
    provider: &HashEmbeddingProvider,
) -> MemoryNode {
    let mut memory = make_memory_from_text(agent_id, content, mem_type, provider);
    memory.salience = salience;
    // A freshly back-dated memory starts with access time equal to creation time.
    memory.accessed_at = created_at;
    memory.created_at = created_at;
    memory
}

// ---------------------------------------------------------------------------
// Test 1: Multi-turn coding conversation
// ---------------------------------------------------------------------------

/// End-to-end check of storing and recalling a realistic coding dialogue.
///
/// Seventeen turns of a developer/assistant conversation about building an
/// Actix-web todo API (with code snippets and a compiler error) are embedded
/// and stored. Similarity recall is then exercised for three topics: error
/// handling, database setup, and the POST handler.
#[test]
fn test_multi_turn_coding_conversation() {
    let dir = tempdir().unwrap();
    let mut db = MenteDb::open(dir.path()).unwrap();
    let agent_id = AgentId::new();
    let provider = embedder();

    let conversation: Vec<(&str, MemoryType)> = vec![
        (
            "User: I want to build a REST API in Rust using Actix-web for a todo list application.",
            MemoryType::Episodic,
        ),
        (
            "Assistant: Let us start with the project structure. Run `cargo new todo-api` and add \
             actix-web, serde, and sqlx to Cargo.toml.",
            MemoryType::Episodic,
        ),
        (
            "User: Done. Here is my Cargo.toml:\n\
             [dependencies]\n\
             actix-web = \"4\"\n\
             serde = { version = \"1\", features = [\"derive\"] }\n\
             sqlx = { version = \"0.7\", features = [\"runtime-tokio\", \"sqlite\"] }",
            MemoryType::Episodic,
        ),
        (
            "Assistant: Now create the Todo model:\n\
             ```rust\n\
             #[derive(Debug, Serialize, Deserialize, sqlx::FromRow)]\n\
             pub struct Todo {\n\
                 pub id: i64,\n\
                 pub title: String,\n\
                 pub completed: bool,\n\
             }\n\
             ```",
            MemoryType::Procedural,
        ),
        (
            "User: I added the model. Now I need the GET /todos endpoint.",
            MemoryType::Episodic,
        ),
        (
            "Assistant: Here is the handler for listing todos:\n\
             ```rust\n\
             async fn list_todos(pool: web::Data<SqlitePool>) -> impl Responder {\n\
                 let todos = sqlx::query_as::<_, Todo>(\"SELECT * FROM todos\")\n\
                     .fetch_all(pool.get_ref())\n\
                     .await\n\
                     .unwrap();\n\
                 HttpResponse::Ok().json(todos)\n\
             }\n\
             ```",
            MemoryType::Procedural,
        ),
        (
            "User: I also need POST /todos to create new todos with JSON body validation.",
            MemoryType::Episodic,
        ),
        (
            "Assistant: Add this create handler:\n\
             ```rust\n\
             #[derive(Deserialize)]\n\
             struct CreateTodo { title: String }\n\n\
             async fn create_todo(\n\
                 pool: web::Data<SqlitePool>,\n\
                 body: web::Json<CreateTodo>,\n\
             ) -> impl Responder {\n\
                 let result = sqlx::query(\"INSERT INTO todos (title, completed) VALUES (?, false)\")\n\
                     .bind(&body.title)\n\
                     .execute(pool.get_ref())\n\
                     .await\n\
                     .unwrap();\n\
                 HttpResponse::Created().json(serde_json::json!({\"id\": result.last_insert_rowid()}))\n\
             }\n\
             ```",
            MemoryType::Procedural,
        ),
        (
            "User: I am getting a compile error: `the trait bound SqlitePool: Clone is not satisfied`.",
            MemoryType::Episodic,
        ),
        (
            "Assistant: SqlitePool already implements Clone. The issue is likely that you need to \
             wrap it in web::Data before passing to App::app_data(). Make sure you do:\n\
             ```rust\n\
             let pool = SqlitePool::connect(\"sqlite:todos.db\").await.unwrap();\n\
             HttpServer::new(move || {\n\
                 App::new()\n\
                     .app_data(web::Data::new(pool.clone()))\n\
                     .route(\"/todos\", web::get().to(list_todos))\n\
                     .route(\"/todos\", web::post().to(create_todo))\n\
             })\n\
             ```",
            MemoryType::Procedural,
        ),
        (
            "User: That fixed the compile error. Now I want to add DELETE /todos/{id} for removing individual items.",
            MemoryType::Episodic,
        ),
        (
            "Assistant: Here is the delete handler with path parameter extraction:\n\
             ```rust\n\
             async fn delete_todo(\n\
                 pool: web::Data<SqlitePool>,\n\
                 path: web::Path<i64>,\n\
             ) -> impl Responder {\n\
                 let id = path.into_inner();\n\
                 sqlx::query(\"DELETE FROM todos WHERE id = ?\")\n\
                     .bind(id)\n\
                     .execute(pool.get_ref())\n\
                     .await\n\
                     .unwrap();\n\
                 HttpResponse::NoContent().finish()\n\
             }\n\
             ```\n\
             Register it with `.route(\"/todos/{id}\", web::delete().to(delete_todo))`.",
            MemoryType::Procedural,
        ),
        (
            "User: I want to add proper error handling instead of unwrap() everywhere.",
            MemoryType::Episodic,
        ),
        (
            "Assistant: Create a custom error type that implements actix_web::ResponseError:\n\
             ```rust\n\
             #[derive(Debug)]\n\
             enum ApiError {\n\
                 DbError(sqlx::Error),\n\
                 NotFound,\n\
             }\n\
             impl std::fmt::Display for ApiError {\n\
                 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {\n\
                     match self {\n\
                         ApiError::DbError(e) => write!(f, \"Database error: {}\", e),\n\
                         ApiError::NotFound => write!(f, \"Resource not found\"),\n\
                     }\n\
                 }\n\
             }\n\
             impl ResponseError for ApiError {}\n\
             ```",
            MemoryType::Procedural,
        ),
        (
            "User: Should I add middleware for request logging and CORS?",
            MemoryType::Episodic,
        ),
        (
            "Assistant: Yes. For logging use actix_web::middleware::Logger, and for CORS use the \
             actix-cors crate. Add `actix-cors = \"0.7\"` to Cargo.toml, then configure:\n\
             ```rust\n\
             use actix_cors::Cors;\n\n\
             App::new()\n\
                 .wrap(Logger::default())\n\
                 .wrap(Cors::permissive())\n\
                 .app_data(web::Data::new(pool.clone()))\n\
             ```",
            MemoryType::Procedural,
        ),
        (
            "User: The API is working. I ran `curl -X POST http://localhost:8080/todos -H \
             'Content-Type: application/json' -d '{\"title\":\"Buy groceries\"}'` and got back a 201.",
            MemoryType::Episodic,
        ),
    ];

    // Embed and persist every turn in order, keeping the id of each stored node.
    let memory_ids: Vec<MemoryId> = conversation
        .iter()
        .map(|&(content, mem_type)| {
            let node = make_memory_from_text(agent_id, content, mem_type, &provider);
            let id = node.id;
            db.store(node).unwrap();
            id
        })
        .collect();

    assert_eq!(
        memory_ids.len(),
        17,
        "All 17 conversation turns should be stored"
    );

    // Error-handling topic (discussed in turns 13-14), capped at five hits.
    let error_query = embed(&provider, "error handling in actix-web REST API");
    let error_hits = db.recall_similar(&error_query, 5).unwrap();
    assert!(
        !error_hits.is_empty(),
        "Recall should find relevant memories about error handling"
    );
    assert!(error_hits.len() <= 5, "Should respect the k=5 limit");

    // Remaining topics only need to surface at least one hit among the top 3:
    // first the database setup, then the POST endpoint.
    let topic_checks = [
        (
            "SQLite database pool setup in actix-web",
            "Recall should find memories about database configuration",
        ),
        (
            "create new todo item POST endpoint handler",
            "Recall should find the POST handler discussion",
        ),
    ];
    for &(query_text, failure_msg) in &topic_checks {
        let query_embedding = embed(&provider, query_text);
        let hits = db.recall_similar(&query_embedding, 3).unwrap();
        assert!(!hits.is_empty(), "{}", failure_msg);
    }

    db.close().unwrap();
}

// ---------------------------------------------------------------------------
// Test 2: Customer support agent with interleaved tickets
// ---------------------------------------------------------------------------

/// Checks that three interleaved customer threads stay separable on recall.
///
/// One agent stores four messages each for a billing dispute (Customer A), a
/// webhook failure (Customer B) and an account lockout (Customer C), written
/// round-robin the way a shared support queue would deliver them. Each
/// topical query must return something, the top billing hit must not belong
/// to Customer B, and the three top hits must be pairwise distinct.
#[test]
fn test_customer_support_agent() {
    use std::collections::HashMap;

    let dir = tempdir().unwrap();
    let mut db = MenteDb::open(dir.path()).unwrap();
    let agent_id = AgentId::new();
    let provider = embedder();

    // Customer A: billing dispute about double charge
    let customer_a_msgs = [
        "Customer A: I was charged twice for my Pro subscription this month, order #ORD-7823.",
        "Agent to A: I can see the duplicate charge of $49.99 on your account. Let me initiate a refund.",
        "Customer A: How long will the refund take to appear on my credit card?",
        "Agent to A: The refund has been processed. It typically takes 5-7 business days to appear on your statement.",
    ];

    // Customer B: technical issue with API integration
    let customer_b_msgs = [
        "Customer B: Our webhook endpoint is returning 502 errors when receiving events from your API.",
        "Agent to B: Can you share the webhook URL and any error logs from your server?",
        "Customer B: The URL is https://api.example.com/webhooks/ingest and the nginx logs show upstream timeout after 30 seconds.",
        "Agent to B: The timeout suggests your handler is taking too long. Webhook delivery has a 10-second timeout on our side. Consider processing events asynchronously with a message queue.",
    ];

    // Customer C: account access issue
    let customer_c_msgs = [
        "Customer C: I cannot log in to my admin dashboard. It says my account has been locked.",
        "Agent to C: I see your account was locked after 5 failed login attempts. I will send a password reset link to your registered email.",
        "Customer C: I did not receive the reset email. Can you check if my email is still john.doe@company.org?",
        "Agent to C: Your registered email is j.doe@company.org, not john.doe@company.org. I have sent the reset link to the correct address.",
    ];

    // (messages, customer tag, category tag) for each thread.
    let threads: [(&[&str], &str, &str); 3] = [
        (&customer_a_msgs, "customer-a", "billing"),
        (&customer_b_msgs, "customer-b", "technical"),
        (&customer_c_msgs, "customer-c", "account"),
    ];

    let mut ids_by_customer: HashMap<&str, Vec<MemoryId>> = HashMap::new();

    // Round-robin over the threads: turn 0 of A, B, C, then turn 1 of A, B, C, ...
    for turn in 0..4 {
        for &(msgs, customer_tag, category_tag) in &threads {
            let tags = vec![customer_tag.to_string(), category_tag.to_string()];
            let node =
                make_memory_with_tags(agent_id, msgs[turn], MemoryType::Episodic, tags, &provider);
            ids_by_customer
                .entry(customer_tag)
                .or_default()
                .push(node.id);
            db.store(node).unwrap();
        }
    }

    // 3 threads x 4 turns
    let stored_count: usize = ids_by_customer.values().map(Vec::len).sum();
    assert_eq!(
        stored_count, 12,
        "All 12 interleaved messages should be stored"
    );

    // Billing wording, expected to surface Customer A's thread.
    let billing_query = embed(&provider, "refund for duplicate subscription charge");
    let billing_hits = db.recall_similar(&billing_query, 5).unwrap();
    assert!(
        !billing_hits.is_empty(),
        "Should find billing-related memories for Customer A"
    );

    // Webhook wording, expected to surface Customer B's thread.
    let webhook_query = embed(&provider, "webhook 502 error timeout upstream");
    let webhook_hits = db.recall_similar(&webhook_query, 5).unwrap();
    assert!(
        !webhook_hits.is_empty(),
        "Should find webhook-related memories for Customer B"
    );

    // Lockout wording, expected to surface Customer C's thread.
    let lockout_query = embed(
        &provider,
        "account locked failed login attempts password reset",
    );
    let lockout_hits = db.recall_similar(&lockout_query, 5).unwrap();
    assert!(
        !lockout_hits.is_empty(),
        "Should find account lockout memories for Customer C"
    );

    let top_billing = billing_hits.first().map(|(id, _)| *id);
    let top_webhook = webhook_hits.first().map(|(id, _)| *id);
    let top_lockout = lockout_hits.first().map(|(id, _)| *id);

    // The best billing match must not come from Customer B's webhook thread.
    let customer_b_ids: HashSet<MemoryId> =
        ids_by_customer["customer-b"].iter().copied().collect();
    if let Some(best) = top_billing {
        assert!(
            !customer_b_ids.contains(&best),
            "Top billing recall result should not be a webhook memory from Customer B"
        );
    }

    // The best match of each query must be a different memory from the others.
    let distinct_pairs = [
        (
            top_billing,
            top_webhook,
            "Top billing and webhook results should be distinct memories",
        ),
        (
            top_billing,
            top_lockout,
            "Top billing and lockout results should be distinct memories",
        ),
        (
            top_webhook,
            top_lockout,
            "Top webhook and lockout results should be distinct memories",
        ),
    ];
    for &(first, second, failure_msg) in &distinct_pairs {
        if let (Some(a), Some(b)) = (first, second) {
            assert_ne!(a, b, "{}", failure_msg);
        }
    }

    db.close().unwrap();
}

// ---------------------------------------------------------------------------
// Test 3: Research assistant knowledge accumulation over time
// ---------------------------------------------------------------------------

/// Models a week of research notes on ML model deployment and checks decay.
///
/// Four batches of semantic memories are back-dated to days 1, 3, 5 and 7 of
/// a window that starts eight days before the test runs, each batch with its
/// own salience, access count and last-access time. A `DecayEngine` with a
/// seven-day half-life is applied to a copy of the nodes. The test then
/// checks that the heavily accessed day-1 notes keep salience above 0.05,
/// that day-7 notes average higher salience than day-3 notes, and that
/// similarity recall still returns results from the database.
#[test]
fn test_research_assistant_knowledge_accumulation() {
    let dir = tempdir().unwrap();
    let mut db = MenteDb::open(dir.path()).unwrap();
    let agent_id = AgentId::new();
    let provider = embedder();

    let base_time = now_us() - 8 * DAY_US;

    // Day 1: foundational knowledge (high salience, frequently accessed)
    let day1_content = vec![
        "ML model deployment requires converting trained models to optimized inference formats like ONNX or TensorRT for production serving.",
        "Docker containers are the standard packaging mechanism for ML model serving, bundling the model, runtime, and dependencies together.",
        "Kubernetes orchestrates containerized model services, handling auto-scaling based on request load and GPU resource allocation.",
        "Model versioning with tools like MLflow or DVC tracks which training run produced which artifact, enabling reproducible deployments.",
        "A/B testing infrastructure routes a percentage of inference traffic to canary model versions before full rollout.",
        "Feature stores like Feast ensure that the same feature transformations used during training are applied consistently at inference time.",
        "Model monitoring tracks prediction drift, latency percentiles, and throughput to detect when retraining is needed.",
        "CI/CD pipelines for ML include data validation, model training, evaluation gating, and staged rollout to production.",
        "Model registries like MLflow Model Registry or Weights and Biases store trained artifacts with metadata and lineage information.",
        "GPU memory management is critical for serving large models; techniques include model sharding, quantization, and memory-mapped weights.",
        "Load balancing for model servers uses least-connections routing to distribute inference requests across healthy replicas.",
        "Health checks for model endpoints verify both HTTP liveness and model inference readiness with sample prediction calls.",
        "Logging inference requests and predictions enables debugging, auditing, and building datasets for future model retraining.",
    ];

    // Day 3: intermediate findings
    let day3_content = vec![
        "NVIDIA Triton Inference Server supports concurrent model execution across multiple GPUs with dynamic batching.",
        "gRPC outperforms REST for model serving by 3-5x on latency due to binary serialization with Protocol Buffers.",
        "Model quantization from FP32 to INT8 reduces memory by 4x with less than 1% accuracy degradation for most vision models.",
        "Kubernetes Horizontal Pod Autoscaler can scale model replicas based on custom metrics like GPU utilization or queue depth.",
        "Blue-green deployments for model updates maintain the previous version as a hot standby for instant rollback.",
        "Prometheus and Grafana dashboards visualize model serving metrics including p50, p95, and p99 latency distributions.",
        "Batch inference pipelines using Apache Spark or Ray process large datasets offline at lower cost than real-time serving.",
        "TensorRT optimization applies layer fusion, kernel auto-tuning, and precision calibration to reduce inference latency by 5-10x.",
        "Model warm-up scripts pre-load weights and run dummy inference to avoid cold-start latency spikes on first real requests.",
        "Canary analysis compares error rates and latency distributions between the baseline and canary model versions using statistical tests.",
        "Shadow deployments route a copy of production traffic to a new model version without affecting user-facing responses.",
        "Data pipeline monitoring with Great Expectations validates that input features conform to expected distributions before inference.",
        "Cost attribution tags on GPU instances enable tracking inference spend per model, per team, and per customer.",
    ];

    // Day 5: more recent research
    let day5_content = vec![
        "vLLM achieves 24x throughput improvement over HuggingFace Transformers for LLM serving through PagedAttention memory management.",
        "Speculative decoding uses a smaller draft model to generate candidate tokens verified by the main model, reducing latency by 2-3x.",
        "LoRA adapters can be hot-swapped at serving time, allowing a single base model to serve multiple fine-tuned variants.",
        "KV-cache compression techniques reduce memory requirements for long-context LLM inference by up to 8x.",
        "Model sharding across multiple GPUs with tensor parallelism enables serving models too large for a single GPU memory.",
        "Continuous batching in LLM serving dynamically adds new requests to ongoing batches, improving GPU utilization from 30% to 90%.",
        "SGLang compiler optimizes LLM serving by reusing KV cache across requests that share common prompt prefixes.",
        "Medusa decoding adds multiple prediction heads to generate several tokens per forward pass, trading compute for reduced latency.",
        "AWQ quantization preserves salient weight channels while aggressively quantizing others, maintaining accuracy at 4-bit precision.",
        "Pipeline parallelism splits model layers across GPUs in a pipeline, overlapping compute and communication for higher throughput.",
        "Request scheduling policies like shortest-job-first reduce average latency for LLM serving workloads with variable output lengths.",
        "Disaggregated prefill architectures separate prompt encoding from token generation across different GPU pools for better utilization.",
        "Prompt caching stores encoded representations of common system prompts to skip redundant prefill computation.",
    ];

    // Day 6-7: most recent findings
    let day7_content = vec![
        "Structured output generation using constrained decoding with JSON Schema guarantees valid output format from LLMs.",
        "Embedding model serving with sentence-transformers can batch encode 1000+ documents per second on a single A100 GPU.",
        "Model cascading routes simple queries to a small model and complex ones to a large model, reducing average inference cost by 60%.",
        "Retrieval-augmented generation pipelines combine vector search with LLM generation, requiring careful chunk size tuning around 512 tokens.",
        "Prefill-decode disaggregation separates the compute-heavy prefill phase from memory-bound decode phase across different GPU pools.",
        "FlashAttention-2 reduces transformer attention computation from O(n^2) memory to O(n) with 2x speedup on A100 GPUs.",
        "GGUF format enables efficient CPU inference of quantized LLMs, making deployment possible on commodity hardware without GPUs.",
        "Tool-use fine-tuning teaches LLMs to emit structured function calls, requiring specialized serving infrastructure for tool execution loops.",
        "Multi-modal model serving requires handling image, audio, and text inputs in a unified pipeline with different preprocessing steps.",
        "Inference routers use classifier models to predict query complexity and route to appropriately sized models in a cascade.",
        "Guardrail models run in parallel with the main LLM to detect and filter harmful or off-topic outputs before delivery.",
        "Token streaming over Server-Sent Events delivers partial LLM responses to clients incrementally, improving perceived latency.",
        "Auto-scaling policies for LLM serving must account for variable request durations, using queue depth rather than CPU utilization.",
    ];

    // One row per batch, in storage order:
    // (contents, created_at, initial salience, access_count, accessed_at).
    let batches = [
        // Day 1: foundational, accessed many times, last touched on day 7.
        (&day1_content, base_time, 1.0, 15, base_time + 6 * DAY_US),
        // Day 3: moderately accessed.
        (
            &day3_content,
            base_time + 2 * DAY_US,
            0.9,
            5,
            base_time + 4 * DAY_US,
        ),
        // Day 5: lightly accessed.
        (
            &day5_content,
            base_time + 4 * DAY_US,
            0.95,
            3,
            base_time + 5 * DAY_US,
        ),
        // Day 7: newest, accessed once at creation time.
        (
            &day7_content,
            base_time + 6 * DAY_US,
            1.0,
            1,
            base_time + 6 * DAY_US,
        ),
    ];

    // Store every node and keep a local copy so decay can be computed on it.
    let mut all_memories: Vec<MemoryNode> = Vec::new();
    for (contents, created_at, salience, access_count, accessed_at) in batches {
        for content in contents {
            let mut node = make_memory_at_time(
                agent_id,
                content,
                MemoryType::Semantic,
                created_at,
                salience,
                &provider,
            );
            node.access_count = access_count;
            node.accessed_at = accessed_at;
            db.store(node.clone()).unwrap();
            all_memories.push(node);
        }
    }

    assert!(
        all_memories.len() >= 50,
        "Should have stored at least 50 memories, got {}",
        all_memories.len()
    );

    // Decay is evaluated eight days after `base_time`, i.e. roughly at the
    // moment the test started.
    let decay_time = base_time + 8 * DAY_US;
    let decay_config = DecayConfig {
        half_life_us: 7 * DAY_US,
        min_salience: 0.01,
        access_boost: 0.1,
        max_salience: 1.0,
    };
    let decay_engine = DecayEngine::new(decay_config);

    // Work on a copy: the batch call receives the nodes mutably.
    let mut decayed = all_memories.clone();
    decay_engine.apply_decay_batch(&mut decayed, decay_time);

    // Day-1 notes occupy the first `day1_content.len()` slots; each must stay
    // above the 0.05 floor.
    for (i, node) in decayed[..day1_content.len()].iter().enumerate() {
        let sal = node.salience;
        assert!(
            sal > 0.05,
            "Foundational memory {} should retain salience above 0.05 due to frequent access, got {}",
            i,
            sal
        );
    }

    // Arithmetic mean of the salience values in a slice of nodes.
    let mean_salience = |nodes: &[MemoryNode]| -> f32 {
        nodes.iter().map(|m| m.salience).sum::<f32>() / nodes.len() as f32
    };

    // Batches sit back to back in storage order: day 1, day 3, day 5, day 7.
    let day3_start = day1_content.len();
    let day3_end = day3_start + day3_content.len();
    let day3_nodes = &decayed[day3_start..day3_end];

    let day7_start = day3_end + day5_content.len();
    let day7_nodes = &decayed[day7_start..];

    let avg_day3 = mean_salience(day3_nodes);
    let avg_day7 = mean_salience(day7_nodes);

    assert!(
        avg_day7 > avg_day3,
        "Recent day-7 memories (avg salience {:.4}) should have higher salience than day-3 \
         memories with fewer accesses (avg salience {:.4})",
        avg_day7,
        avg_day3
    );

    // The database itself must still return hits for a foundational topic.
    let foundational_query = embed(
        &provider,
        "Kubernetes auto-scaling ML model deployment containers",
    );
    let hits = db.recall_similar(&foundational_query, 10).unwrap();
    assert!(
        !hits.is_empty(),
        "Should recall foundational deployment knowledge even after a simulated week"
    );

    db.close().unwrap();
}

// ---------------------------------------------------------------------------
// Test 4: Concurrent multi-agent writes
// ---------------------------------------------------------------------------

/// Exercises concurrent writes from several agent threads against a single
/// `MenteDb` handle shared behind an `Arc<Mutex<_>>`.
///
/// Four worker threads each build and store 25 episodic memories. After all
/// workers have been joined, the test checks that 100 memory IDs were
/// collected, that none of them collide, and that a similarity recall on the
/// shared database still returns at least one result.
#[test]
fn test_concurrent_multi_agent_writes() {
    /// Produces the text of note number `seq` written by agent `agent_idx`.
    ///
    /// Each agent cycles through five templates (`seq % 5`); any combination
    /// without a dedicated arm falls through to the final planning note.
    fn agent_note(agent_idx: usize, seq: usize) -> String {
        match (agent_idx, seq % 5) {
            (0, 0) => format!(
                "Agent-0 research note {}: Transformer attention mechanisms \
                 compute scaled dot-product attention with Q, K, V matrices.",
                seq
            ),
            (0, 1) => format!(
                "Agent-0 research note {}: Positional encoding adds sinusoidal \
                 signals to input embeddings so transformers can model sequence order.",
                seq
            ),
            (0, 2) => format!(
                "Agent-0 research note {}: Layer normalization stabilizes training \
                 by normalizing activations across the feature dimension.",
                seq
            ),
            (0, 3) => format!(
                "Agent-0 research note {}: Multi-head attention runs h parallel \
                 attention operations, each with d_model/h dimensions.",
                seq
            ),
            (0, _) => format!(
                "Agent-0 research note {}: Feed-forward networks in transformers \
                 use two linear layers with a ReLU or GELU activation between them.",
                seq
            ),
            (1, 0) => format!(
                "Agent-1 code review {}: The database connection pool should use \
                 a maximum of 20 connections to avoid exhausting PostgreSQL limits.",
                seq
            ),
            (1, 1) => format!(
                "Agent-1 code review {}: Missing index on users.email column \
                 causes full table scans on login queries taking 200ms+.",
                seq
            ),
            (1, 2) => format!(
                "Agent-1 code review {}: The authentication middleware should \
                 validate JWT tokens before reaching any route handler.",
                seq
            ),
            (1, 3) => format!(
                "Agent-1 code review {}: Connection retry logic uses exponential \
                 backoff with jitter starting at 100ms, maxing at 30 seconds.",
                seq
            ),
            (1, _) => format!(
                "Agent-1 code review {}: Rate limiting at the API gateway should \
                 use token bucket algorithm with 100 requests per minute per client.",
                seq
            ),
            (2, 0) => format!(
                "Agent-2 debug log {}: Memory leak traced to unclosed gRPC streams \
                 in the prediction service, accumulating 50MB per hour.",
                seq
            ),
            (2, 1) => format!(
                "Agent-2 debug log {}: Deadlock between the cache invalidation \
                 thread and the request handler holding concurrent read-write locks.",
                seq
            ),
            (2, 2) => format!(
                "Agent-2 debug log {}: Segfault in native BLAS library when batch \
                 size exceeds 512 on the ARM-based inference nodes.",
                seq
            ),
            (2, 3) => format!(
                "Agent-2 debug log {}: OOM killer terminated the model server \
                 after loading 3 large models simultaneously into GPU memory.",
                seq
            ),
            (2, _) => format!(
                "Agent-2 debug log {}: Race condition in the feature cache \
                 returns stale embeddings when concurrent writes update the same key.",
                seq
            ),
            (3, 0) => format!(
                "Agent-3 planning note {}: Sprint 14 deliverables include the \
                 search reranking pipeline and the embedding cache warm-up job.",
                seq
            ),
            (3, 1) => format!(
                "Agent-3 planning note {}: Migration from Elasticsearch to \
                 Meilisearch requires reindexing 2.3 million documents over the weekend.",
                seq
            ),
            (3, 2) => format!(
                "Agent-3 planning note {}: On-call rotation for Q4 needs at least \
                 3 engineers who understand the inference pipeline end to end.",
                seq
            ),
            (3, 3) => format!(
                "Agent-3 planning note {}: Cost optimization target is reducing \
                 GPU spend by 30% through better batching and model distillation.",
                seq
            ),
            (_, _) => format!(
                "Agent-3 planning note {}: Quarterly OKR review scheduled for \
                 November 15th, focusing on latency targets and reliability SLOs.",
                seq
            ),
        }
    }

    let dir = tempdir().unwrap();
    let shared_db = Arc::new(Mutex::new(MenteDb::open(dir.path()).unwrap()));
    let provider = Arc::new(embedder());

    let num_agents = 4;
    let memories_per_agent = 25;
    let collected_ids = Arc::new(Mutex::new(Vec::new()));

    // Spawn one writer thread per agent; collecting the handles starts them all
    // before any of them is joined.
    let workers: Vec<_> = (0..num_agents)
        .map(|agent_idx| {
            let db = Arc::clone(&shared_db);
            let provider = Arc::clone(&provider);
            let id_sink = Arc::clone(&collected_ids);
            let agent_id = AgentId::new();

            thread::spawn(move || {
                let mut written = Vec::new();
                for seq in 0..memories_per_agent {
                    let content = agent_note(agent_idx, seq);
                    let embedding = provider.embed(content.as_str()).unwrap();
                    let node =
                        MemoryNode::new(agent_id, MemoryType::Episodic, content, embedding);
                    written.push(node.id);

                    // The database lock is taken per store and released at the
                    // end of this statement.
                    db.lock().unwrap().store(node).unwrap();
                }

                // Publish this agent's IDs in one batch once all its writes are done.
                id_sink.lock().unwrap().extend(written);
            })
        })
        .collect();

    for worker in workers {
        worker.join().expect("Thread should not panic");
    }

    let ids = collected_ids.lock().unwrap();
    let expected_total = num_agents * memories_per_agent;
    assert_eq!(
        ids.len(),
        expected_total,
        "All {} memories should have been stored across {} agents",
        expected_total,
        num_agents
    );

    // Every collected ID must be distinct.
    let distinct: HashSet<MemoryId> = ids.iter().copied().collect();
    assert_eq!(
        distinct.len(),
        ids.len(),
        "All memory IDs should be unique (no collisions)"
    );

    // The database must still answer similarity queries after the concurrent writes.
    let mut db = shared_db.lock().unwrap();
    let query_emb = embed(&provider, "transformer attention mechanism");
    let results = db.recall_similar(&query_emb, 10).unwrap();
    assert!(
        !results.is_empty(),
        "Should recall memories after concurrent writes from 4 agents"
    );

    db.close().unwrap();
}

// ---------------------------------------------------------------------------
// Test 5: Large context window assembly with token budgets
// ---------------------------------------------------------------------------

/// Checks how `ContextAssembler::assemble` responds to different token budgets
/// when offered a pool of 200 candidate memories.
///
/// The same candidates are assembled under 500-, 2000- and 8000-token budgets.
/// The test asserts that each window's `total_tokens` stays within its budget,
/// that a larger budget includes strictly more memories than a smaller one,
/// that the two smaller budgets exclude at least one candidate, that every
/// window produces non-empty formatted output, and that `total_candidates` is
/// reported as 200 each time. Salience doubles as the candidate score; which
/// individual memories get selected is not asserted here.
#[test]
fn test_large_context_window_assembly() {
    let agent_id = AgentId::new();
    let provider = embedder();

    // Twenty base statements, cycled to give the 200 candidates realistic text.
    // A fixed-size array suffices: the list is only indexed and measured.
    let topics = [
        "Rust ownership model prevents data races at compile time without a garbage collector.",
        "The borrow checker enforces that references cannot outlive their referent.",
        "Lifetimes annotate the scope for which a reference is valid.",
        "Pattern matching with match expressions exhaustively handles all enum variants.",
        "Traits define shared behavior similar to interfaces in other languages.",
        "Async/await in Rust uses zero-cost futures that compile to state machines.",
        "The tokio runtime provides multi-threaded work-stealing for async IO tasks.",
        "Serde serialization framework handles JSON, TOML, YAML, and binary formats.",
        "Cargo build system manages dependencies, compilation, and cross-compilation targets.",
        "Procedural macros generate code at compile time from token streams.",
        "Error handling uses Result and Option types instead of exceptions.",
        "The type system prevents null pointer dereferences through Option<T>.",
        "Closures in Rust capture variables by reference, mutable reference, or value.",
        "Smart pointers like Box, Rc, and Arc manage heap allocation and reference counting.",
        "Unsafe blocks opt out of specific compiler guarantees for FFI and raw pointer operations.",
        "Iterators are lazy and compose with combinators like map, filter, and fold.",
        "Channels in std::sync::mpsc provide message passing between threads.",
        "Mutex and RwLock provide interior mutability for shared state across threads.",
        "Pin prevents values from being moved in memory, required for self-referential types.",
        "The Drop trait defines custom cleanup logic run when values go out of scope.",
    ];

    // Build 200 scored candidates with varying salience and realistic content.
    let scored_memories: Vec<ScoredMemory> = (0..200usize)
        .map(|i| {
            let topic = topics[i % topics.len()];
            let content = format!(
                "Memory {}: {}. Additional context for entry {} covering implementation details \
                 and practical usage patterns in production systems.",
                i, topic, i
            );
            let embedding = embed(&provider, &content);
            let mut node = MemoryNode::new(agent_id, MemoryType::Semantic, content, embedding);

            // Vary salience: memories 0-49 are high, 50-149 are medium, 150-199 are low
            node.salience = if i < 50 {
                0.9 + (i as f32 * 0.002)
            } else if i < 150 {
                0.5 + ((i - 50) as f32 * 0.003)
            } else {
                0.1 + ((i - 150) as f32 * 0.004)
            };

            // Use salience as score for assembly ordering. `score` is read
            // before `node` is moved into the struct.
            ScoredMemory {
                score: node.salience,
                memory: node,
            }
        })
        .collect();

    assert_eq!(
        scored_memories.len(),
        200,
        "Should have 200 candidate memories"
    );

    // Assemble with a small 500-token budget
    let small_config = AssemblyConfig {
        token_budget: 500,
        ..AssemblyConfig::default()
    };
    let small_window = ContextAssembler::assemble(scored_memories.clone(), vec![], &small_config);

    // Assemble with a medium 2000-token budget
    let medium_config = AssemblyConfig {
        token_budget: 2000,
        ..AssemblyConfig::default()
    };
    let medium_window = ContextAssembler::assemble(scored_memories.clone(), vec![], &medium_config);

    // Assemble with a large 8000-token budget. This is the last use of the
    // candidate pool, so it is moved instead of cloned.
    let large_config = AssemblyConfig {
        token_budget: 8000,
        ..AssemblyConfig::default()
    };
    let large_window = ContextAssembler::assemble(scored_memories, vec![], &large_config);

    // Token counts should respect budgets
    assert!(
        small_window.total_tokens <= 500,
        "Small window tokens ({}) should not exceed 500",
        small_window.total_tokens
    );
    assert!(
        medium_window.total_tokens <= 2000,
        "Medium window tokens ({}) should not exceed 2000",
        medium_window.total_tokens
    );
    assert!(
        large_window.total_tokens <= 8000,
        "Large window tokens ({}) should not exceed 8000",
        large_window.total_tokens
    );

    // Larger budgets should include more memories
    assert!(
        medium_window.metadata.included_count > small_window.metadata.included_count,
        "Medium budget ({} memories) should include more than small budget ({} memories)",
        medium_window.metadata.included_count,
        small_window.metadata.included_count
    );
    assert!(
        large_window.metadata.included_count > medium_window.metadata.included_count,
        "Large budget ({} memories) should include more than medium budget ({} memories)",
        large_window.metadata.included_count,
        medium_window.metadata.included_count
    );

    // The small and medium budgets must leave some candidates out. No such
    // requirement is placed on the 8000-token window.
    // NOTE(review): presumably the whole pool may fit in 8000 tokens — confirm.
    assert!(
        small_window.metadata.excluded_count > 0,
        "Small budget should exclude memories"
    );
    assert!(
        medium_window.metadata.excluded_count > 0,
        "Medium budget should exclude memories"
    );

    // The format output should be non-empty for all budgets
    assert!(
        !small_window.format.is_empty(),
        "Small window should produce serialized output"
    );
    assert!(
        !medium_window.format.is_empty(),
        "Medium window should produce serialized output"
    );
    assert!(
        !large_window.format.is_empty(),
        "Large window should produce serialized output"
    );

    // Metadata total_candidates should always be 200
    assert_eq!(small_window.metadata.total_candidates, 200);
    assert_eq!(medium_window.metadata.total_candidates, 200);
    assert_eq!(large_window.metadata.total_candidates, 200);
}

// ---------------------------------------------------------------------------
// Test 6: Contradiction chain detection
// ---------------------------------------------------------------------------

/// Replays a project's shifting choice of database as a chain of six
/// memories, each statement reversing the one before it.
///
/// Every new memory is handed to the write inference engine together with the
/// memories stored so far, and the returned actions are recorded per step.
/// The test then asserts that the first memory produced no actions, that the
/// chain as a whole yielded at least one contradiction flag, obsolete mark or
/// created edge, that the later entries produced inferred actions, and that
/// the stored decisions can still be recalled by similarity.
#[test]
fn test_contradiction_chain() {
    let dir = tempdir().unwrap();
    let mut db = MenteDb::open(dir.path()).unwrap();
    let agent_id = AgentId::new();
    let provider = embedder();

    // Thresholds are set very low on purpose. Hash embeddings are deterministic
    // but carry no semantic meaning, so cosine similarity between related texts
    // comes out lower than with a real model; the relaxed values let the engine
    // still pick up relationships along the chain.
    let engine = WriteInferenceEngine::with_config(WriteInferenceConfig {
        contradiction_threshold: 0.3,
        obsolete_threshold: 0.2,
        related_min: 0.05,
        related_max: 0.3,
        correction_threshold: 0.1,
        confidence_decay_factor: 0.5,
        confidence_floor: 0.1,
    });

    // Successive, mutually contradicting database decisions.
    let chain = [
        "The project will use PostgreSQL as the primary database for all relational data storage and querying.",
        "We are switching from PostgreSQL to MySQL because the hosting provider offers managed MySQL at lower cost.",
        "Actually, we decided on MongoDB instead of MySQL because our data is mostly unstructured JSON documents.",
        "After load testing, MongoDB write performance was insufficient. Moving to Cassandra for its write throughput.",
        "Cassandra operational complexity is too high for our small team. Switching to CockroachDB for managed distributed SQL.",
        "Final decision: we are going with TiDB because it offers MySQL compatibility with horizontal scaling built in.",
    ];

    let base_time = now_us() - 6 * DAY_US;
    let mut stored_memories: Vec<MemoryNode> = Vec::new();
    let mut all_actions: Vec<Vec<InferredAction>> = Vec::new();

    for (step, statement) in chain.iter().enumerate() {
        // Entries are spaced one day apart, starting six days before now.
        let created_at = base_time + (step as u64) * DAY_US;
        let node = make_memory_at_time(
            agent_id,
            statement,
            MemoryType::Semantic,
            created_at,
            1.0,
            &provider,
        );

        // Inference sees only the memories stored before this one.
        all_actions.push(engine.infer_on_write(&node, &stored_memories, &[]));

        db.store(node.clone()).unwrap();
        stored_memories.push(node);
    }

    // Tally the three action kinds of interest across every step.
    let total_contradiction_flags = all_actions
        .iter()
        .flatten()
        .filter(|action| matches!(action, InferredAction::FlagContradiction { .. }))
        .count();
    let total_obsolete_flags = all_actions
        .iter()
        .flatten()
        .filter(|action| matches!(action, InferredAction::MarkObsolete { .. }))
        .count();
    let total_edges_created = all_actions
        .iter()
        .flatten()
        .filter(|action| matches!(action, InferredAction::CreateEdge { .. }))
        .count();

    assert_eq!(
        stored_memories.len(),
        chain.len(),
        "All memories in the contradiction chain should be stored"
    );

    // With the relaxed thresholds at least one relationship must be detected
    // somewhere in the chain.
    let total_inferred = total_contradiction_flags + total_obsolete_flags + total_edges_created;
    assert!(
        total_inferred > 0,
        "Write inference should detect at least one relationship in the contradiction chain. \
         Got: {} contradictions, {} obsolete, {} edges",
        total_contradiction_flags,
        total_obsolete_flags,
        total_edges_created
    );

    // The first entry was inferred against an empty history, so it cannot
    // have produced any action.
    assert!(
        all_actions[0].is_empty(),
        "First memory should have no inferred actions (nothing to compare against)"
    );

    // Every entry after the first had earlier memories to compare against.
    let later_actions_count = all_actions.iter().skip(1).map(Vec::len).sum::<usize>();
    assert!(
        later_actions_count > 0,
        "Later chain entries should produce inferred actions against earlier memories"
    );

    // The stored decisions must still be reachable through similarity recall.
    let query_emb = embed(&provider, "database technology selection for the project");
    let results = db.recall_similar(&query_emb, 10).unwrap();
    assert!(
        !results.is_empty(),
        "Should recall database decision memories"
    );

    db.close().unwrap();
}