use std::fs;
use lantern::ingest::ingest_path;
use lantern::search::{SearchOptions, search};
use lantern::store::Store;
use rusqlite::params;
use tempfile::tempdir;
/// Six-line JSONL fixture written to disk by `ingest_transcript`.
///
/// In order, the lines are: a `user` message with string content, an
/// `assistant` message whose content is an array of two text blocks, a blank
/// line, an unterminated JSON object, a `tool` message with empty content,
/// and an object that carries only a `text` field. The tests in this file
/// expect three chunks from it (the user, assistant, and text-only lines).
const TRANSCRIPT: &str = concat!(
"{\"role\":\"user\",\"content\":\"Where should I keep the lantern?\"}\n",
"{\"role\":\"assistant\",\"content\":[",
"{\"type\":\"text\",\"text\":\"Hang it near the door.\"},",
"{\"type\":\"text\",\"text\":\"It should catch the evening breeze.\"}",
"]}\n",
"\n",
"{\"malformed line that should be skipped\n",
"{\"role\":\"tool\",\"content\":\"\"}\n",
"{\"text\":\"rust is a systems language\"}\n",
);
/// Creates a store inside a fresh temporary directory, writes [`TRANSCRIPT`]
/// to `session.jsonl` next to it, and ingests that single file.
///
/// Returns the temporary-directory handle, the store, and the path of the
/// ingested file so callers can keep all three alive for the test's duration.
fn ingest_transcript() -> (tempfile::TempDir, Store, std::path::PathBuf) {
    let workspace = tempdir().unwrap();
    let transcript_path = workspace.path().join("session.jsonl");
    let mut db = Store::initialize(&workspace.path().join("store")).unwrap();

    fs::write(&transcript_path, TRANSCRIPT).unwrap();
    ingest_path(&mut db, &transcript_path).unwrap();

    (workspace, db, transcript_path)
}
/// The shared fixture must produce exactly one `application/jsonl` source and
/// three chunks.
#[test]
fn jsonl_ingest_produces_one_chunk_per_extractable_line() {
    let (_root, store, _file) = ingest_transcript();

    // Runs a single-column COUNT-style query and returns the integer result.
    let count_rows = |sql: &str| -> i64 {
        store
            .conn()
            .query_row(sql, [], |row| row.get(0))
            .unwrap()
    };

    let source_count = count_rows("SELECT COUNT(*) FROM sources");
    let chunk_count = count_rows("SELECT COUNT(*) FROM chunks");
    let kind: String = store
        .conn()
        .query_row("SELECT kind FROM sources", [], |row| row.get(0))
        .unwrap();

    assert_eq!(source_count, 1);
    assert_eq!(chunk_count, 3, "3 extractable lines out of 6");
    assert_eq!(kind, "application/jsonl");
}
/// Chunk text carries the speaker's role tag, and multi-block assistant
/// content ends up in a single searchable chunk.
#[test]
fn role_prefix_is_preserved_and_searchable() {
    let (_root, store, _file) = ingest_transcript();
    let find = |query: &str| search(&store, query, SearchOptions::default()).unwrap();

    let assistant_hits = find("breeze");
    assert_eq!(assistant_hits.len(), 1);
    let assistant_text = &assistant_hits[0].text;
    assert!(assistant_text.contains("[assistant]"));
    assert!(assistant_text.contains("Hang it near the door"));
    assert!(
        assistant_text.contains("evening breeze"),
        "multi-block content should be joined"
    );

    let user_hits = find("lantern");
    assert_eq!(user_hits.len(), 1);
    assert!(user_hits[0].text.starts_with("[user]"));
}
/// A line that only has a top-level `text` field is indexed with its text
/// unchanged.
#[test]
fn alternate_text_field_lines_are_indexed() {
    let (_root, store, _file) = ingest_transcript();

    let matches = search(&store, "systems", SearchOptions::default()).unwrap();
    assert_eq!(matches.len(), 1);

    let only = &matches[0];
    assert_eq!(only.text, "rust is a systems language");
}
/// Every chunk's `byte_start..byte_end` range must select one complete JSONL
/// line of [`TRANSCRIPT`]: it ends with a newline and starts with `{`.
#[test]
fn chunk_byte_ranges_match_their_source_line() {
    let (_root, store, _file) = ingest_transcript();
    let conn = store.conn();
    let source_id: String = conn
        .query_row("SELECT id FROM sources", [], |r| r.get(0))
        .unwrap();
    let mut stmt = conn
        .prepare(
            "SELECT byte_start, byte_end FROM chunks\n\
             WHERE source_id = ?1 ORDER BY ordinal",
        )
        .unwrap();
    let ranges: Vec<(i64, i64)> = stmt
        .query_map(params![source_id], |row| Ok((row.get(0)?, row.get(1)?)))
        .unwrap()
        .collect::<Result<Vec<_>, _>>()
        .unwrap();

    // Without this guard the loop below would pass vacuously if the query
    // returned no rows at all.
    assert!(
        !ranges.is_empty(),
        "expected at least one chunk row for the ingested source"
    );

    for (start, end) in &ranges {
        assert!(end > start);
        // Checked conversions: a negative offset should fail loudly here
        // rather than wrap around in an `as usize` cast.
        let lo = usize::try_from(*start).expect("byte_start must be non-negative");
        let hi = usize::try_from(*end).expect("byte_end must be non-negative");
        // `get` reports out-of-range or non-char-boundary offsets with a
        // message that names the actual problem.
        let slice = TRANSCRIPT
            .get(lo..hi)
            .expect("chunk byte range must lie within the transcript on char boundaries");
        assert!(
            slice.ends_with('\n'),
            "each chunk maps to a full JSONL line"
        );
        assert!(slice.trim_start().starts_with('{'));
    }
}
/// Ingesting a directory reports both the markdown and the JSONL file, and a
/// hit from the JSONL file carries its URI and `application/jsonl` kind.
#[test]
fn ingests_directory_mixing_jsonl_and_markdown() {
    let workspace = tempdir().unwrap();
    let mut store = Store::initialize(&workspace.path().join("store")).unwrap();

    let data_dir = workspace.path().join("data");
    fs::create_dir_all(&data_dir).unwrap();
    let fixtures = [
        ("note.md", "# markdown\n\nSome body.\n"),
        (
            "session.jsonl",
            "{\"role\":\"user\",\"content\":\"needle in jsonl\"}\n",
        ),
    ];
    for (file_name, body) in fixtures {
        fs::write(data_dir.join(file_name), body).unwrap();
    }

    let report = ingest_path(&mut store, &data_dir).unwrap();
    assert_eq!(report.ingested.len(), 2);

    let hits = search(&store, "needle", SearchOptions::default()).unwrap();
    assert_eq!(hits.len(), 1);
    let hit = &hits[0];
    assert!(hit.uri.ends_with("/session.jsonl"));
    assert_eq!(hit.kind, "application/jsonl");
}
/// Lines that wrap the message in a `message` envelope (with `sessionId` and
/// `timestamp` on the outer object) are ingested with role prefix, session id
/// and timestamp attached to the hit.
#[test]
fn ingests_claude_code_style_message_envelope_transcript() {
    let workspace = tempdir().unwrap();
    let mut store = Store::initialize(&workspace.path().join("store")).unwrap();
    let path = workspace.path().join("envelope.jsonl");
    let jsonl = concat!(
        "{\"type\":\"user\",\"sessionId\":\"sess-envelope\",\"timestamp\":1700001000,",
        "\"message\":{\"role\":\"user\",\"content\":\"where does the lantern hang\"}}\n",
        "{\"type\":\"assistant\",\"sessionId\":\"sess-envelope\",\"timestamp\":1700001001,",
        "\"message\":{\"role\":\"assistant\",\"content\":[",
        "{\"type\":\"text\",\"text\":\"by the door\"},",
        "{\"type\":\"text\",\"text\":\"facing the breeze\"}",
        "]}}\n",
    );
    fs::write(&path, jsonl).unwrap();

    let report = ingest_path(&mut store, &path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 2);

    let find = |query: &str| search(&store, query, SearchOptions::default()).unwrap();

    let user_hits = find("lantern");
    assert_eq!(user_hits.len(), 1);
    let user_hit = &user_hits[0];
    assert!(user_hit.text.starts_with("[user]"));
    assert_eq!(user_hit.session_id.as_deref(), Some("sess-envelope"));
    assert_eq!(user_hit.timestamp_unix, Some(1_700_001_000));

    let assistant_hits = find("breeze");
    assert_eq!(assistant_hits.len(), 1);
    let assistant_hit = &assistant_hits[0];
    assert!(assistant_hit.text.starts_with("[assistant]"));
    assert!(
        assistant_hit.text.contains("by the door"),
        "envelope multi-block content joins the same way as flat block arrays"
    );
}
/// Both `tool_use_id`/`tool_name` and `toolUseId`/`toolName` spellings on a
/// tool line land in the `tool_call_id` and `tool_name` chunk columns.
#[test]
fn ingests_anthropic_tool_use_id_into_chunks_tool_call_id() {
    let workspace = tempdir().unwrap();
    let mut store = Store::initialize(&workspace.path().join("store")).unwrap();
    let path = workspace.path().join("tool-use.jsonl");
    let jsonl = concat!(
        "{\"role\":\"tool\",\"tool_name\":\"search\",",
        "\"tool_use_id\":\"toolu_01abc\",\"content\":\"snake output\"}\n",
        "{\"role\":\"tool\",\"toolName\":\"lookup\",",
        "\"toolUseId\":\"toolu_02def\",\"content\":\"camel output\"}\n",
    );
    fs::write(&path, jsonl).unwrap();

    let report = ingest_path(&mut store, &path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 2);

    let conn = store.conn();
    let mut stmt = conn
        .prepare(
            "SELECT tool_call_id, tool_name FROM chunks\n\
             WHERE source_id = (SELECT id FROM sources LIMIT 1)\n\
             ORDER BY ordinal",
        )
        .unwrap();
    let rows: Vec<(Option<String>, Option<String>)> = stmt
        .query_map([], |row| Ok((row.get(0)?, row.get(1)?)))
        .unwrap()
        .collect::<Result<Vec<_>, _>>()
        .unwrap();

    // Expected (tool_call_id, tool_name) per chunk, in ordinal order.
    let expected = [("toolu_01abc", "search"), ("toolu_02def", "lookup")];
    assert_eq!(rows.len(), 2);
    for (row, (call_id, tool)) in rows.iter().zip(expected) {
        assert_eq!(row.0.as_deref(), Some(call_id));
        assert_eq!(row.1.as_deref(), Some(tool));
    }
}
/// A `tool_use` content block is rendered into the chunk text as
/// `[tool_use:NAME]` next to the surrounding prose, and its input is
/// keyword-searchable.
#[test]
fn ingests_anthropic_tool_use_content_block_as_searchable_text() {
    let workspace = tempdir().unwrap();
    let mut store = Store::initialize(&workspace.path().join("store")).unwrap();
    let path = workspace.path().join("tool-use-block.jsonl");
    let jsonl = concat!(
        "{\"role\":\"assistant\",\"content\":[",
        "{\"type\":\"text\",\"text\":\"let me search the docs\"},",
        "{\"type\":\"tool_use\",\"id\":\"toolu_01\",\"name\":\"docs_search\",",
        "\"input\":{\"q\":\"provenance\"}}",
        "]}\n",
    );
    fs::write(&path, jsonl).unwrap();

    let report = ingest_path(&mut store, &path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 1);

    let name_hits = search(&store, "docs_search", SearchOptions::default()).unwrap();
    assert_eq!(name_hits.len(), 1);
    let rendered = &name_hits[0].text;
    assert!(
        rendered.contains("[tool_use:docs_search]"),
        "tool call name should land in the chunk text with the tool_use prefix"
    );
    assert!(
        rendered.contains("let me search the docs"),
        "surrounding prose should still be present alongside the tool call"
    );

    let input_hits = search(&store, "provenance", SearchOptions::default()).unwrap();
    assert_eq!(
        input_hits.len(),
        1,
        "structured tool input must survive ingest as compact JSON so it's searchable"
    );
}
/// A `server_tool_use` content block renders as `[server_tool_use:NAME]`
/// (never as `[tool_use:NAME]`) and exposes tool name and call id on the hit.
#[test]
fn ingests_anthropic_server_tool_use_content_block_as_searchable_text() {
    let workspace = tempdir().unwrap();
    let mut store = Store::initialize(&workspace.path().join("store")).unwrap();
    let path = workspace.path().join("server-tool-use-block.jsonl");
    let jsonl = concat!(
        "{\"role\":\"assistant\",\"content\":[",
        "{\"type\":\"text\",\"text\":\"let me search the web\"},",
        "{\"type\":\"server_tool_use\",\"id\":\"srvtoolu_01\",\"name\":\"web_search\",",
        "\"input\":{\"query\":\"lantern provenance\"}}",
        "]}\n",
    );
    fs::write(&path, jsonl).unwrap();

    let report = ingest_path(&mut store, &path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 1);

    let find = |query: &str| search(&store, query, SearchOptions::default()).unwrap();

    let name_hits = find("web_search");
    assert_eq!(name_hits.len(), 1);
    let call = &name_hits[0];
    assert!(
        call.text.contains("[server_tool_use:web_search]"),
        "server-side tool call should render behind the distinct server_tool_use prefix"
    );
    assert!(
        !call.text.contains("[tool_use:web_search]"),
        "server-side calls must not collide with client-side `[tool_use:NAME]` prefix"
    );
    assert!(
        call.text.contains("let me search the web"),
        "surrounding prose should still be present alongside the server tool call"
    );
    assert_eq!(call.tool_name.as_deref(), Some("web_search"));
    assert_eq!(call.tool_call_id.as_deref(), Some("srvtoolu_01"));

    let input_hits = find("provenance");
    assert_eq!(
        input_hits.len(),
        1,
        "structured server tool input must survive ingest as compact JSON so it's searchable"
    );
}
/// A `tool_result` block with `is_error: true` is tagged `[tool_error]` in
/// the chunk text; one without the flag is not.
#[test]
fn ingests_anthropic_tool_result_error_block_as_searchable_tool_error_text() {
    let workspace = tempdir().unwrap();
    let mut store = Store::initialize(&workspace.path().join("store")).unwrap();
    let path = workspace.path().join("tool-error.jsonl");
    let jsonl = concat!(
        "{\"role\":\"user\",\"content\":[",
        "{\"type\":\"tool_result\",\"tool_use_id\":\"toolu_99\",\"is_error\":true,",
        "\"content\":\"command failed with exit 17\"}",
        "]}\n",
        "{\"role\":\"user\",\"content\":[",
        "{\"type\":\"tool_result\",\"tool_use_id\":\"toolu_100\",",
        "\"content\":\"command succeeded\"}",
        "]}\n",
    );
    fs::write(&path, jsonl).unwrap();

    let report = ingest_path(&mut store, &path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 2);

    let error_hits = search(&store, "tool_error", SearchOptions::default()).unwrap();
    assert_eq!(
        error_hits.len(),
        1,
        "the `[tool_error]` prefix should make the failed tool call keyword-searchable"
    );
    let failed_text = &error_hits[0].text;
    assert!(failed_text.contains("[tool_error]"));
    assert!(failed_text.contains("command failed with exit 17"));

    let success_hits = search(&store, "succeeded", SearchOptions::default()).unwrap();
    assert_eq!(success_hits.len(), 1);
    let ok_text = &success_hits[0].text;
    assert!(
        !ok_text.contains("[tool_error]"),
        "tool_result blocks without is_error=true must stay unprefixed"
    );
}
/// Tool-call ids found inside content blocks (flat or inside a `message`
/// envelope, snake_case or camelCase) are stored per chunk; only the
/// `tool_use` block contributes a tool name.
#[test]
fn ingests_nested_content_block_tool_call_metadata_end_to_end() {
    let workspace = tempdir().unwrap();
    let mut store = Store::initialize(&workspace.path().join("store")).unwrap();
    let path = workspace.path().join("nested-tool.jsonl");
    let jsonl = concat!(
        "{\"role\":\"assistant\",\"content\":[",
        "{\"type\":\"text\",\"text\":\"let me search the docs\"},",
        "{\"type\":\"tool_use\",\"tool_use_id\":\"toolu_outer_use\",",
        "\"name\":\"docs_search\",\"input\":{\"q\":\"provenance\"}}",
        "]}\n",
        "{\"role\":\"user\",\"content\":[",
        "{\"type\":\"tool_result\",\"tool_use_id\":\"toolu_outer_result\",",
        "\"content\":\"plain result\"}",
        "]}\n",
        "{\"type\":\"user\",\"sessionId\":\"sess-nested\",",
        "\"message\":{\"role\":\"user\",\"content\":[",
        "{\"type\":\"tool_result\",\"toolUseId\":\"toolu_envelope\",",
        "\"content\":\"envelope result\"}",
        "]}}\n",
    );
    fs::write(&path, jsonl).unwrap();

    let report = ingest_path(&mut store, &path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 3);

    let conn = store.conn();
    let mut stmt = conn
        .prepare(
            "SELECT tool_call_id, tool_name FROM chunks\n\
             WHERE source_id = (SELECT id FROM sources LIMIT 1)\n\
             ORDER BY ordinal",
        )
        .unwrap();
    let rows: Vec<(Option<String>, Option<String>)> = stmt
        .query_map([], |row| Ok((row.get(0)?, row.get(1)?)))
        .unwrap()
        .collect::<Result<Vec<_>, _>>()
        .unwrap();

    // Expected (tool_call_id, tool_name) per chunk, in ordinal order.
    let expected = [
        ("toolu_outer_use", Some("docs_search")),
        ("toolu_outer_result", None),
        ("toolu_envelope", None),
    ];
    assert_eq!(rows.len(), 3);
    for (row, (call_id, tool)) in rows.iter().zip(expected) {
        assert_eq!(row.0.as_deref(), Some(call_id));
        assert_eq!(row.1.as_deref(), tool);
    }
}
/// An OpenAI-style `tool_calls` array renders as `[tool_use:NAME]`, keeps its
/// arguments searchable, and fills the chunk's tool id and name columns.
#[test]
fn ingests_openai_tool_calls_array_as_searchable_text_and_metadata() {
    let workspace = tempdir().unwrap();
    let mut store = Store::initialize(&workspace.path().join("store")).unwrap();
    let path = workspace.path().join("openai-tool-calls.jsonl");
    let jsonl = concat!(
        "{\"role\":\"assistant\",\"tool_calls\":[",
        "{\"id\":\"call_abc123\",\"type\":\"function\",",
        "\"function\":{\"name\":\"docs_search\",",
        "\"arguments\":\"{\\\"q\\\":\\\"provenance\\\"}\"}}",
        "]}\n",
    );
    fs::write(&path, jsonl).unwrap();

    let report = ingest_path(&mut store, &path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 1);

    let name_hits = search(&store, "docs_search", SearchOptions::default()).unwrap();
    assert_eq!(name_hits.len(), 1);
    assert!(name_hits[0].text.contains("[tool_use:docs_search]"));

    let args_hits = search(&store, "provenance", SearchOptions::default()).unwrap();
    assert_eq!(args_hits.len(), 1);

    let (stored_id, stored_name) = store
        .conn()
        .query_row(
            "SELECT tool_call_id, tool_name FROM chunks\n\
             WHERE source_id = (SELECT id FROM sources LIMIT 1)\n\
             ORDER BY ordinal LIMIT 1",
            [],
            |row| {
                let id: Option<String> = row.get(0)?;
                let name: Option<String> = row.get(1)?;
                Ok((id, name))
            },
        )
        .unwrap();
    assert_eq!(stored_id.as_deref(), Some("call_abc123"));
    assert_eq!(stored_name.as_deref(), Some("docs_search"));
}
/// When a line has both top-level `tool_name`/`tool_call_id` and a
/// `tool_calls` array, the top-level values are the ones stored.
#[test]
fn ingests_openai_tool_calls_array_preserves_outer_metadata_precedence() {
    let workspace = tempdir().unwrap();
    let mut store = Store::initialize(&workspace.path().join("store")).unwrap();
    let path = workspace.path().join("openai-precedence.jsonl");
    let jsonl = concat!(
        "{\"role\":\"assistant\",\"tool_name\":\"outer_name\",",
        "\"tool_call_id\":\"outer_call\",\"tool_calls\":[",
        "{\"id\":\"inner_call\",\"type\":\"function\",",
        "\"function\":{\"name\":\"inner_name\",\"arguments\":\"{}\"}}",
        "]}\n",
    );
    fs::write(&path, jsonl).unwrap();

    let report = ingest_path(&mut store, &path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 1);

    let (stored_id, stored_name) = store
        .conn()
        .query_row(
            "SELECT tool_call_id, tool_name FROM chunks\n\
             WHERE source_id = (SELECT id FROM sources LIMIT 1)\n\
             ORDER BY ordinal LIMIT 1",
            [],
            |row| {
                let id: Option<String> = row.get(0)?;
                let name: Option<String> = row.get(1)?;
                Ok((id, name))
            },
        )
        .unwrap();
    assert_eq!(stored_id.as_deref(), Some("outer_call"));
    assert_eq!(stored_name.as_deref(), Some("outer_name"));
}
/// A `thinking` block is rendered behind `[thinking]` in the same chunk as
/// the following text block; the `signature` field does not appear in the
/// exact expected text.
#[test]
fn ingests_anthropic_thinking_block_as_searchable_chunk() {
    let workspace = tempdir().unwrap();
    let mut store = Store::initialize(&workspace.path().join("store")).unwrap();
    let path = workspace.path().join("thinking.jsonl");
    let jsonl = concat!(
        "{\"role\":\"assistant\",\"content\":[",
        "{\"type\":\"thinking\",\"thinking\":\"weigh the tradeoffs carefully\",",
        "\"signature\":\"sig-abc\"},",
        "{\"type\":\"text\",\"text\":\"final answer is fortytwo\"}",
        "]}\n",
    );
    fs::write(&path, jsonl).unwrap();

    let report = ingest_path(&mut store, &path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 1);

    let find = |query: &str| search(&store, query, SearchOptions::default()).unwrap();

    let thinking_hits = find("tradeoffs");
    assert_eq!(thinking_hits.len(), 1);
    let thought = &thinking_hits[0].text;
    assert!(thought.contains("[thinking]"));
    assert!(thought.contains("weigh the tradeoffs"));

    let text_hits = find("fortytwo");
    assert_eq!(text_hits.len(), 1);
    let answer = &text_hits[0].text;
    assert!(answer.contains("final answer is fortytwo"));
    assert_eq!(
        *answer,
        "[assistant] [thinking] weigh the tradeoffs carefully\nfinal answer is fortytwo"
    );
}
/// A `reasoning_text` block is rendered behind the same `[thinking]` tag and
/// shares a chunk with the following text block.
#[test]
fn ingests_openai_reasoning_text_block_as_searchable_chunk() {
    let workspace = tempdir().unwrap();
    let mut store = Store::initialize(&workspace.path().join("store")).unwrap();
    let path = workspace.path().join("reasoning.jsonl");
    let jsonl = concat!(
        "{\"role\":\"assistant\",\"content\":[",
        "{\"type\":\"reasoning_text\",\"text\":\"consider the cosmological constant\"},",
        "{\"type\":\"text\",\"text\":\"the universe expands at fortythree\"}",
        "]}\n",
    );
    fs::write(&path, jsonl).unwrap();

    let report = ingest_path(&mut store, &path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 1);

    let reasoning_hits = search(&store, "cosmological", SearchOptions::default()).unwrap();
    assert_eq!(reasoning_hits.len(), 1);
    let reasoning = &reasoning_hits[0];
    assert!(reasoning.text.contains("[thinking]"));
    assert!(reasoning.text.contains("cosmological constant"));

    let text_hits = search(&store, "fortythree", SearchOptions::default()).unwrap();
    assert_eq!(text_hits.len(), 1);
    let answer = &text_hits[0];
    assert!(answer.text.contains("the universe expands at fortythree"));
    assert_eq!(
        answer.text,
        "[assistant] [thinking] consider the cosmological constant\nthe universe expands at fortythree"
    );
}
/// A `function_call` content block renders as `[tool_use:NAME]`, keeps its
/// arguments searchable, and stores `call_id` and `name` on the chunk.
#[test]
fn ingests_openai_responses_function_call_content_block_as_searchable_text_and_metadata() {
    let workspace = tempdir().unwrap();
    let mut store = Store::initialize(&workspace.path().join("store")).unwrap();
    let path = workspace.path().join("responses-function-call.jsonl");
    let jsonl = concat!(
        "{\"role\":\"assistant\",\"content\":[",
        "{\"type\":\"text\",\"text\":\"let me search the docs\"},",
        "{\"type\":\"function_call\",\"call_id\":\"call_responses_123\",",
        "\"name\":\"docs_search\",\"arguments\":\"{\\\"q\\\":\\\"provenance\\\"}\"}",
        "]}\n",
    );
    fs::write(&path, jsonl).unwrap();

    let report = ingest_path(&mut store, &path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 1);

    let find = |query: &str| search(&store, query, SearchOptions::default()).unwrap();

    let name_hits = find("docs_search");
    assert_eq!(name_hits.len(), 1);
    let rendered = &name_hits[0].text;
    assert!(
        rendered.contains("[tool_use:docs_search]"),
        "function_call block should render behind the same `[tool_use:NAME]` prefix as Anthropic tool_use"
    );
    assert!(
        rendered.contains("let me search the docs"),
        "surrounding prose should still be present alongside the function call"
    );

    let args_hits = find("provenance");
    assert_eq!(args_hits.len(), 1);

    let (stored_id, stored_name) = store
        .conn()
        .query_row(
            "SELECT tool_call_id, tool_name FROM chunks\n\
             WHERE source_id = (SELECT id FROM sources LIMIT 1)\n\
             ORDER BY ordinal LIMIT 1",
            [],
            |row| {
                let id: Option<String> = row.get(0)?;
                let name: Option<String> = row.get(1)?;
                Ok((id, name))
            },
        )
        .unwrap();
    assert_eq!(stored_id.as_deref(), Some("call_responses_123"));
    assert_eq!(stored_name.as_deref(), Some("docs_search"));
}
/// A `function_call_output` block renders behind `[tool_result]`, keeps its
/// structured output searchable, and stores `call_id` without a tool name.
#[test]
fn ingests_openai_responses_function_call_output_content_block_as_searchable_tool_result() {
    let workspace = tempdir().unwrap();
    let mut store = Store::initialize(&workspace.path().join("store")).unwrap();
    let path = workspace.path().join("responses-function-call-output.jsonl");
    let jsonl = concat!(
        "{\"role\":\"tool\",\"content\":[",
        "{\"type\":\"function_call_output\",\"call_id\":\"call_responses_456\",",
        "\"output\":{\"results\":[\"lantern is local-first\",\"provenance is preserved\"]}}",
        "]}\n",
    );
    fs::write(&path, jsonl).unwrap();

    let report = ingest_path(&mut store, &path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 1);

    let prefix_hits = search(&store, "tool_result", SearchOptions::default()).unwrap();
    assert_eq!(prefix_hits.len(), 1);
    let tagged = &prefix_hits[0];
    assert!(
        tagged.text.contains("[tool_result]"),
        "function_call_output should render behind a [tool_result] prefix distinct from [tool_use:NAME]"
    );

    let payload_hits = search(&store, "provenance", SearchOptions::default()).unwrap();
    assert_eq!(payload_hits.len(), 1);
    let payload = &payload_hits[0];
    assert!(payload.text.contains("provenance is preserved"));

    let (stored_id, stored_name) = store
        .conn()
        .query_row(
            "SELECT tool_call_id, tool_name FROM chunks\n\
             WHERE source_id = (SELECT id FROM sources LIMIT 1)\n\
             ORDER BY ordinal LIMIT 1",
            [],
            |row| {
                let id: Option<String> = row.get(0)?;
                let name: Option<String> = row.get(1)?;
                Ok((id, name))
            },
        )
        .unwrap();
    assert_eq!(stored_id.as_deref(), Some("call_responses_456"));
    assert_eq!(stored_name, None);
}
/// An `mcp_tool_use` block renders as `[mcp_tool_use:SERVER/NAME]`, distinct
/// from the other tool-use prefixes, and its call id is both on the hit and
/// in the `chunks` table.
#[test]
fn ingests_anthropic_mcp_tool_use_content_block_as_searchable_text() {
    let workspace = tempdir().unwrap();
    let mut store = Store::initialize(&workspace.path().join("store")).unwrap();
    let path = workspace.path().join("mcp-tool-use-block.jsonl");
    let jsonl = concat!(
        "{\"role\":\"assistant\",\"content\":[",
        "{\"type\":\"text\",\"text\":\"let me file a linear ticket\"},",
        "{\"type\":\"mcp_tool_use\",\"id\":\"mcptoolu_01\",\"server_name\":\"linear\",",
        "\"name\":\"create_issue\",\"input\":{\"title\":\"provenance regression\"}}",
        "]}\n",
    );
    fs::write(&path, jsonl).unwrap();

    let report = ingest_path(&mut store, &path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 1);

    let find = |query: &str| search(&store, query, SearchOptions::default()).unwrap();

    let name_hits = find("create_issue");
    assert_eq!(name_hits.len(), 1);
    let call = &name_hits[0];
    assert!(
        call.text.contains("[mcp_tool_use:linear/create_issue]"),
        "MCP tool call should render behind the distinct mcp_tool_use prefix"
    );
    assert!(
        !call.text.contains("[tool_use:create_issue]"),
        "MCP calls must not collide with the client-side `[tool_use:NAME]` prefix"
    );
    assert!(
        !call.text.contains("[server_tool_use:create_issue]"),
        "MCP calls must not collide with the server-managed `[server_tool_use:NAME]` prefix"
    );
    assert!(
        call.text.contains("let me file a linear ticket"),
        "surrounding prose should still be present alongside the MCP tool call"
    );
    assert_eq!(call.tool_name.as_deref(), Some("create_issue"));
    assert_eq!(call.tool_call_id.as_deref(), Some("mcptoolu_01"));

    let input_hits = find("provenance");
    assert_eq!(
        input_hits.len(),
        1,
        "structured MCP tool input must survive ingest as compact JSON so it's searchable"
    );

    // Cross-check the hit's call id against the row stored for that chunk.
    let stored_call_id: Option<String> = store
        .conn()
        .query_row(
            "SELECT tool_call_id FROM chunks WHERE id = ?",
            params![call.chunk_id],
            |row| row.get(0),
        )
        .unwrap();
    assert_eq!(stored_call_id.as_deref(), Some("mcptoolu_01"));
}
/// A `refusal` block is rendered on its own line behind `[refusal]`, in the
/// same chunk as the preceding text block.
#[test]
fn ingests_openai_refusal_content_block_as_searchable_chunk() {
    let workspace = tempdir().unwrap();
    let mut store = Store::initialize(&workspace.path().join("store")).unwrap();
    let path = workspace.path().join("refusal.jsonl");
    let jsonl = concat!(
        "{\"role\":\"assistant\",\"content\":[",
        "{\"type\":\"text\",\"text\":\"considered the request about supernova\"},",
        "{\"type\":\"refusal\",\"refusal\":\"declining to comply with operation marigold\"}",
        "]}\n",
    );
    fs::write(&path, jsonl).unwrap();

    let report = ingest_path(&mut store, &path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 1);

    let refusal_hits = search(&store, "marigold", SearchOptions::default()).unwrap();
    assert_eq!(refusal_hits.len(), 1);
    let refusal = &refusal_hits[0];
    assert!(refusal.text.contains("[refusal]"));
    assert!(refusal.text.contains("declining to comply with operation marigold"));

    let text_hits = search(&store, "supernova", SearchOptions::default()).unwrap();
    assert_eq!(text_hits.len(), 1);
    let prose = &text_hits[0];
    assert!(prose.text.contains("considered the request about supernova"));
    assert_eq!(
        prose.text,
        "[assistant] considered the request about supernova\n\
         [refusal] declining to comply with operation marigold"
    );
}
/// Image blocks render as `[image] URL` for URL sources and
/// `[image:MEDIA_TYPE]` for base64 sources; the base64 payload itself must
/// not be searchable.
#[test]
fn ingests_anthropic_image_content_block_as_searchable_chunk() {
    let workspace = tempdir().unwrap();
    let mut store = Store::initialize(&workspace.path().join("store")).unwrap();
    let path = workspace.path().join("image.jsonl");
    let jsonl = concat!(
        "{\"role\":\"user\",\"content\":[",
        "{\"type\":\"text\",\"text\":\"what species is this beetle\"},",
        "{\"type\":\"image\",\"source\":{\"type\":\"url\",",
        "\"url\":\"https://example.com/photos/zarathustra-beetle.png\"}}",
        "]}\n",
        "{\"role\":\"user\",\"content\":[",
        "{\"type\":\"image\",\"source\":{\"type\":\"base64\",",
        "\"media_type\":\"image/jpeg\",\"data\":\"AAAABBBBCCCC\"}}",
        "]}\n",
    );
    fs::write(&path, jsonl).unwrap();

    let report = ingest_path(&mut store, &path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 2);

    let find = |query: &str| search(&store, query, SearchOptions::default()).unwrap();

    let url_hits = find("zarathustra");
    assert_eq!(url_hits.len(), 1);
    let by_url = &url_hits[0].text;
    assert!(by_url.contains("[image]"));
    assert!(by_url.contains("https://example.com/photos/zarathustra-beetle.png"));
    assert_eq!(
        *by_url,
        "[user] what species is this beetle\n\
         [image] https://example.com/photos/zarathustra-beetle.png"
    );

    let mime_hits = find("jpeg");
    assert_eq!(mime_hits.len(), 1);
    assert_eq!(mime_hits[0].text, "[user] [image:image/jpeg]");

    let data_hits = find("AAAABBBB");
    assert!(
        data_hits.is_empty(),
        "base64 data payload must not leak into chunks: {data_hits:?}"
    );
}
/// Document blocks render as `[document] URL` for URL sources,
/// `[document:MEDIA_TYPE]` for base64 sources (payload withheld), and
/// `[document] TEXT` for plain-text sources.
#[test]
fn ingests_anthropic_document_content_block_as_searchable_chunk() {
    let workspace = tempdir().unwrap();
    let mut store = Store::initialize(&workspace.path().join("store")).unwrap();
    let path = workspace.path().join("document.jsonl");
    let jsonl = concat!(
        "{\"role\":\"user\",\"content\":[",
        "{\"type\":\"text\",\"text\":\"please summarize\"},",
        "{\"type\":\"document\",\"source\":{\"type\":\"url\",",
        "\"url\":\"https://example.com/papers/zarathustra-whitepaper.pdf\"}}",
        "]}\n",
        "{\"role\":\"user\",\"content\":[",
        "{\"type\":\"document\",\"source\":{\"type\":\"base64\",",
        "\"media_type\":\"application/pdf\",\"data\":\"JVBERi0xLjQKBINARY\"}}",
        "]}\n",
        "{\"role\":\"user\",\"content\":[",
        "{\"type\":\"document\",\"source\":{\"type\":\"text\",",
        "\"media_type\":\"text/plain\",",
        "\"data\":\"Quarterly report: revenue rose by twelve percent\"}}",
        "]}\n",
    );
    fs::write(&path, jsonl).unwrap();

    let report = ingest_path(&mut store, &path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 3);

    let find = |query: &str| search(&store, query, SearchOptions::default()).unwrap();

    let url_hits = find("zarathustra");
    assert_eq!(url_hits.len(), 1);
    let by_url = &url_hits[0].text;
    assert!(by_url.contains("[document]"));
    assert!(by_url.contains("https://example.com/papers/zarathustra-whitepaper.pdf"));
    assert_eq!(
        *by_url,
        "[user] please summarize\n\
         [document] https://example.com/papers/zarathustra-whitepaper.pdf"
    );

    // "pdf" may match more than one chunk, so pick out the base64 one by
    // its exact text rather than asserting on the hit count.
    let mime_hits = find("pdf");
    let base64_hit = mime_hits
        .iter()
        .find(|hit| hit.text == "[user] [document:application/pdf]")
        .expect("base64-source document should be keyword-searchable on its media_type");
    assert_eq!(base64_hit.text, "[user] [document:application/pdf]");

    let data_hits = find("JVBERi0xLjQK");
    assert!(
        data_hits.is_empty(),
        "base64 data payload must not leak into chunks: {data_hits:?}"
    );

    let text_hits = find("twelve");
    assert_eq!(text_hits.len(), 1);
    assert_eq!(
        text_hits[0].text,
        "[user] [document] Quarterly report: revenue rose by twelve percent"
    );
}
/// A JSONL file whose lines have no message-like fields is still reported as
/// ingested, but with zero chunks in both the report and the `chunks` table.
#[test]
fn empty_or_non_transcript_jsonl_ingests_with_no_chunks() {
    let workspace = tempdir().unwrap();
    let mut store = Store::initialize(&workspace.path().join("store")).unwrap();
    let path = workspace.path().join("empty.jsonl");
    let body = "{\"timestamp\":1234}\n{\"foo\":\"bar\"}\n";
    fs::write(&path, body).unwrap();

    let report = ingest_path(&mut store, &path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 0);

    let conn = store.conn();
    let stored_chunks: i64 = conn
        .query_row("SELECT COUNT(*) FROM chunks", [], |row| row.get(0))
        .unwrap();
    assert_eq!(stored_chunks, 0);
}
/// `mcp_tool_result` blocks render behind `[mcp_tool_result]`, or
/// `[mcp_tool_error]` when `is_error` is true, and carry their `tool_use_id`
/// through to the hit and the `chunks` table.
#[test]
fn ingests_anthropic_mcp_tool_result_content_block_end_to_end() {
    let root = tempdir().unwrap();
    let mut store = Store::initialize(&root.path().join("store")).unwrap();
    let file = root.path().join("mcp-tool-result-block.jsonl");
    let transcript = concat!(
        "{\"role\":\"assistant\",\"content\":[",
        "{\"type\":\"mcp_tool_use\",\"id\":\"mcptoolu_abc\",\"server_name\":\"linear\",",
        "\"name\":\"create_issue\",\"input\":{\"title\":\"bug\"}}",
        "]}\n",
        "{\"role\":\"user\",\"content\":[",
        "{\"type\":\"mcp_tool_result\",\"tool_use_id\":\"mcptoolu_abc\",",
        "\"content\":[{\"type\":\"text\",\"text\":\"ticket LIN-42 created\"}]}",
        "]}\n",
        "{\"role\":\"user\",\"content\":[",
        "{\"type\":\"mcp_tool_result\",\"tool_use_id\":\"mcptoolu_xyz\",",
        "\"is_error\":true,\"content\":\"permission denied on workspace\"}",
        "]}\n",
    );
    fs::write(&file, transcript).unwrap();
    let report = ingest_path(&mut store, &file).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 3);

    let success_hits = search(&store, "created", SearchOptions::default()).unwrap();
    assert_eq!(success_hits.len(), 1);
    assert!(
        success_hits[0].text.contains("[mcp_tool_result]"),
        "MCP tool reply should render behind the distinct mcp_tool_result prefix"
    );
    assert!(
        !success_hits[0].text.contains("[mcp_tool_error]"),
        "non-error replies must not get the error prefix"
    );
    assert_eq!(
        success_hits[0].tool_call_id.as_deref(),
        Some("mcptoolu_abc")
    );

    let error_hits = search(&store, "permission", SearchOptions::default()).unwrap();
    assert_eq!(error_hits.len(), 1);
    assert!(
        error_hits[0].text.contains("[mcp_tool_error]"),
        "failed MCP replies should render behind the distinct mcp_tool_error prefix"
    );
    // "[tool_error]" is not a substring of "[mcp_tool_error]" (the character
    // before "tool" is '_', not '['), so a plain negation is enough. The
    // earlier `|| contains("[mcp_tool_error]")` form could never fail after
    // the assertion above.
    assert!(
        !error_hits[0].text.contains("[tool_error]"),
        "MCP error must be distinguishable from the regular `[tool_error]` prefix"
    );
    assert_eq!(error_hits[0].tool_call_id.as_deref(), Some("mcptoolu_xyz"));

    // Cross-check the hit's call id against the row stored for that chunk.
    let stored_call_id: Option<String> = store
        .conn()
        .query_row(
            "SELECT tool_call_id FROM chunks WHERE id = ?",
            params![success_hits[0].chunk_id],
            |row| row.get(0),
        )
        .unwrap();
    assert_eq!(stored_call_id.as_deref(), Some("mcptoolu_abc"));
}
/// `web_search_tool_result` blocks render each result as
/// `[web_search_tool_result] TITLE URL`, never expose `encrypted_content`,
/// and render error payloads as `[web_search_tool_error] error_code=...`.
#[test]
fn ingests_anthropic_web_search_tool_result_content_block_end_to_end() {
    let workspace = tempdir().unwrap();
    let mut store = Store::initialize(&workspace.path().join("store")).unwrap();
    let path = workspace.path().join("web-search-tool-result-block.jsonl");
    let jsonl = concat!(
        "{\"role\":\"assistant\",\"content\":[",
        "{\"type\":\"server_tool_use\",\"id\":\"srvtoolu_ws_main\",\"name\":\"web_search\",",
        "\"input\":{\"query\":\"lantern provenance memory\"}}",
        "]}\n",
        "{\"role\":\"assistant\",\"content\":[",
        "{\"type\":\"web_search_tool_result\",\"tool_use_id\":\"srvtoolu_ws_main\",",
        "\"content\":[",
        "{\"type\":\"web_search_result\",\"url\":\"https://docs.lantern.example/intro\",",
        "\"title\":\"Lantern Intro\",\"encrypted_content\":\"OPAQUE_BLOB_PAYLOAD\",",
        "\"page_age\":\"Jan 1, 2024\"},",
        "{\"type\":\"web_search_result\",\"url\":\"https://blog.lantern.example/2024\",",
        "\"title\":\"Year in review\",\"encrypted_content\":\"OPAQUE_BLOB_PAYLOAD\"}",
        "]}",
        "]}\n",
        "{\"role\":\"assistant\",\"content\":[",
        "{\"type\":\"web_search_tool_result\",\"tool_use_id\":\"srvtoolu_ws_failed\",",
        "\"content\":{\"type\":\"web_search_tool_result_error\",",
        "\"error_code\":\"max_uses_exceeded\"}}",
        "]}\n",
    );
    fs::write(&path, jsonl).unwrap();

    let report = ingest_path(&mut store, &path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 3);

    let find = |query: &str| search(&store, query, SearchOptions::default()).unwrap();

    let success_hits = find("Lantern Intro");
    assert_eq!(success_hits.len(), 1);
    let results = &success_hits[0];
    assert!(
        results
            .text
            .contains("[web_search_tool_result] Lantern Intro https://docs.lantern.example/intro"),
        "first result entry should surface behind the web_search_tool_result prefix"
    );
    assert!(
        results
            .text
            .contains("[web_search_tool_result] Year in review https://blog.lantern.example/2024"),
        "second result entry should also surface in the same chunk"
    );
    assert!(
        !results.text.contains("OPAQUE_BLOB_PAYLOAD"),
        "encrypted_content is opaque binary noise and must never leak into the chunk"
    );
    assert_eq!(results.tool_call_id.as_deref(), Some("srvtoolu_ws_main"));

    let blob_hits = find("OPAQUE_BLOB_PAYLOAD");
    assert!(
        blob_hits.is_empty(),
        "encrypted_content payload must not be searchable as keyword anywhere"
    );

    let error_hits = find("max_uses_exceeded");
    assert_eq!(error_hits.len(), 1);
    let failure = &error_hits[0];
    assert!(
        failure
            .text
            .contains("[web_search_tool_error] error_code=max_uses_exceeded"),
        "failed web_search replies should render behind the distinct web_search_tool_error prefix"
    );
    assert!(
        !failure.text.contains("[tool_error]") && !failure.text.contains("[mcp_tool_error]"),
        "web_search error must be distinct from both `[tool_error]` and `[mcp_tool_error]`"
    );
    assert_eq!(failure.tool_call_id.as_deref(), Some("srvtoolu_ws_failed"));

    // Cross-check the hit's call id against the row stored for that chunk.
    let stored_call_id: Option<String> = store
        .conn()
        .query_row(
            "SELECT tool_call_id FROM chunks WHERE id = ?",
            params![results.chunk_id],
            |row| row.get(0),
        )
        .unwrap();
    assert_eq!(stored_call_id.as_deref(), Some("srvtoolu_ws_main"));
}
/// End-to-end check for Anthropic `code_execution_tool_result` content blocks.
///
/// The fixture holds three assistant lines: the `server_tool_use` call, a
/// successful `code_execution_result` reply and a
/// `code_execution_tool_result_error` reply. The test expects three chunks,
/// the `[code_execution_result]` / `[code_execution_error]` prefixes on the
/// two replies, and each reply bound to its own `tool_use_id`.
#[test]
fn ingests_anthropic_code_execution_tool_result_content_block_end_to_end() {
    let fixture = concat!(
        "{\"role\":\"assistant\",\"content\":[",
        "{\"type\":\"server_tool_use\",\"id\":\"srvtoolu_ce_main\",\"name\":\"code_execution\",",
        "\"input\":{\"code\":\"print(7 * 6)\"}}",
        "]}\n",
        "{\"role\":\"assistant\",\"content\":[",
        "{\"type\":\"code_execution_tool_result\",\"tool_use_id\":\"srvtoolu_ce_main\",",
        "\"content\":{\"type\":\"code_execution_result\",\"stdout\":\"42\\n\",",
        "\"stderr\":\"\",\"return_code\":0}}",
        "]}\n",
        "{\"role\":\"assistant\",\"content\":[",
        "{\"type\":\"code_execution_tool_result\",\"tool_use_id\":\"srvtoolu_ce_failed\",",
        "\"content\":{\"type\":\"code_execution_tool_result_error\",",
        "\"error_code\":\"unavailable\"}}",
        "]}\n",
    );
    let workdir = tempdir().unwrap();
    let mut store = Store::initialize(&workdir.path().join("store")).unwrap();
    let jsonl_path = workdir.path().join("code-execution-tool-result-block.jsonl");
    fs::write(&jsonl_path, fixture).unwrap();

    let report = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 3);

    // Successful reply: rendered body plus the originating tool_use_id.
    let ok_hits = search(&store, "return_code", SearchOptions::default()).unwrap();
    assert_eq!(ok_hits.len(), 1);
    let ok_hit = &ok_hits[0];
    assert!(
        ok_hit
            .text
            .contains("[code_execution_result] {\"return_code\":0,\"stdout\":\"42\\n\"}"),
        "success reply should surface behind the code_execution_result prefix"
    );
    assert_eq!(ok_hit.tool_call_id.as_deref(), Some("srvtoolu_ce_main"));

    // Failed reply: dedicated error prefix, never one of the other error prefixes.
    let failed_hits = search(&store, "unavailable", SearchOptions::default()).unwrap();
    assert_eq!(failed_hits.len(), 1);
    let failed_hit = &failed_hits[0];
    assert!(
        failed_hit
            .text
            .contains("[code_execution_error] error_code=unavailable"),
        "failed code_execution replies should render behind the distinct code_execution_error prefix"
    );
    for foreign_prefix in ["[tool_error]", "[mcp_tool_error]", "[web_search_tool_error]"] {
        assert!(
            !failed_hit.text.contains(foreign_prefix),
            "code_execution error must be distinct from every other tool_error variant"
        );
    }
    assert_eq!(
        failed_hit.tool_call_id.as_deref(),
        Some("srvtoolu_ce_failed")
    );

    // Read the column straight from the chunks table as well, not only via search().
    let persisted_call_id: Option<String> = store
        .conn()
        .query_row(
            "SELECT tool_call_id FROM chunks WHERE id = ?",
            params![ok_hit.chunk_id],
            |row| row.get(0),
        )
        .unwrap();
    assert_eq!(persisted_call_id.as_deref(), Some("srvtoolu_ce_main"));
}
/// End-to-end check that `code_execution_output` file ids nested inside a
/// `code_execution_result` are searchable.
///
/// The fixture has four lines (user request, tool call, tool result with two
/// output files, assistant prose). Both file ids must hit the same kind of
/// chunk: one that carries the `[code_execution_output]` anchor, the main
/// `[code_execution_result]` body and the call's `tool_use_id`. The
/// surrounding user and assistant prose must stay searchable too.
#[test]
fn ingests_anthropic_code_execution_output_file_ids_end_to_end() {
    let fixture = concat!(
        "{\"role\":\"user\",\"content\":\"please plot the revenue chart\"}\n",
        "{\"role\":\"assistant\",\"content\":[",
        "{\"type\":\"server_tool_use\",\"id\":\"srvtoolu_ce_plot\",\"name\":\"code_execution\",",
        "\"input\":{\"code\":\"plot()\"}}",
        "]}\n",
        "{\"role\":\"assistant\",\"content\":[",
        "{\"type\":\"code_execution_tool_result\",\"tool_use_id\":\"srvtoolu_ce_plot\",",
        "\"content\":{\"type\":\"code_execution_result\",\"stdout\":\"plot saved\",",
        "\"stderr\":\"\",\"return_code\":0,\"content\":[",
        "{\"type\":\"code_execution_output\",\"file_id\":\"file_revenue_chart_2026\"},",
        "{\"type\":\"code_execution_output\",\"file_id\":\"file_revenue_csv_2026\"}",
        "]}}",
        "]}\n",
        "{\"role\":\"assistant\",\"content\":\"chart attached above\"}\n",
    );
    let workdir = tempdir().unwrap();
    let mut store = Store::initialize(&workdir.path().join("store")).unwrap();
    let jsonl_path = workdir.path().join("code-execution-output-file-ids.jsonl");
    fs::write(&jsonl_path, fixture).unwrap();

    let report = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 4);

    for file_id in ["file_revenue_chart_2026", "file_revenue_csv_2026"] {
        let hits = search(&store, file_id, SearchOptions::default()).unwrap();
        assert_eq!(hits.len(), 1, "expected one hit for {file_id}");
        let hit = &hits[0];
        let output_anchor = format!("[code_execution_output] {file_id}");
        assert!(
            hit.text.contains(output_anchor.as_str()),
            "{file_id} should surface behind the code_execution_output prefix"
        );
        assert!(
            hit.text
                .contains("[code_execution_result] {\"return_code\":0,\"stdout\":\"plot saved\"}"),
            "the main code_execution_result body should still ride in the same chunk as the outputs"
        );
        assert_eq!(hit.tool_call_id.as_deref(), Some("srvtoolu_ce_plot"));
    }

    // The plain-prose lines around the tool exchange keep their role prefixes.
    let request_hits = search(&store, "plot the revenue chart", SearchOptions::default()).unwrap();
    assert_eq!(request_hits.len(), 1);
    assert!(request_hits[0].text.starts_with("[user]"));

    let followup_hits = search(&store, "chart attached", SearchOptions::default()).unwrap();
    assert_eq!(followup_hits.len(), 1);
    assert!(followup_hits[0].text.starts_with("[assistant]"));
}
/// End-to-end check for Anthropic `bash_code_execution_tool_result` blocks.
///
/// Mirrors the Python code-execution scenario with a bash call, one
/// successful reply and one error reply. The test expects three chunks and
/// bash-specific prefixes (`[bash_code_execution_result]`,
/// `[bash_code_execution_error]`) that do not collide with the other
/// result/error prefixes, with each reply bound to its `tool_use_id`.
#[test]
fn ingests_anthropic_bash_code_execution_tool_result_content_block_end_to_end() {
    let fixture = concat!(
        "{\"role\":\"assistant\",\"content\":[",
        "{\"type\":\"server_tool_use\",\"id\":\"srvtoolu_bash_main\",\"name\":\"bash_code_execution\",",
        "\"input\":{\"command\":\"echo lantern\"}}",
        "]}\n",
        "{\"role\":\"assistant\",\"content\":[",
        "{\"type\":\"bash_code_execution_tool_result\",\"tool_use_id\":\"srvtoolu_bash_main\",",
        "\"content\":{\"type\":\"bash_code_execution_result\",\"stdout\":\"lantern\\n\",",
        "\"stderr\":\"\",\"return_code\":0}}",
        "]}\n",
        "{\"role\":\"assistant\",\"content\":[",
        "{\"type\":\"bash_code_execution_tool_result\",\"tool_use_id\":\"srvtoolu_bash_failed\",",
        "\"content\":{\"type\":\"bash_code_execution_tool_result_error\",",
        "\"error_code\":\"unavailable\"}}",
        "]}\n",
    );
    let workdir = tempdir().unwrap();
    let mut store = Store::initialize(&workdir.path().join("store")).unwrap();
    let jsonl_path = workdir
        .path()
        .join("bash-code-execution-tool-result-block.jsonl");
    fs::write(&jsonl_path, fixture).unwrap();

    let report = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 3);

    // "lantern" may match more than one chunk, so pick the result chunk by prefix.
    let lantern_hits = search(&store, "lantern", SearchOptions::default()).unwrap();
    let ok_hit = lantern_hits
        .iter()
        .find(|candidate| candidate.text.contains("[bash_code_execution_result]"))
        .expect("success reply should be keyword-searchable behind the new prefix");
    assert!(
        ok_hit
            .text
            .contains("[bash_code_execution_result] {\"return_code\":0,\"stdout\":\"lantern\\n\"}"),
        "success reply should surface behind the bash_code_execution_result prefix"
    );
    assert!(
        !ok_hit.text.contains("[code_execution_result]"),
        "bash prefix must stay distinct from the Python code_execution_result prefix"
    );
    assert_eq!(ok_hit.tool_call_id.as_deref(), Some("srvtoolu_bash_main"));

    let failed_hits = search(&store, "unavailable", SearchOptions::default()).unwrap();
    assert_eq!(failed_hits.len(), 1);
    let failed_hit = &failed_hits[0];
    assert!(
        failed_hit
            .text
            .contains("[bash_code_execution_error] error_code=unavailable"),
        "failed bash replies should render behind the distinct bash_code_execution_error prefix"
    );
    for foreign_prefix in [
        "[code_execution_error]",
        "[tool_error]",
        "[mcp_tool_error]",
        "[web_search_tool_error]",
    ] {
        assert!(
            !failed_hit.text.contains(foreign_prefix),
            "bash code execution error must be distinct from every other tool_error variant"
        );
    }
    assert_eq!(
        failed_hit.tool_call_id.as_deref(),
        Some("srvtoolu_bash_failed")
    );

    // Confirm the id is present in the chunks table itself, not only on the search hit.
    let persisted_call_id: Option<String> = store
        .conn()
        .query_row(
            "SELECT tool_call_id FROM chunks WHERE id = ?",
            params![ok_hit.chunk_id],
            |row| row.get(0),
        )
        .unwrap();
    assert_eq!(persisted_call_id.as_deref(), Some("srvtoolu_bash_main"));
}
/// End-to-end check for Anthropic `web_fetch_tool_result` content blocks.
///
/// The fixture covers four lines: the tool call, a fetch whose nested
/// document has a text source, a fetch whose nested document is a base64 PDF,
/// and a `web_fetch_tool_result_error`. The test expects the URL anchor on
/// both successful fetches, the text body to be searchable, the base64
/// payload to be absent everywhere, and the error to use its own prefix.
#[test]
fn ingests_anthropic_web_fetch_tool_result_content_block_end_to_end() {
    let fixture = concat!(
        "{\"role\":\"assistant\",\"content\":[",
        "{\"type\":\"server_tool_use\",\"id\":\"srvtoolu_wf_main\",\"name\":\"web_fetch\",",
        "\"input\":{\"url\":\"https://docs.lantern.example/intro\"}}",
        "]}\n",
        "{\"role\":\"assistant\",\"content\":[",
        "{\"type\":\"web_fetch_tool_result\",\"tool_use_id\":\"srvtoolu_wf_main\",",
        "\"content\":{\"type\":\"web_fetch_result\",",
        "\"url\":\"https://docs.lantern.example/intro\",",
        "\"retrieved_at\":\"2026-01-01T00:00:00Z\",",
        "\"content\":{\"type\":\"document\",\"source\":{\"type\":\"text\",",
        "\"media_type\":\"text/plain\",",
        "\"data\":\"lantern is a local provenance memory engine\"}}}}",
        "]}\n",
        "{\"role\":\"assistant\",\"content\":[",
        "{\"type\":\"web_fetch_tool_result\",\"tool_use_id\":\"srvtoolu_wf_pdf\",",
        "\"content\":{\"type\":\"web_fetch_result\",",
        "\"url\":\"https://docs.lantern.example/report.pdf\",",
        "\"content\":{\"type\":\"document\",\"source\":{\"type\":\"base64\",",
        "\"media_type\":\"application/pdf\",\"data\":\"OPAQUE_WEB_FETCH_PAYLOAD\"}}}}",
        "]}\n",
        "{\"role\":\"assistant\",\"content\":[",
        "{\"type\":\"web_fetch_tool_result\",\"tool_use_id\":\"srvtoolu_wf_failed\",",
        "\"content\":{\"type\":\"web_fetch_tool_result_error\",",
        "\"error_code\":\"unsupported_content_type\"}}",
        "]}\n",
    );
    let workdir = tempdir().unwrap();
    let mut store = Store::initialize(&workdir.path().join("store")).unwrap();
    let jsonl_path = workdir.path().join("web-fetch-tool-result-block.jsonl");
    fs::write(&jsonl_path, fixture).unwrap();

    let report = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 4);

    // Text-source document: URL anchor plus the document body.
    let text_hits = search(&store, "provenance", SearchOptions::default()).unwrap();
    assert_eq!(text_hits.len(), 1);
    let text_hit = &text_hits[0];
    assert!(
        text_hit
            .text
            .contains("[web_fetch_tool_result] https://docs.lantern.example/intro"),
        "web_fetch_result should surface behind the distinct prefix; got: {}",
        text_hit.text
    );
    assert!(
        text_hit
            .text
            .contains("[document] lantern is a local provenance memory engine"),
        "nested text-source document body should be keyword-searchable"
    );
    assert_eq!(text_hit.tool_call_id.as_deref(), Some("srvtoolu_wf_main"));

    // Base64 document: URL and media type only, never the payload.
    let pdf_hits = search(&store, "report", SearchOptions::default()).unwrap();
    assert_eq!(pdf_hits.len(), 1);
    let pdf_hit = &pdf_hits[0];
    assert!(
        pdf_hit
            .text
            .contains("[web_fetch_tool_result] https://docs.lantern.example/report.pdf"),
        "pdf reply should still surface the URL anchor"
    );
    assert!(
        pdf_hit.text.contains("[document:application/pdf]"),
        "base64 nested document should fall back to media_type anchor"
    );
    assert!(
        !pdf_hit.text.contains("OPAQUE_WEB_FETCH_PAYLOAD"),
        "base64 payload must never leak into the chunk"
    );
    let payload_hits =
        search(&store, "OPAQUE_WEB_FETCH_PAYLOAD", SearchOptions::default()).unwrap();
    assert!(
        payload_hits.is_empty(),
        "base64 web_fetch payload must not be searchable as keyword anywhere"
    );

    // Error reply: dedicated prefix, distinct from the other error prefixes.
    let failed_hits =
        search(&store, "unsupported_content_type", SearchOptions::default()).unwrap();
    assert_eq!(failed_hits.len(), 1);
    let failed_hit = &failed_hits[0];
    assert!(
        failed_hit
            .text
            .contains("[web_fetch_tool_error] error_code=unsupported_content_type"),
        "failed web_fetch replies should render behind the distinct web_fetch_tool_error prefix"
    );
    for foreign_prefix in [
        "[tool_error]",
        "[mcp_tool_error]",
        "[web_search_tool_error]",
        "[code_execution_error]",
    ] {
        assert!(
            !failed_hit.text.contains(foreign_prefix),
            "web_fetch error must be distinct from every other tool_error variant"
        );
    }
    assert_eq!(
        failed_hit.tool_call_id.as_deref(),
        Some("srvtoolu_wf_failed")
    );

    // Confirm the id is present in the chunks table itself, not only on the search hit.
    let persisted_call_id: Option<String> = store
        .conn()
        .query_row(
            "SELECT tool_call_id FROM chunks WHERE id = ?",
            params![text_hit.chunk_id],
            |row| row.get(0),
        )
        .unwrap();
    assert_eq!(persisted_call_id.as_deref(), Some("srvtoolu_wf_main"));
}
/// End-to-end check for top-level OpenAI `web_search_call` lines.
///
/// The fixture holds a `search` action (with a noisy `sources` array) and an
/// `open_page` action between a user and an assistant message. Each call
/// must render behind `[web_search_call:<action>]`, carry its item `id` as
/// `tool_call_id`, and keep the `sources` snippet out of the index.
#[test]
fn ingests_openai_web_search_call_top_level_line_end_to_end() {
    let fixture = concat!(
        "{\"role\":\"user\",\"content\":\"Find me the latest AI news.\"}\n",
        "{\"type\":\"web_search_call\",\"id\":\"ws_call_search\",\"status\":\"completed\",",
        "\"action\":{\"type\":\"search\",\"query\":\"latest news about AI\",",
        "\"sources\":[{\"url\":\"https://big.example/1\",\"snippet\":\"NOISE_BLOB_PAYLOAD\"}]}}\n",
        "{\"type\":\"web_search_call\",\"id\":\"ws_call_open\",\"status\":\"completed\",",
        "\"action\":{\"type\":\"open_page\",\"url\":\"https://news.example/article\"}}\n",
        "{\"role\":\"assistant\",\"content\":\"Here is what I found.\"}\n",
    );
    let workdir = tempdir().unwrap();
    let mut store = Store::initialize(&workdir.path().join("store")).unwrap();
    let jsonl_path = workdir.path().join("web-search-call-line.jsonl");
    fs::write(&jsonl_path, fixture).unwrap();

    let report = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 4);

    let query_hits = search(&store, "latest news about AI", SearchOptions::default()).unwrap();
    assert_eq!(query_hits.len(), 1);
    let query_hit = &query_hits[0];
    assert!(
        query_hit
            .text
            .contains("[web_search_call:search] {\"query\":\"latest news about AI\"}"),
        "search-action line should render behind the web_search_call:search prefix"
    );
    assert_eq!(query_hit.tool_call_id.as_deref(), Some("ws_call_search"));

    let page_hits = search(&store, "article", SearchOptions::default()).unwrap();
    assert_eq!(page_hits.len(), 1);
    let page_hit = &page_hits[0];
    assert!(
        page_hit
            .text
            .contains("[web_search_call:open_page] {\"url\":\"https://news.example/article\"}"),
        "open_page-action line should render behind the web_search_call:open_page prefix"
    );
    assert_eq!(page_hit.tool_call_id.as_deref(), Some("ws_call_open"));

    // The sources array only carries noise for this test and must not be indexed.
    let noise_hits = search(&store, "NOISE_BLOB_PAYLOAD", SearchOptions::default()).unwrap();
    assert!(
        noise_hits.is_empty(),
        "large optional action arrays must not leak into the chunk text"
    );

    let prose_hits = search(&store, "Here is what I found", SearchOptions::default()).unwrap();
    assert_eq!(prose_hits.len(), 1);
    assert!(prose_hits[0].text.starts_with("[assistant]"));

    // Confirm the id is present in the chunks table itself, not only on the search hit.
    let persisted_call_id: Option<String> = store
        .conn()
        .query_row(
            "SELECT tool_call_id FROM chunks WHERE id = ?",
            params![query_hit.chunk_id],
            |row| row.get(0),
        )
        .unwrap();
    assert_eq!(persisted_call_id.as_deref(), Some("ws_call_search"));
}
/// End-to-end check for the `find_in_page` action of an OpenAI
/// `web_search_call` line.
///
/// The rendered chunk must expose both `pattern` and `url` behind
/// `[web_search_call:find_in_page]`, keep the item `id` as `tool_call_id`,
/// and leave the `sources` snippet out of the index.
#[test]
fn ingests_openai_web_search_call_find_in_page_action_end_to_end() {
    let fixture = concat!(
        "{\"role\":\"user\",\"content\":\"Look up docs on the tokio scheduler.\"}\n",
        "{\"type\":\"web_search_call\",\"id\":\"ws_find_call\",\"status\":\"completed\",",
        "\"action\":{\"type\":\"find_in_page\",\"pattern\":\"async runtime\",",
        "\"url\":\"https://docs.example/tokio\",",
        "\"sources\":[{\"snippet\":\"NOISE_BLOB_FIND_PAGE\"}]}}\n",
        "{\"role\":\"assistant\",\"content\":\"Located the relevant section.\"}\n",
    );
    let workdir = tempdir().unwrap();
    let mut store = Store::initialize(&workdir.path().join("store")).unwrap();
    let jsonl_path = workdir.path().join("web-search-call-find-in-page.jsonl");
    fs::write(&jsonl_path, fixture).unwrap();

    let report = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 3);

    let find_hits = search(&store, "async runtime", SearchOptions::default()).unwrap();
    assert_eq!(find_hits.len(), 1);
    let find_hit = &find_hits[0];
    assert!(
        find_hit.text.contains(
            "[web_search_call:find_in_page] {\"pattern\":\"async runtime\",\"url\":\"https://docs.example/tokio\"}"
        ),
        "find_in_page line should render with sorted pattern+url anchors"
    );
    assert_eq!(find_hit.tool_call_id.as_deref(), Some("ws_find_call"));

    let noise_hits = search(&store, "NOISE_BLOB_FIND_PAGE", SearchOptions::default()).unwrap();
    assert!(
        noise_hits.is_empty(),
        "large optional action arrays must not leak into chunk text after pattern widening"
    );

    let prose_hits =
        search(&store, "Located the relevant section", SearchOptions::default()).unwrap();
    assert_eq!(prose_hits.len(), 1);
    assert!(prose_hits[0].text.starts_with("[assistant]"));

    // Confirm the id is present in the chunks table itself, not only on the search hit.
    let persisted_call_id: Option<String> = store
        .conn()
        .query_row(
            "SELECT tool_call_id FROM chunks WHERE id = ?",
            params![find_hit.chunk_id],
            |row| row.get(0),
        )
        .unwrap();
    assert_eq!(persisted_call_id.as_deref(), Some("ws_find_call"));
}
/// End-to-end check for top-level OpenAI `file_search_call` lines.
///
/// Both entries of `queries` must land in a single `[file_search_call]`
/// chunk bound to the item `id`, while the `results` array text stays out of
/// the index.
#[test]
fn ingests_openai_file_search_call_top_level_line_end_to_end() {
    let fixture = concat!(
        "{\"role\":\"user\",\"content\":\"Look up lantern wiring guidance.\"}\n",
        "{\"type\":\"file_search_call\",\"id\":\"fs_call_one\",\"status\":\"completed\",",
        "\"queries\":[\"lantern wiring schematic\",\"copper terminal\"],",
        "\"results\":[{\"file_id\":\"file_xyz\",\"text\":\"NOISE_BLOB_FILE_CONTENT\"}]}\n",
        "{\"role\":\"assistant\",\"content\":\"Here is what the docs say.\"}\n",
    );
    let workdir = tempdir().unwrap();
    let mut store = Store::initialize(&workdir.path().join("store")).unwrap();
    let jsonl_path = workdir.path().join("file-search-call-line.jsonl");
    fs::write(&jsonl_path, fixture).unwrap();

    let report = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 3);

    // A token from the first query string.
    let first_query_hits = search(&store, "schematic", SearchOptions::default()).unwrap();
    assert_eq!(first_query_hits.len(), 1);
    let call_hit = &first_query_hits[0];
    assert!(
        call_hit.text.contains(
            "[file_search_call] {\"queries\":[\"lantern wiring schematic\",\"copper terminal\"]}"
        ),
        "file_search_call line should render behind the file_search_call prefix with both queries"
    );
    assert_eq!(call_hit.tool_call_id.as_deref(), Some("fs_call_one"));

    // A token from the second query string must resolve to the same chunk.
    let second_query_hits = search(&store, "copper", SearchOptions::default()).unwrap();
    assert_eq!(second_query_hits.len(), 1);
    assert_eq!(
        second_query_hits[0].chunk_id, call_hit.chunk_id,
        "both queries should land in the same chunk for the same call"
    );

    let noise_hits =
        search(&store, "NOISE_BLOB_FILE_CONTENT", SearchOptions::default()).unwrap();
    assert!(
        noise_hits.is_empty(),
        "large optional results arrays must not leak into the chunk text"
    );

    let prose_hits = search(&store, "Here is what the docs", SearchOptions::default()).unwrap();
    assert_eq!(prose_hits.len(), 1);
    assert!(prose_hits[0].text.starts_with("[assistant]"));

    // Confirm the id is present in the chunks table itself, not only on the search hit.
    let persisted_call_id: Option<String> = store
        .conn()
        .query_row(
            "SELECT tool_call_id FROM chunks WHERE id = ?",
            params![call_hit.chunk_id],
            |row| row.get(0),
        )
        .unwrap();
    assert_eq!(persisted_call_id.as_deref(), Some("fs_call_one"));
}
/// End-to-end check for top-level OpenAI `code_interpreter_call` lines.
///
/// The multi-line `code` field must be kept in a `[code_interpreter_call]`
/// chunk bound to the item `id`, and the `container_id` value must not be
/// searchable.
#[test]
fn ingests_openai_code_interpreter_call_top_level_line_end_to_end() {
    let fixture = concat!(
        "{\"role\":\"user\",\"content\":\"Please summarise the dataframe.\"}\n",
        "{\"type\":\"code_interpreter_call\",\"id\":\"ci_call_one\",",
        "\"status\":\"completed\",\"container_id\":\"cntr_NOISE_BLOB_SANDBOX_ID\",",
        "\"code\":\"import pandas as pd\\nprint(df.describe())\"}\n",
        "{\"role\":\"assistant\",\"content\":\"Here is the summary.\"}\n",
    );
    let workdir = tempdir().unwrap();
    let mut store = Store::initialize(&workdir.path().join("store")).unwrap();
    let jsonl_path = workdir.path().join("code-interpreter-call-line.jsonl");
    fs::write(&jsonl_path, fixture).unwrap();

    let report = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 3);

    let code_hits = search(&store, "describe", SearchOptions::default()).unwrap();
    assert_eq!(code_hits.len(), 1);
    let code_hit = &code_hits[0];
    assert!(
        code_hit
            .text
            .contains("[code_interpreter_call] import pandas as pd"),
        "code_interpreter_call line should render behind the distinct prefix"
    );
    assert!(
        code_hit.text.contains("print(df.describe())"),
        "multi-line code should be preserved verbatim in the chunk text"
    );
    assert_eq!(code_hit.tool_call_id.as_deref(), Some("ci_call_one"));

    let noise_hits = search(&store, "NOISE_BLOB_SANDBOX_ID", SearchOptions::default()).unwrap();
    assert!(
        noise_hits.is_empty(),
        "container_id sandbox identifier must not leak into chunk text"
    );

    let prose_hits = search(&store, "Here is the summary", SearchOptions::default()).unwrap();
    assert_eq!(prose_hits.len(), 1);
    assert!(prose_hits[0].text.starts_with("[assistant]"));

    // Confirm the id is present in the chunks table itself, not only on the search hit.
    let persisted_call_id: Option<String> = store
        .conn()
        .query_row(
            "SELECT tool_call_id FROM chunks WHERE id = ?",
            params![code_hit.chunk_id],
            |row| row.get(0),
        )
        .unwrap();
    assert_eq!(persisted_call_id.as_deref(), Some("ci_call_one"));
}
/// End-to-end check for the inline `outputs` array of an OpenAI
/// `code_interpreter_call` line.
///
/// The call has three outputs: a `logs` entry, an image given as a `data:`
/// URL and an image given by `file_id`. All of them must be rendered into the
/// same chunk as the call body, and the base64 part of the `data:` URL must
/// not be searchable.
#[test]
fn ingests_openai_code_interpreter_call_inline_outputs_end_to_end() {
    let fixture = concat!(
        "{\"role\":\"user\",\"content\":\"Plot the dataframe.\"}\n",
        "{\"type\":\"code_interpreter_call\",\"id\":\"ci_with_outputs\",",
        "\"status\":\"completed\",",
        "\"code\":\"plot(df['revenue'])\",\"outputs\":[",
        "{\"type\":\"logs\",\"logs\":\"computed_revenue_total=4242\"},",
        "{\"type\":\"image\",\"url\":\"data:image/png;base64,NOISE_BASE64_PAYLOAD\"},",
        "{\"type\":\"image\",\"file_id\":\"file_chart_anchor\"}",
        "]}\n",
        "{\"role\":\"assistant\",\"content\":\"Here is the chart.\"}\n",
    );
    let workdir = tempdir().unwrap();
    let mut store = Store::initialize(&workdir.path().join("store")).unwrap();
    let jsonl_path = workdir.path().join("code-interpreter-call-outputs.jsonl");
    fs::write(&jsonl_path, fixture).unwrap();

    let report = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 3);

    let logs_hits = search(&store, "computed_revenue_total", SearchOptions::default()).unwrap();
    assert_eq!(logs_hits.len(), 1);
    let call_hit = &logs_hits[0];

    // Every rendered fragment is expected in the one chunk found via the logs token.
    let expected_fragments = [
        (
            "[code_interpreter_logs] computed_revenue_total=4242",
            "inline logs output should render behind the distinct prefix",
        ),
        (
            "[code_interpreter_call] plot(df['revenue'])",
            "outputs lines should be appended to the call body in the same chunk",
        ),
        (
            "[code_interpreter_image:image/png]",
            "data: URL image entries should collapse to a media-type anchor",
        ),
        (
            "[code_interpreter_image:file] file_chart_anchor",
            "file_id-only image entries should surface under the `:file` sub-prefix",
        ),
    ];
    for (fragment, why) in expected_fragments {
        assert!(call_hit.text.contains(fragment), "{}", why);
    }

    let payload_hits = search(&store, "NOISE_BASE64_PAYLOAD", SearchOptions::default()).unwrap();
    assert!(
        payload_hits.is_empty(),
        "base64 image payload must never leak into any chunk"
    );

    let anchor_hits = search(&store, "file_chart_anchor", SearchOptions::default()).unwrap();
    assert_eq!(anchor_hits.len(), 1);
    assert_eq!(call_hit.tool_call_id.as_deref(), Some("ci_with_outputs"));

    let prose_hits = search(&store, "Here is the chart", SearchOptions::default()).unwrap();
    assert_eq!(prose_hits.len(), 1);
    assert!(prose_hits[0].text.starts_with("[assistant]"));
}
/// End-to-end check for top-level OpenAI `local_shell_call` lines.
///
/// The `exec` action must render behind `[local_shell_call:exec]` with its
/// `command` and `working_directory`, be bound to `call_id` (not the
/// response item `id`), and keep both the `env` keys and values out of the
/// index.
#[test]
fn ingests_openai_local_shell_call_top_level_line_end_to_end() {
    let fixture = concat!(
        "{\"role\":\"user\",\"content\":\"Please list /tmp.\"}\n",
        "{\"type\":\"local_shell_call\",\"id\":\"lsh_item_one\",",
        "\"call_id\":\"call_shell_one\",\"status\":\"completed\",",
        "\"action\":{\"type\":\"exec\",",
        "\"command\":[\"bash\",\"-c\",\"ls /tmp/lantern_workdir\"],",
        "\"working_directory\":\"/home/agent\",",
        "\"env\":{\"SECRET_TOKEN\":\"NOISE_ENV_VALUE\"},",
        "\"timeout_ms\":30000,\"user\":\"root\"}}\n",
        "{\"role\":\"assistant\",\"content\":\"Listed successfully.\"}\n",
    );
    let workdir = tempdir().unwrap();
    let mut store = Store::initialize(&workdir.path().join("store")).unwrap();
    let jsonl_path = workdir.path().join("local-shell-call-line.jsonl");
    fs::write(&jsonl_path, fixture).unwrap();

    let report = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 3);

    let shell_hits = search(&store, "lantern_workdir", SearchOptions::default()).unwrap();
    assert_eq!(shell_hits.len(), 1);
    let shell_hit = &shell_hits[0];
    let expected_fragments = [
        (
            "[local_shell_call:exec]",
            "local_shell_call line should render behind the distinct prefix",
        ),
        (
            "\"command\":[\"bash\",\"-c\",\"ls /tmp/lantern_workdir\"]",
            "command tokens should be preserved in deterministic JSON shape",
        ),
        (
            "\"working_directory\":\"/home/agent\"",
            "working_directory anchor should survive ingest",
        ),
    ];
    for (fragment, why) in expected_fragments {
        assert!(shell_hit.text.contains(fragment), "{}", why);
    }
    assert_eq!(shell_hit.tool_call_id.as_deref(), Some("call_shell_one"));

    // Neither side of the env map may be searchable.
    let env_value_hits = search(&store, "NOISE_ENV_VALUE", SearchOptions::default()).unwrap();
    assert!(
        env_value_hits.is_empty(),
        "env values must not leak into chunk text"
    );
    let env_key_hits = search(&store, "SECRET_TOKEN", SearchOptions::default()).unwrap();
    assert!(
        env_key_hits.is_empty(),
        "env keys must not leak into chunk text"
    );

    // Any chunk that matches the item id must not carry it as its tool_call_id.
    let item_id_hits = search(&store, "lsh_item_one", SearchOptions::default()).unwrap();
    let promoted_item_id = item_id_hits
        .iter()
        .any(|hit| hit.tool_call_id.as_deref() == Some("lsh_item_one"));
    assert!(
        !promoted_item_id,
        "response item id must NOT be promoted to tool_call_id"
    );

    let prose_hits = search(&store, "Listed successfully", SearchOptions::default()).unwrap();
    assert_eq!(prose_hits.len(), 1);
    assert!(prose_hits[0].text.starts_with("[assistant]"));

    // Confirm the id is present in the chunks table itself, not only on the search hit.
    let persisted_call_id: Option<String> = store
        .conn()
        .query_row(
            "SELECT tool_call_id FROM chunks WHERE id = ?",
            params![shell_hit.chunk_id],
            |row| row.get(0),
        )
        .unwrap();
    assert_eq!(persisted_call_id.as_deref(), Some("call_shell_one"));
}
/// End-to-end check for top-level OpenAI `local_shell_call_output` lines.
///
/// The multi-line `output` must be kept in a `[local_shell_call_output]`
/// chunk that shares the `call_id` of the originating `local_shell_call`.
/// The chunk must not carry a `[local_shell_call:` prefix, and the response
/// item `id` must not be used as `tool_call_id`.
#[test]
fn ingests_openai_local_shell_call_output_top_level_line_end_to_end() {
    let fixture = concat!(
        "{\"role\":\"user\",\"content\":\"Please list /tmp.\"}\n",
        "{\"type\":\"local_shell_call\",\"id\":\"lsh_item_one\",",
        "\"call_id\":\"call_shell_one\",\"status\":\"completed\",",
        "\"action\":{\"type\":\"exec\",",
        "\"command\":[\"bash\",\"-c\",\"ls /tmp/lantern_workdir\"]}}\n",
        "{\"type\":\"local_shell_call_output\",\"id\":\"lsh_out_item_one\",",
        "\"call_id\":\"call_shell_one\",\"status\":\"completed\",",
        "\"output\":\"alpha_file.txt\\nbeta_file.txt\"}\n",
        "{\"role\":\"assistant\",\"content\":\"Listed successfully.\"}\n",
    );
    let workdir = tempdir().unwrap();
    let mut store = Store::initialize(&workdir.path().join("store")).unwrap();
    let jsonl_path = workdir.path().join("local-shell-call-output.jsonl");
    fs::write(&jsonl_path, fixture).unwrap();

    let report = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 4);

    let output_hits = search(&store, "alpha_file", SearchOptions::default()).unwrap();
    assert_eq!(output_hits.len(), 1);
    let output_hit = &output_hits[0];
    assert!(
        output_hit.text.contains("[local_shell_call_output]"),
        "local_shell_call_output line should render behind the distinct prefix"
    );
    assert!(
        output_hit.text.contains("alpha_file.txt"),
        "stdout tokens should be preserved in chunk text"
    );
    assert!(
        output_hit.text.contains("beta_file.txt"),
        "multi-line stdout should be preserved across newlines"
    );
    assert_eq!(output_hit.tool_call_id.as_deref(), Some("call_shell_one"));
    assert!(
        !output_hit.text.contains("[local_shell_call:"),
        "output prefix must not collide with [local_shell_call:ACTION]"
    );

    // Any chunk that matches the item id must not carry it as its tool_call_id.
    let item_id_hits = search(&store, "lsh_out_item_one", SearchOptions::default()).unwrap();
    let promoted_item_id = item_id_hits
        .iter()
        .any(|hit| hit.tool_call_id.as_deref() == Some("lsh_out_item_one"));
    assert!(
        !promoted_item_id,
        "response item id must NOT be promoted to tool_call_id"
    );

    let prose_hits = search(&store, "Listed successfully", SearchOptions::default()).unwrap();
    assert_eq!(prose_hits.len(), 1);
    assert!(prose_hits[0].text.starts_with("[assistant]"));

    // Confirm the id is present in the chunks table itself, not only on the search hit.
    let persisted_call_id: Option<String> = store
        .conn()
        .query_row(
            "SELECT tool_call_id FROM chunks WHERE id = ?",
            params![output_hit.chunk_id],
            |row| row.get(0),
        )
        .unwrap();
    assert_eq!(persisted_call_id.as_deref(), Some("call_shell_one"));
}
/// End-to-end check for top-level OpenAI `computer_call_output` lines whose
/// screenshot is referenced by an https URL.
///
/// The `image_url` must be kept in a `[computer_call_output]` chunk bound to
/// the `call_id`, and the response item `id` must not be used as
/// `tool_call_id`.
#[test]
fn ingests_openai_computer_call_output_top_level_line_end_to_end() {
    let fixture = concat!(
        "{\"role\":\"user\",\"content\":\"Take a screenshot.\"}\n",
        "{\"type\":\"computer_call\",\"id\":\"cu_item_one\",",
        "\"call_id\":\"call_screenshot_one\",\"status\":\"completed\",",
        "\"action\":{\"type\":\"screenshot\"}}\n",
        "{\"type\":\"computer_call_output\",\"id\":\"cuo_item_one\",",
        "\"call_id\":\"call_screenshot_one\",\"status\":\"completed\",",
        "\"output\":{\"type\":\"input_image\",",
        "\"image_url\":\"https://example.test/cap_unique_token.png\"}}\n",
        "{\"role\":\"assistant\",\"content\":\"Captured.\"}\n",
    );
    let workdir = tempdir().unwrap();
    let mut store = Store::initialize(&workdir.path().join("store")).unwrap();
    let jsonl_path = workdir.path().join("computer-call-output.jsonl");
    fs::write(&jsonl_path, fixture).unwrap();

    let report = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 4);

    let screenshot_hits = search(&store, "cap_unique_token", SearchOptions::default()).unwrap();
    assert_eq!(screenshot_hits.len(), 1);
    let screenshot_hit = &screenshot_hits[0];
    assert!(
        screenshot_hit.text.contains("[computer_call_output]"),
        "computer_call_output line should render behind the distinct prefix"
    );
    assert!(
        screenshot_hit
            .text
            .contains("https://example.test/cap_unique_token.png"),
        "image_url should be preserved in chunk text"
    );
    assert_eq!(
        screenshot_hit.tool_call_id.as_deref(),
        Some("call_screenshot_one")
    );

    // Any chunk that matches the item id must not carry it as its tool_call_id.
    let item_id_hits = search(&store, "cuo_item_one", SearchOptions::default()).unwrap();
    let promoted_item_id = item_id_hits
        .iter()
        .any(|hit| hit.tool_call_id.as_deref() == Some("cuo_item_one"));
    assert!(
        !promoted_item_id,
        "response item id must NOT be promoted to tool_call_id"
    );

    let prose_hits = search(&store, "Captured", SearchOptions::default()).unwrap();
    assert_eq!(prose_hits.len(), 1);
    assert!(prose_hits[0].text.starts_with("[assistant]"));

    // Confirm the id is present in the chunks table itself, not only on the search hit.
    let persisted_call_id: Option<String> = store
        .conn()
        .query_row(
            "SELECT tool_call_id FROM chunks WHERE id = ?",
            params![screenshot_hit.chunk_id],
            |row| row.get(0),
        )
        .unwrap();
    assert_eq!(persisted_call_id.as_deref(), Some("call_screenshot_one"));
}
/// End-to-end check for OpenAI `computer_call_output` lines whose screenshot
/// is an inline `data:` URL.
///
/// The chunk text must be exactly `[computer_call_output:image/png]` (media
/// type only), be bound to the `call_id`, and no chunk may contain the base64
/// payload. The chunk is looked up with direct SQL on the `chunks` table
/// rather than through `search`.
#[test]
fn ingests_openai_computer_call_output_data_url_screenshot_without_leaking_base64() {
    let root = tempdir().unwrap();
    let mut store = Store::initialize(&root.path().join("store")).unwrap();
    let file = root.path().join("computer-call-output-data-url.jsonl");
    let transcript = concat!(
        "{\"role\":\"user\",\"content\":\"Take a screenshot please.\"}\n",
        "{\"type\":\"computer_call\",\"id\":\"cu_data_one\",",
        "\"call_id\":\"call_data_one\",\"status\":\"completed\",",
        "\"action\":{\"type\":\"screenshot\"}}\n",
        "{\"type\":\"computer_call_output\",\"id\":\"cuo_data_one\",",
        "\"call_id\":\"call_data_one\",\"status\":\"completed\",",
        "\"output\":{\"type\":\"input_image\",",
        "\"image_url\":\"data:image/png;base64,iVBORw0KOPAQUEPAYLOADNEVERINDEX\"}}\n",
        "{\"role\":\"assistant\",\"content\":\"Captured the desktop.\"}\n",
    );
    fs::write(&file, transcript).unwrap();
    let report = ingest_path(&mut store, &file).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 4);

    // Call `expect` on the Result itself so a failure reports the underlying
    // rusqlite error (no row, SQL error, type mismatch) instead of a bare None.
    // Only the columns that are asserted on are selected.
    let (prefix_text, prefix_call_id): (String, Option<String>) = store
        .conn()
        .query_row(
            "SELECT text, tool_call_id FROM chunks \
             WHERE text LIKE '[computer_call_output:image/png]%' LIMIT 1",
            [],
            |row| Ok((row.get(0)?, row.get(1)?)),
        )
        .expect("computer_call_output should surface the media_type behind the typed prefix");
    assert_eq!(prefix_text, "[computer_call_output:image/png]");
    assert_eq!(prefix_call_id.as_deref(), Some("call_data_one"));

    let store_has_base64_sentinel: bool = store
        .conn()
        .query_row(
            "SELECT EXISTS(SELECT 1 FROM chunks WHERE text LIKE '%OPAQUEPAYLOADNEVERINDEX%')",
            [],
            |row| row.get(0),
        )
        .unwrap();
    assert!(
        !store_has_base64_sentinel,
        "base64 payload must NOT leak into any chunk text"
    );

    // Check the tool_call_id column directly: a keyword search for the item id
    // only inspects chunks whose text matches it, so a promoted id on a chunk
    // with unrelated text would go unnoticed.
    let item_id_promoted: bool = store
        .conn()
        .query_row(
            "SELECT EXISTS(SELECT 1 FROM chunks WHERE tool_call_id = ?)",
            params!["cuo_data_one"],
            |row| row.get(0),
        )
        .unwrap();
    assert!(
        !item_id_promoted,
        "response item id must NOT be promoted to tool_call_id"
    );

    let prose_hits = search(&store, "Captured", SearchOptions::default()).unwrap();
    assert_eq!(prose_hits.len(), 1);
    assert!(prose_hits[0].text.starts_with("[assistant]"));
}
/// Ingests a transcript whose `computer_call_output` item carries both a base64
/// screenshot and a `current_url`, then asserts that:
///
/// * the stored chunk text is the typed `[computer_call_output:image/png]` prefix
///   followed by the URL, bound to the `call_id` of the originating call;
/// * a path segment of the URL is reachable through `search`;
/// * the base64 sentinel never appears in any chunk text;
/// * the surrounding assistant turn is still indexed behind `[assistant]`.
#[test]
fn ingests_openai_computer_call_output_with_current_url_anchor() {
    let root = tempdir().unwrap();
    let mut store = Store::initialize(&root.path().join("store")).unwrap();
    let file = root.path().join("computer-call-output-current-url.jsonl");
    let transcript = concat!(
        "{\"role\":\"user\",\"content\":\"Open the dashboard please.\"}\n",
        "{\"type\":\"computer_call\",\"id\":\"cu_curl\",",
        "\"call_id\":\"call_curl\",\"status\":\"completed\",",
        "\"action\":{\"type\":\"screenshot\"}}\n",
        "{\"type\":\"computer_call_output\",\"id\":\"cuo_curl\",",
        "\"call_id\":\"call_curl\",\"status\":\"completed\",",
        "\"output\":{\"type\":\"computer_screenshot\",",
        "\"image_url\":\"data:image/png;base64,SCREENSHOTPAYLOADNEVERINDEX\",",
        "\"current_url\":\"https://dashboard.example.test/projects/lantern\"}}\n",
        "{\"role\":\"assistant\",\"content\":\"Dashboard captured.\"}\n",
    );
    fs::write(&file, transcript).unwrap();
    let report = ingest_path(&mut store, &file).unwrap();
    assert_eq!(report.ingested.len(), 1);
    // The transcript has four JSONL lines and each is expected to yield a chunk.
    assert_eq!(report.ingested[0].chunks, 4);

    // Unwrap the query result directly rather than discarding the error with
    // `.ok()`: a malformed query or a schema change then shows up in the panic
    // output instead of being misreported as a missing row.
    let (prefix_text, prefix_call_id): (String, Option<String>) = store
        .conn()
        .query_row(
            "SELECT text, tool_call_id FROM chunks \
             WHERE text LIKE '[computer_call_output:image/png]%' LIMIT 1",
            [],
            |row| Ok((row.get(0)?, row.get(1)?)),
        )
        .expect("computer_call_output chunk should land behind the typed prefix");
    assert_eq!(
        prefix_text,
        "[computer_call_output:image/png] https://dashboard.example.test/projects/lantern"
    );
    assert_eq!(prefix_call_id.as_deref(), Some("call_curl"));

    let url_hits = search(&store, "projects", SearchOptions::default()).unwrap();
    assert!(
        url_hits.iter().any(|h| h
            .text
            .contains("https://dashboard.example.test/projects/lantern")),
        "current_url path segment must be FTS-searchable on the computer_call_output chunk"
    );

    // Raw LIKE scan over every chunk so the check does not depend on tokenization.
    let store_has_base64_sentinel: bool = store
        .conn()
        .query_row(
            "SELECT EXISTS(SELECT 1 FROM chunks WHERE text LIKE '%SCREENSHOTPAYLOADNEVERINDEX%')",
            [],
            |row| row.get(0),
        )
        .unwrap();
    assert!(
        !store_has_base64_sentinel,
        "base64 payload must NOT leak into any chunk text"
    );

    let prose_hits = search(&store, "Dashboard captured", SearchOptions::default()).unwrap();
    assert_eq!(prose_hits.len(), 1);
    assert!(prose_hits[0].text.starts_with("[assistant]"));
}
/// Exercises ingestion of a top-level OpenAI `reasoning` item from a JSONL transcript.
///
/// Asserts that both summary texts land in a single chunk starting with
/// `[reasoning] `, that the `encrypted_content` blob is not searchable, and that the
/// item `id` is not recorded as the chunk's `tool_call_id`.
#[test]
fn ingests_openai_reasoning_top_level_line_end_to_end() {
    let workdir = tempdir().unwrap();
    let mut store = Store::initialize(&workdir.path().join("store")).unwrap();
    let jsonl_path = workdir.path().join("reasoning.jsonl");
    fs::write(
        &jsonl_path,
        concat!(
            "{\"role\":\"user\",\"content\":\"Help me debug this.\"}\n",
            "{\"type\":\"reasoning\",\"id\":\"rs_traceuniq\",",
            "\"status\":\"completed\",",
            "\"encrypted_content\":\"OPAQUE_BASE64_ZZZ_NEVER_INDEX\",",
            "\"summary\":[",
            "{\"type\":\"summary_text\",\"text\":\"first weigh polyglot tradeoffs\"},",
            "{\"type\":\"summary_text\",\"text\":\"then choose the rust path\"}",
            "]}\n",
            "{\"role\":\"assistant\",\"content\":\"Use Rust here.\"}\n",
        ),
    )
    .unwrap();

    let report = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 3);

    // Every lookup in this test uses default options.
    let lookup = |query: &str| search(&store, query, SearchOptions::default()).unwrap();

    let summary_hits = lookup("polyglot");
    assert_eq!(summary_hits.len(), 1);
    let reasoning_hit = &summary_hits[0];
    assert!(
        reasoning_hit.text.starts_with("[reasoning] "),
        "reasoning line should render behind the distinct prefix, got: {}",
        reasoning_hit.text
    );
    for fragment in ["first weigh polyglot tradeoffs", "then choose the rust path"] {
        assert!(reasoning_hit.text.contains(fragment));
    }

    assert!(
        lookup("OPAQUE_BASE64_ZZZ").is_empty(),
        "encrypted_content base64 blob must not be searchable"
    );
    assert!(reasoning_hit.tool_call_id.is_none());

    // No hit for the item id may carry that id as its tool_call_id.
    assert!(
        lookup("rs_traceuniq")
            .iter()
            .all(|hit| hit.tool_call_id.as_deref() != Some("rs_traceuniq")),
        "top-level reasoning item id must NOT be promoted to tool_call_id"
    );

    let assistant_hits = lookup("Use Rust here");
    assert_eq!(assistant_hits.len(), 1);
    assert!(assistant_hits[0].text.starts_with("[assistant]"));

    // Cross-check the persisted column, not just the search result struct.
    let stored_call_id: Option<String> = store
        .conn()
        .query_row(
            "SELECT tool_call_id FROM chunks WHERE id = ?",
            params![reasoning_hit.chunk_id],
            |row| row.get(0),
        )
        .unwrap();
    assert!(stored_call_id.is_none());
}
/// Exercises ingestion of a top-level `function_call_output` item.
///
/// Asserts that the output text is searchable behind the `[function_call_output]`
/// prefix, that the chunk is bound to the shared `call_id` (both in the search hit
/// and in the `chunks` table), and that the item `id` is not used as `tool_call_id`.
#[test]
fn ingests_openai_function_call_output_top_level_line_end_to_end() {
    let workdir = tempdir().unwrap();
    let mut store = Store::initialize(&workdir.path().join("store")).unwrap();
    let jsonl_path = workdir.path().join("function-call-output.jsonl");
    let fixture = concat!(
        "{\"role\":\"user\",\"content\":\"Search docs for the lantern config.\"}\n",
        "{\"type\":\"function_call\",\"id\":\"fc_item_one\",",
        "\"call_id\":\"call_search_one\",\"status\":\"completed\",",
        "\"name\":\"docs_search\",",
        "\"arguments\":\"{\\\"query\\\":\\\"lantern config\\\"}\"}\n",
        "{\"type\":\"function_call_output\",\"id\":\"fco_item_one\",",
        "\"call_id\":\"call_search_one\",\"status\":\"completed\",",
        "\"output\":\"alpha_match_token in docs/config.md\"}\n",
        "{\"role\":\"assistant\",\"content\":\"Configured properly.\"}\n",
    );
    fs::write(&jsonl_path, fixture).unwrap();

    let report = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 4);

    let output_hits = search(&store, "alpha_match_token", SearchOptions::default()).unwrap();
    assert_eq!(output_hits.len(), 1);
    let output_hit = &output_hits[0];
    assert!(
        output_hit.text.contains("[function_call_output]"),
        "function_call_output line should render behind the distinct prefix, got: {}",
        output_hit.text
    );
    assert!(
        output_hit.text.contains("alpha_match_token in docs/config.md"),
        "output text should be preserved in chunk text"
    );
    assert_eq!(output_hit.tool_call_id.as_deref(), Some("call_search_one"));
    assert!(
        !output_hit.text.contains("[tool:docs_search]"),
        "output prefix must not collide with [tool:NAME]"
    );

    // No hit for the item id may carry that id as its tool_call_id.
    let by_item_id = search(&store, "fco_item_one", SearchOptions::default()).unwrap();
    assert!(
        by_item_id
            .iter()
            .all(|hit| hit.tool_call_id.as_deref() != Some("fco_item_one")),
        "response item id must NOT be promoted to tool_call_id"
    );

    let assistant_hits = search(&store, "Configured properly", SearchOptions::default()).unwrap();
    assert_eq!(assistant_hits.len(), 1);
    assert!(assistant_hits[0].text.starts_with("[assistant]"));

    // Cross-check the persisted column for the output chunk.
    let stored_call_id: Option<String> = store
        .conn()
        .query_row(
            "SELECT tool_call_id FROM chunks WHERE id = ?",
            params![output_hit.chunk_id],
            |row| row.get(0),
        )
        .unwrap();
    assert_eq!(stored_call_id.as_deref(), Some("call_search_one"));
}
/// Exercises ingestion of a top-level `code_interpreter_call_output` item.
///
/// Asserts that the output text is searchable behind the
/// `[code_interpreter_call_output]` prefix, that the chunk is bound to the shared
/// `call_id`, and that the item `id` is not used as `tool_call_id`.
#[test]
fn ingests_openai_code_interpreter_call_output_top_level_line_end_to_end() {
    let workdir = tempdir().unwrap();
    let mut store = Store::initialize(&workdir.path().join("store")).unwrap();
    let jsonl_path = workdir.path().join("code-interpreter-call-output.jsonl");
    fs::write(
        &jsonl_path,
        concat!(
            "{\"role\":\"user\",\"content\":\"Compute the mean of these values.\"}\n",
            "{\"type\":\"code_interpreter_call\",\"id\":\"ci_item_one\",",
            "\"call_id\":\"call_interp_one\",\"status\":\"completed\",",
            "\"code\":\"import statistics\\nprint(statistics.mean([1,2,3]))\"}\n",
            "{\"type\":\"code_interpreter_call_output\",\"id\":\"cio_item_one\",",
            "\"call_id\":\"call_interp_one\",",
            "\"output\":\"beta_match_token mean=2.0\"}\n",
            "{\"role\":\"assistant\",\"content\":\"The mean is 2.0.\"}\n",
        ),
    )
    .unwrap();

    let report = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 4);

    let lookup = |query: &str| search(&store, query, SearchOptions::default()).unwrap();

    let output_hits = lookup("beta_match_token");
    assert_eq!(output_hits.len(), 1);
    let output_hit = &output_hits[0];
    assert!(
        output_hit.text.contains("[code_interpreter_call_output]"),
        "code_interpreter_call_output line should render behind the distinct prefix, got: {}",
        output_hit.text
    );
    assert!(
        output_hit.text.contains("beta_match_token mean=2.0"),
        "output text should be preserved in chunk text"
    );
    assert_eq!(output_hit.tool_call_id.as_deref(), Some("call_interp_one"));
    // The trailing space in the needle is deliberate: it matches only the call prefix.
    assert!(
        !output_hit.text.contains("[code_interpreter_call] "),
        "output prefix must not collide with [code_interpreter_call]"
    );

    assert!(
        lookup("cio_item_one")
            .iter()
            .all(|hit| hit.tool_call_id.as_deref() != Some("cio_item_one")),
        "response item id must NOT be promoted to tool_call_id"
    );

    let assistant_hits = lookup("The mean is");
    assert_eq!(assistant_hits.len(), 1);
    assert!(assistant_hits[0].text.starts_with("[assistant]"));

    let stored_call_id: Option<String> = store
        .conn()
        .query_row(
            "SELECT tool_call_id FROM chunks WHERE id = ?",
            params![output_hit.chunk_id],
            |row| row.get(0),
        )
        .unwrap();
    assert_eq!(stored_call_id.as_deref(), Some("call_interp_one"));
}
/// Exercises ingestion of a top-level `image_generation_call` item.
///
/// Asserts that the `revised_prompt` is searchable behind the
/// `[image_generation_call]` prefix, that the base64 `result` is not searchable,
/// and that the item `id` is stored as the chunk's `tool_call_id`.
#[test]
fn ingests_openai_image_generation_call_top_level_line_end_to_end() {
    let workdir = tempdir().unwrap();
    let mut store = Store::initialize(&workdir.path().join("store")).unwrap();
    let jsonl_path = workdir.path().join("image-generation-call-line.jsonl");
    let fixture = concat!(
        "{\"role\":\"user\",\"content\":\"Please draw a small lantern in a forest.\"}\n",
        "{\"type\":\"image_generation_call\",\"id\":\"ig_call_one\",",
        "\"status\":\"completed\",\"size\":\"1024x1024\",\"quality\":\"high\",",
        "\"output_format\":\"png\",",
        "\"result\":\"iVBORw0KGgoAAAANSUhEUgAA_NOISE_BASE64_BLOB_PAYLOAD\",",
        "\"revised_prompt\":\"a small glowing lantern hanging from a tree branch in a misty forest at dusk\"}\n",
        "{\"role\":\"assistant\",\"content\":\"Here is the rendered image.\"}\n",
    );
    fs::write(&jsonl_path, fixture).unwrap();

    let report = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 3);

    // "lantern" also appears in the user turn, so pick the image chunk by prefix.
    let lantern_hits = search(&store, "lantern", SearchOptions::default()).unwrap();
    let image_hit = lantern_hits
        .iter()
        .find(|hit| hit.text.contains("[image_generation_call]"))
        .expect("image_generation_call line should be keyword-searchable");
    let image_text = &image_hit.text;
    assert!(
        image_text.contains("[image_generation_call] a small glowing lantern"),
        "image_generation_call line should render behind the distinct prefix"
    );
    assert!(
        image_text.contains("misty forest at dusk"),
        "revised_prompt should be preserved verbatim in the chunk text"
    );
    assert_eq!(image_hit.tool_call_id.as_deref(), Some("ig_call_one"));

    let blob_hits = search(
        &store,
        "NOISE_BASE64_BLOB_PAYLOAD",
        SearchOptions::default(),
    )
    .unwrap();
    assert!(
        blob_hits.is_empty(),
        "base64 result blob must not leak into chunk text"
    );

    let assistant_hits = search(&store, "rendered image", SearchOptions::default()).unwrap();
    assert_eq!(assistant_hits.len(), 1);
    assert!(assistant_hits[0].text.starts_with("[assistant]"));

    let stored_call_id: Option<String> = store
        .conn()
        .query_row(
            "SELECT tool_call_id FROM chunks WHERE id = ?",
            params![image_hit.chunk_id],
            |row| row.get(0),
        )
        .unwrap();
    assert_eq!(stored_call_id.as_deref(), Some("ig_call_one"));
}
/// Exercises ingestion of a top-level `mcp_call` item without output or error.
///
/// Asserts that the chunk starts with `[mcp_call:linear/create_issue]`, that tokens
/// from the JSON-encoded `arguments` string are present, and that the item `id` and
/// `name` surface as `tool_call_id` and `tool_name`.
#[test]
fn ingests_openai_mcp_call_top_level_line_end_to_end() {
    let workdir = tempdir().unwrap();
    let mut store = Store::initialize(&workdir.path().join("store")).unwrap();
    let jsonl_path = workdir.path().join("mcp-call-line.jsonl");
    fs::write(
        &jsonl_path,
        concat!(
            "{\"role\":\"user\",\"content\":\"File a bug in Linear for the JSONL drop.\"}\n",
            "{\"type\":\"mcp_call\",\"id\":\"mcp_call_one\",",
            "\"server_label\":\"linear\",\"name\":\"create_issue\",",
            "\"arguments\":\"{\\\"title\\\":\\\"JSONL ingest drops mcp_call lines\\\",",
            "\\\"team\\\":\\\"INGEST\\\"}\"}\n",
            "{\"role\":\"assistant\",\"content\":\"Filed.\"}\n",
        ),
    )
    .unwrap();

    let report = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 3);

    let lookup = |query: &str| search(&store, query, SearchOptions::default()).unwrap();

    let label_hits = lookup("linear");
    let mcp_hit = label_hits
        .iter()
        .find(|hit| hit.text.contains("[mcp_call:"))
        .expect("mcp_call line should be keyword-searchable via server_label");
    let mcp_text = &mcp_hit.text;
    assert!(
        mcp_text.starts_with("[mcp_call:linear/create_issue]"),
        "mcp_call line should render behind the distinct prefix with server_label folded in"
    );
    assert!(
        mcp_text.contains("JSONL ingest drops mcp_call lines"),
        "JSON-encoded arguments string should pass through unparsed so its tokens stay searchable"
    );
    assert!(
        mcp_text.contains("INGEST"),
        "team field inside the arguments JSON should also be searchable"
    );
    assert_eq!(mcp_hit.tool_call_id.as_deref(), Some("mcp_call_one"));
    assert_eq!(mcp_hit.tool_name.as_deref(), Some("create_issue"));
    assert!(!mcp_text.contains("[mcp_tool_use:"));

    assert!(
        lookup("JSONL drop")
            .iter()
            .any(|hit| hit.text.starts_with("[user]")),
        "user turn should still be searchable"
    );

    let stored_call_id: Option<String> = store
        .conn()
        .query_row(
            "SELECT tool_call_id FROM chunks WHERE id = ?",
            params![mcp_hit.chunk_id],
            |row| row.get(0),
        )
        .unwrap();
    assert_eq!(stored_call_id.as_deref(), Some("mcp_call_one"));
}
/// Exercises ingestion of two `mcp_call` items: one with an inline `output`, one
/// with a structured `error`.
///
/// Asserts that each chunk keeps the `[mcp_call:...]` call prefix where checked,
/// that the reply appears behind `[mcp_call_output:...]`, that the error appears
/// behind `[mcp_call_error:...]`, and that each chunk is bound to its own item id.
#[test]
fn ingests_openai_mcp_call_top_level_output_and_error_end_to_end() {
    let workdir = tempdir().unwrap();
    let mut store = Store::initialize(&workdir.path().join("store")).unwrap();
    let jsonl_path = workdir.path().join("mcp-call-output-error.jsonl");
    let fixture = concat!(
        "{\"role\":\"user\",\"content\":\"File the bug, then try the broken one.\"}\n",
        "{\"type\":\"mcp_call\",\"id\":\"mcp_ok_one\",",
        "\"server_label\":\"linear\",\"name\":\"create_issue\",",
        "\"arguments\":\"{\\\"title\\\":\\\"ingest drops mcp output\\\"}\",",
        "\"output\":\"Issue LIN-987 created in INGEST team.\"}\n",
        "{\"type\":\"mcp_call\",\"id\":\"mcp_err_one\",",
        "\"server_label\":\"linear\",\"name\":\"create_issue\",",
        "\"arguments\":\"{\\\"title\\\":\\\"forbidden bug\\\"}\",",
        "\"error\":{\"code\":\"E_AUTH\",\"message\":\"missing token gunpowder\"}}\n",
        "{\"role\":\"assistant\",\"content\":\"Filed one; the other was denied.\"}\n",
    );
    fs::write(&jsonl_path, fixture).unwrap();

    let report = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 4);

    // Successful call: located through a word from the reply body.
    let created_hits = search(&store, "created", SearchOptions::default()).unwrap();
    let success_hit = created_hits
        .iter()
        .find(|hit| hit.text.contains("[mcp_call_output:"))
        .expect("mcp_call output line should be keyword-searchable via the response body");
    let success_text = &success_hit.text;
    assert!(
        success_text.contains("[mcp_call:linear/create_issue]"),
        "success chunk should still carry the original call prefix line"
    );
    assert!(
        success_text.contains(
            "[mcp_call_output:linear/create_issue] Issue LIN-987 created in INGEST team."
        ),
        "success reply must render behind the typed output prefix on its own line; got: {}",
        success_text
    );
    assert_eq!(success_hit.tool_call_id.as_deref(), Some("mcp_ok_one"));

    // Failed call: located through a word from the error message.
    let gunpowder_hits = search(&store, "gunpowder", SearchOptions::default()).unwrap();
    let error_hit = gunpowder_hits
        .iter()
        .find(|hit| hit.text.contains("[mcp_call_error:"))
        .expect("mcp_call error line should be keyword-searchable via the error message");
    let error_text = &error_hit.text;
    assert!(
        error_text.contains(
            "[mcp_call_error:linear/create_issue] {\"code\":\"E_AUTH\",\"message\":\"missing token gunpowder\"}"
        ),
        "structured error must serialize with BTreeMap-sorted keys behind the typed error prefix; got: {}",
        error_text
    );
    assert_eq!(error_hit.tool_call_id.as_deref(), Some("mcp_err_one"));

    // Prefixes this test requires to be absent from the top-level mcp_call chunks.
    assert!(!success_text.contains("[mcp_tool_use:"));
    assert!(!success_text.contains("[mcp_tool_result"));
    assert!(!error_text.contains("[mcp_tool_error"));

    let user_hits = search(&store, "broken one", SearchOptions::default()).unwrap();
    assert!(
        user_hits.iter().any(|hit| hit.text.starts_with("[user]")),
        "user turn should still be searchable"
    );
    let assistant_hits = search(&store, "denied", SearchOptions::default()).unwrap();
    assert!(
        assistant_hits
            .iter()
            .any(|hit| hit.text.starts_with("[assistant]")),
        "assistant turn should still be searchable"
    );
}
/// Exercises an approval-gated `mcp_call`: an approval request, its response, and
/// the call that references the request through `approval_request_id`.
///
/// Asserts that the call chunk is findable by the approval request id, carries the
/// call, approval-reference and output lines behind their typed prefixes, and keeps
/// its own item id as `tool_call_id`; the request, response and prose turns must
/// remain searchable as well.
#[test]
fn ingests_openai_mcp_call_top_level_approval_request_id_end_to_end() {
    let workdir = tempdir().unwrap();
    let mut store = Store::initialize(&workdir.path().join("store")).unwrap();
    let jsonl_path = workdir.path().join("mcp-call-approval-link.jsonl");
    fs::write(
        &jsonl_path,
        concat!(
            "{\"role\":\"user\",\"content\":\"File the bug, with approval.\"}\n",
            "{\"type\":\"mcp_approval_request\",\"id\":\"mcpr_audit_chain\",",
            "\"server_label\":\"linear\",\"name\":\"create_issue\",",
            "\"arguments\":\"{\\\"title\\\":\\\"file gated bug\\\"}\"}\n",
            "{\"type\":\"mcp_approval_response\",",
            "\"approval_request_id\":\"mcpr_audit_chain\",",
            "\"approve\":true,\"reason\":\"operator confirmed manually\"}\n",
            "{\"type\":\"mcp_call\",\"id\":\"mcp_gated_call_one\",",
            "\"server_label\":\"linear\",\"name\":\"create_issue\",",
            "\"arguments\":\"{\\\"title\\\":\\\"file gated bug\\\"}\",",
            "\"approval_request_id\":\"mcpr_audit_chain\",",
            "\"output\":\"Issue LIN-555 created.\"}\n",
            "{\"role\":\"assistant\",\"content\":\"Filed after approval.\"}\n",
        ),
    )
    .unwrap();

    let report = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 5);

    let lookup = |query: &str| search(&store, query, SearchOptions::default()).unwrap();

    let chain_hits = lookup("mcpr_audit_chain");
    let call_hit = chain_hits
        .iter()
        .find(|hit| hit.text.contains("[mcp_call:"))
        .expect("gated mcp_call line should now be searchable by approval_request_id");
    let call_text = &call_hit.text;
    assert!(
        call_text.contains("[mcp_call:linear/create_issue] {\"title\":\"file gated bug\"}"),
        "call prefix must survive unchanged; got: {}",
        call_text
    );
    assert!(
        call_text.contains("[mcp_call_approval_request:linear/create_issue] mcpr_audit_chain"),
        "call must carry the approval-request reference behind the typed prefix; got: {}",
        call_text
    );
    assert!(
        call_text.contains("[mcp_call_output:linear/create_issue] Issue LIN-555 created."),
        "output line must still render after the call/approval pair; got: {}",
        call_text
    );
    // The standalone request prefix must not appear inside the call chunk.
    assert!(!call_text.contains("[mcp_approval_request:linear/create_issue]"));
    assert_eq!(
        call_hit.tool_call_id.as_deref(),
        Some("mcp_gated_call_one"),
        "mcp_call must keep its own mcp_... id as tool_call_id even with approval_request_id present"
    );

    assert!(
        lookup("file gated bug")
            .iter()
            .any(|hit| hit.text.contains("[mcp_approval_request:linear/create_issue]")),
        "approval request line should still be searchable via its arguments"
    );
    assert!(
        lookup("operator confirmed manually")
            .iter()
            .any(|hit| hit.text.contains("[mcp_approval_response:approved]")),
        "approval response line should still be searchable via its reason"
    );
    assert!(
        lookup("with approval")
            .iter()
            .any(|hit| hit.text.starts_with("[user]")),
        "user turn should still be searchable"
    );
    assert!(
        lookup("Filed after approval")
            .iter()
            .any(|hit| hit.text.starts_with("[assistant]")),
        "assistant turn should still be searchable"
    );
}
/// Exercises ingestion of a top-level `function_call` item.
///
/// Asserts that the chunk starts with `[function_call:docs_search]` rather than
/// `[tool:docs_search]`, that both fields of the JSON `arguments` string are
/// searchable, and that `call_id` (not the item `id`) is stored as `tool_call_id`.
#[test]
fn ingests_openai_function_call_top_level_line_end_to_end() {
    let workdir = tempdir().unwrap();
    let mut store = Store::initialize(&workdir.path().join("store")).unwrap();
    let jsonl_path = workdir.path().join("function-call-line.jsonl");
    let fixture = concat!(
        "{\"role\":\"user\",\"content\":\"Look up the lantern handbook entry on porches.\"}\n",
        "{\"type\":\"function_call\",\"id\":\"fc_item_one\",",
        "\"call_id\":\"call_responses_321\",\"name\":\"docs_search\",",
        "\"arguments\":\"{\\\"query\\\":\\\"lantern porches\\\",",
        "\\\"index\\\":\\\"handbook\\\"}\"}\n",
        "{\"role\":\"assistant\",\"content\":\"Hang it under the porch eaves.\"}\n",
    );
    fs::write(&jsonl_path, fixture).unwrap();

    let report = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 3);

    let name_hits = search(&store, "docs_search", SearchOptions::default()).unwrap();
    let call_hit = name_hits
        .iter()
        .find(|hit| hit.text.contains("[function_call:"))
        .expect("function_call line should be keyword-searchable by function name");
    let call_text = &call_hit.text;
    assert!(
        call_text.starts_with("[function_call:docs_search]"),
        "function_call line should render behind the distinct prefix, got: {}",
        call_text
    );
    assert!(
        !call_text.starts_with("[tool:docs_search]"),
        "raw-arguments fallback must be preempted, got: {}",
        call_text
    );
    assert!(!call_text.contains("[tool_use:"));

    // Each arguments field must be reachable by keyword on the call chunk.
    let query_field_hits = search(&store, "porches", SearchOptions::default()).unwrap();
    assert!(
        query_field_hits.iter().any(|hit| {
            hit.text.starts_with("[function_call:docs_search]")
                && hit.text.contains("\"query\":\"lantern porches\"")
        }),
        "arguments JSON token should pass through unparsed and be searchable"
    );
    let index_field_hits = search(&store, "handbook", SearchOptions::default()).unwrap();
    assert!(
        index_field_hits.iter().any(|hit| {
            hit.text.starts_with("[function_call:docs_search]")
                && hit.text.contains("\"index\":\"handbook\"")
        }),
        "second arguments field should also be searchable end-to-end"
    );

    assert_eq!(call_hit.tool_name.as_deref(), Some("docs_search"));
    assert_eq!(call_hit.tool_call_id.as_deref(), Some("call_responses_321"));

    let stored_call_id: Option<String> = store
        .conn()
        .query_row(
            "SELECT tool_call_id FROM chunks WHERE id = ?",
            params![call_hit.chunk_id],
            |row| row.get(0),
        )
        .unwrap();
    assert_eq!(stored_call_id.as_deref(), Some("call_responses_321"));
    // Implied by the equality above; kept to name the regression it guards against.
    assert_ne!(
        stored_call_id.as_deref(),
        Some("fc_item_one"),
        "response item id must not be promoted to tool_call_id"
    );

    let user_hits = search(&store, "handbook entry", SearchOptions::default()).unwrap();
    assert!(
        user_hits.iter().any(|hit| hit.text.starts_with("[user]")),
        "user turn should still be searchable"
    );
    let assistant_hits = search(&store, "porch eaves", SearchOptions::default()).unwrap();
    assert!(
        assistant_hits
            .iter()
            .any(|hit| hit.text.starts_with("[assistant]")),
        "assistant reply should still be searchable"
    );
}
/// Exercises ingestion of a successful top-level `mcp_list_tools` item.
///
/// Asserts that the chunk text is exactly the `[mcp_list_tools:linear]` prefix
/// followed by the comma-separated tool names, that each name is searchable, that
/// tool descriptions are not part of that chunk, and that no `tool_call_id` is
/// recorded for it.
#[test]
fn ingests_openai_mcp_list_tools_top_level_line_end_to_end() {
    let workdir = tempdir().unwrap();
    let mut store = Store::initialize(&workdir.path().join("store")).unwrap();
    let jsonl_path = workdir.path().join("mcp-list-tools-line.jsonl");
    fs::write(
        &jsonl_path,
        concat!(
            "{\"role\":\"user\",\"content\":\"What can the linear MCP server do?\"}\n",
            "{\"type\":\"mcp_list_tools\",\"id\":\"mcpl_item_one\",",
            "\"server_label\":\"linear\",",
            "\"tools\":[",
            "{\"name\":\"create_issue\",\"description\":\"Create a new issue\"},",
            "{\"name\":\"search_issues\",\"description\":\"Search existing issues\"},",
            "{\"name\":\"update_issue\"}",
            "]}\n",
            "{\"role\":\"assistant\",\"content\":\"It can create, search, and update issues.\"}\n",
        ),
    )
    .unwrap();

    let report = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 3);

    let lookup = |query: &str| search(&store, query, SearchOptions::default()).unwrap();

    let label_hits = lookup("linear");
    let list_hit = label_hits
        .iter()
        .find(|hit| hit.text.contains("[mcp_list_tools:"))
        .expect("mcp_list_tools line should be keyword-searchable via server_label");
    assert_eq!(
        list_hit.text,
        "[mcp_list_tools:linear] create_issue, search_issues, update_issue"
    );

    for tool in ["create_issue", "search_issues", "update_issue"] {
        let found_in_listing = lookup(tool)
            .iter()
            .any(|hit| hit.text.starts_with("[mcp_list_tools:linear]"));
        assert!(
            found_in_listing,
            "tool name '{tool}' should be keyword-searchable in the mcp_list_tools chunk"
        );
    }

    // A description phrase must not lead back to the listing chunk.
    let listing_matches_description = lookup("Create a new issue")
        .iter()
        .any(|hit| hit.text.contains("[mcp_list_tools:"));
    assert!(
        !listing_matches_description,
        "tool descriptions should not be inlined into the mcp_list_tools chunk"
    );
    assert!(!list_hit.text.contains("[mcp_call:"));
    assert!(!list_hit.text.contains("[mcp_tool_use:"));

    assert!(
        lookup("linear MCP server")
            .iter()
            .any(|hit| hit.text.starts_with("[user]")),
        "user turn should still be searchable"
    );
    assert!(
        lookup("create, search, and update")
            .iter()
            .any(|hit| hit.text.starts_with("[assistant]")),
        "assistant reply should still be searchable"
    );

    assert!(list_hit.tool_call_id.is_none());
    let stored_call_id: Option<String> = store
        .conn()
        .query_row(
            "SELECT tool_call_id FROM chunks WHERE id = ?",
            params![list_hit.chunk_id],
            |row| row.get(0),
        )
        .unwrap();
    assert!(
        stored_call_id.is_none(),
        "mcp_list_tools response item id must not be promoted to tool_call_id, got {stored_call_id:?}"
    );
}
/// Exercises a transcript with one successful and one failed `mcp_list_tools` item.
///
/// Asserts that the failed listing is stored as the bare `[mcp_list_tools:deepwiki]`
/// line followed by a `[mcp_list_tools_error:deepwiki]` line with the error message,
/// that the successful listing is rendered as `[mcp_list_tools:linear] create_issue`,
/// and that the failed chunk has no `tool_call_id`.
#[test]
fn ingests_openai_mcp_list_tools_failure_inline_error_end_to_end() {
    let workdir = tempdir().unwrap();
    let mut store = Store::initialize(&workdir.path().join("store")).unwrap();
    let jsonl_path = workdir.path().join("mcp-list-tools-error.jsonl");
    let fixture = concat!(
        "{\"role\":\"user\",\"content\":\"List the tools for the linear MCP server.\"}\n",
        "{\"type\":\"mcp_list_tools\",\"id\":\"mcpl_ok_pair\",",
        "\"server_label\":\"linear\",",
        "\"tools\":[{\"name\":\"create_issue\"}]}\n",
        "{\"type\":\"mcp_list_tools\",\"id\":\"mcpl_failed_pair\",",
        "\"server_label\":\"deepwiki\",",
        "\"error\":\"could not reach mcp server: connection refused\"}\n",
        "{\"role\":\"assistant\",\"content\":\"linear is listed; deepwiki failed.\"}\n",
    );
    fs::write(&jsonl_path, fixture).unwrap();

    let report = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 4);

    // Failed listing: found via its server label, then pinned to the exact text.
    let deepwiki_hits = search(&store, "deepwiki", SearchOptions::default()).unwrap();
    let err_hit = deepwiki_hits
        .iter()
        .find(|hit| hit.text.contains("[mcp_list_tools_error:"))
        .expect("mcp_list_tools error line should be keyword-searchable via server_label");
    assert_eq!(
        err_hit.text,
        "[mcp_list_tools:deepwiki]\n[mcp_list_tools_error:deepwiki] could not reach mcp server: connection refused"
    );

    let message_hits = search(&store, "connection refused", SearchOptions::default()).unwrap();
    assert!(
        message_hits
            .iter()
            .any(|hit| hit.text.contains("[mcp_list_tools_error:deepwiki]")),
        "error message text should be keyword-searchable in the mcp_list_tools_error chunk"
    );

    // Successful listing in the same transcript.
    let create_issue_hits = search(&store, "create_issue", SearchOptions::default()).unwrap();
    let ok_hit = create_issue_hits
        .iter()
        .find(|hit| hit.text.starts_with("[mcp_list_tools:linear]"))
        .expect("successful linear listing should still render via the existing prefix");
    assert_eq!(ok_hit.text, "[mcp_list_tools:linear] create_issue");

    assert!(err_hit.text.contains("[mcp_list_tools:deepwiki]"));
    assert!(!err_hit.text.contains("[mcp_call_error:"));
    assert!(err_hit.tool_call_id.is_none());

    let stored_err_call_id: Option<String> = store
        .conn()
        .query_row(
            "SELECT tool_call_id FROM chunks WHERE id = ?",
            params![err_hit.chunk_id],
            |row| row.get(0),
        )
        .unwrap();
    assert!(
        stored_err_call_id.is_none(),
        "mcp_list_tools failure response item id must not be promoted to tool_call_id, got {stored_err_call_id:?}"
    );

    let user_hits = search(&store, "List the tools", SearchOptions::default()).unwrap();
    assert!(
        user_hits.iter().any(|hit| hit.text.starts_with("[user]")),
        "user turn should still be searchable"
    );
    let assistant_hits = search(&store, "deepwiki failed", SearchOptions::default()).unwrap();
    assert!(
        assistant_hits
            .iter()
            .any(|hit| hit.text.starts_with("[assistant]")),
        "assistant reply should still be searchable"
    );
}
/// Exercises an `mcp_approval_request` / `mcp_approval_response` pair.
///
/// Asserts that the response chunk text is exactly
/// `[mcp_approval_response:approved] operator approved manually`, and that both
/// chunks share the request id as `tool_call_id`, in the search hits and in the
/// `chunks` table.
#[test]
fn ingests_openai_mcp_approval_request_and_response_pair_end_to_end() {
    let workdir = tempdir().unwrap();
    let mut store = Store::initialize(&workdir.path().join("store")).unwrap();
    let jsonl_path = workdir.path().join("mcp-approval-pair.jsonl");
    fs::write(
        &jsonl_path,
        concat!(
            "{\"role\":\"user\",\"content\":\"Please file a Linear issue when ready.\"}\n",
            "{\"type\":\"mcp_approval_request\",\"id\":\"mcpr_pair_one\",",
            "\"server_label\":\"linear\",\"name\":\"create_issue\",",
            "\"arguments\":\"{\\\"title\\\":\\\"approval pair end-to-end\\\"}\"}\n",
            "{\"type\":\"mcp_approval_response\",",
            "\"approval_request_id\":\"mcpr_pair_one\",",
            "\"approve\":true,\"reason\":\"operator approved manually\"}\n",
            "{\"role\":\"assistant\",\"content\":\"Filed the issue.\"}\n",
        ),
    )
    .unwrap();

    let report = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 4);

    let lookup = |query: &str| search(&store, query, SearchOptions::default()).unwrap();

    // Response chunk, found by the status word.
    let approved_hits = lookup("approved");
    let response_hit = approved_hits
        .iter()
        .find(|hit| hit.text.contains("[mcp_approval_response:"))
        .expect("mcp_approval_response line should be keyword-searchable by status");
    assert_eq!(
        response_hit.text,
        "[mcp_approval_response:approved] operator approved manually"
    );
    assert!(
        lookup("operator approved")
            .iter()
            .any(|hit| hit.text.starts_with("[mcp_approval_response:approved]")),
        "reason text should be searchable through the response chunk"
    );
    assert!(!response_hit.text.contains("[mcp_approval_request:"));
    assert!(!response_hit.text.contains("[mcp_call:"));

    // Request chunk, found by the tool name.
    let tool_name_hits = lookup("create_issue");
    let request_hit = tool_name_hits
        .iter()
        .find(|hit| hit.text.contains("[mcp_approval_request:"))
        .expect("mcp_approval_request line should be keyword-searchable by tool name");
    assert_eq!(
        request_hit.tool_call_id.as_deref(),
        Some("mcpr_pair_one"),
        "mcp_approval_request must promote its top-level id into tool_call_id"
    );
    assert_eq!(
        response_hit.tool_call_id.as_deref(),
        Some("mcpr_pair_one"),
        "mcp_approval_response must promote approval_request_id into tool_call_id"
    );
    assert_eq!(request_hit.tool_call_id, response_hit.tool_call_id);

    // Both rows must carry the shared id in the persisted column as well.
    for paired_hit in [request_hit, response_hit] {
        let stored_call_id: Option<String> = store
            .conn()
            .query_row(
                "SELECT tool_call_id FROM chunks WHERE id = ?",
                params![paired_hit.chunk_id],
                |row| row.get(0),
            )
            .unwrap();
        assert_eq!(stored_call_id.as_deref(), Some("mcpr_pair_one"));
    }

    assert!(
        lookup("Linear issue when ready")
            .iter()
            .any(|hit| hit.text.starts_with("[user]")),
        "user turn should still be searchable"
    );
    assert!(
        lookup("Filed the issue")
            .iter()
            .any(|hit| hit.text.starts_with("[assistant]")),
        "assistant turn should still be searchable"
    );
}
/// Exercises ingestion of a top-level `custom_tool_call` item.
///
/// Asserts that the chunk starts with `[custom_tool_call:search_docs]` rather than
/// `[tool:search_docs]`, that the free-form `input` is searchable, and that
/// `call_id` (not the item `id`) is stored as `tool_call_id`.
#[test]
fn ingests_openai_custom_tool_call_top_level_line_end_to_end() {
    let workdir = tempdir().unwrap();
    let mut store = Store::initialize(&workdir.path().join("store")).unwrap();
    let jsonl_path = workdir.path().join("custom-tool-call-line.jsonl");
    let fixture = concat!(
        "{\"role\":\"user\",\"content\":\"Find the handbook entry on porches.\"}\n",
        "{\"type\":\"custom_tool_call\",\"id\":\"ctc_item_one\",",
        "\"call_id\":\"call_responses_654\",\"name\":\"search_docs\",",
        "\"input\":\"lantern porches handbook\"}\n",
        "{\"role\":\"assistant\",\"content\":\"Hang it under the porch eaves.\"}\n",
    );
    fs::write(&jsonl_path, fixture).unwrap();

    let report = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 3);

    let name_hits = search(&store, "search_docs", SearchOptions::default()).unwrap();
    let call_hit = name_hits
        .iter()
        .find(|hit| hit.text.contains("[custom_tool_call:"))
        .expect("custom_tool_call line should be keyword-searchable by tool name");
    let call_text = &call_hit.text;
    assert!(
        call_text.starts_with("[custom_tool_call:search_docs]"),
        "custom_tool_call line should render behind the distinct prefix, got: {}",
        call_text
    );
    assert!(
        !call_text.starts_with("[tool:search_docs]"),
        "raw-input fallback must be preempted, got: {}",
        call_text
    );
    assert!(!call_text.contains("[function_call:"));
    assert!(!call_text.contains("[tool_use:"));

    let input_hits = search(&store, "porches", SearchOptions::default()).unwrap();
    assert!(
        input_hits.iter().any(|hit| {
            hit.text.starts_with("[custom_tool_call:search_docs]")
                && hit.text.contains("lantern porches handbook")
        }),
        "custom_tool_call input should be keyword-searchable end-to-end"
    );

    assert_eq!(call_hit.tool_name.as_deref(), Some("search_docs"));
    assert_eq!(call_hit.tool_call_id.as_deref(), Some("call_responses_654"));

    let stored_call_id: Option<String> = store
        .conn()
        .query_row(
            "SELECT tool_call_id FROM chunks WHERE id = ?",
            params![call_hit.chunk_id],
            |row| row.get(0),
        )
        .unwrap();
    assert_eq!(stored_call_id.as_deref(), Some("call_responses_654"));
    // Implied by the equality above; kept to name the regression it guards against.
    assert_ne!(
        stored_call_id.as_deref(),
        Some("ctc_item_one"),
        "response item id must not be promoted to tool_call_id"
    );

    let user_hits = search(&store, "handbook entry", SearchOptions::default()).unwrap();
    assert!(
        user_hits.iter().any(|hit| hit.text.starts_with("[user]")),
        "user turn should still be searchable"
    );
    let assistant_hits = search(&store, "porch eaves", SearchOptions::default()).unwrap();
    assert!(
        assistant_hits
            .iter()
            .any(|hit| hit.text.starts_with("[assistant]")),
        "assistant reply should still be searchable"
    );
}
/// End-to-end check for a top-level OpenAI `custom_tool_call_output` line.
///
/// The transcript pairs a `custom_tool_call` with its
/// `custom_tool_call_output`. The test asserts the output line is rendered
/// behind `[custom_tool_call_output]`, that its `call_id` is reported and
/// stored as `tool_call_id`, and that the request line carries the same id.
#[test]
fn ingests_openai_custom_tool_call_output_top_level_line_end_to_end() {
    let workdir = tempdir().unwrap();
    let mut store = Store::initialize(&workdir.path().join("store")).unwrap();
    let jsonl_path = workdir.path().join("custom-tool-call-output-line.jsonl");
    let session = concat!(
        "{\"role\":\"user\",\"content\":\"Find the handbook entry on porches.\"}\n",
        "{\"type\":\"custom_tool_call\",\"id\":\"ctc_item_one\",",
        "\"call_id\":\"call_responses_654\",\"name\":\"search_docs\",",
        "\"input\":\"lantern porches handbook\"}\n",
        "{\"type\":\"custom_tool_call_output\",\"id\":\"ctco_item_two\",",
        "\"call_id\":\"call_responses_654\",",
        "\"output\":\"Result: hang the lantern under the porch eaves to avoid drafts.\"}\n",
        "{\"role\":\"assistant\",\"content\":\"Hang it under the porch eaves.\"}\n",
    );
    fs::write(&jsonl_path, session).unwrap();

    let outcome = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(outcome.ingested.len(), 1);
    assert_eq!(outcome.ingested[0].chunks, 4);

    // Every query in this test runs with the default search options.
    let lookup = |query: &str| search(&store, query, SearchOptions::default()).unwrap();

    // A word that only occurs in the tool output must find the output line.
    let by_output_text = lookup("drafts");
    let reply = by_output_text
        .iter()
        .find(|hit| hit.text.contains("[custom_tool_call_output]"))
        .expect("custom_tool_call_output line should be keyword-searchable by output text");
    assert!(
        reply
            .text
            .starts_with("[custom_tool_call_output] Result: hang the lantern"),
        "reply line should render behind the distinct prefix, got: {}",
        reply.text
    );
    // Prefixes used by other tool-output shapes must not appear on this chunk.
    assert!(!reply.text.contains("[function_call_output]"));
    assert!(!reply.text.contains("[local_shell_call_output]"));
    assert!(!reply.text.contains("[tool_result]"));

    assert_eq!(reply.tool_call_id.as_deref(), Some("call_responses_654"));
    let persisted_call_id: Option<String> = store
        .conn()
        .query_row(
            "SELECT tool_call_id FROM chunks WHERE id = ?",
            params![reply.chunk_id],
            |r| r.get(0),
        )
        .unwrap();
    assert_eq!(persisted_call_id.as_deref(), Some("call_responses_654"));
    assert_ne!(
        persisted_call_id.as_deref(),
        Some("ctco_item_two"),
        "response item id must not be promoted to tool_call_id"
    );

    // The request line is still indexed and shares the reply's call id.
    let by_tool_name = lookup("search_docs");
    let call = by_tool_name
        .iter()
        .find(|hit| hit.text.starts_with("[custom_tool_call:"))
        .expect("matching custom_tool_call line should still surface");
    assert_eq!(
        call.tool_call_id.as_deref(),
        Some("call_responses_654"),
        "request and reply should share tool_call_id"
    );

    let by_prompt = lookup("handbook entry");
    assert!(
        by_prompt.iter().any(|hit| hit.text.starts_with("[user]")),
        "user turn should still be searchable"
    );
    let by_assistant = lookup("porch eaves");
    assert!(
        by_assistant
            .iter()
            .any(|hit| hit.text.starts_with("[assistant]")),
        "assistant reply should still be searchable"
    );
}
/// End-to-end check for `input_image` content blocks inside user turns.
///
/// Three image shapes are ingested: a plain `image_url`, a `data:` URL
/// carrying a base64 payload, and a `file_id` reference. The test asserts the
/// anchor text rendered for each one and that the base64 payload is not
/// findable through search.
#[test]
fn ingests_openai_responses_input_image_block_end_to_end() {
    let workdir = tempdir().unwrap();
    let mut store = Store::initialize(&workdir.path().join("store")).unwrap();
    let jsonl_path = workdir.path().join("input-image.jsonl");
    let session = concat!(
        "{\"role\":\"user\",\"content\":[",
        "{\"type\":\"input_text\",\"text\":\"identify the unique_lantern_token please\"},",
        "{\"type\":\"input_image\",",
        "\"image_url\":\"https://example.test/lantern_unique_url_token.png\",",
        "\"detail\":\"high\"}",
        "]}\n",
        "{\"role\":\"user\",\"content\":[",
        "{\"type\":\"input_image\",",
        "\"image_url\":\"data:image/jpeg;base64,BASE64NOISEBLOB_NEVER_INDEX_ME\"}",
        "]}\n",
        "{\"role\":\"user\",\"content\":[",
        "{\"type\":\"input_image\",\"file_id\":\"file_unique_handle_token\"}",
        "]}\n",
        "{\"role\":\"assistant\",\"content\":\"Got it.\"}\n",
    );
    fs::write(&jsonl_path, session).unwrap();

    let outcome = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(outcome.ingested.len(), 1);
    assert_eq!(
        outcome.ingested[0].chunks, 4,
        "three user turns plus assistant reply should land as chunks"
    );

    // Every query in this test runs with the default search options.
    let lookup = |query: &str| search(&store, query, SearchOptions::default()).unwrap();

    // Plain URL: the anchor and the neighbouring caption share one user chunk.
    let by_url = lookup("lantern_unique_url_token");
    assert_eq!(by_url.len(), 1);
    let url_chunk = &by_url[0];
    assert!(url_chunk.text.starts_with("[user]"));
    assert!(
        url_chunk
            .text
            .contains("[input_image] https://example.test/lantern_unique_url_token.png"),
        "input_image URL anchor should render behind the [input_image] prefix; got: {}",
        url_chunk.text
    );
    assert!(
        url_chunk.text.contains("identify the unique_lantern_token"),
        "input_text caption should join the input_image anchor in array order"
    );
    assert!(
        !url_chunk.text.contains("\"detail\""),
        "detail metadata must not leak into chunk text"
    );

    // data: URL: the media type is searchable, the payload is not.
    let by_media_type = lookup("jpeg");
    assert!(
        by_media_type
            .iter()
            .any(|hit| hit.text.contains("[input_image:image/jpeg]")),
        "data: URL should surface only its media_type"
    );
    let by_payload = lookup("BASE64NOISEBLOB_NEVER_INDEX_ME");
    assert!(
        by_payload.is_empty(),
        "base64 payload from data: URL must never appear in any chunk"
    );

    // file_id reference.
    let by_file_id = lookup("file_unique_handle_token");
    assert_eq!(by_file_id.len(), 1);
    assert!(
        by_file_id[0]
            .text
            .contains("[input_image:file] file_unique_handle_token"),
        "file_id anchor should render behind the :file suffix"
    );

    let by_reply = lookup("Got it");
    assert!(
        by_reply
            .iter()
            .any(|hit| hit.text.starts_with("[assistant]")),
        "assistant reply should still be searchable"
    );
}
/// End-to-end check for `input_file` content blocks inside user turns.
///
/// Three file shapes are ingested: a `file_url`, a `filename` accompanied by
/// base64 `file_data`, and a `file_id` reference. The test asserts the anchor
/// text rendered for each one and that the base64 payload is not findable
/// through search.
#[test]
fn ingests_openai_responses_input_file_block_end_to_end() {
    let workdir = tempdir().unwrap();
    let mut store = Store::initialize(&workdir.path().join("store")).unwrap();
    let jsonl_path = workdir.path().join("input-file.jsonl");
    let session = concat!(
        "{\"role\":\"user\",\"content\":[",
        "{\"type\":\"input_text\",\"text\":\"please summarize unique_lantern_caption\"},",
        "{\"type\":\"input_file\",",
        "\"file_url\":\"https://example.test/lantern_unique_url_doc.pdf\"}",
        "]}\n",
        "{\"role\":\"user\",\"content\":[",
        "{\"type\":\"input_file\",\"filename\":\"unique_lantern_filename.pdf\",",
        "\"file_data\":\"data:application/pdf;base64,BASE64NOISEBLOB_NEVER_INDEX_ME\"}",
        "]}\n",
        "{\"role\":\"user\",\"content\":[",
        "{\"type\":\"input_file\",\"file_id\":\"file_unique_doc_handle_token\"}",
        "]}\n",
        "{\"role\":\"assistant\",\"content\":\"Read it.\"}\n",
    );
    fs::write(&jsonl_path, session).unwrap();

    let outcome = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(outcome.ingested.len(), 1);
    assert_eq!(
        outcome.ingested[0].chunks, 4,
        "three user turns plus assistant reply should land as chunks"
    );

    // Every query in this test runs with the default search options.
    let lookup = |query: &str| search(&store, query, SearchOptions::default()).unwrap();

    // file_url: the anchor and the neighbouring caption share one user chunk.
    let by_url = lookup("lantern_unique_url_doc");
    assert_eq!(by_url.len(), 1);
    let url_chunk = &by_url[0];
    assert!(url_chunk.text.starts_with("[user]"));
    assert!(
        url_chunk
            .text
            .contains("[input_file] https://example.test/lantern_unique_url_doc.pdf"),
        "input_file URL anchor should render behind the [input_file] prefix; got: {}",
        url_chunk.text
    );
    assert!(
        url_chunk.text.contains("unique_lantern_caption"),
        "input_text caption should join the input_file anchor in array order"
    );

    // filename + file_data: the filename is indexed, the payload is not.
    let by_filename = lookup("unique_lantern_filename");
    assert_eq!(by_filename.len(), 1);
    let filename_chunk = &by_filename[0];
    assert!(
        filename_chunk
            .text
            .contains("[input_file] unique_lantern_filename.pdf"),
        "filename anchor should render behind the [input_file] prefix; got: {}",
        filename_chunk.text
    );
    let by_payload = lookup("BASE64NOISEBLOB_NEVER_INDEX_ME");
    assert!(
        by_payload.is_empty(),
        "base64 payload from file_data must never appear in any chunk"
    );

    // file_id reference.
    let by_file_id = lookup("file_unique_doc_handle_token");
    assert_eq!(by_file_id.len(), 1);
    assert!(
        by_file_id[0]
            .text
            .contains("[input_file:file] file_unique_doc_handle_token"),
        "file_id anchor should render behind the :file suffix"
    );

    let by_reply = lookup("Read it");
    assert!(
        by_reply
            .iter()
            .any(|hit| hit.text.starts_with("[assistant]")),
        "assistant reply should still be searchable"
    );
}
/// End-to-end check for `input_audio` content blocks inside user turns.
///
/// Covers an inline audio block (base64 `data` plus a `transcript`) and a
/// `file_id` reference. The test asserts that the transcript and the file id
/// are searchable behind their prefixes and that the base64 audio payload is
/// not findable through search.
#[test]
fn ingests_openai_responses_input_audio_block_end_to_end() {
    let workdir = tempdir().unwrap();
    let mut store = Store::initialize(&workdir.path().join("store")).unwrap();
    let jsonl_path = workdir.path().join("input-audio.jsonl");
    let session = concat!(
        "{\"role\":\"user\",\"content\":[",
        "{\"type\":\"input_text\",\"text\":\"transcribe unique_lantern_audio_caption\"},",
        "{\"type\":\"input_audio\",",
        "\"input_audio\":{\"data\":\"BASE64AUDIOBLOB_NEVER_INDEX_ME\",\"format\":\"mp3\"},",
        "\"transcript\":\"lantern_unique_transcript_token saying hello\"}",
        "]}\n",
        "{\"role\":\"user\",\"content\":[",
        "{\"type\":\"input_audio\",\"file_id\":\"file_unique_audio_handle_token\"}",
        "]}\n",
        "{\"role\":\"assistant\",\"content\":\"Transcribed.\"}\n",
    );
    fs::write(&jsonl_path, session).unwrap();

    let outcome = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(outcome.ingested.len(), 1);
    assert_eq!(
        outcome.ingested[0].chunks, 3,
        "two user turns plus assistant reply should land as chunks"
    );

    // Every query in this test runs with the default search options.
    let lookup = |query: &str| search(&store, query, SearchOptions::default()).unwrap();

    // Inline audio: the transcript and the neighbouring caption share one chunk.
    let by_transcript = lookup("lantern_unique_transcript_token");
    assert_eq!(by_transcript.len(), 1);
    let spoken = &by_transcript[0];
    assert!(spoken.text.starts_with("[user]"));
    assert!(
        spoken
            .text
            .contains("[input_audio] lantern_unique_transcript_token saying hello"),
        "input_audio transcript should render behind the [input_audio] prefix; got: {}",
        spoken.text
    );
    assert!(
        spoken.text.contains("unique_lantern_audio_caption"),
        "input_text caption should join the input_audio anchor in array order"
    );

    let by_payload = lookup("BASE64AUDIOBLOB_NEVER_INDEX_ME");
    assert!(
        by_payload.is_empty(),
        "base64 audio payload from input_audio.data must never appear in any chunk"
    );

    // file_id reference.
    let by_file_id = lookup("file_unique_audio_handle_token");
    assert_eq!(by_file_id.len(), 1);
    assert!(
        by_file_id[0]
            .text
            .contains("[input_audio:file] file_unique_audio_handle_token"),
        "file_id anchor should render behind the :file suffix"
    );

    let by_reply = lookup("Transcribed");
    assert!(
        by_reply
            .iter()
            .any(|hit| hit.text.starts_with("[assistant]")),
        "assistant reply should still be searchable"
    );
}
/// End-to-end check for `output_audio` content blocks inside assistant turns.
///
/// One assistant turn carries a transcript next to a text block; a second
/// carries only base64 `data` and a `format`. The test asserts the transcript
/// and the format are searchable behind `[output_audio…]` anchors and that the
/// base64 payload is not findable through search.
#[test]
fn ingests_openai_responses_output_audio_block_end_to_end() {
    let workdir = tempdir().unwrap();
    let mut store = Store::initialize(&workdir.path().join("store")).unwrap();
    let jsonl_path = workdir.path().join("output-audio.jsonl");
    let session = concat!(
        "{\"role\":\"user\",\"content\":\"Read me a unique_lantern_question.\"}\n",
        "{\"role\":\"assistant\",\"content\":[",
        "{\"type\":\"text\",\"text\":\"unique_lantern_caption here is your reply\"},",
        "{\"type\":\"output_audio\",",
        "\"data\":\"BASE64OUTAUDIO_NEVER_INDEX_ME\",\"format\":\"wav\",",
        "\"transcript\":\"lantern_unique_out_transcript_token spoken aloud\"}",
        "]}\n",
        "{\"role\":\"assistant\",\"content\":[",
        "{\"type\":\"output_audio\",",
        "\"data\":\"BASE64OUTAUDIO_NEVER_INDEX_ME\",\"format\":\"lantern_unique_pcmcodec\"}",
        "]}\n",
    );
    fs::write(&jsonl_path, session).unwrap();

    let outcome = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(outcome.ingested.len(), 1);
    assert_eq!(
        outcome.ingested[0].chunks, 3,
        "user prompt plus two assistant turns should land as chunks"
    );

    // Every query in this test runs with the default search options.
    let lookup = |query: &str| search(&store, query, SearchOptions::default()).unwrap();

    // Transcript-bearing turn: the transcript and the text block share a chunk.
    let by_transcript = lookup("lantern_unique_out_transcript_token");
    assert_eq!(by_transcript.len(), 1);
    let spoken = &by_transcript[0];
    assert!(spoken.text.starts_with("[assistant]"));
    assert!(
        spoken
            .text
            .contains("[output_audio] lantern_unique_out_transcript_token spoken aloud"),
        "output_audio transcript should render behind the [output_audio] prefix; got: {}",
        spoken.text
    );
    assert!(
        spoken.text.contains("unique_lantern_caption"),
        "text caption should join the output_audio anchor in array order"
    );

    let by_payload = lookup("BASE64OUTAUDIO_NEVER_INDEX_ME");
    assert!(
        by_payload.is_empty(),
        "base64 audio payload from output_audio.data must never appear in any chunk"
    );

    // Transcript-less turn: the format value is what gets indexed.
    let by_format = lookup("lantern_unique_pcmcodec");
    assert_eq!(by_format.len(), 1);
    let codec_chunk = &by_format[0];
    assert!(
        codec_chunk
            .text
            .contains("[output_audio:lantern_unique_pcmcodec]"),
        "flat format anchor should render behind the codec-scoped suffix; got: {}",
        codec_chunk.text
    );

    let by_prompt = lookup("unique_lantern_question");
    assert!(
        by_prompt.iter().any(|hit| hit.text.starts_with("[user]")),
        "user prompt should still be searchable"
    );
}
/// End-to-end check for `annotations` attached to an assistant `output_text`
/// block.
///
/// A `url_citation` and a `file_citation` are attached to one block. The test
/// asserts that the url, title, file id and filename are each searchable, that
/// the two citations are rendered behind their own prefixes, and that the
/// citation hits have the same text as the hit for the body.
#[test]
fn output_text_url_citation_annotations_survive_end_to_end_search() {
    let workdir = tempdir().unwrap();
    let mut store = Store::initialize(&workdir.path().join("store")).unwrap();
    let jsonl_path = workdir.path().join("session.jsonl");
    let session = concat!(
        "{\"role\":\"user\",\"content\":\"lantern_unique_citations_question\"}\n",
        "{\"role\":\"assistant\",\"content\":[",
        "{\"type\":\"output_text\",\"text\":\"lantern_unique_citations_body\",",
        "\"annotations\":[",
        "{\"type\":\"url_citation\",",
        "\"url\":\"https://example.com/lantern_unique_citations_url\",",
        "\"title\":\"lantern_unique_citations_title\",",
        "\"start_index\":0,\"end_index\":40},",
        "{\"type\":\"file_citation\",",
        "\"file_id\":\"file_lantern_unique_citations_fileid\",",
        "\"filename\":\"lantern_unique_citations_filename.pdf\",\"index\":1}",
        "]}",
        "]}\n",
    );
    fs::write(&jsonl_path, session).unwrap();

    let outcome = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(outcome.ingested.len(), 1);
    assert_eq!(
        outcome.ingested[0].chunks, 2,
        "user prompt and assistant output_text should land as 2 chunks"
    );

    // Every query in this test runs with the default search options.
    let lookup = |query: &str| search(&store, query, SearchOptions::default()).unwrap();

    let by_body = lookup("lantern_unique_citations_body");
    assert_eq!(by_body.len(), 1);
    let body = &by_body[0];
    assert!(body.text.starts_with("[assistant]"));
    assert!(body.text.contains("lantern_unique_citations_body"));

    // url_citation: url and title are both indexed.
    let by_url = lookup("lantern_unique_citations_url");
    assert_eq!(by_url.len(), 1);
    let url_chunk = &by_url[0];
    assert!(
        url_chunk
            .text
            .contains("[url_citation] https://example.com/lantern_unique_citations_url lantern_unique_citations_title"),
        "url_citation should render behind the [url_citation] prefix with title; got: {}",
        url_chunk.text
    );
    assert_eq!(lookup("lantern_unique_citations_title").len(), 1);

    // file_citation: file id and filename are both indexed.
    let by_file_id = lookup("file_lantern_unique_citations_fileid");
    assert_eq!(by_file_id.len(), 1);
    let file_chunk = &by_file_id[0];
    assert!(
        file_chunk
            .text
            .contains("[file_citation] file_lantern_unique_citations_fileid lantern_unique_citations_filename.pdf"),
        "file_citation should render behind the [file_citation] prefix with filename; got: {}",
        file_chunk.text
    );
    assert_eq!(lookup("lantern_unique_citations_filename").len(), 1);

    // The citation hits carry the same text as the body hit.
    assert_eq!(body.text, url_chunk.text);
    assert_eq!(body.text, file_chunk.text);

    let by_prompt = lookup("lantern_unique_citations_question");
    assert!(by_prompt.iter().any(|hit| hit.text.starts_with("[user]")));
}
/// End-to-end check for a top-level `output_text` line (no enclosing message)
/// that carries `annotations`.
///
/// Asserts that the body, the `url_citation` url and title, and the
/// `file_citation` file id and filename are each searchable, and that the
/// citation hits have the same text as the hit for the body.
#[test]
fn top_level_output_text_line_with_annotations_survives_end_to_end_search() {
    let workdir = tempdir().unwrap();
    let mut store = Store::initialize(&workdir.path().join("store")).unwrap();
    let jsonl_path = workdir.path().join("session.jsonl");
    let session = concat!(
        "{\"role\":\"user\",\"content\":\"lantern_unique_top_output_text_question\"}\n",
        "{\"type\":\"output_text\",\"text\":\"lantern_unique_top_output_text_body\",",
        "\"annotations\":[",
        "{\"type\":\"url_citation\",",
        "\"url\":\"https://example.com/lantern_unique_top_output_text_url\",",
        "\"title\":\"lantern_unique_top_output_text_title\",",
        "\"start_index\":0,\"end_index\":40},",
        "{\"type\":\"file_citation\",",
        "\"file_id\":\"file_lantern_unique_top_output_text_fileid\",",
        "\"filename\":\"lantern_unique_top_output_text_filename.pdf\",\"index\":1}",
        "]}\n",
    );
    fs::write(&jsonl_path, session).unwrap();

    let outcome = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(outcome.ingested.len(), 1);
    assert_eq!(
        outcome.ingested[0].chunks, 2,
        "user prompt and top-level output_text line should land as 2 chunks"
    );

    // Every query in this test runs with the default search options.
    let lookup = |query: &str| search(&store, query, SearchOptions::default()).unwrap();

    let by_body = lookup("lantern_unique_top_output_text_body");
    assert_eq!(by_body.len(), 1);
    let body = &by_body[0];
    assert!(body.text.contains("lantern_unique_top_output_text_body"));

    // url_citation: url and title are both indexed.
    let by_url = lookup("lantern_unique_top_output_text_url");
    assert_eq!(by_url.len(), 1);
    let url_chunk = &by_url[0];
    assert!(
        url_chunk.text.contains(
            "[url_citation] https://example.com/lantern_unique_top_output_text_url lantern_unique_top_output_text_title"
        ),
        "url_citation should render behind the [url_citation] prefix with title; got: {}",
        url_chunk.text
    );
    assert_eq!(lookup("lantern_unique_top_output_text_title").len(), 1);

    // file_citation: file id and filename are both indexed.
    let by_file_id = lookup("file_lantern_unique_top_output_text_fileid");
    assert_eq!(by_file_id.len(), 1);
    let file_chunk = &by_file_id[0];
    assert!(
        file_chunk.text.contains(
            "[file_citation] file_lantern_unique_top_output_text_fileid lantern_unique_top_output_text_filename.pdf"
        ),
        "file_citation should render behind the [file_citation] prefix with filename; got: {}",
        file_chunk.text
    );
    assert_eq!(lookup("lantern_unique_top_output_text_filename").len(), 1);

    // The citation hits carry the same text as the body hit.
    assert_eq!(body.text, url_chunk.text);
    assert_eq!(body.text, file_chunk.text);

    let by_prompt = lookup("lantern_unique_top_output_text_question");
    assert!(by_prompt.iter().any(|hit| hit.text.starts_with("[user]")));
}
/// End-to-end check for Anthropic-style `citations` on an assistant `text`
/// block.
///
/// A `char_location` and a `page_location` citation are attached to one
/// block. The test asserts that each citation's document title and cited text
/// are searchable behind a `[citation:TYPE]` prefix and that the citation hits
/// have the same text as the hit for the body.
#[test]
fn anthropic_text_citations_survive_end_to_end_search() {
    let workdir = tempdir().unwrap();
    let mut store = Store::initialize(&workdir.path().join("store")).unwrap();
    let jsonl_path = workdir.path().join("session.jsonl");
    let session = concat!(
        "{\"role\":\"user\",\"content\":\"lantern_unique_anthropic_citations_question\"}\n",
        "{\"role\":\"assistant\",\"content\":[",
        "{\"type\":\"text\",\"text\":\"lantern_unique_anthropic_citations_body\",",
        "\"citations\":[",
        "{\"type\":\"char_location\",",
        "\"document_title\":\"lantern_unique_anthropic_citations_doctitle\",",
        "\"cited_text\":\"lantern_unique_anthropic_citations_quote\",",
        "\"document_index\":0,",
        "\"start_char_index\":100,\"end_char_index\":200},",
        "{\"type\":\"page_location\",",
        "\"document_title\":\"lantern_unique_anthropic_citations_pagedoc\",",
        "\"cited_text\":\"lantern_unique_anthropic_citations_pagequote\",",
        "\"document_index\":1,",
        "\"start_page_number\":3,\"end_page_number\":4}",
        "]}",
        "]}\n",
    );
    fs::write(&jsonl_path, session).unwrap();

    let outcome = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(outcome.ingested.len(), 1);
    assert_eq!(
        outcome.ingested[0].chunks, 2,
        "user prompt and assistant text-with-citations should land as 2 chunks"
    );

    // Every query in this test runs with the default search options.
    let lookup = |query: &str| search(&store, query, SearchOptions::default()).unwrap();

    let by_body = lookup("lantern_unique_anthropic_citations_body");
    assert_eq!(by_body.len(), 1);
    let body = &by_body[0];
    assert!(body.text.starts_with("[assistant]"));
    assert!(body.text.contains("lantern_unique_anthropic_citations_body"));

    // char_location citation: document title and cited text.
    let by_doc_title = lookup("lantern_unique_anthropic_citations_doctitle");
    assert_eq!(by_doc_title.len(), 1);
    let char_chunk = &by_doc_title[0];
    assert!(
        char_chunk.text.contains(
            "[citation:char_location] lantern_unique_anthropic_citations_doctitle lantern_unique_anthropic_citations_quote"
        ),
        "char_location citation should render behind the [citation:char_location] prefix with title + cited_text; got: {}",
        char_chunk.text
    );
    assert_eq!(lookup("lantern_unique_anthropic_citations_quote").len(), 1);

    // page_location citation: document title and cited text.
    let by_page_doc = lookup("lantern_unique_anthropic_citations_pagedoc");
    assert_eq!(by_page_doc.len(), 1);
    let page_chunk = &by_page_doc[0];
    assert!(
        page_chunk.text.contains(
            "[citation:page_location] lantern_unique_anthropic_citations_pagedoc lantern_unique_anthropic_citations_pagequote"
        ),
        "page_location citation should render behind the [citation:page_location] prefix; got: {}",
        page_chunk.text
    );
    assert_eq!(
        lookup("lantern_unique_anthropic_citations_pagequote").len(),
        1
    );

    // The citation hits carry the same text as the body hit.
    assert_eq!(body.text, char_chunk.text);
    assert_eq!(body.text, page_chunk.text);

    let by_prompt = lookup("lantern_unique_anthropic_citations_question");
    assert!(by_prompt.iter().any(|hit| hit.text.starts_with("[user]")));
}
/// End-to-end check for the `web_search_result_location` and
/// `search_result_location` citation types on an assistant `text` block.
///
/// Asserts that url/source, title and cited text of each citation are
/// searchable behind a `[citation:TYPE]` prefix, that the `encrypted_index`
/// value is not findable through search, and that the citation hits have the
/// same text as the hit for the body.
#[test]
fn anthropic_web_and_search_result_citations_survive_end_to_end_search() {
    let workdir = tempdir().unwrap();
    let mut store = Store::initialize(&workdir.path().join("store")).unwrap();
    let jsonl_path = workdir.path().join("session.jsonl");
    let session = concat!(
        "{\"role\":\"user\",\"content\":\"lantern_unique_anthropic_web_citations_question\"}\n",
        "{\"role\":\"assistant\",\"content\":[",
        "{\"type\":\"text\",\"text\":\"lantern_unique_anthropic_web_citations_body\",",
        "\"citations\":[",
        "{\"type\":\"web_search_result_location\",",
        "\"url\":\"https://example.com/lantern_unique_anthropic_webcite_url\",",
        "\"title\":\"lantern_unique_anthropic_webcite_title\",",
        "\"cited_text\":\"lantern_unique_anthropic_webcite_quote\",",
        "\"encrypted_index\":\"lantern_unique_anthropic_webcite_opaqueblob\"},",
        "{\"type\":\"search_result_location\",",
        "\"source\":\"lantern_unique_anthropic_searchcite_source\",",
        "\"title\":\"lantern_unique_anthropic_searchcite_title\",",
        "\"cited_text\":\"lantern_unique_anthropic_searchcite_quote\",",
        "\"start_block_index\":3,\"end_block_index\":4}",
        "]}",
        "]}\n",
    );
    fs::write(&jsonl_path, session).unwrap();

    let outcome = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(outcome.ingested.len(), 1);
    assert_eq!(
        outcome.ingested[0].chunks, 2,
        "user prompt + assistant text-with-citations should land as 2 chunks"
    );

    // Every query in this test runs with the default search options.
    let lookup = |query: &str| search(&store, query, SearchOptions::default()).unwrap();

    let by_body = lookup("lantern_unique_anthropic_web_citations_body");
    assert_eq!(by_body.len(), 1);
    let body = &by_body[0];
    assert!(body.text.starts_with("[assistant]"));

    // web_search_result_location: url, title and cited text are indexed.
    let by_web_url = lookup("lantern_unique_anthropic_webcite_url");
    assert_eq!(by_web_url.len(), 1);
    let web_chunk = &by_web_url[0];
    assert!(
        web_chunk.text.contains(
            "[citation:web_search_result_location] https://example.com/lantern_unique_anthropic_webcite_url lantern_unique_anthropic_webcite_title lantern_unique_anthropic_webcite_quote"
        ),
        "web_search_result_location should render behind its distinct prefix with url + title + cited_text; got: {}",
        web_chunk.text
    );
    assert_eq!(lookup("lantern_unique_anthropic_webcite_title").len(), 1);
    assert_eq!(lookup("lantern_unique_anthropic_webcite_quote").len(), 1);

    // The encrypted_index value must not be findable.
    let by_opaque = lookup("lantern_unique_anthropic_webcite_opaqueblob");
    assert!(
        by_opaque.is_empty(),
        "encrypted_index must not leak into chunk text; got hits: {:?}",
        by_opaque
            .iter()
            .map(|hit| hit.text.as_str())
            .collect::<Vec<_>>()
    );

    // search_result_location: source, title and cited text are indexed.
    let by_source = lookup("lantern_unique_anthropic_searchcite_source");
    assert_eq!(by_source.len(), 1);
    let source_chunk = &by_source[0];
    assert!(
        source_chunk.text.contains(
            "[citation:search_result_location] lantern_unique_anthropic_searchcite_source lantern_unique_anthropic_searchcite_title lantern_unique_anthropic_searchcite_quote"
        ),
        "search_result_location should render behind its distinct prefix with source + title + cited_text; got: {}",
        source_chunk.text
    );

    // The citation hits carry the same text as the body hit.
    assert_eq!(body.text, web_chunk.text);
    assert_eq!(body.text, source_chunk.text);

    let by_prompt = lookup("lantern_unique_anthropic_web_citations_question");
    assert!(by_prompt.iter().any(|hit| hit.text.starts_with("[user]")));
}
/// End-to-end check for a `container_file_citation` annotation on an
/// assistant `output_text` block.
///
/// Asserts that the file id and filename are searchable behind the
/// `[container_file_citation]` prefix, that the `container_id` is neither
/// searchable nor present in the chunk text, and that the plain
/// `[file_citation] ` prefix is not used for this annotation.
#[test]
fn output_text_container_file_citation_survives_end_to_end_search() {
    let workdir = tempdir().unwrap();
    let mut store = Store::initialize(&workdir.path().join("store")).unwrap();
    let jsonl_path = workdir.path().join("session.jsonl");
    let session = concat!(
        "{\"role\":\"user\",\"content\":\"lantern_unique_cfile_question\"}\n",
        "{\"role\":\"assistant\",\"content\":[",
        "{\"type\":\"output_text\",\"text\":\"lantern_unique_cfile_body\",",
        "\"annotations\":[",
        "{\"type\":\"container_file_citation\",",
        "\"container_id\":\"cntr_lantern_unique_cfile_sandbox\",",
        "\"file_id\":\"cfile_lantern_unique_cfile_fileid\",",
        "\"filename\":\"lantern_unique_cfile_filename.png\",",
        "\"start_index\":0,\"end_index\":24}",
        "]}",
        "]}\n",
    );
    fs::write(&jsonl_path, session).unwrap();

    let outcome = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(outcome.ingested.len(), 1);
    assert_eq!(
        outcome.ingested[0].chunks, 2,
        "user prompt and assistant output_text should land as 2 chunks"
    );

    // Every query in this test runs with the default search options.
    let lookup = |query: &str| search(&store, query, SearchOptions::default()).unwrap();

    let by_body = lookup("lantern_unique_cfile_body");
    assert_eq!(by_body.len(), 1);
    let body = &by_body[0];
    assert!(body.text.starts_with("[assistant]"));
    assert!(body.text.contains("lantern_unique_cfile_body"));

    // File id and filename are both indexed behind the citation prefix.
    let by_file_id = lookup("cfile_lantern_unique_cfile_fileid");
    assert_eq!(by_file_id.len(), 1);
    let citation_chunk = &by_file_id[0];
    assert!(
        citation_chunk.text.contains(
            "[container_file_citation] cfile_lantern_unique_cfile_fileid lantern_unique_cfile_filename.png"
        ),
        "container_file_citation should render behind the [container_file_citation] prefix with both anchors; got: {}",
        citation_chunk.text
    );
    assert_eq!(lookup("lantern_unique_cfile_filename").len(), 1);
    assert_eq!(body.text, citation_chunk.text);

    // The container id must not be findable or present in the chunk text.
    let by_container = lookup("cntr_lantern_unique_cfile_sandbox");
    assert!(
        by_container.is_empty(),
        "container_id must never appear in any chunk; got hits: {:?}",
        by_container
            .iter()
            .map(|hit| &hit.text)
            .collect::<Vec<_>>()
    );
    assert!(!body.text.contains("cntr_lantern_unique_cfile_sandbox"));
    assert!(!body.text.contains("[file_citation] "));

    let by_prompt = lookup("lantern_unique_cfile_question");
    assert!(by_prompt.iter().any(|hit| hit.text.starts_with("[user]")));
}
/// End-to-end check for a `search_result` content block inside a user turn.
///
/// Asserts that the block's source, title and nested excerpt are all
/// searchable, that they land in the same `[user]` chunk as the surrounding
/// prose behind a `[search_result]` prefix, and that the chunk's stored
/// `tool_call_id` is NULL.
#[test]
fn ingests_anthropic_search_result_content_block_end_to_end() {
    let workdir = tempdir().unwrap();
    let mut store = Store::initialize(&workdir.path().join("store")).unwrap();
    let jsonl_path = workdir.path().join("search-result-block.jsonl");
    let session = concat!(
        "{\"role\":\"user\",\"content\":[",
        "{\"type\":\"text\",\"text\":\"what does the lantern_unique_searchres_doc say?\"},",
        "{\"type\":\"search_result\",",
        "\"source\":\"https://docs.example/lantern_unique_searchres_src\",",
        "\"title\":\"Lantern unique searchres title\",",
        "\"content\":[",
        "{\"type\":\"text\",\"text\":\"lantern_unique_searchres_excerpt body\"}",
        "]}",
        "]}\n",
        "{\"role\":\"assistant\",\"content\":\"answer based on the cited doc\"}\n",
    );
    fs::write(&jsonl_path, session).unwrap();

    let outcome = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(outcome.ingested.len(), 1);
    assert_eq!(outcome.ingested[0].chunks, 2);

    // Every query in this test runs with the default search options.
    let lookup = |query: &str| search(&store, query, SearchOptions::default()).unwrap();

    // Excerpt, source and title each resolve to exactly one hit.
    let by_excerpt = lookup("lantern_unique_searchres_excerpt");
    assert_eq!(by_excerpt.len(), 1);
    let excerpt = &by_excerpt[0];
    assert!(
        excerpt.text.contains("[search_result]"),
        "matched excerpt should render behind the distinct search_result prefix; got {}",
        excerpt.text
    );

    let by_source = lookup("lantern_unique_searchres_src");
    assert_eq!(by_source.len(), 1);
    let source = &by_source[0];
    assert!(source.text.contains("[search_result]"));

    let by_title = lookup("searchres title");
    assert_eq!(by_title.len(), 1);
    let title = &by_title[0];
    assert!(title.text.contains("[search_result]"));

    // All three hits carry identical text.
    assert_eq!(excerpt.text, source.text);
    assert_eq!(excerpt.text, title.text);

    // Layout of that text: role prefix, user prose, then header + excerpt.
    assert!(excerpt.text.starts_with("[user] "));
    assert!(
        excerpt
            .text
            .contains("what does the lantern_unique_searchres_doc say?"),
        "surrounding user prose should still land in the same chunk; got {}",
        excerpt.text
    );
    assert!(
        excerpt.text.contains(
            "[search_result] https://docs.example/lantern_unique_searchres_src Lantern unique searchres title\nlantern_unique_searchres_excerpt body"
        ),
        "header + excerpt should join with a newline in array order; got {}",
        excerpt.text
    );

    let by_reply = lookup("answer based on");
    assert!(
        by_reply
            .iter()
            .any(|hit| hit.text.starts_with("[assistant]"))
    );

    let persisted_call_id: Option<String> = store
        .conn()
        .query_row(
            "SELECT tool_call_id FROM chunks WHERE id = ?",
            params![excerpt.chunk_id],
            |r| r.get(0),
        )
        .unwrap();
    assert!(
        persisted_call_id.is_none(),
        "search_result is not a tool call — must not populate tool_call_id"
    );
}
/// End-to-end check for the legacy Chat Completions `function_call` message
/// field and its `role: "function"` reply.
///
/// Asserts that the call is rendered as `[assistant] [tool_use:NAME]` with the
/// JSON-encoded arguments unchanged, that the reply is rendered under
/// `[function:NAME]`, and that the call chunk stores the tool name with a NULL
/// `tool_call_id`.
#[test]
fn ingests_legacy_openai_chatcompletions_function_call_field_end_to_end() {
    let workdir = tempdir().unwrap();
    let mut store = Store::initialize(&workdir.path().join("store")).unwrap();
    let jsonl_path = workdir.path().join("legacy-function-call.jsonl");
    let session = concat!(
        "{\"role\":\"user\",\"content\":\"What is the weather in NYC?\"}\n",
        "{\"role\":\"assistant\",\"content\":null,",
        "\"function_call\":{\"name\":\"get_weather\",",
        "\"arguments\":\"{\\\"location\\\":\\\"unique_legacy_fc_city\\\"}\"}}\n",
        "{\"role\":\"function\",\"name\":\"get_weather\",",
        "\"content\":\"unique_legacy_fc_reply: 72F sunny\"}\n",
        "{\"role\":\"assistant\",\"content\":\"It is 72F and sunny in NYC.\"}\n",
    );
    fs::write(&jsonl_path, session).unwrap();

    let outcome = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(outcome.ingested.len(), 1);
    assert_eq!(outcome.ingested[0].chunks, 4);

    // Every query in this test runs with the default search options.
    let lookup = |query: &str| search(&store, query, SearchOptions::default()).unwrap();

    // The assistant turn that carries the function_call field.
    let by_argument = lookup("unique_legacy_fc_city");
    assert_eq!(by_argument.len(), 1);
    let call = &by_argument[0];
    assert!(
        call.text.contains("[tool_use:get_weather]"),
        "legacy function_call should render behind [tool_use:NAME] prefix; got: {}",
        call.text
    );
    assert!(
        call.text
            .contains("{\"location\":\"unique_legacy_fc_city\"}"),
        "JSON-encoded arguments should pass through unchanged; got: {}",
        call.text
    );
    assert!(
        call.text.starts_with("[assistant] [tool_use:get_weather]"),
        "call chunk should be `[assistant] [tool_use:...]`; got: {}",
        call.text
    );
    assert_eq!(call.tool_name.as_deref(), Some("get_weather"));
    assert_eq!(call.tool_call_id.as_deref(), None);

    // The role=function reply line.
    let by_reply = lookup("unique_legacy_fc_reply");
    assert_eq!(by_reply.len(), 1);
    let reply = &by_reply[0];
    assert!(
        reply.text.starts_with("[function:get_weather]"),
        "legacy reply should render under the function role prefix with name folded in; got: {}",
        reply.text
    );
    assert_eq!(reply.tool_name.as_deref(), Some("get_weather"));

    // The closing assistant prose.
    let by_prose = lookup("72F and sunny");
    assert_eq!(by_prose.len(), 1);
    assert!(by_prose[0].text.starts_with("[assistant]"));

    // Columns stored for the call chunk, read straight from the table.
    let persisted_tool_name: Option<String> = store
        .conn()
        .query_row(
            "SELECT tool_name FROM chunks WHERE id = ?",
            params![call.chunk_id],
            |r| r.get(0),
        )
        .unwrap();
    assert_eq!(persisted_tool_name.as_deref(), Some("get_weather"));
    let persisted_call_id: Option<String> = store
        .conn()
        .query_row(
            "SELECT tool_call_id FROM chunks WHERE id = ?",
            params![call.chunk_id],
            |r| r.get(0),
        )
        .unwrap();
    assert!(
        persisted_call_id.is_none(),
        "legacy function_call has no id — tool_call_id must stay NULL"
    );
}
/// Ingests three turns that each carry a `name` field (user, assistant, tool)
/// and checks how that name shows up in the chunk text and stored columns.
///
/// Expectations pinned here:
/// - user/assistant chunks start with `[user:alice]` / `[assistant:scribe]`,
///   have no `tool_name`, and store the name in the `user` column;
/// - searching for `alice` yields exactly the user chunk;
/// - the tool chunk starts with `[tool:search]`, has `tool_name == "search"`,
///   and leaves the `user` column empty.
#[test]
fn ingests_chatcompletions_participant_name_into_role_prefix_end_to_end() {
    let workspace = tempdir().unwrap();
    let store_dir = workspace.path().join("store");
    let mut store = Store::initialize(&store_dir).unwrap();
    let jsonl_path = workspace.path().join("participant-name.jsonl");
    let jsonl = concat!(
        "{\"role\":\"user\",\"name\":\"alice\",\"content\":\"unique_participant_name_prompt\"}\n",
        "{\"role\":\"assistant\",\"name\":\"scribe\",\"content\":\"unique_participant_name_reply\"}\n",
        "{\"role\":\"tool\",\"name\":\"search\",\"content\":\"unique_participant_tool_output\"}\n",
    );
    fs::write(&jsonl_path, jsonl).unwrap();

    let outcome = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(outcome.ingested.len(), 1);
    assert_eq!(outcome.ingested[0].chunks, 3);

    // Named user turn.
    let user_matches = search(
        &store,
        "unique_participant_name_prompt",
        SearchOptions::default(),
    )
    .unwrap();
    assert_eq!(user_matches.len(), 1);
    let alice = &user_matches[0];
    assert!(
        alice.text.starts_with("[user:alice]"),
        "participant user turn should fold name into role prefix; got: {}",
        alice.text
    );
    assert_eq!(alice.tool_name, None);
    let persisted_alice = store
        .conn()
        .query_row(
            "SELECT user FROM chunks WHERE id = ?",
            params![alice.chunk_id],
            |row| row.get::<_, Option<String>>(0),
        )
        .unwrap();
    assert_eq!(persisted_alice.as_deref(), Some("alice"));

    // Named assistant turn.
    let assistant_matches = search(
        &store,
        "unique_participant_name_reply",
        SearchOptions::default(),
    )
    .unwrap();
    assert_eq!(assistant_matches.len(), 1);
    let scribe = &assistant_matches[0];
    assert!(
        scribe.text.starts_with("[assistant:scribe]"),
        "participant assistant turn should fold name into role prefix; got: {}",
        scribe.text
    );
    assert_eq!(scribe.tool_name, None);
    let persisted_scribe = store
        .conn()
        .query_row(
            "SELECT user FROM chunks WHERE id = ?",
            params![scribe.chunk_id],
            |row| row.get::<_, Option<String>>(0),
        )
        .unwrap();
    assert_eq!(persisted_scribe.as_deref(), Some("scribe"));

    // The participant name itself is expected to be a usable search term.
    let by_name = search(&store, "alice", SearchOptions::default()).unwrap();
    assert_eq!(by_name.len(), 1);
    assert!(by_name[0].text.starts_with("[user:alice]"));

    // Tool turn: `name` is the tool name, not a participant.
    let tool_matches = search(
        &store,
        "unique_participant_tool_output",
        SearchOptions::default(),
    )
    .unwrap();
    assert_eq!(tool_matches.len(), 1);
    let tool = &tool_matches[0];
    assert!(
        tool.text.starts_with("[tool:search]"),
        "tool role should keep existing tool-name rendering; got: {}",
        tool.text
    );
    let persisted_tool_user = store
        .conn()
        .query_row(
            "SELECT user FROM chunks WHERE id = ?",
            params![tool.chunk_id],
            |row| row.get::<_, Option<String>>(0),
        )
        .unwrap();
    assert_eq!(persisted_tool_user, None);
    assert_eq!(tool.tool_name.as_deref(), Some("search"));
}
/// Ingests user turns containing ChatCompletions `image_url` content blocks
/// (one remote URL with a caption, one `data:` URL) plus an assistant reply.
///
/// Expectations pinned here:
/// - three chunks are reported;
/// - the remote URL is rendered as `[image_url] <url>` in a `[user]` chunk
///   alongside the text caption, without the `detail` metadata and without an
///   `[input_image]` label;
/// - the `data:` URL is rendered as `[image_url:image/jpeg]` and its base64
///   payload is not searchable;
/// - the assistant reply is still found with an `[assistant]` prefix.
#[test]
fn ingests_chat_completions_image_url_block_end_to_end() {
    let workspace = tempdir().unwrap();
    let store_dir = workspace.path().join("store");
    let mut store = Store::initialize(&store_dir).unwrap();
    let jsonl_path = workspace.path().join("chat-completions-image-url.jsonl");
    let jsonl = concat!(
        "{\"role\":\"user\",\"content\":[",
        "{\"type\":\"text\",\"text\":\"what is this lantern_unique_caption_token?\"},",
        "{\"type\":\"image_url\",\"image_url\":",
        "{\"url\":\"https://example.test/lantern_legacy_url_token.png\",",
        "\"detail\":\"high\"}}",
        "]}\n",
        "{\"role\":\"user\",\"content\":[",
        "{\"type\":\"image_url\",\"image_url\":",
        "{\"url\":\"data:image/jpeg;base64,LEGACY_BASE64_NEVER_INDEX_ME\"}}",
        "]}\n",
        "{\"role\":\"assistant\",\"content\":\"A wrought iron lantern.\"}\n",
    );
    fs::write(&jsonl_path, jsonl).unwrap();

    let outcome = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(outcome.ingested.len(), 1);
    assert_eq!(
        outcome.ingested[0].chunks, 3,
        "two user turns plus one assistant reply should land as chunks"
    );

    // Remote image URL with a sibling text caption.
    let url_matches =
        search(&store, "lantern_legacy_url_token", SearchOptions::default()).unwrap();
    assert_eq!(url_matches.len(), 1);
    let captioned = &url_matches[0];
    assert!(captioned.text.starts_with("[user]"));
    assert!(
        captioned
            .text
            .contains("[image_url] https://example.test/lantern_legacy_url_token.png"),
        "image_url URL anchor should render behind the [image_url] prefix; got: {}",
        captioned.text
    );
    assert!(
        captioned.text.contains("lantern_unique_caption_token"),
        "text caption should join the image_url anchor in array order"
    );
    assert!(
        !captioned.text.contains("\"detail\""),
        "detail metadata must not leak into chunk text"
    );
    assert!(
        !captioned.text.contains("[input_image]"),
        "[image_url] must not be mislabelled as [input_image]"
    );

    // Inline data: URL — only the media type may surface.
    let media_matches = search(&store, "jpeg", SearchOptions::default()).unwrap();
    assert!(
        media_matches
            .iter()
            .any(|hit| hit.text.contains("[image_url:image/jpeg]")),
        "data: URL should surface only its media_type"
    );
    let payload_matches = search(
        &store,
        "LEGACY_BASE64_NEVER_INDEX_ME",
        SearchOptions::default(),
    )
    .unwrap();
    assert!(
        payload_matches.is_empty(),
        "base64 payload from data: URL must never appear in any chunk"
    );

    // The ordinary assistant reply must be unaffected.
    let reply_matches =
        search(&store, "wrought iron lantern", SearchOptions::default()).unwrap();
    assert!(
        reply_matches
            .iter()
            .any(|hit| hit.text.starts_with("[assistant]")),
        "assistant reply should still be searchable"
    );
}
/// Ingests an assistant turn whose only payload is a ChatCompletions `audio`
/// object (id, base64 data, transcript, expiry) between two user turns.
///
/// Expectations pinned here:
/// - three chunks are reported;
/// - the transcript is rendered as `[audio] <transcript>` in an `[assistant]`
///   chunk, with no `[input_audio]` / `[output_audio]` label;
/// - neither the base64 data nor the audio id is searchable;
/// - the chunk's `tool_call_id` column is NULL;
/// - the user prompt is still found with a `[user]` prefix.
#[test]
fn ingests_chat_completions_assistant_audio_field_end_to_end() {
    let workspace = tempdir().unwrap();
    let store_dir = workspace.path().join("store");
    let mut store = Store::initialize(&store_dir).unwrap();
    let jsonl_path = workspace.path().join("chat-completions-audio.jsonl");
    let jsonl = concat!(
        "{\"role\":\"user\",\"content\":\"Read me the prologue.\"}\n",
        "{\"role\":\"assistant\",\"content\":null,",
        "\"audio\":{\"id\":\"audio_session_001\",",
        "\"data\":\"LEGACY_AUDIO_BASE64_NEVER_INDEX_ME\",",
        "\"transcript\":\"lantern_legacy_audio_transcript_token welcome traveler\",",
        "\"expires_at\":1700000000}}\n",
        "{\"role\":\"user\",\"content\":\"Thanks.\"}\n",
    );
    fs::write(&jsonl_path, jsonl).unwrap();

    let outcome = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(outcome.ingested.len(), 1);
    assert_eq!(
        outcome.ingested[0].chunks, 3,
        "two user turns plus one assistant audio reply should land as chunks"
    );

    // The transcript is the only part of the audio object expected in the text.
    let transcript_matches = search(
        &store,
        "lantern_legacy_audio_transcript_token",
        SearchOptions::default(),
    )
    .unwrap();
    assert_eq!(transcript_matches.len(), 1);
    let spoken = &transcript_matches[0];
    assert!(spoken.text.starts_with("[assistant]"));
    assert!(
        spoken
            .text
            .contains("[audio] lantern_legacy_audio_transcript_token welcome traveler"),
        "transcript should render behind the [audio] prefix; got: {}",
        spoken.text
    );

    let payload_matches = search(
        &store,
        "LEGACY_AUDIO_BASE64_NEVER_INDEX_ME",
        SearchOptions::default(),
    )
    .unwrap();
    assert!(
        payload_matches.is_empty(),
        "base64 audio payload must never appear in any chunk text"
    );
    let id_matches = search(&store, "audio_session_001", SearchOptions::default()).unwrap();
    assert!(
        id_matches.is_empty(),
        "audio id metadata must not be searchable as chunk content"
    );
    assert!(!spoken.text.contains("[input_audio]"));
    assert!(!spoken.text.contains("[output_audio]"));

    let persisted_call_id = store
        .conn()
        .query_row(
            "SELECT tool_call_id FROM chunks WHERE id = ?",
            params![spoken.chunk_id],
            |row| row.get::<_, Option<String>>(0),
        )
        .unwrap();
    assert!(
        persisted_call_id.is_none(),
        "legacy assistant audio has no tool call id — column must stay NULL"
    );

    let prompt_matches = search(&store, "prologue", SearchOptions::default()).unwrap();
    assert!(
        prompt_matches
            .iter()
            .any(|hit| hit.text.starts_with("[user]")),
        "user prompt should still be searchable"
    );
}
/// Ingests an assistant turn with `content: null` and a ChatCompletions
/// `refusal` string, sandwiched between two user turns.
///
/// Expectations pinned here:
/// - three chunks are reported;
/// - the refusal is rendered as `[refusal] <text>` in an `[assistant]` chunk;
/// - the chunk's `tool_call_id` column is NULL;
/// - the user prompt is still found with a `[user]` prefix.
#[test]
fn ingests_chat_completions_assistant_refusal_field_end_to_end() {
    let workspace = tempdir().unwrap();
    let store_dir = workspace.path().join("store");
    let mut store = Store::initialize(&store_dir).unwrap();
    let jsonl_path = workspace.path().join("chat-completions-refusal.jsonl");
    let jsonl = concat!(
        "{\"role\":\"user\",\"content\":\"Walk me through bypassing the gate.\"}\n",
        "{\"role\":\"assistant\",\"content\":null,",
        "\"refusal\":\"lantern_legacy_refusal_token I can't help with that\"}\n",
        "{\"role\":\"user\",\"content\":\"Understood, thanks.\"}\n",
    );
    fs::write(&jsonl_path, jsonl).unwrap();

    let outcome = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(outcome.ingested.len(), 1);
    assert_eq!(
        outcome.ingested[0].chunks, 3,
        "two user turns plus one assistant refusal should land as chunks"
    );

    let refusal_matches = search(
        &store,
        "lantern_legacy_refusal_token",
        SearchOptions::default(),
    )
    .unwrap();
    assert_eq!(refusal_matches.len(), 1);
    let refusal = &refusal_matches[0];
    assert!(refusal.text.starts_with("[assistant]"));
    assert!(
        refusal
            .text
            .contains("[refusal] lantern_legacy_refusal_token I can't help with that"),
        "refusal should render behind the [refusal] prefix; got: {}",
        refusal.text
    );

    let persisted_call_id = store
        .conn()
        .query_row(
            "SELECT tool_call_id FROM chunks WHERE id = ?",
            params![refusal.chunk_id],
            |row| row.get::<_, Option<String>>(0),
        )
        .unwrap();
    assert!(
        persisted_call_id.is_none(),
        "legacy assistant refusal has no tool call id — column must stay NULL"
    );

    let prompt_matches = search(&store, "gate", SearchOptions::default()).unwrap();
    assert!(
        prompt_matches
            .iter()
            .any(|hit| hit.text.starts_with("[user]")),
        "user prompt should still be searchable"
    );
}
/// Ingests an assistant turn with `content: null` and a `reasoning_content`
/// string, followed by a normal assistant answer.
///
/// Expectations pinned here:
/// - three chunks are reported;
/// - the reasoning is rendered as `[thinking] <text>` in an `[assistant]`
///   chunk whose `tool_call_id` column is NULL;
/// - the user prompt is still found with a `[user]` prefix;
/// - the plain answer is stored exactly as `[assistant] It has 6 vertices.`.
#[test]
fn ingests_chat_completions_assistant_reasoning_content_field_end_to_end() {
    let workspace = tempdir().unwrap();
    let store_dir = workspace.path().join("store");
    let mut store = Store::initialize(&store_dir).unwrap();
    let jsonl_path = workspace.path().join("chat-completions-reasoning.jsonl");
    let jsonl = concat!(
        "{\"role\":\"user\",\"content\":\"How many vertices does an octahedron have?\"}\n",
        "{\"role\":\"assistant\",\"content\":null,",
        "\"reasoning_content\":\"lantern_legacy_reasoning_token an octahedron has two pyramids glued at the square base\"}\n",
        "{\"role\":\"assistant\",\"content\":\"It has 6 vertices.\"}\n",
    );
    fs::write(&jsonl_path, jsonl).unwrap();

    let outcome = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(outcome.ingested.len(), 1);
    assert_eq!(
        outcome.ingested[0].chunks, 3,
        "one user turn plus one reasoning-only assistant turn plus the final assistant answer should land as chunks"
    );

    let reasoning_matches = search(
        &store,
        "lantern_legacy_reasoning_token",
        SearchOptions::default(),
    )
    .unwrap();
    assert_eq!(reasoning_matches.len(), 1);
    let reasoning = &reasoning_matches[0];
    assert!(reasoning.text.starts_with("[assistant]"));
    assert!(
        reasoning.text.contains(
            "[thinking] lantern_legacy_reasoning_token an octahedron has two pyramids glued at the square base"
        ),
        "reasoning should render behind the [thinking] prefix; got: {}",
        reasoning.text
    );

    let persisted_call_id = store
        .conn()
        .query_row(
            "SELECT tool_call_id FROM chunks WHERE id = ?",
            params![reasoning.chunk_id],
            |row| row.get::<_, Option<String>>(0),
        )
        .unwrap();
    assert!(
        persisted_call_id.is_none(),
        "legacy assistant reasoning_content has no tool call id — column must stay NULL"
    );

    let prompt_matches = search(&store, "octahedron", SearchOptions::default()).unwrap();
    assert!(
        prompt_matches
            .iter()
            .any(|hit| hit.text.starts_with("[user]")),
        "user prompt should still be searchable"
    );

    // A content-bearing assistant turn must come through byte-for-byte.
    let answer_matches = search(&store, "6 vertices", SearchOptions::default()).unwrap();
    assert!(
        answer_matches
            .iter()
            .any(|hit| hit.text == "[assistant] It has 6 vertices."),
        "normal assistant reply with content should ingest unchanged"
    );
}
/// Ingests an assistant turn carrying two message-level `url_citation`
/// annotations, plus a user prompt and a plain assistant follow-up.
///
/// Expectations pinned here:
/// - three chunks are reported;
/// - each citation's URL and title are searchable and resolve to the same
///   chunk as the cited assistant body;
/// - each citation is rendered as `[url_citation] <url> <title>`, with the
///   first array entry appearing before the second;
/// - the cited chunk's `tool_call_id` column is NULL;
/// - the user prompt and the un-annotated follow-up are still found, the
///   latter exactly as `[assistant] plain followup with no citations`.
#[test]
fn ingests_chat_completions_assistant_url_citation_annotations_end_to_end() {
    let workspace = tempdir().unwrap();
    let store_dir = workspace.path().join("store");
    let mut store = Store::initialize(&store_dir).unwrap();
    let jsonl_path = workspace.path().join("chat-completions-annotations.jsonl");
    let jsonl = concat!(
        "{\"role\":\"user\",\"content\":\"What is the capital of France?\"}\n",
        "{\"role\":\"assistant\",",
        "\"content\":\"lantern_legacy_citation_token The capital of France is Paris.\",",
        "\"annotations\":[",
        "{\"type\":\"url_citation\",\"url_citation\":{",
        "\"url\":\"https://example.com/lantern_cite_alpha_path\",",
        "\"title\":\"lantern_cite_alpha_title\",",
        "\"start_index\":0,\"end_index\":30}},",
        "{\"type\":\"url_citation\",\"url_citation\":{",
        "\"url\":\"https://example.com/lantern_cite_beta_path\",",
        "\"title\":\"lantern_cite_beta_title\"}}",
        "]}\n",
        "{\"role\":\"assistant\",\"content\":\"plain followup with no citations\"}\n",
    );
    fs::write(&jsonl_path, jsonl).unwrap();

    let outcome = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(outcome.ingested.len(), 1);
    assert_eq!(
        outcome.ingested[0].chunks, 3,
        "user prompt + cited assistant turn + plain assistant followup should land as chunks"
    );

    // Locate the cited assistant chunk via a token in its body.
    let body_matches = search(
        &store,
        "lantern_legacy_citation_token",
        SearchOptions::default(),
    )
    .unwrap();
    assert_eq!(body_matches.len(), 1);
    let cited = &body_matches[0];
    assert!(cited.text.starts_with("[assistant]"));

    // Every citation field must lead back to that same chunk.
    let alpha_url =
        search(&store, "lantern_cite_alpha_path", SearchOptions::default()).unwrap();
    assert_eq!(alpha_url.len(), 1);
    assert_eq!(alpha_url[0].chunk_id, cited.chunk_id);

    let alpha_title =
        search(&store, "lantern_cite_alpha_title", SearchOptions::default()).unwrap();
    assert_eq!(alpha_title.len(), 1);
    assert_eq!(alpha_title[0].chunk_id, cited.chunk_id);

    let beta_url = search(&store, "lantern_cite_beta_path", SearchOptions::default()).unwrap();
    assert_eq!(beta_url.len(), 1);
    assert_eq!(beta_url[0].chunk_id, cited.chunk_id);

    let beta_title =
        search(&store, "lantern_cite_beta_title", SearchOptions::default()).unwrap();
    assert_eq!(beta_title.len(), 1);
    assert_eq!(beta_title[0].chunk_id, cited.chunk_id);

    // Rendering and ordering of the citations inside the chunk text.
    // (`chunk_text`, `alpha_pos` and `beta_pos` are captured by name in the
    // assertion messages below.)
    let chunk_text = &cited.text;
    assert!(
        chunk_text.contains(
            "[url_citation] https://example.com/lantern_cite_alpha_path lantern_cite_alpha_title"
        ),
        "first citation should render with url and title behind [url_citation]; got: {chunk_text}"
    );
    assert!(
        chunk_text.contains(
            "[url_citation] https://example.com/lantern_cite_beta_path lantern_cite_beta_title"
        ),
        "second citation should render with url and title behind [url_citation]; got: {chunk_text}"
    );
    let alpha_pos = chunk_text
        .find("lantern_cite_alpha_path")
        .expect("alpha url should appear in chunk");
    let beta_pos = chunk_text
        .find("lantern_cite_beta_path")
        .expect("beta url should appear in chunk");
    assert!(
        alpha_pos < beta_pos,
        "citations should join in array order; got alpha at {alpha_pos}, beta at {beta_pos}"
    );

    let persisted_call_id = store
        .conn()
        .query_row(
            "SELECT tool_call_id FROM chunks WHERE id = ?",
            params![cited.chunk_id],
            |row| row.get::<_, Option<String>>(0),
        )
        .unwrap();
    assert!(
        persisted_call_id.is_none(),
        "ChatCompletions message-level annotations have no tool call id — column must stay NULL"
    );

    let prompt_matches = search(&store, "capital of France", SearchOptions::default()).unwrap();
    assert!(
        prompt_matches
            .iter()
            .any(|hit| hit.text.starts_with("[user]")),
        "user prompt should still be searchable"
    );

    let followup_matches = search(&store, "plain followup", SearchOptions::default()).unwrap();
    assert!(
        followup_matches
            .iter()
            .any(|hit| hit.text == "[assistant] plain followup with no citations"),
        "plain assistant reply without annotations should ingest unchanged"
    );
}
/// Ingests a two-turn transcript in the Gemini shape (`role` + `parts` array
/// of `{text}` objects) and checks the resulting chunks.
///
/// Expectations pinned here:
/// - two chunks are reported;
/// - the user chunk starts with `[user]` and keeps its text;
/// - the `model` turn starts with `[model]`, both of its text parts resolve
///   to the same chunk, and the `role` column stores `model`.
#[test]
fn ingests_gemini_parts_transcript_end_to_end() {
    let workspace = tempdir().unwrap();
    let store_dir = workspace.path().join("store");
    let mut store = Store::initialize(&store_dir).unwrap();
    let jsonl_path = workspace.path().join("gemini-transcript.jsonl");
    let jsonl = concat!(
        "{\"role\":\"user\",\"parts\":[{\"text\":\"gemini_user_token where do lanterns belong\"}]}\n",
        "{\"role\":\"model\",\"parts\":[",
        "{\"text\":\"gemini_model_step_one hang it near the door\"},",
        "{\"text\":\"gemini_model_step_two it catches the evening breeze\"}",
        "]}\n",
    );
    fs::write(&jsonl_path, jsonl).unwrap();

    let outcome = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(outcome.ingested.len(), 1);
    assert_eq!(
        outcome.ingested[0].chunks, 2,
        "both gemini-shaped turns should land as their own chunks"
    );

    // User turn with a single text part.
    let user_matches = search(&store, "gemini_user_token", SearchOptions::default()).unwrap();
    assert_eq!(user_matches.len(), 1);
    let prompt = &user_matches[0];
    assert!(
        prompt.text.starts_with("[user]"),
        "user role should drive the standard role prefix; got: {}",
        prompt.text
    );
    assert!(
        prompt.text.contains("where do lanterns belong"),
        "user text part should survive ingest"
    );

    // Model turn with two text parts that should share one chunk.
    let first_part_matches =
        search(&store, "gemini_model_step_one", SearchOptions::default()).unwrap();
    assert_eq!(first_part_matches.len(), 1);
    let first_part = &first_part_matches[0];
    assert!(
        first_part.text.starts_with("[model]"),
        "gemini assistant role 'model' should pass through to the chunk prefix; got: {}",
        first_part.text
    );
    let second_part_matches =
        search(&store, "gemini_model_step_two", SearchOptions::default()).unwrap();
    assert_eq!(second_part_matches.len(), 1);
    assert_eq!(
        first_part.chunk_id, second_part_matches[0].chunk_id,
        "both model text parts should join into the same chunk"
    );

    let persisted_role = store
        .conn()
        .query_row(
            "SELECT role FROM chunks WHERE id = ?",
            params![first_part.chunk_id],
            |row| row.get::<_, Option<String>>(0),
        )
        .unwrap();
    assert_eq!(persisted_role.as_deref(), Some("model"));
}
/// Ingests a Gemini-shaped exchange where the model turn holds a text part
/// plus a `functionCall` part and the user turn holds a `functionResponse`.
///
/// Expectations pinned here:
/// - two chunks are reported;
/// - searching for the tool name yields exactly one hit, a `[model]` chunk
///   that contains the intro text, a `[tool_use:NAME]` marker, and the args
///   serialized with keys in `alpha`, `zeta` order;
/// - the `[user]` chunk contains `[tool_result]` and the response serialized
///   with keys in `count`, `gemini_response_token` order;
/// - argument and response tokens each resolve to their respective chunk.
#[test]
fn ingests_gemini_parts_function_call_and_response_end_to_end() {
    let workspace = tempdir().unwrap();
    let store_dir = workspace.path().join("store");
    let mut store = Store::initialize(&store_dir).unwrap();
    let jsonl_path = workspace.path().join("gemini-tool-transcript.jsonl");
    let jsonl = concat!(
        "{\"role\":\"model\",\"parts\":[",
        "{\"text\":\"gemini_call_intro searching now\"},",
        "{\"functionCall\":{\"name\":\"gemini_tool_name_search\",",
        "\"args\":{\"zeta\":1,\"alpha\":\"gemini_call_arg_token\"}}}",
        "]}\n",
        "{\"role\":\"user\",\"parts\":[",
        "{\"functionResponse\":{\"name\":\"gemini_tool_name_search\",",
        "\"response\":{\"gemini_response_token\":\"found it\",\"count\":3}}}",
        "]}\n",
    );
    fs::write(&jsonl_path, jsonl).unwrap();

    let outcome = ingest_path(&mut store, &jsonl_path).unwrap();
    assert_eq!(outcome.ingested.len(), 1);
    assert_eq!(
        outcome.ingested[0].chunks, 2,
        "both gemini structured-tool turns should land as their own chunks"
    );

    // Model turn: text part + functionCall part.
    // (`call_hits` is captured by name in the assertion message below.)
    let call_hits = search(&store, "gemini_tool_name_search", SearchOptions::default()).unwrap();
    assert_eq!(
        call_hits.len(),
        1,
        "tool name should appear only on the call chunk, got: {call_hits:?}"
    );
    let call = call_hits
        .iter()
        .find(|hit| hit.text.starts_with("[model]"))
        .expect("model call chunk should be present");
    assert!(
        call.text.contains("gemini_call_intro"),
        "sibling text part should join with the call part in array order; got: {}",
        call.text
    );
    assert!(
        call.text.contains("[tool_use:gemini_tool_name_search]"),
        "functionCall part should render behind the shared [tool_use:NAME] prefix; got: {}",
        call.text
    );
    assert!(
        call.text
            .contains("{\"alpha\":\"gemini_call_arg_token\",\"zeta\":1}"),
        "args object should serialize deterministically (alpha before zeta); got: {}",
        call.text
    );
    let arg_matches = search(&store, "gemini_call_arg_token", SearchOptions::default()).unwrap();
    assert_eq!(arg_matches.len(), 1);
    assert_eq!(arg_matches[0].chunk_id, call.chunk_id);

    // User turn: functionResponse part, located through its rendered prefix.
    let prefix_matches = search(&store, "tool_result", SearchOptions::default()).unwrap();
    let response = prefix_matches
        .iter()
        .find(|hit| hit.text.starts_with("[user]"))
        .expect("user response chunk should be present");
    assert!(
        response.text.contains("[tool_result]"),
        "functionResponse part should render behind the [tool_result] prefix; got: {}",
        response.text
    );
    assert!(
        response
            .text
            .contains("{\"count\":3,\"gemini_response_token\":\"found it\"}"),
        "response object should serialize deterministically; got: {}",
        response.text
    );
    let token_matches =
        search(&store, "gemini_response_token", SearchOptions::default()).unwrap();
    assert_eq!(token_matches.len(), 1);
    assert_eq!(token_matches[0].chunk_id, response.chunk_id);
}