use serde_json::Value;
use crate::chunk::Chunk;
pub fn extract_jsonl(text: &str) -> Vec<Chunk> {
let mut chunks = Vec::new();
let mut ordinal = 0usize;
let mut cursor = 0usize;
for line in text.split_inclusive('\n') {
let byte_start = cursor;
let byte_end = cursor + line.len();
cursor = byte_end;
let trimmed = line.trim();
if trimmed.is_empty() {
continue;
}
if let Some(chunk) = extract_line(trimmed, ordinal, byte_start, byte_end) {
chunks.push(chunk);
ordinal += 1;
}
}
chunks
}
fn extract_line(line: &str, ordinal: usize, byte_start: usize, byte_end: usize) -> Option<Chunk> {
let value: Value = serde_json::from_str(line).ok()?;
let obj = value.as_object()?;
let content = obj
.get("content")
.and_then(content_to_text)
.or_else(|| scalar_text(obj, "text"))
.or_else(|| scalar_text(obj, "message"))
.or_else(|| scalar_text(obj, "body"))
.or_else(|| scalar_text(obj, "arguments"))
.or_else(|| scalar_text(obj, "input"))?;
if content.trim().is_empty() {
return None;
}
let role = scalar_text(obj, "role");
let session_id = first_present_string(obj, &["session_id", "conversation_id", "thread_id"]);
let turn_id = first_present_string(obj, &["turn_id", "turn", "message_id"]);
let tool_name = first_present_string(obj, &["tool_name", "tool", "name"]);
let timestamp_unix = first_present_timestamp(obj, &["timestamp", "created_at", "ts"]);
let text = if let Some(role) = role.clone() {
format!("[{role}] {content}")
} else if let Some(tool_name) = tool_name.as_deref() {
format!("[tool:{tool_name}] {content}")
} else {
content
};
Some(Chunk {
ordinal,
byte_start,
byte_end,
text,
role,
session_id,
turn_id,
tool_name,
timestamp_unix,
})
}
fn scalar_text(obj: &serde_json::Map<String, Value>, key: &str) -> Option<String> {
obj.get(key)
.and_then(value_to_string)
.filter(|s| !s.trim().is_empty())
}
fn first_present_string(obj: &serde_json::Map<String, Value>, keys: &[&str]) -> Option<String> {
keys.iter().find_map(|key| scalar_text(obj, key))
}
fn first_present_timestamp(obj: &serde_json::Map<String, Value>, keys: &[&str]) -> Option<i64> {
keys.iter().find_map(|key| {
let value = obj.get(*key)?;
match value {
Value::Number(n) => n.as_i64().or_else(|| n.as_u64().map(|u| u as i64)),
Value::String(s) => s.trim().parse::<i64>().ok(),
_ => None,
}
})
}
fn value_to_string(value: &Value) -> Option<String> {
match value {
Value::String(s) if !s.trim().is_empty() => Some(s.clone()),
Value::Number(_) | Value::Bool(_) => Some(value.to_string()),
_ => None,
}
}
fn content_to_text(v: &Value) -> Option<String> {
match v {
Value::String(s) if !s.trim().is_empty() => Some(s.clone()),
Value::Array(arr) => {
let parts: Vec<String> = arr
.iter()
.filter_map(|item| match item {
Value::String(s) if !s.trim().is_empty() => Some(s.clone()),
Value::Object(obj) => obj
.get("text")
.and_then(Value::as_str)
.or_else(|| obj.get("content").and_then(Value::as_str))
.map(str::to_string)
.filter(|s| !s.trim().is_empty()),
_ => None,
})
.collect();
if parts.is_empty() {
None
} else {
Some(parts.join("\n"))
}
}
_ => None,
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn extracts_simple_string_content_with_role_prefix() {
let jsonl = "{\"role\":\"user\",\"content\":\"hello there\"}\n";
let chunks = extract_jsonl(jsonl);
assert_eq!(chunks.len(), 1);
assert_eq!(chunks[0].text, "[user] hello there");
assert_eq!(chunks[0].ordinal, 0);
assert_eq!(chunks[0].byte_start, 0);
assert_eq!(chunks[0].byte_end, jsonl.len());
}
#[test]
fn extracts_anthropic_content_block_array() {
let jsonl = concat!(
"{\"role\":\"assistant\",\"content\":[",
"{\"type\":\"text\",\"text\":\"first part\"},",
"{\"type\":\"text\",\"text\":\"second part\"}",
"]}\n"
);
let chunks = extract_jsonl(jsonl);
assert_eq!(chunks.len(), 1);
assert_eq!(chunks[0].text, "[assistant] first part\nsecond part");
}
#[test]
fn extracts_tool_and_turn_metadata_with_tool_prefix() {
let jsonl = concat!(
"{\"conversation_id\":\"sess-1\",\"turn\":\"turn-7\",\"tool_name\":\"search\",",
"\"created_at\":\"1700000000\",\"content\":\"tool output\"}\n"
);
let chunks = extract_jsonl(jsonl);
assert_eq!(chunks.len(), 1);
assert_eq!(chunks[0].session_id.as_deref(), Some("sess-1"));
assert_eq!(chunks[0].turn_id.as_deref(), Some("turn-7"));
assert_eq!(chunks[0].tool_name.as_deref(), Some("search"));
assert_eq!(chunks[0].timestamp_unix, Some(1_700_000_000));
assert_eq!(chunks[0].text, "[tool:search] tool output");
assert_eq!(chunks[0].role, None);
}
#[test]
fn extracts_role_prefixed_message_with_timestamp() {
let jsonl = "{\"role\":\"assistant\",\"timestamp\":1700000001,\"content\":\"hello\"}\n";
let chunks = extract_jsonl(jsonl);
assert_eq!(chunks.len(), 1);
assert_eq!(chunks[0].role.as_deref(), Some("assistant"));
assert_eq!(chunks[0].timestamp_unix, Some(1_700_000_001));
assert_eq!(chunks[0].text, "[assistant] hello");
}
#[test]
fn extracts_from_alternate_text_fields() {
let jsonl = "{\"text\":\"alpha\"}\n{\"message\":\"beta\"}\n{\"body\":\"gamma\"}\n";
let chunks = extract_jsonl(jsonl);
assert_eq!(chunks.len(), 3);
assert_eq!(chunks[0].text, "alpha");
assert_eq!(chunks[1].text, "beta");
assert_eq!(chunks[2].text, "gamma");
}
#[test]
fn byte_ranges_point_to_source_lines() {
let line_a = "{\"text\":\"aaa\"}\n";
let line_b = "{\"text\":\"bbb\"}\n";
let jsonl = format!("{line_a}{line_b}");
let chunks = extract_jsonl(&jsonl);
assert_eq!(chunks.len(), 2);
assert_eq!(chunks[0].byte_start, 0);
assert_eq!(chunks[0].byte_end, line_a.len());
assert_eq!(chunks[1].byte_start, line_a.len());
assert_eq!(chunks[1].byte_end, line_a.len() + line_b.len());
}
#[test]
fn skips_blank_and_malformed_and_empty_lines() {
let jsonl = "\n{not json}\n{\"role\":\"u\",\"content\":\"\"}\n{\"text\":\"ok\"}\n";
let chunks = extract_jsonl(jsonl);
assert_eq!(chunks.len(), 1);
assert_eq!(chunks[0].text, "ok");
assert_eq!(chunks[0].ordinal, 0);
}
#[test]
fn returns_empty_for_empty_input() {
assert!(extract_jsonl("").is_empty());
}
#[test]
fn ignores_lines_without_extractable_fields() {
let jsonl = "{\"timestamp\":1234}\n{\"foo\":\"bar\"}\n";
assert!(extract_jsonl(jsonl).is_empty());
}
#[test]
fn missing_role_produces_unprefixed_text() {
let jsonl = "{\"content\":\"no role here\"}\n";
let chunks = extract_jsonl(jsonl);
assert_eq!(chunks.len(), 1);
assert_eq!(chunks[0].text, "no role here");
}
}