lantern 0.2.2 - Docs.rs

//! Transcript extraction for JSONL session files.
//!
//! Each line is expected to be a JSON object describing one turn/message.
//! We accept a pragmatic set of shapes rather than prescribing one: for
//! every line we try, in order, `content` (string or Anthropic-style array
//! of `{type, text}` blocks), `text`, `message`, `body`. If a `role` field
//! is present it becomes a `[role] ...` prefix on the chunk so searches can
//! discriminate between, say, `[user]` and `[assistant]` turns.
//!
//! Output is a `Vec<Chunk>` whose byte offsets point back to the line in
//! the original JSONL file, so existing provenance fields remain meaningful.

use serde_json::Value;

use crate::chunk::Chunk;

pub fn extract_jsonl(text: &str) -> Vec<Chunk> {
    let mut chunks = Vec::new();
    let mut ordinal = 0usize;
    let mut cursor = 0usize;
    for line in text.split_inclusive('\n') {
        let byte_start = cursor;
        let byte_end = cursor + line.len();
        cursor = byte_end;
        let trimmed = line.trim();
        if trimmed.is_empty() {
            continue;
        }
        if let Some(chunk) = extract_line(trimmed, ordinal, byte_start, byte_end) {
            chunks.push(chunk);
            ordinal += 1;
        }
    }
    chunks
}

fn extract_line(line: &str, ordinal: usize, byte_start: usize, byte_end: usize) -> Option<Chunk> {
    let value: Value = serde_json::from_str(line).ok()?;
    let obj = value.as_object()?;

    let content = obj
        .get("content")
        .and_then(content_to_text)
        .or_else(|| scalar_text(obj, "text"))
        .or_else(|| scalar_text(obj, "message"))
        .or_else(|| scalar_text(obj, "body"))
        .or_else(|| scalar_text(obj, "arguments"))
        .or_else(|| scalar_text(obj, "input"))?;

    if content.trim().is_empty() {
        return None;
    }

    let role = scalar_text(obj, "role");
    let session_id = first_present_string(obj, &["session_id", "conversation_id", "thread_id"]);
    let turn_id = first_present_string(obj, &["turn_id", "turn", "message_id"]);
    let tool_name = first_present_string(obj, &["tool_name", "tool", "name"]);
    let timestamp_unix = first_present_timestamp(obj, &["timestamp", "created_at", "ts"]);

    let text = if let Some(role) = role.clone() {
        format!("[{role}] {content}")
    } else if let Some(tool_name) = tool_name.as_deref() {
        format!("[tool:{tool_name}] {content}")
    } else {
        content
    };

    Some(Chunk {
        ordinal,
        byte_start,
        byte_end,
        text,
        role,
        session_id,
        turn_id,
        tool_name,
        timestamp_unix,
    })
}

fn scalar_text(obj: &serde_json::Map<String, Value>, key: &str) -> Option<String> {
    obj.get(key)
        .and_then(value_to_string)
        .filter(|s| !s.trim().is_empty())
}

fn first_present_string(obj: &serde_json::Map<String, Value>, keys: &[&str]) -> Option<String> {
    keys.iter().find_map(|key| scalar_text(obj, key))
}

fn first_present_timestamp(obj: &serde_json::Map<String, Value>, keys: &[&str]) -> Option<i64> {
    keys.iter().find_map(|key| {
        let value = obj.get(*key)?;
        match value {
            Value::Number(n) => n.as_i64().or_else(|| n.as_u64().map(|u| u as i64)),
            Value::String(s) => s.trim().parse::<i64>().ok(),
            _ => None,
        }
    })
}

fn value_to_string(value: &Value) -> Option<String> {
    match value {
        Value::String(s) if !s.trim().is_empty() => Some(s.clone()),
        Value::Number(_) | Value::Bool(_) => Some(value.to_string()),
        _ => None,
    }
}

fn content_to_text(v: &Value) -> Option<String> {
    match v {
        Value::String(s) if !s.trim().is_empty() => Some(s.clone()),
        Value::Array(arr) => {
            let parts: Vec<String> = arr
                .iter()
                .filter_map(|item| match item {
                    Value::String(s) if !s.trim().is_empty() => Some(s.clone()),
                    Value::Object(obj) => obj
                        .get("text")
                        .and_then(Value::as_str)
                        .or_else(|| obj.get("content").and_then(Value::as_str))
                        .map(str::to_string)
                        .filter(|s| !s.trim().is_empty()),
                    _ => None,
                })
                .collect();
            if parts.is_empty() {
                None
            } else {
                Some(parts.join("\n"))
            }
        }
        _ => None,
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn extracts_simple_string_content_with_role_prefix() {
        let jsonl = "{\"role\":\"user\",\"content\":\"hello there\"}\n";
        let chunks = extract_jsonl(jsonl);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].text, "[user] hello there");
        assert_eq!(chunks[0].ordinal, 0);
        assert_eq!(chunks[0].byte_start, 0);
        assert_eq!(chunks[0].byte_end, jsonl.len());
    }

    #[test]
    fn extracts_anthropic_content_block_array() {
        let jsonl = concat!(
            "{\"role\":\"assistant\",\"content\":[",
            "{\"type\":\"text\",\"text\":\"first part\"},",
            "{\"type\":\"text\",\"text\":\"second part\"}",
            "]}\n"
        );
        let chunks = extract_jsonl(jsonl);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].text, "[assistant] first part\nsecond part");
    }

    #[test]
    fn extracts_tool_and_turn_metadata_with_tool_prefix() {
        let jsonl = concat!(
            "{\"conversation_id\":\"sess-1\",\"turn\":\"turn-7\",\"tool_name\":\"search\",",
            "\"created_at\":\"1700000000\",\"content\":\"tool output\"}\n"
        );
        let chunks = extract_jsonl(jsonl);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].session_id.as_deref(), Some("sess-1"));
        assert_eq!(chunks[0].turn_id.as_deref(), Some("turn-7"));
        assert_eq!(chunks[0].tool_name.as_deref(), Some("search"));
        assert_eq!(chunks[0].timestamp_unix, Some(1_700_000_000));
        assert_eq!(chunks[0].text, "[tool:search] tool output");
        assert_eq!(chunks[0].role, None);
    }

    #[test]
    fn extracts_role_prefixed_message_with_timestamp() {
        let jsonl = "{\"role\":\"assistant\",\"timestamp\":1700000001,\"content\":\"hello\"}\n";
        let chunks = extract_jsonl(jsonl);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].role.as_deref(), Some("assistant"));
        assert_eq!(chunks[0].timestamp_unix, Some(1_700_000_001));
        assert_eq!(chunks[0].text, "[assistant] hello");
    }

    #[test]
    fn extracts_from_alternate_text_fields() {
        let jsonl = "{\"text\":\"alpha\"}\n{\"message\":\"beta\"}\n{\"body\":\"gamma\"}\n";
        let chunks = extract_jsonl(jsonl);
        assert_eq!(chunks.len(), 3);
        assert_eq!(chunks[0].text, "alpha");
        assert_eq!(chunks[1].text, "beta");
        assert_eq!(chunks[2].text, "gamma");
    }

    #[test]
    fn byte_ranges_point_to_source_lines() {
        let line_a = "{\"text\":\"aaa\"}\n";
        let line_b = "{\"text\":\"bbb\"}\n";
        let jsonl = format!("{line_a}{line_b}");
        let chunks = extract_jsonl(&jsonl);
        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0].byte_start, 0);
        assert_eq!(chunks[0].byte_end, line_a.len());
        assert_eq!(chunks[1].byte_start, line_a.len());
        assert_eq!(chunks[1].byte_end, line_a.len() + line_b.len());
    }

    #[test]
    fn skips_blank_and_malformed_and_empty_lines() {
        let jsonl = "\n{not json}\n{\"role\":\"u\",\"content\":\"\"}\n{\"text\":\"ok\"}\n";
        let chunks = extract_jsonl(jsonl);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].text, "ok");
        assert_eq!(chunks[0].ordinal, 0);
    }

    #[test]
    fn returns_empty_for_empty_input() {
        assert!(extract_jsonl("").is_empty());
    }

    #[test]
    fn ignores_lines_without_extractable_fields() {
        let jsonl = "{\"timestamp\":1234}\n{\"foo\":\"bar\"}\n";
        assert!(extract_jsonl(jsonl).is_empty());
    }

    #[test]
    fn missing_role_produces_unprefixed_text() {
        let jsonl = "{\"content\":\"no role here\"}\n";
        let chunks = extract_jsonl(jsonl);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].text, "no role here");
    }
}