use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;
use crate::model::entry::TranscriptEntry;
/// A non-fatal problem recorded for a single input line.
///
/// The parser in this file creates one of these for every non-blank line that
/// cannot be deserialized, instead of aborting the whole parse.
#[derive(Clone, Debug)]
pub struct ParseWarning {
    /// 1-based number of the offending line within the input.
    pub line: usize,
    /// Human-readable description of what went wrong on that line.
    pub message: String,
}
/// Everything produced by one parse: the entries that deserialized cleanly
/// plus a warning for each line that did not.
#[derive(Debug)]
pub struct ParseResult {
    /// Successfully parsed entries, in the order their lines were read.
    pub entries: Vec<TranscriptEntry>,
    /// One warning per line that failed to deserialize, in input order.
    pub warnings: Vec<ParseWarning>,
}
/// Parses the transcript file at `path`.
///
/// The file is opened, wrapped in a [`BufReader`], and handed to
/// [`parse_reader`], which does the actual line-by-line work.
///
/// # Errors
///
/// Returns the underlying [`std::io::Error`] if the file cannot be opened, or
/// any I/O error that [`parse_reader`] reports while reading it.
pub fn parse_file(path: &Path) -> Result<ParseResult, std::io::Error> {
    File::open(path).and_then(|file| parse_reader(BufReader::new(file)))
}
/// Parses line-delimited JSON transcript entries from `reader`.
///
/// Each line is handled independently:
///
/// * a leading U+FEFF byte-order mark is removed from the first line only;
/// * lines that are empty after trimming whitespace are skipped;
/// * every other line is deserialized as a [`TranscriptEntry`]. On failure a
///   [`ParseWarning`] carrying the 1-based line number is recorded and parsing
///   continues with the next line.
///
/// Skipped blank lines still count towards line numbering.
///
/// # Errors
///
/// Returns the first [`std::io::Error`] produced while reading a line; per
/// the `BufRead::lines` contract this includes input that is not valid UTF-8.
/// Deserialization failures are never returned as errors.
pub fn parse_reader<R: BufRead>(reader: R) -> Result<ParseResult, std::io::Error> {
    let mut parsed = ParseResult {
        entries: Vec::new(),
        warnings: Vec::new(),
    };

    for (index, raw) in reader.lines().enumerate() {
        let raw = raw?;
        let line_no = index + 1;

        // Only the very first line of the input can carry a byte-order mark.
        let text = if index == 0 {
            raw.strip_prefix('\u{FEFF}').unwrap_or(raw.as_str())
        } else {
            raw.as_str()
        };

        let record = text.trim();
        if record.is_empty() {
            continue;
        }

        match serde_json::from_str::<TranscriptEntry>(record) {
            Ok(entry) => parsed.entries.push(entry),
            // A bad line is reported, not fatal: keep going with the rest.
            Err(e) => parsed.warnings.push(ParseWarning {
                line: line_no,
                message: format!("failed to parse line {}: {}", line_no, e),
            }),
        }
    }

    Ok(parsed)
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// UUID shared by the tests that only need a single, arbitrary user entry.
    const USER_UUID: &str = "550e8400-e29b-41d4-a716-446655440000";

    /// Builds a one-line JSON `user` entry with the given `uuid` and a single
    /// text content block containing `text`.
    fn make_user_entry(uuid: &str, text: &str) -> String {
        serde_json::json!({
            "type": "user",
            "uuid": uuid,
            "timestamp": "2025-06-15T10:30:00Z",
            "sessionId": "session-1",
            "message": {"role": "user", "content": [{"type": "text", "text": text}]}
        })
        .to_string()
    }

    #[test]
    fn parse_empty_input() {
        let input = "";
        let result = parse_reader(Cursor::new(input)).unwrap();
        assert!(result.entries.is_empty());
        assert!(result.warnings.is_empty());
    }

    #[test]
    fn parse_blank_lines_only() {
        let input = "\n\n \n";
        let result = parse_reader(Cursor::new(input)).unwrap();
        assert!(result.entries.is_empty());
        assert!(result.warnings.is_empty());
    }

    #[test]
    fn parse_single_user_entry() {
        let input = make_user_entry(USER_UUID, "Hello");
        let result = parse_reader(Cursor::new(input)).unwrap();
        assert_eq!(result.entries.len(), 1);
        assert!(result.warnings.is_empty());
        assert_eq!(result.entries[0].entry_type(), "user");
    }

    #[test]
    fn parse_multiple_entries() {
        let user = make_user_entry(USER_UUID, "hi");
        let assistant = serde_json::json!({
            "type": "assistant",
            "uuid": "660e8400-e29b-41d4-a716-446655440000",
            "timestamp": "2025-06-15T10:30:05Z",
            "sessionId": "session-1",
            "message": {
                "role": "assistant",
                "content": [{"type": "text", "text": "Hello!"}]
            }
        })
        .to_string();
        let file = format!("{}\n{}\n", user, assistant);
        let result = parse_reader(Cursor::new(&file)).unwrap();
        assert_eq!(result.entries.len(), 2);
        assert!(result.warnings.is_empty());
        assert_eq!(result.entries[0].entry_type(), "user");
        assert_eq!(result.entries[1].entry_type(), "assistant");
    }

    #[test]
    fn malformed_line_collects_warning_not_abort() {
        let good = make_user_entry(USER_UUID, "hi");
        let bad = "this is not valid json";
        let file = format!("{}\n{}\n{}\n", good, bad, good);
        let result = parse_reader(Cursor::new(&file)).unwrap();
        assert_eq!(result.entries.len(), 2);
        assert_eq!(result.warnings.len(), 1);
        assert_eq!(result.warnings[0].line, 2);
        assert!(
            result.warnings[0].message.contains("failed to parse line"),
            "warning message should mention the line: {}",
            result.warnings[0].message
        );
    }

    #[test]
    fn bom_at_start_is_stripped() {
        let json = make_user_entry(USER_UUID, "hi");
        let with_bom = format!("\u{FEFF}{}", json);
        let result = parse_reader(Cursor::new(&with_bom)).unwrap();
        assert_eq!(result.entries.len(), 1);
        assert!(result.warnings.is_empty());
    }

    #[test]
    fn unknown_entry_type_parses_as_unknown_variant() {
        let input = serde_json::json!({
            "type": "future-type",
            "uuid": "550e8400-e29b-41d4-a716-446655440000",
            "timestamp": "2025-06-15T10:30:00Z",
            "sessionId": "session-1",
            "extraField": 42
        })
        .to_string();
        let result = parse_reader(Cursor::new(input)).unwrap();
        assert_eq!(result.entries.len(), 1);
        assert_eq!(result.entries[0].entry_type(), "future-type");
    }

    #[test]
    fn parse_header_line() {
        let input = serde_json::json!({
            "type": "system",
            "uuid": "00000000-0000-0000-0000-000000000001",
            "timestamp": "2025-06-15T10:29:00Z",
            "sessionId": "session-1",
            "system": {"version": "1.0.0"}
        })
        .to_string();
        let result = parse_reader(Cursor::new(input)).unwrap();
        assert_eq!(result.entries.len(), 1);
    }

    #[test]
    fn parse_file_round_trips_through_filesystem() {
        // Include the process id so concurrent test runs sharing the same temp
        // directory do not overwrite or delete each other's fixture.
        let file_name = format!("weavr_test_parse_file_{}.jsonl", std::process::id());
        let path = std::env::temp_dir().join(file_name);
        let json = make_user_entry(USER_UUID, "hello");
        std::fs::write(&path, &json).unwrap();

        let outcome = parse_file(&path);
        // Remove the fixture before asserting, so a failing assertion below
        // cannot leave the file behind.
        let _ = std::fs::remove_file(&path);

        let result = outcome.unwrap();
        assert_eq!(result.entries.len(), 1);
        assert!(result.warnings.is_empty());
    }

    #[test]
    fn parse_file_nonexistent_returns_io_error() {
        let result = parse_file(Path::new("/nonexistent/path/file.jsonl"));
        assert!(result.is_err());
    }

    #[test]
    fn entry_with_empty_type_field_is_unknown() {
        let input = r#"{"type": "", "uuid": "550e8400-e29b-41d4-a716-446655440000", "timestamp": "2025-06-15T10:30:00Z", "sessionId": "s1"}"#;
        let result = parse_reader(Cursor::new(input)).unwrap();
        assert_eq!(result.entries.len(), 1);
        assert_eq!(result.entries[0].entry_type(), "");
    }

    #[test]
    fn mixed_valid_invalid_lines_correct_counts() {
        let good1 = make_user_entry("550e8400-e29b-41d4-a716-446655440001", "msg1");
        let good2 = make_user_entry("550e8400-e29b-41d4-a716-446655440002", "msg2");
        // Line 1: good, 2: garbage, 3: blank, 4: good, 5: truncated JSON.
        let input = format!("{}\nnot json\n\n{}\n{{\"broken", good1, good2);
        let result = parse_reader(Cursor::new(&input)).unwrap();
        assert_eq!(result.entries.len(), 2);
        assert_eq!(result.warnings.len(), 2);
        // The skipped blank line must still advance the line counter.
        assert_eq!(result.warnings[0].line, 2);
        assert_eq!(result.warnings[1].line, 5);
    }

    #[test]
    fn bom_only_line_is_empty() {
        let input = "\u{FEFF}";
        let result = parse_reader(Cursor::new(input)).unwrap();
        assert!(result.entries.is_empty());
        assert!(result.warnings.is_empty());
    }

    #[test]
    fn parse_warning_displays_line_number() {
        // The bad record is deliberately NOT on line 1. Each record is handed
        // to serde_json on its own, so serde_json's error text reports
        // positions relative to that single line (e.g. "at line 1 column 2").
        // A bare `contains("line 1")` check would therefore pass even if the
        // parser's own numbering were wrong.
        let input = "\n\nthis is bad json\n";
        let result = parse_reader(Cursor::new(input)).unwrap();
        assert_eq!(result.warnings.len(), 1);
        assert_eq!(result.warnings[0].line, 3);
        assert!(
            result.warnings[0].message.contains("failed to parse line 3"),
            "unexpected warning message: {}",
            result.warnings[0].message
        );
    }
}