1use std::io::Write;
27
28use anyhow::Result;
29use serde::{Deserialize, Serialize};
30use serde_json::Value;
31
32#[derive(Debug, Clone, Serialize, Deserialize, Default)]
34pub struct DocumentRecord {
35 pub doc_id: String,
36 pub title: Option<String>,
37 pub source_type: String,
38 pub source_format: String,
39 pub source_ref: String,
40 #[serde(default)]
41 pub tags: Vec<String>,
42}
43
44#[derive(Debug, Clone, Serialize, Deserialize, Default)]
46pub struct PageRecord {
47 pub page_id: String,
48 pub doc_id: String,
49 pub page_number: u32,
50 pub approx_tokens: Option<u32>,
51 #[serde(default)]
52 pub meta: Value,
53}
54
55#[derive(Debug, Clone, Serialize, Deserialize, Default)]
57pub struct CellRecord {
58 pub cell_id: String,
59 pub doc_id: String,
60 pub page_id: String,
61 pub kind: String,
62 pub text: String,
63 pub importance: f32,
64 pub bbox: Option<[f32; 4]>,
65 pub numguard: Option<Value>,
66 #[serde(default)]
67 pub meta: Value,
68}
69
/// Wrapper around a sink `W` used to emit newline-delimited JSON records.
///
/// The struct itself places no bound on `W`; the writing methods are available
/// when `W` implements [`std::io::Write`].
///
/// `Debug` is derived so the writer can be inspected with `{:?}` and embedded in
/// other `#[derive(Debug)]` types; the impl is only available when `W: Debug`.
#[derive(Debug)]
pub struct JsonlWriter<W> {
    /// The wrapped sink that receives the serialized bytes.
    writer: W,
}
74
75impl<W: Write> JsonlWriter<W> {
76 pub fn new(writer: W) -> Self {
78 Self { writer }
79 }
80
81 pub fn write_record<T: Serialize>(&mut self, record: &T) -> Result<()> {
83 let mut buf = serde_json::to_vec(record)?;
84 buf.push(b'\n');
85 self.writer.write_all(&buf)?;
86 Ok(())
87 }
88
89 pub fn flush(&mut self) -> Result<()> {
91 self.writer.flush()?;
92 Ok(())
93 }
94
95 pub fn into_inner(self) -> W {
97 self.writer
98 }
99}
100
#[cfg(test)]
mod tests {
    use super::*;

    /// A record written through the writer ends in a newline and parses back.
    #[test]
    fn jsonl_writer_roundtrips_records() {
        let original = DocumentRecord {
            doc_id: String::from("doc_1"),
            title: Some(String::from("Test")),
            source_type: String::from("files"),
            source_format: String::from("pdf"),
            source_ref: String::from("/tmp/input.pdf"),
            tags: vec![String::from("tag1")],
        };

        let mut sink = JsonlWriter::new(Vec::new());
        sink.write_record(&original).unwrap();
        let bytes = sink.into_inner();

        assert!(bytes.ends_with(b"\n"));

        // The trailing newline is left in place; the parser accepts trailing whitespace.
        let decoded: DocumentRecord = serde_json::from_slice(&bytes).unwrap();
        assert_eq!(decoded.doc_id, "doc_1");
        assert_eq!(decoded.title.as_deref(), Some("Test"));
    }

    /// Two records written in sequence end up on two separate lines, in order.
    #[test]
    fn jsonl_writer_multiple_records() {
        let mut sink = JsonlWriter::new(Vec::new());

        for id in ["doc_1", "doc_2"].iter().copied() {
            let record = DocumentRecord {
                doc_id: id.to_owned(),
                ..Default::default()
            };
            sink.write_record(&record).unwrap();
        }

        let bytes = sink.into_inner();
        let text = std::str::from_utf8(&bytes).unwrap();
        let lines: Vec<&str> = text.lines().collect();

        assert_eq!(lines.len(), 2);
        assert!(lines[0].contains("doc_1"));
        assert!(lines[1].contains("doc_2"));
    }
}