Skip to main content

three_dcf_core/
index.rs

1//! Index types for JSONL output and dataset pipelines.
2//!
3//! This module provides data structures for exporting documents to JSONL format,
4//! suitable for downstream ML pipelines and vector databases.
5//!
6//! # Example
7//!
8//! ```rust,no_run
9//! use three_dcf_core::index::{DocumentRecord, JsonlWriter};
10//! use std::fs::File;
11//!
12//! let file = File::create("output.jsonl")?;
13//! let mut writer = JsonlWriter::new(file);
14//!
15//! writer.write_record(&DocumentRecord {
16//!     doc_id: "doc_001".to_string(),
17//!     title: Some("Annual Report 2024".to_string()),
18//!     source_type: "files".to_string(),
19//!     source_format: "pdf".to_string(),
20//!     source_ref: "/data/reports/annual_2024.pdf".to_string(),
21//!     tags: vec!["finance".to_string(), "annual".to_string()],
22//! })?;
23//! # Ok::<(), anyhow::Error>(())
24//! ```
25
26use std::io::Write;
27
28use anyhow::Result;
29use serde::{Deserialize, Serialize};
30use serde_json::Value;
31
32/// Metadata record for a processed document.
33#[derive(Debug, Clone, Serialize, Deserialize, Default)]
34pub struct DocumentRecord {
35    pub doc_id: String,
36    pub title: Option<String>,
37    pub source_type: String,
38    pub source_format: String,
39    pub source_ref: String,
40    #[serde(default)]
41    pub tags: Vec<String>,
42}
43
44/// Metadata record for a single page within a document.
45#[derive(Debug, Clone, Serialize, Deserialize, Default)]
46pub struct PageRecord {
47    pub page_id: String,
48    pub doc_id: String,
49    pub page_number: u32,
50    pub approx_tokens: Option<u32>,
51    #[serde(default)]
52    pub meta: Value,
53}
54
55/// Record for a single cell (text block) within a page.
56#[derive(Debug, Clone, Serialize, Deserialize, Default)]
57pub struct CellRecord {
58    pub cell_id: String,
59    pub doc_id: String,
60    pub page_id: String,
61    pub kind: String,
62    pub text: String,
63    pub importance: f32,
64    pub bbox: Option<[f32; 4]>,
65    pub numguard: Option<Value>,
66    #[serde(default)]
67    pub meta: Value,
68}
69
/// A streaming JSONL writer for efficient dataset export.
///
/// Wraps any writer `W` and emits one JSON document per line. The wrapper adds
/// no buffering of its own, so hand it a buffered writer if the underlying sink
/// is expensive to write to.
#[derive(Debug)]
pub struct JsonlWriter<W> {
    /// Destination that receives the serialized lines.
    writer: W,
}
74
75impl<W: Write> JsonlWriter<W> {
76    /// Create a new JSONL writer wrapping the given writer.
77    pub fn new(writer: W) -> Self {
78        Self { writer }
79    }
80
81    /// Write a single record as a JSON line.
82    pub fn write_record<T: Serialize>(&mut self, record: &T) -> Result<()> {
83        let mut buf = serde_json::to_vec(record)?;
84        buf.push(b'\n');
85        self.writer.write_all(&buf)?;
86        Ok(())
87    }
88
89    /// Flush the underlying writer.
90    pub fn flush(&mut self) -> Result<()> {
91        self.writer.flush()?;
92        Ok(())
93    }
94
95    /// Consume the writer and return the inner writer.
96    pub fn into_inner(self) -> W {
97        self.writer
98    }
99}
100
#[cfg(test)]
mod tests {
    use super::*;

    /// A record written through the writer ends with a newline and parses back
    /// into an equivalent `DocumentRecord`.
    #[test]
    fn jsonl_writer_roundtrips_records() {
        let mut sink = JsonlWriter::new(Vec::new());
        sink.write_record(&DocumentRecord {
            doc_id: "doc_1".to_string(),
            title: Some("Test".to_string()),
            source_type: "files".to_string(),
            source_format: "pdf".to_string(),
            source_ref: "/tmp/input.pdf".to_string(),
            tags: vec!["tag1".to_string()],
        })
        .unwrap();

        let bytes = sink.into_inner();
        assert!(bytes.ends_with(b"\n"));

        // The trailing newline is whitespace to the JSON parser, so the whole
        // buffer can be decoded directly.
        let decoded: DocumentRecord = serde_json::from_slice(&bytes).unwrap();
        assert_eq!(decoded.doc_id, "doc_1");
        assert_eq!(decoded.title.unwrap(), "Test");
    }

    /// Consecutive writes produce one line per record, in write order.
    #[test]
    fn jsonl_writer_multiple_records() {
        let mut sink = JsonlWriter::new(Vec::new());
        for id in ["doc_1", "doc_2"] {
            sink.write_record(&DocumentRecord {
                doc_id: id.to_string(),
                ..Default::default()
            })
            .unwrap();
        }

        let bytes = sink.into_inner();
        let text = std::str::from_utf8(&bytes).unwrap();
        let lines: Vec<&str> = text.lines().collect();

        assert_eq!(lines.len(), 2);
        assert!(lines[0].contains("doc_1"));
        assert!(lines[1].contains("doc_2"));
    }
}