Skip to main content

three_dcf_core/
ingest.rs

1use std::collections::HashMap;
2use std::fs::{self, OpenOptions};
3use std::io::BufWriter;
4use std::path::{Path, PathBuf};
5
6use anyhow::Result;
7use serde_json::json;
8
9use crate::index::{CellRecord as IndexCellRecord, DocumentRecord, JsonlWriter, PageRecord};
10
11use crate::{document::CellType, Document, Encoder};
12
/// Settings that control how a source file is encoded during ingestion.
///
/// [`Default`] yields the `"reports"` preset with both OCR switches off,
/// `"eng"` as the only OCR language, and no source-path override.
#[derive(Clone, Debug)]
pub struct IngestOptions {
    /// Preset name handed to `Encoder::builder`.
    pub preset: String,
    /// Forwarded to the encoder builder's `enable_ocr` switch.
    pub enable_ocr: bool,
    /// Forwarded to the encoder builder's `force_ocr` switch.
    pub force_ocr: bool,
    /// OCR language codes; an empty list is replaced by `["eng"]` at ingest time.
    pub ocr_languages: Vec<String>,
    /// When set, this path (instead of the input path) is the one described
    /// by the index records written during ingestion.
    pub source_override: Option<PathBuf>,
}

impl Default for IngestOptions {
    fn default() -> Self {
        let preset = String::from("reports");
        let ocr_languages = vec![String::from("eng")];
        Self {
            preset,
            enable_ocr: false,
            force_ocr: false,
            ocr_languages,
            source_override: None,
        }
    }
}
33
34pub fn ingest_to_index(input_path: &Path, output_dir: &Path) -> Result<()> {
35    let opts = IngestOptions::default();
36    ingest_to_index_with_opts(input_path, output_dir, &opts)
37}
38
39pub fn ingest_to_index_with_opts(
40    input_path: &Path,
41    output_dir: &Path,
42    opts: &IngestOptions,
43) -> Result<()> {
44    fs::create_dir_all(output_dir.join("index"))?;
45    fs::create_dir_all(output_dir.join("raw/3dcf"))?;
46
47    let ocr_langs = if opts.ocr_languages.is_empty() {
48        vec!["eng".to_string()]
49    } else {
50        opts.ocr_languages.clone()
51    };
52    let builder = Encoder::builder(&opts.preset)?
53        .enable_ocr(opts.enable_ocr)
54        .force_ocr(opts.force_ocr)
55        .ocr_languages(ocr_langs);
56    let encoder = builder.build();
57    let (document, _metrics) = encoder.encode_path(input_path)?;
58    let source_path = opts.source_override.as_deref().unwrap_or(input_path);
59
60    let doc_id = next_doc_id(&output_dir.join("raw/3dcf"))?;
61
62    write_raw(&document, output_dir, &doc_id)?;
63    write_index_records(output_dir, &doc_id, source_path, &document)?;
64
65    Ok(())
66}
67
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::Value;
    use tempfile::tempdir;

    /// Ingesting one Markdown file into a fresh dataset must produce both raw
    /// artifacts for `doc_0001` and index records that reference that id.
    #[test]
    fn ingest_creates_raw_and_index_files() {
        let workspace = tempdir().unwrap();
        let input = workspace.path().join("sample.md");
        let markdown = format!("## Heading\n\n{}", "Body text ".repeat(50));
        std::fs::write(&input, markdown).unwrap();
        let output_dir = workspace.path().join("dataset");

        ingest_to_index(&input, &output_dir).unwrap();

        // Raw artifacts: binary container plus its JSON sidecar.
        let binary_artifact = output_dir.join("raw/3dcf/doc_0001.3dcf");
        let json_artifact = output_dir.join("raw/3dcf/doc_0001.3dcf.json");
        assert!(binary_artifact.exists());
        assert!(json_artifact.exists());

        // The first document record describes the ingested file.
        let documents = std::fs::read_to_string(output_dir.join("index/documents.jsonl")).unwrap();
        let doc_record: DocumentRecord =
            serde_json::from_str(documents.lines().next().unwrap()).unwrap();
        assert_eq!(doc_record.doc_id, "doc_0001");
        assert_eq!(doc_record.source_format, "md");

        // At least one cell record exists and points back at the document.
        let cells = std::fs::read_to_string(output_dir.join("index/cells.jsonl")).unwrap();
        assert!(!cells.is_empty());
        let first_cell: Value = serde_json::from_str(cells.lines().next().unwrap()).unwrap();
        let cell_doc_id = first_cell.get("doc_id").unwrap().as_str().unwrap();
        assert_eq!(cell_doc_id, "doc_0001");
    }
}
108
109fn write_raw(document: &Document, output_dir: &Path, doc_id: &str) -> Result<()> {
110    let raw_dir = output_dir.join("raw/3dcf");
111    fs::create_dir_all(&raw_dir)?;
112    let bin_path = raw_dir.join(format!("{doc_id}.3dcf"));
113    let json_path = raw_dir.join(format!("{doc_id}.3dcf.json"));
114    document.save_bin(&bin_path)?;
115    document.save_json(&json_path)?;
116    Ok(())
117}
118
119fn write_index_records(
120    output_dir: &Path,
121    doc_id: &str,
122    source_path: &Path,
123    document: &Document,
124) -> Result<()> {
125    let index_dir = output_dir.join("index");
126    fs::create_dir_all(&index_dir)?;
127    let documents_file = OpenOptions::new()
128        .create(true)
129        .append(true)
130        .open(index_dir.join("documents.jsonl"))?;
131    let pages_file = OpenOptions::new()
132        .create(true)
133        .append(true)
134        .open(index_dir.join("pages.jsonl"))?;
135    let cells_file = OpenOptions::new()
136        .create(true)
137        .append(true)
138        .open(index_dir.join("cells.jsonl"))?;
139
140    let mut documents_writer = JsonlWriter::new(BufWriter::new(documents_file));
141    let mut pages_writer = JsonlWriter::new(BufWriter::new(pages_file));
142    let mut cells_writer = JsonlWriter::new(BufWriter::new(cells_file));
143
144    let doc_record = DocumentRecord {
145        doc_id: doc_id.to_string(),
146        title: source_path
147            .file_stem()
148            .and_then(|stem| stem.to_str())
149            .map(|s| s.to_string()),
150        source_type: "files".to_string(),
151        source_format: source_path
152            .extension()
153            .and_then(|ext| ext.to_str())
154            .unwrap_or("unknown")
155            .to_lowercase(),
156        source_ref: source_path.display().to_string(),
157        tags: Vec::new(),
158    };
159    documents_writer.write_record(&doc_record)?;
160
161    let mut page_lookup = HashMap::new();
162    for (idx, page) in document.pages.iter().enumerate() {
163        let page_id = format!("{doc_id}_page_{:04}", idx + 1);
164        let page_text = document.decode_page_to_text(page.z);
165        let approx_tokens = if page_text.trim().is_empty() {
166            None
167        } else {
168            Some(page_text.split_whitespace().count() as u32)
169        };
170        let page_record = PageRecord {
171            page_id: page_id.clone(),
172            doc_id: doc_id.to_string(),
173            page_number: (idx + 1) as u32,
174            approx_tokens,
175            meta: json!({
176                "width_px": page.width_px,
177                "height_px": page.height_px,
178                "z": page.z,
179            }),
180        };
181        pages_writer.write_record(&page_record)?;
182        page_lookup.insert(page.z, page_id);
183    }
184
185    let ordered_cells = document.ordered_cells();
186    for (idx, cell) in ordered_cells.iter().enumerate() {
187        let cell_id = format!("{doc_id}_cell_{:06}", idx + 1);
188        let text = document
189            .payload_for(&cell.code_id)
190            .map(|s| s.to_string())
191            .unwrap_or_default();
192        let page_id = page_lookup
193            .get(&cell.z)
194            .cloned()
195            .unwrap_or_else(|| format!("{doc_id}_page_{:04}", cell.z + 1));
196        let bbox = Some([
197            cell.x as f32,
198            cell.y as f32,
199            (cell.x as f32) + cell.w as f32,
200            (cell.y as f32) + cell.h as f32,
201        ]);
202        let record = IndexCellRecord {
203            cell_id,
204            doc_id: doc_id.to_string(),
205            page_id,
206            kind: normalize_kind(cell.cell_type),
207            text,
208            importance: (cell.importance as f32) / 255.0,
209            bbox,
210            numguard: None,
211            meta: json!({
212                "rle": cell.rle,
213            }),
214        };
215        cells_writer.write_record(&record)?;
216    }
217
218    Ok(())
219}
220
221fn normalize_kind(cell_type: CellType) -> String {
222    match cell_type {
223        CellType::Text => "text",
224        CellType::Table => "table",
225        CellType::Figure => "figure",
226        CellType::Footer => "footer",
227        CellType::Header => "heading",
228    }
229    .to_string()
230}
231
232fn next_doc_id(raw_dir: &Path) -> Result<String> {
233    fs::create_dir_all(raw_dir)?;
234    let mut max_id = 0u32;
235    for entry in fs::read_dir(raw_dir)? {
236        let entry = entry?;
237        if !entry.path().is_file() {
238            continue;
239        }
240        if let Some(name) = entry.file_name().to_str() {
241            if let Some(stripped) = name.strip_prefix("doc_") {
242                if let Some(number_part) = stripped.strip_suffix(".3dcf") {
243                    if let Ok(num) = number_part.parse::<u32>() {
244                        max_id = max_id.max(num);
245                    }
246                }
247            }
248        }
249    }
250    Ok(format!("doc_{:04}", max_id + 1))
251}