1use std::collections::HashMap;
2use std::fs::{self, OpenOptions};
3use std::io::BufWriter;
4use std::path::{Path, PathBuf};
5
6use anyhow::Result;
7use serde_json::json;
8
9use crate::index::{CellRecord as IndexCellRecord, DocumentRecord, JsonlWriter, PageRecord};
10
11use crate::{document::CellType, Document, Encoder};
12
/// Settings that control how a single source file is encoded and indexed.
#[derive(Debug, Clone)]
pub struct IngestOptions {
    /// Name of the encoder preset handed to `Encoder::builder`.
    pub preset: String,
    /// Forwarded to the encoder builder's `enable_ocr` switch.
    pub enable_ocr: bool,
    /// Forwarded to the encoder builder's `force_ocr` switch.
    pub force_ocr: bool,
    /// OCR language codes; an empty list is replaced by `["eng"]` at ingest time.
    pub ocr_languages: Vec<String>,
    /// When set, this path (instead of the ingested file's path) is used for
    /// the source metadata written to the document index.
    pub source_override: Option<PathBuf>,
}

impl Default for IngestOptions {
    /// Defaults: the `"reports"` preset, OCR disabled, English as the only
    /// OCR language, and no source override.
    fn default() -> Self {
        IngestOptions {
            preset: String::from("reports"),
            enable_ocr: false,
            force_ocr: false,
            ocr_languages: vec![String::from("eng")],
            source_override: None,
        }
    }
}
33
34pub fn ingest_to_index(input_path: &Path, output_dir: &Path) -> Result<()> {
35 let opts = IngestOptions::default();
36 ingest_to_index_with_opts(input_path, output_dir, &opts)
37}
38
39pub fn ingest_to_index_with_opts(
40 input_path: &Path,
41 output_dir: &Path,
42 opts: &IngestOptions,
43) -> Result<()> {
44 fs::create_dir_all(output_dir.join("index"))?;
45 fs::create_dir_all(output_dir.join("raw/3dcf"))?;
46
47 let ocr_langs = if opts.ocr_languages.is_empty() {
48 vec!["eng".to_string()]
49 } else {
50 opts.ocr_languages.clone()
51 };
52 let builder = Encoder::builder(&opts.preset)?
53 .enable_ocr(opts.enable_ocr)
54 .force_ocr(opts.force_ocr)
55 .ocr_languages(ocr_langs);
56 let encoder = builder.build();
57 let (document, _metrics) = encoder.encode_path(input_path)?;
58 let source_path = opts.source_override.as_deref().unwrap_or(input_path);
59
60 let doc_id = next_doc_id(&output_dir.join("raw/3dcf"))?;
61
62 write_raw(&document, output_dir, &doc_id)?;
63 write_index_records(output_dir, &doc_id, source_path, &document)?;
64
65 Ok(())
66}
67
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::Value;
    use tempfile::tempdir;

    /// Ingesting one markdown file into an empty dataset must produce the
    /// first raw artifact pair and index records tagged with `doc_0001`.
    #[test]
    fn ingest_creates_raw_and_index_files() {
        let workspace = tempdir().unwrap();
        let input = workspace.path().join("sample.md");
        let mut markdown = "## Heading\n\n".to_string();
        markdown.push_str(&"Body text ".repeat(50));
        std::fs::write(&input, markdown).unwrap();
        let output_dir = workspace.path().join("dataset");

        ingest_to_index(&input, &output_dir).unwrap();

        // Both raw artifacts exist under the first allocated id.
        for artifact in ["raw/3dcf/doc_0001.3dcf", "raw/3dcf/doc_0001.3dcf.json"] {
            assert!(output_dir.join(artifact).exists());
        }

        // The first document record carries the id and the lower-cased extension.
        let documents = std::fs::read_to_string(output_dir.join("index/documents.jsonl")).unwrap();
        let doc_record: DocumentRecord =
            serde_json::from_str(documents.lines().next().unwrap()).unwrap();
        assert_eq!(doc_record.doc_id, "doc_0001");
        assert_eq!(doc_record.source_format, "md");

        // At least one cell was written, and it points back at the document.
        let cells = std::fs::read_to_string(output_dir.join("index/cells.jsonl")).unwrap();
        assert!(!cells.is_empty());
        let first_cell: Value = serde_json::from_str(cells.lines().next().unwrap()).unwrap();
        assert_eq!(
            first_cell.get("doc_id").and_then(Value::as_str),
            Some("doc_0001")
        );
    }
}
108
109fn write_raw(document: &Document, output_dir: &Path, doc_id: &str) -> Result<()> {
110 let raw_dir = output_dir.join("raw/3dcf");
111 fs::create_dir_all(&raw_dir)?;
112 let bin_path = raw_dir.join(format!("{doc_id}.3dcf"));
113 let json_path = raw_dir.join(format!("{doc_id}.3dcf.json"));
114 document.save_bin(&bin_path)?;
115 document.save_json(&json_path)?;
116 Ok(())
117}
118
119fn write_index_records(
120 output_dir: &Path,
121 doc_id: &str,
122 source_path: &Path,
123 document: &Document,
124) -> Result<()> {
125 let index_dir = output_dir.join("index");
126 fs::create_dir_all(&index_dir)?;
127 let documents_file = OpenOptions::new()
128 .create(true)
129 .append(true)
130 .open(index_dir.join("documents.jsonl"))?;
131 let pages_file = OpenOptions::new()
132 .create(true)
133 .append(true)
134 .open(index_dir.join("pages.jsonl"))?;
135 let cells_file = OpenOptions::new()
136 .create(true)
137 .append(true)
138 .open(index_dir.join("cells.jsonl"))?;
139
140 let mut documents_writer = JsonlWriter::new(BufWriter::new(documents_file));
141 let mut pages_writer = JsonlWriter::new(BufWriter::new(pages_file));
142 let mut cells_writer = JsonlWriter::new(BufWriter::new(cells_file));
143
144 let doc_record = DocumentRecord {
145 doc_id: doc_id.to_string(),
146 title: source_path
147 .file_stem()
148 .and_then(|stem| stem.to_str())
149 .map(|s| s.to_string()),
150 source_type: "files".to_string(),
151 source_format: source_path
152 .extension()
153 .and_then(|ext| ext.to_str())
154 .unwrap_or("unknown")
155 .to_lowercase(),
156 source_ref: source_path.display().to_string(),
157 tags: Vec::new(),
158 };
159 documents_writer.write_record(&doc_record)?;
160
161 let mut page_lookup = HashMap::new();
162 for (idx, page) in document.pages.iter().enumerate() {
163 let page_id = format!("{doc_id}_page_{:04}", idx + 1);
164 let page_text = document.decode_page_to_text(page.z);
165 let approx_tokens = if page_text.trim().is_empty() {
166 None
167 } else {
168 Some(page_text.split_whitespace().count() as u32)
169 };
170 let page_record = PageRecord {
171 page_id: page_id.clone(),
172 doc_id: doc_id.to_string(),
173 page_number: (idx + 1) as u32,
174 approx_tokens,
175 meta: json!({
176 "width_px": page.width_px,
177 "height_px": page.height_px,
178 "z": page.z,
179 }),
180 };
181 pages_writer.write_record(&page_record)?;
182 page_lookup.insert(page.z, page_id);
183 }
184
185 let ordered_cells = document.ordered_cells();
186 for (idx, cell) in ordered_cells.iter().enumerate() {
187 let cell_id = format!("{doc_id}_cell_{:06}", idx + 1);
188 let text = document
189 .payload_for(&cell.code_id)
190 .map(|s| s.to_string())
191 .unwrap_or_default();
192 let page_id = page_lookup
193 .get(&cell.z)
194 .cloned()
195 .unwrap_or_else(|| format!("{doc_id}_page_{:04}", cell.z + 1));
196 let bbox = Some([
197 cell.x as f32,
198 cell.y as f32,
199 (cell.x as f32) + cell.w as f32,
200 (cell.y as f32) + cell.h as f32,
201 ]);
202 let record = IndexCellRecord {
203 cell_id,
204 doc_id: doc_id.to_string(),
205 page_id,
206 kind: normalize_kind(cell.cell_type),
207 text,
208 importance: (cell.importance as f32) / 255.0,
209 bbox,
210 numguard: None,
211 meta: json!({
212 "rle": cell.rle,
213 }),
214 };
215 cells_writer.write_record(&record)?;
216 }
217
218 Ok(())
219}
220
221fn normalize_kind(cell_type: CellType) -> String {
222 match cell_type {
223 CellType::Text => "text",
224 CellType::Table => "table",
225 CellType::Figure => "figure",
226 CellType::Footer => "footer",
227 CellType::Header => "heading",
228 }
229 .to_string()
230}
231
232fn next_doc_id(raw_dir: &Path) -> Result<String> {
233 fs::create_dir_all(raw_dir)?;
234 let mut max_id = 0u32;
235 for entry in fs::read_dir(raw_dir)? {
236 let entry = entry?;
237 if !entry.path().is_file() {
238 continue;
239 }
240 if let Some(name) = entry.file_name().to_str() {
241 if let Some(stripped) = name.strip_prefix("doc_") {
242 if let Some(number_part) = stripped.strip_suffix(".3dcf") {
243 if let Ok(num) = number_part.parse::<u32>() {
244 max_id = max_id.max(num);
245 }
246 }
247 }
248 }
249 }
250 Ok(format!("doc_{:04}", max_id + 1))
251}