use std::env;
use std::fs::{self, File};
use std::io::{BufWriter, Write};
use std::path::{Path, PathBuf};
use three_dcf_core::index::CellRecord as IndexCellRecord;
use three_dcf_core::prelude::*;
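
// Batch-processes every supported document in a directory: each file is encoded,
// written to the JSONL indexes (documents/pages/cells), and serialized to a
// `.3dcf.txt` text file under `<input_dir>/output`.
//
// Usage: cargo run --example batch_process -- <directory>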
fn main() -> Result<()> {
    tracing_subscriber::fmt::init();

    let args: Vec<String> = env::args().collect();
    let input_dir = args
        .get(1)
        .map(PathBuf::from)
        .unwrap_or_else(|| PathBuf::from("./documents"));

    if !input_dir.is_dir() {
        eprintln!("Directory not found: {}", input_dir.display());
        eprintln!("Usage: cargo run --example batch_process -- <directory>");
        std::process::exit(1);
    }
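
    // Gather candidate files up front so progress can be reported as i/N.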
    let files = collect_documents(&input_dir)?;
    println!("Found {} documents to process", files.len());
    if files.is_empty() {
        println!("No supported files found (.pdf, .md, .markdown, .html, .htm, .txt)");
        return Ok(());
    }
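
    // Three JSONL sinks: document-, page-, and cell-level index records.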
    let output_dir = input_dir.join("output");
    fs::create_dir_all(&output_dir)?;

    let docs_file = BufWriter::new(File::create(output_dir.join("documents.jsonl"))?);
    let pages_file = BufWriter::new(File::create(output_dir.join("pages.jsonl"))?);
    let cells_file = BufWriter::new(File::create(output_dir.join("cells.jsonl"))?);
    let mut docs_writer = JsonlWriter::new(docs_file);
    let mut pages_writer = JsonlWriter::new(pages_file);
    let mut cells_writer = JsonlWriter::new(cells_file);
    let encoder = Encoder::from_preset("reports")?;
    let serializer = TextSerializer::new();

    let mut total_docs = 0u32;
    let mut total_pages = 0u32;
    let mut total_cells = 0u32;
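
    // Process each file independently; a failed file is reported and skipped
    // rather than aborting the whole batch.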
    for (idx, path) in files.iter().enumerate() {
        let doc_id = format!("doc_{:06}", idx);
        let filename = path.file_name().unwrap().to_string_lossy();
        print!("[{}/{}] Processing {}...", idx + 1, files.len(), filename);
        // Flush so the progress message appears before (potentially slow) encoding.
        std::io::stdout().flush()?;

        match encoder.encode_path(path) {
            Ok((document, metrics)) => {
                println!(" {} pages, {} cells", metrics.pages, metrics.cells_kept);
                docs_writer.write_record(&DocumentRecord {
                    doc_id: doc_id.clone(),
                    title: Some(filename.to_string()),
                    source_type: "files".to_string(),
                    source_format: get_format(path),
                    source_ref: path.to_string_lossy().to_string(),
                    tags: vec![],
                })?;
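
                // One page record per encoded page; the page's `z` value doubles as
                // its page number.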
                for (page_idx, page_info) in document.pages.iter().enumerate() {
                    let page_id = format!("{}_page_{}", doc_id, page_idx);
                    pages_writer.write_record(&PageRecord {
                        page_id: page_id.clone(),
                        doc_id: doc_id.clone(),
                        page_number: page_info.z,
                        approx_tokens: None,
                        meta: serde_json::Value::Null,
                    })?;
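
                    // Cells belong to a page when they share its `z` coordinate.
                    // Importance appears to be stored on a 0-100 scale, so it is
                    // normalized to 0.0-1.0 for the index record.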
                    for (cell_idx, cell) in document
                        .cells
                        .iter()
                        .filter(|c| c.z == page_info.z)
                        .enumerate()
                    {
                        let cell_id = format!("{}_cell_{}", page_id, cell_idx);
                        let text = document.payload_for(&cell.code_id).unwrap_or_default();
                        cells_writer.write_record(&IndexCellRecord {
                            cell_id,
                            doc_id: doc_id.clone(),
                            page_id: page_id.clone(),
                            kind: format!("{:?}", cell.cell_type).to_lowercase(),
                            text: text.to_string(),
                            importance: cell.importance as f32 / 100.0,
                            bbox: Some([
                                cell.x as f32,
                                cell.y as f32,
                                cell.w as f32,
                                cell.h as f32,
                            ]),
                            numguard: None,
                            meta: serde_json::Value::Null,
                        })?;
                    }
                }
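
                // Write the full text serialization of the encoded document alongside
                // the index records.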
                let text_output = serializer.to_string(&document)?;
                let output_path = output_dir.join(format!("{}.3dcf.txt", doc_id));
                fs::write(&output_path, text_output)?;

                total_docs += 1;
                total_pages += metrics.pages;
                total_cells += metrics.cells_kept;
            }
            Err(e) => {
                println!(" ERROR: {}", e);
            }
        }
    }
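
    // Flush the buffered writers so every record is on disk before the summary.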
    docs_writer.flush()?;
    pages_writer.flush()?;
    cells_writer.flush()?;

    println!("\n=== Summary ===");
    println!("Documents processed: {} of {}", total_docs, files.len());
    println!("Pages: {}", total_pages);
    println!("Cells: {}", total_cells);
    println!("Output: {}", output_dir.display());

    Ok(())
}
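
/// Collects supported document files (non-recursive) from `dir`, sorted by path so
/// that generated doc IDs are stable across runs.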
fn collect_documents(dir: &Path) -> Result<Vec<PathBuf>> {
    let mut files = Vec::new();
    let extensions = ["pdf", "md", "markdown", "html", "htm", "txt"];

    for entry in fs::read_dir(dir)? {
        let entry = entry?;
        let path = entry.path();
        if path.is_file() {
            if let Some(ext) = path.extension() {
                if extensions.contains(&ext.to_string_lossy().to_lowercase().as_str()) {
                    files.push(path);
                }
            }
        }
    }

    files.sort();
    Ok(files)
}
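
/// Returns the lowercased file extension as the source format, or "unknown" when the
/// path has no extension.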
fn get_format(path: &Path) -> String {
    path.extension()
        .map(|e| e.to_string_lossy().to_lowercase())
        .unwrap_or_else(|| "unknown".to_string())
}