three-dcf-core 0.2.0

Document-to-dataset encoding library for LLM training data preparation. Converts PDF, Markdown, and HTML documents into structured formats optimized for machine learning.
Documentation
use std::collections::HashMap;
use std::fs::{self, OpenOptions};
use std::io::BufWriter;
use std::path::{Path, PathBuf};

use anyhow::Result;
use serde_json::json;

use crate::index::{CellRecord as IndexCellRecord, DocumentRecord, JsonlWriter, PageRecord};

use crate::{document::CellType, Document, Encoder};

/// Settings that control how a source file is encoded during ingestion.
#[derive(Debug, Clone)]
pub struct IngestOptions {
    /// Preset name passed to `Encoder::builder`.
    pub preset: String,
    /// Value forwarded to the encoder builder's `enable_ocr` setting.
    pub enable_ocr: bool,
    /// Value forwarded to the encoder builder's `force_ocr` setting.
    pub force_ocr: bool,
    /// OCR language codes; an empty list is replaced by `["eng"]` at ingest time.
    pub ocr_languages: Vec<String>,
    /// When set, index records describe this path instead of the ingested input path.
    pub source_override: Option<PathBuf>,
}

impl Default for IngestOptions {
    /// Returns the `"reports"` preset with both OCR flags off, `"eng"` as the
    /// only OCR language and no source override.
    fn default() -> Self {
        let preset = String::from("reports");
        let ocr_languages = vec![String::from("eng")];
        Self {
            preset,
            enable_ocr: false,
            force_ocr: false,
            ocr_languages,
            source_override: None,
        }
    }
}

/// Ingests `input_path` into the dataset rooted at `output_dir` using
/// `IngestOptions::default()`.
///
/// # Errors
///
/// Returns whatever error `ingest_to_index_with_opts` reports.
pub fn ingest_to_index(input_path: &Path, output_dir: &Path) -> Result<()> {
    ingest_to_index_with_opts(input_path, output_dir, &IngestOptions::default())
}

/// Encodes one input file and appends it to the dataset rooted at `output_dir`.
///
/// The function makes sure `<output_dir>/index` and `<output_dir>/raw/3dcf`
/// exist, encodes `input_path` with an encoder configured from `opts`,
/// allocates the next free document id, writes the raw document files and
/// finally appends the index records.
///
/// If `opts.source_override` is set, the index records describe that path
/// instead of `input_path`; the file that gets encoded is always `input_path`.
///
/// # Errors
///
/// Fails if a directory cannot be created, the encoder cannot be built or
/// cannot encode the input, or any of the output files cannot be written.
pub fn ingest_to_index_with_opts(
    input_path: &Path,
    output_dir: &Path,
    opts: &IngestOptions,
) -> Result<()> {
    let raw_dir = output_dir.join("raw/3dcf");
    fs::create_dir_all(output_dir.join("index"))?;
    fs::create_dir_all(&raw_dir)?;

    // An empty language list falls back to English.
    let mut languages = opts.ocr_languages.clone();
    if languages.is_empty() {
        languages.push("eng".to_string());
    }

    let encoder = Encoder::builder(&opts.preset)?
        .enable_ocr(opts.enable_ocr)
        .force_ocr(opts.force_ocr)
        .ocr_languages(languages)
        .build();
    let (document, _metrics) = encoder.encode_path(input_path)?;

    let source_path = match opts.source_override.as_deref() {
        Some(overridden) => overridden,
        None => input_path,
    };

    let doc_id = next_doc_id(&raw_dir)?;
    write_raw(&document, output_dir, &doc_id)?;
    write_index_records(output_dir, &doc_id, source_path, &document)
}

#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::Value;
    use tempfile::tempdir;

    /// Ingests a small Markdown file into an empty dataset directory and
    /// checks that the raw files and the first document/cell index records
    /// carry the id `doc_0001`.
    #[test]
    fn ingest_creates_raw_and_index_files() {
        let workspace = tempdir().unwrap();
        let input = workspace.path().join("sample.md");
        let markdown = format!("## Heading\n\n{}", "Body text ".repeat(50));
        std::fs::write(&input, markdown).unwrap();
        let output_dir = workspace.path().join("dataset");

        ingest_to_index(&input, &output_dir).unwrap();

        // Raw artifacts for the first document.
        let raw_dir = output_dir.join("raw/3dcf");
        assert!(raw_dir.join("doc_0001.3dcf").exists());
        assert!(raw_dir.join("doc_0001.3dcf.json").exists());

        // First line of the document index.
        let index_dir = output_dir.join("index");
        let docs_content = std::fs::read_to_string(index_dir.join("documents.jsonl")).unwrap();
        let first_doc_line = docs_content.lines().next().unwrap();
        let doc_record: DocumentRecord = serde_json::from_str(first_doc_line).unwrap();
        assert_eq!(doc_record.doc_id, "doc_0001");
        assert_eq!(doc_record.source_format, "md");

        // First line of the cell index.
        let cells_content = std::fs::read_to_string(index_dir.join("cells.jsonl")).unwrap();
        assert!(!cells_content.is_empty());
        let first_cell_line = cells_content.lines().next().unwrap();
        let first_cell: Value = serde_json::from_str(first_cell_line).unwrap();
        let cell_doc_id = first_cell.get("doc_id").unwrap().as_str().unwrap();
        assert_eq!(cell_doc_id, "doc_0001");
    }
}

/// Saves `document` under `<output_dir>/raw/3dcf/`, creating the directory if
/// needed.
///
/// Two files are produced: `<doc_id>.3dcf` via `Document::save_bin` and
/// `<doc_id>.3dcf.json` via `Document::save_json`.
///
/// # Errors
///
/// Fails if the directory cannot be created or either save call fails.
fn write_raw(document: &Document, output_dir: &Path, doc_id: &str) -> Result<()> {
    let target_dir = output_dir.join("raw/3dcf");
    fs::create_dir_all(&target_dir)?;

    let binary_file = target_dir.join(format!("{doc_id}.3dcf"));
    document.save_bin(&binary_file)?;

    let json_file = target_dir.join(format!("{doc_id}.3dcf.json"));
    document.save_json(&json_file)?;

    Ok(())
}

/// Appends the index records for one ingested document to the JSONL index.
///
/// Records are written to `documents.jsonl`, `pages.jsonl` and `cells.jsonl`
/// inside `<output_dir>/index` (created if missing). Each file is opened in
/// create+append mode, so earlier records are kept.
///
/// * `doc_id` - identifier stored on every record and used as the prefix of
///   the generated page and cell ids.
/// * `source_path` - supplies the document title (file stem), the source
///   format (lower-cased extension, `"unknown"` if absent) and the source
///   reference (displayed path).
/// * `document` - the encoded document whose pages and ordered cells are
///   indexed.
///
/// # Errors
///
/// Fails if the index directory or files cannot be created or opened, or if
/// writing any record fails.
fn write_index_records(
    output_dir: &Path,
    doc_id: &str,
    source_path: &Path,
    document: &Document,
) -> Result<()> {
    let index_dir = output_dir.join("index");
    fs::create_dir_all(&index_dir)?;

    // All three index files are opened the same way: created on first use,
    // appended to afterwards.
    let open_for_append = |file_name: &str| {
        OpenOptions::new()
            .create(true)
            .append(true)
            .open(index_dir.join(file_name))
    };
    let documents_file = open_for_append("documents.jsonl")?;
    let pages_file = open_for_append("pages.jsonl")?;
    let cells_file = open_for_append("cells.jsonl")?;

    // NOTE(review): the buffered writers are never flushed explicitly in this
    // function; whether `JsonlWriter` flushes per record is not visible from
    // here. Confirm, since a flush on drop would discard write errors.
    let mut documents_writer = JsonlWriter::new(BufWriter::new(documents_file));
    let mut pages_writer = JsonlWriter::new(BufWriter::new(pages_file));
    let mut cells_writer = JsonlWriter::new(BufWriter::new(cells_file));

    // --- document record ---------------------------------------------------
    let title = source_path
        .file_stem()
        .and_then(|stem| stem.to_str())
        .map(str::to_owned);
    let source_format = match source_path.extension().and_then(|ext| ext.to_str()) {
        Some(ext) => ext.to_lowercase(),
        None => String::from("unknown"),
    };
    documents_writer.write_record(&DocumentRecord {
        doc_id: doc_id.to_string(),
        title,
        source_type: "files".to_string(),
        source_format,
        source_ref: source_path.display().to_string(),
        tags: Vec::new(),
    })?;

    // --- page records ------------------------------------------------------
    // Remember which page id belongs to each page `z` value so cells can be
    // linked to their page below.
    let mut page_ids_by_z = HashMap::new();
    for (page_index, page) in document.pages.iter().enumerate() {
        let page_number = page_index + 1;
        let page_id = format!("{doc_id}_page_{page_number:04}");

        // Token estimate is the whitespace-separated word count; pages with
        // no words get `None` rather than zero.
        let page_text = document.decode_page_to_text(page.z);
        let approx_tokens = match page_text.split_whitespace().count() {
            0 => None,
            words => Some(words as u32),
        };

        pages_writer.write_record(&PageRecord {
            page_id: page_id.clone(),
            doc_id: doc_id.to_string(),
            page_number: page_number as u32,
            approx_tokens,
            meta: json!({
                "width_px": page.width_px,
                "height_px": page.height_px,
                "z": page.z,
            }),
        })?;
        page_ids_by_z.insert(page.z, page_id);
    }

    // --- cell records ------------------------------------------------------
    let cells_in_order = document.ordered_cells();
    for (cell_index, cell) in cells_in_order.iter().enumerate() {
        let cell_number = cell_index + 1;
        let cell_id = format!("{doc_id}_cell_{cell_number:06}");

        // Cells without a payload are indexed with empty text.
        let text = document
            .payload_for(&cell.code_id)
            .map_or_else(String::new, |payload| payload.to_string());

        // If no page with the cell's `z` was recorded, derive a page id from
        // `z + 1` instead.
        let page_id = match page_ids_by_z.get(&cell.z) {
            Some(known) => known.clone(),
            None => format!("{doc_id}_page_{:04}", cell.z + 1),
        };

        // Bounding box as [x, y, x + w, y + h].
        let left = cell.x as f32;
        let top = cell.y as f32;
        let bbox = Some([left, top, left + cell.w as f32, top + cell.h as f32]);

        cells_writer.write_record(&IndexCellRecord {
            cell_id,
            doc_id: doc_id.to_string(),
            page_id,
            kind: normalize_kind(cell.cell_type),
            text,
            // Scale the stored importance by 1/255.
            importance: (cell.importance as f32) / 255.0,
            bbox,
            numguard: None,
            meta: json!({
                "rle": cell.rle,
            }),
        })?;
    }

    Ok(())
}

/// Maps a [`CellType`] to the `kind` string stored in cell index records.
///
/// Note that `CellType::Header` is written as `"heading"`.
fn normalize_kind(cell_type: CellType) -> String {
    let label: &'static str = match cell_type {
        CellType::Text => "text",
        CellType::Table => "table",
        CellType::Figure => "figure",
        CellType::Footer => "footer",
        CellType::Header => "heading",
    };
    label.to_owned()
}

/// Returns the next unused document id for `raw_dir`, formatted as
/// `doc_NNNN` (zero-padded to at least four digits).
///
/// The directory is created if it does not exist. Only regular files named
/// `doc_<number>.3dcf` are considered; the result is one more than the
/// largest `<number>` found, or `doc_0001` when there is none.
///
/// # Errors
///
/// Fails if the directory cannot be created or read.
fn next_doc_id(raw_dir: &Path) -> Result<String> {
    fs::create_dir_all(raw_dir)?;

    let mut highest = 0u32;
    for entry in fs::read_dir(raw_dir)? {
        let entry = entry?;
        if !entry.path().is_file() {
            continue;
        }

        // Names that are not valid UTF-8, or do not match `doc_<u32>.3dcf`,
        // are ignored.
        let file_name = entry.file_name();
        let parsed = file_name
            .to_str()
            .and_then(|name| name.strip_prefix("doc_"))
            .and_then(|rest| rest.strip_suffix(".3dcf"))
            .and_then(|digits| digits.parse::<u32>().ok());
        if let Some(number) = parsed {
            highest = highest.max(number);
        }
    }

    Ok(format!("doc_{:04}", highest + 1))
}