// three_dcf_core/lib.rs
//! # three-dcf-core
//!
//! A high-performance library for encoding documents into structured datasets
//! optimized for LLM training and retrieval-augmented generation (RAG).
//!
//! ## Overview
//!
//! `three-dcf-core` converts various document formats (PDF, Markdown, HTML, images)
//! into a normalized, cell-based representation that preserves document structure
//! while being optimized for machine learning workloads.
//!
//! ## Quick Start
//!
//! ```rust,no_run
//! use three_dcf_core::prelude::*;
//!
//! fn main() -> Result<()> {
//!     // Encode a PDF document
//!     let encoder = Encoder::from_preset("reports")?;
//!     let (document, metrics) = encoder.encode_path("report.pdf")?;
//!
//!     println!("Processed {} pages, {} cells", metrics.pages, metrics.cells_kept);
//!
//!     // Serialize to text format for LLM context
//!     let serializer = TextSerializer::new();
//!     let output = serializer.to_string(&document)?;
//!
//!     Ok(())
//! }
//! ```
//!
//! ## Encoder Presets
//!
//! | Preset | Use Case | Page Size |
//! |--------|----------|-----------|
//! | `reports` | Business documents, papers | 1024×1400 |
//! | `slides` | Presentations | 1920×1080 |
//! | `news` | Articles, blogs | 1100×1600 |
//! | `scans` | Scanned documents | 1400×2000 |
//!
//! ## Features
//!
//! - **`text`** (default): Basic text/markdown/HTML processing
//! - **`pdfium`**: Native PDF rendering via pdfium for better extraction
//! - **`ocr`**: Optical character recognition via Tesseract
//! - **`full`**: All features enabled
//!
//! ## Architecture
//!
//! The encoding pipeline:
//!
//! 1. **Input** → Document loaded from file (PDF/MD/HTML/image)
//! 2. **Parse** → Extract pages and text content
//! 3. **Normalize** → Apply hyphenation rules, detect structure
//! 4. **Classify** → Identify cell types (text, table, code, header)
//! 5. **Score** → Calculate importance scores for ranking
//! 6. **Deduplicate** → Hash-based deduplication across pages
//! 7. **Output** → `Document` with cells, dictionary, and metadata
//!
//! ## Output Formats
//!
//! - **TextSerializer**: Human-readable format for LLM context windows
//! - **JsonlWriter**: JSONL output for dataset pipelines
//! - **Protobuf**: Binary format via `proto` module
//!
//! ## Example: Custom Configuration
//!
//! ```rust,no_run
//! use three_dcf_core::{EncoderBuilder, HyphenationMode, ImportanceTuning};
//!
//! let encoder = EncoderBuilder::new("reports")?
//!     .budget(Some(4096))   // Token budget
//!     .drop_footers(true)   // Remove page footers
//!     .dedup_window(5)      // Dedup across 5 pages
//!     .hyphenation(HyphenationMode::Preserve)
//!     .importance_tuning(ImportanceTuning {
//!         header_boost: 1.5,
//!         table_boost: 1.2,
//!         ..Default::default()
//!     })
//!     .build();
//! # Ok::<(), three_dcf_core::DcfError>(())
//! ```
//!
//! ## Chunking for RAG
//!
//! ```rust,no_run
//! use three_dcf_core::{Chunker, ChunkConfig, ChunkMode};
//!
//! # fn demo(document: three_dcf_core::Document) {
//! let chunker = Chunker::new(ChunkConfig {
//!     mode: ChunkMode::Semantic,
//!     target_tokens: 512,
//!     overlap_tokens: 64,
//!     ..Default::default()
//! });
//!
//! let chunks = chunker.chunk(&document);
//! # let _ = chunks;
//! # }
//! ```

100#![cfg_attr(docsrs, feature(doc_cfg))]
101
102/// Protobuf-generated types for binary serialization
103pub mod proto {
104 include!(concat!(env!("OUT_DIR"), "/dcf.v1.rs"));
105}
106
107/// Index types for JSONL output (merged from three_dcf_index)
108pub mod index;
109
110/// Prelude for convenient imports
111pub mod prelude;
112
113mod bench;
114mod chunk;
115mod decoder;
116mod document;
117mod embedding;
118mod encoder;
119mod error;
120mod ingest;
121mod metrics;
122mod normalization;
123mod numguard;
124mod ocr;
125mod serializer;
126mod stats;
127
128// Re-exports for public API
129pub use bench::{BenchConfig, BenchMode, BenchResult, BenchRunner, CorpusMetrics};
130pub use chunk::{ChunkConfig, ChunkMode, ChunkRecord, Chunker};
131pub use decoder::Decoder;
132pub use document::{
133 hash_payload, CellRecord, CellType, CodeHash, Document, Header, NumGuard, NumGuardAlert,
134 NumGuardIssue, PageInfo,
135};
136pub use embedding::{EmbeddingRecord, HashEmbedder, HashEmbedderConfig};
137pub use encoder::{EncodeInput, Encoder, EncoderBuilder, EncoderPreset};
138pub use error::{DcfError, Result};
139pub use ingest::{ingest_to_index, ingest_to_index_with_opts, IngestOptions};
140pub use metrics::{cer, numeric_stats, wer, Metrics, NumStats, TokenMetrics};
141pub use normalization::{HyphenationMode, ImportanceTuning};
142pub use serializer::{TableMode, TextSerializer, TextSerializerConfig};
143pub use stats::{estimate_tokens, Stats, TokenizerKind};
144
145// Re-export index types at crate root for convenience
146pub use index::{DocumentRecord, JsonlWriter, PageRecord};
147// Note: index::CellRecord conflicts with document::CellRecord; refer to it via `index::CellRecord` explicitly.