// three_dcf_core/lib.rs
//! # three-dcf-core
//!
//! A high-performance library for encoding documents into structured datasets
//! optimized for LLM training and retrieval-augmented generation (RAG).
//!
//! ## Overview
//!
//! `three-dcf-core` converts various document formats (PDF, Markdown, HTML, images)
//! into a normalized, cell-based representation that preserves document structure
//! while being optimized for machine learning workloads.
//!
//! ## Quick Start
//!
//! ```rust,no_run
//! use three_dcf_core::prelude::*;
//!
//! fn main() -> Result<()> {
//!     // Encode a PDF document
//!     let encoder = Encoder::from_preset("reports")?;
//!     let (document, metrics) = encoder.encode_path("report.pdf")?;
//!
//!     println!("Processed {} pages, {} cells", metrics.pages, metrics.cells_kept);
//!
//!     // Serialize to text format for LLM context
//!     let serializer = TextSerializer::new();
//!     let output = serializer.to_string(&document)?;
//!
//!     Ok(())
//! }
//! ```
//!
//! ## Encoder Presets
//!
//! | Preset | Use Case | Page Size |
//! |--------|----------|-----------|
//! | `reports` | Business documents, papers | 1024×1400 |
//! | `slides` | Presentations | 1920×1080 |
//! | `news` | Articles, blogs | 1100×1600 |
//! | `scans` | Scanned documents | 1400×2000 |
//!
//! ## Features
//!
//! - **`text`** (default): Basic text/markdown/HTML processing
//! - **`pdfium`**: Native PDF rendering via pdfium for better extraction
//! - **`ocr`**: Optical character recognition via Tesseract
//! - **`full`**: All features enabled
//!
//! ## Architecture
//!
//! The encoding pipeline:
//!
//! 1. **Input** → Document loaded from file (PDF/MD/HTML/image)
//! 2. **Parse** → Extract pages and text content
//! 3. **Normalize** → Apply hyphenation rules, detect structure
//! 4. **Classify** → Identify cell types (text, table, code, header)
//! 5. **Score** → Calculate importance scores for ranking
//! 6. **Deduplicate** → Hash-based deduplication across pages
//! 7. **Output** → `Document` with cells, dictionary, and metadata
//!
//! ## Output Formats
//!
//! - **TextSerializer**: Human-readable format for LLM context windows
//! - **JsonlWriter**: JSONL output for dataset pipelines
//! - **Protobuf**: Binary format via `proto` module
//!
//! ## Example: Custom Configuration
//!
//! ```rust,no_run
//! use three_dcf_core::{EncoderBuilder, HyphenationMode, ImportanceTuning};
//!
//! let encoder = EncoderBuilder::new("reports")?
//!     .budget(Some(4096))   // Token budget
//!     .drop_footers(true)   // Remove page footers
//!     .dedup_window(5)      // Dedup across 5 pages
//!     .hyphenation(HyphenationMode::Preserve)
//!     .importance_tuning(ImportanceTuning {
//!         header_boost: 1.5,
//!         table_boost: 1.2,
//!         ..Default::default()
//!     })
//!     .build();
//! # Ok::<(), three_dcf_core::DcfError>(())
//! ```
//!
//! ## Chunking for RAG
//!
//! ```rust,no_run
//! use three_dcf_core::{Chunker, ChunkConfig, ChunkMode};
//!
//! # fn demo(document: three_dcf_core::Document) {
//! let chunker = Chunker::new(ChunkConfig {
//!     mode: ChunkMode::Semantic,
//!     target_tokens: 512,
//!     overlap_tokens: 64,
//!     ..Default::default()
//! });
//!
//! let chunks = chunker.chunk(&document);
//! # let _ = chunks;
//! # }
//! ```

100#![cfg_attr(docsrs, feature(doc_cfg))]
101
102/// Protobuf-generated types for binary serialization
103pub mod proto {
104 include!(concat!(env!("OUT_DIR"), "/dcf.v1.rs"));
105}
106
107/// Index types for JSONL output (merged from three_dcf_index)
108pub mod index;
109
110/// Prelude for convenient imports
111pub mod prelude;
112
113mod bench;
114mod chunk;
115mod decoder;
116mod document;
117mod embedding;
118mod encoder;
119mod error;
120mod ingest;
121mod metrics;
122mod normalization;
123mod numguard;
124mod ocr;
125mod serializer;
126mod stats;
127
128// Re-exports for public API
129pub use bench::{BenchConfig, BenchMode, BenchResult, BenchRunner, CorpusMetrics};
130pub use chunk::{ChunkConfig, ChunkMode, ChunkRecord, Chunker};
131pub use decoder::Decoder;
132pub use document::{
133 hash_payload, CellRecord, CellType, CodeHash, Document, Header, NumGuard, NumGuardAlert,
134 NumGuardIssue, PageInfo,
135};
136pub use embedding::{EmbeddingRecord, HashEmbedder, HashEmbedderConfig};
137pub use encoder::{EncodeInput, Encoder, EncoderBuilder, EncoderPreset};
138pub use error::{DcfError, Result};
139pub use ingest::{ingest_to_index, ingest_to_index_with_opts, IngestOptions};
140pub use metrics::{cer, numeric_stats, wer, Metrics, NumStats, TokenMetrics};
141pub use normalization::{HyphenationMode, ImportanceTuning};
142pub use serializer::{TableMode, TextSerializer, TextSerializerConfig};
143pub use stats::{estimate_tokens, Stats, TokenizerKind};
144
145// Re-export index types at crate root for convenience
146pub use index::{DocumentRecord, JsonlWriter, PageRecord};
147// Note: index::CellRecord conflicts with document::CellRecord; refer to it via `index::CellRecord` explicitly.