pub struct Pipeline { /* private fields */ }
A reusable PDF pipeline. The primary worker runs its models on every core, so a single-page, small, image, or METS input is converted at full intra-op speed with no pool to load. A document with enough pages is instead fanned out across a pool of narrower workers that process its pages concurrently. Both the primary worker and the pool load lazily and are cached for reuse, so a one-shot conversion only pays for what it uses.
Implementations§
Source§impl Pipeline
impl Pipeline
Sourcepub fn new() -> Result<Self, PdfError>
pub fn new() -> Result<Self, PdfError>
Construct the pipeline. Models load lazily on first use (full-intra primary for serial inputs, the helper pool for multi-page PDFs), so nothing is loaded that a given document doesn’t need.
Examples found in repository?
42fn main() {
43 let mut args = std::env::args().skip(1);
44 let root = PathBuf::from(args.next().expect("usage: snapshot <root> <outdir>"));
45 let outdir = PathBuf::from(args.next().expect("usage: snapshot <root> <outdir>"));
46
47 let mut pdfs = Vec::new();
48 find_pdfs(&root, &mut pdfs);
49
50 let mut pipeline = Pipeline::new().expect("load pipeline");
51 let (mut ok, mut err) = (0u32, 0u32);
52 for pdf in &pdfs {
53 let rel = pdf.strip_prefix(&root).unwrap_or(pdf);
54 let name = pdf.file_name().unwrap().to_string_lossy().to_string();
55 let md = match std::fs::read(pdf)
56 .map_err(|e| format!("read: {e}"))
57 .and_then(|bytes| {
58 let ext = pdf.extension().and_then(|e| e.to_str()).unwrap_or("");
59 let result = if ext == "gz" {
60 fleischwolf_pdf::convert_mets_gbs(&bytes, &name)
61 } else if IMAGE_EXTS.contains(&ext) {
62 pipeline.convert_image(&bytes, &name)
63 } else {
64 pipeline.convert(&bytes, None, &name)
65 };
66 result
67 .map(|d| d.export_to_markdown())
68 .map_err(|e| e.to_string())
69 }) {
70 Ok(md) => {
71 ok += 1;
72 md
73 }
74 Err(e) => {
75 err += 1;
76 eprintln!("ERR {}: {e}", rel.display());
77 format!("ERROR: {e}\n")
78 }
79 };
80 // Groundtruth naming keeps the source extension: `<file>.<ext>.md`.
81 let mut dest = outdir.join(rel).into_os_string();
82 dest.push(".md");
83 let dest = PathBuf::from(dest);
84 std::fs::create_dir_all(dest.parent().unwrap()).expect("mkdir");
85 std::fs::write(&dest, md).expect("write snapshot");
86 }
87 eprintln!("snapshots: {} ok, {} error, {} total", ok, err, pdfs.len());
88}
Sourcepub fn no_table_former(self, disable: bool) -> Self
pub fn no_table_former(self, disable: bool) -> Self
Skip loading and running the TableFormer table-structure model. Table regions still get emitted, but reconstructed geometrically from cell positions instead of via the ONNX model’s predicted structure — faster (no model load, no per-table inference) at the cost of table fidelity. No effect if a worker is already loaded; set this before the first conversion.
Sourcepub fn convert(
&mut self,
bytes: &[u8],
password: Option<&str>,
name: &str,
) -> Result<DoclingDocument, PdfError>
pub fn convert( &mut self, bytes: &[u8], password: Option<&str>, name: &str, ) -> Result<DoclingDocument, PdfError>
Convert a PDF (bytes) to a DoclingDocument. A document with fewer than
parallel_min pages (or a pool size of 1) streams through the full-intra
primary; a larger one renders on this thread (pdfium is not thread-safe) and
fans the pages out across the worker pool, reassembled in page order so the
output is byte-identical to the serial path.
Examples found in repository?
42fn main() {
43 let mut args = std::env::args().skip(1);
44 let root = PathBuf::from(args.next().expect("usage: snapshot <root> <outdir>"));
45 let outdir = PathBuf::from(args.next().expect("usage: snapshot <root> <outdir>"));
46
47 let mut pdfs = Vec::new();
48 find_pdfs(&root, &mut pdfs);
49
50 let mut pipeline = Pipeline::new().expect("load pipeline");
51 let (mut ok, mut err) = (0u32, 0u32);
52 for pdf in &pdfs {
53 let rel = pdf.strip_prefix(&root).unwrap_or(pdf);
54 let name = pdf.file_name().unwrap().to_string_lossy().to_string();
55 let md = match std::fs::read(pdf)
56 .map_err(|e| format!("read: {e}"))
57 .and_then(|bytes| {
58 let ext = pdf.extension().and_then(|e| e.to_str()).unwrap_or("");
59 let result = if ext == "gz" {
60 fleischwolf_pdf::convert_mets_gbs(&bytes, &name)
61 } else if IMAGE_EXTS.contains(&ext) {
62 pipeline.convert_image(&bytes, &name)
63 } else {
64 pipeline.convert(&bytes, None, &name)
65 };
66 result
67 .map(|d| d.export_to_markdown())
68 .map_err(|e| e.to_string())
69 }) {
70 Ok(md) => {
71 ok += 1;
72 md
73 }
74 Err(e) => {
75 err += 1;
76 eprintln!("ERR {}: {e}", rel.display());
77 format!("ERROR: {e}\n")
78 }
79 };
80 // Groundtruth naming keeps the source extension: `<file>.<ext>.md`.
81 let mut dest = outdir.join(rel).into_os_string();
82 dest.push(".md");
83 let dest = PathBuf::from(dest);
84 std::fs::create_dir_all(dest.parent().unwrap()).expect("mkdir");
85 std::fs::write(&dest, md).expect("write snapshot");
86 }
87 eprintln!("snapshots: {} ok, {} error, {} total", ok, err, pdfs.len());
88}
Sourcepub fn convert_streaming<F>(
&mut self,
bytes: &[u8],
password: Option<&str>,
name: &str,
emit: F,
) -> Result<(), PdfError>
pub fn convert_streaming<F>( &mut self, bytes: &[u8], password: Option<&str>, name: &str, emit: F, ) -> Result<(), PdfError>
Convert a PDF in streaming mode: emit is called with each finalized,
in-document-order batch of nodes (and that span’s recovered links) as pages
complete, so a caller can serialize Markdown page by page instead of waiting
for the whole document. The batches are exactly the buffered convert’s
nodes, split at safe block boundaries by [assemble::StreamAssembler] — the
parallel path reorders pages back into document order before emitting, so
the output is identical regardless of worker scheduling.
emit runs on the calling thread (never a worker), so it needn’t be Send
and its backpressure throttles the whole pipeline. Returning Err from
emit aborts the conversion with that error.
Sourcepub fn convert_image(
&mut self,
bytes: &[u8],
name: &str,
) -> Result<DoclingDocument, PdfError>
pub fn convert_image( &mut self, bytes: &[u8], name: &str, ) -> Result<DoclingDocument, PdfError>
Convert a standalone image (PNG/JPEG/TIFF/WebP/…) as a single page — docling routes images through the same layout+OCR pipeline as a PDF page.
Examples found in repository?
42fn main() {
43 let mut args = std::env::args().skip(1);
44 let root = PathBuf::from(args.next().expect("usage: snapshot <root> <outdir>"));
45 let outdir = PathBuf::from(args.next().expect("usage: snapshot <root> <outdir>"));
46
47 let mut pdfs = Vec::new();
48 find_pdfs(&root, &mut pdfs);
49
50 let mut pipeline = Pipeline::new().expect("load pipeline");
51 let (mut ok, mut err) = (0u32, 0u32);
52 for pdf in &pdfs {
53 let rel = pdf.strip_prefix(&root).unwrap_or(pdf);
54 let name = pdf.file_name().unwrap().to_string_lossy().to_string();
55 let md = match std::fs::read(pdf)
56 .map_err(|e| format!("read: {e}"))
57 .and_then(|bytes| {
58 let ext = pdf.extension().and_then(|e| e.to_str()).unwrap_or("");
59 let result = if ext == "gz" {
60 fleischwolf_pdf::convert_mets_gbs(&bytes, &name)
61 } else if IMAGE_EXTS.contains(&ext) {
62 pipeline.convert_image(&bytes, &name)
63 } else {
64 pipeline.convert(&bytes, None, &name)
65 };
66 result
67 .map(|d| d.export_to_markdown())
68 .map_err(|e| e.to_string())
69 }) {
70 Ok(md) => {
71 ok += 1;
72 md
73 }
74 Err(e) => {
75 err += 1;
76 eprintln!("ERR {}: {e}", rel.display());
77 format!("ERROR: {e}\n")
78 }
79 };
80 // Groundtruth naming keeps the source extension: `<file>.<ext>.md`.
81 let mut dest = outdir.join(rel).into_os_string();
82 dest.push(".md");
83 let dest = PathBuf::from(dest);
84 std::fs::create_dir_all(dest.parent().unwrap()).expect("mkdir");
85 std::fs::write(&dest, md).expect("write snapshot");
86 }
87 eprintln!("snapshots: {} ok, {} error, {} total", ok, err, pdfs.len());
88}
Auto Trait Implementations§
impl !RefUnwindSafe for Pipeline
impl !UnwindSafe for Pipeline
impl Freeze for Pipeline
impl Send for Pipeline
impl Sync for Pipeline
impl Unpin for Pipeline
impl UnsafeUnpin for Pipeline
Blanket Implementations§
Source§impl<T> BorrowMut<T> for T
where
    T: ?Sized,
impl<T> BorrowMut<T> for T
where
    T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
impl<ST, DT> CastableFrom<ST, Initialized, Initialized> for DT
impl<ST, DT> CastableFrom<ST, Uninit, Uninit> for DT
Source§impl<T> IntoEither for T
impl<T> IntoEither for T
Source§fn into_either(self, into_left: bool) -> Either<Self, Self>
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self>
if into_left is true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read more
Source§fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self>
if into_left(&self) returns true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read more