pub struct Pipeline { /* private fields */ }
Expand description
A reusable PDF pipeline: the layout model is loaded once and reused across documents; OCR loads lazily the first time a scanned page is seen.
Implementations§
Source§impl Pipeline
impl Pipeline
Source
pub fn new() -> Result<Self, PdfError>
pub fn new() -> Result<Self, PdfError>
Load the layout model (the only always-required model).
Examples found in repository?
examples/snapshot.rs (line 50)
42fn main() {
43 let mut args = std::env::args().skip(1);
44 let root = PathBuf::from(args.next().expect("usage: snapshot <root> <outdir>"));
45 let outdir = PathBuf::from(args.next().expect("usage: snapshot <root> <outdir>"));
46
47 let mut pdfs = Vec::new();
48 find_pdfs(&root, &mut pdfs);
49
50 let mut pipeline = Pipeline::new().expect("load pipeline");
51 let (mut ok, mut err) = (0u32, 0u32);
52 for pdf in &pdfs {
53 let rel = pdf.strip_prefix(&root).unwrap_or(pdf);
54 let name = pdf.file_name().unwrap().to_string_lossy().to_string();
55 let md = match std::fs::read(pdf)
56 .map_err(|e| format!("read: {e}"))
57 .and_then(|bytes| {
58 let ext = pdf.extension().and_then(|e| e.to_str()).unwrap_or("");
59 let result = if ext == "gz" {
60 fleischwolf_pdf::convert_mets_gbs(&bytes, &name)
61 } else if IMAGE_EXTS.contains(&ext) {
62 pipeline.convert_image(&bytes, &name)
63 } else {
64 pipeline.convert(&bytes, None, &name)
65 };
66 result
67 .map(|d| d.export_to_markdown())
68 .map_err(|e| e.to_string())
69 }) {
70 Ok(md) => {
71 ok += 1;
72 md
73 }
74 Err(e) => {
75 err += 1;
76 eprintln!("ERR {}: {e}", rel.display());
77 format!("ERROR: {e}\n")
78 }
79 };
80 // Groundtruth naming keeps the source extension: `<file>.<ext>.md`.
81 let mut dest = outdir.join(rel).into_os_string();
82 dest.push(".md");
83 let dest = PathBuf::from(dest);
84 std::fs::create_dir_all(dest.parent().unwrap()).expect("mkdir");
85 std::fs::write(&dest, md).expect("write snapshot");
86 }
87 eprintln!("snapshots: {} ok, {} error, {} total", ok, err, pdfs.len());
88}
Source
pub fn convert(
&mut self,
bytes: &[u8],
password: Option<&str>,
name: &str,
) -> Result<DoclingDocument, PdfError>
pub fn convert( &mut self, bytes: &[u8], password: Option<&str>, name: &str, ) -> Result<DoclingDocument, PdfError>
Convert a PDF (bytes) to a DoclingDocument via the discriminative
pipeline: pdfium text cells (or OCR for scanned pages) + per-page layout
detection, assembled in reading order. Errors are detailed and surfaced.
Examples found in repository?
examples/snapshot.rs (line 64)
42fn main() {
43 let mut args = std::env::args().skip(1);
44 let root = PathBuf::from(args.next().expect("usage: snapshot <root> <outdir>"));
45 let outdir = PathBuf::from(args.next().expect("usage: snapshot <root> <outdir>"));
46
47 let mut pdfs = Vec::new();
48 find_pdfs(&root, &mut pdfs);
49
50 let mut pipeline = Pipeline::new().expect("load pipeline");
51 let (mut ok, mut err) = (0u32, 0u32);
52 for pdf in &pdfs {
53 let rel = pdf.strip_prefix(&root).unwrap_or(pdf);
54 let name = pdf.file_name().unwrap().to_string_lossy().to_string();
55 let md = match std::fs::read(pdf)
56 .map_err(|e| format!("read: {e}"))
57 .and_then(|bytes| {
58 let ext = pdf.extension().and_then(|e| e.to_str()).unwrap_or("");
59 let result = if ext == "gz" {
60 fleischwolf_pdf::convert_mets_gbs(&bytes, &name)
61 } else if IMAGE_EXTS.contains(&ext) {
62 pipeline.convert_image(&bytes, &name)
63 } else {
64 pipeline.convert(&bytes, None, &name)
65 };
66 result
67 .map(|d| d.export_to_markdown())
68 .map_err(|e| e.to_string())
69 }) {
70 Ok(md) => {
71 ok += 1;
72 md
73 }
74 Err(e) => {
75 err += 1;
76 eprintln!("ERR {}: {e}", rel.display());
77 format!("ERROR: {e}\n")
78 }
79 };
80 // Groundtruth naming keeps the source extension: `<file>.<ext>.md`.
81 let mut dest = outdir.join(rel).into_os_string();
82 dest.push(".md");
83 let dest = PathBuf::from(dest);
84 std::fs::create_dir_all(dest.parent().unwrap()).expect("mkdir");
85 std::fs::write(&dest, md).expect("write snapshot");
86 }
87 eprintln!("snapshots: {} ok, {} error, {} total", ok, err, pdfs.len());
88}
Source
pub fn convert_image(
&mut self,
bytes: &[u8],
name: &str,
) -> Result<DoclingDocument, PdfError>
pub fn convert_image( &mut self, bytes: &[u8], name: &str, ) -> Result<DoclingDocument, PdfError>
Convert a standalone image (PNG/JPEG/TIFF/WebP/…) as a single page — docling routes images through the same layout+OCR pipeline as a PDF page.
Examples found in repository?
examples/snapshot.rs (line 62)
42fn main() {
43 let mut args = std::env::args().skip(1);
44 let root = PathBuf::from(args.next().expect("usage: snapshot <root> <outdir>"));
45 let outdir = PathBuf::from(args.next().expect("usage: snapshot <root> <outdir>"));
46
47 let mut pdfs = Vec::new();
48 find_pdfs(&root, &mut pdfs);
49
50 let mut pipeline = Pipeline::new().expect("load pipeline");
51 let (mut ok, mut err) = (0u32, 0u32);
52 for pdf in &pdfs {
53 let rel = pdf.strip_prefix(&root).unwrap_or(pdf);
54 let name = pdf.file_name().unwrap().to_string_lossy().to_string();
55 let md = match std::fs::read(pdf)
56 .map_err(|e| format!("read: {e}"))
57 .and_then(|bytes| {
58 let ext = pdf.extension().and_then(|e| e.to_str()).unwrap_or("");
59 let result = if ext == "gz" {
60 fleischwolf_pdf::convert_mets_gbs(&bytes, &name)
61 } else if IMAGE_EXTS.contains(&ext) {
62 pipeline.convert_image(&bytes, &name)
63 } else {
64 pipeline.convert(&bytes, None, &name)
65 };
66 result
67 .map(|d| d.export_to_markdown())
68 .map_err(|e| e.to_string())
69 }) {
70 Ok(md) => {
71 ok += 1;
72 md
73 }
74 Err(e) => {
75 err += 1;
76 eprintln!("ERR {}: {e}", rel.display());
77 format!("ERROR: {e}\n")
78 }
79 };
80 // Groundtruth naming keeps the source extension: `<file>.<ext>.md`.
81 let mut dest = outdir.join(rel).into_os_string();
82 dest.push(".md");
83 let dest = PathBuf::from(dest);
84 std::fs::create_dir_all(dest.parent().unwrap()).expect("mkdir");
85 std::fs::write(&dest, md).expect("write snapshot");
86 }
87 eprintln!("snapshots: {} ok, {} error, {} total", ok, err, pdfs.len());
88}
Auto Trait Implementations§
impl !RefUnwindSafe for Pipeline
impl !UnwindSafe for Pipeline
impl Freeze for Pipeline
impl Send for Pipeline
impl Sync for Pipeline
impl Unpin for Pipeline
impl UnsafeUnpin for Pipeline
Blanket Implementations§
Source§impl<T> BorrowMut<T> for T where T: ?Sized,
impl<T> BorrowMut<T> for T where T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Mutably borrows from an owned value. Read more
impl<ST, DT> CastableFrom<ST, Initialized, Initialized> for DT
impl<ST, DT> CastableFrom<ST, Uninit, Uninit> for DT
Source§impl<T> IntoEither for T
impl<T> IntoEither for T
Source§fn into_either(self, into_left: bool) -> Either<Self, Self>
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left is true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more
Source§fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more