use image::RgbImage;
use pdfium_render::prelude::*;
/// A piece of extracted text together with its bounding box.
///
/// Cells built in this file store `t` and `b` as `page_h - y`, i.e. the
/// vertical axis is flipped relative to the coordinates pdfium reports.
#[derive(Debug, Clone)]
pub struct TextCell {
/// Text content of the cell.
pub text: String,
/// Left edge.
pub l: f32,
/// Top edge (flipped: `page_h - top`).
pub t: f32,
/// Right edge.
pub r: f32,
/// Bottom edge (flipped: `page_h - bottom`).
pub b: f32,
}
/// Factor by which page width/height are multiplied to size the rendered page
/// image; `extract_page` also stores it in `PdfPage::scale`.
pub const RENDER_SCALE: f32 = 2.0;
/// Everything extracted from a single PDF page.
#[derive(Clone)]
pub struct PdfPage {
/// Page width as reported by pdfium (`page.width().value`).
pub width: f32,
/// Page height as reported by pdfium (`page.height().value`).
pub height: f32,
/// Render scale; `extract_page` always sets this to `RENDER_SCALE`.
pub scale: f32,
/// Line-level text cells (prose grouping).
pub cells: Vec<TextCell>,
/// Line-level text cells produced with the code-oriented grouping.
pub code_cells: Vec<TextCell>,
/// One text cell per word.
pub word_cells: Vec<TextCell>,
/// Rendered page of `width * scale` by `height * scale` pixels, or a 1x1
/// placeholder when rendering was not requested.
pub image: RgbImage,
/// URI link annotations found on the page.
pub links: Vec<LinkAnnot>,
}
/// A link annotation: its rectangle and the URI it points to.
///
/// `extract_links` stores `t` and `b` as `page_h - y`, matching `TextCell`.
#[derive(Debug, Clone)]
pub struct LinkAnnot {
/// Left edge of the link rectangle.
pub l: f32,
/// Top edge (flipped: `page_h - top`).
pub t: f32,
/// Right edge of the link rectangle.
pub r: f32,
/// Bottom edge (flipped: `page_h - bottom`).
pub b: f32,
/// Target URI of the link action.
pub uri: String,
}
/// A fully extracted document: one `PdfPage` per page, in page order.
pub struct PdfDocument {
/// Extracted pages, in the order pdfium enumerates them.
pub pages: Vec<PdfPage>,
}
/// Reports whether prose lines should be built by `crate::dp_lines`.
///
/// This is the default; it is switched off by setting the
/// `DOCLING_LEGACY_LINES` environment variable to any valid Unicode value.
pub(crate) fn use_dp_lines() -> bool {
    let legacy_requested = std::env::var("DOCLING_LEGACY_LINES").is_ok();
    !legacy_requested
}
/// Reports whether word cells from the Rust text parser should be preferred.
///
/// Returns `false` as soon as either `DOCLING_PDFIUM_WORDS` or
/// `DOCLING_PDFIUM_TEXT` is set in the environment.
pub(crate) fn use_parser_words() -> bool {
    ["DOCLING_PDFIUM_WORDS", "DOCLING_PDFIUM_TEXT"]
        .iter()
        .all(|name| std::env::var(name).is_err())
}
/// Reports whether code cells from the Rust text parser should be preferred.
///
/// Returns `false` when either `DOCLING_PDFIUM_WORDS` or
/// `DOCLING_PDFIUM_TEXT` is set in the environment.
pub(crate) fn use_parser_code() -> bool {
    let forced = |name: &str| std::env::var(name).is_ok();
    !(forced("DOCLING_PDFIUM_WORDS") || forced("DOCLING_PDFIUM_TEXT"))
}
/// Tries to load the pdfium library from `path`.
///
/// `path` is first treated as a directory holding the platform-specific
/// library file; if that fails it is tried verbatim. Returns `None` when
/// neither attempt binds.
fn try_bind_dir(path: &str) -> Option<Box<dyn pdfium_render::prelude::PdfiumLibraryBindings>> {
    let candidate = Pdfium::pdfium_platform_library_name_at_path(path);
    Pdfium::bind_to_library(&candidate)
        .or_else(|_| Pdfium::bind_to_library(path))
        .ok()
}
/// Locates and binds the pdfium library.
///
/// Search order: the location named by `PDFIUM_DYNAMIC_LIB_PATH`, then
/// `.pdfium/lib` relative to the working directory, then the system library.
/// Only the failure of the final system-library attempt is reported.
fn bind() -> Result<Pdfium, PdfiumError> {
    let local = std::env::var("PDFIUM_DYNAMIC_LIB_PATH")
        .ok()
        .and_then(|dir| try_bind_dir(&dir))
        .or_else(|| try_bind_dir(".pdfium/lib"));
    match local {
        Some(bindings) => Ok(Pdfium::new(bindings)),
        None => Pdfium::bind_to_system_library().map(Pdfium::new),
    }
}
impl PdfDocument {
    /// Parses `bytes` as a PDF (optionally password-protected) and extracts
    /// every page, rendering each page image.
    ///
    /// # Errors
    ///
    /// Returns the first `PdfiumError` raised while binding pdfium, loading
    /// the document, or extracting a page.
    pub fn open(bytes: &[u8], password: Option<&str>) -> Result<Self, PdfiumError> {
        let pdfium = bind()?;
        let ffi = FfiText::load(pdfium.bindings(), bytes, password);
        let doc = pdfium.load_pdf_from_byte_slice(bytes, password)?;
        let mut parsed = rust_parser_cells(bytes);
        let pages = doc
            .pages()
            .iter()
            .enumerate()
            .map(|(index, page)| {
                // Move this page's parser cells out of the shared list, if any.
                let page_cells = parsed
                    .as_mut()
                    .and_then(|all| all.get_mut(index).map(std::mem::take));
                extract_page(&page, &ffi, index as i32, page_cells, true)
            })
            .collect::<Result<Vec<_>, _>>()?;
        Ok(PdfDocument { pages })
    }
}
/// Runs the Rust text parser over the whole document, timed under the
/// `"textparse"` label.
///
/// Returns `None` without parsing when `DOCLING_PDFIUM_TEXT` is set.
fn rust_parser_cells(bytes: &[u8]) -> Option<Vec<crate::textparse::PageParserCells>> {
    match std::env::var("DOCLING_PDFIUM_TEXT") {
        Ok(_) => None,
        Err(_) => Some(crate::timing::timed("textparse", || {
            crate::textparse::pdf_all_cells(bytes)
        })),
    }
}
/// Returns the number of pages in the PDF held in `bytes`.
///
/// # Errors
///
/// Fails if pdfium cannot be bound or the document cannot be loaded.
pub fn page_count(bytes: &[u8], password: Option<&str>) -> Result<usize, PdfiumError> {
    let library = bind()?;
    library
        .load_pdf_from_byte_slice(bytes, password)
        .map(|document| document.pages().len() as usize)
}
/// Extracts the document page by page, handing each page to `f` as soon as
/// it is ready.
///
/// `f` receives `(page_index, total_pages, page)`. When `render_image` is
/// `false` the page image is a 1x1 placeholder.
///
/// # Errors
///
/// Stops at the first error, whether it comes from pdfium (converted into
/// `E`) or from `f` itself.
pub fn for_each_page<E, F>(
    bytes: &[u8],
    password: Option<&str>,
    render_image: bool,
    mut f: F,
) -> Result<(), E>
where
    E: From<PdfiumError>,
    F: FnMut(usize, usize, PdfPage) -> Result<(), E>,
{
    let library = bind()?;
    let ffi = FfiText::load(library.bindings(), bytes, password);
    let document = library.load_pdf_from_byte_slice(bytes, password)?;
    let mut parsed = rust_parser_cells(bytes);
    let page_list = document.pages();
    let total = page_list.len() as usize;
    page_list
        .iter()
        .enumerate()
        .try_for_each(|(index, page)| -> Result<(), E> {
            // Move this page's parser cells out of the shared list, if any.
            let page_cells = parsed
                .as_mut()
                .and_then(|all| all.get_mut(index).map(std::mem::take));
            let extracted = extract_page(&page, &ffi, index as i32, page_cells, render_image)?;
            f(index, total, extracted)
        })
}
/// Builds a `PdfPage` for one page: text cells, optional rendered image and
/// link annotations.
///
/// * `page` - the pdfium page, used for size, rendering, links and the
///   segment-based text fallback.
/// * `ffi` - raw text access used to build cells from glyphs.
/// * `index` - zero-based page index passed to `ffi`.
/// * `rust_cells` - cells from the Rust text parser for this page, if any.
///   Non-empty groups take precedence over the pdfium-derived ones (words and
///   code only when the matching `use_parser_*` switch is on).
/// * `render_image` - when `false`, the image is a 1x1 placeholder.
///
/// # Errors
///
/// Returns a `PdfiumError` if rendering fails, or if the page text cannot be
/// loaded when the segment-based prose fallback is actually needed.
fn extract_page(
    page: &pdfium_render::prelude::PdfPage<'_>,
    ffi: &FfiText<'_>,
    index: i32,
    rust_cells: Option<crate::textparse::PageParserCells>,
    render_image: bool,
) -> Result<PdfPage, PdfiumError> {
    let width = page.width().value;
    let height = page.height().value;
    let rc = rust_cells.unwrap_or_default();

    // Each flag says whether that group of cells must come from pdfium.
    let need_pdfium_prose = rc.prose.is_empty();
    let need_pdfium_words = !use_parser_words() || rc.words.is_empty();
    let need_pdfium_code = !use_parser_code() || rc.code.is_empty();

    let (mut cells, mut code_cells, mut word_cells) =
        if need_pdfium_prose || need_pdfium_words || need_pdfium_code {
            let (mut cells, code_cells, word_cells) =
                crate::timing::timed("ffi.page_cells", || ffi.page_cells(index, height));
            // Only fall back to pdfium's segments when the prose cells are
            // really going to be used; otherwise the parser's prose replaces
            // them below and a failing `page.text()` would abort the page
            // for nothing.
            if need_pdfium_prose && cells.is_empty() {
                cells = segment_cells(&page.text()?, height);
            }
            (cells, code_cells, word_cells)
        } else {
            (Vec::new(), Vec::new(), Vec::new())
        };
    if !need_pdfium_prose {
        cells = rc.prose;
    }
    if !need_pdfium_words {
        word_cells = rc.words;
    }
    if !need_pdfium_code {
        code_cells = rc.code;
    }

    let image = if render_image {
        // Render larger than needed, then downsample to the target size.
        const SUPERSAMPLE: f32 = 1.5;
        let tw = (width * RENDER_SCALE * SUPERSAMPLE).round().max(1.0) as i32;
        let th = (height * RENDER_SCALE * SUPERSAMPLE).round().max(1.0) as i32;
        let cfg = PdfRenderConfig::new()
            .set_target_width(tw)
            .set_target_height(th);
        let big = crate::timing::timed("pdfium.render", || {
            page.render_with_config(&cfg)
                .map(|b| b.as_image().into_rgb8())
        })?;
        let dw = (width * RENDER_SCALE).round().max(1.0) as u32;
        let dh = (height * RENDER_SCALE).round().max(1.0) as u32;
        crate::timing::timed("image.resize", || {
            image::imageops::resize(&big, dw, dh, image::imageops::FilterType::CatmullRom)
        })
    } else {
        RgbImage::new(1, 1)
    };

    Ok(PdfPage {
        width,
        height,
        scale: RENDER_SCALE,
        cells,
        code_cells,
        word_cells,
        image,
        links: extract_links(page, height),
    })
}
/// Collects the page's URI link annotations.
///
/// Only links whose action is a URI action with an `http://`, `https://`,
/// `mailto:` or `tel:` target are kept. The scheme is compared
/// case-insensitively because URI schemes are case-insensitive (`HTTP://…` is
/// as valid as `http://…`). Links without a readable rectangle are skipped.
/// The rectangle's vertical coordinates are flipped with `page_h`, like the
/// text cells.
fn extract_links(page: &pdfium_render::prelude::PdfPage<'_>, page_h: f32) -> Vec<LinkAnnot> {
    const ALLOWED_PREFIXES: [&str; 4] = ["http://", "https://", "mailto:", "tel:"];

    let mut out = Vec::new();
    for link in page.links().iter() {
        let Some(uri) = link
            .action()
            .and_then(|a| a.as_uri_action().and_then(|u| u.uri().ok()))
        else {
            continue;
        };
        // `get` yields `None` when the URI is shorter than the prefix or the
        // cut would split a multi-byte character; neither can match.
        let scheme_ok = ALLOWED_PREFIXES.iter().any(|prefix| {
            uri.get(..prefix.len())
                .map_or(false, |head| head.eq_ignore_ascii_case(prefix))
        });
        if !scheme_ok {
            continue;
        }
        if let Ok(rect) = link.rect() {
            out.push(LinkAnnot {
                l: rect.left().value,
                t: page_h - rect.top().value,
                r: rect.right().value,
                b: page_h - rect.bottom().value,
                uri,
            });
        }
    }
    out
}
/// Turns pdfium's own text segments into cells, skipping segments that
/// contain only whitespace. Vertical coordinates are flipped with `page_h`.
fn segment_cells(text: &PdfPageText, page_h: f32) -> Vec<TextCell> {
    let mut cells = Vec::new();
    for segment in text.segments().iter() {
        let content = segment.text();
        if content.trim().is_empty() {
            continue;
        }
        let bounds = segment.bounds();
        cells.push(TextCell {
            text: content,
            l: bounds.left().value,
            t: page_h - bounds.top().value,
            r: bounds.right().value,
            b: page_h - bounds.bottom().value,
        });
    }
    cells
}
/// A second, raw handle on the document used for glyph-level text access
/// through the pdfium C API. The handle is closed when the value is dropped.
struct FfiText<'a> {
/// Bindings through which every raw pdfium call is made.
bindings: &'a dyn PdfiumLibraryBindings,
/// Handle returned by `FPDF_LoadMemDocument`; null when loading failed, so
/// users check `is_null()` before touching it.
doc: FPDF_DOCUMENT,
}
/// One character of page text with its boxes, as produced by `glyphs`.
///
/// Coordinates are the raw values from pdfium (not flipped). A box that could
/// not be read is marked by `NaN` in its left edge (`l` or `ll`).
pub(crate) struct Glyph {
/// The character; every whitespace character is stored as `' '`.
pub(crate) ch: char,
/// Left edge of the box from `FPDFText_GetCharBox`.
pub(crate) l: f32,
/// Bottom edge of the box from `FPDFText_GetCharBox`.
pub(crate) b: f32,
/// Right edge of the box from `FPDFText_GetCharBox`.
pub(crate) r: f32,
/// Top edge of the box from `FPDFText_GetCharBox`.
pub(crate) t: f32,
/// Left edge of the loose box (`FPDFText_GetLooseCharBox`, falling back to
/// the char box).
pub(crate) ll: f32,
/// Bottom edge of the loose box.
pub(crate) lb: f32,
/// Right edge of the loose box.
pub(crate) lr: f32,
/// Top edge of the loose box.
pub(crate) lt: f32,
/// Hash of the font name and flags from `font_hash`; `0` when the font was
/// not fetched or could not be read.
pub(crate) font: u64,
}
impl<'a> FfiText<'a> {
    /// Opens `bytes` through the raw API. A failed load is not an error
    /// here: the handle is simply null and later calls yield nothing.
    fn load(bindings: &'a dyn PdfiumLibraryBindings, bytes: &[u8], password: Option<&str>) -> Self {
        Self {
            bindings,
            doc: bindings.FPDF_LoadMemDocument(bytes, password),
        }
    }

    /// Builds `(prose lines, code lines, words)` for page `index` from its
    /// glyphs, flipping vertical coordinates with `page_h`.
    ///
    /// Returns three empty vectors when the document, the page or its text
    /// page could not be loaded.
    fn page_cells(&self, index: i32, page_h: f32) -> (Vec<TextCell>, Vec<TextCell>, Vec<TextCell>) {
        let nothing = (Vec::new(), Vec::new(), Vec::new());
        if self.doc.is_null() {
            return nothing;
        }
        let api = self.bindings;
        let page = api.FPDF_LoadPage(self.doc, index);
        if page.is_null() {
            return nothing;
        }
        let text_page = api.FPDFText_LoadPage(page);
        if text_page.is_null() {
            api.FPDF_ClosePage(page);
            return nothing;
        }

        // Font hashes are only needed by the `dp_lines` grouping.
        let with_dp = use_dp_lines();
        let glyph_list = glyphs(api, text_page, with_dp);
        api.FPDFText_ClosePage(text_page);

        let prose = match with_dp {
            true => crate::dp_lines::line_cells(&glyph_list, page_h, false),
            false => lines_from_glyphs(&glyph_list, page_h, Grouping::Prose),
        };
        let code = lines_from_glyphs(&glyph_list, page_h, Grouping::CodeSpaceOnly);
        let words = words_from_glyphs(&glyph_list, page_h);
        api.FPDF_ClosePage(page);
        (prose, code, words)
    }
}
impl Drop for FfiText<'_> {
    /// Closes the raw document handle, unless loading had failed.
    fn drop(&mut self) {
        if self.doc.is_null() {
            return;
        }
        self.bindings.FPDF_CloseDocument(self.doc);
    }
}
/// Debug helper: lists `(char, loose left, loose right)` for every glyph on
/// page `index`.
///
/// Returns an empty vector if pdfium cannot be bound or the document or page
/// cannot be loaded. The document is opened without a password.
pub fn debug_glyphs(bytes: &[u8], index: i32) -> Vec<(char, f32, f32)> {
    let pdfium = match bind() {
        Ok(library) => library,
        Err(_) => return Vec::new(),
    };
    let ffi = FfiText::load(pdfium.bindings(), bytes, None);
    if ffi.doc.is_null() {
        return Vec::new();
    }
    let api = ffi.bindings;
    let page = api.FPDF_LoadPage(ffi.doc, index);
    if page.is_null() {
        return Vec::new();
    }
    let text_page = api.FPDFText_LoadPage(page);
    let triples = if text_page.is_null() {
        Vec::new()
    } else {
        let collected: Vec<_> = glyphs(api, text_page, true)
            .into_iter()
            .map(|glyph| (glyph.ch, glyph.ll, glyph.lr))
            .collect();
        api.FPDFText_ClosePage(text_page);
        collected
    };
    api.FPDF_ClosePage(page);
    triples
}
/// Debug view of one text page object, as returned by `debug_text_objects`.
///
/// The bounds are the raw values from `FPDFPageObj_GetBounds` (not flipped).
#[derive(Debug, Clone)]
pub struct DebugTextObject {
/// `true` when the object's text render mode equals `INVISIBLE_RENDER_MODE`.
pub invisible: bool,
/// Left bound.
pub l: f32,
/// Bottom bound.
pub b: f32,
/// Right bound.
pub r: f32,
/// Top bound.
pub t: f32,
/// Text of the object; empty when it could not be read.
pub text: String,
}
/// Debug helper: lists every text object on page `index` with its bounds,
/// visibility and text.
///
/// Returns an empty vector if pdfium cannot be bound or the document or page
/// cannot be loaded. Objects whose bounds cannot be read are skipped. When
/// the text page cannot be loaded, objects are still listed but with empty
/// text. The document is opened without a password.
pub fn debug_text_objects(bytes: &[u8], index: i32) -> Vec<DebugTextObject> {
    let Ok(pdfium) = bind() else {
        return Vec::new();
    };
    let ffi = FfiText::load(pdfium.bindings(), bytes, None);
    if ffi.doc.is_null() {
        return Vec::new();
    }
    let b = ffi.bindings;
    let page = b.FPDF_LoadPage(ffi.doc, index);
    if page.is_null() {
        return Vec::new();
    }
    let tp = b.FPDFText_LoadPage(page);
    let mut out = Vec::new();
    let n = b.FPDFPage_CountObjects(page);
    for i in 0..n {
        let obj = b.FPDFPage_GetObject(page, i);
        if obj.is_null() || b.FPDFPageObj_GetType(obj) != FPDF_PAGEOBJ_TEXT as i32 {
            continue;
        }
        let (mut l, mut bot, mut r, mut top) = (0f32, 0f32, 0f32, 0f32);
        if b.FPDFPageObj_GetBounds(obj, &mut l, &mut bot, &mut r, &mut top) == 0 {
            continue;
        }
        let invisible = b.FPDFTextObj_GetTextRenderMode(obj) == INVISIBLE_RENDER_MODE;
        let text = if tp.is_null() {
            String::new()
        } else {
            // A call with a null buffer only reports the required size.
            // pdfium measures it in BYTES of UTF-16LE, terminator included,
            // not in UTF-16 code units.
            let need = b.FPDFTextObj_GetText(obj, tp, std::ptr::null_mut(), 0);
            if need <= 2 {
                // 0 = error, 2 = just the terminating NUL.
                String::new()
            } else {
                // Round up so the buffer always holds at least `need` bytes.
                let mut buf = vec![0u16; (need as usize + 1) / 2];
                b.FPDFTextObj_GetText(obj, tp, buf.as_mut_ptr(), need);
                // Cut at the terminator; this also yields an empty string if
                // the second call left the zeroed buffer untouched.
                let end = buf.iter().position(|&unit| unit == 0).unwrap_or(buf.len());
                String::from_utf16_lossy(&buf[..end])
            }
        };
        out.push(DebugTextObject {
            invisible,
            l,
            b: bot,
            r,
            t: top,
            text,
        });
    }
    if !tp.is_null() {
        b.FPDFText_ClosePage(tp);
    }
    b.FPDF_ClosePage(page);
    out
}
/// Hashes the font name and font flags of character `i` on text page `tp`.
///
/// Returns `0` when pdfium reports no font info for the character. The value
/// comes from `DefaultHasher`, so it is only meant for comparing glyphs with
/// each other, not for storage.
fn font_hash(b: &dyn PdfiumLibraryBindings, tp: FPDF_TEXTPAGE, i: i32) -> u64 {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    let mut flags: std::os::raw::c_int = 0;
    // First call with a null buffer asks for the required length.
    let name_len = b.FPDFText_GetFontInfo(tp, i, std::ptr::null_mut(), 0, &mut flags);
    if name_len == 0 {
        return 0;
    }
    let mut name = vec![0u8; name_len as usize];
    b.FPDFText_GetFontInfo(
        tp,
        i,
        name.as_mut_ptr().cast::<std::os::raw::c_void>(),
        name_len,
        &mut flags,
    );

    let mut hasher = DefaultHasher::new();
    name.hash(&mut hasher);
    flags.hash(&mut hasher);
    hasher.finish()
}
/// Text render mode that `debug_text_objects` reports as invisible
/// (mode 3 in the PDF text rendering modes: neither fill nor stroke).
const INVISIBLE_RENDER_MODE: i32 = 3;
/// Reads every character of text page `tp` into a `Glyph` list.
///
/// * Carriage returns, line feeds and code points that are not valid `char`s
///   are dropped.
/// * Whitespace is kept as `' '` even without a box; other characters
///   without a char box are dropped.
/// * `fetch_font` enables font hashing for non-whitespace characters;
///   otherwise `font` is `0`.
///
/// Two passes then adjust the list: an ordering fix for Arabic alef/lam
/// pairs, and widening of degenerate space boxes.
fn glyphs(b: &dyn PdfiumLibraryBindings, tp: FPDF_TEXTPAGE, fetch_font: bool) -> Vec<Glyph> {
let n = b.FPDFText_CountChars(tp);
// A negative count gives zero capacity and an empty `0..n` range below.
let mut out = Vec::with_capacity(n.max(0) as usize);
for i in 0..n {
let ch = match char::from_u32(b.FPDFText_GetUnicode(tp, i)) {
Some(c) => c,
None => continue,
};
if ch == '\r' || ch == '\n' {
continue;
}
let font = if fetch_font && !ch.is_whitespace() {
font_hash(b, tp, i)
} else {
0
};
// Char box; `has_box` records whether pdfium could supply it.
let (mut l, mut r, mut bot, mut top) = (0f64, 0f64, 0f64, 0f64);
let has_box = b.FPDFText_GetCharBox(tp, i, &mut l, &mut r, &mut bot, &mut top) != 0;
let mut lr = FS_RECTF {
left: 0.0,
top: 0.0,
right: 0.0,
bottom: 0.0,
};
// Loose box, falling back to the char box; NaN in the left edge marks
// "no box at all".
let (ll, lb, lrt, ltop) = if b.FPDFText_GetLooseCharBox(tp, i, &mut lr) != 0 {
(lr.left, lr.bottom, lr.right, lr.top)
} else if has_box {
(l as f32, bot as f32, r as f32, top as f32)
} else {
(f32::NAN, 0.0, 0.0, 0.0)
};
// Whitespace is normalised to ' ' and kept even when it has no box.
if ch.is_whitespace() {
out.push(Glyph {
ch: ' ',
l: if has_box { l as f32 } else { f32::NAN },
b: if has_box { bot as f32 } else { 0.0 },
r: if has_box { r as f32 } else { 0.0 },
t: if has_box { top as f32 } else { 0.0 },
ll,
lb,
lr: lrt,
lt: ltop,
font,
});
continue;
}
// Visible characters are useless without a position.
if !has_box {
continue;
}
out.push(Glyph {
ch,
l: l as f32,
b: bot as f32,
r: r as f32,
t: top as f32,
ll,
lb,
lr: lrt,
lt: ltop,
font,
});
}
// Swap an alef form (U+0622/0623/0625/0627) that is immediately followed by
// a lam (U+0644) whose left edge is within 1.0 of its own, so the lam comes
// first. NOTE(review): presumably this repairs the order of a decomposed
// lam-alef ligature — confirm.
for i in 0..out.len().saturating_sub(1) {
let same_x = out[i].l.is_finite()
&& out[i + 1].l.is_finite()
&& (out[i].l - out[i + 1].l).abs() < 1.0;
if same_x
&& matches!(out[i].ch, '\u{0622}' | '\u{0623}' | '\u{0625}' | '\u{0627}')
&& out[i + 1].ch == '\u{0644}'
{
out.swap(i, i + 1);
}
}
// Give spaces with a degenerate loose box (narrower than 0.5, or NaN, which
// also fails the `>= 0.5` test) the gap between their neighbours.
for i in 0..out.len() {
if out[i].ch != ' ' || (out[i].lr - out[i].ll).abs() >= 0.5 {
continue;
}
// Nearest non-space glyph with a usable loose box on each side.
let prev = out[..i]
.iter()
.rev()
.find(|g| g.ch != ' ' && g.ll.is_finite())
.map(|g| (g.lr, g.lb, g.lt));
let next = out[i + 1..]
.iter()
.find(|g| g.ch != ' ' && g.ll.is_finite())
.map(|g| (g.ll, g.lb));
if let (Some((plr, plb, plt)), Some((nll, nlb))) = (prev, next) {
let line_h = (plt - plb).abs().max(1.0);
// Only when both neighbours share a baseline (within half a line height)
// and a real gap (> 0.5) separates them.
if (plb - nlb).abs() < line_h * 0.5 && nll > plr + 0.5 {
out[i].ll = plr;
out[i].lr = nll;
out[i].lb = plb;
out[i].lt = plt;
}
}
}
out
}
/// Word-breaking strategy used by `lines_from_glyphs`.
#[derive(Clone, Copy, PartialEq)]
enum Grouping {
/// Break on spaces or horizontal gaps, but keep punctuation and digit runs
/// glued; Arabic text breaks on gaps only.
Prose,
/// Break only where an explicit space was seen (or a new line starts).
CodeSpaceOnly,
/// Break on explicit spaces or horizontal gaps, with no gluing.
CodeGap,
}
/// Groups glyphs into line cells.
///
/// Glyphs are consumed in order; each line's words are joined with a single
/// space and its box is the union of its glyphs' char boxes, flipped
/// vertically with `page_h`. `mode` selects how word breaks are detected.
fn lines_from_glyphs(gs: &[Glyph], page_h: f32, mode: Grouping) -> Vec<TextCell> {
let mut cells: Vec<TextCell> = Vec::new();
let mut words: Vec<String> = Vec::new(); let mut word = String::new();
// Running bounding box of the current line (left, bottom, right, top).
let (mut ll, mut lb, mut lr, mut lt) = (
f32::INFINITY,
f32::INFINITY,
f32::NEG_INFINITY,
f32::NEG_INFINITY,
);
// Tallest glyph seen so far on the current line.
let mut line_h: f32 = 0.0;
let mut prev: Option<&Glyph> = None;
// Set when one or more spaces were seen since the last visible glyph.
let mut pending_space = false;
for g in gs {
if g.ch == ' ' {
pending_space = true;
continue;
}
let h = (g.t - g.b).abs().max(1.0);
let (mut new_word, mut new_line) = (false, false);
if let Some(p) = prev {
// "x reset": this glyph starts before the previous one ended. The test is
// mirrored when either glyph is Arabic.
let x_reset = if is_arabic(g.ch) || is_arabic(p.ch) {
g.l > p.r
} else {
g.l < p.r
};
// New line: moved down by more than half a glyph height together with an
// x reset, or moved down by more than 1.5 line heights regardless.
new_line = (p.b - g.b > h * 0.5 && x_reset) || (p.b - g.b > line_h.max(h) * 1.5);
// Pairs that stay in one word despite a space or gap: closing punctuation,
// after opening punctuation, digit runs, and "." directly followed by a
// digit or lowercase letter.
let glued = is_close_punct(g.ch)
|| is_open_punct(p.ch)
|| (p.ch.is_ascii_digit() && g.ch.is_ascii_digit())
|| (p.ch == '.'
&& !pending_space
&& (g.ch.is_ascii_digit() || g.ch.is_ascii_lowercase()));
// Horizontal gap counted as a word break: a quarter of the line height.
let word_gap = line_h.max(h) * 0.25;
new_word = if mode == Grouping::CodeSpaceOnly {
new_line || pending_space
} else if mode == Grouping::CodeGap {
new_line || pending_space || g.l - p.r > word_gap
} else if is_arabic(g.ch) || is_arabic(p.ch) {
// Arabic: the gap is measured leftwards and explicit spaces are ignored.
new_line || (p.l - g.r > word_gap && !glued)
} else {
new_line || ((pending_space || g.l - p.r > word_gap) && !glued)
};
}
pending_space = false;
if new_line {
// Flush the finished line and reset the per-line state.
push_word(&mut word, &mut words);
push_line(&mut words, (ll, lb, lr, lt), page_h, &mut cells);
(ll, lb, lr, lt) = (
f32::INFINITY,
f32::INFINITY,
f32::NEG_INFINITY,
f32::NEG_INFINITY,
);
line_h = 0.0;
} else if new_word {
push_word(&mut word, &mut words);
}
word.push(g.ch);
ll = ll.min(g.l);
lb = lb.min(g.b);
lr = lr.max(g.r);
lt = lt.max(g.t);
line_h = line_h.max(h);
prev = Some(g);
}
// Flush whatever is left as the final line.
push_word(&mut word, &mut words);
push_line(&mut words, (ll, lb, lr, lt), page_h, &mut cells);
cells
}
/// Groups glyphs into line cells using the `Grouping::CodeGap` rules:
/// words break on explicit spaces or horizontal gaps, with no gluing.
pub(crate) fn code_cells_from_glyphs(gs: &[Glyph], page_h: f32) -> Vec<TextCell> {
    let mode = Grouping::CodeGap;
    lines_from_glyphs(gs, page_h, mode)
}
/// Groups glyphs into one cell per word.
///
/// Uses the same line-break test as `lines_from_glyphs`. A word ends at a
/// line break, an explicit space, or a horizontal gap wider than a quarter of
/// the line height, unless the pair is glued by punctuation. Each cell's box
/// is the union of its glyphs' char boxes, flipped vertically with `page_h`.
pub(crate) fn words_from_glyphs(gs: &[Glyph], page_h: f32) -> Vec<TextCell> {
let mut cells = Vec::new();
let mut word = String::new();
// Empty bounding box: any min/max with a real glyph replaces it.
let inf = (
f32::INFINITY,
f32::INFINITY,
f32::NEG_INFINITY,
f32::NEG_INFINITY,
);
// Running bounding box of the current word (left, bottom, right, top).
let (mut wl, mut wb, mut wr, mut wt) = inf;
// Tallest glyph seen so far on the current line.
let mut line_h: f32 = 0.0;
let mut prev: Option<&Glyph> = None;
// Set when one or more spaces were seen since the last visible glyph.
let mut pending_space = false;
for g in gs {
if g.ch == ' ' {
pending_space = true;
continue;
}
let h = (g.t - g.b).abs().max(1.0);
let mut new_line = false;
let mut new_word = false;
if let Some(p) = prev {
// "x reset": this glyph starts before the previous one ended. The test is
// mirrored when either glyph is Arabic.
let x_reset = if is_arabic(g.ch) || is_arabic(p.ch) {
g.l > p.r
} else {
g.l < p.r
};
new_line = (p.b - g.b > h * 0.5 && x_reset) || (p.b - g.b > line_h.max(h) * 1.5);
// Unlike `lines_from_glyphs`, digit runs are not glued here.
let glued = is_close_punct(g.ch)
|| is_open_punct(p.ch)
|| (p.ch == '.'
&& !pending_space
&& (g.ch.is_ascii_digit() || g.ch.is_ascii_lowercase()));
let word_gap = line_h.max(h) * 0.25;
new_word = new_line || ((pending_space || g.l - p.r > word_gap) && !glued);
}
pending_space = false;
// Emit the finished word and start a fresh bounding box.
if new_word && !word.is_empty() {
cells.push(TextCell {
text: std::mem::take(&mut word),
l: wl,
t: page_h - wt,
r: wr,
b: page_h - wb,
});
(wl, wb, wr, wt) = inf;
}
if new_line {
line_h = 0.0;
}
word.push(g.ch);
wl = wl.min(g.l);
wb = wb.min(g.b);
wr = wr.max(g.r);
wt = wt.max(g.t);
line_h = line_h.max(h);
prev = Some(g);
}
// Emit the trailing word, if any.
if !word.is_empty() {
cells.push(TextCell {
text: word,
l: wl,
t: page_h - wt,
r: wr,
b: page_h - wb,
});
}
cells
}
/// Returns `true` for characters in the Unicode Arabic block
/// (U+0600 through U+06FF, inclusive).
fn is_arabic(c: char) -> bool {
    matches!(c, '\u{0600}'..='\u{06FF}')
}
/// Returns `true` for punctuation that attaches to the text before it:
/// `, . ; ! ? ) ] } % '` and the curly single quotes U+2019 / U+2018.
fn is_close_punct(c: char) -> bool {
    const CLOSERS: &str = ",.;!?)]}%'\u{2019}\u{2018}";
    CLOSERS.contains(c)
}
/// Returns `true` for characters that attach to the text after them:
/// `(`, `[`, `{` and `@`.
fn is_open_punct(c: char) -> bool {
    ['(', '[', '{', '@'].contains(&c)
}
/// Moves the accumulated `word` onto `words` and leaves `word` empty.
/// Does nothing when `word` is already empty.
fn push_word(word: &mut String, words: &mut Vec<String>) {
    if word.is_empty() {
        return;
    }
    let finished = std::mem::replace(word, String::new());
    words.push(finished);
}
/// Emits the accumulated `words` as one line cell and leaves `words` empty.
///
/// The words are joined with single spaces. `bbox` is
/// `(left, bottom, right, top)`; the vertical values are flipped with
/// `page_h`. Does nothing when there are no words.
fn push_line(
    words: &mut Vec<String>,
    bbox: (f32, f32, f32, f32),
    page_h: f32,
    cells: &mut Vec<TextCell>,
) {
    if words.is_empty() {
        return;
    }
    let (left, bottom, right, top) = bbox;
    let line_words = std::mem::replace(words, Vec::new());
    cells.push(TextCell {
        text: line_words.join(" "),
        l: left,
        t: page_h - top,
        r: right,
        b: page_h - bottom,
    });
}