pdfsink-rs 0.2.5

Fast pure-Rust PDF extraction library and CLI — ~10-50x faster than pdfplumber for text, word, table, layout, image, and metadata extraction from PDFs. By Clark Labs Inc.
Documentation
use crate::text::{extract_text_lines, extract_words};
use crate::types::{BBox, Char, Direction, LayoutObject, Page, PageLayout, PdfDocument, StructureElement};
use std::collections::BTreeMap;

fn layout_object_from_bbox(
    object_type: &str,
    page_number: usize,
    bbox: BBox,
    text: Option<String>,
    direction: Option<Direction>,
    upright: Option<bool>,
    children: Vec<LayoutObject>,
) -> LayoutObject {
    LayoutObject {
        object_type: object_type.to_string(),
        page_number,
        x0: bbox.x0,
        top: bbox.top,
        x1: bbox.x1,
        bottom: bbox.bottom,
        width: bbox.width(),
        height: bbox.height(),
        text,
        direction,
        upright,
        children,
    }
}

fn merge_layout_children(children: &[LayoutObject]) -> Option<BBox> {
    let first = children.first()?;
    let mut bbox = first.bbox();
    for child in children.iter().skip(1) {
        let cb = child.bbox();
        bbox = BBox::new(
            bbox.x0.min(cb.x0),
            bbox.top.min(cb.top),
            bbox.x1.max(cb.x1),
            bbox.bottom.max(cb.bottom),
        );
    }
    Some(bbox)
}

fn build_textline_objects(page: &Page, chars: &[Char], vertical: bool) -> Vec<LayoutObject> {
    if chars.is_empty() {
        return Vec::new();
    }

    let mut options = page.default_text_options();
    if vertical {
        options.line_dir = Direction::Ltr;
        options.char_dir = Direction::Ttb;
        options.line_dir_rotated = Some(Direction::Ltr);
        options.char_dir_rotated = Some(Direction::Ttb);
    }

    extract_text_lines(chars, &options, false, true)
        .into_iter()
        .map(|line| {
            let bbox = BBox::new(line.x0, line.top, line.x1, line.bottom);
            layout_object_from_bbox(
                if vertical { "textlinevertical" } else { "textlinehorizontal" },
                page.page_number,
                bbox,
                Some(line.text),
                Some(if vertical { Direction::Ttb } else { Direction::Ltr }),
                Some(!vertical),
                Vec::new(),
            )
        })
        .collect()
}

fn build_textbox_objects(lines: &[LayoutObject], vertical: bool) -> Vec<LayoutObject> {
    if lines.is_empty() {
        return Vec::new();
    }

    let mut sorted = lines.to_vec();
    if vertical {
        sorted.sort_by(|a, b| a.x0.total_cmp(&b.x0).then_with(|| a.top.total_cmp(&b.top)));
    } else {
        sorted.sort_by(|a, b| a.top.total_cmp(&b.top).then_with(|| a.x0.total_cmp(&b.x0)));
    }

    let mut groups: Vec<Vec<LayoutObject>> = Vec::new();
    for line in sorted {
        let should_split = groups
            .last()
            .and_then(|group| group.last())
            .map(|prev| {
                let gap = if vertical {
                    line.x0 - prev.x1
                } else {
                    line.top - prev.bottom
                };
                let overlap = if vertical {
                    !(line.bottom < prev.top || line.top > prev.bottom)
                } else {
                    !(line.x1 < prev.x0 || line.x0 > prev.x1)
                };
                gap > 12.0 && !overlap
            })
            .unwrap_or(false);

        if should_split {
            groups.push(vec![line]);
        } else if let Some(group) = groups.last_mut() {
            group.push(line);
        } else {
            groups.push(vec![line]);
        }
    }

    groups
        .into_iter()
        .filter_map(|children| {
            let bbox = merge_layout_children(&children)?;
            let text = children
                .iter()
                .filter_map(|child| child.text.clone())
                .collect::<Vec<_>>()
                .join("\n");
            Some(layout_object_from_bbox(
                if vertical { "textboxvertical" } else { "textboxhorizontal" },
                children[0].page_number,
                bbox,
                Some(text),
                Some(if vertical { Direction::Ttb } else { Direction::Ltr }),
                Some(!vertical),
                children,
            ))
        })
        .collect()
}

impl Page {
    pub fn pages(&self) -> Vec<Page> {
        vec![self.clone()]
    }

    pub fn point2coord(&self, pt: (f64, f64)) -> (f64, f64) {
        (pt.0, self.height - pt.1)
    }

    pub fn textlinehorizontals(&self) -> Vec<LayoutObject> {
        let chars: Vec<Char> = self.chars.iter().filter(|ch| ch.upright).cloned().collect();
        build_textline_objects(self, &chars, false)
    }

    pub fn textlineverticals(&self) -> Vec<LayoutObject> {
        let chars: Vec<Char> = self.chars.iter().filter(|ch| !ch.upright).cloned().collect();
        build_textline_objects(self, &chars, true)
    }

    pub fn textboxhorizontals(&self) -> Vec<LayoutObject> {
        build_textbox_objects(&self.textlinehorizontals(), false)
    }

    pub fn textboxverticals(&self) -> Vec<LayoutObject> {
        build_textbox_objects(&self.textlineverticals(), true)
    }

    pub fn layout(&self) -> PageLayout {
        let mut objects = Vec::new();
        objects.extend(self.textboxhorizontals());
        objects.extend(self.textboxverticals());
        objects.extend(self.textlinehorizontals());
        objects.extend(self.textlineverticals());
        PageLayout {
            page_number: self.page_number,
            bbox: self.bbox,
            objects,
        }
    }

    pub fn iter_layout_objects(&self) -> std::vec::IntoIter<LayoutObject> {
        let mut flat = Vec::new();
        fn push_children(target: &mut Vec<LayoutObject>, object: LayoutObject) {
            let children = object.children.clone();
            target.push(object);
            for child in children {
                push_children(target, child);
            }
        }
        for object in self.layout().objects {
            push_children(&mut flat, object);
        }
        flat.into_iter()
    }

    pub fn parse_objects(&self) -> BTreeMap<String, Vec<LayoutObject>> {
        let mut out = BTreeMap::new();
        let textboxh = self.textboxhorizontals();
        if !textboxh.is_empty() {
            out.insert("textboxhorizontal".to_string(), textboxh);
        }
        let textboxv = self.textboxverticals();
        if !textboxv.is_empty() {
            out.insert("textboxvertical".to_string(), textboxv);
        }
        let textlineh = self.textlinehorizontals();
        if !textlineh.is_empty() {
            out.insert("textlinehorizontal".to_string(), textlineh);
        }
        let textlinev = self.textlineverticals();
        if !textlinev.is_empty() {
            out.insert("textlinevertical".to_string(), textlinev);
        }
        out
    }

    pub fn process_object(&self, obj: &LayoutObject) -> LayoutObject {
        obj.clone()
    }

    pub fn structure_tree(&self) -> Option<&StructureElement> {
        self.structure_tree.as_ref()
    }

    pub fn derive_word_layout(&self) -> Vec<LayoutObject> {
        let options = self.default_text_options();
        extract_words(&self.chars, &options, false)
            .into_iter()
            .map(|word| {
                let bbox = BBox::new(word.x0, word.top, word.x1, word.bottom);
                layout_object_from_bbox(
                    "word",
                    self.page_number,
                    bbox,
                    Some(word.text),
                    Some(word.direction),
                    Some(word.upright),
                    Vec::new(),
                )
            })
            .collect()
    }
}

impl PdfDocument {
    pub fn structure_tree(&self) -> Option<&StructureElement> {
        self.structure_tree.as_ref()
    }

    pub fn textboxhorizontals(&self) -> Vec<LayoutObject> {
        self.pages.iter().flat_map(|page| page.textboxhorizontals()).collect()
    }

    pub fn textboxverticals(&self) -> Vec<LayoutObject> {
        self.pages.iter().flat_map(|page| page.textboxverticals()).collect()
    }

    pub fn textlinehorizontals(&self) -> Vec<LayoutObject> {
        self.pages.iter().flat_map(|page| page.textlinehorizontals()).collect()
    }

    pub fn textlineverticals(&self) -> Vec<LayoutObject> {
        self.pages.iter().flat_map(|page| page.textlineverticals()).collect()
    }
}