Struct ExtractionOptions

Source

/// Text extraction options.
pub struct ExtractionOptions {
    /// Preserve the original layout (spacing and positioning).
    pub preserve_layout: bool,
    /// Minimum space width, in text space units, at which a space character
    /// is inserted.
    pub space_threshold: f64,
    /// Threshold, as a fraction of the current font size, above which a TJ
    /// numeric kerning offset is turned into one implicit U+0020.
    /// Documented default is 0.2. Kept separate from `space_threshold`,
    /// which governs the gap between separate text-show operators.
    pub tj_space_threshold: f64,
    /// Minimum vertical distance, in text space units, at which a newline
    /// is inserted.
    pub newline_threshold: f64,
    /// Sort text fragments by position (useful for multi-column layouts).
    pub sort_by_position: bool,
    /// Detect and handle columns.
    pub detect_columns: bool,
    /// Column separation threshold, in page units.
    pub column_threshold: f64,
    /// Merge hyphenated words at line ends.
    pub merge_hyphenated: bool,
    /// Record space insertion decisions in each `TextFragment`
    /// (`TextFragment::space_decisions`). Documented default is `false`,
    /// which is described as zero overhead.
    pub track_space_decisions: bool,
    /// Group raw text fragments by baseline into lines, then group
    /// consecutive lines with normal leading into paragraph-level fragments.
    /// Documented default is `false` for backward compatibility; the
    /// `PdfDocument::partition*` entry points force it to `true`.
    pub reconstruct_paragraphs: bool,
    /// Include content inside `/Artifact` marked-content scopes (page
    /// headers, footers, watermarks, decorative content). Documented default
    /// is `false`, meaning Artifact content is filtered out.
    pub include_artifacts: bool,
}

Expand description

Text extraction options

Fields§

§preserve_layout: bool

Preserve the original layout (spacing and positioning)

§space_threshold: f64

Minimum space width at which a space character is inserted (in text space units)

§tj_space_threshold: f64

Threshold for synthesising an implicit U+0020 from a TJ numeric kerning offset, expressed as a fraction of the current font size. A TJ kern advances the text matrix by -adjustment/1000 * font_size without rendering any glyph; many PDFs (academic publishers, LaTeX, kerned typography) encode inter-word gaps purely as wide negative kerns rather than literal space bytes. When the synthesised advance exceeds tj_space_threshold * font_size, the extractor inserts one U+0020. Default 0.2 (200 milli-em) sits well between typical intra-word kerning (10-50 milli-em) and the width of a space glyph in most fonts (250-300 milli-em). Lower values catch tighter spaces; higher values reduce false positives in fonts with unusually wide kerning. Separate from space_threshold (which governs the post-glyph gap between separate text-show operators) because the TJ numeric kern is measured without any glyph advance baseline and needs a more sensitive threshold (issue #272).

§newline_threshold: f64

Minimum vertical distance at which a newline is inserted (in text space units)

§sort_by_position: bool

Sort text fragments by position (useful for multi-column layouts)

§detect_columns: bool

Detect and handle columns

§column_threshold: f64

Column separation threshold (in page units)

§merge_hyphenated: bool

Merge hyphenated words at line ends

§track_space_decisions: bool

Track space insertion decisions in each TextFragment (default: false). When false: zero overhead. When true: populates TextFragment::space_decisions.

§reconstruct_paragraphs: bool

Reconstruct visual lines and paragraphs from the raw text fragments produced by PDF text-show operators. When true, the extractor groups fragments by baseline into single-line fragments, then groups consecutive lines with normal leading into paragraph-level fragments. This is what the partition pipeline needs to produce Element values at paragraph granularity rather than at per-Tj granularity (see issue #261).

Default false for backward compatibility with direct extract_text callers. The PdfDocument::partition* entry points force this to true.

§include_artifacts: bool

Include content inside /Artifact marked-content scopes (page headers, footers, watermarks, decorative content). Default false — Artifact content is filtered out, as the PDF/UA conformance level recommends for accessibility tooling and as RAG callers consistently want (issue #269 Phase 1). Opt-in by setting true when extracting page furniture matters (e.g. forensic auditing, redaction tools).

ExtractionOptions

Struct ExtractionOptions Copy item path

Fields§

Trait Implementations§

impl Clone for ExtractionOptions

fn clone(&self) -> ExtractionOptions

fn clone_from(&mut self, source: &Self)

impl Debug for ExtractionOptions

fn fmt(&self, f: &mut Formatter<'_>) -> Result

impl Default for ExtractionOptions

fn default() -> Self

Auto Trait Implementations§

impl Freeze for ExtractionOptions

impl RefUnwindSafe for ExtractionOptions

impl Send for ExtractionOptions

impl Sync for ExtractionOptions

impl Unpin for ExtractionOptions

impl UnsafeUnpin for ExtractionOptions

impl UnwindSafe for ExtractionOptions

Blanket Implementations§

impl<T> Any for T where T: 'static + ?Sized,

fn type_id(&self) -> TypeId

impl<T> Borrow<T> for T where T: ?Sized,

fn borrow(&self) -> &T

impl<T> BorrowMut<T> for T where T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

impl<T> CloneToUninit for T where T: Clone,

unsafe fn clone_to_uninit(&self, dest: *mut u8)

impl<T> From<T> for T

fn from(t: T) -> T

impl<T> Instrument for T

fn instrument(self, span: Span) -> Instrumented<Self>

fn in_current_span(self) -> Instrumented<Self>

impl<T, U> Into<U> for T where U: From<T>,

fn into(self) -> U

impl<T> Same for T

type Output = T

impl<T> ToOwned for T where T: Clone,

type Owned = T

fn to_owned(&self) -> T

fn clone_into(&self, target: &mut T)

impl<T, U> TryFrom<U> for T where U: Into<T>,

type Error = Infallible

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

impl<T, U> TryInto<U> for T where U: TryFrom<T>,

type Error = <U as TryFrom<T>>::Error

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

impl<T> WithSubscriber for T

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self> where S: Into<Dispatch>,

fn with_current_subscriber(self) -> WithDispatch<Self>

Struct ExtractionOptions

impl<T> Any for T
where T: 'static + ?Sized,

impl<T> Borrow<T> for T
where T: ?Sized,

impl<T> BorrowMut<T> for T
where T: ?Sized,

impl<T> CloneToUninit for T
where T: Clone,

impl<T, U> Into<U> for T
where U: From<T>,

impl<T> ToOwned for T
where T: Clone,

impl<T, U> TryFrom<U> for T
where U: Into<T>,

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self>
where S: Into<Dispatch>,