Struct PlainTextExtractor

Source

pub struct PlainTextExtractor { /* private fields */ }

Expand description

Plain text extractor with simplified API

Extracts text from PDF pages without maintaining position information, providing a simpler API by returning String and Vec<String> instead of Vec<TextFragment>.

§Architecture

This extractor uses the same content stream parser as TextExtractor, but does not keep position metadata in its output. While parsing, it tracks only minimal position data (the current x, y coordinates) to decide where to insert spaces and line breaks, then returns clean text strings.

§Performance Characteristics

Memory: O(1) position tracking vs O(n) fragments
CPU: No fragment sorting, no width calculations
Performance: Comparable to TextExtractor (same parser)

§Thread Safety

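PlainTextExtractor is Send and Sync and can be reused across multiple pages and documents. Create once, use many times. Note that extract and extract_lines take &mut self, so a single extractor cannot run extractions from several threads at the same time: use one extractor per thread, or guard a shared one with a lock such as a Mutex.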

§Examples

§Basic Usage

use oxidize_pdf::parser::PdfReader;
use oxidize_pdf::text::plaintext::PlainTextExtractor;

let doc = PdfReader::open_document("document.pdf")?;

let mut extractor = PlainTextExtractor::new();
let result = extractor.extract(&doc, 0)?;

println!("{}", result.text);

§Custom Configuration

use oxidize_pdf::parser::PdfReader;
use oxidize_pdf::text::plaintext::{PlainTextExtractor, PlainTextConfig};

let doc = PdfReader::open_document("document.pdf")?;

let config = PlainTextConfig {
    space_threshold: 0.3,
    newline_threshold: 12.0,
    preserve_layout: true,
    line_break_mode: oxidize_pdf::text::plaintext::LineBreakMode::Normalize,
};

let mut extractor = PlainTextExtractor::with_config(config);
let result = extractor.extract(&doc, 0)?;

Struct PlainTextExtractor

§Architecture

§Performance Characteristics

§Thread Safety

§Examples

§Basic Usage

§Custom Configuration

Implementations§

impl PlainTextExtractor

pub fn new() -> Self

§Examples

pub fn with_config(config: PlainTextConfig) -> Self

§Examples

pub fn extract<R: Read + Seek>( &mut self, document: &PdfDocument<R>, page_index: u32, ) -> ParseResult<PlainTextResult>

§Output

§Examples

pub fn extract_lines<R: Read + Seek>( &mut self, document: &PdfDocument<R>, page_index: u32, ) -> ParseResult<Vec<String>>

§Examples

pub fn config(&self) -> &PlainTextConfig

§Examples

Trait Implementations§

impl Default for PlainTextExtractor

fn default() -> Self

Auto Trait Implementations§

impl Freeze for PlainTextExtractor

impl RefUnwindSafe for PlainTextExtractor

impl Send for PlainTextExtractor

impl Sync for PlainTextExtractor

impl Unpin for PlainTextExtractor

impl UnwindSafe for PlainTextExtractor

Blanket Implementations§

impl<T> Any for T where T: 'static + ?Sized,

fn type_id(&self) -> TypeId

impl<T> Borrow<T> for T where T: ?Sized,

fn borrow(&self) -> &T

impl<T> BorrowMut<T> for T where T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

impl<T> From<T> for T

fn from(t: T) -> T

impl<T> Instrument for T

fn instrument(self, span: Span) -> Instrumented<Self>

fn in_current_span(self) -> Instrumented<Self>

impl<T, U> Into<U> for T where U: From<T>,

fn into(self) -> U

impl<T> IntoEither for T

fn into_either(self, into_left: bool) -> Either<Self, Self>

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self> where F: FnOnce(&Self) -> bool,

impl<T> Pointable for T

const ALIGN: usize

type Init = T

unsafe fn init(init: <T as Pointable>::Init) -> usize

unsafe fn deref<'a>(ptr: usize) -> &'a T

unsafe fn deref_mut<'a>(ptr: usize) -> &'a mut T

unsafe fn drop(ptr: usize)

impl<R, P> ReadPrimitive<R> for P where R: Read + ReadEndian<P>, P: Default,

fn read_from_little_endian(read: &mut R) -> Result<Self, Error>

fn read_from_big_endian(read: &mut R) -> Result<Self, Error>

fn read_from_native_endian(read: &mut R) -> Result<Self, Error>

impl<T> Same for T

type Output = T

impl<T, U> TryFrom<U> for T where U: Into<T>,

type Error = Infallible

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

impl<T, U> TryInto<U> for T where U: TryFrom<T>,

type Error = <U as TryFrom<T>>::Error

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

impl<V, T> VZip<V> for T where V: MultiLane<T>,

fn vzip(self) -> V

impl<T> WithSubscriber for T

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self> where S: Into<Dispatch>,

fn with_current_subscriber(self) -> WithDispatch<Self>

Struct PlainTextExtractor

impl<T> Any for T
where T: 'static + ?Sized,

impl<T> Borrow<T> for T
where T: ?Sized,

impl<T> BorrowMut<T> for T
where T: ?Sized,

impl<T, U> Into<U> for T
where U: From<T>,

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

impl<R, P> ReadPrimitive<R> for P
where R: Read + ReadEndian<P>, P: Default,

impl<T, U> TryFrom<U> for T
where U: Into<T>,

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self>
where S: Into<Dispatch>,