pub struct PlainTextExtractor { /* private fields */ }
Expand description
Plain text extractor with simplified API
Extracts text from PDF pages without maintaining position information,
providing a simpler API by returning String and Vec<String> instead
of Vec<TextFragment>.
§Architecture
This extractor uses the same content stream parser as TextExtractor,
but discards position metadata to provide a simpler output format. It
tracks minimal position data (x, y coordinates) to determine spacing
and line breaks, then returns clean text strings.
§Performance Characteristics
- Memory: O(1) position tracking vs O(n) fragments
- CPU: No fragment sorting, no width calculations
- Performance: Comparable to TextExtractor (same parser)
§Thread Safety
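PlainTextExtractor is Send and Sync and can be reused across multiple
pages and documents. Create once, use many times. Because extract and
extract_lines take &mut self, sharing a single extractor between threads
requires exclusive access (for example a Mutex) or one extractor per thread.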
§Examples
§Basic Usage
use oxidize_pdf::parser::PdfReader;
use oxidize_pdf::text::plaintext::PlainTextExtractor;
let doc = PdfReader::open_document("document.pdf")?;
let mut extractor = PlainTextExtractor::new();
let result = extractor.extract(&doc, 0)?;
println!("{}", result.text);
§Custom Configuration
use oxidize_pdf::parser::PdfReader;
use oxidize_pdf::text::plaintext::{PlainTextExtractor, PlainTextConfig};
let doc = PdfReader::open_document("document.pdf")?;
let config = PlainTextConfig {
space_threshold: 0.3,
newline_threshold: 12.0,
preserve_layout: true,
line_break_mode: oxidize_pdf::text::plaintext::LineBreakMode::Normalize,
};
let mut extractor = PlainTextExtractor::with_config(config);
let result = extractor.extract(&doc, 0)?;
Implementations§
Source§impl PlainTextExtractor
impl PlainTextExtractor
Source pub fn new() -> Self
pub fn new() -> Self
Create a new extractor with default configuration
§Examples
use oxidize_pdf::text::plaintext::PlainTextExtractor;
let extractor = PlainTextExtractor::new();
Source pub fn with_config(config: PlainTextConfig) -> Self
pub fn with_config(config: PlainTextConfig) -> Self
Create a new extractor with custom configuration
§Examples
use oxidize_pdf::text::plaintext::{PlainTextExtractor, PlainTextConfig};
let config = PlainTextConfig::dense();
let extractor = PlainTextExtractor::with_config(config);
Source pub fn extract<R: Read + Seek>(
&mut self,
document: &PdfDocument<R>,
page_index: u32,
) -> ParseResult<PlainTextResult>
pub fn extract<R: Read + Seek>( &mut self, document: &PdfDocument<R>, page_index: u32, ) -> ParseResult<PlainTextResult>
Extract plain text from a PDF page
Returns text with spaces and newlines inserted according to the configured thresholds. Position information is not included in the result.
§Output
Returns a PlainTextResult containing the extracted text as a String,
along with character count and line count metadata. This is simpler than
TextExtractor which returns Vec<TextFragment> with position data.
§Examples
use oxidize_pdf::parser::PdfReader;
use oxidize_pdf::text::plaintext::PlainTextExtractor;
let doc = PdfReader::open_document("document.pdf")?;
let mut extractor = PlainTextExtractor::new();
let result = extractor.extract(&doc, 0)?; // page index 0 = first page
println!("Extracted {} characters", result.char_count);
Source pub fn extract_lines<R: Read + Seek>(
&mut self,
document: &PdfDocument<R>,
page_index: u32,
) -> ParseResult<Vec<String>>
pub fn extract_lines<R: Read + Seek>( &mut self, document: &PdfDocument<R>, page_index: u32, ) -> ParseResult<Vec<String>>
Extract text as individual lines
Returns a vector of strings, one for each line detected in the page. Useful for grep-like operations or line-based processing.
§Examples
use oxidize_pdf::parser::PdfReader;
use oxidize_pdf::text::plaintext::PlainTextExtractor;
let doc = PdfReader::open_document("document.pdf")?;
let mut extractor = PlainTextExtractor::new();
let lines = extractor.extract_lines(&doc, 0)?;
for (i, line) in lines.iter().enumerate() {
println!("{}: {}", i + 1, line);
}
Source pub fn config(&self) -> &PlainTextConfig
pub fn config(&self) -> &PlainTextConfig
Get the current configuration
§Examples
use oxidize_pdf::text::plaintext::{PlainTextExtractor, PlainTextConfig};
let config = PlainTextConfig::dense();
let extractor = PlainTextExtractor::with_config(config.clone());
assert_eq!(extractor.config().space_threshold, 0.1);
Trait Implementations§
Auto Trait Implementations§
impl Freeze for PlainTextExtractor
impl RefUnwindSafe for PlainTextExtractor
impl Send for PlainTextExtractor
impl Sync for PlainTextExtractor
impl Unpin for PlainTextExtractor
impl UnwindSafe for PlainTextExtractor
Blanket Implementations§
Source§impl<T> BorrowMut<T> for T where
T: ?Sized,
impl<T> BorrowMut<T> for T where
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Source§impl<T> Instrument for T
impl<T> Instrument for T
Source§fn instrument(self, span: Span) -> Instrumented<Self>
fn instrument(self, span: Span) -> Instrumented<Self>
Source§fn in_current_span(self) -> Instrumented<Self>
fn in_current_span(self) -> Instrumented<Self>
Source§impl<T> IntoEither for T
impl<T> IntoEither for T
Source§fn into_either(self, into_left: bool) -> Either<Self, Self>
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self>
if into_left is true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read more
Source§fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self>
if into_left(&self) returns true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read more
Source§impl<T> Pointable for T
impl<T> Pointable for T
Source§impl<R, P> ReadPrimitive<R> for P
impl<R, P> ReadPrimitive<R> for P
Source§fn read_from_little_endian(read: &mut R) -> Result<Self, Error>
fn read_from_little_endian(read: &mut R) -> Result<Self, Error>
Read this value from the supplied reader. Same as ReadEndian::read_from_little_endian().