cli-pdf-to-text 0.1.20

A CLI pdf to plain text converter
Documentation
use pdf_extract::{ConvertToFmt, OutputDev, OutputError, Transform};

pub struct LayoutTextOutput<W: ConvertToFmt> {
  writer: W::Writer,
  flip_ctm: Transform,
  media_left_x: f64,
  media_right_x: f64,
  doc_left_x: f64,
  have_doc_left_x: bool,
  page_left_x: f64,
  have_page_left_x: bool,
  last_end: f64,
  last_x: f64,
  last_y: f64,
  have_char: bool,
  at_word_start: bool,
  skip_current_word: bool,
  pending_newline: bool,
}

impl<W: ConvertToFmt> LayoutTextOutput<W> {
  pub fn new(writer: W) -> Self {
    Self {
      writer: writer.convert(),
      flip_ctm: Transform::identity(),
      media_left_x: 0.0,
      media_right_x: 0.0,
      doc_left_x: 0.0,
      have_doc_left_x: false,
      page_left_x: 0.0,
      have_page_left_x: false,
      last_end: 0.0,
      last_x: 0.0,
      last_y: 0.0,
      have_char: false,
      at_word_start: false,
      skip_current_word: false,
      pending_newline: false,
    }
  }

  fn transformed_font_size(trm: &Transform, font_size: f64) -> f64 {
    let x_vec = (trm.m11 + trm.m21) * font_size;
    let y_vec = (trm.m12 + trm.m22) * font_size;
    let v = (x_vec * y_vec).abs().sqrt();
    if v.is_finite() && v > 0.000_001 { v } else { font_size.max(0.000_001) }
  }

  fn space_width(font_size: f64) -> f64 {
    font_size * 0.35
  }

  fn write_n_spaces(
    writer: &mut impl std::fmt::Write,
    mut count: usize,
  ) -> Result<(), OutputError> {
    use std::fmt::Write;
    const MAX_SPACES: usize = 200;
    if count > MAX_SPACES {
      count = MAX_SPACES;
    }
    if count == 0 {
      return Ok(());
    }
    let s = " ".repeat(count);
    write!(writer, "{s}")?;
    Ok(())
  }

  fn is_mostly_vertical(trm: &Transform) -> bool {
    let horizontal = trm.m11.abs();
    let vertical = trm.m12.abs();
    vertical > (horizontal * 1.4) && vertical > 0.000_001
  }

  fn near_right_margin_x(&self, x: f64) -> bool {
    let width = (self.media_right_x - self.media_left_x).abs();
    if width <= 0.0 {
      return false;
    }
    let margin = (width * 0.12).clamp(24.0, 96.0);
    x >= self.media_right_x - margin
  }

  fn should_skip_margin_word(
    &self,
    trm: &Transform,
    x: f64,
    y: f64,
    fs: f64,
    ch: &str,
  ) -> bool {
    if !self.near_right_margin_x(x) {
      return false;
    }
    let is_alpha = ch.chars().all(char::is_alphabetic);
    if !is_alpha {
      return false;
    }
    if Self::is_mostly_vertical(trm) {
      return true;
    }
    if !self.have_char {
      return true;
    }
    let dx = (x - self.last_x).abs();
    let dy = (y - self.last_y).abs();
    dx <= fs * 0.35 && dy >= fs * 0.45
  }
}

impl<W: ConvertToFmt> OutputDev for LayoutTextOutput<W> {
  fn begin_page(
    &mut self,
    _page_num: u32,
    media_box: &pdf_extract::MediaBox,
    _art_box: Option<(f64, f64, f64, f64)>,
  ) -> Result<(), OutputError> {
    self.flip_ctm = Transform::row_major(
      1.0,
      0.0,
      0.0,
      -1.0,
      0.0,
      media_box.ury - media_box.lly,
    );
    self.media_left_x = media_box.llx;
    self.media_right_x = media_box.urx;
    if self.have_doc_left_x {
      self.page_left_x = self.doc_left_x;
      self.have_page_left_x = true;
    } else {
      self.have_page_left_x = false;
      self.page_left_x = 0.0;
    }
    self.last_end = 0.0;
    self.last_x = 0.0;
    self.last_y = 0.0;
    self.have_char = false;
    self.at_word_start = false;
    self.skip_current_word = false;
    self.pending_newline = false;
    Ok(())
  }

  fn end_page(&mut self) -> Result<(), OutputError> {
    use std::fmt::Write;
    if self.have_char {
      writeln!(self.writer)?;
      writeln!(self.writer)?;
    }
    Ok(())
  }

  fn output_character(
    &mut self,
    trm: &Transform,
    width: f64,
    _spacing: f64,
    font_size: f64,
    ch: &str,
  ) -> Result<(), OutputError> {
    use std::fmt::Write;

    let position = trm.post_transform(&self.flip_ctm);
    let (x, y) = (position.m31, position.m32);
    let fs = Self::transformed_font_size(trm, font_size);
    let sw = Self::space_width(fs);

    if self.at_word_start {
      self.skip_current_word = self.should_skip_margin_word(trm, x, y, fs, ch);
      if self.skip_current_word {
        self.at_word_start = false;
        return Ok(());
      }

      if !self.have_page_left_x {
        self.page_left_x = x;
        self.have_page_left_x = true;
      } else if x < self.page_left_x {
        self.page_left_x = x;
      }
      if !self.have_doc_left_x {
        self.doc_left_x = x;
        self.have_doc_left_x = true;
      } else if x < self.doc_left_x {
        self.doc_left_x = x;
      }

      if self.have_char {
        let dy = (y - self.last_y).abs();
        let moved_left = x < self.last_x - (fs * 0.25);
        if dy > fs * 0.90 || (moved_left && dy > fs * 0.25) {
          self.pending_newline = true;
        }
      }

      if self.pending_newline {
        writeln!(self.writer)?;
        self.last_end = 0.0;
        self.pending_newline = false;

        if self.have_page_left_x {
          let indent = (x - self.page_left_x).max(0.0);
          let spaces = (indent / sw).round() as usize;
          Self::write_n_spaces(&mut self.writer, spaces)?;
        }
      } else if self.have_char {
        let gap = x - self.last_end;
        if gap > sw * 0.75 {
          let spaces = (gap / sw).round().max(1.0) as usize;
          Self::write_n_spaces(&mut self.writer, spaces)?;
        }
      } else if self.have_page_left_x {
        let indent = (x - self.page_left_x).max(0.0);
        if indent > sw * 0.5 {
          let spaces = (indent / sw).round() as usize;
          Self::write_n_spaces(&mut self.writer, spaces)?;
        }
      }
    }

    if self.skip_current_word {
      return Ok(());
    }

    write!(self.writer, "{ch}")?;

    self.have_char = true;
    self.at_word_start = false;
    self.last_x = x;
    self.last_y = y;
    self.last_end = x + (width * fs);
    Ok(())
  }

  fn begin_word(&mut self) -> Result<(), OutputError> {
    self.at_word_start = true;
    self.skip_current_word = false;
    Ok(())
  }

  fn end_word(&mut self) -> Result<(), OutputError> {
    self.skip_current_word = false;
    Ok(())
  }

  fn end_line(&mut self) -> Result<(), OutputError> {
    Ok(())
  }
}