1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
extern crate lopdf;

use std::collections::BTreeMap;
use std::fmt;
use std::io::Read;

use lopdf::content::Content;
use lopdf::{Document, Object, ObjectId};

// ¯\_(ツ)_/¯
/// Magnitude above which a numeric operand met by `Collector::collect_text`
/// is turned into a single space in the collected text.
///
/// NOTE(review): the value looks like a heuristic rather than something
/// derived from a specification (hence the shrug above) — confirm before
/// tuning it.
const SPACE_THRESHOLD: i64 = 100;

/// Accumulates the text extracted from the pages of a PDF document.
///
/// A fresh collector (via `Default`) starts with an empty buffer.
#[derive(Debug, Default)]
pub struct Collector {
    /// Text gathered so far, across all pages processed by this collector.
    text: String,
}

/// Totals computed over the text extracted from a document.
///
/// All counters are zero for `WordCount::default()`.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct WordCount {
    /// Number of whitespace-separated words.
    pub words: usize,
    /// Size of the extracted text.
    pub characters: usize,
    /// Number of lines in the extracted text.
    pub lines: usize,
}

impl Collector {
    pub fn process_document<R: Read>(source: R) -> WordCount {
        let document = Document::load_from(source).unwrap();
        let mut collector = Collector::default();
        let pages = document.get_pages();
        for page_id in pages.values().into_iter() {
            collector.process_page(&document, *page_id);
        }

        collector.count()
    }

    fn collect_text(&mut self, encoding: Option<&str>, operands: &[Object]) {
        for operand in operands.iter() {
            match operand {
                Object::String(ref bytes, _) => {
                    let decoded_text = Document::decode_text(encoding, bytes);
                    self.text.push_str(&decoded_text);
                }
                Object::Array(ref arr) => {
                    self.collect_text(encoding, arr);
                }
                Object::Real(f) if f.abs() > SPACE_THRESHOLD as f64 => {
                    self.text.push(' ');
                }
                Object::Integer(i) if i.abs() > SPACE_THRESHOLD => {
                    self.text.push(' ');
                }
                _ => {}
            }
        }
    }

    fn process_page(&mut self, document: &Document, page_id: ObjectId) {
        let fonts = document.get_page_fonts(page_id);
        let encodings = fonts
            .into_iter()
            .map(|(name, font)| (name, document.get_font_encoding(font)))
            .collect::<BTreeMap<String, &str>>();
        let raw_content = document.get_page_content(page_id).unwrap();
        let content = Content::decode(&raw_content).unwrap();
        let mut current_encoding = None;

        for operation in content.operations.iter() {
            match operation.operator.as_ref() {
                "Tf" => {
                    let current_font = operation.operands[0].as_name_str().unwrap();
                    current_encoding = encodings.get(current_font).cloned();
                }
                "Tj" | "TJ" => {
                    self.collect_text(current_encoding, &operation.operands);
                }
                "ET" => if !self.text.ends_with('\n') {
                    self.text.push('\n')
                },
                "Td" | "TD" | "T*" if self.text.ends_with('-') => {
                    // Trim away end-of-line hyphenation:
                    let len = self.text.len() - 1;
                    self.text.truncate(len)
                },
                "Td" | "TD" | "T*" if !self.text.ends_with(' ') => {
                    self.text.push(' ')
                },
                _ => {}
            }
        }

        self.text = self.text.trim().to_string();
    }

    fn count(&self) -> WordCount {
        let mut wc = WordCount::default();
        wc.characters = self.text.len();
        wc.words = self.text.split_whitespace().count();
        wc.lines = self.text.lines().count();
        wc
    }
}

/// Renders the counts as tab-prefixed columns: lines, words, characters.
impl fmt::Display for WordCount {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // Each column is written as a tab followed by the number.
        let columns = [self.lines, self.words, self.characters];
        for value in columns.iter() {
            write!(f, "\t{}", value)?;
        }
        Ok(())
    }
}