1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
extern crate lopdf;
use std::collections::BTreeMap;
use std::fmt;
use std::io::Read;
use lopdf::content::Content;
use lopdf::{Document, Object, ObjectId};
/// Magnitude above which a numeric operand of a text-showing operator (`Tj`/`TJ`) is
/// recorded as a single space in the collected text.
const SPACE_THRESHOLD: i64 = 100;
/// Accumulates the text extracted from the pages of a PDF document.
///
/// A fresh collector (via `Default`) starts with an empty text buffer.
#[derive(Debug, Default)]
pub struct Collector {
    /// Text gathered so far, in the order it was encountered.
    text: String,
}
/// Line, word and character totals for a piece of text.
///
/// The default value has every counter set to zero.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct WordCount {
    /// Number of whitespace-separated words.
    pub words: usize,
    /// Number of characters.
    pub characters: usize,
    /// Number of lines.
    pub lines: usize,
}
impl Collector {
pub fn process_document<R: Read>(source: R) -> WordCount {
let document = Document::load_from(source).unwrap();
let mut collector = Collector::default();
let pages = document.get_pages();
for page_id in pages.values().into_iter() {
collector.process_page(&document, *page_id);
}
collector.count()
}
fn collect_text(&mut self, encoding: Option<&str>, operands: &[Object]) {
for operand in operands.iter() {
match operand {
Object::String(ref bytes, _) => {
let decoded_text = Document::decode_text(encoding, bytes);
self.text.push_str(&decoded_text);
}
Object::Array(ref arr) => {
self.collect_text(encoding, arr);
}
Object::Real(f) if f.abs() > SPACE_THRESHOLD as f64 => {
self.text.push(' ');
}
Object::Integer(i) if i.abs() > SPACE_THRESHOLD => {
self.text.push(' ');
}
_ => {}
}
}
}
fn process_page(&mut self, document: &Document, page_id: ObjectId) {
let fonts = document.get_page_fonts(page_id);
let encodings = fonts
.into_iter()
.map(|(name, font)| (name, document.get_font_encoding(font)))
.collect::<BTreeMap<String, &str>>();
let raw_content = document.get_page_content(page_id).unwrap();
let content = Content::decode(&raw_content).unwrap();
let mut current_encoding = None;
for operation in content.operations.iter() {
match operation.operator.as_ref() {
"Tf" => {
let current_font = operation.operands[0].as_name_str().unwrap();
current_encoding = encodings.get(current_font).cloned();
}
"Tj" | "TJ" => {
self.collect_text(current_encoding, &operation.operands);
}
"ET" => if !self.text.ends_with('\n') {
self.text.push('\n')
},
"Td" | "TD" | "T*" if self.text.ends_with('-') => {
let len = self.text.len() - 1;
self.text.truncate(len)
},
"Td" | "TD" | "T*" if !self.text.ends_with(' ') => {
self.text.push(' ')
},
_ => {}
}
}
self.text = self.text.trim().to_string();
}
fn count(&self) -> WordCount {
let mut wc = WordCount::default();
wc.characters = self.text.len();
wc.words = self.text.split_whitespace().count();
wc.lines = self.text.lines().count();
wc
}
}
impl fmt::Display for WordCount {
    /// Writes the line, word and character counts, in that order, each preceded by a tab.
    fn fmt(&self, out: &mut fmt::Formatter) -> fmt::Result {
        let columns = [self.lines, self.words, self.characters];
        for value in columns.iter() {
            write!(out, "\t{}", value)?;
        }
        Ok(())
    }
}