1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
use super::*;
use lazy_static::lazy_static;
use spawning::SpawningFileAdapter;
use std::io::BufReader;
use std::process::Command;

/// File extensions this adapter registers for; each entry is turned into a
/// `FastMatcher::FileExtension` when `METADATA` is built.
static EXTENSIONS: &[&str] = &["pdf"];

lazy_static! {
    /// Metadata describing the poppler adapter, returned by
    /// `PopplerAdapter::metadata`.
    static ref METADATA: AdapterMeta = {
        // One `FastMatcher::FileExtension` per entry in `EXTENSIONS`.
        let fast_matchers = EXTENSIONS
            .iter()
            .map(|ext| FastMatcher::FileExtension((*ext).to_owned()))
            .collect();

        AdapterMeta {
            name: String::from("poppler"),
            version: 1,
            description: String::from(
                "Uses pdftotext (from poppler-utils) to extract plain text from PDF files",
            ),
            fast_matchers,
            slow_matchers: None,
        }
    };
}
/// Adapter that extracts plain text from PDF files by running the
/// `pdftotext` executable (see `get_exe` / `command` in its
/// `SpawningFileAdapter` impl).
///
/// The type is a unit struct and holds no state.
#[derive(Default)]
pub struct PopplerAdapter;

impl PopplerAdapter {
    pub fn new() -> PopplerAdapter {
        PopplerAdapter
    }
}

impl GetMetadata for PopplerAdapter {
    /// Returns a reference to the shared, lazily-initialised `METADATA`.
    fn metadata(&self) -> &AdapterMeta {
        let meta: &AdapterMeta = &METADATA;
        meta
    }
}
/// Runs `pdftotext` on a PDF and labels every extracted line with the page it
/// came from.
impl SpawningFileAdapter for PopplerAdapter {
    /// Copies `inp` to `oup` line by line, writing each line as
    /// `{line_prefix}Page {n}: {line}\n`.
    ///
    /// The page number starts at 1 and advances by one for **every** form
    /// feed (`\x0c`) found in a line; the form feeds themselves are removed
    /// from the output. Counting each form feed (rather than each line that
    /// contains one) keeps the numbering correct when several page breaks
    /// land on the same line, e.g. after a page with no text.
    ///
    /// # Errors
    ///
    /// Returns an error if reading a line from `inp` fails (this includes
    /// input that is not valid UTF-8) or if writing to `oup` fails.
    fn postproc(line_prefix: &str, inp: &mut dyn Read, oup: &mut dyn Write) -> Fallible<()> {
        let mut page = 1;
        for line in BufReader::new(inp).lines() {
            let mut line = line?;
            let page_breaks = line.matches('\x0c').count();
            if page_breaks > 0 {
                // Strip the page-break markers in place and move the counter
                // forward by as many pages as were crossed.
                line.retain(|c| c != '\x0c');
                page += page_breaks;
            }
            // Build the whole line first so it reaches `oup` in one write.
            oup.write_all(format!("{}Page {}: {}\n", line_prefix, page, line).as_bytes())?;
        }
        Ok(())
    }

    /// Name of the executable to spawn.
    fn get_exe(&self) -> &str {
        "pdftotext"
    }

    /// Appends the `pdftotext` arguments to `cmd` and returns it.
    ///
    /// `-layout` asks for the physical page layout to be kept. The two `-`
    /// arguments are the input and output file positions; per pdftotext's
    /// usage they select stdin and stdout. The file path hint is not used.
    fn command(&self, _filepath_hint: &Path, mut cmd: Command) -> Command {
        cmd.arg("-layout").arg("-").arg("-");
        cmd
    }
}