br_pdf/
lib.rs

1use pdf_extract::content::{Content, Operation};
2use pdf_extract::{dictionary, Document, Object, ObjectId, Stream};
3
4pub struct Pdf {
5    version: f32,
6    text: String,
7    document: Document,
8    filename: String,
9    pub font_id: ObjectId,
10    pub resources_id: ObjectId,
11    pub pages_id: ObjectId,
12    pub content_id: ObjectId,
13}
14
15impl Pdf {
16    pub fn read(filename: &str) -> Result<Self, String> {
17        let bytes = match std::fs::read(filename) {
18            Ok(e) => e,
19            Err(e) => return Err(format!("Failed to open {}", e)),
20        };
21        let document = Document::load(filename).unwrap();
22        let out = pdf_extract::extract_text_from_mem(&bytes).unwrap();
23        Ok(Self {
24            version: document.version.parse().unwrap(),
25            text: out,
26            document,
27            filename: filename.to_string(),
28            font_id: (0, 0),
29            resources_id: (0, 0),
30            pages_id: (0, 0),
31            content_id: (0, 0),
32        })
33    }
34    pub fn version(&self) -> f32 {
35        self.version
36    }
37    pub fn text(&self) -> String {
38        self.text.clone()
39    }
40    pub fn get_text_list(&self) -> Vec<&str> {
41        let lines = self.text.lines();
42        let mut list = vec![];
43        for line in lines {
44            if line.trim().is_empty() {
45                continue;
46            }
47            list.push(line.trim());
48        }
49        list
50    }
51
52    pub fn write(filename: &str) -> Result<Self, String> {
53        let mut doc = Document::with_version("1.7");
54        let pages_id = doc.new_object_id();
55
56        let font_id = doc.add_object(dictionary! {
57            "Type" => "Font",
58            "Subtype" => "Type1",
59            "BaseFont" => "Courier",
60        });
61        let resources_id = doc.add_object(dictionary! {
62            "Font" => dictionary! {
63                "F1" => font_id,
64            },
65        });
66        Ok(Self {
67            version: doc.version.parse().unwrap(),
68            text: "".to_string(),
69            document: doc,
70            pages_id,
71            font_id,
72            resources_id,
73            filename: filename.to_string(),
74            content_id: (0, 0),
75        })
76    }
77
78    pub fn page_data(mut self) {
79        let content = Content {
80            operations: vec![
81                Operation::new("BT", vec![]),
82                Operation::new("Tf", vec!["F1".into(), 10.into()]),
83                Operation::new("Td", vec![0.into(), 800.into()]),
84                Operation::new("Tj", vec![Object::string_literal("Hello World!")]),
85                Operation::new("ET", vec![]),
86            ],
87        };
88        let content_id = self
89            .document
90            .add_object(Stream::new(dictionary! {}, content.encode().unwrap()));
91        self.content_id = content_id;
92        let page_id = self.document.add_object(dictionary! {
93            "Type" => "Page",
94            "Parent" => self.pages_id,
95            "Contents" => content_id,
96        });
97        let pages = dictionary! {
98            // Type of dictionary
99            "Type" => "Pages",
100            // Vector of page IDs in document. Normally would contain more than one ID
101            // and be produced using a loop of some kind.
102            "Kids" => vec![page_id.into()],
103            // Page count
104            "Count" => 1,
105            // ID of resources dictionary, defined earlier
106            "Resources" => self.resources_id,
107            // A rectangle that defines the boundaries of the physical or digital media.
108            // This is the "page size".
109            "MediaBox" => vec![0.into(), 0.into(), 595.into(), 842.into()],
110        };
111        self.document
112            .objects
113            .insert(self.pages_id, Object::Dictionary(pages));
114        let catalog_id = self.document.add_object(dictionary! {
115            "Type" => "Catalog",
116            "Pages" => self.pages_id,
117        });
118        self.document.trailer.set("Root", catalog_id);
119        self.document.compress();
120        self.document.save(self.filename).unwrap();
121    }
122}