br_pdf/
lib.rs

1use std::io::Cursor;
2use pdf_extract::content::{Content, Operation};
3use pdf_extract::{dictionary, Document, Object, ObjectId, Stream};
4
5pub struct Pdf {
6    version: f32,
7    text: String,
8    document: Document,
9    filename: String,
10    pub font_id: ObjectId,
11    pub resources_id: ObjectId,
12    pub pages_id: ObjectId,
13    pub content_id: ObjectId,
14}
15
16impl Pdf {
17    pub fn read_from_file(filename: &str) -> Result<Self, String> {
18        let bytes = match std::fs::read(filename) {
19            Ok(e) => e,
20            Err(e) => return Err(format!("Failed to open {}", e)),
21        };
22
23        Self::inner_from(bytes, Some(filename))
24    }
25
26    pub fn read_from_bytes(bytes: Vec<u8>)->Result<Self, String> {
27        Self::inner_from(bytes, None)
28    }
29
30    fn inner_from(bytes: Vec<u8>, filename: Option<&str>)->Result<Self, String> {
31        let out = pdf_extract::extract_text_from_mem(&bytes).unwrap();
32        let reader = Cursor::new(bytes);
33        let document = Document::load_from(reader).unwrap();
34        let filename = filename.unwrap_or("memory").to_string();
35
36        Ok(Self {
37            version: document.version.parse().unwrap(),
38            text: out,
39            document,
40            filename,
41            font_id: (0, 0),
42            resources_id: (0, 0),
43            pages_id: (0, 0),
44            content_id: (0, 0),
45        })
46    }
47
48    pub fn version(&self) -> f32 {
49        self.version
50    }
51    pub fn text(&self) -> String {
52        self.text.clone()
53    }
54    pub fn get_text_list(&self) -> Vec<&str> {
55        let lines = self.text.lines();
56        let mut list = vec![];
57        for line in lines {
58            if line.trim().is_empty() {
59                continue;
60            }
61            list.push(line.trim());
62        }
63        list
64    }
65
66    pub fn write(filename: &str) -> Result<Self, String> {
67        let mut doc = Document::with_version("1.7");
68        let pages_id = doc.new_object_id();
69
70        let font_id = doc.add_object(dictionary! {
71            "Type" => "Font",
72            "Subtype" => "Type1",
73            "BaseFont" => "Courier",
74        });
75        let resources_id = doc.add_object(dictionary! {
76            "Font" => dictionary! {
77                "F1" => font_id,
78            },
79        });
80        Ok(Self {
81            version: doc.version.parse().unwrap(),
82            text: "".to_string(),
83            document: doc,
84            pages_id,
85            font_id,
86            resources_id,
87            filename: filename.to_string(),
88            content_id: (0, 0),
89        })
90    }
91
92    pub fn page_data(mut self) {
93        let content = Content {
94            operations: vec![
95                Operation::new("BT", vec![]),
96                Operation::new("Tf", vec!["F1".into(), 10.into()]),
97                Operation::new("Td", vec![0.into(), 800.into()]),
98                Operation::new("Tj", vec![Object::string_literal("Hello World!")]),
99                Operation::new("ET", vec![]),
100            ],
101        };
102        let content_id = self
103            .document
104            .add_object(Stream::new(dictionary! {}, content.encode().unwrap()));
105        self.content_id = content_id;
106        let page_id = self.document.add_object(dictionary! {
107            "Type" => "Page",
108            "Parent" => self.pages_id,
109            "Contents" => content_id,
110        });
111        let pages = dictionary! {
112            // Type of dictionary
113            "Type" => "Pages",
114            // Vector of page IDs in document. Normally would contain more than one ID
115            // and be produced using a loop of some kind.
116            "Kids" => vec![page_id.into()],
117            // Page count
118            "Count" => 1,
119            // ID of resources dictionary, defined earlier
120            "Resources" => self.resources_id,
121            // A rectangle that defines the boundaries of the physical or digital media.
122            // This is the "page size".
123            "MediaBox" => vec![0.into(), 0.into(), 595.into(), 842.into()],
124        };
125        self.document
126            .objects
127            .insert(self.pages_id, Object::Dictionary(pages));
128        let catalog_id = self.document.add_object(dictionary! {
129            "Type" => "Catalog",
130            "Pages" => self.pages_id,
131        });
132        self.document.trailer.set("Root", catalog_id);
133        self.document.compress();
134        self.document.save(self.filename).unwrap();
135    }
136}