use std::io::Cursor;
use pdf_extract::content::{Content, Operation};
use pdf_extract::{dictionary, Document, Object, ObjectId, Stream};
/// A PDF document together with the text extracted from it and the object
/// ids used when building a new document.
pub struct Pdf {
    /// PDF version, parsed from the underlying document's version string.
    version: f32,
    /// Extracted text; the empty string for a document created with `write`.
    text: String,
    /// The underlying document object model.
    document: Document,
    /// Path the document was read from or will be saved to; `"memory"` when
    /// the document was loaded from a byte buffer.
    filename: String,
    /// Id of the font object; `(0, 0)` for documents that were read.
    pub font_id: ObjectId,
    /// Id of the resources dictionary; `(0, 0)` for documents that were read.
    pub resources_id: ObjectId,
    /// Id of the page-tree root; `(0, 0)` for documents that were read.
    pub pages_id: ObjectId,
    /// Id of the page content stream; `(0, 0)` in every constructor.
    pub content_id: ObjectId,
}
impl Pdf {
    /// Reads the file at `filename` and parses it as a PDF.
    ///
    /// # Errors
    ///
    /// Returns a message naming the file if it cannot be read, or a message
    /// describing the failure if its contents cannot be parsed.
    pub fn read_from_file(filename: &str) -> Result<Self, String> {
        let bytes = std::fs::read(filename)
            .map_err(|e| format!("Failed to open {}: {}", filename, e))?;
        Self::inner_from(bytes, Some(filename))
    }

    /// Parses a PDF held in memory. The stored filename is `"memory"`.
    ///
    /// # Errors
    ///
    /// Returns a message describing the failure if the bytes cannot be parsed.
    pub fn read_from_bytes(bytes: Vec<u8>) -> Result<Self, String> {
        Self::inner_from(bytes, None)
    }

    /// Shared loader: extracts the text, builds the document model and reads
    /// the version. All object ids are left at `(0, 0)`.
    ///
    /// Errors reported by the extraction, loading and version-parsing steps
    /// are converted into `Err` instead of panicking.
    fn inner_from(bytes: Vec<u8>, filename: Option<&str>) -> Result<Self, String> {
        let text = pdf_extract::extract_text_from_mem(&bytes)
            .map_err(|e| format!("Failed to extract text: {}", e))?;
        let document = Document::load_from(Cursor::new(bytes))
            .map_err(|e| format!("Failed to load document: {}", e))?;
        let version = Self::parse_version(&document.version)?;
        Ok(Self {
            version,
            text,
            document,
            filename: filename.unwrap_or("memory").to_string(),
            font_id: (0, 0),
            resources_id: (0, 0),
            pages_id: (0, 0),
            content_id: (0, 0),
        })
    }

    /// Converts a document version string such as `"1.7"` into a number.
    fn parse_version(version: &str) -> Result<f32, String> {
        version
            .parse::<f32>()
            .map_err(|e| format!("Invalid PDF version {:?}: {}", version, e))
    }

    /// Returns the PDF version of the document.
    pub fn version(&self) -> f32 {
        self.version
    }

    /// Returns a copy of the extracted text.
    pub fn text(&self) -> String {
        self.text.clone()
    }

    /// Returns the extracted text split into lines, each trimmed of
    /// surrounding whitespace, with blank lines dropped.
    pub fn get_text_list(&self) -> Vec<&str> {
        self.text
            .lines()
            .map(str::trim)
            .filter(|line| !line.is_empty())
            .collect()
    }

    /// Creates a new in-memory PDF 1.7 document that will be saved to
    /// `filename`. Nothing is written to disk here; see [`Pdf::page_data`].
    ///
    /// A Courier Type1 font and a resources dictionary exposing it as `F1`
    /// are added, and an object id is reserved for the page tree.
    ///
    /// # Errors
    ///
    /// Returns a message if the document's version string cannot be parsed.
    pub fn write(filename: &str) -> Result<Self, String> {
        let mut doc = Document::with_version("1.7");
        let pages_id = doc.new_object_id();
        let font_id = doc.add_object(dictionary! {
            "Type" => "Font",
            "Subtype" => "Type1",
            "BaseFont" => "Courier",
        });
        let resources_id = doc.add_object(dictionary! {
            "Font" => dictionary! {
                "F1" => font_id,
            },
        });
        let version = Self::parse_version(&doc.version)?;
        Ok(Self {
            version,
            text: String::new(),
            document: doc,
            pages_id,
            font_id,
            resources_id,
            filename: filename.to_string(),
            content_id: (0, 0),
        })
    }

    /// Adds a single page containing the text "Hello World!", wires up the
    /// page tree and catalog, compresses the document and saves it to the
    /// stored filename. Consumes `self`.
    ///
    /// # Panics
    ///
    /// Panics if the content stream cannot be encoded or the document cannot
    /// be saved; the `()` return type leaves no way to report the failure.
    pub fn page_data(mut self) {
        let content = Content {
            operations: vec![
                Operation::new("BT", vec![]),
                Operation::new("Tf", vec!["F1".into(), 10.into()]),
                Operation::new("Td", vec![0.into(), 800.into()]),
                Operation::new("Tj", vec![Object::string_literal("Hello World!")]),
                Operation::new("ET", vec![]),
            ],
        };
        let encoded = content
            .encode()
            .expect("failed to encode page content stream");
        let content_id = self
            .document
            .add_object(Stream::new(dictionary! {}, encoded));
        self.content_id = content_id;

        let page_id = self.document.add_object(dictionary! {
            "Type" => "Page",
            "Parent" => self.pages_id,
            "Contents" => content_id,
        });
        let pages = dictionary! {
            "Type" => "Pages",
            "Kids" => vec![page_id.into()],
            "Count" => 1,
            "Resources" => self.resources_id,
            "MediaBox" => vec![0.into(), 0.into(), 595.into(), 842.into()],
        };
        // The page tree is stored under the id reserved when the document
        // was created, which the page above already references as its parent.
        self.document
            .objects
            .insert(self.pages_id, Object::Dictionary(pages));

        let catalog_id = self.document.add_object(dictionary! {
            "Type" => "Catalog",
            "Pages" => self.pages_id,
        });
        self.document.trailer.set("Root", catalog_id);
        self.document.compress();
        self.document
            .save(self.filename)
            .expect("failed to save PDF document");
    }
}