use std::{fs, iter::Peekable, mem, path::PathBuf};
use argh::FromArgs;
use printpdf::{self, Op, PdfPage, TextItem};
/// The lines of text collected for one section of a page.
type Section = Vec<String>;
/// Every section of one page, in the order `parse_sections` collected them.
type Sections = Vec<Section>;
/// Groups the text written on `page` into sections of lines.
///
/// The page's operations are walked in order:
///
/// * `Op::SetTextMatrix` / `Op::SetTextCursor` close the section being built.
/// * `Op::WriteText` appends each text item to the pending line after
///   stripping leading and trailing `'\r'` / `'\n'` characters.
/// * `Op::AddLineBreak` ends the pending line (with a `'\n'` appended) unless
///   a text item written since the previous line break had such characters
///   stripped, in which case the pending line simply keeps growing.
/// * Every other operation is ignored.
///
/// Whatever is still pending after the last operation is closed as well.
/// Empty lines are never pushed by a close, and empty sections are never
/// returned.
fn parse_sections(page: &PdfPage) -> Sections {
    /// Moves a non-empty pending `line` into `section`, then moves a
    /// non-empty `section` into `finished`, leaving both empty.
    fn close_section(
        line: &mut String,
        section: &mut Vec<String>,
        finished: &mut Vec<Vec<String>>,
    ) {
        if !line.is_empty() {
            section.push(mem::take(line));
        }
        if !section.is_empty() {
            finished.push(mem::take(section));
        }
    }

    let mut finished: Vec<Vec<String>> = Vec::new();
    let mut section: Vec<String> = Vec::new();
    let mut line = String::new();
    // Cleared while the pending line contains text that carried its own
    // CR/LF; the next `AddLineBreak` is then swallowed instead of ending
    // the line.
    let mut break_ends_line = true;

    for op in page.ops.iter() {
        match op {
            Op::SetTextMatrix { .. } | Op::SetTextCursor { .. } => {
                close_section(&mut line, &mut section, &mut finished);
            }
            Op::WriteText { items, .. } => {
                for item in items {
                    if let TextItem::Text(raw) = item {
                        let stripped = raw.trim_matches(['\r', '\n']);
                        if stripped.len() != raw.len() {
                            break_ends_line = false;
                        }
                        line.push_str(stripped);
                    }
                }
            }
            Op::AddLineBreak => {
                if break_ends_line {
                    line.push('\n');
                    section.push(mem::take(&mut line));
                }
                // A swallowed break only applies once.
                break_ends_line = true;
            }
            _ => {}
        }
    }

    close_section(&mut line, &mut section, &mut finished);
    finished
}
/// The data extracted from a single page: which file the page belongs to and
/// the piece of that file's contents the page holds.
#[derive(Debug)]
struct PageData {
    /// File path taken from the page's first section.
    path: PathBuf,
    /// Always `None` at present; `parse_from_sections` never fills it in.
    custom_text: Option<String>,
    /// File contents taken from the page's last section.
    contents: String,
}

impl PageData {
    /// Concatenates the lines of the path section and trims the surrounding
    /// whitespace (including the `'\n'` that terminates a line).
    fn parse_path(path_section: &[String]) -> PathBuf {
        PathBuf::from(path_section.concat().trim())
    }

    /// Concatenates the lines of the contents section unchanged.
    fn parse_contents(contents_section: &[String]) -> String {
        contents_section.concat()
    }

    /// Builds a `PageData` from the sections of one page.
    ///
    /// Accepted layouts:
    ///
    /// * two sections: `[path, contents]`
    /// * three sections: `[path, <ignored>, contents]`
    ///
    /// # Errors
    ///
    /// Returns a message naming the actual section count when the page has
    /// any other number of sections. The offending sections are also dumped
    /// to stderr via `dbg!`.
    pub fn parse_from_sections(sections: Vec<Vec<String>>) -> Result<Self, String> {
        match sections.as_slice() {
            // NOTE(review): the middle section of the three-section layout is
            // presumably the custom text, but it has always been discarded
            // here and `custom_text` left as `None` — confirm before wiring
            // it up.
            [path_section, contents_section] | [path_section, _, contents_section] => Ok(Self {
                path: Self::parse_path(path_section),
                custom_text: None,
                contents: Self::parse_contents(contents_section),
            }),
            _ => {
                // Kept from the original: dump the sections to help diagnose
                // the malformed page.
                dbg!(&sections);
                Err(format!(
                    "Malformed Page. Sections length should be 2 or 3, but it was {}",
                    sections.len()
                ))
            }
        }
    }
}
/// One output file assembled by `next_file_data` from one or more
/// consecutive pages that share the same path.
#[derive(Debug)]
struct FileData {
    /// Path of the file, as parsed from the pages; `main` joins it onto the
    /// output directory.
    path: PathBuf,
    /// Contents of the first page followed by the contents of every merged
    /// follow-up page.
    contents: String,
}
/// Pulls the next file out of a stream of parsed pages.
///
/// The first remaining page decides the file's path and supplies the initial
/// contents. Every directly following page that parsed successfully and has
/// the same path is consumed as well; its contents are appended, each
/// followed by a `'\n'`.
///
/// Returns `None` once `pages` is exhausted, and `Some(Err(_))` when the
/// first remaining page failed to parse (that page is consumed). A failed
/// page, or a page with a different path, that follows the first one ends the
/// run and stays in `pages` for the next call.
fn next_file_data<I: Iterator<Item = Result<PageData, String>>>(
    pages: &mut Peekable<I>,
) -> Option<Result<FileData, String>> {
    let first = match pages.next()? {
        Ok(page) => page,
        Err(message) => return Some(Err(message)),
    };
    // Only the path and contents are carried over; `custom_text` is dropped.
    let path = first.path;
    let mut contents = first.contents;

    // `next_if` consumes a page only when it continues the current file.
    while let Some(Ok(continuation)) =
        pages.next_if(|candidate| matches!(candidate, Ok(page) if page.path == path))
    {
        contents.push_str(&continuation.contents);
        contents.push('\n');
    }

    Some(Ok(FileData { path, contents }))
}
// The doc comments below are not just documentation: argh uses them as the
// descriptions shown in `--help`, and its derive requires one on the type and
// on every `option` field.

/// Recreate files from a PDF whose pages each hold a file path followed by
/// that file's contents.
#[derive(FromArgs)]
struct Arguments {
    /// path of the PDF document to read
    #[argh(positional)]
    pdf_path: PathBuf,
    /// directory the recreated files are written under (default: ./generated)
    #[argh(option, default = "PathBuf::from(\"./generated\")")]
    out_dir: PathBuf,
}
fn main() {
let args: Arguments = argh::from_env();
let Arguments { pdf_path, out_dir } = args;
let pdf_bytes = fs::read(pdf_path).unwrap();
let doc = printpdf::PdfDocument::parse(&pdf_bytes, &Default::default(), &mut vec![]).unwrap();
let mut pages_iterator = doc
.pages
.iter()
.map(|p| PageData::parse_from_sections(parse_sections(p)))
.peekable();
while let Some(file_data_result) = next_file_data(&mut pages_iterator) {
let file_data = match file_data_result {
Ok(data) => data,
Err(err) => {
eprintln!("{}", err);
continue;
}
};
let base = &out_dir;
let file_path = base.join(file_data.path);
fs::create_dir_all(file_path.parent().unwrap()).unwrap();
fs::write(file_path, file_data.contents).unwrap();
}
}