use crate::encoding::Encoding;
use crate::error::Result;
use crate::object::{Object, StringFormat};
use crate::parser::{is_token_end, is_ws, read_object, Cursor};
use std::collections::BTreeMap;
/// One content-stream instruction: an operator token plus the operands gathered for it.
///
/// `Content::decode` produces these by buffering every object it reads and moving the
/// buffered objects into the next operator token it encounters.
#[derive(Debug, Clone)]
pub struct Operation {
/// Text of the operator token (for example `Tj` or `Tf`).
pub operator: String,
/// Operands in the order they were read ahead of the operator.
pub operands: Vec<Object>,
}
impl Operation {
    /// Builds an operation from an operator name and its operand list.
    ///
    /// `operator` may be anything convertible into a `String`; `operands` is stored as given.
    pub fn new(operator: impl Into<String>, operands: Vec<Object>) -> Self {
        let operator: String = operator.into();
        Self { operator, operands }
    }
}
/// A decoded content stream: the flat sequence of operations in the order they were read.
#[derive(Debug, Clone, Default)]
pub struct Content {
/// Operations in stream order, as produced by `Content::decode`.
pub operations: Vec<Operation>,
}
impl Content {
    /// Tokenizes raw content-stream bytes into a flat list of [`Operation`]s.
    ///
    /// Objects are buffered as operands until an operator token is read, at which point the
    /// buffered operands are moved into a new operation. Inline images (`BI` ... `EI`) are
    /// skipped and produce no operation. Operands still buffered when the input ends are
    /// dropped.
    ///
    /// Decoding is best-effort: an object that fails to parse is retried as an operator
    /// token, and the current implementation never returns `Err`.
    pub fn decode(data: &[u8]) -> Result<Self> {
        let mut cursor = Cursor::at(data, 0);
        let mut operations: Vec<Operation> = Vec::with_capacity(256);
        let mut pending: Vec<Object> = Vec::with_capacity(8);

        loop {
            cursor.skip_ws_and_comments();
            if cursor.at_eof_local() {
                break;
            }
            let token_start = cursor.pos;

            // Classify the upcoming token; `true` means "read it as an operator".
            let read_as_operator = if can_start_operand(cursor.rest()) {
                match read_object(&mut cursor) {
                    Ok(object) => {
                        pending.push(object);
                        false
                    }
                    // Not a valid object after all: fall back to the operator path.
                    Err(_) => true,
                }
            } else if cursor.peek() == Some(b'B') && cursor.rest().starts_with(b"BI") {
                // Inline image data is opaque bytes and must not be tokenized.
                skip_inline_image(&mut cursor);
                false
            } else {
                true
            };

            if read_as_operator {
                let operator = read_operator(&mut cursor);
                if !operator.is_empty() {
                    // Hand the buffered operands to this operator and start a fresh buffer.
                    let operands = std::mem::take(&mut pending);
                    operations.push(Operation { operator, operands });
                }
            }

            // Guarantee forward progress on a byte that nothing above consumed.
            if cursor.pos == token_start {
                cursor.advance(1);
            }
        }

        Ok(Self { operations })
    }
}
/// Extracts the text shown by a content stream as a plain `String`.
///
/// The stream is decoded with `Content::decode`; if that fails an empty string is returned.
/// The operations are then walked in order:
///
/// * `Tf` looks up its first operand (a name) in `encodings` and makes the result the
///   current encoding; a name missing from the map clears the current encoding.
/// * `Tj` appends every string operand; `TJ` appends the string elements of its array
///   operand and ignores the other elements.
/// * `'` and `"` start a new line and then append their string (for `"` the third operand).
/// * `T*` and `ET` start a new line.
///
/// A new line is only started when the output does not already end with `'\n'`.
pub fn extract_text_from_stream(
    data: &[u8],
    encodings: &BTreeMap<Vec<u8>, Encoding>,
) -> String {
    /// Appends a line break unless the text already ends with one.
    fn break_line(text: &mut String) {
        if !text.ends_with('\n') {
            text.push('\n');
        }
    }

    let Ok(content) = Content::decode(data) else {
        return String::new();
    };

    let mut active: Option<&Encoding> = None;
    let mut text = String::with_capacity(data.len() / 4);

    for operation in &content.operations {
        let operands = operation.operands.as_slice();
        match operation.operator.as_str() {
            "Tf" => {
                let font_name = operands.first().and_then(|operand| operand.as_name().ok());
                if let Some(font_name) = font_name {
                    active = encodings.get(font_name);
                }
            }
            "Tj" => collect_show(&mut text, active, operands),
            "TJ" => {
                if let Some(Object::Array(items)) = operands.first() {
                    for item in items {
                        // Only the string elements carry text; everything else is skipped.
                        if matches!(item, Object::String(_, _)) {
                            collect_show(&mut text, active, std::slice::from_ref(item));
                        }
                    }
                }
            }
            "'" => {
                break_line(&mut text);
                collect_show(&mut text, active, operands);
            }
            "\"" => {
                break_line(&mut text);
                // The string to show is the third operand of this operator.
                if let Some(shown) = operands.get(2) {
                    collect_show(&mut text, active, std::slice::from_ref(shown));
                }
            }
            "T*" | "ET" => break_line(&mut text),
            _ => {}
        }
    }

    text
}
fn collect_show(out: &mut String, encoding: Option<&Encoding>, operands: &[Object]) {
for op in operands {
if let Object::String(bytes, fmt) = op {
let s = decode_one(encoding, bytes, fmt);
out.push_str(&s);
}
}
}
/// Converts the raw bytes of one shown string into text.
///
/// When an encoding is supplied and it converts the bytes successfully, that result is
/// returned. Otherwise (no encoding, or the conversion failed) each byte is mapped to the
/// Unicode code point of the same value. The string format is currently unused.
fn decode_one(encoding: Option<&Encoding>, bytes: &[u8], _fmt: &StringFormat) -> String {
    encoding
        .and_then(|enc| enc.bytes_to_string(bytes).ok())
        .unwrap_or_else(|| bytes.iter().copied().map(char::from).collect())
}
/// Reports whether the first byte of `rest` can begin an operand rather than an operator.
///
/// Returns `false` for empty input.
fn can_start_operand(rest: &[u8]) -> bool {
    match rest.first().copied() {
        // Digits, signs and a leading decimal point: the start of a number.
        Some(b'0'..=b'9' | b'+' | b'-' | b'.') => true,
        // Delimiters that open a name, string, hex string / dictionary, or array.
        Some(b'/' | b'(' | b'<' | b'[') => true,
        _ => false,
    }
}
/// Reads one operator token starting at the cursor and returns it as text.
///
/// Bytes are consumed until a token-ending byte, a whitespace byte, or the end of input is
/// reached; the terminating byte itself is left unread. The consumed bytes are converted
/// with lossy UTF-8 decoding. The result is empty when the cursor already sits on a
/// terminating byte or at the end of input.
fn read_operator(c: &mut Cursor<'_>) -> String {
    let begin = c.pos;
    loop {
        match c.peek() {
            Some(byte) if !is_token_end(byte) && !is_ws(byte) => {
                c.bump();
            }
            _ => break,
        }
    }
    let token = &c.buf[begin..c.pos];
    String::from_utf8_lossy(token).into_owned()
}
/// Advances the cursor past an inline image so its raw payload is never tokenized.
///
/// Expects the cursor to sit on the `BI` keyword. The function scans forward for `ID`,
/// consumes one following space / LF / CR if present, then scans for an `EI` that is
/// followed by whitespace or the end of input, leaving the cursor just after that `EI`.
///
/// If the image is never terminated (no `ID`, or no `EI` after it) the rest of the buffer
/// is consumed in full. Without this, the scan would stop one byte short of the end and
/// the final payload byte would be handed back to the tokenizer as a bogus token.
///
/// NOTE(review): `ID` and `EI` are matched as raw byte pairs with no check on the byte
/// before them, so payload bytes that happen to spell `EI` followed by whitespace still
/// end the image early — confirm whether a stricter boundary rule is wanted.
fn skip_inline_image(c: &mut Cursor<'_>) {
    // Step over the `BI` keyword itself.
    c.advance(2);

    // Phase 1: skip the image dictionary up to and including the `ID` keyword.
    while c.pos + 2 <= c.buf.len() {
        if c.rest().starts_with(b"ID") {
            c.advance(2);
            // A single separator byte may sit between `ID` and the payload.
            if matches!(c.peek(), Some(b' ') | Some(b'\n') | Some(b'\r')) {
                c.bump();
            }
            break;
        }
        c.bump();
    }

    // Phase 2: skip the payload up to and including the terminating `EI`.
    let mut terminated = false;
    while c.pos + 2 <= c.buf.len() {
        // `EI` only counts when followed by whitespace or the end of the buffer.
        let at_boundary = c.buf.get(c.pos + 2).map(|&b| is_ws(b)).unwrap_or(true);
        if c.rest().starts_with(b"EI") && at_boundary {
            c.advance(2);
            terminated = true;
            break;
        }
        c.bump();
    }

    // Unterminated image: swallow whatever is left so no payload byte leaks back out.
    if !terminated {
        let remaining = c.buf.len().saturating_sub(c.pos);
        c.advance(remaining);
    }
}
/// Private extension trait that adds an end-of-input check to `Cursor`.
trait CursorExt {
/// Returns `true` when there are no more bytes to read.
fn at_eof_local(&self) -> bool;
}
impl<'a> CursorExt for Cursor<'a> {
    /// Reports whether the read position has reached or passed the end of the buffer.
    fn at_eof_local(&self) -> bool {
        let end = self.buf.len();
        end <= self.pos
    }
}