// dongler-core 0.3.0

use std::collections::HashMap;
use std::io::Read;

use flate2::read::ZlibDecoder;
use rayon::prelude::*;
use sha2::{Digest, Sha256};

use crate::engine::ExtractionEngine;
use crate::error::{DonglerError, Result};
use crate::ir::{
    Asset, BBox, Block, Confidence, Document, ImageObject, Line, Metadata, Page, SourceAnchor,
    Span, TableBlock, TableCell, TextBlock, Warning, SCHEMA_VERSION,
};
use crate::source::Source;

/// Stateless PDF extraction engine.
///
/// Its `ExtractionEngine` implementation reports the name `"pdf-native"` and forwards the
/// source bytes to [`extract_pdf`].
#[derive(Debug, Default, Clone, Copy)]
pub struct PdfEngine;

impl ExtractionEngine for PdfEngine {
    /// Identifier recorded in the document metadata for this engine.
    fn name(&self) -> &'static str {
        "pdf-native"
    }

    /// Runs the native PDF extractor over `source`.
    ///
    /// The raw `bytes` payload is used when present; otherwise the bytes of the textual
    /// `content` are parsed instead.
    fn extract(&self, source: &Source) -> Result<Document> {
        let payload: &[u8] = match source.bytes.as_deref() {
            Some(raw) => raw,
            None => source.content.as_bytes(),
        };
        extract_pdf(payload, source, self.name())
    }
}

/// One indirect object collected from the PDF file.
#[derive(Debug, Clone)]
struct PdfObject {
    /// Object number; used as the key of the object lookup map.
    object_number: u32,
    /// Generation number; combined with the object number to format `"N G R"` ids.
    generation: u16,
    /// Raw bytes of the object body.
    body: Vec<u8>,
}

/// A page object found during scanning, before its content is interpreted.
#[derive(Debug, Clone)]
struct PageSeed {
    /// Page number; `extract_pdf` overwrites it with the 1-based discovery index.
    number: usize,
    /// Page dictionary text, searched for keys such as `/MediaBox`, `/Rotate` and `/Contents`.
    body: String,
}

/// Result of extracting a single page.
#[derive(Debug, Clone)]
struct PageExtraction {
    /// The structured page (blocks, images, assets, warnings).
    page: Page,
    /// Plain text of the page: the non-empty block texts joined with newlines.
    text: String,
}

/// A piece of text emitted by a single text-showing operator.
#[derive(Debug, Clone)]
struct TextRun {
    /// Decoded text of the run.
    text: String,
    /// Estimated bounding box (see `push_text_run` for how the width is approximated).
    bbox: BBox,
    /// Resource name of the font selected when the run was shown, if any.
    font: Option<String>,
    /// Font size in effect when the run was shown.
    size: f32,
    /// Ids (`"N G R"`) of the content stream objects the run came from.
    source_object_ids: Vec<String>,
}

/// A group of text runs treated as one visual line.
#[derive(Debug, Clone)]
struct TextLine {
    /// Runs belonging to the line.
    runs: Vec<TextRun>,
    /// Bounding box of the line (the union of the run boxes when built by `text_line_from_runs`).
    bbox: BBox,
}

/// A table recognised among the lines of a page.
#[derive(Debug, Clone)]
struct DetectedTable {
    /// The table block to emit.
    table: TableBlock,
    /// Indices (into the page's line list) of the lines absorbed by the table; `build_blocks`
    /// skips these lines and emits the table once in their place.
    line_indices: Vec<usize>,
}

/// Lines of a page partitioned for multi-column reading order.
///
/// `order_column_layout` emits `leading`, then each column (left to right), then `trailing`.
#[derive(Debug, Clone)]
struct ColumnLayout<'a> {
    /// Lines read before the columns.
    leading: Vec<&'a TextLine>,
    /// One list of lines per detected column.
    columns: Vec<Vec<&'a TextLine>>,
    /// Lines read after the columns.
    trailing: Vec<&'a TextLine>,
}

/// Everything collected while interpreting content streams.
#[derive(Debug, Clone)]
struct ContentExtraction {
    /// Text runs in the order their operators were encountered.
    text_runs: Vec<TextRun>,
    /// Image XObjects painted with `Do`.
    images: Vec<ImageObject>,
    /// Asset records mirroring `images`.
    assets: Vec<Asset>,
    /// Warnings raised during interpretation.
    warnings: Vec<Warning>,
}

/// Per-font table used to turn string operand bytes into text.
// NOTE(review): the decoding logic is outside this view; the field notes below are inferred
// from the names and types only — confirm against `load_font_decoders`.
#[derive(Debug, Clone, Default)]
struct FontDecoder {
    /// Mapping from a character code (byte sequence) to its text; presumably from a CMap.
    cmap: HashMap<Vec<u8>, String>,
    /// Presumably the longest code length present in `cmap`, in bytes — TODO confirm.
    max_code_len: usize,
}

/// A single operand preceding an operator in a content stream.
#[derive(Debug, Clone)]
enum Operand {
    /// Numeric operand (e.g. matrix coefficients, font size).
    Number(f32),
    /// Name operand (e.g. the font resource for `Tf`, the XObject name for `Do`).
    Name(String),
    /// String bytes; presumably from a literal `( … )` string — confirm against the parser.
    Literal(Vec<u8>),
    /// String bytes; presumably from a hex `< … >` string — confirm against the parser.
    Hex(Vec<u8>),
    /// Array operand (e.g. the argument of `TJ`).
    Array(Vec<Operand>),
    /// Any operand this module does not interpret.
    Other,
}

/// One content stream operation: an operator together with the operands that preceded it.
#[derive(Debug, Clone)]
struct ContentOp {
    /// Operands, in the order they appear in the stream.
    operands: Vec<Operand>,
    /// Operator keyword such as `"Tj"`, `"cm"` or `"Do"`.
    operator: String,
}

/// Subset of the PDF graphics/text state tracked while interpreting a content stream.
///
/// The whole struct is saved by `q` and restored by `Q`.
#[derive(Debug, Clone)]
struct GraphicsState {
    /// Current transformation matrix, updated by `cm`.
    ctm: Matrix,
    /// X coordinate where the next text run will be placed.
    text_x: f32,
    /// Y coordinate where the next text run will be placed.
    text_y: f32,
    /// X coordinate of the start of the current text line.
    line_x: f32,
    /// Y coordinate of the start of the current text line.
    line_y: f32,
    /// Font resource name selected by the last `Tf`, if any.
    font_name: Option<String>,
    /// Font size selected by the last `Tf`.
    font_size: f32,
    /// Distance subtracted from `line_y` when moving to the next line (`T*`, `'`, `"`).
    leading: f32,
}

impl Default for GraphicsState {
    /// Initial state: identity CTM, text and line positions at the origin, no font selected,
    /// and a font size and leading of 12.
    fn default() -> Self {
        const INITIAL_FONT_SIZE: f32 = 12.0;
        const ORIGIN: f32 = 0.0;

        Self {
            ctm: Matrix::identity(),
            text_x: ORIGIN,
            text_y: ORIGIN,
            line_x: ORIGIN,
            line_y: ORIGIN,
            font_name: None,
            font_size: INITIAL_FONT_SIZE,
            leading: INITIAL_FONT_SIZE,
        }
    }
}

/// 2D affine transform stored as the six PDF matrix coefficients `[a b c d e f]`.
///
/// A point `(x, y)` maps to `(a*x + c*y + e, b*x + d*y + f)`.
#[derive(Debug, Clone, Copy)]
struct Matrix {
    /// Contribution of `x` to the transformed `x`.
    a: f32,
    /// Contribution of `x` to the transformed `y`.
    b: f32,
    /// Contribution of `y` to the transformed `x`.
    c: f32,
    /// Contribution of `y` to the transformed `y`.
    d: f32,
    /// Horizontal translation.
    e: f32,
    /// Vertical translation.
    f: f32,
}

impl Matrix {
    /// The identity transform.
    fn identity() -> Self {
        Self {
            a: 1.0,
            b: 0.0,
            c: 0.0,
            d: 1.0,
            e: 0.0,
            f: 0.0,
        }
    }

    /// Returns the product `self × other` in the PDF row-vector convention, i.e. the transform
    /// that applies `self` first and `other` second.
    fn multiply(self, other: Self) -> Self {
        Self {
            a: self.a * other.a + self.b * other.c,
            b: self.a * other.b + self.b * other.d,
            c: self.c * other.a + self.d * other.c,
            d: self.c * other.b + self.d * other.d,
            e: self.e * other.a + self.f * other.c + other.e,
            f: self.e * other.b + self.f * other.d + other.f,
        }
    }

    /// Transforms the point `(x, y)`.
    fn point(self, x: f32, y: f32) -> (f32, f32) {
        (
            self.a * x + self.c * y + self.e,
            self.b * x + self.d * y + self.f,
        )
    }

    /// Axis-aligned bounding box of the unit square `[0, 1] × [0, 1]` after transformation.
    ///
    /// Images are painted into the unit square, so this is the area an image covers under
    /// this matrix. Negative scales (flipped images) and rotations are handled by taking the
    /// extremes of the four transformed corners instead of assuming `(e, f)` is the lower-left
    /// corner.
    fn bbox(self) -> BBox {
        // Corner offsets relative to the translation (e, f). Working with offsets keeps the
        // result exact for the common axis-aligned, positive-scale case.
        let diagonal_x = self.a + self.c;
        let diagonal_y = self.b + self.d;
        let min_dx = 0.0_f32.min(self.a).min(self.c).min(diagonal_x);
        let max_dx = 0.0_f32.max(self.a).max(self.c).max(diagonal_x);
        let min_dy = 0.0_f32.min(self.b).min(self.d).min(diagonal_y);
        let max_dy = 0.0_f32.max(self.b).max(self.d).max(diagonal_y);

        BBox {
            x: self.e + min_dx,
            y: self.f + min_dy,
            width: max_dx - min_dx,
            height: max_dy - min_dy,
        }
    }
}

/// Parses `bytes` as a PDF and builds the intermediate [`Document`].
///
/// Indirect objects are scanned (and object streams expanded), every page object becomes a
/// page numbered by discovery order starting at 1, and the pages are extracted in parallel.
/// `source.path` and `engine_name` are recorded in the metadata.
///
/// # Errors
///
/// Returns a PDF error when the data does not start with `%PDF-`, when no indirect objects
/// are found, or when no page objects are found.
pub fn extract_pdf(bytes: &[u8], source: &Source, engine_name: &str) -> Result<Document> {
    if !bytes.starts_with(b"%PDF-") {
        return Err(DonglerError::pdf("missing %PDF header"));
    }

    let mut objects = parse_indirect_objects(bytes);
    expand_object_streams(&mut objects);
    if objects.is_empty() {
        return Err(DonglerError::pdf("no indirect objects found"));
    }

    let object_map = objects
        .iter()
        .map(|object| (object.object_number, object.clone()))
        .collect::<HashMap<_, _>>();
    let page_seeds = objects
        .iter()
        .filter_map(|object| page_seed(object, &object_map))
        .enumerate()
        .map(|(index, mut seed)| {
            // Pages are numbered 1-based in the order their objects were discovered.
            seed.number = index + 1;
            seed
        })
        .collect::<Vec<_>>();

    if page_seeds.is_empty() {
        return Err(DonglerError::pdf("no page objects found"));
    }

    // Scan the file for `/Encrypt` once; the result feeds both the warning and the metadata.
    let encrypted = contains_name(bytes, b"/Encrypt");

    let mut document_warnings = Vec::new();
    if encrypted {
        document_warnings.push(warning(
            "pdf.encrypted",
            "warning",
            "document declares encryption; extraction may be incomplete",
            None,
        ));
    }
    if contains_name(bytes, b"/ObjStm") {
        document_warnings.push(warning(
            "pdf.object_stream",
            "info",
            "object streams detected and expanded by the native scanner",
            None,
        ));
    }

    let page_extractions = page_seeds
        .par_iter()
        .map(|seed| extract_page(seed, &object_map))
        .collect::<Vec<_>>();

    let mut pages = Vec::with_capacity(page_extractions.len());
    let mut all_text = String::new();
    let mut assets = Vec::new();

    for extraction in page_extractions {
        // Each page's text is followed by a newline; the counts below include it.
        all_text.push_str(&extraction.text);
        all_text.push('\n');
        // Assets are listed both on their page and at document level.
        assets.extend(extraction.page.assets.iter().cloned());
        pages.push(extraction.page);
    }

    Ok(Document {
        schema_version: SCHEMA_VERSION.to_owned(),
        metadata: Metadata {
            format: "pdf".to_owned(),
            engine: engine_name.to_owned(),
            source: source.path.clone(),
            title: extract_info_string(&objects, "Title"),
            character_count: all_text.chars().count(),
            word_count: all_text.split_whitespace().count(),
            block_count: pages.iter().map(|page| page.blocks.len()).sum(),
            file_size_bytes: Some(bytes.len() as u64),
            pdf_version: pdf_version(bytes),
            encrypted,
        },
        pages,
        assets,
        warnings: document_warnings,
    })
}

/// Extracts one page: geometry, text blocks, images and warnings.
///
/// The page size comes from `/MediaBox` (falling back to 612 × 792), rotation from
/// `/Rotate`. Every stream referenced by `/Contents` is decoded and interpreted; a missing
/// or undecodable stream produces a page warning instead of failing the page.
fn extract_page(seed: &PageSeed, object_map: &HashMap<u32, PdfObject>) -> PageExtraction {
    let media_box = parse_number_array_after(&seed.body, "/MediaBox")
        .unwrap_or_else(|| vec![0.0, 0.0, 612.0, 792.0]);
    let origin_x = media_box.first().copied().unwrap_or(0.0);
    let origin_y = media_box.get(1).copied().unwrap_or(0.0);
    let width = media_box.get(2).copied().unwrap_or(612.0) - origin_x;
    let height = media_box.get(3).copied().unwrap_or(792.0) - origin_y;
    let rotation = parse_number_after(&seed.body, "/Rotate").map(|value| value as i32);
    let contents = parse_refs_after_key(&seed.body, "/Contents");
    // Resources may live in a separate object; fall back to the page dictionary itself.
    let resource_body = resolve_resource_body(&seed.body, object_map);
    let resource_text = resource_body.as_deref().unwrap_or(&seed.body);
    let xobjects = resolve_named_resource_refs(resource_text, "/XObject", object_map);
    let fonts = load_font_decoders(resource_text, object_map);

    let mut warnings = Vec::new();
    let mut extraction = ContentExtraction {
        text_runs: Vec::new(),
        images: Vec::new(),
        assets: Vec::new(),
        warnings: Vec::new(),
    };

    for content_ref in contents {
        let content_object = object_map.get(&(content_ref as u32));
        match content_object.map(decode_stream_object) {
            Some(Ok(Some(stream))) => {
                // Report the object's real generation rather than assuming 0, matching how
                // image object ids are formatted.
                let object_id = content_object.map_or_else(
                    || format!("{content_ref} 0 R"),
                    |object| format!("{} {} R", object.object_number, object.generation),
                );
                let mut content = interpret_content_stream(
                    &stream,
                    seed.number,
                    &[object_id],
                    &xobjects,
                    &fonts,
                    object_map,
                );
                extraction.text_runs.append(&mut content.text_runs);
                extraction.images.append(&mut content.images);
                extraction.assets.append(&mut content.assets);
                extraction.warnings.append(&mut content.warnings);
            }
            Some(Ok(None)) | None => warnings.push(warning(
                "pdf.missing_content",
                "warning",
                "page content stream is missing",
                Some(seed.number),
            )),
            Some(Err(error)) => warnings.push(warning(
                "pdf.stream_decode",
                "warning",
                &error.to_string(),
                Some(seed.number),
            )),
        }
    }

    // Stream-level warnings come first, followed by those raised during interpretation.
    warnings.append(&mut extraction.warnings);
    let lines = group_text_runs(extraction.text_runs);
    let blocks = build_blocks(seed.number, &lines);
    let text = blocks
        .iter()
        .map(block_text)
        .filter(|text| !text.is_empty())
        .collect::<Vec<_>>()
        .join("\n");

    let page = Page {
        number: seed.number,
        width: Some(width),
        height: Some(height),
        rotation,
        bbox: Some(BBox {
            x: origin_x,
            y: origin_y,
            width,
            height,
        }),
        blocks,
        images: extraction.images,
        assets: extraction.assets,
        warnings,
    };

    PageExtraction { page, text }
}

fn interpret_content_stream(
    bytes: &[u8],
    page_number: usize,
    source_object_ids: &[String],
    xobjects: &HashMap<String, u32>,
    fonts: &HashMap<String, FontDecoder>,
    object_map: &HashMap<u32, PdfObject>,
) -> ContentExtraction {
    let mut state = GraphicsState::default();
    let mut graphics_stack = Vec::new();
    let mut extraction = ContentExtraction {
        text_runs: Vec::new(),
        images: Vec::new(),
        assets: Vec::new(),
        warnings: Vec::new(),
    };

    for op in parse_content_ops(bytes) {
        match op.operator.as_str() {
            "q" => graphics_stack.push(state.clone()),
            "Q" => {
                if let Some(previous) = graphics_stack.pop() {
                    state = previous;
                }
            }
            "cm" => {
                if let Some(values) = numbers(&op.operands, 6) {
                    state.ctm = state.ctm.multiply(Matrix {
                        a: values[0],
                        b: values[1],
                        c: values[2],
                        d: values[3],
                        e: values[4],
                        f: values[5],
                    });
                }
            }
            "BT" => {
                state.text_x = 0.0;
                state.text_y = 0.0;
                state.line_x = 0.0;
                state.line_y = 0.0;
            }
            "Tf" => {
                if let [Operand::Name(name), Operand::Number(size)] = op.operands.as_slice() {
                    state.font_name = Some(name.clone());
                    state.font_size = *size;
                    state.leading = *size * 1.2;
                }
            }
            "Td" | "TD" => {
                if let Some(values) = numbers(&op.operands, 2) {
                    state.line_x += values[0];
                    state.line_y += values[1];
                    state.text_x = state.line_x;
                    state.text_y = state.line_y;
                    if op.operator == "TD" {
                        state.leading = -values[1];
                    }
                }
            }
            "Tm" => {
                if let Some(values) = numbers(&op.operands, 6) {
                    state.line_x = values[4];
                    state.line_y = values[5];
                    state.text_x = values[4];
                    state.text_y = values[5];
                }
            }
            "T*" => {
                state.line_y -= state.leading;
                state.text_x = state.line_x;
                state.text_y = state.line_y;
            }
            "Tj" => {
                if let Some(text) = first_text_operand(&op.operands, &state, fonts) {
                    push_text_run(&mut extraction, &mut state, source_object_ids, text);
                }
            }
            "TJ" => {
                if let Some(Operand::Array(items)) = op.operands.first() {
                    let text = text_from_array(items, &state, fonts);
                    push_text_run(&mut extraction, &mut state, source_object_ids, text);
                }
            }
            "'" => {
                state.line_y -= state.leading;
                state.text_x = state.line_x;
                state.text_y = state.line_y;
                if let Some(text) = first_text_operand(&op.operands, &state, fonts) {
                    push_text_run(&mut extraction, &mut state, source_object_ids, text);
                }
            }
            "\"" => {
                state.line_y -= state.leading;
                state.text_x = state.line_x;
                state.text_y = state.line_y;
                if let Some(text) = op
                    .operands
                    .last()
                    .and_then(|operand| operand_text(operand, &state, fonts))
                {
                    push_text_run(&mut extraction, &mut state, source_object_ids, text);
                }
            }
            "Do" => {
                if let Some(Operand::Name(name)) = op.operands.first() {
                    if let Some(object_number) = xobjects.get(name) {
                        if let Some(object) = object_map.get(object_number) {
                            let object_body = lossy(&object.body);
                            if object_body.contains("/Subtype /Image") {
                                let bbox = state.ctm.bbox();
                                let id = format!("image-{}-{name}", page_number);
                                let object_id = Some(format!(
                                    "{} {} R",
                                    object.object_number, object.generation
                                ));
                                let width = parse_number_after(&object_body, "/Width")
                                    .map(|value| value as u32);
                                let height = parse_number_after(&object_body, "/Height")
                                    .map(|value| value as u32);

                                extraction.images.push(ImageObject {
                                    id: id.clone(),
                                    object_id: object_id.clone(),
                                    bbox: Some(bbox),
                                    width,
                                    height,
                                });
                                extraction.assets.push(Asset {
                                    id,
                                    kind: "image".to_owned(),
                                    object_id,
                                    bbox: Some(bbox),
                                    width,
                                    height,
                                });
                            }
                        }
                    }
                }
            }
            _ => {}
        }
    }

    extraction
}

/// Records `text` as a run at the current text position and advances the position.
///
/// Whitespace-only text is dropped without moving the position. The run width is an
/// estimate: half the font size per character, but never less than a quarter of the font
/// size.
fn push_text_run(
    extraction: &mut ContentExtraction,
    state: &mut GraphicsState,
    source_object_ids: &[String],
    text: String,
) {
    if text.trim().is_empty() {
        return;
    }

    let font_size = state.font_size;
    let character_count = text.chars().count() as f32;
    let estimated_width = (character_count * font_size * 0.5).max(font_size * 0.25);
    let (origin_x, origin_y) = state.ctm.point(state.text_x, state.text_y);

    let run = TextRun {
        text,
        bbox: BBox {
            x: origin_x,
            y: origin_y,
            width: estimated_width,
            height: font_size,
        },
        font: state.font_name.clone(),
        size: font_size,
        source_object_ids: source_object_ids.to_vec(),
    };
    extraction.text_runs.push(run);

    // The next run on the same line starts where this one is estimated to end.
    state.text_x += estimated_width;
}

fn build_blocks(page_number: usize, lines: &[TextLine]) -> Vec<Block> {
    if let Some(detected_table) = detect_table(page_number, lines) {
        let mut blocks = Vec::new();
        let mut table_inserted = false;
        for (line_index, line) in lines.iter().enumerate() {
            if detected_table.line_indices.contains(&line_index) {
                if !table_inserted {
                    blocks.push(Block::Table(detected_table.table.clone()));
                    table_inserted = true;
                }
            } else if let Some(block) = text_line_block(page_number, line) {
                blocks.push(block);
            }
        }
        return blocks;
    }

    let split_lines = split_wide_text_lines(lines);
    let text_blocks = text_lines_in_reading_order(&split_lines)
        .into_iter()
        .filter_map(|line| text_block_from_line(page_number, line))
        .collect::<Vec<_>>();
    merge_wrapped_text_blocks(text_blocks)
        .into_iter()
        .map(Block::Text)
        .collect()
}

/// Splits every line that contains a column-sized gap into a left and a right line.
///
/// Lines without such a gap are copied unchanged; the relative order of lines is kept.
fn split_wide_text_lines(lines: &[TextLine]) -> Vec<TextLine> {
    // The tight column-band rule is only enabled when at least two lines show evidence of it.
    let tight_band_enabled = has_repeated_tight_column_band_evidence(lines);
    let mut result = Vec::new();
    for line in lines {
        if let Some((left, right)) = split_text_line_at_wide_gap(line, tight_band_enabled) {
            result.push(left);
            result.push(right);
        } else {
            result.push(line.clone());
        }
    }
    result
}

/// Splits `line` into the runs left and right of its most column-like gap.
///
/// Returns `None` when the line has fewer than two runs, contains math notation, has no
/// qualifying gap, or when either side would end up empty. When `enable_tight_column_band`
/// is set, the right-column band rule is tried before the largest-gap rule.
fn split_text_line_at_wide_gap(
    line: &TextLine,
    enable_tight_column_band: bool,
) -> Option<(TextLine, TextLine)> {
    if line.runs.len() < 2 {
        return None;
    }

    let mut sorted_runs = line.runs.clone();
    sorted_runs.sort_by(|a, b| a.bbox.x.total_cmp(&b.bbox.x));

    let contains_math = sorted_runs
        .iter()
        .any(|run| looks_like_pdf_math_notation(&normalize_pdf_token(&run.text)));
    if contains_math {
        return None;
    }

    let band_split = if enable_tight_column_band {
        right_column_band_split_index(&sorted_runs)
    } else {
        None
    };
    let split_index = match band_split {
        Some(index) => index,
        None => largest_run_gap(&sorted_runs)?.0,
    };

    let (left_runs, right_runs) = sorted_runs.split_at(split_index);
    if left_runs.is_empty() || right_runs.is_empty() {
        return None;
    }

    let left_line = text_line_from_runs(left_runs.to_vec())?;
    let right_line = text_line_from_runs(right_runs.to_vec())?;
    Some((left_line, right_line))
}

/// Returns `true` when at least two lines would be split by the right-column band rule.
///
/// Scanning stops as soon as the second such line is found.
fn has_repeated_tight_column_band_evidence(lines: &[TextLine]) -> bool {
    let mut matching_lines = 0_usize;
    for line in lines {
        let mut runs_by_x = line.runs.clone();
        runs_by_x.sort_by(|a, b| a.bbox.x.total_cmp(&b.bbox.x));
        if right_column_band_split_index(&runs_by_x).is_some() {
            matching_lines += 1;
            if matching_lines == 2 {
                return true;
            }
        }
    }
    false
}

/// Finds the first run (in the given order) that starts a right-hand column inside the
/// x band 300–340.
///
/// `runs` are expected to be sorted by x. No split is reported when there are fewer than
/// four runs, when the first run starts right of x = 120, or when any run looks like math
/// notation. A candidate must leave at least two runs on each side, must not overlap the
/// previous run by more than 35 units, and the right side must hold at least 18 bytes of
/// trimmed text.
fn right_column_band_split_index(runs: &[TextRun]) -> Option<usize> {
    if runs.len() < 4 || runs.first()?.bbox.x > 120.0 {
        return None;
    }
    let contains_math = runs
        .iter()
        .any(|run| looks_like_pdf_math_notation(&normalize_pdf_token(&run.text)));
    if contains_math {
        return None;
    }

    (1..runs.len()).find(|&index| {
        let (left_part, right_part) = runs.split_at(index);

        let right_x = right_part[0].bbox.x;
        if !(300.0..=340.0).contains(&right_x) {
            return false;
        }
        if left_part.len() < 2 || right_part.len() < 2 {
            return false;
        }

        let previous = &left_part[index - 1].bbox;
        let gap = right_x - (previous.x + previous.width);
        if gap < -35.0 {
            return false;
        }

        let right_text_len: usize = right_part.iter().map(|run| run.text.trim().len()).sum();
        if right_text_len < 18 {
            return false;
        }

        true
    })
}

/// Among neighbouring runs whose gap looks like a column split, returns the widest one as
/// `(index of the right-hand run, gap, x jump)`.
///
/// Candidates are ranked by the larger of their gap and x jump; on ties the later pair wins.
fn largest_run_gap(runs: &[TextRun]) -> Option<(usize, f32, f32)> {
    let rank = |candidate: &(usize, f32, f32)| candidate.1.max(candidate.2);

    runs.windows(2)
        .enumerate()
        .filter_map(|(index, pair)| {
            let (left_box, right_box) = (&pair[0].bbox, &pair[1].bbox);
            let gap = right_box.x - (left_box.x + left_box.width);
            let x_jump = right_box.x - left_box.x;
            if is_likely_column_split_gap(left_box, right_box, gap, x_jump) {
                Some((index + 1, gap, x_jump))
            } else {
                None
            }
        })
        .max_by(|first, second| rank(first).total_cmp(&rank(second)))
}

/// Decides whether the space between two boxes looks like a column boundary.
///
/// Either the empty gap is at least 18 units, or the x positions jump by at least 110
/// units across x = 280 (left box starts before it, right box after it).
fn is_likely_column_split_gap(left: &BBox, right: &BBox, gap: f32, x_jump: f32) -> bool {
    let wide_gap = gap >= 18.0;
    let crosses_page_middle = left.x < 280.0 && right.x > 280.0;
    wide_gap || (x_jump >= 110.0 && crosses_page_middle)
}

/// Builds a line from `runs`, using the union of the run boxes as the line box.
///
/// Returns `None` when `union_boxes` yields no box for the runs.
fn text_line_from_runs(runs: Vec<TextRun>) -> Option<TextLine> {
    let line_box = union_boxes(runs.iter().map(|run| run.bbox));
    line_box.map(|bbox| TextLine { runs, bbox })
}

/// Orders lines for reading.
///
/// A paired two-column layout is preferred; failing that, a plain two-column split is
/// read column by column (left to right, each column top-down); otherwise the input order
/// is kept.
fn text_lines_in_reading_order(lines: &[TextLine]) -> Vec<&TextLine> {
    if let Some(layout) = detect_paired_text_columns(lines) {
        return order_column_layout(layout);
    }

    let Some(mut columns) = detect_text_columns(lines) else {
        return lines.iter().collect();
    };

    columns.sort_by(|first, second| column_x(first).total_cmp(&column_x(second)));
    let mut ordered = Vec::new();
    for mut column in columns {
        // Higher y first, ties broken by smaller x.
        column.sort_by(|first, second| {
            second
                .bbox
                .y
                .total_cmp(&first.bbox.y)
                .then(first.bbox.x.total_cmp(&second.bbox.x))
        });
        ordered.extend(column);
    }
    ordered
}

fn order_column_layout(mut layout: ColumnLayout<'_>) -> Vec<&TextLine> {
    let mut ordered = Vec::new();
    sort_lines_top_down(&mut layout.leading);
    ordered.extend(layout.leading);
    layout
        .columns
        .sort_by(|left, right| column_x(left).total_cmp(&column_x(right)));
    for mut column in layout.columns {
        sort_lines_top_down(&mut column);
        ordered.extend(column);
    }
    sort_lines_top_down(&mut layout.trailing);
    ordered.extend(layout.trailing);
    ordered
}

/// Sorts lines by descending y (top of the page first); lines with equal y go left to right.
fn sort_lines_top_down(lines: &mut [&TextLine]) {
    lines.sort_by(|first, second| {
        let by_height = second.bbox.y.total_cmp(&first.bbox.y);
        by_height.then_with(|| first.bbox.x.total_cmp(&second.bbox.x))
    });
}

/// Detects a two-column layout from pairs of lines that sit side by side.
///
/// Two lines form a pair when they share (approximately) the same y and the space between
/// them passes `is_likely_column_split_gap`. At least two distinct left lines and two distinct
/// right lines are required, and the average x of the two groups must differ by at least 90.
/// All lines are then classified as leading, trailing, left column or right column.
///
/// Returns `None` when there are fewer than four lines, when the pairing evidence is too
/// weak, or when either resulting column has fewer than two lines.
fn detect_paired_text_columns(lines: &[TextLine]) -> Option<ColumnLayout<'_>> {
    if lines.len() < 4 {
        return None;
    }

    // Collect every (left, right) pair of lines on the same baseline separated by a
    // column-like gap. An index may be pushed several times; duplicates are removed below.
    let mut left_seed_indices = Vec::new();
    let mut right_seed_indices = Vec::new();
    for (left_index, left) in lines.iter().enumerate() {
        for (right_index, right) in lines.iter().enumerate() {
            if left_index == right_index || left.bbox.x >= right.bbox.x {
                continue;
            }
            if (left.bbox.y - right.bbox.y).abs() > column_pair_y_tolerance(left, right) {
                continue;
            }
            let gap = right.bbox.x - (left.bbox.x + left.bbox.width);
            let x_jump = right.bbox.x - left.bbox.x;
            if !is_likely_column_split_gap(&left.bbox, &right.bbox, gap, x_jump) {
                continue;
            }
            left_seed_indices.push(left_index);
            right_seed_indices.push(right_index);
        }
    }
    dedupe_indices(&mut left_seed_indices);
    dedupe_indices(&mut right_seed_indices);
    if left_seed_indices.len() < 2 || right_seed_indices.len() < 2 {
        return None;
    }

    // The two groups must be clearly separated horizontally.
    let left_x = average_x(lines, &left_seed_indices)?;
    let right_x = average_x(lines, &right_seed_indices)?;
    if right_x - left_x < 90.0 {
        return None;
    }
    // Vertical extent (by line y) covered by the paired seed lines.
    let column_min_y = left_seed_indices
        .iter()
        .chain(&right_seed_indices)
        .map(|index| lines[*index].bbox.y)
        .reduce(f32::min)?;
    let column_max_y = left_seed_indices
        .iter()
        .chain(&right_seed_indices)
        .map(|index| lines[*index].bbox.y)
        .reduce(f32::max)?;
    let abstract_y = abstract_heading_y(lines);
    // Lines starting left of the midpoint between the two column averages go to the left column.
    let midpoint = (left_x + right_x) / 2.0;
    let mut leading = Vec::new();
    let mut trailing = Vec::new();
    let mut left_column = Vec::new();
    let mut right_column = Vec::new();

    for line in lines {
        // Leading: front matter above an "abstract" heading, or anything more than one line
        // height above the topmost seed line.
        if is_likely_front_matter_line(line, abstract_y)
            || line.bbox.y > column_max_y + line.bbox.height
        {
            leading.push(line);
        // Trailing: page numbers or footnote-sized text well below the lowest seed line.
        } else if line.bbox.y < column_min_y - line.bbox.height * 1.8
            && (is_likely_page_number_line(line) || is_likely_bottom_footnote_line(line))
        {
            trailing.push(line);
        } else if line.bbox.x < midpoint {
            left_column.push(line);
        } else {
            right_column.push(line);
        }
    }

    if left_column.len() < 2 || right_column.len() < 2 {
        return None;
    }

    Some(ColumnLayout {
        leading,
        columns: vec![left_column, right_column],
        trailing,
    })
}

/// Maximum y difference for two lines to count as sharing a baseline: 45% of the taller
/// line's height.
fn column_pair_y_tolerance(left: &TextLine, right: &TextLine) -> f32 {
    let taller = f32::max(left.bbox.height, right.bbox.height);
    taller * 0.45
}

/// Returns the y position of the first line whose text is exactly "abstract"
/// (ASCII case-insensitive), if there is one.
fn abstract_heading_y(lines: &[TextLine]) -> Option<f32> {
    for line in lines {
        if text_line_plain_text(line).eq_ignore_ascii_case("abstract") {
            return Some(line.bbox.y);
        }
    }
    None
}

/// A line counts as front matter when an abstract heading exists and the line sits more
/// than 36 units above it.
fn is_likely_front_matter_line(line: &TextLine, abstract_y: Option<f32>) -> bool {
    match abstract_y {
        Some(heading_y) => line.bbox.y > heading_y + 36.0,
        None => false,
    }
}

/// Footnote heuristic: average run size of at most 10 and more than four bytes of text.
fn is_likely_bottom_footnote_line(line: &TextLine) -> bool {
    let small_type = average_run_size(line) <= 10.0;
    small_type && text_line_plain_text(line).len() > 4
}

/// Mean font size of the line's runs; a line without runs reports its box height instead.
fn average_run_size(line: &TextLine) -> f32 {
    let run_count = line.runs.len();
    if run_count == 0 {
        return line.bbox.height;
    }
    let total_size: f32 = line.runs.iter().map(|run| run.size).sum();
    total_size / run_count as f32
}

/// Page-number heuristic: the line's text is one to four ASCII digits and nothing else.
fn is_likely_page_number_line(line: &TextLine) -> bool {
    let text = text_line_plain_text(line);
    (1..=4).contains(&text.len()) && text.bytes().all(|byte| byte.is_ascii_digit())
}

/// Joins the trimmed, non-empty run texts of a line with single spaces.
fn text_line_plain_text(line: &TextLine) -> String {
    let mut joined = String::new();
    for run in &line.runs {
        let piece = run.text.trim();
        if piece.is_empty() {
            continue;
        }
        if !joined.is_empty() {
            joined.push(' ');
        }
        joined.push_str(piece);
    }
    joined
}

/// Sorts `indices` ascending and removes duplicates, in place.
fn dedupe_indices(indices: &mut Vec<usize>) {
    // `dedup` only drops adjacent repeats, so the sort has to come first.
    indices.sort_unstable_by(|first, second| first.cmp(second));
    indices.dedup_by(|later, earlier| later == earlier);
}

/// Mean x position of the lines selected by `indices`, or `None` when no index is given.
///
/// Every index must be valid for `lines`; an out-of-range index panics.
fn average_x(lines: &[TextLine], indices: &[usize]) -> Option<f32> {
    let count = indices.len();
    if count == 0 {
        return None;
    }
    let total_x: f32 = indices.iter().map(|&index| lines[index].bbox.x).sum();
    Some(total_x / count as f32)
}

/// Splits lines into two columns at the largest gap between their horizontal centers.
///
/// Returns `None` when there are fewer than four lines, when the largest center gap is
/// below 90, when either side would hold fewer than two lines, or when the two sides
/// overlap vertically by less than the average line height.
fn detect_text_columns(lines: &[TextLine]) -> Option<Vec<Vec<&TextLine>>> {
    if lines.len() < 4 {
        return None;
    }

    // (line index, horizontal center), ordered by center.
    let mut centers: Vec<(usize, f32)> = lines
        .iter()
        .enumerate()
        .map(|(index, line)| (index, line.bbox.x + line.bbox.width / 2.0))
        .collect();
    centers.sort_by(|first, second| first.1.total_cmp(&second.1));

    // Position in `centers` where the right-hand group starts, plus the gap before it.
    let (split_index, largest_gap) = centers
        .windows(2)
        .enumerate()
        .map(|(index, pair)| (index + 1, pair[1].1 - pair[0].1))
        .max_by(|first, second| first.1.total_cmp(&second.1))?;
    if largest_gap < 90.0 {
        return None;
    }

    let (left_centers, right_centers) = centers.split_at(split_index);
    if left_centers.len() < 2 || right_centers.len() < 2 {
        return None;
    }

    let mut left_column = Vec::with_capacity(left_centers.len());
    for &(index, _) in left_centers {
        left_column.push(&lines[index]);
    }
    let mut right_column = Vec::with_capacity(right_centers.len());
    for &(index, _) in right_centers {
        right_column.push(&lines[index]);
    }

    // Real columns run side by side; stacked groups barely overlap vertically.
    let vertical_overlap = y_overlap(&left_column, &right_column)?;
    if vertical_overlap < average_line_height(lines) {
        return None;
    }

    Some(vec![left_column, right_column])
}

/// Mean x position of a column's lines; an empty column reports 0.
fn column_x(lines: &[&TextLine]) -> f32 {
    match lines.len() {
        0 => 0.0,
        count => {
            let total_x: f32 = lines.iter().map(|line| line.bbox.x).sum();
            total_x / count as f32
        }
    }
}

/// Length of the vertical range shared by two groups of lines (never negative).
///
/// Each group spans from its smallest `y` to its largest `y + height`. Returns `None` when
/// either group is empty.
fn y_overlap(left: &[&TextLine], right: &[&TextLine]) -> Option<f32> {
    fn lower_edge(column: &[&TextLine]) -> Option<f32> {
        column.iter().map(|line| line.bbox.y).reduce(f32::min)
    }

    fn upper_edge(column: &[&TextLine]) -> Option<f32> {
        column
            .iter()
            .map(|line| line.bbox.y + line.bbox.height)
            .reduce(f32::max)
    }

    let shared_upper = upper_edge(left)?.min(upper_edge(right)?);
    let shared_lower = lower_edge(left)?.max(lower_edge(right)?);
    Some((shared_upper - shared_lower).max(0.0))
}

/// Mean box height of `lines`.
///
/// Returns 0 for an empty slice instead of dividing by zero (which would yield NaN),
/// matching how `column_x` treats an empty column.
fn average_line_height(lines: &[TextLine]) -> f32 {
    if lines.is_empty() {
        return 0.0;
    }
    let total: f32 = lines.iter().map(|line| line.bbox.height).sum();
    total / lines.len() as f32
}

/// Converts a line into a `Block::Text`, or `None` when the line has no usable text.
fn text_line_block(page_number: usize, line: &TextLine) -> Option<Block> {
    let text_block = text_block_from_line(page_number, line)?;
    Some(Block::Text(text_block))
}

/// Builds a single-line text block from `line`.
///
/// The run texts are trimmed, joined with spaces and cleaned; `None` is returned when
/// nothing remains. The block carries one `Line` with a span per run, one source anchor for
/// the page, and a fixed, uncalibrated confidence of 0.82.
fn text_block_from_line(page_number: usize, line: &TextLine) -> Option<TextBlock> {
    let mut raw_text = String::new();
    for piece in line.runs.iter().map(|run| run.text.trim()) {
        if piece.is_empty() {
            continue;
        }
        if !raw_text.is_empty() {
            raw_text.push(' ');
        }
        raw_text.push_str(piece);
    }

    let text = clean_pdf_line_text(&raw_text);
    if text.is_empty() {
        return None;
    }

    let line_box = Some(line.bbox);
    let kind = classify_text_line(&text);
    // Spans keep the untrimmed run text.
    let spans = line
        .runs
        .iter()
        .map(|run| Span {
            text: run.text.clone(),
            bbox: Some(run.bbox),
            font: run.font.clone(),
            size: Some(run.size),
        })
        .collect();
    let source_anchor = anchor(page_number, line_box, source_ids_for_line(line));

    Some(TextBlock {
        text: text.clone(),
        kind,
        bbox: line_box,
        lines: vec![Line {
            text,
            bbox: line_box,
            spans,
        }],
        source_anchors: vec![source_anchor],
        confidence: Some(Confidence {
            score: 0.82,
            calibrated: false,
        }),
    })
}

/// Folds each block into its predecessor when `should_merge_text_blocks` says the two belong
/// together; otherwise the block starts a new entry. Order is preserved.
fn merge_wrapped_text_blocks(blocks: Vec<TextBlock>) -> Vec<TextBlock> {
    let mut merged: Vec<TextBlock> = Vec::new();
    for block in blocks {
        let continues_previous = merged
            .last()
            .is_some_and(|previous| should_merge_text_blocks(previous, &block));
        match merged.last_mut() {
            Some(previous) if continues_previous => merge_text_block(previous, block),
            _ => merged.push(block),
        }
    }
    merged
}

/// Decides whether `next` is the wrapped continuation of `previous`.
///
/// Both blocks need a box, `next` must sit below `previous` by no more than 1.8 times the
/// taller box height, and their left edges must be within 18 units. Given that, the blocks
/// merge when `previous` ends with a hyphen and `next` starts lowercase, or when both are
/// paragraphs, `next` starts lowercase and `previous` does not end a sentence.
fn should_merge_text_blocks(previous: &TextBlock, next: &TextBlock) -> bool {
    let (Some(previous_bbox), Some(next_bbox)) = (previous.bbox, next.bbox) else {
        return false;
    };

    let baseline_gap = previous_bbox.y - next_bbox.y;
    let max_gap = previous_bbox.height.max(next_bbox.height) * 1.8;
    if baseline_gap <= 0.0 || baseline_gap > max_gap {
        return false;
    }

    // Every merge rule below requires aligned left edges.
    let x_aligned = (previous_bbox.x - next_bbox.x).abs() <= 18.0;
    if !x_aligned {
        return false;
    }

    let next_starts_lowercase = starts_with_lowercase(&next.text);
    if previous.text.ends_with('-') && next_starts_lowercase {
        return true;
    }

    if previous.kind != "paragraph" || next.kind != "paragraph" {
        return false;
    }
    next_starts_lowercase && !ends_sentence(&previous.text)
}

/// Appends `next` to `previous`: the texts are joined as wrapped text, the box grows to
/// cover both, and lines and source anchors are appended in order.
fn merge_text_block(previous: &mut TextBlock, next: TextBlock) {
    let joined_text = join_wrapped_text(&previous.text, &next.text);
    previous.text = joined_text;

    // Keep the existing box when no union can be formed.
    let both_boxes = previous.bbox.into_iter().chain(next.bbox);
    if let Some(combined) = union_boxes(both_boxes) {
        previous.bbox = Some(combined);
    }

    previous.lines.extend(next.lines);
    previous.source_anchors.extend(next.source_anchors);
}

/// Joins two wrapped pieces of text.
///
/// A trailing hyphen on `previous` is dropped and the pieces are glued together;
/// otherwise they are separated by exactly one space. Leading whitespace of `next` is
/// always removed.
fn join_wrapped_text(previous: &str, next: &str) -> String {
    let continuation = next.trim_start();
    match previous.strip_suffix('-') {
        Some(stem) => [stem, continuation].concat(),
        None => [previous.trim_end(), " ", continuation].concat(),
    }
}

/// Returns `true` when the first alphabetic character of `text` is lowercase.
///
/// Leading non-alphabetic characters are skipped; text without any alphabetic
/// character yields `false`.
fn starts_with_lowercase(text: &str) -> bool {
    let first_letter = text.chars().find(|glyph| glyph.is_alphabetic());
    matches!(first_letter, Some(letter) if letter.is_lowercase())
}

/// Returns `true` when `text`, ignoring trailing whitespace, ends with `.`, `!` or `?`.
fn ends_sentence(text: &str) -> bool {
    let trimmed = text.trim_end();
    ['.', '!', '?']
        .iter()
        .any(|&terminator| trimmed.ends_with(terminator))
}

/// Rebuilds a line of extracted PDF text from its whitespace-separated tokens.
///
/// Tokens are normalized first, then re-joined with these rules, tried in order
/// for every token:
/// 1. closing punctuation attaches to the previous word;
/// 2. a lone apostrophe or hyphen glues the previous word to the next word piece;
/// 3. a token following a trailing hyphen, or a known split word piece, is
///    appended to the previous word;
/// 4. a run of two or more single-letter fragments collapses into one word;
/// 5. anything else becomes a new word.
///
/// The joined result then goes through phrase repair and math-notation repair.
fn clean_pdf_line_text(text: &str) -> String {
    let tokens: Vec<String> = text
        .split_whitespace()
        .map(normalize_pdf_token)
        .filter(|token| !token.is_empty())
        .collect();

    let mut cleaned: Vec<String> = Vec::new();
    let mut cursor = 0;
    while let Some(current) = tokens.get(cursor).map(String::as_str) {
        let upcoming = tokens.get(cursor + 1).map(String::as_str);

        // Rules 1-3 all extend the previous word, so they need one to exist.
        if let Some(previous) = cleaned.last_mut() {
            if is_closing_punctuation_token(current) {
                previous.push_str(current);
                cursor += 1;
                continue;
            }

            let joiner = if is_joining_apostrophe(current) {
                Some('\'')
            } else if is_joining_hyphen(current) {
                Some('-')
            } else {
                None
            };
            if let (Some(joiner), Some(next_piece)) = (joiner, upcoming) {
                if is_word_piece(next_piece) {
                    previous.push(joiner);
                    previous.push_str(next_piece);
                    cursor += 2;
                    continue;
                }
            }

            if should_join_after_trailing_hyphen(previous, current)
                || should_join_pdf_word_piece(previous, current)
            {
                previous.push_str(current);
                cursor += 1;
                continue;
            }
        }

        if is_letter_fragment(current) {
            let run_length = tokens[cursor..]
                .iter()
                .take_while(|fragment| is_letter_fragment(fragment))
                .count();
            // A single stray letter stays as its own word.
            if run_length >= 2 {
                cleaned.push(tokens[cursor..cursor + run_length].concat());
                cursor += run_length;
                continue;
            }
        }

        cleaned.push(current.to_owned());
        cursor += 1;
    }

    let joined = cleaned.join(" ");
    repair_pdf_math_notation(&repair_pdf_word_fragment_phrases(&joined))
}

/// Repairs a fixed list of words that PDF extraction is known to split apart
/// (for example `"th e"` -> `"the"`).
///
/// Each broken phrase is only replaced when it stands on its own: the character
/// before it and the character after it must not be alphanumeric. A plain
/// substring replace would also rewrite healthy text such as `"than dogs"`
/// (contains `"an d"`) or `"turn sharply"` (contains `"turn s"`).
///
/// Repairs are applied in list order, each on the output of the previous one.
fn repair_pdf_word_fragment_phrases(text: &str) -> String {
    let mut repaired = text.to_owned();
    for (broken, fixed) in [
        ("a c onversatio n", "a conversation"),
        ("ac onversatio n", "a conversation"),
        ("an other", "another"),
        ("ce nters", "centers"),
        ("prod uction", "production"),
        ("de mands", "demands"),
        ("turn s", "turns"),
        ("coordinate s", "coordinates"),
        ("coordinat e", "coordinate"),
        ("facilitat e", "facilitate"),
        ("speake rs", "speakers"),
        ("listener s'", "listeners'"),
        ("th e", "the"),
        ("p resent", "present"),
        ("linguisti c", "linguistic"),
        ("an d", "and"),
        ("inferen ces", "inferences"),
        ("attentio n", "attention"),
        ("B eyond", "Beyond"),
        ("variabilit y", "variability"),
        ("l essons", "lessons"),
        ("re peating", "repeating"),
        ("import ant", "important"),
        ("sp ecified", "specified"),
    ] {
        // Skip the allocation when the phrase does not occur at all.
        if repaired.contains(broken) {
            repaired = replace_standalone_phrase(&repaired, broken, fixed);
        }
    }
    repaired
}

/// Replaces every occurrence of `broken` in `text` that is not directly
/// preceded or followed by an alphanumeric character with `fixed`.
fn replace_standalone_phrase(text: &str, broken: &str, fixed: &str) -> String {
    let mut output = String::with_capacity(text.len());
    let mut copied_until = 0;
    let mut search_from = 0;

    while let Some(relative) = text[search_from..].find(broken) {
        let start = search_from + relative;
        let end = start + broken.len();
        let touches_word_before = text[..start]
            .chars()
            .next_back()
            .is_some_and(|character| character.is_alphanumeric());
        let touches_word_after = text[end..]
            .chars()
            .next()
            .is_some_and(|character| character.is_alphanumeric());

        if touches_word_before || touches_word_after {
            // Embedded in a longer word: leave it alone and resume the search
            // one character further on.
            let step = text[start..].chars().next().map_or(1, char::len_utf8);
            search_from = start + step;
        } else {
            output.push_str(&text[copied_until..start]);
            output.push_str(fixed);
            copied_until = end;
            search_from = end;
        }
    }

    output.push_str(&text[copied_until..]);
    output
}

/// Normalizes a single whitespace-free token from PDF text extraction.
///
/// Mis-decoded multi-byte sequences are mapped back to the symbol they stand
/// for, curly quotes become straight quotes, and embedded control glyphs are
/// repaired last.
fn normalize_pdf_token(token: &str) -> String {
    // Applied in order; each entry is (garbled sequence, intended symbol).
    const MOJIBAKE_REPAIRS: [(&str, &str); 12] = [
        ("â\u{80}\u{98}", "'"),
        ("â\u{80}\u{99}", "'"),
        ("Â·", "·"),
        ("â\u{84}\u{93}", "ℓ"),
        ("Î»", "λ"),
        ("Î›", "Λ"),
        ("Ï\u{84}", "τ"),
        ("Ã\u{97}", "×"),
        ("â\u{86}\u{92}", "→"),
        ("â\u{89}¥", "≥"),
        ("â\u{89}¤", "≤"),
        ("â\u{88}\u{88}", "∈"),
    ];

    let mut normalized = token.to_owned();
    for (garbled, intended) in MOJIBAKE_REPAIRS {
        normalized = normalized.replace(garbled, intended);
    }
    let straightened = normalized
        .replace(['‘', '’'], "'")
        .replace(['“', '”'], "\"");
    repair_embedded_pdf_control_glyphs(&straightened)
}

/// Rewrites control characters U+0002 and U+0003 that sit inside a token.
///
/// Directly before an alphabetic character, U+0002 becomes `fi` and U+0003
/// becomes `fl`. Anywhere else U+0002 is dropped, while U+0003 is kept as is.
fn repair_embedded_pdf_control_glyphs(token: &str) -> String {
    let mut output = String::with_capacity(token.len());
    let mut glyphs = token.chars().peekable();
    while let Some(glyph) = glyphs.next() {
        let before_letter = glyphs.peek().is_some_and(|next| next.is_alphabetic());
        match (glyph, before_letter) {
            ('\u{2}', true) => output.push_str("fi"),
            ('\u{2}', false) => {}
            ('\u{3}', true) => output.push_str("fl"),
            (other, _) => output.push(other),
        }
    }
    output
}

/// Returns `true` when `characters[index]` exists and is alphabetic.
///
/// An out-of-range `index` yields `false`.
fn has_following_alphabetic(characters: &[char], index: usize) -> bool {
    matches!(characters.get(index), Some(glyph) if glyph.is_alphabetic())
}

/// Returns `true` when `token` is exactly one closing punctuation mark
/// (`. , : ; ! ? ) ] }`).
fn is_closing_punctuation_token(token: &str) -> bool {
    const CLOSERS: [&str; 9] = [".", ",", ":", ";", "!", "?", ")", "]", "}"];
    CLOSERS.iter().any(|closer| *closer == token)
}

/// Returns `true` when `token` should be appended directly to `previous`
/// because `previous` ends in a hyphen.
///
/// `previous` must contain at least one ASCII alphanumeric character and
/// `token` must begin with one.
fn should_join_after_trailing_hyphen(previous: &str, token: &str) -> bool {
    if !previous.ends_with('-') {
        return false;
    }
    // ASCII alphanumerics are single bytes, so a byte scan matches a char scan.
    let token_starts_alnum = token
        .as_bytes()
        .first()
        .is_some_and(|byte| byte.is_ascii_alphanumeric());
    let previous_has_alnum = previous.bytes().any(|byte| byte.is_ascii_alphanumeric());
    token_starts_alnum && previous_has_alnum
}

/// Returns `true` when `previous` and `token` are the two halves of a word on
/// the known-splits list.
///
/// Both halves must be purely alphabetic, `previous` must end in a lowercase
/// letter and `token` must start with one.
fn should_join_pdf_word_piece(previous: &str, token: &str) -> bool {
    const KNOWN_SPLITS: [(&str, &str); 4] = [
        ("coordina", "ting"),
        ("de", "scribe"),
        ("foc", "i"),
        ("pro", "posed"),
    ];

    let both_alphabetic = is_alphabetic_word(previous) && is_alphabetic_word(token);
    if !both_alphabetic {
        return false;
    }

    let lowercase_seam = previous
        .chars()
        .next_back()
        .is_some_and(char::is_lowercase)
        && starts_with_lowercase(token);
    if !lowercase_seam {
        return false;
    }

    KNOWN_SPLITS
        .iter()
        .any(|&(head, tail)| head == previous && tail == token)
}

/// Returns `true` when `token` is non-empty and consists only of alphabetic characters.
fn is_alphabetic_word(token: &str) -> bool {
    let mut letters = token.chars().peekable();
    letters.peek().is_some() && letters.all(char::is_alphabetic)
}

/// Converts math-looking text into LaTeX-style notation and removes leftover
/// control glyphs.
///
/// Two mis-decoded sequences are fixed first. Symbol replacement and subscript
/// repair only run when the text looks like math; control glyphs are stripped
/// in either case.
fn repair_pdf_math_notation(text: &str) -> String {
    let normalized = text.replace("Â·", "·").replace("â\u{84}\u{93}", "ℓ");
    let repaired = if looks_like_pdf_math_notation(&normalized) {
        repair_math_subscript_spacing(&replace_math_symbols(&normalized))
    } else {
        normalized
    };
    strip_pdf_control_glyphs(&repaired)
}

/// Heuristic: returns `true` when `text` contains a math symbol or one of a
/// few character sequences treated as math markers (`...`, `Fq`, ` 6 =`).
fn looks_like_pdf_math_notation(text: &str) -> bool {
    const MATH_GLYPHS: [char; 19] = [
        'ℓ', 'λ', 'θ', 'ρ', 'τ', '∆', 'Δ', '≤', '≥', '∈', '∪', '∅', '·', '−', '±', '⊆', '∼',
        '≠', '→',
    ];
    const MATH_SEQUENCES: [&str; 3] = ["...", "Fq", " 6 ="];

    let has_math_glyph = text.chars().any(|glyph| MATH_GLYPHS.contains(&glyph));
    has_math_glyph
        || MATH_SEQUENCES
            .iter()
            .any(|sequence| text.contains(*sequence))
}

/// Rewrites math symbols in `text` as LaTeX commands.
///
/// Multi-character sequences are rewritten first (`· · ·`, `...`, `6 =`, `Fq`),
/// then each remaining symbol is mapped one character at a time. The Unicode
/// minus sign becomes an ASCII hyphen and U+0003 is emitted as `\Lambda`.
/// Characters without a mapping are copied unchanged.
fn replace_math_symbols(text: &str) -> String {
    const SEQUENCE_REWRITES: [(&str, &str); 4] = [
        ("· · ·", r"\cdots"),
        ("...", r"\ldots"),
        ("6 =", r"\neq"),
        ("Fq", r"\mathbb{F}_q"),
    ];

    let mut collapsed = text.to_owned();
    for (sequence, command) in SEQUENCE_REWRITES {
        collapsed = collapsed.replace(sequence, command);
    }

    let mut output = String::with_capacity(collapsed.len());
    for glyph in collapsed.chars() {
        let command = match glyph {
            '\u{3}' | 'Λ' => r"\Lambda",
            'ℓ' => r"\ell",
            'λ' => r"\lambda",
            'θ' => r"\theta",
            'Θ' => r"\Theta",
            'ρ' => r"\rho",
            'τ' => r"\tau",
            '∆' | 'Δ' => r"\Delta",
            '≤' => r"\leq",
            '≥' => r"\geq",
            '∈' => r"\in",
            '∪' => r"\cup",
            '∅' => r"\varnothing",
            '−' => "-",
            '±' => r"\pm",
            '⊆' => r"\subseteq",
            '∼' => r"\sim",
            '≠' => r"\neq",
            '×' => r"\times",
            '→' => r"\to",
            '·' => r"\cdot",
            unmapped => {
                output.push(unmapped);
                continue;
            }
        };
        output.push_str(command);
    }
    output
}

/// Removes every U+0002 and U+0003 control character from `text`.
fn strip_pdf_control_glyphs(text: &str) -> String {
    text.replace(['\u{2}', '\u{3}'], "")
}

/// Re-attaches subscripts that extraction separated from their base symbol.
///
/// When a math base token is followed by a token that starts with `_`, or by
/// one that begins with a recognizable subscript, both are fused into one
/// token. Every other token is checked for a compact base+subscript form.
/// Tokens are re-joined with single spaces.
fn repair_math_subscript_spacing(text: &str) -> String {
    let mut words = text.split_whitespace().peekable();
    let mut repaired: Vec<String> = Vec::new();

    while let Some(word) = words.next() {
        if is_math_base_token(word) {
            if let Some(&following) = words.peek() {
                let fused = if following.starts_with('_') {
                    Some(format!("{word}{following}"))
                } else {
                    split_math_subscript_token(following).map(|(subscript, suffix)| {
                        format!("{word}{}{suffix}", format_math_subscript(subscript))
                    })
                };
                if let Some(fused) = fused {
                    repaired.push(fused);
                    // The following token has been consumed as the subscript.
                    words.next();
                    continue;
                }
            }
        }
        repaired.push(repair_compact_math_subscript(word));
    }

    repaired.join(" ")
}

/// Inserts the missing `_` into a token that glues a base symbol to its
/// subscript, e.g. `x1` -> `x_1`.
///
/// Purely alphabetic tokens longer than two characters are returned unchanged.
/// Bases are tried in a fixed order and the first one that yields a valid
/// subscript wins; a token that matches none is returned unchanged.
///
/// NOTE(review): two-letter words made of a base plus a subscript letter are
/// rewritten as well (`is` becomes `i_s`); confirm this is acceptable for the
/// lines that reach this function.
fn repair_compact_math_subscript(token: &str) -> String {
    const SUBSCRIPT_BASES: [&str; 15] = [
        "m", "n", "N", "T", "V", "C", "x", "t", "i", "k", "h", "g", r"\lambda", r"\theta",
        r"\rho",
    ];

    let is_plain_word = token.chars().count() > 2 && token.chars().all(char::is_alphabetic);
    if is_plain_word {
        return token.to_owned();
    }

    SUBSCRIPT_BASES
        .iter()
        .find_map(|base| {
            let rest = token.strip_prefix(*base)?;
            // A bare base, or one that already carries `_`, needs no repair.
            if rest.is_empty() || rest.starts_with('_') {
                return None;
            }
            let (subscript, suffix) = split_math_subscript_token(rest)?;
            Some(format!(
                "{base}{}{suffix}",
                format_math_subscript(subscript)
            ))
        })
        .unwrap_or_else(|| token.to_owned())
}

/// Returns `true` when `token` is exactly one of the symbols that may carry a
/// subscript: a listed single letter, `\lambda`, `\theta` or `\rho`.
fn is_math_base_token(token: &str) -> bool {
    const MATH_BASES: [&str; 15] = [
        "m", "n", "N", "T", "V", "C", "x", "t", "i", "k", "h", "g", r"\lambda", r"\theta",
        r"\rho",
    ];
    MATH_BASES.iter().any(|base| *base == token)
}

/// Splits `token` into a leading subscript and the remaining suffix.
///
/// Recognized subscripts, in priority order: a LaTeX command (`\ell`,
/// `\lambda`, `\theta`, `\rho`), the words `init` or `cl`, a run of ASCII
/// digits, or one of the letters `i j k l n r s`. Returns `None` when the
/// token starts with none of these.
fn split_math_subscript_token(token: &str) -> Option<(&str, &str)> {
    const NAMED_SUBSCRIPTS: [&str; 6] = [r"\ell", r"\lambda", r"\theta", r"\rho", "init", "cl"];

    let named = NAMED_SUBSCRIPTS
        .iter()
        .find_map(|name| token.strip_prefix(*name).map(|suffix| (*name, suffix)));
    if named.is_some() {
        return named;
    }

    let digits_end = token
        .find(|glyph: char| !glyph.is_ascii_digit())
        .unwrap_or(token.len());
    if digits_end > 0 {
        return Some(token.split_at(digits_end));
    }

    let first = token.chars().next()?;
    matches!(first, 'i' | 'j' | 'k' | 'l' | 'n' | 'r' | 's')
        .then(|| token.split_at(first.len_utf8()))
}

/// Renders `subscript` as LaTeX subscript markup.
///
/// `init` is wrapped as upright text; anything else is prefixed with `_`.
fn format_math_subscript(subscript: &str) -> String {
    if subscript == "init" {
        return String::from(r"_{\text{init}}");
    }
    let mut rendered = String::with_capacity(subscript.len() + 1);
    rendered.push('_');
    rendered.push_str(subscript);
    rendered
}

/// Returns `true` when `token` is a single alphabetic character, optionally
/// followed by one `-`.
fn is_letter_fragment(token: &str) -> bool {
    let mut glyphs = token.chars();
    let starts_with_letter = glyphs.next().is_some_and(char::is_alphabetic);
    starts_with_letter
        && matches!(
            (glyphs.next(), glyphs.next()),
            (None, _) | (Some('-'), None)
        )
}

/// Returns `true` when `token` contains at least one alphabetic character.
fn is_word_piece(token: &str) -> bool {
    token.contains(char::is_alphabetic)
}

/// Returns `true` when `token` is a lone straight or right-curly apostrophe.
fn is_joining_apostrophe(token: &str) -> bool {
    token == "'" || token == "’"
}

/// Returns `true` when `token` is a lone hyphen-like character: ASCII `-` or
/// one of the three Unicode hyphen/dash variants listed below.
fn is_joining_hyphen(token: &str) -> bool {
    const HYPHENS: [&str; 4] = ["-", "‐", "‑", "–"];
    HYPHENS.iter().any(|hyphen| *hyphen == token)
}

/// Tries to interpret the multi-run lines of a page as one table.
///
/// Candidate lines are those with at least two runs. A table is reported only
/// when there are at least two candidates, all have the same run count with
/// left edges aligned to the first candidate, and `has_table_evidence` accepts
/// them. The first candidate supplies the headers; the rest become rows.
///
/// Header and row strings are trimmed, while the per-cell records keep the
/// run text as extracted. The returned `line_indices` are positions in `lines`.
fn detect_table(page_number: usize, lines: &[TextLine]) -> Option<DetectedTable> {
    let candidates: Vec<(usize, &TextLine)> = lines
        .iter()
        .enumerate()
        .filter(|(_, line)| line.runs.len() >= 2)
        .collect();
    if candidates.len() < 2 {
        return None;
    }

    let header_line = candidates[0].1;
    let column_count = header_line.runs.len();
    let is_uniform_grid = candidates.iter().all(|&(_, line)| {
        line.runs.len() == column_count && columns_align(&header_line.runs, &line.runs)
    });
    if !is_uniform_grid || !has_table_evidence(&candidates) {
        return None;
    }

    let trimmed_cells = |line: &TextLine| -> Vec<String> {
        line.runs
            .iter()
            .map(|run| run.text.trim().to_owned())
            .collect()
    };
    let headers = trimmed_cells(header_line);
    let rows = candidates
        .iter()
        .skip(1)
        .map(|&(_, line)| trimmed_cells(line))
        .collect::<Vec<_>>();
    let bbox = union_boxes(candidates.iter().map(|&(_, line)| line.bbox))?;

    let cells = candidates
        .iter()
        .enumerate()
        .flat_map(|(row, &(_, line))| {
            line.runs
                .iter()
                .enumerate()
                .map(move |(column, run)| TableCell {
                    row,
                    column,
                    text: run.text.clone(),
                    bbox: Some(run.bbox),
                    is_header: row == 0,
                })
        })
        .collect::<Vec<_>>();

    let line_indices = candidates.iter().map(|&(index, _)| index).collect();
    let table = TableBlock {
        headers,
        rows,
        caption: None,
        bbox: Some(bbox),
        cells,
        source_anchors: vec![anchor(page_number, Some(bbox), Vec::new())],
        confidence: Some(Confidence {
            score: 0.72,
            calibrated: false,
        }),
    };

    Some(DetectedTable {
        table,
        line_indices,
    })
}

/// Decides whether aligned candidate lines are convincing enough to be a table.
///
/// Three or more lines always qualify. With fewer, at least one run after the
/// first line must contain an ASCII digit.
fn has_table_evidence(candidate_lines: &[(usize, &TextLine)]) -> bool {
    match candidate_lines {
        [] => false,
        [_, _, _, ..] => true,
        [_, body @ ..] => body
            .iter()
            .flat_map(|(_, line)| line.runs.iter())
            .any(|run| run.text.chars().any(|glyph| glyph.is_ascii_digit())),
    }
}

/// Returns `true` when every run pair from `first` and `next`, taken
/// position by position, has left edges within 6 units of each other.
///
/// Only pairs present in both slices are compared.
fn columns_align(first: &[TextRun], next: &[TextRun]) -> bool {
    for (reference, candidate) in first.iter().zip(next) {
        let drift = (reference.bbox.x - candidate.bbox.x).abs();
        if drift <= 6.0 {
            continue;
        }
        return false;
    }
    true
}

/// Groups text runs into lines by vertical position.
///
/// Runs are ordered by descending `y`, then ascending `x`. Each run joins the
/// first existing line whose box `y` is within 3 units of its own, otherwise it
/// starts a new line. A line's runs stay sorted left to right and its box is
/// the union of its runs' boxes.
fn group_text_runs(mut runs: Vec<TextRun>) -> Vec<TextLine> {
    runs.sort_by(|left, right| {
        right
            .bbox
            .y
            .total_cmp(&left.bbox.y)
            .then_with(|| left.bbox.x.total_cmp(&right.bbox.x))
    });

    let mut lines: Vec<TextLine> = Vec::new();
    for run in runs {
        let matching_line = lines
            .iter()
            .position(|line| (line.bbox.y - run.bbox.y).abs() <= 3.0);
        match matching_line {
            Some(slot) => {
                let line = &mut lines[slot];
                line.runs.push(run);
                line.runs
                    .sort_by(|left, right| left.bbox.x.total_cmp(&right.bbox.x));
                let widened = union_boxes(line.runs.iter().map(|member| member.bbox));
                line.bbox = widened.unwrap_or(line.bbox);
            }
            None => lines.push(TextLine {
                bbox: run.bbox,
                runs: vec![run],
            }),
        }
    }

    lines
}

/// Tokenizes a content stream into operations.
///
/// Operands accumulate until an operator arrives; the operator then takes all
/// pending operands. Operands left over after the last operator are discarded.
fn parse_content_ops(bytes: &[u8]) -> Vec<ContentOp> {
    let mut parser = ContentParser::new(bytes);
    let mut pending = Vec::new();
    let mut ops = Vec::new();

    for token in std::iter::from_fn(|| parser.next_operand_or_operator()) {
        match token {
            ContentToken::Operand(operand) => pending.push(operand),
            ContentToken::Operator(name) => ops.push(ContentOp {
                operands: std::mem::take(&mut pending),
                operator: name,
            }),
        }
    }

    ops
}

/// One token produced by `ContentParser::next_operand_or_operator`.
#[derive(Debug)]
enum ContentToken {
    /// A value (number, name, string, array, ...) that precedes an operator.
    Operand(Operand),
    /// A bare keyword; `parse_content_ops` pairs it with the operands collected before it.
    Operator(String),
}

/// Cursor-based tokenizer over the raw bytes of a content stream.
struct ContentParser<'a> {
    /// The bytes being tokenized; borrowed for the parser's lifetime.
    bytes: &'a [u8],
    /// Index into `bytes` of the next unread byte.
    pos: usize,
}

impl<'a> ContentParser<'a> {
    /// Creates a parser positioned at the first byte of `bytes`.
    fn new(bytes: &'a [u8]) -> Self {
        Self { bytes, pos: 0 }
    }

    /// Reads the next operand or operator, skipping whitespace and comments.
    ///
    /// Returns `None` only when the input is exhausted. Bytes that cannot be
    /// tokenized are consumed and reported as `Operand::Other` so that one bad
    /// token never ends parsing of the remaining stream.
    fn next_operand_or_operator(&mut self) -> Option<ContentToken> {
        self.skip_ws_and_comments();
        if self.pos >= self.bytes.len() {
            return None;
        }

        let byte = self.bytes[self.pos];
        match byte {
            b'/' => Some(ContentToken::Operand(Operand::Name(self.read_name()))),
            b'(' => Some(ContentToken::Operand(Operand::Literal(self.read_literal()))),
            b'[' => Some(ContentToken::Operand(Operand::Array(self.read_array()))),
            b'<' if self.peek(1) != Some(b'<') => {
                Some(ContentToken::Operand(Operand::Hex(self.read_hex_string())))
            }
            b'+' | b'-' | b'.' | b'0'..=b'9' => {
                // A malformed number such as `-` or `1.2.3` has already been
                // consumed by `read_number`. Emitting a placeholder instead of
                // `None` keeps callers from mistaking it for end of input.
                let operand = self
                    .read_number()
                    .map_or(Operand::Other, Operand::Number);
                Some(ContentToken::Operand(operand))
            }
            _ => {
                let word = self.read_word();
                if word.is_empty() {
                    // A stray delimiter: step over it so the parser advances.
                    self.pos += 1;
                    Some(ContentToken::Operand(Operand::Other))
                } else {
                    Some(ContentToken::Operator(word))
                }
            }
        }
    }

    /// Reads an array starting at `[` and returns its operands.
    ///
    /// Operators found inside the array are dropped. A missing `]` ends the
    /// array at end of input.
    fn read_array(&mut self) -> Vec<Operand> {
        self.pos += 1;
        let mut items = Vec::new();
        loop {
            self.skip_ws_and_comments();
            if self.pos >= self.bytes.len() || self.bytes[self.pos] == b']' {
                self.pos = (self.pos + 1).min(self.bytes.len());
                break;
            }

            match self.next_operand_or_operator() {
                Some(ContentToken::Operand(operand)) => items.push(operand),
                Some(ContentToken::Operator(_)) | None => {}
            }
        }
        items
    }

    /// Reads a name starting at `/` and returns it without the slash.
    fn read_name(&mut self) -> String {
        self.pos += 1;
        let start = self.pos;
        while self.pos < self.bytes.len() && !is_delimiter_or_ws(self.bytes[self.pos]) {
            self.pos += 1;
        }
        lossy(&self.bytes[start..self.pos])
    }

    /// Reads a literal string starting at `(` and returns its decoded bytes.
    ///
    /// Balanced nested parentheses are kept, backslash escapes are decoded and
    /// a backslash before a line break continues the string without emitting
    /// anything. An unterminated string ends at end of input.
    fn read_literal(&mut self) -> Vec<u8> {
        self.pos += 1;
        let mut depth = 1;
        let mut output = Vec::new();

        while self.pos < self.bytes.len() && depth > 0 {
            let byte = self.bytes[self.pos];
            self.pos += 1;
            match byte {
                b'\\' => {
                    if self.pos < self.bytes.len() {
                        match self.bytes[self.pos] {
                            b'n' => {
                                output.push(b'\n');
                                self.pos += 1;
                            }
                            b'r' => {
                                output.push(b'\r');
                                self.pos += 1;
                            }
                            b't' => {
                                output.push(b'\t');
                                self.pos += 1;
                            }
                            b'b' => {
                                output.push(0x08);
                                self.pos += 1;
                            }
                            b'f' => {
                                output.push(0x0c);
                                self.pos += 1;
                            }
                            b'\n' => {
                                self.pos += 1;
                            }
                            b'\r' => {
                                self.pos += 1;
                                // Treat CRLF after the backslash as one line break.
                                if self.bytes.get(self.pos) == Some(&b'\n') {
                                    self.pos += 1;
                                }
                            }
                            b'0'..=b'7' => output.push(self.read_octal_escape()),
                            other => {
                                // Unknown escape: keep the character itself.
                                output.push(other);
                                self.pos += 1;
                            }
                        }
                    }
                }
                b'(' => {
                    depth += 1;
                    output.push(byte);
                }
                b')' => {
                    depth -= 1;
                    // The closing parenthesis of the string itself is not content.
                    if depth > 0 {
                        output.push(byte);
                    }
                }
                _ => output.push(byte),
            }
        }

        output
    }

    /// Reads up to three octal digits of a `\ddd` escape and returns the byte.
    ///
    /// Values above 0o377 keep only their low eight bits, as the PDF
    /// specification requires high-order overflow to be ignored.
    fn read_octal_escape(&mut self) -> u8 {
        let mut value = 0u16;
        let mut digits = 0;
        while self.pos < self.bytes.len()
            && digits < 3
            && matches!(self.bytes[self.pos], b'0'..=b'7')
        {
            value = (value << 3) + u16::from(self.bytes[self.pos] - b'0');
            self.pos += 1;
            digits += 1;
        }
        // Masking makes the narrowing cast lossless by construction.
        (value & 0x00ff) as u8
    }

    /// Reads a hex string starting at `<` and returns the decoded bytes.
    ///
    /// A missing `>` ends the string at end of input.
    fn read_hex_string(&mut self) -> Vec<u8> {
        self.pos += 1;
        let start = self.pos;
        while self.pos < self.bytes.len() && self.bytes[self.pos] != b'>' {
            self.pos += 1;
        }
        let raw = self.bytes[start..self.pos].to_vec();
        self.pos = (self.pos + 1).min(self.bytes.len());
        decode_hex(&raw)
    }

    /// Consumes a run of sign, dot and digit bytes and parses it as `f32`.
    ///
    /// Returns `None` when the run is not a valid number; the cursor has moved
    /// past the run in either case.
    fn read_number(&mut self) -> Option<f32> {
        let start = self.pos;
        while self.pos < self.bytes.len()
            && matches!(self.bytes[self.pos], b'+' | b'-' | b'.' | b'0'..=b'9')
        {
            self.pos += 1;
        }
        std::str::from_utf8(&self.bytes[start..self.pos])
            .ok()
            .and_then(|text| text.parse().ok())
    }

    /// Reads bytes up to the next delimiter or whitespace as a lossy string.
    ///
    /// Returns an empty string when the cursor already sits on a delimiter.
    fn read_word(&mut self) -> String {
        let start = self.pos;
        while self.pos < self.bytes.len() && !is_delimiter_or_ws(self.bytes[self.pos]) {
            self.pos += 1;
        }
        lossy(&self.bytes[start..self.pos])
    }

    /// Advances past whitespace and `%` comments, which run to the end of the line.
    fn skip_ws_and_comments(&mut self) {
        loop {
            while self.pos < self.bytes.len() && is_ws(self.bytes[self.pos]) {
                self.pos += 1;
            }
            if self.pos < self.bytes.len() && self.bytes[self.pos] == b'%' {
                while self.pos < self.bytes.len() && !matches!(self.bytes[self.pos], b'\n' | b'\r')
                {
                    self.pos += 1;
                }
            } else {
                break;
            }
        }
    }

    /// Returns the byte `offset` positions ahead of the cursor, if any.
    fn peek(&self, offset: usize) -> Option<u8> {
        self.bytes.get(self.pos + offset).copied()
    }
}

/// Scans `bytes` for `<number> <generation> obj ... endobj` sections and
/// returns each one with its raw body.
///
/// A header is only tried at offset 0 or where `is_ws_or_line_start` accepts
/// the position. Scanning stops at the first header that has no `endobj`
/// after it.
fn parse_indirect_objects(bytes: &[u8]) -> Vec<PdfObject> {
    /// Parses `<number> <generation> obj` at `at`; returns the two numbers and
    /// the offset just past the `obj` keyword.
    fn object_header(bytes: &[u8], at: usize) -> Option<(usize, usize, usize)> {
        let (object_number, cursor) = parse_unsigned_at(bytes, at)?;
        let cursor = skip_required_ws(bytes, cursor)?;
        let (generation, cursor) = parse_unsigned_at(bytes, cursor)?;
        let cursor = skip_required_ws(bytes, cursor)?;
        bytes[cursor..]
            .starts_with(b"obj")
            .then_some((object_number, generation, cursor + 3))
    }

    let mut objects = Vec::new();
    let mut cursor = 0;

    while cursor < bytes.len() {
        let at_boundary = is_ws_or_line_start(bytes, cursor) || cursor == 0;
        let header = if at_boundary {
            object_header(bytes, cursor)
        } else {
            None
        };
        let Some((object_number, generation, body_start)) = header else {
            cursor += 1;
            continue;
        };

        let Some(relative_end) = find_subslice(&bytes[body_start..], b"endobj") else {
            break;
        };
        let body_end = body_start + relative_end;
        objects.push(PdfObject {
            object_number: object_number as u32,
            generation: generation as u16,
            body: bytes[body_start..body_end].to_vec(),
        });
        cursor = body_end + b"endobj".len();
    }

    objects
}

/// Unpacks objects stored inside object streams (`/Type /ObjStm`) and appends
/// them to `objects`.
///
/// Each stream's header holds `/N` pairs of `(object number, offset)`, with
/// offsets relative to `/First`. An embedded object is added with generation 0
/// unless an object with the same number already exists. Streams and entries
/// that are missing fields, fail to decode, or point outside the decoded data
/// are skipped.
fn expand_object_streams(objects: &mut Vec<PdfObject>) {
    let existing = objects
        .iter()
        .map(|object| object.object_number)
        .collect::<std::collections::HashSet<_>>();
    let mut expanded = Vec::new();

    // Borrow the stream objects instead of cloning them; `objects` is only
    // mutated after this loop.
    let object_streams = objects.iter().filter(|object| {
        lossy(&object.body)
            .split_whitespace()
            .collect::<String>()
            .contains("/Type/ObjStm")
    });

    for object_stream in object_streams {
        let object_body = lossy(&object_stream.body);
        let Some(count) = parse_number_after(&object_body, "/N").map(|value| value as usize) else {
            continue;
        };
        let Some(first) = parse_number_after(&object_body, "/First").map(|value| value as usize)
        else {
            continue;
        };
        let Ok(Some(decoded)) = decode_stream_object(object_stream) else {
            continue;
        };
        if first > decoded.len() {
            continue;
        }
        // Length of the object data after the header; cannot underflow here.
        let data_len = decoded.len() - first;

        let header = lossy(&decoded[..first]);
        let header_numbers = header
            .split_whitespace()
            .filter_map(|part| part.parse::<usize>().ok())
            .collect::<Vec<_>>();
        let entries = header_numbers
            .chunks_exact(2)
            .take(count)
            .map(|pair| (pair[0], pair[1]))
            .collect::<Vec<_>>();

        for (index, &(raw_number, offset)) in entries.iter().enumerate() {
            // Skip numbers that do not fit rather than truncating them into a
            // different object's number.
            let Ok(object_number) = u32::try_from(raw_number) else {
                continue;
            };
            if existing.contains(&object_number) {
                continue;
            }
            let next_offset = entries
                .get(index + 1)
                .map_or(data_len, |&(_, next_offset)| next_offset);
            // Compare against `data_len` instead of adding `first`, so a huge
            // offset from a crafted header cannot overflow.
            if offset > next_offset || next_offset > data_len {
                continue;
            }
            expanded.push(PdfObject {
                object_number,
                generation: 0,
                // In range: offset <= next_offset <= data_len.
                body: decoded[first + offset..first + next_offset].to_vec(),
            });
        }
    }

    objects.extend(expanded);
}

/// Builds a `PageSeed` when `object` is a leaf page (`/Type /Page`, not `/Type /Pages`).
///
/// The seed's body is the page dictionary followed by the bodies of its
/// page-tree ancestors. The page number is left at 0.
fn page_seed(object: &PdfObject, object_map: &HashMap<u32, PdfObject>) -> Option<PageSeed> {
    let body = lossy(&object.body);
    // Whitespace is removed so `/Type /Page` and `/Type/Page` match alike.
    let compact: String = body.split_whitespace().collect();
    let is_leaf_page = compact.contains("/Type/Page") && !compact.contains("/Type/Pages");

    is_leaf_page.then(|| PageSeed {
        number: 0,
        body: body_with_inherited_page_tree_entries(&body, object_map),
    })
}

/// Returns `page_body` followed by the bodies of its `/Parent` chain, so that
/// entries defined on ancestors can be found in one string.
fn body_with_inherited_page_tree_entries(
    page_body: &str,
    object_map: &HashMap<u32, PdfObject>,
) -> String {
    let mut combined = String::from(page_body);
    append_parent_page_tree_entries(page_body, object_map, &mut combined, 0);
    combined
}

/// Follows `/Parent` references upward from `body` and appends each ancestor's
/// body to `output`, each preceded by a newline.
///
/// The walk stops when a body has no resolvable `/Parent`, or once `depth`
/// plus the levels walked reaches 16.
fn append_parent_page_tree_entries(
    body: &str,
    object_map: &HashMap<u32, PdfObject>,
    output: &mut String,
    depth: usize,
) {
    let mut current = std::borrow::Cow::Borrowed(body);
    for _level in depth..16 {
        let Some(parent_ref) = parse_direct_ref_after_key(&current, "/Parent") else {
            return;
        };
        let Some(parent) = object_map.get(&(parent_ref as u32)) else {
            return;
        };
        let parent_body = lossy(&parent.body);
        output.push('\n');
        output.push_str(&parent_body);
        current = std::borrow::Cow::Owned(parent_body);
    }
}

/// Extracts the stream data of `object`, inflating it when the dictionary
/// names `FlateDecode` as the only or first filter.
///
/// Returns `Ok(None)` when the object has no `stream` keyword.
///
/// # Errors
///
/// Fails when `endstream` is missing, when it does not come after `stream`,
/// or when inflating the data fails.
fn decode_stream_object(object: &PdfObject) -> Result<Option<Vec<u8>>> {
    const FLATE_MARKERS: [&str; 3] = [
        "/Filter/FlateDecode",
        "/Filter[/FlateDecode",
        "/Filter[/FlateDecode]",
    ];

    let body = &object.body;
    let Some(stream_marker) = find_subslice(body, b"stream") else {
        return Ok(None);
    };
    let end_marker = find_subslice(body, b"endstream")
        .ok_or_else(|| DonglerError::pdf("stream is missing endstream marker"))?;
    if end_marker <= stream_marker {
        return Err(DonglerError::pdf("stream markers are malformed"));
    }

    let dict = lossy(&body[..stream_marker]);
    let mut stream = body[stream_marker + b"stream".len()..end_marker].to_vec();
    trim_stream_edges(&mut stream);

    // Whitespace is removed so the filter entry matches however it is spaced.
    let compact_dict: String = dict.split_whitespace().collect();
    let uses_flate = FLATE_MARKERS
        .iter()
        .any(|marker| compact_dict.contains(*marker));
    if !uses_flate {
        return Ok(Some(stream));
    }

    let mut decoded = Vec::new();
    ZlibDecoder::new(stream.as_slice())
        .read_to_end(&mut decoded)
        .map_err(|error| DonglerError::pdf(format!("FlateDecode failed: {error}")))?;
    Ok(Some(decoded))
}

/// Removes the end-of-line marker that frames stream data: one after the
/// `stream` keyword and one before `endstream`.
///
/// Exactly one marker (`\r\n`, `\n` or `\r`) is removed per side. Further
/// CR/LF bytes are left in place because they may be payload, for example in
/// uncompressed binary data.
fn trim_stream_edges(stream: &mut Vec<u8>) {
    let leading = match stream.as_slice() {
        [b'\r', b'\n', ..] => 2,
        [b'\n' | b'\r', ..] => 1,
        _ => 0,
    };
    // One drain instead of repeated `remove(0)`, which shifts the whole buffer each time.
    stream.drain(..leading);

    let trailing = match stream.as_slice() {
        [.., b'\r', b'\n'] => 2,
        [.., b'\n' | b'\r'] => 1,
        _ => 0,
    };
    stream.truncate(stream.len() - trailing);
}

/// Returns the object references that follow the first occurrence of `key`.
///
/// When the value is an array with a closing `]`, every reference up to that
/// bracket is returned. Otherwise at most the first reference found after the
/// key is returned. A missing key yields an empty vector.
fn parse_refs_after_key(text: &str, key: &str) -> Vec<usize> {
    let Some((_, rest)) = text.split_once(key) else {
        return Vec::new();
    };

    // Only whitespace may separate the key from an array value.
    let value = rest.trim_start();
    if value.starts_with('[') {
        if let Some(array_end) = value.find(']') {
            return parse_refs(&value[..array_end]);
        }
    }

    parse_refs(rest).into_iter().take(1).collect()
}

/// Parses an indirect reference (`<number> <generation> R`) placed directly
/// after the first occurrence of `key`, and returns the object number.
///
/// Returns `None` when the key is missing or is not followed by a reference.
fn parse_direct_ref_after_key(text: &str, key: &str) -> Option<usize> {
    let bytes = text.as_bytes();
    let after_key = text.find(key)? + key.len();
    let value_start = bytes[after_key..]
        .iter()
        .position(|&byte| !is_ws(byte))
        .map_or(bytes.len(), |offset| after_key + offset);

    let (object, cursor) = parse_unsigned_at(bytes, value_start)?;
    let cursor = skip_required_ws(bytes, cursor)?;
    let (_generation, cursor) = parse_unsigned_at(bytes, cursor)?;
    let cursor = skip_required_ws(bytes, cursor)?;

    (bytes.get(cursor) == Some(&b'R')).then_some(object)
}

/// Parses the `name -> object number` entries of the dictionary that follows
/// the first occurrence of `key`.
///
/// The dictionary spans from the first `<<` after the key to the first `>>`
/// after that. Returns an empty map when the key or either delimiter is missing.
fn parse_resource_refs(text: &str, key: &str) -> HashMap<String, u32> {
    let dict = text
        .split_once(key)
        .and_then(|(_, rest)| rest.split_once("<<"))
        .and_then(|(_, after_open)| after_open.split_once(">>"))
        .map(|(dict, _)| dict);

    match dict {
        Some(dict) => parse_named_refs(dict),
        None => HashMap::new(),
    }
}

/// Returns the body of the object that `/Resources` in `page_body` refers to.
///
/// Returns `None` when `/Resources` is not an indirect reference or the
/// referenced object is not in `object_map`.
fn resolve_resource_body(page_body: &str, object_map: &HashMap<u32, PdfObject>) -> Option<String> {
    let resource_ref = parse_direct_ref_after_key(page_body, "/Resources")?;
    let resources = object_map.get(&(resource_ref as u32))?;
    Some(lossy(&resources.body))
}

fn load_font_decoders(
    resource_text: &str,
    object_map: &HashMap<u32, PdfObject>,
) -> HashMap<String, FontDecoder> {
    resolve_named_resource_refs(resource_text, "/Font", object_map)
        .into_iter()
        .map(|(name, object_number)| {
            let decoder = object_map
                .get(&object_number)
                .map(|font| font_decoder(font, object_map))
                .unwrap_or_default();
            (name, decoder)
        })
        .collect()
}

fn resolve_named_resource_refs(
    resource_text: &str,
    key: &str,
    object_map: &HashMap<u32, PdfObject>,
) -> HashMap<String, u32> {
    let direct = parse_resource_refs(resource_text, key);
    if !direct.is_empty() {
        return direct;
    }

    parse_direct_ref_after_key(resource_text, key)
        .and_then(|object_number| object_map.get(&(object_number as u32)))
        .map(|object| parse_named_refs(&lossy(&object.body)))
        .unwrap_or_default()
}

fn font_decoder(font: &PdfObject, object_map: &HashMap<u32, PdfObject>) -> FontDecoder {
    let font_body = lossy(&font.body);
    let Some(to_unicode_ref) = parse_refs_after_key(&font_body, "/ToUnicode")
        .into_iter()
        .next()
    else {
        return FontDecoder::default();
    };
    let Some(to_unicode) = object_map.get(&(to_unicode_ref as u32)) else {
        return FontDecoder::default();
    };
    let Ok(Some(cmap_stream)) = decode_stream_object(to_unicode) else {
        return FontDecoder::default();
    };

    parse_to_unicode_cmap(&lossy(&cmap_stream))
}

/// Builds a [`FontDecoder`] from the text of a `/ToUnicode` CMap stream.
///
/// The stream is scanned line by line. A line ending in `beginbfchar` or
/// `beginbfrange` opens the matching section, and a line consisting solely of
/// `endbfchar` or `endbfrange` closes it.
///
/// Inside a `bfchar` section the hex strings on a line are consumed as
/// consecutive `<source> <destination>` pairs, so producers that pack several
/// mappings onto one line are decoded in full (previously only the first pair
/// was kept). A trailing unpaired hex string is ignored. Inside a `bfrange`
/// section a line with at least three hex strings is handed to `add_bfrange`.
///
/// `max_code_len` is the byte length of the longest source code seen, or `1`
/// when nothing was mapped.
fn parse_to_unicode_cmap(text: &str) -> FontDecoder {
    let mut cmap = HashMap::new();
    let mut in_bfchar = false;
    let mut in_bfrange = false;

    for line in text.lines() {
        let trimmed = line.trim();
        match trimmed {
            value if value.ends_with("beginbfchar") => {
                in_bfchar = true;
                continue;
            }
            "endbfchar" => {
                in_bfchar = false;
                continue;
            }
            value if value.ends_with("beginbfrange") => {
                in_bfrange = true;
                continue;
            }
            "endbfrange" => {
                in_bfrange = false;
                continue;
            }
            _ => {}
        }

        let hexes = hex_strings_in_line(trimmed);
        if in_bfchar && hexes.len() >= 2 {
            // Each mapping is exactly two hex strings, so a line may carry
            // any number of them back to back.
            for pair in hexes.chunks_exact(2) {
                cmap.insert(pair[0].clone(), cmap_text_for_mapping(&pair[0], &pair[1]));
            }
        } else if in_bfrange && hexes.len() >= 3 {
            add_bfrange(&mut cmap, &hexes);
        }
    }

    let max_code_len = cmap.keys().map(Vec::len).max().unwrap_or(1);
    FontDecoder { cmap, max_code_len }
}

/// Expands one `bfrange` entry (`<start> <end> <destination>`) into
/// individual code → text mappings.
///
/// `hexes` must hold at least three decoded hex strings; fewer are ignored
/// instead of panicking. Source codes wider than four bytes are skipped
/// because they cannot be rebuilt from the `u32` arithmetic used here. At
/// most 513 codes (offsets `0..=512`) are expanded per entry, and an inverted
/// range (`end < start`) still maps the single `start` code. Expansion stops
/// early if the destination would run past `u32::MAX`; the unchecked addition
/// used before could panic or wrap on such input.
fn add_bfrange(cmap: &mut HashMap<Vec<u8>, String>, hexes: &[Vec<u8>]) {
    let [first, last, base, ..] = hexes else {
        return;
    };
    let source_len = first.len();
    if source_len > 4 {
        return;
    }
    let (Some(start), Some(end), Some(destination)) =
        (hex_to_u32(first), hex_to_u32(last), hex_to_u32(base))
    else {
        return;
    };

    let last_offset = end.saturating_sub(start).min(512);
    for offset in 0..=last_offset {
        // `offset` never exceeds `end - start`, so this sum is at most `end`
        // (or `start` itself for an inverted range) and cannot overflow.
        let source = start + offset;
        let Some(destination) = destination.checked_add(offset) else {
            break;
        };
        cmap.insert(
            number_to_be_bytes(source, source_len),
            cmap_text_for_codes(source, destination),
        );
    }
}

/// Produces the text for one `bfchar` mapping from its raw source and
/// destination bytes.
///
/// A destination of more than two bytes whose numeric value is not a Unicode
/// scalar value is decoded as a UTF-16BE string. That covers ligature
/// expansions such as `<00660069>` ("fi") and surrogate pairs, which used to
/// decode to an empty string. Every other destination is treated as a single
/// code point and passed, together with the source code, to
/// `cmap_text_for_codes`.
fn cmap_text_for_mapping(source: &[u8], destination: &[u8]) -> String {
    let (Some(source_code), Some(destination_code)) =
        (hex_to_u32(source), hex_to_u32(destination))
    else {
        return utf16be_hex_to_string(destination);
    };

    // More than four bytes cannot be a single `u32` code point no matter what
    // the numeric conversion returned for it.
    let is_single_scalar = destination.len() <= 4 && char::from_u32(destination_code).is_some();
    if destination.len() > 2 && !is_single_scalar {
        return utf16be_hex_to_string(destination);
    }

    cmap_text_for_codes(source_code, destination_code)
}

/// Turns a (source code, destination code point) pair into output text.
///
/// When the destination lies in the private-use range and the source code
/// maps onto printable ASCII, that ASCII character is used instead.
/// Otherwise the destination code point itself is emitted, or an empty string
/// when it is not a valid Unicode scalar value.
fn cmap_text_for_codes(source: u32, destination: u32) -> String {
    let ascii_override = if is_private_use_text_code(destination) {
        private_use_source_ascii(source)
    } else {
        None
    };

    match ascii_override.or_else(|| char::from_u32(destination)) {
        Some(character) => character.to_string(),
        None => String::new(),
    }
}

/// Reports whether `code` falls in the range U+E000..=U+F8FF.
fn is_private_use_text_code(code: u32) -> bool {
    matches!(code, 0xe000..=0xf8ff)
}

/// Maps a source code onto printable ASCII by adding 28 to it.
///
/// Returns the resulting character when it lands in `0x20..=0x7e`, and `None`
/// otherwise, including for source codes so large that the addition would
/// overflow `u32` (which previously panicked in debug builds).
///
/// NOTE(review): the offset of 28 presumably matches the glyph numbering of a
/// particular producer; confirm against the fonts this was written for.
fn private_use_source_ascii(source: u32) -> Option<char> {
    let ascii = source.checked_add(28)?;
    if (0x20..=0x7e).contains(&ascii) {
        char::from_u32(ascii)
    } else {
        None
    }
}

/// Extracts and decodes every `<...>` hex string found on `line`, in order.
///
/// A `<` that is immediately followed by another `<` is not treated as an
/// opener, and a `<` without a later `>` is skipped.
fn hex_strings_in_line(line: &str) -> Vec<Vec<u8>> {
    let bytes = line.as_bytes();
    let mut found = Vec::new();
    let mut cursor = 0;

    while let Some(&byte) = bytes.get(cursor) {
        let opens_hex = byte == b'<' && bytes.get(cursor + 1) != Some(&b'<');
        let body_len = if opens_hex {
            bytes[cursor + 1..]
                .iter()
                .position(|&candidate| candidate == b'>')
        } else {
            None
        };

        cursor = match body_len {
            Some(len) => {
                let body_start = cursor + 1;
                found.push(decode_hex(&bytes[body_start..body_start + len]));
                body_start + len + 1
            }
            None => cursor + 1,
        };
    }

    found
}

/// Decodes `bytes` as big-endian UTF-16, replacing invalid sequences.
///
/// A trailing odd byte is dropped. Inputs shorter than two bytes are instead
/// converted byte by byte, each byte becoming the character with the same
/// code point.
fn utf16be_hex_to_string(bytes: &[u8]) -> String {
    if bytes.len() < 2 {
        return bytes.iter().copied().map(char::from).collect();
    }

    let units: Vec<u16> = bytes
        .chunks_exact(2)
        .map(|pair| (u16::from(pair[0]) << 8) | u16::from(pair[1]))
        .collect();
    String::from_utf16_lossy(&units)
}

/// Interprets up to four bytes as a big-endian unsigned integer.
///
/// An empty slice yields `Some(0)`. Inputs longer than four bytes do not fit
/// in a `u32` and yield `None`; previously their leading bytes were silently
/// shifted out, so callers never reached their fallback paths.
fn hex_to_u32(bytes: &[u8]) -> Option<u32> {
    if bytes.len() > 4 {
        return None;
    }
    Some(
        bytes
            .iter()
            .fold(0u32, |value, &byte| (value << 8) | u32::from(byte)),
    )
}

/// Encodes `value` as exactly `len` big-endian bytes.
///
/// When `len` is smaller than four only the low-order bytes are kept; when it
/// is larger the result is left-padded with zeros. The previous shift-based
/// version overflowed the shift amount for `len > 4`, panicking in debug
/// builds and producing wrong bytes in release builds.
fn number_to_be_bytes(value: u32, len: usize) -> Vec<u8> {
    let encoded = value.to_be_bytes();
    let copied = len.min(encoded.len());
    let mut output = vec![0u8; len];
    output[len - copied..].copy_from_slice(&encoded[encoded.len() - copied..]);
    output
}

fn parse_named_refs(text: &str) -> HashMap<String, u32> {
    let mut refs = HashMap::new();
    let bytes = text.as_bytes();
    let mut pos = 0;

    while pos < bytes.len() {
        if bytes[pos] != b'/' || bytes.get(pos + 1) == Some(&b'/') {
            pos += 1;
            continue;
        }
        pos += 1;
        let name_start = pos;
        while pos < bytes.len() && !is_delimiter_or_ws(bytes[pos]) {
            pos += 1;
        }
        let name = lossy(&bytes[name_start..pos]);
        while pos < bytes.len() && is_ws(bytes[pos]) {
            pos += 1;
        }
        let Some((object, after_object)) = parse_unsigned_at(bytes, pos) else {
            continue;
        };
        let Some(after_space) = skip_required_ws(bytes, after_object) else {
            pos += 1;
            continue;
        };
        let Some((_generation, after_generation)) = parse_unsigned_at(bytes, after_space) else {
            pos += 1;
            continue;
        };
        let Some(after_space) = skip_required_ws(bytes, after_generation) else {
            pos += 1;
            continue;
        };
        if bytes.get(after_space) == Some(&b'R') {
            refs.insert(name, object as u32);
            pos = after_space + 1;
        }
    }

    refs
}

/// Returns the object number of every `<object> <generation> R` triple found
/// in `text`, in order of appearance.
///
/// Scanning advances one byte at a time until a triple parses, then resumes
/// right after its `R`.
fn parse_refs(text: &str) -> Vec<usize> {
    // Reads a reference starting exactly at `at` and yields the object number
    // together with the index of its `R` marker.
    fn reference_at(bytes: &[u8], at: usize) -> Option<(usize, usize)> {
        let (object, object_end) = parse_unsigned_at(bytes, at)?;
        let generation_start = skip_required_ws(bytes, object_end)?;
        let (_generation, generation_end) = parse_unsigned_at(bytes, generation_start)?;
        let marker = skip_required_ws(bytes, generation_end)?;
        (bytes.get(marker) == Some(&b'R')).then_some((object, marker))
    }

    let bytes = text.as_bytes();
    let mut found = Vec::new();
    let mut cursor = 0;

    while cursor < bytes.len() {
        cursor = match reference_at(bytes, cursor) {
            Some((object, marker)) => {
                found.push(object);
                marker + 1
            }
            None => cursor + 1,
        };
    }

    found
}

/// Parses the numbers of the first `[...]` array that appears after `key`.
///
/// Whitespace-separated tokens that do not parse as `f32` are skipped.
/// Returns `None` when `key`, the opening `[`, or the closing `]` is missing.
fn parse_number_array_after(text: &str, key: &str) -> Option<Vec<f32>> {
    let (_, after_key) = text.split_once(key)?;
    let (_, after_open) = after_key.split_once('[')?;
    let (inside, _) = after_open.split_once(']')?;

    let mut values = Vec::new();
    for token in inside.split_whitespace() {
        if let Ok(value) = token.parse::<f32>() {
            values.push(value);
        }
    }
    Some(values)
}

/// Parses the number that follows `key`.
///
/// Whitespace and `[` / `]` bytes between `key` and the number are skipped.
/// The number is the longest following run of `+`, `-`, `.` and ASCII digits.
/// Returns `None` when `key` is missing, no such run follows, or the run does
/// not parse as `f32`.
fn parse_number_after(text: &str, key: &str) -> Option<f32> {
    let bytes = text.as_bytes();
    let value_region = text.find(key)? + key.len();

    let skipped = bytes[value_region..]
        .iter()
        .take_while(|&&byte| is_ws(byte) || byte == b'[' || byte == b']')
        .count();
    let number_start = value_region + skipped;

    let number_len = bytes[number_start..]
        .iter()
        .take_while(|&&byte| matches!(byte, b'+' | b'-' | b'.' | b'0'..=b'9'))
        .count();
    if number_len == 0 {
        return None;
    }

    text[number_start..number_start + number_len].parse().ok()
}

/// Decodes the first operand as text.
///
/// Returns `None` when there are no operands or when the first operand is
/// not a string.
fn first_text_operand(
    operands: &[Operand],
    state: &GraphicsState,
    fonts: &HashMap<String, FontDecoder>,
) -> Option<String> {
    let operand = operands.first()?;
    operand_text(operand, state, fonts)
}

/// Decodes a literal or hex string operand into text.
///
/// The decoder registered under the graphics state's current font name is
/// used when there is one. Operands of any other kind yield `None`.
fn operand_text(
    operand: &Operand,
    state: &GraphicsState,
    fonts: &HashMap<String, FontDecoder>,
) -> Option<String> {
    let bytes = match operand {
        Operand::Literal(bytes) | Operand::Hex(bytes) => bytes,
        _ => return None,
    };
    let font = match &state.font_name {
        Some(font_name) => fonts.get(font_name),
        None => None,
    };
    Some(decode_pdf_text(bytes, font))
}

/// Concatenates the text carried by an array of operands.
///
/// String operands are decoded and appended. A number whose magnitude is at
/// least 120 contributes a single space unless the text already ends with
/// one. Smaller numbers and any other operand kinds contribute nothing.
fn text_from_array(
    items: &[Operand],
    state: &GraphicsState,
    fonts: &HashMap<String, FontDecoder>,
) -> String {
    let mut text = String::new();

    for item in items {
        let is_wide_gap = matches!(item, Operand::Number(value) if value.abs() >= 120.0);
        if is_wide_gap {
            if !text.ends_with(' ') {
                text.push(' ');
            }
            continue;
        }

        if let Some(part) = operand_text(item, state, fonts) {
            text.push_str(&part);
        }
    }

    text
}

/// Decodes the bytes of a PDF string into text.
///
/// A font decoder with a non-empty CMap takes precedence. Otherwise bytes
/// that start with the `FE FF` byte-order mark are decoded as big-endian
/// UTF-16 (lossily, dropping a trailing odd byte), and everything else is
/// converted byte by byte, each byte becoming the character with the same
/// code point.
fn decode_pdf_text(bytes: &[u8], font: Option<&FontDecoder>) -> String {
    match font {
        Some(font) if !font.cmap.is_empty() => return decode_with_cmap(bytes, font),
        _ => {}
    }

    if let [0xfe, 0xff, payload @ ..] = bytes {
        let units: Vec<u16> = payload
            .chunks_exact(2)
            .map(|pair| (u16::from(pair[0]) << 8) | u16::from(pair[1]))
            .collect();
        String::from_utf16_lossy(&units)
    } else {
        bytes.iter().copied().map(char::from).collect()
    }
}

/// Decodes `bytes` through the font's CMap using longest-match lookup.
///
/// At each position the longest code (up to `max_code_len` bytes, and never
/// fewer than one) present in the CMap is consumed and its text appended. A
/// byte with no mapping is emitted as the character with the same code point
/// and decoding continues with the next byte.
fn decode_with_cmap(bytes: &[u8], font: &FontDecoder) -> String {
    let mut output = String::new();
    let mut rest = bytes;

    while let Some(&first) = rest.first() {
        let longest = font.max_code_len.min(rest.len()).max(1);
        let hit = (1..=longest)
            .rev()
            .find_map(|len| font.cmap.get(&rest[..len]).map(|text| (len, text)));

        match hit {
            Some((len, text)) => {
                output.push_str(text);
                rest = &rest[len..];
            }
            None => {
                output.push(char::from(first));
                rest = &rest[1..];
            }
        }
    }

    output
}

/// Returns the last `count` operands as numbers.
///
/// Yields `None` when fewer than `count` operands are available or when any
/// of the last `count` operands is not a number.
fn numbers(operands: &[Operand], count: usize) -> Option<Vec<f32>> {
    let first = operands.len().checked_sub(count)?;
    let mut values = Vec::with_capacity(count);

    for operand in &operands[first..] {
        let Operand::Number(value) = operand else {
            return None;
        };
        values.push(*value);
    }

    Some(values)
}

/// Flattens a block into plain text.
///
/// Text blocks yield their text and figures yield their caption (empty when
/// absent). Tables yield one line per row with cells joined by single
/// spaces, preceded by a header line when the table has headers.
fn block_text(block: &Block) -> String {
    match block {
        Block::Text(text) => text.text.clone(),
        Block::Figure(figure) => figure.caption.clone().unwrap_or_default(),
        Block::Table(table) => {
            let header_line = (!table.headers.is_empty()).then(|| table.headers.join(" "));
            header_line
                .into_iter()
                .chain(table.rows.iter().map(|row| row.join(" ")))
                .collect::<Vec<_>>()
                .join("\n")
        }
    }
}

/// Labels a line as `"heading"` or `"paragraph"`.
///
/// A line counts as a heading when it ends with `:` and is shorter than 120
/// characters; everything else is a paragraph.
fn classify_text_line(text: &str) -> String {
    let looks_like_heading = text.ends_with(':') && text.chars().count() < 120;
    let label = if looks_like_heading {
        "heading"
    } else {
        "paragraph"
    };
    String::from(label)
}

fn source_ids_for_line(line: &TextLine) -> Vec<String> {
    let mut ids = Vec::new();
    for run in &line.runs {
        for id in &run.source_object_ids {
            if !ids.contains(id) {
                ids.push(id.clone());
            }
        }
    }
    ids
}

/// Builds a [`SourceAnchor`] for natively extracted PDF content.
///
/// The extraction method is always recorded as `"native_pdf"`.
fn anchor(page_number: usize, bbox: Option<BBox>, pdf_object_ids: Vec<String>) -> SourceAnchor {
    let extraction_method = String::from("native_pdf");
    SourceAnchor {
        page_number,
        pdf_object_ids,
        bbox,
        extraction_method,
    }
}

/// Builds a [`Warning`] from borrowed strings.
///
/// When a page number is given the warning is anchored to that page, with no
/// bounding box and no object ids.
fn warning(code: &str, severity: &str, message: &str, page_number: Option<usize>) -> Warning {
    let source_anchor = page_number.map(|number| anchor(number, None, Vec::new()));
    Warning {
        code: String::from(code),
        severity: String::from(severity),
        message: String::from(message),
        source_anchor,
    }
}

/// Computes the smallest box that encloses every box in `boxes`.
///
/// Returns `None` when `boxes` is empty.
fn union_boxes(boxes: impl IntoIterator<Item = BBox>) -> Option<BBox> {
    let mut remaining = boxes.into_iter();
    let seed = remaining.next()?;
    let initial = (
        seed.x,
        seed.y,
        seed.x + seed.width,
        seed.y + seed.height,
    );

    let (lo_x, lo_y, hi_x, hi_y) =
        remaining.fold(initial, |(lo_x, lo_y, hi_x, hi_y), bbox| {
            (
                lo_x.min(bbox.x),
                lo_y.min(bbox.y),
                hi_x.max(bbox.x + bbox.width),
                hi_y.max(bbox.y + bbox.height),
            )
        });

    Some(BBox {
        x: lo_x,
        y: lo_y,
        width: hi_x - lo_x,
        height: hi_y - lo_y,
    })
}

fn extract_info_string(objects: &[PdfObject], key: &str) -> Option<String> {
    let needle = format!("/{key}");
    objects.iter().find_map(|object| {
        let body = lossy(&object.body);
        if !(body.contains("/Producer") || body.contains("/Creator") || body.contains("/Author")) {
            return None;
        }
        let start = body.find(&needle)?;
        let rest = &object.body[start + needle.len()..];
        let open = rest.iter().position(|byte| *byte == b'(')?;
        let mut parser = ContentParser::new(&rest[open..]);
        match parser.next_operand_or_operator()? {
            ContentToken::Operand(Operand::Literal(bytes)) => Some(decode_pdf_text(&bytes, None)),
            _ => None,
        }
    })
}

/// Extracts the version from the `%PDF-x.y` header.
///
/// Only the bytes before the first line break are inspected. Returns `None`
/// when they are not valid UTF-8 or do not start with `%PDF-`.
fn pdf_version(bytes: &[u8]) -> Option<String> {
    let header_len = bytes
        .iter()
        .position(|&byte| byte == b'\n' || byte == b'\r')
        .unwrap_or(bytes.len());
    let header = std::str::from_utf8(&bytes[..header_len]).ok()?;
    let version = header.strip_prefix("%PDF-")?;
    Some(version.to_owned())
}

fn decode_hex(bytes: &[u8]) -> Vec<u8> {
    let hex = bytes
        .iter()
        .copied()
        .filter(|byte| !is_ws(*byte))
        .collect::<Vec<_>>();
    let mut output = Vec::new();
    let mut index = 0;
    while index < hex.len() {
        let high = hex_value(hex[index]).unwrap_or(0);
        let low = hex
            .get(index + 1)
            .and_then(|byte| hex_value(*byte))
            .unwrap_or(0);
        output.push((high << 4) | low);
        index += 2;
    }
    output
}

/// Returns the numeric value (0-15) of an ASCII hex digit in either case, or
/// `None` for any other byte.
fn hex_value(byte: u8) -> Option<u8> {
    // `to_digit(16)` only ever yields 0..=15, so the narrowing cast is lossless.
    char::from(byte).to_digit(16).map(|digit| digit as u8)
}

/// Parses the run of ASCII digits that starts at `pos`.
///
/// Returns the parsed value together with the index of the first byte after
/// the digits. Yields `None` when there is no digit at `pos` (including when
/// `pos` lies at or beyond the end of `bytes`) or when the value does not fit
/// in `usize`.
fn parse_unsigned_at(bytes: &[u8], pos: usize) -> Option<(usize, usize)> {
    let tail = bytes.get(pos..)?;
    let digit_count = tail
        .iter()
        .take_while(|byte| byte.is_ascii_digit())
        .count();
    if digit_count == 0 {
        return None;
    }

    let digits = std::str::from_utf8(&tail[..digit_count]).ok()?;
    let value = digits.parse::<usize>().ok()?;
    Some((value, pos + digit_count))
}

fn skip_required_ws(bytes: &[u8], mut pos: usize) -> Option<usize> {
    if pos >= bytes.len() || !is_ws(bytes[pos]) {
        return None;
    }
    while pos < bytes.len() && is_ws(bytes[pos]) {
        pos += 1;
    }
    Some(pos)
}

/// Reports whether `pos` is the start of the input or directly follows a
/// line-feed or carriage-return byte.
///
/// Panics when `pos` is greater than `bytes.len()`.
///
/// NOTE(review): despite the name, a preceding space or tab does not count;
/// only line breaks do. Confirm that this is what callers expect.
fn is_ws_or_line_start(bytes: &[u8], pos: usize) -> bool {
    match pos.checked_sub(1) {
        None => true,
        Some(previous) => {
            let byte = bytes[previous];
            byte == b'\n' || byte == b'\r'
        }
    }
}

/// Reports whether `byte` ends a token: whitespace or one of `[`, `]`, `<`,
/// `>`, `/`, `(`, `)`.
fn is_delimiter_or_ws(byte: u8) -> bool {
    const DELIMITERS: &[u8] = b"[]<>/()";
    DELIMITERS.contains(&byte) || is_ws(byte)
}

/// Reports whether `byte` is one of the whitespace bytes recognised here:
/// NUL, tab, line feed, form feed, carriage return or space.
fn is_ws(byte: u8) -> bool {
    const WHITESPACE: [u8; 6] = [0x00, b'\t', b'\n', 0x0c, b'\r', b' '];
    WHITESPACE.contains(&byte)
}

/// Returns the index of the first occurrence of `needle` in `haystack`.
///
/// An empty `needle` matches at index 0, mirroring `str::find`. Without this
/// guard the call to `windows` panicked on a zero window size.
fn find_subslice(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    if needle.is_empty() {
        return Some(0);
    }
    haystack
        .windows(needle.len())
        .position(|window| window == needle)
}

/// Reports whether `name` occurs anywhere in `bytes`.
///
/// This is a plain byte-substring test; a longer name that merely starts
/// with `name` also counts as a match.
fn contains_name(bytes: &[u8], name: &[u8]) -> bool {
    matches!(find_subslice(bytes, name), Some(_))
}

/// Converts bytes to an owned `String`, replacing invalid UTF-8 sequences
/// with U+FFFD.
fn lossy(bytes: &[u8]) -> String {
    match String::from_utf8_lossy(bytes) {
        std::borrow::Cow::Borrowed(valid) => valid.to_owned(),
        std::borrow::Cow::Owned(repaired) => repaired,
    }
}

/// Returns the SHA-256 digest of `bytes` as a lower-case hex string.
#[allow(dead_code)]
fn sha256_hex(bytes: &[u8]) -> String {
    let digest = Sha256::digest(bytes);
    let mut hex = String::with_capacity(digest.len() * 2);
    for byte in digest.iter() {
        hex.push_str(&format!("{byte:02x}"));
    }
    hex
}