// dongler-core 0.3.7
//
// Rust-native PDF and document extraction core for Markdown, LaTeX, and JSON output.
// Documentation
use std::io::Read;
use std::path::Path;

use flate2::read::GzDecoder;

use crate::engine::{split_paragraphs, text_document_from_paragraphs, ExtractionEngine};
use crate::error::{DonglerError, Result};
use crate::ir::Document;
use crate::openxml::read_zip_files;
use crate::source::Source;
use crate::textual::html_to_text;
use serde_json::Value;

/// Stateless extraction engine for ZIP, TAR, and gzip-wrapped inputs.
///
/// The type carries no data; all behaviour lives in its
/// `ExtractionEngine` implementation.
#[derive(Clone, Copy, Debug, Default)]
pub struct ArchiveEngine;

/// One regular-file member of a TAR stream, borrowing its payload from the
/// archive buffer it was parsed out of.
#[derive(Debug)]
struct TarEntry<'data> {
    /// Entry path as recorded in the TAR header.
    name: String,
    /// Raw, undecoded file contents.
    bytes: &'data [u8],
}

impl ExtractionEngine for ArchiveEngine {
    /// Identifier this engine passes along when building documents.
    fn name(&self) -> &'static str {
        "archive-native"
    }

    /// Extracts text paragraphs from an archive-like source.
    ///
    /// The payload is `source.bytes` when present, otherwise the bytes of
    /// `source.content`. Dispatch order:
    ///
    /// 1. A `.zip` path or a `PK\x03\x04` signature routes to the ZIP reader.
    /// 2. Otherwise the payload is gunzipped if it looks like gzip, then
    ///    parsed as TAR.
    /// 3. If TAR parsing fails or yields no regular-file entries, and the
    ///    decoded payload is non-empty and NUL-free, it is treated as a
    ///    single text file.
    ///
    /// # Errors
    ///
    /// Propagates ZIP, gzip, and document-construction errors. A TAR parse
    /// error is returned only when the single-text-file fallback does not
    /// apply.
    fn extract(&self, source: &Source) -> Result<Document> {
        let bytes = source.bytes.as_deref().unwrap_or(source.content.as_bytes());
        let path = source.path.as_deref();

        let zip_by_path = path.map(is_zip_path).unwrap_or(false);
        if zip_by_path || bytes.starts_with(b"PK\x03\x04") {
            return extract_zip_archive(source, self.name(), bytes);
        }

        let archive_bytes = decode_archive_bytes(bytes, path)?;
        let parsed = read_tar_entries(&archive_bytes);

        // Both "parse failed" and "parsed but empty" get the same chance to be
        // read as a lone text file before we commit to the TAR result.
        let has_entries = matches!(&parsed, Ok(found) if !found.is_empty());
        if !has_entries {
            if let Some(text) = single_file_text_payload(&archive_bytes) {
                let paragraphs = split_paragraphs(&text);
                return text_document_from_paragraphs(source, self.name(), paragraphs, None);
            }
        }
        let entries = parsed?;

        let mut paragraphs = Vec::new();
        let text_entries = entries
            .iter()
            .filter(|entry| is_supported_archive_text_entry(&entry.name));
        for entry in text_entries {
            let raw = String::from_utf8_lossy(entry.bytes);
            let normalized = archive_entry_text(&entry.name, &raw);
            paragraphs.extend(split_paragraphs(&normalized));
        }

        text_document_from_paragraphs(source, self.name(), paragraphs, None)
    }
}

/// Interprets `bytes` as the contents of a single text file.
///
/// Returns `None` for an empty buffer or one containing any NUL byte;
/// otherwise returns the bytes decoded as UTF-8, with invalid sequences
/// replaced by U+FFFD.
fn single_file_text_payload(bytes: &[u8]) -> Option<String> {
    let looks_textual = !bytes.is_empty() && !bytes.contains(&0);
    looks_textual.then(|| String::from_utf8_lossy(bytes).into_owned())
}

/// Builds a document from the supported text entries of a ZIP archive.
///
/// Entries come from `read_zip_files` (presumably `(entry name, decoded
/// text)` pairs — confirm against that helper) and are ordered by
/// `archive_entry_sort_key`. Entries whose extension is not a supported text
/// type are skipped; the rest are normalised per entry type and split into
/// paragraphs.
///
/// # Errors
///
/// Propagates failures from `read_zip_files` and from
/// `text_document_from_paragraphs`.
fn extract_zip_archive(source: &Source, engine_name: &str, bytes: &[u8]) -> Result<Document> {
    let mut files = read_zip_files(bytes)?.into_iter().collect::<Vec<_>>();
    // The sort key owns a `String`, so build it once per entry instead of
    // twice per comparison. `sort_by_cached_key` is stable, like `sort_by`.
    files.sort_by_cached_key(|(name, _)| archive_entry_sort_key(name));

    let mut paragraphs = Vec::new();
    for (name, text) in files {
        if !is_supported_archive_text_entry(&name) {
            continue;
        }
        let normalized = archive_entry_text(&name, &text);
        paragraphs.extend(split_paragraphs(&normalized));
    }

    text_document_from_paragraphs(source, engine_name, paragraphs, None)
}

/// Ordering key for archive entries: JSON-like entries rank first (0),
/// XML/HTML-like entries second (1), everything else last (2). The entry
/// name is the tie-breaker within a rank.
fn archive_entry_sort_key(name: &str) -> (u8, String) {
    let rank = match name {
        entry if is_json_entry(entry) => 0,
        entry if is_xml_entry(entry) => 1,
        _ => 2,
    };
    (rank, name.to_owned())
}

/// Converts one entry's raw text into plain text according to its extension.
///
/// XML/HTML-like entries go through `html_to_text`, JSON-like entries
/// through `json_text`, and anything else is returned unchanged.
fn archive_entry_text(name: &str, text: &str) -> String {
    if is_xml_entry(name) {
        return html_to_text(text);
    }
    if is_json_entry(name) {
        return json_text(text);
    }
    text.to_owned()
}

/// Pulls human-readable strings out of JSON or line-delimited JSON.
///
/// If `text` parses as one JSON value, its collected strings are joined with
/// blank lines. Otherwise each non-blank line is parsed on its own and lines
/// that fail to parse are ignored. If no strings are found that way either,
/// the input is returned verbatim.
fn json_text(text: &str) -> String {
    let mut parts = Vec::new();

    if let Ok(document) = serde_json::from_str::<Value>(text) {
        collect_json_text(&document, &mut parts);
        return parts.join("\n\n");
    }

    // Not a single document: fall back to one JSON value per line.
    let records = text
        .lines()
        .map(str::trim)
        .filter(|line| !line.is_empty())
        .filter_map(|line| serde_json::from_str::<Value>(line).ok());
    for record in records {
        collect_json_text(&record, &mut parts);
    }

    if parts.is_empty() {
        return text.to_owned();
    }
    parts.join("\n\n")
}

/// Recursively appends the string content of `value` to `parts`.
///
/// * Strings are whitespace-collapsed and pushed when non-empty.
/// * Arrays are visited element by element.
/// * Objects first try a fixed list of well-known text keys, in order; only
///   when none of them contributes anything are all values visited.
/// * Other value kinds contribute nothing.
fn collect_json_text(value: &Value, parts: &mut Vec<String>) {
    const PREFERRED_KEYS: [&str; 5] = ["title", "abstract", "body_text", "text", "content"];

    match value {
        Value::String(raw) => {
            let collapsed = raw.split_whitespace().collect::<Vec<_>>().join(" ");
            if !collapsed.is_empty() {
                parts.push(collapsed);
            }
        }
        Value::Array(items) => {
            for item in items {
                collect_json_text(item, parts);
            }
        }
        Value::Object(object) => {
            let already_collected = parts.len();
            for key in PREFERRED_KEYS {
                if let Some(child) = object.get(key) {
                    collect_json_text(child, parts);
                }
            }
            // Fall back to every field only if the preferred keys were
            // absent or produced no text.
            if parts.len() == already_collected {
                for child in object.values() {
                    collect_json_text(child, parts);
                }
            }
        }
        _ => {}
    }
}

/// Returns the archive payload with any gzip wrapper removed.
///
/// The input is treated as gzip when `path` has a gzip-style suffix or the
/// bytes start with the gzip magic `1f 8b`; otherwise it is copied unchanged.
///
/// # Errors
///
/// Returns an archive error when gzip decoding fails.
fn decode_archive_bytes(bytes: &[u8], path: Option<&str>) -> Result<Vec<u8>> {
    let gzip_by_name = path.map(is_gzip_path).unwrap_or(false);
    if !gzip_by_name && !bytes.starts_with(&[0x1f, 0x8b]) {
        return Ok(bytes.to_vec());
    }

    let mut decoded = Vec::new();
    GzDecoder::new(bytes)
        .read_to_end(&mut decoded)
        .map_err(|error| DonglerError::archive(format!("gzip decode failed: {error}")))?;
    Ok(decoded)
}

/// Parses an in-memory TAR stream and returns its regular-file entries.
///
/// Headers are read as consecutive 512-byte blocks from offset 0. Parsing
/// stops at the first all-zero header or when fewer than 512 bytes remain.
/// Only entries whose typeflag is NUL or `'0'` are returned; other entry
/// kinds are skipped along with their data blocks.
///
/// # Errors
///
/// Propagates errors from `tar_name` and `tar_octal`, and returns an archive
/// error when an entry declares more data than the buffer still holds.
fn read_tar_entries(bytes: &[u8]) -> Result<Vec<TarEntry<'_>>> {
    let mut entries = Vec::new();
    let mut pos = 0usize;

    while pos + 512 <= bytes.len() {
        let header = &bytes[pos..pos + 512];
        if header.iter().all(|byte| *byte == 0) {
            break;
        }

        let name = tar_name(header)?;
        let declared_size = tar_octal(&header[124..136])?;
        let typeflag = header[156];
        let data_start = pos + 512;

        // Validate the untrusted size in u64 space *before* narrowing it.
        // On 32-bit targets a plain `as usize` would silently truncate a
        // large size and `data_start + size` could wrap past the check.
        // `usize -> u64` is lossless; the loop guard keeps the subtraction
        // from underflowing.
        let available = (bytes.len() - data_start) as u64;
        if declared_size > available {
            return Err(DonglerError::archive("truncated TAR entry data"));
        }
        // Lossless: declared_size <= available <= usize::MAX.
        let size = declared_size as usize;
        let data_end = data_start + size;

        if typeflag == 0 || typeflag == b'0' {
            entries.push(TarEntry {
                name,
                bytes: &bytes[data_start..data_end],
            });
        }
        // Entry data is padded to a whole number of 512-byte blocks.
        pos = data_start + size.div_ceil(512) * 512;
    }

    Ok(entries)
}

/// Reconstructs an entry's path from a TAR header block.
///
/// Reads the name field (bytes 0..100) and the prefix field (bytes
/// 345..500), joining them as `prefix/name` when the prefix is non-empty.
/// `header` must be at least 500 bytes long or the slicing panics.
///
/// # Errors
///
/// Returns an archive error when the resulting name is empty.
fn tar_name(header: &[u8]) -> Result<String> {
    let base = trim_nul_string(&header[0..100]);
    let full_name = match trim_nul_string(&header[345..500]) {
        prefix if prefix.is_empty() => base,
        prefix => format!("{prefix}/{base}"),
    };

    if full_name.is_empty() {
        return Err(DonglerError::archive("TAR entry missing name"));
    }
    Ok(full_name)
}

/// Parses a NUL- or space-padded octal number from a TAR header field.
///
/// A field that is blank after trimming is read as `0`.
///
/// # Errors
///
/// Returns an archive error when the remaining text is not valid base-8.
fn tar_octal(bytes: &[u8]) -> Result<u64> {
    let field = trim_nul_string(bytes);
    match field.trim() {
        "" => Ok(0),
        digits => u64::from_str_radix(digits, 8)
            .map_err(|_| DonglerError::archive("invalid TAR octal field")),
    }
}

/// Decodes a fixed-width, NUL-terminated field into a trimmed `String`.
///
/// Everything from the first NUL byte onward is discarded. The remainder is
/// decoded as UTF-8 (invalid sequences become U+FFFD) and stripped of
/// surrounding whitespace.
fn trim_nul_string(bytes: &[u8]) -> String {
    // `split` always yields at least one (possibly empty) segment: the part
    // before the first NUL, or the whole slice when there is none.
    let before_nul = bytes.split(|byte| *byte == 0).next().unwrap_or(bytes);
    let decoded = String::from_utf8_lossy(before_nul);
    decoded.trim().to_owned()
}

/// Reports whether an archive entry's logical extension is one of the
/// text-like formats this engine extracts.
fn is_supported_archive_text_entry(name: &str) -> bool {
    const TEXT_EXTENSIONS: &[&str] = &[
        "txt", "text", "md", "markdown", "tex", "latex", "ltx", "xml", "nxml", "tei", "html",
        "htm", "json", "jsonl", "ndjson", "csv", "tsv",
    ];

    match logical_extension(name) {
        Some(extension) => TEXT_EXTENSIONS.contains(&extension.as_str()),
        None => false,
    }
}

/// Reports whether an entry's logical extension marks it as XML- or
/// HTML-like markup.
fn is_xml_entry(name: &str) -> bool {
    const MARKUP_EXTENSIONS: &[&str] = &["xml", "nxml", "tei", "html", "htm"];

    match logical_extension(name) {
        Some(extension) => MARKUP_EXTENSIONS.contains(&extension.as_str()),
        None => false,
    }
}

/// Reports whether an entry's logical extension marks it as JSON or
/// line-delimited JSON.
fn is_json_entry(name: &str) -> bool {
    const JSON_EXTENSIONS: &[&str] = &["json", "jsonl", "ndjson"];

    match logical_extension(name) {
        Some(extension) => JSON_EXTENSIONS.contains(&extension.as_str()),
        None => false,
    }
}

/// Returns the lowercase extension that describes a path's content.
///
/// A trailing `.gz` is looked through, so `paper.xml.gz` yields `xml`.
/// Returns `None` when the path has no extension, or when nothing but `.gz`
/// follows the stem (e.g. `archive.gz`).
fn logical_extension(path: &str) -> Option<String> {
    let lowered_extension = |candidate: &Path| -> Option<String> {
        candidate
            .extension()
            .and_then(|extension| extension.to_str())
            .map(|extension| extension.to_ascii_lowercase())
    };

    let outer = Path::new(path);
    match lowered_extension(outer)? {
        wrapper if wrapper == "gz" => {
            // Strip the compression suffix and inspect what is underneath.
            let stem = outer.file_stem()?.to_str()?;
            lowered_extension(Path::new(stem))
        }
        extension => Some(extension),
    }
}

/// Reports whether `path` ends in `.gz` or `.tgz`, ignoring ASCII case.
fn is_gzip_path(path: &str) -> bool {
    let lowered = path.to_ascii_lowercase();
    [".gz", ".tgz"]
        .iter()
        .any(|suffix| lowered.ends_with(*suffix))
}

/// Reports whether `path` ends in `.zip`, ignoring ASCII case.
fn is_zip_path(path: &str) -> bool {
    const SUFFIX: &[u8] = b".zip";

    // Compare the trailing bytes directly; ASCII case folding never touches
    // non-ASCII bytes, so this matches lowercasing the whole path first.
    let raw = path.as_bytes();
    raw.len() >= SUFFIX.len() && raw[raw.len() - SUFFIX.len()..].eq_ignore_ascii_case(SUFFIX)
}