Skip to main content

dongler_core/
archive.rs

1use std::io::Read;
2use std::path::Path;
3
4use flate2::read::GzDecoder;
5
6use crate::engine::{split_paragraphs, text_document_from_paragraphs, ExtractionEngine};
7use crate::error::{DonglerError, Result};
8use crate::ir::Document;
9use crate::openxml::read_zip_files;
10use crate::source::Source;
11use crate::textual::html_to_text;
12use serde_json::Value;
13
/// Extraction engine for archive inputs (ZIP, TAR, and gzip-compressed payloads).
///
/// The type is a stateless unit struct; all behaviour lives in its
/// `ExtractionEngine` implementation.
#[derive(Clone, Copy, Debug, Default)]
pub struct ArchiveEngine;
16
/// One file member read from a TAR archive.
///
/// The payload is borrowed from the archive buffer rather than copied.
#[derive(Debug)]
struct TarEntry<'data> {
    /// Entry path as assembled from the header fields.
    name: String,
    /// Raw file contents, sliced out of the archive bytes.
    bytes: &'data [u8],
}
22
23impl ExtractionEngine for ArchiveEngine {
24    fn name(&self) -> &'static str {
25        "archive-native"
26    }
27
28    fn extract(&self, source: &Source) -> Result<Document> {
29        let bytes = source.bytes.as_deref().unwrap_or(source.content.as_bytes());
30        if source.path.as_deref().map(is_zip_path).unwrap_or(false)
31            || bytes.starts_with(b"PK\x03\x04")
32        {
33            return extract_zip_archive(source, self.name(), bytes);
34        }
35
36        let archive_bytes = decode_archive_bytes(bytes, source.path.as_deref())?;
37        let entries = match read_tar_entries(&archive_bytes) {
38            Ok(entries) if !entries.is_empty() => entries,
39            Ok(entries) => {
40                if let Some(text) = single_file_text_payload(&archive_bytes) {
41                    let paragraphs = split_paragraphs(&text);
42                    return text_document_from_paragraphs(source, self.name(), paragraphs, None);
43                }
44                entries
45            }
46            Err(error) => {
47                if let Some(text) = single_file_text_payload(&archive_bytes) {
48                    let paragraphs = split_paragraphs(&text);
49                    return text_document_from_paragraphs(source, self.name(), paragraphs, None);
50                }
51                return Err(error);
52            }
53        };
54        let mut paragraphs = Vec::new();
55
56        for entry in entries {
57            if !is_supported_archive_text_entry(&entry.name) {
58                continue;
59            }
60            let text = String::from_utf8_lossy(entry.bytes);
61            let normalized = archive_entry_text(&entry.name, &text);
62            paragraphs.extend(split_paragraphs(&normalized));
63        }
64
65        text_document_from_paragraphs(source, self.name(), paragraphs, None)
66    }
67}
68
/// Interprets `bytes` as a standalone text file.
///
/// Returns `None` for empty input or input containing a NUL byte (taken as a
/// sign of binary data); otherwise returns the lossily UTF-8-decoded text.
fn single_file_text_payload(bytes: &[u8]) -> Option<String> {
    let looks_textual = !bytes.is_empty() && !bytes.contains(&0);
    looks_textual.then(|| String::from_utf8_lossy(bytes).into_owned())
}
75
76fn extract_zip_archive(source: &Source, engine_name: &str, bytes: &[u8]) -> Result<Document> {
77    let mut files = read_zip_files(bytes)?.into_iter().collect::<Vec<_>>();
78    files.sort_by(|left, right| {
79        archive_entry_sort_key(&left.0).cmp(&archive_entry_sort_key(&right.0))
80    });
81    let mut paragraphs = Vec::new();
82
83    for (name, text) in files {
84        if !is_supported_archive_text_entry(&name) {
85            continue;
86        }
87        let normalized = archive_entry_text(&name, &text);
88        paragraphs.extend(split_paragraphs(&normalized));
89    }
90
91    text_document_from_paragraphs(source, engine_name, paragraphs, None)
92}
93
94fn archive_entry_sort_key(name: &str) -> (u8, String) {
95    let priority = if is_json_entry(name) {
96        0
97    } else if is_xml_entry(name) {
98        1
99    } else {
100        2
101    };
102    (priority, name.to_owned())
103}
104
105fn archive_entry_text(name: &str, text: &str) -> String {
106    if is_xml_entry(name) {
107        html_to_text(text)
108    } else if is_json_entry(name) {
109        json_text(text)
110    } else {
111        text.to_owned()
112    }
113}
114
115fn json_text(text: &str) -> String {
116    match serde_json::from_str::<Value>(text) {
117        Ok(value) => {
118            let mut parts = Vec::new();
119            collect_json_text(&value, &mut parts);
120            parts.join("\n\n")
121        }
122        Err(_) => {
123            let mut parts = Vec::new();
124            for line in text.lines() {
125                let trimmed = line.trim();
126                if trimmed.is_empty() {
127                    continue;
128                }
129                if let Ok(value) = serde_json::from_str::<Value>(trimmed) {
130                    collect_json_text(&value, &mut parts);
131                }
132            }
133            if parts.is_empty() {
134                text.to_owned()
135            } else {
136                parts.join("\n\n")
137            }
138        }
139    }
140}
141
142fn collect_json_text(value: &Value, parts: &mut Vec<String>) {
143    match value {
144        Value::Object(object) => {
145            let before = parts.len();
146            for key in ["title", "abstract", "body_text", "text", "content"] {
147                if let Some(child) = object.get(key) {
148                    collect_json_text(child, parts);
149                }
150            }
151            if parts.len() != before {
152                return;
153            }
154            for child in object.values() {
155                collect_json_text(child, parts);
156            }
157        }
158        Value::Array(items) => {
159            for item in items {
160                collect_json_text(item, parts);
161            }
162        }
163        Value::String(text) => {
164            let text = text.split_whitespace().collect::<Vec<_>>().join(" ");
165            if !text.is_empty() {
166                parts.push(text);
167            }
168        }
169        _ => {}
170    }
171}
172
173fn decode_archive_bytes(bytes: &[u8], path: Option<&str>) -> Result<Vec<u8>> {
174    if path.map(is_gzip_path).unwrap_or(false) || bytes.starts_with(&[0x1f, 0x8b]) {
175        let mut decoder = GzDecoder::new(bytes);
176        let mut decoded = Vec::new();
177        decoder
178            .read_to_end(&mut decoded)
179            .map_err(|error| DonglerError::archive(format!("gzip decode failed: {error}")))?;
180        Ok(decoded)
181    } else {
182        Ok(bytes.to_vec())
183    }
184}
185
186fn read_tar_entries(bytes: &[u8]) -> Result<Vec<TarEntry<'_>>> {
187    let mut entries = Vec::new();
188    let mut pos = 0usize;
189
190    while pos + 512 <= bytes.len() {
191        let header = &bytes[pos..pos + 512];
192        if header.iter().all(|byte| *byte == 0) {
193            break;
194        }
195
196        let name = tar_name(header)?;
197        let size = tar_octal(&header[124..136])? as usize;
198        let typeflag = header[156];
199        let data_start = pos + 512;
200        let data_end = data_start + size;
201        if data_end > bytes.len() {
202            return Err(DonglerError::archive("truncated TAR entry data"));
203        }
204        if typeflag == 0 || typeflag == b'0' {
205            entries.push(TarEntry {
206                name,
207                bytes: &bytes[data_start..data_end],
208            });
209        }
210        pos = data_start + size.div_ceil(512) * 512;
211    }
212
213    Ok(entries)
214}
215
216fn tar_name(header: &[u8]) -> Result<String> {
217    let name = trim_nul_string(&header[0..100]);
218    let prefix = trim_nul_string(&header[345..500]);
219    let full_name = if prefix.is_empty() {
220        name
221    } else {
222        format!("{prefix}/{name}")
223    };
224    if full_name.is_empty() {
225        Err(DonglerError::archive("TAR entry missing name"))
226    } else {
227        Ok(full_name)
228    }
229}
230
231fn tar_octal(bytes: &[u8]) -> Result<u64> {
232    let text = trim_nul_string(bytes);
233    let text = text.trim();
234    if text.is_empty() {
235        return Ok(0);
236    }
237    u64::from_str_radix(text, 8).map_err(|_| DonglerError::archive("invalid TAR octal field"))
238}
239
/// Decodes a fixed-width header field: everything before the first NUL byte,
/// lossily decoded as UTF-8, with surrounding whitespace removed.
fn trim_nul_string(bytes: &[u8]) -> String {
    // `split` always yields at least one (possibly empty) segment.
    let field = bytes.split(|byte| *byte == 0).next().unwrap_or(bytes);
    String::from_utf8_lossy(field).trim().to_owned()
}
247
248fn is_supported_archive_text_entry(name: &str) -> bool {
249    matches!(
250        logical_extension(name).as_deref(),
251        Some(
252            "txt"
253                | "text"
254                | "md"
255                | "markdown"
256                | "tex"
257                | "latex"
258                | "ltx"
259                | "xml"
260                | "nxml"
261                | "tei"
262                | "html"
263                | "htm"
264                | "json"
265                | "jsonl"
266                | "ndjson"
267                | "csv"
268                | "tsv"
269        )
270    )
271}
272
273fn is_xml_entry(name: &str) -> bool {
274    matches!(
275        logical_extension(name).as_deref(),
276        Some("xml" | "nxml" | "tei" | "html" | "htm")
277    )
278}
279
280fn is_json_entry(name: &str) -> bool {
281    matches!(
282        logical_extension(name).as_deref(),
283        Some("json" | "jsonl" | "ndjson")
284    )
285}
286
/// Returns the lowercase extension that describes the file's content.
///
/// A trailing `.gz` is looked through, so `notes.txt.gz` yields `txt`. A name
/// with no extension, or with nothing but `.gz`, yields `None`.
fn logical_extension(path: &str) -> Option<String> {
    let lowered_extension = |candidate: &Path| -> Option<String> {
        candidate
            .extension()?
            .to_str()
            .map(str::to_ascii_lowercase)
    };

    let outer = Path::new(path);
    let extension = lowered_extension(outer)?;
    if extension == "gz" {
        // Peel the compression suffix and inspect what is underneath.
        let stem = outer.file_stem()?.to_str()?;
        lowered_extension(Path::new(stem))
    } else {
        Some(extension)
    }
}
298
/// Reports whether `path` ends in `.gz` or `.tgz`, ignoring ASCII case.
fn is_gzip_path(path: &str) -> bool {
    let lowered = path.to_ascii_lowercase();
    [".gz", ".tgz"]
        .iter()
        .any(|suffix| lowered.ends_with(suffix))
}
302
/// Reports whether `path` ends in `.zip`, ignoring ASCII case.
fn is_zip_path(path: &str) -> bool {
    const SUFFIX: &[u8] = b".zip";

    // Compare the trailing bytes directly; this matches an ASCII-lowercased
    // `ends_with` without allocating a lowered copy of the path.
    let raw = path.as_bytes();
    raw.len() >= SUFFIX.len() && raw[raw.len() - SUFFIX.len()..].eq_ignore_ascii_case(SUFFIX)
}