1use std::io::Read;
2use std::path::Path;
3
4use flate2::read::GzDecoder;
5
6use crate::engine::{split_paragraphs, text_document_from_paragraphs, ExtractionEngine};
7use crate::error::{DonglerError, Result};
8use crate::ir::Document;
9use crate::openxml::read_zip_files;
10use crate::source::Source;
11use crate::textual::html_to_text;
12use serde_json::Value;
13
/// Stateless extraction engine for archive-style inputs.
///
/// The type carries no data; all behavior lives in its `ExtractionEngine`
/// implementation in this file.
#[derive(Clone, Copy, Debug, Default)]
pub struct ArchiveEngine;
16
/// One member located inside an in-memory TAR image.
///
/// The payload is a borrowed view into the archive buffer, so an entry cannot
/// outlive the bytes it was parsed from.
#[derive(Debug)]
struct TarEntry<'buf> {
    /// Member name as assembled from the header's name (and prefix) fields.
    name: String,
    /// Raw member contents, borrowed from the archive buffer.
    bytes: &'buf [u8],
}
22
23impl ExtractionEngine for ArchiveEngine {
24 fn name(&self) -> &'static str {
25 "archive-native"
26 }
27
28 fn extract(&self, source: &Source) -> Result<Document> {
29 let bytes = source.bytes.as_deref().unwrap_or(source.content.as_bytes());
30 if source.path.as_deref().map(is_zip_path).unwrap_or(false)
31 || bytes.starts_with(b"PK\x03\x04")
32 {
33 return extract_zip_archive(source, self.name(), bytes);
34 }
35
36 let archive_bytes = decode_archive_bytes(bytes, source.path.as_deref())?;
37 let entries = match read_tar_entries(&archive_bytes) {
38 Ok(entries) if !entries.is_empty() => entries,
39 Ok(entries) => {
40 if let Some(text) = single_file_text_payload(&archive_bytes) {
41 let paragraphs = split_paragraphs(&text);
42 return text_document_from_paragraphs(source, self.name(), paragraphs, None);
43 }
44 entries
45 }
46 Err(error) => {
47 if let Some(text) = single_file_text_payload(&archive_bytes) {
48 let paragraphs = split_paragraphs(&text);
49 return text_document_from_paragraphs(source, self.name(), paragraphs, None);
50 }
51 return Err(error);
52 }
53 };
54 let mut paragraphs = Vec::new();
55
56 for entry in entries {
57 if !is_supported_archive_text_entry(&entry.name) {
58 continue;
59 }
60 let text = String::from_utf8_lossy(entry.bytes);
61 let normalized = archive_entry_text(&entry.name, &text);
62 paragraphs.extend(split_paragraphs(&normalized));
63 }
64
65 text_document_from_paragraphs(source, self.name(), paragraphs, None)
66 }
67}
68
/// Interprets `bytes` as the contents of a single text file.
///
/// Returns `None` for empty input or input containing a NUL byte (treated as
/// binary). Otherwise returns the bytes decoded as UTF-8, with invalid
/// sequences replaced by U+FFFD.
fn single_file_text_payload(bytes: &[u8]) -> Option<String> {
    let looks_textual = !bytes.is_empty() && !bytes.contains(&0);
    looks_textual.then(|| String::from_utf8_lossy(bytes).into_owned())
}
75
76fn extract_zip_archive(source: &Source, engine_name: &str, bytes: &[u8]) -> Result<Document> {
77 let mut files = read_zip_files(bytes)?.into_iter().collect::<Vec<_>>();
78 files.sort_by(|left, right| {
79 archive_entry_sort_key(&left.0).cmp(&archive_entry_sort_key(&right.0))
80 });
81 let mut paragraphs = Vec::new();
82
83 for (name, text) in files {
84 if !is_supported_archive_text_entry(&name) {
85 continue;
86 }
87 let normalized = archive_entry_text(&name, &text);
88 paragraphs.extend(split_paragraphs(&normalized));
89 }
90
91 text_document_from_paragraphs(source, engine_name, paragraphs, None)
92}
93
94fn archive_entry_sort_key(name: &str) -> (u8, String) {
95 let priority = if is_json_entry(name) {
96 0
97 } else if is_xml_entry(name) {
98 1
99 } else {
100 2
101 };
102 (priority, name.to_owned())
103}
104
105fn archive_entry_text(name: &str, text: &str) -> String {
106 if is_xml_entry(name) {
107 html_to_text(text)
108 } else if is_json_entry(name) {
109 json_text(text)
110 } else {
111 text.to_owned()
112 }
113}
114
115fn json_text(text: &str) -> String {
116 match serde_json::from_str::<Value>(text) {
117 Ok(value) => {
118 let mut parts = Vec::new();
119 collect_json_text(&value, &mut parts);
120 parts.join("\n\n")
121 }
122 Err(_) => {
123 let mut parts = Vec::new();
124 for line in text.lines() {
125 let trimmed = line.trim();
126 if trimmed.is_empty() {
127 continue;
128 }
129 if let Ok(value) = serde_json::from_str::<Value>(trimmed) {
130 collect_json_text(&value, &mut parts);
131 }
132 }
133 if parts.is_empty() {
134 text.to_owned()
135 } else {
136 parts.join("\n\n")
137 }
138 }
139 }
140}
141
142fn collect_json_text(value: &Value, parts: &mut Vec<String>) {
143 match value {
144 Value::Object(object) => {
145 let before = parts.len();
146 for key in ["title", "abstract", "body_text", "text", "content"] {
147 if let Some(child) = object.get(key) {
148 collect_json_text(child, parts);
149 }
150 }
151 if parts.len() != before {
152 return;
153 }
154 for child in object.values() {
155 collect_json_text(child, parts);
156 }
157 }
158 Value::Array(items) => {
159 for item in items {
160 collect_json_text(item, parts);
161 }
162 }
163 Value::String(text) => {
164 let text = text.split_whitespace().collect::<Vec<_>>().join(" ");
165 if !text.is_empty() {
166 parts.push(text);
167 }
168 }
169 _ => {}
170 }
171}
172
173fn decode_archive_bytes(bytes: &[u8], path: Option<&str>) -> Result<Vec<u8>> {
174 if path.map(is_gzip_path).unwrap_or(false) || bytes.starts_with(&[0x1f, 0x8b]) {
175 let mut decoder = GzDecoder::new(bytes);
176 let mut decoded = Vec::new();
177 decoder
178 .read_to_end(&mut decoded)
179 .map_err(|error| DonglerError::archive(format!("gzip decode failed: {error}")))?;
180 Ok(decoded)
181 } else {
182 Ok(bytes.to_vec())
183 }
184}
185
186fn read_tar_entries(bytes: &[u8]) -> Result<Vec<TarEntry<'_>>> {
187 let mut entries = Vec::new();
188 let mut pos = 0usize;
189
190 while pos + 512 <= bytes.len() {
191 let header = &bytes[pos..pos + 512];
192 if header.iter().all(|byte| *byte == 0) {
193 break;
194 }
195
196 let name = tar_name(header)?;
197 let size = tar_octal(&header[124..136])? as usize;
198 let typeflag = header[156];
199 let data_start = pos + 512;
200 let data_end = data_start + size;
201 if data_end > bytes.len() {
202 return Err(DonglerError::archive("truncated TAR entry data"));
203 }
204 if typeflag == 0 || typeflag == b'0' {
205 entries.push(TarEntry {
206 name,
207 bytes: &bytes[data_start..data_end],
208 });
209 }
210 pos = data_start + size.div_ceil(512) * 512;
211 }
212
213 Ok(entries)
214}
215
216fn tar_name(header: &[u8]) -> Result<String> {
217 let name = trim_nul_string(&header[0..100]);
218 let prefix = trim_nul_string(&header[345..500]);
219 let full_name = if prefix.is_empty() {
220 name
221 } else {
222 format!("{prefix}/{name}")
223 };
224 if full_name.is_empty() {
225 Err(DonglerError::archive("TAR entry missing name"))
226 } else {
227 Ok(full_name)
228 }
229}
230
231fn tar_octal(bytes: &[u8]) -> Result<u64> {
232 let text = trim_nul_string(bytes);
233 let text = text.trim();
234 if text.is_empty() {
235 return Ok(0);
236 }
237 u64::from_str_radix(text, 8).map_err(|_| DonglerError::archive("invalid TAR octal field"))
238}
239
/// Decodes a NUL-terminated header field into a trimmed string.
///
/// Everything from the first NUL byte onward is ignored. The remainder is
/// decoded as UTF-8 (invalid sequences become U+FFFD) and stripped of
/// surrounding whitespace.
fn trim_nul_string(bytes: &[u8]) -> String {
    // `split` always yields at least one segment: the bytes before the first NUL.
    let field = bytes.split(|byte| *byte == 0).next().unwrap_or(bytes);
    String::from_utf8_lossy(field).trim().to_string()
}
247
248fn is_supported_archive_text_entry(name: &str) -> bool {
249 matches!(
250 logical_extension(name).as_deref(),
251 Some(
252 "txt"
253 | "text"
254 | "md"
255 | "markdown"
256 | "tex"
257 | "latex"
258 | "ltx"
259 | "xml"
260 | "nxml"
261 | "tei"
262 | "html"
263 | "htm"
264 | "json"
265 | "jsonl"
266 | "ndjson"
267 | "csv"
268 | "tsv"
269 )
270 )
271}
272
273fn is_xml_entry(name: &str) -> bool {
274 matches!(
275 logical_extension(name).as_deref(),
276 Some("xml" | "nxml" | "tei" | "html" | "htm")
277 )
278}
279
280fn is_json_entry(name: &str) -> bool {
281 matches!(
282 logical_extension(name).as_deref(),
283 Some("json" | "jsonl" | "ndjson")
284 )
285}
286
/// Returns the lowercase extension that describes a path's content.
///
/// For most paths this is simply the final extension. When the final
/// extension is `gz`, the extension underneath it is returned instead
/// (`paper.xml.gz` -> `xml`). Returns `None` when no such extension exists.
fn logical_extension(path: &str) -> Option<String> {
    let outer = Path::new(path);
    let last = outer
        .extension()
        .and_then(|ext| ext.to_str())?
        .to_ascii_lowercase();

    if last == "gz" {
        // Look through the compression suffix at the stem's own extension.
        let stem = outer.file_stem().and_then(|stem| stem.to_str())?;
        let inner = Path::new(stem).extension()?.to_str()?;
        Some(inner.to_ascii_lowercase())
    } else {
        Some(last)
    }
}
298
/// Reports whether `path` ends in `.gz` or `.tgz`, ignoring ASCII case.
fn is_gzip_path(path: &str) -> bool {
    let lowered = path.to_ascii_lowercase();
    [".gz", ".tgz"]
        .iter()
        .any(|suffix| lowered.ends_with(*suffix))
}
302
/// Reports whether `path` ends in `.zip`, ignoring ASCII case.
fn is_zip_path(path: &str) -> bool {
    const SUFFIX: &[u8] = b".zip";

    // Comparing the trailing bytes case-insensitively is equivalent to
    // lowercasing the whole path first, since only ASCII letters are folded.
    let raw = path.as_bytes();
    raw.len() >= SUFFIX.len() && raw[raw.len() - SUFFIX.len()..].eq_ignore_ascii_case(SUFFIX)
}