gunzip_split/
lib.rs

1//! Splits concatenated gzip files and decompresses them separately
2//!
3//! This crate contains the main functionality of the `gunzip-split` utility.
4
5use flate2::bufread::GzDecoder;
6use std::borrow::Cow;
7use std::fs::{create_dir, rename, File};
8use std::io::{BufRead, BufReader, Error, ErrorKind, Read, Result, Seek, SeekFrom, Write};
9use std::mem::drop;
10use std::path::Path;
11
12mod sendfile;
13
/// Base size in bytes (1 MiB) of the I/O buffers allocated in this crate
pub(crate) const CHUNK_SIZE: usize = 1 << 20;
15
/// Status events handed to progress callbacks
///
/// Several functions in this crate take a callback and pass it values of
/// this type to tell the calling code what is currently happening.
#[derive(Debug)]
pub enum Progress<'a> {
	/// A new file is about to be processed
	///
	/// - `start`: offset in the input file at which this file begins
	/// - `name`: filename (a placeholder like "file_`counter`" is used when the
	///   metadata does not contain a name)
	FileBegin { start: u64, name: &'a str },
	/// Some work has been done
	///
	/// Documented guarantees for the progress callbacks:
	/// - reported at least once for every file that could be successfully processed
	/// - reported directly after an OS signal interrupted a system call.
	ProgressStep,
	/// The current file could not be processed, without this being fatal for the
	/// whole operation
	///
	/// - `error`: the error that made processing fail
	FileFailed { error: Error },
	/// The current file has been processed successfully
	///
	/// - `end`: offset in the input file at which processing finished
	FileDone { end: u64 },
}
42
/// Description of one constituent file inside the concatenated input
#[derive(Clone, Debug)]
pub struct FileInfo {
	/// Name of the constituent file
	pub filename: String,
	/// Offset in the input at which the constituent file begins
	pub start: u64,
	/// Offset in the input at which the constituent file ends
	pub end: u64,
}
50
/// Replace ASCII control characters with REPLACEMENT CHARACTER
///
/// Control characters may upset terminals when used in e.g. a filename or (in the case of NL)
/// confuse users. Every character in the range U+0000..=U+001F as well as U+007F (DEL) is
/// replaced with U+FFFD.
///
/// The input is returned borrowed (no allocation) when it contains no control character.
#[doc(hidden)]
#[must_use]
#[inline]
pub fn escape_cc(input: &str) -> Cow<'_, str> {
	// `is_ascii_control` covers the whole C0 range *including* U+001F, plus DEL
	let is_control = |c: char| c.is_ascii_control();
	if input.contains(is_control) {
		input.replace(is_control, "\u{FFFD}").into()
	} else {
		input.into()
	}
}
66
67/// Uncompresses one file from the gzip input
68///
69/// The passed `decoder` must already be positioned at the start offset.
70///
71/// Any `/`, `\` or NUL byte in filename will be replaced with an underscore to prevent
72/// directory traversal attacks.
73///
74/// # Errors
75/// If this function encounters any form of I/O or other error except interruption by a signal,
76/// an adequate error variant will be returned. The underlying reader will be positioned near
77/// the offset where the error occured.
78///
79/// Interruption by a signal instead causes `progress_cb` to be called with
80/// [`Progress::ProgressStep`].
81pub fn uncompress_one<R: BufRead>(
82	decoder: &mut GzDecoder<R>,
83	output_directory: &Path,
84	filename: &str,
85	overwrite: bool,
86	mut progress_cb: impl FnMut(Progress),
87) -> Result<()> {
88	let mut buffer = vec![0; CHUNK_SIZE];
89	let filename = filename.replace(['/', '\0', '\\'], "_");
90	let tempfile = format!("{}.part", filename);
91	let outpath = output_directory.join(&tempfile);
92	let finalpath = output_directory.join(&filename);
93
94	// Early bail out if we would overwrite a file
95	if !overwrite && (finalpath.exists() || finalpath.is_symlink()) {
96		return Err(Error::new(
97			ErrorKind::AlreadyExists,
98			format!("file {} already exists", escape_cc(&filename)),
99		));
100	}
101
102	// Decompress the file
103	{
104		// Create .part file
105		let mut outfile = if overwrite {
106			File::create(&outpath)?
107		} else {
108			match File::options().write(true).create_new(true).open(&outpath) {
109				Ok(f) => f,
110				Err(e) => match e.kind() {
111					ErrorKind::AlreadyExists => {
112						return Err(Error::new(
113							ErrorKind::AlreadyExists,
114							format!("file {} already exists", escape_cc(&tempfile)),
115						))
116					}
117					_ => return Err(e),
118				},
119			}
120		};
121
122		// Decompress contents into .part file
123		let mut chunk_size = 0;
124		while match decoder.read(&mut *buffer) {
125			Ok(0) => false,
126			Ok(i) => {
127				chunk_size = i;
128				true
129			}
130			Err(e) => match e.kind() {
131				ErrorKind::Interrupted => true,
132				_ => return Err(e),
133			},
134		} {
135			// Write chunk into file
136			let mut slice = &mut buffer[..chunk_size];
137			while !slice.is_empty() {
138				let count = match outfile.write(slice) {
139					Ok(c) => c,
140					Err(e) => match e.kind() {
141						ErrorKind::Interrupted => 0,
142						_ => return Err(e),
143					},
144				};
145				slice = &mut slice[count..];
146				progress_cb(Progress::ProgressStep);
147			}
148		}
149	}
150
151	// Move to final destination
152	if !overwrite && (finalpath.exists() || finalpath.is_symlink()) {
153		return Err(Error::new(
154			ErrorKind::AlreadyExists,
155			format!("file {} already exists", escape_cc(&filename)),
156		));
157	}
158	rename(outpath, finalpath)?;
159
160	Ok(())
161}
162
163/// Uncompresses all files from the gzip input
164///
165/// The passed `file` must be positioned at the beginning of the input.
166///
167/// # Errors
168/// If this function encounters any form of I/O or other error except interruption by a signal,
169/// an adequate error variant will be returned. The underlying reader will be positioned near
170/// the offset where the error occured.
171///
172/// Interruption by a signal instead causes `progress_cb` to be called with
173/// [`Progress::ProgressStep`].
174pub fn uncompress_all(
175	file: &mut File,
176	output_directory: &Path,
177	overwrite: bool,
178	mut progress_cb: impl FnMut(Progress),
179) -> Result<()> {
180	let mut reader = BufReader::with_capacity(CHUNK_SIZE * 2, file);
181	let mut counter: u64 = 1;
182
183	while !reader.fill_buf()?.is_empty() {
184		let start = reader.stream_position()?;
185		let mut decoder = GzDecoder::new(reader);
186		let filename: String = if let Some(header) = decoder.header() {
187			header.filename().map_or_else(
188				|| format!("file_{}", counter),
189				|bytes| String::from_utf8_lossy(bytes).into(),
190			)
191		} else {
192			let mut reader = decoder.into_inner();
193			// Try to seek back to the exact point where to non-gzip data started
194			drop(reader.seek(SeekFrom::Start(start)));
195			return Ok(());
196		};
197		counter += 1;
198		progress_cb(Progress::FileBegin {
199			start,
200			name: &filename,
201		});
202
203		uncompress_one(
204			&mut decoder,
205			output_directory,
206			&filename,
207			overwrite,
208			&mut progress_cb,
209		)?;
210		reader = decoder.into_inner();
211
212		progress_cb(Progress::FileDone {
213			end: reader.stream_position()?,
214		});
215	}
216	Ok(())
217}
218
219/// Lists all files in the input gzip file
220///
221/// # Errors
222/// If this function encounters any form of I/O or other error except interruption by a signal,
223/// an adequate error variant will be returned. The position in `input` is undefined.
224///
225/// Interruption by a signal instead causes `progress_cb` to be called with
226/// [`Progress::ProgressStep`].
227pub fn list_contents(
228	file: &mut File,
229	mut progress_cb: impl FnMut(Progress),
230) -> Result<Vec<FileInfo>> {
231	let length = file.metadata()?.len();
232	let mut reader = BufReader::with_capacity(CHUNK_SIZE * 2, file);
233	let mut buffer = vec![0; CHUNK_SIZE];
234	let mut files: Vec<FileInfo> = Vec::with_capacity(2);
235
236	while !reader.fill_buf()?.is_empty() {
237		let start = reader.stream_position()?;
238		let mut decoder = GzDecoder::new(reader);
239		// Find header and (if possible) extract filename
240		let filename: String = if let Some(header) = decoder.header() {
241			header.filename().map_or_else(
242				|| format!("file_{}", files.len() + 1),
243				|bytes| String::from_utf8_lossy(bytes).into(),
244			)
245		} else {
246			let mut reader = decoder.into_inner();
247			// Try to seek back to the exact point where to non-gzip data started
248			drop(reader.seek(SeekFrom::Start(start)));
249			return Ok(files);
250		};
251
252		progress_cb(Progress::FileBegin {
253			start,
254			name: &filename,
255		});
256
257		while match decoder.read(&mut *buffer) {
258			Ok(0) => false,
259			Ok(_) => true,
260			Err(e) => {
261				if e.kind() == ErrorKind::Interrupted {
262					true
263				} else {
264					files.push(FileInfo {
265						filename: format!("{}~corrupted", filename),
266						start,
267						end: length,
268					});
269					progress_cb(Progress::FileFailed { error: e });
270					return Ok(files);
271				}
272			}
273		} {
274			progress_cb(Progress::ProgressStep);
275			// skip contents
276		}
277
278		reader = decoder.into_inner();
279		let end = reader.stream_position()?;
280		files.push(FileInfo {
281			filename,
282			start,
283			end,
284		});
285		progress_cb(Progress::FileDone { end });
286	}
287	Ok(files)
288}
289
290/// Copies one gzip file described by `info` from the concatenated gzip input
291///
292/// The filename will be `info.filename` with `.gz` appended. Any `/`, `\`, or NUL byte in
293/// the resulting filename will be replaced with an underscore to prevent directory traversal
294/// attacks.
295///
296/// # Errors
297/// If this function encounters any form of I/O or other error except interruption by a signal,
298/// an adequate error variant will be returned. The position in `input` is undefined.
299///
300/// Interruption by a signal instead causes `progress_cb` to be called with
301/// [`Progress::ProgressStep`].
302pub fn write_one_file(
303	input: &mut File,
304	info: &FileInfo,
305	output_directory: &Path,
306	overwrite: bool,
307	mut progress_cb: impl FnMut(Progress),
308) -> Result<()> {
309	input.seek(SeekFrom::Start(info.start))?;
310	let filename = format!("{}.gz", info.filename.replace(['/', '\0', '\\'], "_"));
311	let tempfile = format!("{}.part", filename);
312	let outpath = output_directory.join(&tempfile);
313	let finalpath = output_directory.join(&filename);
314
315	// Early bail out if we would overwrite a file
316	if !overwrite && (finalpath.exists() || finalpath.is_symlink()) {
317		return Err(Error::new(
318			ErrorKind::AlreadyExists,
319			format!("file {} already exists", escape_cc(&filename)),
320		));
321	}
322
323	// Write into .part file
324	{
325		let mut outfile = if overwrite {
326			File::create(&outpath)?
327		} else {
328			match File::options().write(true).create_new(true).open(&outpath) {
329				Ok(f) => f,
330				Err(e) => match e.kind() {
331					ErrorKind::AlreadyExists => {
332						return Err(Error::new(
333							ErrorKind::AlreadyExists,
334							format!("file {} already exists", escape_cc(&tempfile)),
335						))
336					}
337					_ => return Err(e),
338				},
339			}
340		};
341		let mut range = info.start..info.end;
342		while let Some(r) = sendfile::sendfile(input, range, &mut outfile)? {
343			progress_cb(Progress::ProgressStep);
344			range = r;
345		}
346	}
347
348	// Move to final destination
349	if !overwrite && (finalpath.exists() || finalpath.is_symlink()) {
350		return Err(Error::new(
351			ErrorKind::AlreadyExists,
352			format!("file {} already exists", escape_cc(&filename)),
353		));
354	}
355	rename(outpath, finalpath)?;
356
357	Ok(())
358}
359
360/// Splits the concatenated gzip input into separate files
361///
362/// After successfully returning, `input` will be positioned at the end of the last file.
363///
364/// # Errors
365/// If this function cannot ensure the existence of the output directory, an error will be returned.
366///
367/// Any form of I/O or other error encountered while extracting one file,
368/// causes `progress_cb` to be called with [`Progress::FileFailed`].
369///
370/// Interruption by a signal instead causes `progress_cb` to be called with
371/// [`Progress::ProgressStep`].
372pub fn unconcatenate_files(
373	input: &mut File,
374	infos: &[FileInfo],
375	output_directory: &Path,
376	overwrite: bool,
377	mut progress_cb: impl FnMut(Progress),
378) -> Result<()> {
379	// Try to create output directory
380	if let Err(e) = create_dir(output_directory) {
381		if e.kind() != ErrorKind::AlreadyExists {
382			return Err(e);
383		}
384	}
385
386	// Write files
387	for info in infos {
388		progress_cb(Progress::FileBegin {
389			start: info.start,
390			name: &info.filename,
391		});
392		if let Err(e) = write_one_file(input, info, output_directory, overwrite, &mut progress_cb) {
393			progress_cb(Progress::FileFailed { error: e });
394		} else {
395			progress_cb(Progress::FileDone { end: info.end });
396		}
397	}
398
399	// Seek after the last file for correct error position information about corrupted tail data
400	// even if native sendfile was used. Ignore errors while seeking.
401	let len = infos.len();
402	if len > 0 {
403		drop(input.seek(SeekFrom::Start(infos[len - 1].end)));
404	}
405
406	Ok(())
407}