use crate::error::{ArchiveError, Result};
use crate::format::ArchiveFormat;
use std::io::{Cursor, Read};
/// One entry pulled out of an archive and held entirely in memory.
#[derive(Debug, Clone)]
pub struct ExtractedFile {
    /// Entry name as recorded in the archive; single-stream formats without
    /// a stored name use the placeholder `"data"`.
    pub path: String,
    /// Entry contents; left empty for directory entries.
    pub data: Vec<u8>,
    /// `true` when the archive marks this entry as a directory.
    pub is_directory: bool,
}
/// Extracts archives into memory while enforcing size limits.
#[derive(Debug, Clone)]
pub struct ArchiveExtractor {
    /// Largest size, in bytes, accepted for any single extracted file.
    max_file_size: usize,
    /// Largest cumulative size, in bytes, accepted across all files of one
    /// archive.
    max_total_size: usize,
}
impl Default for ArchiveExtractor {
    /// Builds an extractor limited to 100 MiB per file and 1 GiB per archive.
    fn default() -> Self {
        const MIB: usize = 1024 * 1024;
        const GIB: usize = 1024 * MIB;

        let max_file_size = 100 * MIB;
        let max_total_size = GIB;
        Self {
            max_file_size,
            max_total_size,
        }
    }
}
impl ArchiveExtractor {
    /// Creates an extractor using the limits provided by `Default`.
    pub fn new() -> Self {
        Self::default()
    }

    /// Sets the per-file size limit, in bytes, and returns the updated extractor.
    pub fn with_max_file_size(mut self, size: usize) -> Self {
        self.max_file_size = size;
        self
    }

    /// Sets the cumulative size limit, in bytes, and returns the updated extractor.
    pub fn with_max_total_size(mut self, size: usize) -> Self {
        self.max_total_size = size;
        self
    }

    /// Extracts every entry of `data`, interpreted as `format`, into memory.
    ///
    /// # Errors
    ///
    /// Returns `ArchiveError::FileTooLarge` or `ArchiveError::TotalSizeTooLarge`
    /// when a configured limit is exceeded, and otherwise propagates whatever
    /// error the format-specific decoder reports.
    ///
    /// When a limit is tripped while reading (as opposed to by a size declared
    /// in a header), the `size` carried by the error is the number of bytes
    /// seen when extraction stopped, not the full size of the payload.
    pub fn extract(&self, data: &[u8], format: ArchiveFormat) -> Result<Vec<ExtractedFile>> {
        match format {
            ArchiveFormat::Zip => self.extract_zip(data),
            ArchiveFormat::Tar => self.extract_tar(data),
            ArchiveFormat::Ar => self.extract_ar(data),
            ArchiveFormat::Deb => self.extract_deb(data),
            ArchiveFormat::TarGz => self.extract_tar_gz(data),
            ArchiveFormat::TarBz2 => self.extract_tar_bz2(data),
            ArchiveFormat::TarXz => self.extract_tar_xz(data),
            ArchiveFormat::TarZst => self.extract_tar_zst(data),
            ArchiveFormat::TarLz4 => self.extract_tar_lz4(data),
            ArchiveFormat::SevenZ => self.extract_7z(data),
            ArchiveFormat::Gz => self.extract_single_gz(data),
            ArchiveFormat::Bz2 => self.extract_single_bz2(data),
            ArchiveFormat::Xz => self.extract_single_xz(data),
            ArchiveFormat::Lz4 => self.extract_single_lz4(data),
            ArchiveFormat::Zst => self.extract_single_zst(data),
        }
    }

    /// Narrows a header-declared size to `usize`, saturating instead of
    /// truncating so that an oversized value can never wrap around to a small
    /// one and slip past the limit checks on 32-bit targets.
    fn declared_size(size: u64) -> usize {
        <usize as std::convert::TryFrom<u64>>::try_from(size).unwrap_or(usize::MAX)
    }

    /// Fails with `FileTooLarge` when `size` exceeds the per-file limit.
    fn check_file_size(&self, size: usize) -> Result<()> {
        if size > self.max_file_size {
            return Err(ArchiveError::FileTooLarge {
                size,
                limit: self.max_file_size,
            });
        }
        Ok(())
    }

    /// Adds `size` to the running total and fails with `TotalSizeTooLarge`
    /// when the cumulative limit is exceeded.
    fn charge_total(&self, size: usize, total_size: &mut usize) -> Result<()> {
        // Saturate: an overflowing sum is by definition over any limit.
        *total_size = total_size.saturating_add(size);
        if *total_size > self.max_total_size {
            return Err(ArchiveError::TotalSizeTooLarge {
                size: *total_size,
                limit: self.max_total_size,
            });
        }
        Ok(())
    }

    /// Pre-read check: validates the size an entry header claims and charges
    /// it against the running total, so obviously oversized entries are
    /// rejected before any payload is read.
    fn admit(&self, declared: usize, total_size: &mut usize) -> Result<()> {
        self.check_file_size(declared)?;
        self.charge_total(declared, total_size)
    }

    /// Post-read check: validates the number of bytes actually produced and
    /// charges anything beyond the declared size against the running total.
    fn settle(&self, declared: usize, actual: usize, total_size: &mut usize) -> Result<()> {
        self.check_file_size(actual)?;
        self.charge_total(actual.saturating_sub(declared), total_size)
    }

    /// Reads `reader` to the end, but never more than `max_file_size + 1`
    /// bytes, so an oversized payload is detectable without being buffered
    /// in full.
    ///
    /// Returns a plain `io::Result` so callers can convert the I/O error into
    /// whichever error type their context requires.
    fn read_capped<R: Read>(&self, reader: R) -> std::io::Result<Vec<u8>> {
        let cap = <u64 as std::convert::TryFrom<usize>>::try_from(self.max_file_size)
            .unwrap_or(u64::MAX)
            .saturating_add(1);
        let mut contents = Vec::new();
        reader.take(cap).read_to_end(&mut contents)?;
        Ok(contents)
    }

    /// Reads one archive entry while enforcing both limits against the
    /// declared size (before reading) and the actual size (after reading).
    fn read_entry<R: Read>(
        &self,
        reader: R,
        declared: u64,
        total_size: &mut usize,
    ) -> Result<Vec<u8>> {
        let declared = Self::declared_size(declared);
        self.admit(declared, total_size)?;
        let contents = self.read_capped(reader)?;
        // Header sizes come from the archive and cannot be trusted on their own.
        self.settle(declared, contents.len(), total_size)?;
        Ok(contents)
    }

    /// Reads a single compressed stream, enforcing the per-file limit on the
    /// decompressed output without buffering more than the limit allows.
    fn read_single_stream<R: Read>(&self, reader: R) -> Result<Vec<u8>> {
        let contents = self.read_capped(reader)?;
        self.check_file_size(contents.len())?;
        Ok(contents)
    }

    /// Wraps the output of a single-stream format as a one-element result.
    fn single_file(path: String, data: Vec<u8>) -> Vec<ExtractedFile> {
        vec![ExtractedFile {
            path,
            data,
            is_directory: false,
        }]
    }

    /// Extracts every entry of a zip archive.
    fn extract_zip(&self, data: &[u8]) -> Result<Vec<ExtractedFile>> {
        let mut archive = zip::ZipArchive::new(Cursor::new(data))?;
        let mut files = Vec::new();
        let mut total_size = 0usize;
        for index in 0..archive.len() {
            let mut file = archive.by_index(index)?;
            let is_directory = file.is_dir();
            let contents = if is_directory {
                Vec::new()
            } else {
                let declared = file.size();
                self.read_entry(&mut file, declared, &mut total_size)?
            };
            files.push(ExtractedFile {
                path: file.name().to_string(),
                data: contents,
                is_directory,
            });
        }
        Ok(files)
    }

    /// Extracts an uncompressed tar archive.
    fn extract_tar(&self, data: &[u8]) -> Result<Vec<ExtractedFile>> {
        let mut archive = tar::Archive::new(Cursor::new(data));
        self.process_tar_entries(&mut archive)
    }

    /// Extracts the members of an `ar` archive.
    fn extract_ar(&self, data: &[u8]) -> Result<Vec<ExtractedFile>> {
        let mut archive = ar::Archive::new(Cursor::new(data));
        self.process_ar_entries(&mut archive)
    }

    /// Extracts a `.deb` by reading it as an `ar` archive; its members are
    /// returned as stored and are not unpacked any further.
    fn extract_deb(&self, data: &[u8]) -> Result<Vec<ExtractedFile>> {
        let mut archive = ar::Archive::new(Cursor::new(data));
        self.process_ar_entries(&mut archive)
    }

    /// Extracts a gzip-compressed tar archive.
    fn extract_tar_gz(&self, data: &[u8]) -> Result<Vec<ExtractedFile>> {
        let decoder = flate2::read::GzDecoder::new(Cursor::new(data));
        let mut archive = tar::Archive::new(decoder);
        self.process_tar_entries(&mut archive)
    }

    /// Extracts a bzip2-compressed tar archive.
    fn extract_tar_bz2(&self, data: &[u8]) -> Result<Vec<ExtractedFile>> {
        let decoder = bzip2::read::BzDecoder::new(Cursor::new(data));
        let mut archive = tar::Archive::new(decoder);
        self.process_tar_entries(&mut archive)
    }

    /// Extracts an xz-compressed tar archive.
    fn extract_tar_xz(&self, data: &[u8]) -> Result<Vec<ExtractedFile>> {
        // NOTE(review): `xz_decompress` is a one-shot call that writes the whole
        // tar stream into memory before any entry limit is applied. Capping this
        // buffer needs a policy decision (tar framing makes the stream larger
        // than the payload total), so it is left unbounded here.
        let mut tar_bytes = Vec::new();
        lzma_rs::xz_decompress(&mut Cursor::new(data), &mut tar_bytes)
            .map_err(|e| ArchiveError::InvalidArchive(e.to_string()))?;
        let mut archive = tar::Archive::new(Cursor::new(tar_bytes));
        self.process_tar_entries(&mut archive)
    }

    /// Extracts a zstd-compressed tar archive.
    fn extract_tar_zst(&self, data: &[u8]) -> Result<Vec<ExtractedFile>> {
        let decoder = zstd::stream::read::Decoder::new(Cursor::new(data))?;
        let mut archive = tar::Archive::new(decoder);
        self.process_tar_entries(&mut archive)
    }

    /// Extracts an lz4-compressed tar archive.
    fn extract_tar_lz4(&self, data: &[u8]) -> Result<Vec<ExtractedFile>> {
        let decoder = lz4::Decoder::new(Cursor::new(data))?;
        let mut archive = tar::Archive::new(decoder);
        self.process_tar_entries(&mut archive)
    }

    /// Extracts every entry of a 7z archive (opened with an empty password).
    fn extract_7z(&self, data: &[u8]) -> Result<Vec<ExtractedFile>> {
        let len = data.len() as u64;
        let mut cursor = Cursor::new(data);
        let mut archive = sevenz_rust::SevenZReader::new(&mut cursor, len, "".into())
            .map_err(|e| ArchiveError::InvalidArchive(format!("7z error: {}", e)))?;
        let mut files = Vec::new();
        let mut total_size = 0usize;
        // The callback can only return the 7z crate's error type, so limit
        // violations are parked here and iteration is stopped with `Ok(false)`.
        let mut limit_error: Option<ArchiveError> = None;
        let result = archive.for_each_entries(|entry, reader| {
            if entry.is_directory() {
                files.push(ExtractedFile {
                    path: entry.name().to_string(),
                    data: Vec::new(),
                    is_directory: true,
                });
                return Ok(true);
            }
            let declared = Self::declared_size(entry.size());
            if let Err(err) = self.admit(declared, &mut total_size) {
                limit_error = Some(err);
                return Ok(false);
            }
            // I/O failures keep flowing through the 7z error type, as before.
            let contents = self.read_capped(reader)?;
            if let Err(err) = self.settle(declared, contents.len(), &mut total_size) {
                limit_error = Some(err);
                return Ok(false);
            }
            files.push(ExtractedFile {
                path: entry.name().to_string(),
                data: contents,
                is_directory: false,
            });
            Ok(true)
        });
        if let Some(err) = limit_error {
            return Err(err);
        }
        result.map_err(|e| ArchiveError::InvalidArchive(format!("7z extraction error: {}", e)))?;
        Ok(files)
    }

    /// Decompresses a standalone gzip stream, naming the result after the
    /// filename stored in the gzip header when it is valid UTF-8, and `"data"`
    /// otherwise.
    fn extract_single_gz(&self, data: &[u8]) -> Result<Vec<ExtractedFile>> {
        let mut decoder = flate2::read::GzDecoder::new(Cursor::new(data));
        let decompressed = self.read_single_stream(&mut decoder)?;
        let path = decoder
            .header()
            .and_then(|h| h.filename())
            .and_then(|f| std::str::from_utf8(f).ok())
            .unwrap_or("data")
            .to_string();
        Ok(Self::single_file(path, decompressed))
    }

    /// Decompresses a standalone bzip2 stream into a single file named `"data"`.
    fn extract_single_bz2(&self, data: &[u8]) -> Result<Vec<ExtractedFile>> {
        let decoder = bzip2::read::BzDecoder::new(Cursor::new(data));
        let decompressed = self.read_single_stream(decoder)?;
        Ok(Self::single_file("data".to_string(), decompressed))
    }

    /// Decompresses a standalone xz stream into a single file named `"data"`.
    fn extract_single_xz(&self, data: &[u8]) -> Result<Vec<ExtractedFile>> {
        // NOTE(review): `xz_decompress` is a one-shot call writing into a `Vec`,
        // so the limit below is only checked once everything is in memory.
        // Bounding it would need a capped writer or a streaming decoder.
        let mut decompressed = Vec::new();
        lzma_rs::xz_decompress(&mut Cursor::new(data), &mut decompressed)
            .map_err(|e| ArchiveError::InvalidArchive(e.to_string()))?;
        self.check_file_size(decompressed.len())?;
        Ok(Self::single_file("data".to_string(), decompressed))
    }

    /// Decompresses a standalone lz4 stream into a single file named `"data"`.
    fn extract_single_lz4(&self, data: &[u8]) -> Result<Vec<ExtractedFile>> {
        let decoder = lz4::Decoder::new(Cursor::new(data))?;
        let decompressed = self.read_single_stream(decoder)?;
        Ok(Self::single_file("data".to_string(), decompressed))
    }

    /// Decompresses a standalone zstd stream into a single file named `"data"`.
    fn extract_single_zst(&self, data: &[u8]) -> Result<Vec<ExtractedFile>> {
        let decoder = zstd::stream::read::Decoder::new(Cursor::new(data))?;
        let decompressed = self.read_single_stream(decoder)?;
        Ok(Self::single_file("data".to_string(), decompressed))
    }

    /// Walks the entries of a tar archive, collecting directories as empty
    /// entries and reading everything else under the configured limits.
    fn process_tar_entries<R: Read>(
        &self,
        archive: &mut tar::Archive<R>,
    ) -> Result<Vec<ExtractedFile>> {
        let mut files = Vec::new();
        let mut total_size = 0usize;
        for entry in archive.entries()? {
            let mut entry = entry?;
            let path = entry.path()?.to_string_lossy().to_string();
            let is_directory = entry.header().entry_type().is_dir();
            let contents = if is_directory {
                Vec::new()
            } else {
                let declared = entry.size();
                self.read_entry(&mut entry, declared, &mut total_size)?
            };
            files.push(ExtractedFile {
                path,
                data: contents,
                is_directory,
            });
        }
        Ok(files)
    }

    /// Walks the members of an `ar` archive, reading each one under the
    /// configured limits. Every member is reported as a regular file.
    fn process_ar_entries<R: Read>(
        &self,
        archive: &mut ar::Archive<R>,
    ) -> Result<Vec<ExtractedFile>> {
        let mut files = Vec::new();
        let mut total_size = 0usize;
        // `next_entry` hands out entries that borrow the archive, so this
        // cannot be expressed as a `for` loop.
        while let Some(entry) = archive.next_entry() {
            let mut entry = entry?;
            let path = String::from_utf8_lossy(entry.header().identifier()).to_string();
            let declared = entry.header().size();
            let contents = self.read_entry(&mut entry, declared, &mut total_size)?;
            files.push(ExtractedFile {
                path,
                data: contents,
                is_directory: false,
            });
        }
        Ok(files)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// One mebibyte, in bytes.
    const MIB: usize = 1024 * 1024;

    /// A freshly built extractor carries the default limits.
    #[test]
    fn test_default_limits() {
        let ArchiveExtractor {
            max_file_size,
            max_total_size,
        } = ArchiveExtractor::new();
        assert_eq!(max_file_size, 100 * MIB);
        assert_eq!(max_total_size, 1024 * MIB);
    }

    /// Each builder method overrides exactly the limit it is named after.
    #[test]
    fn test_builder_pattern() {
        let per_file = 50 * MIB;
        let overall = 500 * MIB;
        let configured = ArchiveExtractor::new()
            .with_max_file_size(per_file)
            .with_max_total_size(overall);
        assert_eq!(configured.max_file_size, per_file);
        assert_eq!(configured.max_total_size, overall);
    }
}