partialzip/
partzip.rs

1use chrono::NaiveDate;
2use chrono::NaiveDateTime;
3use chrono::NaiveTime;
4use conv::{NoError, ValueFrom};
5use curl::easy::Easy;
6use num_traits::ToPrimitive;
7use serde::Deserialize;
8use serde::Serialize;
9use std::cell::RefCell;
10use std::io;
11use std::io::BufReader;
12use std::io::ErrorKind;
13use std::time::Duration;
14use thiserror::Error;
15use zip::result::ZipError;
16
17use super::utils;
18
19use zip::ZipArchive;
20
21/// Enum for errors thrown by the partialzip crate
22#[derive(Error, Debug)]
23pub enum PartialZipError {
24    /// The URL is invalid
25    #[error("Invalid URL")]
26    InvalidUrl,
27    /// The file is not found
28    #[error("File Not Found")]
29    FileNotFound,
30    /// Range request not supported
31    #[error("Range request not supported")]
32    RangeNotSupported,
33    /// The compression scheme is currently not supported
34    #[error("{0} is a Unsupported Compression")]
35    UnsupportedCompression(u16),
36    /// Error for the underlying zip crate
37    #[error("zip error: {0}")]
38    ZipRsError(#[from] ZipError),
39    /// `std::io::Error` wrapper
40    #[error("io error: {0}")]
41    IOError(#[from] io::Error),
42    /// Error for CURL
43    #[error("CURL error: {0}")]
44    CURLError(#[from] curl::Error),
45    /// `NoError` error
46    #[error("NoError error: {0}")]
47    NoError(#[from] NoError),
48    /// Conversion Error
49    #[error("Conversion error: {0}")]
50    ConvError(#[from] conv::PosOverflow<u64>),
51}
52
53/// Core struct of the crate representing a zip file we want to access partially
54#[derive(Debug)]
55pub struct PartialZip {
56    /// URL of the zip archive
57    url: String,
58    /// The archive object
59    archive: RefCell<ZipArchive<BufReader<PartialReader>>>,
60    /// The archive size
61    file_size: u64,
62}
63
64/// Compression methods for the files inside the archive. Redefined structure to make it serializable.
65/// Maps directly to the zip crate `zip::CompressionMethod` enum.
66#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)]
67pub enum PartialZipCompressionMethod {
68    /// Stored (no compression)
69    Stored,
70    /// Deflated compression
71    Deflated,
72    /// bzip2 compression
73    Bzip2,
74    /// zstd compression
75    Zstd,
76    /// unsupported compression
77    Unsupported,
78}
79
80impl From<zip::CompressionMethod> for PartialZipCompressionMethod {
81    fn from(value: zip::CompressionMethod) -> Self {
82        match value {
83            zip::CompressionMethod::Stored => Self::Stored,
84            zip::CompressionMethod::Deflated => Self::Deflated,
85            zip::CompressionMethod::Bzip2 => Self::Bzip2,
86            zip::CompressionMethod::Zstd => Self::Zstd,
87            _ => Self::Unsupported,
88        }
89    }
90}
91
92/// Struct for a file in the zip file with some attributes
93#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)]
94pub struct PartialZipFileDetailed {
95    /// Filename
96    pub name: String,
97    /// Compressed size of the file
98    pub compressed_size: u64,
99    /// How it has been compressed (compression method, like bzip2, deflate, etc.)
100    pub compression_method: PartialZipCompressionMethod,
101    /// Is the compression supported or not by this crate?
102    pub supported: bool,
103    /// The date the file was last modified
104    pub last_modified: Option<NaiveDateTime>,
105}
106
107impl PartialZip {
108    /// Create a new [`PartialZip`]
109    /// # Errors
110    ///
111    /// Will return a [`PartialZipError`] enum depending on what error happened
112    pub fn new(url: &dyn ToString) -> Result<Self, PartialZipError> {
113        Self::new_check_range(url, false)
114    }
115
116    /// Create a new [`PartialZip`]
117    /// # Errors
118    ///
119    /// Will return a [`PartialZipError`] enum depending on what error happened
120    pub fn new_check_range(url: &dyn ToString, check_range: bool) -> Result<Self, PartialZipError> {
121        let reader = PartialReader::new_check_range(url, check_range)?;
122        let file_size = reader.file_size;
123        // higher capacity BufReader has better performances
124        let bufreader = BufReader::with_capacity(0x0010_0000, reader);
125        let archive = ZipArchive::new(bufreader)?;
126        Ok(Self {
127            url: url.to_string(),
128            archive: RefCell::new(archive),
129            file_size,
130        })
131    }
132
133    /// Returns the url for the [`PartialZip`]
134    pub fn url(&self) -> String {
135        self.url.clone()
136    }
137
138    /// Returns the zip size for the entire archive of the [`PartialZip`]
139    pub const fn file_size(&self) -> u64 {
140        self.file_size
141    }
142
143    /// Get a list of the filenames in the archive
144    pub fn list_names(&self) -> Vec<String> {
145        self.archive
146            .borrow()
147            .file_names()
148            .map(std::borrow::ToOwned::to_owned)
149            .collect()
150    }
151
152    /// Get a list of the files in the archive with details (much slower than just listing names because it fetches much more data around with more requests)
153    pub fn list_detailed(&self) -> Vec<PartialZipFileDetailed> {
154        let mut file_list = Vec::new();
155        let num_files = self.archive.borrow().len();
156        for i in 0..num_files {
157            match self.archive.borrow_mut().by_index(i) {
158                Ok(file) => {
159                    let compression_method = file.compression();
160                    // we only support some compressions
161                    let supported = matches!(
162                        compression_method,
163                        zip::CompressionMethod::Stored
164                            | zip::CompressionMethod::Deflated
165                            | zip::CompressionMethod::Bzip2
166                            | zip::CompressionMethod::Zstd
167                    );
168                    let date = NaiveDate::from_ymd_opt(
169                        file.last_modified().year().into(),
170                        file.last_modified().month().into(),
171                        file.last_modified().day().into(),
172                    );
173                    let time = NaiveTime::from_hms_opt(
174                        file.last_modified().hour().into(),
175                        file.last_modified().minute().into(),
176                        file.last_modified().second().into(),
177                    );
178                    let last_modified = if let (Some(d), Some(t)) = (date, time) {
179                        Some(NaiveDateTime::new(d, t))
180                    } else {
181                        None
182                    };
183                    let pzf = PartialZipFileDetailed {
184                        name: file.name().to_string(),
185                        compressed_size: file.compressed_size(),
186                        compression_method: compression_method.into(),
187                        supported,
188                        last_modified,
189                    };
190                    file_list.push(pzf);
191                }
192                Err(e) => {
193                    // We are unable to get a file, let's try to continue,
194                    // and at least return the files we can
195                    log::warn!("list: error while matching file by index: {i} - {e}");
196                    continue;
197                }
198            };
199        }
200        file_list
201    }
202    /// Download a single file from the archive
203    ///
204    /// # Errors
205    /// Will return a [`PartialZipError`] depending on what happened
206    pub fn download(&self, filename: &str) -> Result<Vec<u8>, PartialZipError> {
207        let mut content: Vec<u8> = Vec::new();
208        self.download_to_write(filename, &mut content)?;
209        Ok(content)
210    }
211
212    /// Download a single file from the archive and writes it to a [`std::io::Write`]
213    ///
214    /// # Errors
215    /// Will return a [`PartialZipError`] depending on what happened
216    pub fn download_to_write(
217        &self,
218        filename: &str,
219        writer: &mut dyn std::io::Write,
220    ) -> Result<(), PartialZipError> {
221        let mut archive = self.archive.borrow_mut();
222        let mut file = archive.by_name(filename)?;
223        io::copy(&mut file, writer)?;
224        Ok(())
225    }
226
227    /// Download a single file from the archive showing a progress bar
228    ///
229    /// # Errors
230    /// Will return a [`PartialZipError`] depending on what happened
231    #[cfg(feature = "progressbar")]
232    pub fn download_with_progressbar(&self, filename: &str) -> Result<Vec<u8>, PartialZipError> {
233        let mut content: Vec<u8> = Vec::new();
234        self.download_to_write_with_progressbar(filename, &mut content)?;
235        Ok(content)
236    }
237
238    /// Download a single file from the archive showing a progress bar to a [`std::io::Write`]
239    ///
240    /// # Errors
241    /// Will return a [`PartialZipError`] depending on what happened
242    #[cfg(feature = "progressbar")]
243    pub fn download_to_write_with_progressbar(
244        &self,
245        filename: &str,
246        writer: &mut dyn std::io::Write,
247    ) -> Result<(), PartialZipError> {
248        use indicatif::ProgressBar;
249
250        let mut archive = self.archive.borrow_mut();
251        let file = archive.by_name(filename)?;
252        let pb = ProgressBar::new(file.compressed_size());
253        io::copy(&mut pb.wrap_read(file), writer)?;
254        Ok(())
255    }
256}
257
258/// Reader for the partialzip doing only the partial read instead of downloading everything
259#[derive(Debug)]
260pub struct PartialReader {
261    /// URL at which we read the file
262    url: String,
263    file_size: u64,
264    easy: Easy,
265    pos: u64,
266}
267
/// HTTP status code `206 Partial Content`, expected as the answer to a range request
const HTTP_PARTIAL_CONTENT: u32 = 206;
269
270impl PartialReader {
271    /// Creates a new [`PartialReader`]
272    ///
273    /// # Errors
274    /// Will return a [`PartialZipError`] enum depending on what happened
275    pub fn new(url: &dyn ToString) -> Result<Self, PartialZipError> {
276        Self::new_check_range(url, false)
277    }
278    /// Creates a new [`PartialReader`]
279    ///
280    /// # Errors
281    /// Will return a [`PartialZipError`] enum depending on what happened
282
283    pub fn new_check_range(url: &dyn ToString, check_range: bool) -> Result<Self, PartialZipError> {
284        let url = &url.to_string();
285        if !utils::url_is_valid(url) {
286            return Err(PartialZipError::InvalidUrl);
287        }
288
289        let mut easy = Easy::new();
290        easy.url(url)?;
291        easy.follow_location(true)?;
292        easy.tcp_keepalive(true)?;
293        easy.tcp_keepidle(Duration::from_secs(120))?;
294        easy.tcp_keepintvl(Duration::from_secs(60))?;
295        easy.nobody(true)?;
296        easy.write_function(|data| Ok(data.len()))?;
297        easy.perform()?;
298        let file_size = easy
299            .content_length_download()?
300            .to_u64()
301            .ok_or_else(|| std::io::Error::new(ErrorKind::InvalidData, "invalid content length"))?;
302
303        if check_range {
304            // check if range-request is possible by request 1 byte. if 206 Partial Content (HTTP_PARTIAL_CONTENT) is returned, we can make future request.
305            easy.range("0-0")?;
306            easy.nobody(true)?;
307            easy.perform()?;
308            let head_size = easy.content_length_download()?.to_u64().ok_or_else(|| {
309                std::io::Error::new(ErrorKind::InvalidData, "can not perform range request")
310            })?;
311            if head_size != 1 {
312                return Err(PartialZipError::RangeNotSupported);
313            }
314            // 206 Partial Content (HTTP_PARTIAL_CONTENT)
315            if easy.response_code()? != HTTP_PARTIAL_CONTENT {
316                return Err(PartialZipError::RangeNotSupported);
317            }
318            easy.range("")?;
319            easy.nobody(false)?;
320        }
321        Ok(Self {
322            url: url.to_string(),
323            file_size,
324            easy,
325            pos: 0,
326        })
327    }
328
329    /// Returns the url for the [`PartialReader`]
330    #[must_use]
331    pub fn url(&self) -> String {
332        self.url.clone()
333    }
334}
335
336impl io::Read for PartialReader {
337    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
338        log::trace!(
339            "read self.pos = {:x} self.file_size = {:x}",
340            self.pos,
341            self.file_size
342        );
343        if self.pos >= self.file_size {
344            return Ok(0);
345        }
346        // start = current position
347        let start = self.pos;
348        // end candidate = start + buf.len() - 1;
349        let maybe_end = start
350            .checked_add(buf.len().to_u64().ok_or_else(|| {
351                std::io::Error::new(
352                    ErrorKind::InvalidData,
353                    format!("The buf len is invalid {}", buf.len()),
354                )
355            })?)
356            .ok_or_else(|| {
357                std::io::Error::new(
358                    ErrorKind::InvalidData,
359                    format!("start + buf.len() overflow {start} {}", buf.len()),
360                )
361            })?
362            .checked_sub(1)
363            .ok_or_else(|| {
364                std::io::Error::new(
365                    ErrorKind::InvalidData,
366                    format!("start + buf.len() - 1 underflow {start} {}", buf.len()),
367                )
368            })?;
369        log::trace!("maybe_end = {maybe_end:x}");
370        // end = min(end candidate, file_size - 1)
371        let end = std::cmp::min(
372            maybe_end,
373            self.file_size.checked_sub(1).ok_or_else(|| {
374                std::io::Error::new(
375                    ErrorKind::InvalidData,
376                    format!("file_size - 1 underflow {}", self.file_size),
377                )
378            })?,
379        );
380        log::trace!("end = {end:x} start = {start:x}");
381        // check if the end and start are valid ( end >= start )
382        if end < start {
383            return Err(std::io::Error::new(
384                ErrorKind::InvalidData,
385                format!("end < start: {end} < {start}"),
386            ));
387        }
388        let range = format!("{start}-{end}");
389        log::trace!("range = {range}");
390
391        self.easy.range(&range)?;
392        self.easy.get(true)?;
393
394        let mut content: Vec<u8> = Vec::new();
395        {
396            let mut transfer = self.easy.transfer();
397            transfer.write_function(|data| {
398                log::trace!("transfered {:x} bytes", data.len());
399                content.extend_from_slice(data);
400                Ok(data.len())
401            })?;
402
403            transfer.perform()?;
404        };
405
406        let n = io::Read::read(&mut content[..].as_ref(), buf)?;
407        // new position = position + read amount;
408        self.pos = self
409            .pos
410            .checked_add(n.to_u64().ok_or_else(|| {
411                std::io::Error::new(ErrorKind::InvalidData, format!("invalid read amount {n}"))
412            })?)
413            .ok_or_else(|| {
414                std::io::Error::new(
415                    ErrorKind::InvalidData,
416                    format!("adding {n} overflows the reader position {}", self.pos),
417                )
418            })?;
419        log::trace!("new self.pos = {:x}", self.pos);
420        Ok(n)
421    }
422}
423
424impl io::Seek for PartialReader {
425    fn seek(&mut self, style: io::SeekFrom) -> io::Result<u64> {
426        // we can seek both from start, end, or current position
427        let (base_pos, offset) = match style {
428            io::SeekFrom::Start(n) => {
429                self.pos = n;
430                return Ok(n);
431            }
432            io::SeekFrom::End(n) => (self.file_size, n),
433            io::SeekFrom::Current(n) => (self.pos, n),
434        };
435        log::trace!("seek base_pos = {base_pos:x} offset = {offset:x}");
436        let new_pos = if offset >= 0 {
437            // position = base position + offset
438            base_pos.checked_add(
439                u64::value_from(offset)
440                    .map_err(|e| std::io::Error::new(ErrorKind::InvalidData, e.to_string()))?,
441            )
442        } else {
443            // position = base position - offset
444            base_pos.checked_sub(
445                u64::value_from(offset.wrapping_neg())
446                    .map_err(|e| std::io::Error::new(ErrorKind::InvalidData, e.to_string()))?,
447            )
448        };
449        // check if new position is valid
450        match new_pos {
451            Some(n) => {
452                self.pos = n;
453                log::trace!("new self.pos = {n:x}");
454                Ok(self.pos)
455            }
456            None => Err(std::io::Error::new(
457                ErrorKind::InvalidInput,
458                "invalid seek to a negative or overflowing position",
459            )),
460        }
461    }
462}