Skip to main content

uls_parser/
archive.rs

1//! ZIP archive handling with streaming support.
2//!
3//! This module provides functionality to stream DAT files directly from
4//! ZIP archives without extracting to disk, minimizing I/O overhead.
5
6use std::fs::File;
7use std::io::{BufReader, Read, Seek};
8use std::path::Path;
9
10use thiserror::Error;
11use zip::ZipArchive;
12
13use crate::dat::{DatReader, ParsedLine};
14
15/// ZIP-specific error types.
16#[derive(Error, Debug)]
17pub enum ZipError {
18    #[error("I/O error: {0}")]
19    Io(#[from] std::io::Error),
20
21    #[error("ZIP error: {0}")]
22    Zip(#[from] zip::result::ZipError),
23
24    #[error("DAT file not found in archive: {0}")]
25    DatFileNotFound(String),
26}
27
28/// A ULS ZIP archive that can stream DAT files.
29pub struct ZipExtractor<R: Read + Seek> {
30    archive: ZipArchive<R>,
31}
32
33impl ZipExtractor<BufReader<File>> {
34    /// Open a ULS ZIP file.
35    pub fn open<P: AsRef<Path>>(path: P) -> Result<Self, ZipError> {
36        let file = File::open(path)?;
37        let reader = BufReader::new(file);
38        let archive = ZipArchive::new(reader)?;
39        Ok(Self { archive })
40    }
41
42    /// Get statistics about the archive.
43    pub fn stats(&mut self) -> Result<ArchiveStats, ZipError> {
44        let dat_files = self.list_dat_files();
45        let total_files = self.archive.len();
46        let mut total_size = 0u64;
47
48        for i in 0..self.archive.len() {
49            let file = self.archive.by_index(i)?;
50            total_size += file.size();
51        }
52
53        Ok(ArchiveStats {
54            total_files,
55            dat_files,
56            total_size_bytes: total_size,
57        })
58    }
59
60    /// Count records in all DAT files in the archive.
61    pub fn count_all_records(
62        &mut self,
63    ) -> Result<std::collections::HashMap<String, usize>, crate::ParseError> {
64        let dat_files = self.list_dat_files();
65        let mut counts = std::collections::HashMap::new();
66
67        for dat_file in dat_files {
68            let count = self.process_dat_streaming(&dat_file, |_| true)?;
69            counts.insert(dat_file, count);
70        }
71
72        Ok(counts)
73    }
74}
75
76impl<R: Read + Seek> ZipExtractor<R> {
77    /// Create from an existing archive reader.
78    pub fn new(archive: ZipArchive<R>) -> Self {
79        Self { archive }
80    }
81
82    /// List all DAT files in the archive.
83    pub fn list_dat_files(&mut self) -> Vec<String> {
84        let mut files = Vec::new();
85        for i in 0..self.archive.len() {
86            if let Ok(file) = self.archive.by_index(i) {
87                let name = file.name().to_string();
88                if name.to_uppercase().ends_with(".DAT") {
89                    files.push(name);
90                }
91            }
92        }
93        files
94    }
95
96    /// List all files in the archive.
97    pub fn list_files(&mut self) -> Vec<String> {
98        let mut files = Vec::new();
99        for i in 0..self.archive.len() {
100            if let Ok(file) = self.archive.by_index(i) {
101                files.push(file.name().to_string());
102            }
103        }
104        files
105    }
106
107    /// Get the size of a file in the archive.
108    pub fn file_size(&mut self, name: &str) -> Result<u64, ZipError> {
109        let file = self.archive.by_name(name)?;
110        Ok(file.size())
111    }
112
113    /// Find the index of a file by name (case-insensitive).
114    fn find_file_index(&mut self, name: &str) -> Option<usize> {
115        // Try exact name first
116        for i in 0..self.archive.len() {
117            if let Ok(file) = self.archive.by_index(i) {
118                if file.name() == name {
119                    return Some(i);
120                }
121            }
122        }
123
124        // Try case-insensitive match
125        let name_upper = name.to_uppercase();
126        for i in 0..self.archive.len() {
127            if let Ok(file) = self.archive.by_index(i) {
128                if file.name().to_uppercase() == name_upper {
129                    return Some(i);
130                }
131            }
132        }
133
134        None
135    }
136
137    /// Stream a DAT file from the archive without extracting to disk.
138    /// Returns a reader that can be used with DatReader.
139    pub fn stream_dat(&mut self, name: &str) -> Result<impl Read + '_, ZipError> {
140        // Find the index first to avoid borrow issues
141        let index = self
142            .find_file_index(name)
143            .ok_or_else(|| ZipError::DatFileNotFound(name.to_string()))?;
144
145        self.archive.by_index(index).map_err(ZipError::Zip)
146    }
147
148    /// Process a DAT file streaming from the archive, calling a callback for each record.
149    /// This is the most memory-efficient way to process large ULS archives.
150    pub fn process_dat_streaming<F>(
151        &mut self,
152        dat_name: &str,
153        mut callback: F,
154    ) -> Result<usize, crate::ParseError>
155    where
156        F: FnMut(ParsedLine) -> bool,
157    {
158        let reader = self.stream_dat(dat_name)?;
159        let mut dat_reader = DatReader::new(reader);
160        let mut count = 0;
161
162        while let Some(line) = dat_reader.next_line()? {
163            count += 1;
164            if !callback(line) {
165                break;
166            }
167        }
168
169        Ok(count)
170    }
171
172    /// Extract the canonical file creation date from the `counts` file in the archive.
173    ///
174    /// FCC data files contain a `counts` file with a line like:
175    /// `File Creation Date: Sun Jan 18 12:01:25 EST 2026`
176    ///
177    /// This is the authoritative date for when the FCC generated the data file.
178    /// Returns None if the counts file doesn't exist or can't be parsed.
179    pub fn get_file_creation_date(&mut self) -> Option<String> {
180        // Try to find and read the counts file
181        let index = self.find_file_index("counts")?;
182        let mut file = self.archive.by_index(index).ok()?;
183
184        let mut contents = String::new();
185        file.read_to_string(&mut contents).ok()?;
186
187        // Parse "File Creation Date: Sun Jan 18 12:01:25 EST 2026"
188        for line in contents.lines() {
189            if line.starts_with("File Creation Date:") {
190                let date_str = line.trim_start_matches("File Creation Date:").trim();
191                return Some(date_str.to_string());
192            }
193        }
194
195        None
196    }
197}
198
199impl From<ZipError> for crate::ParseError {
200    fn from(err: ZipError) -> Self {
201        match err {
202            ZipError::Io(e) => crate::ParseError::Io(e),
203            ZipError::Zip(e) => crate::ParseError::Zip(e),
204            ZipError::DatFileNotFound(name) => crate::ParseError::InvalidFormat {
205                line: 0,
206                message: format!("DAT file not found: {}", name),
207            },
208        }
209    }
210}
211
/// Summary figures describing a ULS archive.
#[derive(Clone, Debug)]
pub struct ArchiveStats {
    /// Number of entries in the archive.
    pub total_files: usize,
    /// Names of the entries recognised as DAT files.
    pub dat_files: Vec<String>,
    /// Combined size of all entries, in bytes.
    pub total_size_bytes: u64,
}
219
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::{Cursor, Write};

    /// Build an in-memory ZIP holding the given `(name, contents)` entries,
    /// written in order with the `Stored` compression method.
    fn build_zip(entries: &[(&str, &str)]) -> Vec<u8> {
        let mut buf = Vec::new();
        {
            let mut writer = zip::ZipWriter::new(Cursor::new(&mut buf));
            let options = zip::write::SimpleFileOptions::default()
                .compression_method(zip::CompressionMethod::Stored);

            for (name, contents) in entries {
                writer.start_file(*name, options).unwrap();
                writer.write_all(contents.as_bytes()).unwrap();
            }

            writer.finish().unwrap();
        }
        buf
    }

    /// Wrap raw ZIP bytes in an extractor backed by an in-memory cursor.
    fn extractor_for(data: Vec<u8>) -> ZipExtractor<Cursor<Vec<u8>>> {
        ZipExtractor::new(ZipArchive::new(Cursor::new(data)).unwrap())
    }

    /// Two DAT entries: `HD.dat` with two records and `EN.dat` with one.
    fn create_test_zip() -> Vec<u8> {
        build_zip(&[
            ("HD.dat", "HD|1|||TEST|A|HA|\nHD|2|||TEST2|A|HA|\n"),
            ("EN.dat", "EN|1|||TEST|L||John||\n"),
        ])
    }

    /// Two DAT entries with mixed-case names plus one non-DAT entry.
    fn create_zip_with_mixed_case() -> Vec<u8> {
        build_zip(&[
            ("hd.DAT", "HD|1|||LOWERCASE|A|HA|\n"),
            ("en.Dat", "EN|1|||MIXEDCASE|L||Test||\n"),
            ("readme.txt", "Not a DAT file\n"),
        ])
    }

    /// A `counts` entry in the FCC format alongside a single DAT entry.
    fn create_zip_with_counts() -> Vec<u8> {
        build_zip(&[
            (
                "counts",
                "File Creation Date: Sun Jan 18 12:01:25 EST 2026\n  1669550 /home/pubacc/scripts/licweekzipdata/AM.dat\n",
            ),
            ("HD.dat", "HD|1|||TEST|A|HA|\n"),
        ])
    }

    #[test]
    fn test_list_dat_files() {
        let mut extractor = extractor_for(create_test_zip());

        let files = extractor.list_dat_files();
        assert_eq!(files.len(), 2);
        for expected in ["HD.dat", "EN.dat"] {
            assert!(files.contains(&expected.to_string()));
        }
    }

    #[test]
    fn test_stream_dat() {
        let mut extractor = extractor_for(create_test_zip());

        let count = extractor
            .process_dat_streaming("HD.dat", |line| {
                assert_eq!(line.record_type, "HD");
                true
            })
            .unwrap();

        assert_eq!(count, 2);
    }

    #[test]
    fn test_list_all_files() {
        let mut extractor = extractor_for(create_zip_with_mixed_case());

        let files = extractor.list_files();
        assert_eq!(files.len(), 3);
        for expected in ["hd.DAT", "en.Dat", "readme.txt"] {
            assert!(files.contains(&expected.to_string()));
        }
    }

    #[test]
    fn test_list_dat_files_mixed_case() {
        let mut extractor = extractor_for(create_zip_with_mixed_case());

        // The extension check ignores case, so both DAT entries are listed
        // and the text file is not.
        assert_eq!(extractor.list_dat_files().len(), 2);
    }

    #[test]
    fn test_file_size() {
        let mut extractor = extractor_for(create_test_zip());

        let size = extractor.file_size("HD.dat").unwrap();
        assert!(size > 0);
    }

    #[test]
    fn test_file_size_not_found() {
        let mut extractor = extractor_for(create_test_zip());

        assert!(extractor.file_size("nonexistent.dat").is_err());
    }

    #[test]
    fn test_stream_dat_case_insensitive() {
        let mut extractor = extractor_for(create_zip_with_mixed_case());

        // Asking for "HD.dat" must resolve to the entry stored as "hd.DAT".
        let count = extractor
            .process_dat_streaming("HD.dat", |line| {
                assert_eq!(line.record_type, "HD");
                true
            })
            .unwrap();

        assert_eq!(count, 1);
    }

    #[test]
    fn test_stream_dat_not_found() {
        let mut extractor = extractor_for(create_test_zip());

        let result = extractor.stream_dat("NONEXISTENT.dat");
        assert!(result.is_err());

        match result {
            Err(ZipError::DatFileNotFound(name)) => assert_eq!(name, "NONEXISTENT.dat"),
            _ => panic!("Expected DatFileNotFound error"),
        }
    }

    #[test]
    fn test_process_dat_early_termination() {
        let mut extractor = extractor_for(create_test_zip());

        let mut processed = 0;
        let count = extractor
            .process_dat_streaming("HD.dat", |_line| {
                processed += 1;
                false // Ask to stop right after the first record
            })
            .unwrap();

        // Exactly one record is seen before the callback stops the loop.
        assert_eq!(count, 1);
        assert_eq!(processed, 1);
    }

    #[test]
    fn test_zip_error_to_parse_error() {
        let parse_err: crate::ParseError =
            ZipError::DatFileNotFound("test.dat".to_string()).into();

        assert!(parse_err.to_string().contains("test.dat"));
    }

    #[test]
    fn test_zip_error_io_conversion() {
        let io_err = std::io::Error::new(std::io::ErrorKind::NotFound, "test error");
        let parse_err: crate::ParseError = ZipError::from(io_err).into();

        assert!(matches!(parse_err, crate::ParseError::Io(_)));
    }

    #[test]
    fn test_get_file_creation_date() {
        let mut extractor = extractor_for(create_zip_with_counts());

        let date = extractor.get_file_creation_date();
        assert!(date.is_some());
        assert_eq!(date.as_deref(), Some("Sun Jan 18 12:01:25 EST 2026"));
    }

    #[test]
    fn test_get_file_creation_date_no_counts_file() {
        // This fixture has no `counts` entry.
        let mut extractor = extractor_for(create_test_zip());

        assert!(extractor.get_file_creation_date().is_none());
    }
}