ecad_processor/archive/
inspector.rs

1use crate::archive::{TemperatureType, WeatherMetric};
2use crate::error::{ProcessingError, Result};
3use chrono::NaiveDate;
4use serde::{Deserialize, Serialize};
5use std::collections::{HashMap, HashSet};
6use std::fs::File;
7use std::io::{BufRead, BufReader};
8use std::path::Path;
9use zip::ZipArchive;
10
11type ScanResult = (
12    Vec<WeatherMetric>,
13    HashMap<WeatherMetric, usize>,
14    HashSet<u32>,
15);
16
17#[derive(Debug, Clone, Serialize, Deserialize)]
18pub struct ArchiveMetadata {
19    pub country: String,
20    pub metrics: Vec<WeatherMetric>,
21    pub station_count: usize,
22    pub date_range: Option<(NaiveDate, NaiveDate)>,
23    pub file_counts: HashMap<WeatherMetric, usize>,
24    pub total_files: usize,
25}
26
27impl ArchiveMetadata {
28    pub fn display_summary(&self) -> String {
29        let mut summary = format!(
30            "Archive Metadata:\n  Country: {}\n  Total Stations: {}\n  Total Files: {}\n",
31            self.country, self.station_count, self.total_files
32        );
33
34        if let Some((start, end)) = &self.date_range {
35            summary.push_str(&format!("  Date Range: {} to {}\n", start, end));
36        }
37
38        summary.push_str("  Available Metrics:\n");
39        for metric in &self.metrics {
40            if let Some(count) = self.file_counts.get(metric) {
41                summary.push_str(&format!(
42                    "    {}: {} stations ({})\n",
43                    metric.display_name(),
44                    count,
45                    metric.units()
46                ));
47            }
48        }
49
50        summary
51    }
52
53    pub fn has_temperature_data(&self) -> bool {
54        self.metrics
55            .iter()
56            .any(|m| matches!(m, WeatherMetric::Temperature(_)))
57    }
58
59    pub fn has_complete_temperature(&self) -> bool {
60        let temp_types: HashSet<_> = self
61            .metrics
62            .iter()
63            .filter_map(|m| match m {
64                WeatherMetric::Temperature(t) => Some(t),
65                _ => None,
66            })
67            .collect();
68
69        temp_types.contains(&TemperatureType::Minimum)
70            && temp_types.contains(&TemperatureType::Maximum)
71            && temp_types.contains(&TemperatureType::Average)
72    }
73
74    pub fn get_metric_coverage(&self, metric: &WeatherMetric) -> f64 {
75        if let Some(count) = self.file_counts.get(metric) {
76            (*count as f64) / (self.station_count as f64)
77        } else {
78            0.0
79        }
80    }
81}
82
/// Stateless entry point for inspecting weather-data zip archives; all
/// functionality is exposed through associated functions.
pub struct ArchiveInspector;
84
85impl ArchiveInspector {
86    pub fn inspect_zip(zip_path: &Path) -> Result<ArchiveMetadata> {
87        let file = File::open(zip_path)?;
88        let mut archive = ZipArchive::new(file)?;
89
90        // Step 1: Scan all files to identify metrics and collect station IDs
91        let (metrics, file_counts, all_station_ids) = Self::scan_data_files(&mut archive)?;
92
93        if metrics.is_empty() {
94            return Err(ProcessingError::InvalidFormat(
95                "No recognized weather data files found in archive".to_string(),
96            ));
97        }
98
99        // Step 2: Extract country from stations.txt
100        let country = Self::extract_country(&mut archive)?;
101
102        // Step 3: Validate metrics against elements.txt (optional)
103        if let Ok(element_metrics) = Self::validate_with_elements(&mut archive) {
104            // Cross-validate if elements.txt is available
105            for metric in &metrics {
106                if !element_metrics.contains(metric) {
107                    println!(
108                        "Warning: Metric {} found in files but not in elements.txt",
109                        metric
110                    );
111                }
112            }
113        }
114
115        // Step 4: Estimate date range (optional - requires parsing data files)
116        let date_range = Self::estimate_date_range(&mut archive, &metrics).ok();
117
118        Ok(ArchiveMetadata {
119            country,
120            metrics,
121            station_count: all_station_ids.len(),
122            date_range,
123            file_counts,
124            total_files: archive.len(),
125        })
126    }
127
128    fn scan_data_files(archive: &mut ZipArchive<File>) -> Result<ScanResult> {
129        let mut metrics = Vec::new();
130        let mut file_counts: HashMap<WeatherMetric, usize> = HashMap::new();
131        let mut all_station_ids = HashSet::new();
132
133        for i in 0..archive.len() {
134            let file = archive.by_index(i)?;
135            let file_name = file.name();
136
137            // Skip directories and metadata files
138            if file_name.ends_with('/')
139                || file_name == "stations.txt"
140                || file_name == "elements.txt"
141                || file_name == "metadata.txt"
142                || file_name == "sources.txt"
143            {
144                continue;
145            }
146
147            // Parse weather data file pattern: {PREFIX}_STAID{ID}.txt
148            if let Some(metric) = Self::parse_data_file_name(file_name) {
149                // Add metric if not already present
150                if !metrics.contains(&metric) {
151                    metrics.push(metric.clone());
152                }
153
154                // Count files per metric
155                *file_counts.entry(metric).or_insert(0) += 1;
156
157                // Extract station ID
158                if let Some(station_id) = Self::extract_station_id_from_filename(file_name) {
159                    all_station_ids.insert(station_id);
160                }
161            }
162        }
163
164        Ok((metrics, file_counts, all_station_ids))
165    }
166
167    fn parse_data_file_name(file_name: &str) -> Option<WeatherMetric> {
168        // Expected pattern: {PREFIX}_STAID{ID}.txt
169        if !file_name.ends_with(".txt") {
170            return None;
171        }
172
173        let name_without_ext = &file_name[..file_name.len() - 4];
174
175        // Find the prefix before "_STAID"
176        if let Some(pos) = name_without_ext.find("_STAID") {
177            let prefix = &name_without_ext[..pos];
178            WeatherMetric::from_file_prefix(prefix)
179        } else {
180            None
181        }
182    }
183
184    fn extract_station_id_from_filename(file_name: &str) -> Option<u32> {
185        // Extract station ID from patterns like TX_STAID000257.txt
186        if let Some(start) = file_name.find("STAID") {
187            let after_staid = &file_name[start + 5..];
188            if let Some(end) = after_staid.find('.') {
189                let id_str = &after_staid[..end];
190                // Remove leading zeros and parse
191                id_str.trim_start_matches('0').parse().ok()
192            } else {
193                None
194            }
195        } else {
196            None
197        }
198    }
199
200    fn extract_country(archive: &mut ZipArchive<File>) -> Result<String> {
201        // Extract stations.txt to read country codes
202        let mut stations_file = archive.by_name("stations.txt").map_err(|_| {
203            ProcessingError::InvalidFormat("stations.txt not found in archive".to_string())
204        })?;
205
206        let reader = BufReader::new(&mut stations_file);
207        let mut countries = HashSet::new();
208
209        for line_result in reader.lines() {
210            let line = line_result?;
211            let trimmed = line.trim();
212
213            // Skip empty lines and headers
214            if trimmed.is_empty() || trimmed.starts_with("STAID") || trimmed.starts_with("---") {
215                continue;
216            }
217
218            // Skip header content lines
219            if trimmed.contains("EUROPEAN") || trimmed.contains("Klein Tank") {
220                continue;
221            }
222
223            // Parse station line: STAID,STANAME,CN,LAT,LON,HGHT
224            let parts: Vec<&str> = line.split(',').map(|s| s.trim()).collect();
225            if parts.len() >= 3 {
226                // Country code is the 3rd field (index 2)
227                let country_code = parts[2].trim();
228                if !country_code.is_empty() && country_code.len() == 2 {
229                    countries.insert(country_code.to_string());
230                }
231            }
232        }
233
234        if countries.is_empty() {
235            return Err(ProcessingError::InvalidFormat(
236                "No valid country codes found in stations.txt".to_string(),
237            ));
238        }
239
240        if countries.len() > 1 {
241            println!("Warning: Multiple countries found: {:?}", countries);
242        }
243
244        // Return the first (or most common) country
245        Ok(countries.into_iter().next().unwrap())
246    }
247
248    fn validate_with_elements(archive: &mut ZipArchive<File>) -> Result<Vec<WeatherMetric>> {
249        let mut elements_file = archive.by_name("elements.txt").map_err(|_| {
250            ProcessingError::InvalidFormat("elements.txt not found in archive".to_string())
251        })?;
252
253        let reader = BufReader::new(&mut elements_file);
254        let mut element_metrics = Vec::new();
255
256        for line_result in reader.lines() {
257            let line = line_result?;
258            let trimmed = line.trim();
259
260            // Skip empty lines and headers
261            if trimmed.is_empty() || trimmed.starts_with("ELEID") || trimmed.starts_with("---") {
262                continue;
263            }
264
265            // Skip header content lines
266            if trimmed.contains("EUROPEAN") || trimmed.contains("Klein Tank") {
267                continue;
268            }
269
270            // Parse element line: ELEID,DESC,UNIT
271            let parts: Vec<&str> = line.split(',').map(|s| s.trim()).collect();
272            if let Some(element_id) = parts.first() {
273                // Extract prefix from element ID (e.g., "TX1" -> "TX")
274                let prefix = element_id
275                    .chars()
276                    .take_while(|c| c.is_alphabetic())
277                    .collect::<String>();
278                if let Some(metric) = WeatherMetric::from_file_prefix(&prefix) {
279                    if !element_metrics.contains(&metric) {
280                        element_metrics.push(metric);
281                    }
282                }
283            }
284        }
285
286        Ok(element_metrics)
287    }
288
289    fn estimate_date_range(
290        archive: &mut ZipArchive<File>,
291        _metrics: &[WeatherMetric],
292    ) -> Result<(NaiveDate, NaiveDate)> {
293        // For performance, we'll just sample a few files to estimate date range
294        let mut min_date: Option<NaiveDate> = None;
295        let mut max_date: Option<NaiveDate> = None;
296        let mut files_sampled = 0;
297        const MAX_SAMPLE_FILES: usize = 5;
298
299        for i in 0..archive.len() {
300            if files_sampled >= MAX_SAMPLE_FILES {
301                break;
302            }
303
304            let file = archive.by_index(i)?;
305            let file_name = file.name();
306
307            // Only sample data files
308            if Self::parse_data_file_name(file_name).is_some() {
309                if let Ok(dates) = Self::extract_date_range_from_file(file) {
310                    min_date = Some(min_date.map_or(dates.0, |d| d.min(dates.0)));
311                    max_date = Some(max_date.map_or(dates.1, |d| d.max(dates.1)));
312                    files_sampled += 1;
313                }
314            }
315        }
316
317        match (min_date, max_date) {
318            (Some(min), Some(max)) => Ok((min, max)),
319            _ => Err(ProcessingError::InvalidFormat(
320                "Could not determine date range from data files".to_string(),
321            )),
322        }
323    }
324
325    fn extract_date_range_from_file(
326        mut file: zip::read::ZipFile,
327    ) -> Result<(NaiveDate, NaiveDate)> {
328        let reader = BufReader::new(&mut file);
329        let mut min_date: Option<NaiveDate> = None;
330        let mut max_date: Option<NaiveDate> = None;
331        let mut lines_read = 0;
332        const MAX_LINES_TO_READ: usize = 100; // Sample first 100 data lines
333
334        for line_result in reader.lines() {
335            if lines_read >= MAX_LINES_TO_READ {
336                break;
337            }
338
339            let line = line_result?;
340            let trimmed = line.trim();
341
342            // Skip empty lines and headers
343            if trimmed.is_empty() || lines_read < 20 {
344                continue;
345            }
346
347            // Parse data line: SOUID, DATE, VALUE, Q_FLAG
348            let parts: Vec<&str> = line.split(',').map(|s| s.trim()).collect();
349            if parts.len() >= 2 {
350                if let Ok(date) = NaiveDate::parse_from_str(parts[1], "%Y%m%d") {
351                    min_date = Some(min_date.map_or(date, |d| d.min(date)));
352                    max_date = Some(max_date.map_or(date, |d| d.max(date)));
353                    lines_read += 1;
354                }
355            }
356        }
357
358        match (min_date, max_date) {
359            (Some(min), Some(max)) => Ok((min, max)),
360            _ => Err(ProcessingError::InvalidFormat(
361                "No valid dates found in data file".to_string(),
362            )),
363        }
364    }
365}
366
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;
    use zip::{CompressionMethod, ZipWriter};

    /// Builds a temporary archive holding `stations.txt`, `elements.txt`,
    /// two temperature files for station 257 and one precipitation file for
    /// station 258, all stored uncompressed.
    fn create_test_zip_with_multiple_metrics() -> Result<NamedTempFile> {
        // (entry name, entry contents), written in this order.
        const ENTRIES: [(&str, &str); 5] = [
            (
                "stations.txt",
                "EUROPEAN CLIMATE ASSESSMENT & DATASET\n\nSTAID,STANAME,CN,LAT,LON,HGHT\n257,TEST STATION,GB,+51:30:00,-000:07:00,100\n258,ANOTHER STATION,GB,+52:30:00,-001:07:00,200\n",
            ),
            (
                "elements.txt",
                "EUROPEAN CLIMATE ASSESSMENT & DATASET\n\nELEID,DESC,UNIT\nTX1,Maximum temperature,0.1 C\nTN1,Minimum temperature,0.1 C\nRR1,Precipitation,0.1 mm\n",
            ),
            (
                "TX_STAID000257.txt",
                "Header\n101,20230101,125,0\n101,20230102,130,0\n",
            ),
            (
                "TN_STAID000257.txt",
                "Header\n101,20230101,75,0\n101,20230102,80,0\n",
            ),
            (
                "RR_STAID000258.txt",
                "Header\n102,20230101,25,0\n102,20230102,30,0\n",
            ),
        ];

        let file = NamedTempFile::new()?;
        {
            let mut zip = ZipWriter::new(&file);
            for &(entry_name, contents) in ENTRIES.iter() {
                zip.start_file(
                    entry_name,
                    zip::write::FileOptions::default()
                        .compression_method(CompressionMethod::Stored),
                )?;
                zip.write_all(contents.as_bytes())?;
            }
            zip.finish()?;
        } // writer is dropped here, before the file handle is returned
        Ok(file)
    }

    #[test]
    fn test_parse_data_file_name() {
        let cases = [
            (
                "TX_STAID000257.txt",
                Some(WeatherMetric::Temperature(TemperatureType::Maximum)),
            ),
            ("RR_STAID000258.txt", Some(WeatherMetric::Precipitation)),
            ("FG_STAID000259.txt", Some(WeatherMetric::WindSpeed)),
            ("stations.txt", None),
            ("invalid_file.txt", None),
        ];

        for (file_name, expected) in cases.iter() {
            assert_eq!(
                &ArchiveInspector::parse_data_file_name(file_name),
                expected
            );
        }
    }

    #[test]
    fn test_extract_station_id_from_filename() {
        let cases = [
            ("TX_STAID000257.txt", Some(257)),
            ("RR_STAID001234.txt", Some(1234)),
            ("invalid_file.txt", None),
        ];

        for (file_name, expected) in cases.iter() {
            assert_eq!(
                &ArchiveInspector::extract_station_id_from_filename(file_name),
                expected
            );
        }
    }

    #[test]
    fn test_inspect_zip() -> Result<()> {
        let archive_file = create_test_zip_with_multiple_metrics()?;
        let metadata = ArchiveInspector::inspect_zip(archive_file.path())?;

        assert_eq!(metadata.country, "GB");
        assert_eq!(metadata.station_count, 2); // stations 257 and 258
        assert_eq!(metadata.metrics.len(), 3); // TX, TN, RR

        assert!(metadata.has_temperature_data());
        assert!(!metadata.has_complete_temperature()); // no TG file in the fixture

        // One file per metric in the fixture.
        let max_temp = WeatherMetric::Temperature(TemperatureType::Maximum);
        assert_eq!(metadata.file_counts.get(&max_temp), Some(&1));
        assert_eq!(
            metadata.file_counts.get(&WeatherMetric::Precipitation),
            Some(&1)
        );

        Ok(())
    }

    #[test]
    fn test_archive_metadata_display() -> Result<()> {
        let archive_file = create_test_zip_with_multiple_metrics()?;
        let summary = ArchiveInspector::inspect_zip(archive_file.path())?.display_summary();

        for expected_fragment in [
            "Country: GB",
            "Total Stations: 2",
            "Temperature (Max)",
            "Precipitation",
        ]
        .iter()
        {
            assert!(summary.contains(expected_fragment));
        }

        Ok(())
    }
}