1use crate::archive::{TemperatureType, WeatherMetric};
2use crate::error::{ProcessingError, Result};
3use chrono::NaiveDate;
4use serde::{Deserialize, Serialize};
5use std::collections::{HashMap, HashSet};
6use std::fs::File;
7use std::io::{BufRead, BufReader};
8use std::path::Path;
9use zip::ZipArchive;
10
/// Result of scanning the archive's data files, as produced by
/// `ArchiveInspector::scan_data_files`:
///
/// 1. the distinct metrics found, in the order they were first encountered,
/// 2. the number of data files seen for each metric,
/// 3. the distinct station ids parsed from the data file names.
type ScanResult = (
    Vec<WeatherMetric>,
    HashMap<WeatherMetric, usize>,
    HashSet<u32>,
);
16
/// Summary information describing the contents of a weather data archive.
///
/// Values of this type are produced by `ArchiveInspector::inspect_zip`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ArchiveMetadata {
    /// Country code read from the archive's `stations.txt`.
    pub country: String,
    /// Distinct metrics for which at least one data file was found.
    pub metrics: Vec<WeatherMetric>,
    /// Number of distinct station ids parsed from data file names.
    pub station_count: usize,
    /// Estimated `(earliest, latest)` dates, or `None` when no estimate could be made.
    pub date_range: Option<(NaiveDate, NaiveDate)>,
    /// Number of data files found per metric.
    pub file_counts: HashMap<WeatherMetric, usize>,
    /// Total number of entries in the archive, including non-data files.
    pub total_files: usize,
}
26
27impl ArchiveMetadata {
28 pub fn display_summary(&self) -> String {
29 let mut summary = format!(
30 "Archive Metadata:\n Country: {}\n Total Stations: {}\n Total Files: {}\n",
31 self.country, self.station_count, self.total_files
32 );
33
34 if let Some((start, end)) = &self.date_range {
35 summary.push_str(&format!(" Date Range: {} to {}\n", start, end));
36 }
37
38 summary.push_str(" Available Metrics:\n");
39 for metric in &self.metrics {
40 if let Some(count) = self.file_counts.get(metric) {
41 summary.push_str(&format!(
42 " {}: {} stations ({})\n",
43 metric.display_name(),
44 count,
45 metric.units()
46 ));
47 }
48 }
49
50 summary
51 }
52
53 pub fn has_temperature_data(&self) -> bool {
54 self.metrics
55 .iter()
56 .any(|m| matches!(m, WeatherMetric::Temperature(_)))
57 }
58
59 pub fn has_complete_temperature(&self) -> bool {
60 let temp_types: HashSet<_> = self
61 .metrics
62 .iter()
63 .filter_map(|m| match m {
64 WeatherMetric::Temperature(t) => Some(t),
65 _ => None,
66 })
67 .collect();
68
69 temp_types.contains(&TemperatureType::Minimum)
70 && temp_types.contains(&TemperatureType::Maximum)
71 && temp_types.contains(&TemperatureType::Average)
72 }
73
74 pub fn get_metric_coverage(&self, metric: &WeatherMetric) -> f64 {
75 if let Some(count) = self.file_counts.get(metric) {
76 (*count as f64) / (self.station_count as f64)
77 } else {
78 0.0
79 }
80 }
81}
82
/// Stateless namespace for the archive inspection routines; all of its
/// functionality is exposed through associated functions.
pub struct ArchiveInspector;
84
85impl ArchiveInspector {
86 pub fn inspect_zip(zip_path: &Path) -> Result<ArchiveMetadata> {
87 let file = File::open(zip_path)?;
88 let mut archive = ZipArchive::new(file)?;
89
90 let (metrics, file_counts, all_station_ids) = Self::scan_data_files(&mut archive)?;
92
93 if metrics.is_empty() {
94 return Err(ProcessingError::InvalidFormat(
95 "No recognized weather data files found in archive".to_string(),
96 ));
97 }
98
99 let country = Self::extract_country(&mut archive)?;
101
102 if let Ok(element_metrics) = Self::validate_with_elements(&mut archive) {
104 for metric in &metrics {
106 if !element_metrics.contains(metric) {
107 println!(
108 "Warning: Metric {} found in files but not in elements.txt",
109 metric
110 );
111 }
112 }
113 }
114
115 let date_range = Self::estimate_date_range(&mut archive, &metrics).ok();
117
118 Ok(ArchiveMetadata {
119 country,
120 metrics,
121 station_count: all_station_ids.len(),
122 date_range,
123 file_counts,
124 total_files: archive.len(),
125 })
126 }
127
128 fn scan_data_files(archive: &mut ZipArchive<File>) -> Result<ScanResult> {
129 let mut metrics = Vec::new();
130 let mut file_counts: HashMap<WeatherMetric, usize> = HashMap::new();
131 let mut all_station_ids = HashSet::new();
132
133 for i in 0..archive.len() {
134 let file = archive.by_index(i)?;
135 let file_name = file.name();
136
137 if file_name.ends_with('/')
139 || file_name == "stations.txt"
140 || file_name == "elements.txt"
141 || file_name == "metadata.txt"
142 || file_name == "sources.txt"
143 {
144 continue;
145 }
146
147 if let Some(metric) = Self::parse_data_file_name(file_name) {
149 if !metrics.contains(&metric) {
151 metrics.push(metric.clone());
152 }
153
154 *file_counts.entry(metric).or_insert(0) += 1;
156
157 if let Some(station_id) = Self::extract_station_id_from_filename(file_name) {
159 all_station_ids.insert(station_id);
160 }
161 }
162 }
163
164 Ok((metrics, file_counts, all_station_ids))
165 }
166
167 fn parse_data_file_name(file_name: &str) -> Option<WeatherMetric> {
168 if !file_name.ends_with(".txt") {
170 return None;
171 }
172
173 let name_without_ext = &file_name[..file_name.len() - 4];
174
175 if let Some(pos) = name_without_ext.find("_STAID") {
177 let prefix = &name_without_ext[..pos];
178 WeatherMetric::from_file_prefix(prefix)
179 } else {
180 None
181 }
182 }
183
/// Extracts the numeric station id from a file name such as
/// `TX_STAID000257.txt`.
///
/// The id is the run of ASCII digits between the first `STAID` marker and
/// the next `.`. Leading zeros are accepted, so `STAID000000` yields
/// `Some(0)`.
///
/// Returns `None` when the marker or the following `.` is missing, when the
/// text between them is empty or contains anything other than ASCII digits,
/// or when the number does not fit in a `u32`.
fn extract_station_id_from_filename(file_name: &str) -> Option<u32> {
    let (_, after_marker) = file_name.split_once("STAID")?;
    let (digits, _) = after_marker.split_once('.')?;

    // `u32::from_str` would also accept a leading '+', which is not a valid id.
    if !digits.bytes().all(|byte| byte.is_ascii_digit()) {
        return None;
    }

    // Parse the digits as-is: leading zeros are fine for `u32`, and stripping
    // them first would turn an all-zero id into an empty, unparsable string.
    digits.parse().ok()
}
199
200 fn extract_country(archive: &mut ZipArchive<File>) -> Result<String> {
201 let mut stations_file = archive.by_name("stations.txt").map_err(|_| {
203 ProcessingError::InvalidFormat("stations.txt not found in archive".to_string())
204 })?;
205
206 let reader = BufReader::new(&mut stations_file);
207 let mut countries = HashSet::new();
208
209 for line_result in reader.lines() {
210 let line = line_result?;
211 let trimmed = line.trim();
212
213 if trimmed.is_empty() || trimmed.starts_with("STAID") || trimmed.starts_with("---") {
215 continue;
216 }
217
218 if trimmed.contains("EUROPEAN") || trimmed.contains("Klein Tank") {
220 continue;
221 }
222
223 let parts: Vec<&str> = line.split(',').map(|s| s.trim()).collect();
225 if parts.len() >= 3 {
226 let country_code = parts[2].trim();
228 if !country_code.is_empty() && country_code.len() == 2 {
229 countries.insert(country_code.to_string());
230 }
231 }
232 }
233
234 if countries.is_empty() {
235 return Err(ProcessingError::InvalidFormat(
236 "No valid country codes found in stations.txt".to_string(),
237 ));
238 }
239
240 if countries.len() > 1 {
241 println!("Warning: Multiple countries found: {:?}", countries);
242 }
243
244 Ok(countries.into_iter().next().unwrap())
246 }
247
248 fn validate_with_elements(archive: &mut ZipArchive<File>) -> Result<Vec<WeatherMetric>> {
249 let mut elements_file = archive.by_name("elements.txt").map_err(|_| {
250 ProcessingError::InvalidFormat("elements.txt not found in archive".to_string())
251 })?;
252
253 let reader = BufReader::new(&mut elements_file);
254 let mut element_metrics = Vec::new();
255
256 for line_result in reader.lines() {
257 let line = line_result?;
258 let trimmed = line.trim();
259
260 if trimmed.is_empty() || trimmed.starts_with("ELEID") || trimmed.starts_with("---") {
262 continue;
263 }
264
265 if trimmed.contains("EUROPEAN") || trimmed.contains("Klein Tank") {
267 continue;
268 }
269
270 let parts: Vec<&str> = line.split(',').map(|s| s.trim()).collect();
272 if let Some(element_id) = parts.first() {
273 let prefix = element_id
275 .chars()
276 .take_while(|c| c.is_alphabetic())
277 .collect::<String>();
278 if let Some(metric) = WeatherMetric::from_file_prefix(&prefix) {
279 if !element_metrics.contains(&metric) {
280 element_metrics.push(metric);
281 }
282 }
283 }
284 }
285
286 Ok(element_metrics)
287 }
288
289 fn estimate_date_range(
290 archive: &mut ZipArchive<File>,
291 _metrics: &[WeatherMetric],
292 ) -> Result<(NaiveDate, NaiveDate)> {
293 let mut min_date: Option<NaiveDate> = None;
295 let mut max_date: Option<NaiveDate> = None;
296 let mut files_sampled = 0;
297 const MAX_SAMPLE_FILES: usize = 5;
298
299 for i in 0..archive.len() {
300 if files_sampled >= MAX_SAMPLE_FILES {
301 break;
302 }
303
304 let file = archive.by_index(i)?;
305 let file_name = file.name();
306
307 if Self::parse_data_file_name(file_name).is_some() {
309 if let Ok(dates) = Self::extract_date_range_from_file(file) {
310 min_date = Some(min_date.map_or(dates.0, |d| d.min(dates.0)));
311 max_date = Some(max_date.map_or(dates.1, |d| d.max(dates.1)));
312 files_sampled += 1;
313 }
314 }
315 }
316
317 match (min_date, max_date) {
318 (Some(min), Some(max)) => Ok((min, max)),
319 _ => Err(ProcessingError::InvalidFormat(
320 "Could not determine date range from data files".to_string(),
321 )),
322 }
323 }
324
325 fn extract_date_range_from_file(
326 mut file: zip::read::ZipFile,
327 ) -> Result<(NaiveDate, NaiveDate)> {
328 let reader = BufReader::new(&mut file);
329 let mut min_date: Option<NaiveDate> = None;
330 let mut max_date: Option<NaiveDate> = None;
331 let mut lines_read = 0;
332 const MAX_LINES_TO_READ: usize = 100; for line_result in reader.lines() {
335 if lines_read >= MAX_LINES_TO_READ {
336 break;
337 }
338
339 let line = line_result?;
340 let trimmed = line.trim();
341
342 if trimmed.is_empty() || lines_read < 20 {
344 continue;
345 }
346
347 let parts: Vec<&str> = line.split(',').map(|s| s.trim()).collect();
349 if parts.len() >= 2 {
350 if let Ok(date) = NaiveDate::parse_from_str(parts[1], "%Y%m%d") {
351 min_date = Some(min_date.map_or(date, |d| d.min(date)));
352 max_date = Some(max_date.map_or(date, |d| d.max(date)));
353 lines_read += 1;
354 }
355 }
356 }
357
358 match (min_date, max_date) {
359 (Some(min), Some(max)) => Ok((min, max)),
360 _ => Err(ProcessingError::InvalidFormat(
361 "No valid dates found in data file".to_string(),
362 )),
363 }
364 }
365}
366
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;
    use zip::{CompressionMethod, ZipWriter};

    /// Builds a temporary zip archive holding `stations.txt`, `elements.txt`
    /// and three data files (TX and TN for station 257, RR for station 258).
    fn create_test_zip_with_multiple_metrics() -> Result<NamedTempFile> {
        // (entry name, entry contents), written in this order.
        const ENTRIES: [(&str, &str); 5] = [
            (
                "stations.txt",
                "EUROPEAN CLIMATE ASSESSMENT & DATASET\n\nSTAID,STANAME,CN,LAT,LON,HGHT\n257,TEST STATION,GB,+51:30:00,-000:07:00,100\n258,ANOTHER STATION,GB,+52:30:00,-001:07:00,200\n",
            ),
            (
                "elements.txt",
                "EUROPEAN CLIMATE ASSESSMENT & DATASET\n\nELEID,DESC,UNIT\nTX1,Maximum temperature,0.1 C\nTN1,Minimum temperature,0.1 C\nRR1,Precipitation,0.1 mm\n",
            ),
            (
                "TX_STAID000257.txt",
                "Header\n101,20230101,125,0\n101,20230102,130,0\n",
            ),
            (
                "TN_STAID000257.txt",
                "Header\n101,20230101,75,0\n101,20230102,80,0\n",
            ),
            (
                "RR_STAID000258.txt",
                "Header\n102,20230101,25,0\n102,20230102,30,0\n",
            ),
        ];

        let file = NamedTempFile::new()?;
        {
            let mut writer = ZipWriter::new(&file);

            for &(name, contents) in ENTRIES.iter() {
                writer.start_file(
                    name,
                    zip::write::FileOptions::default()
                        .compression_method(CompressionMethod::Stored),
                )?;
                writer.write_all(contents.as_bytes())?;
            }

            writer.finish()?;
        }

        Ok(file)
    }

    #[test]
    fn test_parse_data_file_name() {
        let cases = [
            (
                "TX_STAID000257.txt",
                Some(WeatherMetric::Temperature(TemperatureType::Maximum)),
            ),
            ("RR_STAID000258.txt", Some(WeatherMetric::Precipitation)),
            ("FG_STAID000259.txt", Some(WeatherMetric::WindSpeed)),
            ("stations.txt", None),
            ("invalid_file.txt", None),
        ];

        for (file_name, expected) in cases.iter() {
            assert_eq!(&ArchiveInspector::parse_data_file_name(file_name), expected);
        }
    }

    #[test]
    fn test_extract_station_id_from_filename() {
        let cases = [
            ("TX_STAID000257.txt", Some(257)),
            ("RR_STAID001234.txt", Some(1234)),
            ("invalid_file.txt", None),
        ];

        for (file_name, expected) in cases.iter() {
            assert_eq!(
                &ArchiveInspector::extract_station_id_from_filename(file_name),
                expected
            );
        }
    }

    #[test]
    fn test_inspect_zip() -> Result<()> {
        let test_zip = create_test_zip_with_multiple_metrics()?;
        let metadata = ArchiveInspector::inspect_zip(test_zip.path())?;

        assert_eq!(metadata.country, "GB");
        // Stations 257 and 258.
        assert_eq!(metadata.station_count, 2);
        // TX, TN and RR.
        assert_eq!(metadata.metrics.len(), 3);
        assert!(metadata.has_temperature_data());
        // The fixture has no average-temperature file.
        assert!(!metadata.has_complete_temperature());

        let max_temperature = WeatherMetric::Temperature(TemperatureType::Maximum);
        assert_eq!(metadata.file_counts.get(&max_temperature), Some(&1));
        assert_eq!(
            metadata.file_counts.get(&WeatherMetric::Precipitation),
            Some(&1)
        );

        Ok(())
    }

    #[test]
    fn test_archive_metadata_display() -> Result<()> {
        let test_zip = create_test_zip_with_multiple_metrics()?;
        let metadata = ArchiveInspector::inspect_zip(test_zip.path())?;

        let summary = metadata.display_summary();
        let expected_fragments = [
            "Country: GB",
            "Total Stations: 2",
            "Temperature (Max)",
            "Precipitation",
        ];
        for fragment in expected_fragments.iter() {
            assert!(summary.contains(fragment));
        }

        Ok(())
    }
}