oxigdal-metadata 0.1.0

Metadata standards support for OxiGDAL - ISO 19115, FGDC, INSPIRE, DataCite, DCAT
Documentation
//! Metadata extraction from geospatial datasets.
//!
//! This module provides functionality to extract metadata from various
//! geospatial file formats including GeoTIFF, NetCDF, HDF5, and STAC.

use crate::common::{BoundingBox, Keyword, TemporalExtent};
use crate::error::{MetadataError, Result};
use crate::iso19115::{DataIdentification, Iso19115Metadata};
use serde::{Deserialize, Serialize};
use std::path::Path;

/// Extracted metadata from a dataset.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct ExtractedMetadata {
    /// Dataset title
    pub title: Option<String>,
    /// Abstract/description
    pub abstract_text: Option<String>,
    /// Bounding box
    pub bbox: Option<BoundingBox>,
    /// Temporal extent
    pub temporal_extent: Option<TemporalExtent>,
    /// Coordinate reference system
    pub crs: Option<String>,
    /// Spatial resolution
    pub spatial_resolution: Option<f64>,
    /// Format
    pub format: Option<String>,
    /// Keywords
    pub keywords: Vec<String>,
    /// Additional attributes
    pub attributes: std::collections::HashMap<String, String>,
}

/// Extract metadata from a file path.
///
/// # Arguments
///
/// * `path` - Path to the geospatial file
///
/// # Returns
///
/// Extracted metadata or error.
pub fn extract_metadata<P: AsRef<Path>>(path: P) -> Result<ExtractedMetadata> {
    let path = path.as_ref();

    // Determine file type from extension
    let extension = path
        .extension()
        .and_then(|e| e.to_str())
        .ok_or_else(|| MetadataError::InvalidFormat("No file extension".to_string()))?;

    match extension.to_lowercase().as_str() {
        "tif" | "tiff" | "gtiff" => extract_from_geotiff(path),
        "nc" | "nc4" | "netcdf" => extract_from_netcdf(path),
        "h5" | "hdf5" | "he5" => extract_from_hdf5(path),
        "json" => extract_from_stac(path),
        _ => Err(MetadataError::Unsupported(format!(
            "File format not supported: {}",
            extension
        ))),
    }
}

/// Extract metadata from GeoTIFF.
fn extract_from_geotiff<P: AsRef<Path>>(path: P) -> Result<ExtractedMetadata> {
    let path_str = path.as_ref().to_string_lossy().to_string();
    let mut attributes = std::collections::HashMap::new();
    attributes.insert("file_path".to_string(), path_str);

    let metadata = ExtractedMetadata {
        format: Some("GeoTIFF".to_string()),
        attributes,
        ..Default::default()
    };

    // In a real implementation, we would use the oxigdal-geotiff crate
    // For now, we'll provide a placeholder that demonstrates the interface

    // Placeholder for GeoTIFF-specific extraction
    // This would include:
    // - Reading TIFF tags
    // - Extracting GeoTIFF tags (ModelTiepoint, ModelPixelScale, etc.)
    // - Reading GDAL metadata from TIFF tags
    // - Extracting CRS from GeoKeys

    Ok(metadata)
}

/// Extract metadata from NetCDF.
fn extract_from_netcdf<P: AsRef<Path>>(path: P) -> Result<ExtractedMetadata> {
    let path_str = path.as_ref().to_string_lossy().to_string();
    let mut attributes = std::collections::HashMap::new();
    attributes.insert("file_path".to_string(), path_str);

    let metadata = ExtractedMetadata {
        format: Some("NetCDF".to_string()),
        attributes,
        ..Default::default()
    };

    // In a real implementation, we would use the oxigdal-netcdf crate
    // and parse CF (Climate and Forecast) conventions

    // Placeholder for NetCDF-specific extraction
    // This would include:
    // - Reading global attributes (title, summary, keywords)
    // - Extracting CF convention metadata
    // - Reading coordinate variables
    // - Extracting time bounds
    // - Reading CRS from grid_mapping variable

    Ok(metadata)
}

/// Extract metadata from HDF5.
fn extract_from_hdf5<P: AsRef<Path>>(path: P) -> Result<ExtractedMetadata> {
    let path_str = path.as_ref().to_string_lossy().to_string();
    let mut attributes = std::collections::HashMap::new();
    attributes.insert("file_path".to_string(), path_str);

    let metadata = ExtractedMetadata {
        format: Some("HDF5".to_string()),
        attributes,
        ..Default::default()
    };

    // In a real implementation, we would use the oxigdal-hdf5 crate

    // Placeholder for HDF5-specific extraction
    // This would include:
    // - Reading file attributes
    // - Extracting dataset metadata
    // - Reading group structure
    // - Extracting embedded metadata standards

    Ok(metadata)
}

/// Extract metadata from STAC.
fn extract_from_stac<P: AsRef<Path>>(path: P) -> Result<ExtractedMetadata> {
    let path_str = path.as_ref().to_string_lossy().to_string();
    let mut attributes = std::collections::HashMap::new();
    attributes.insert("file_path".to_string(), path_str);

    let metadata = ExtractedMetadata {
        format: Some("STAC".to_string()),
        attributes,
        ..Default::default()
    };

    // In a real implementation, we would use the oxigdal-stac crate
    // to parse STAC Item or Collection JSON

    // Placeholder for STAC-specific extraction
    // This would include:
    // - Parsing STAC JSON
    // - Extracting bbox from geometry
    // - Reading datetime/start_datetime/end_datetime
    // - Extracting assets
    // - Reading extension metadata

    Ok(metadata)
}

/// Auto-populate ISO 19115 metadata from extracted metadata.
///
/// # Arguments
///
/// * `extracted` - Extracted metadata
///
/// # Returns
///
/// ISO 19115 metadata populated with extracted information.
pub fn to_iso19115(extracted: &ExtractedMetadata) -> Result<Iso19115Metadata> {
    let mut iso = Iso19115Metadata::default();

    // Create data identification
    let mut ident = DataIdentification::default();

    // Set title
    if let Some(ref title) = extracted.title {
        ident.citation.title = title.clone();
    } else {
        ident.citation.title = "Untitled Dataset".to_string();
    }

    // Set abstract
    if let Some(ref abstract_text) = extracted.abstract_text {
        ident.abstract_text = abstract_text.clone();
    }

    // Set extent
    if let Some(bbox) = extracted.bbox {
        ident.extent.geographic_extent = Some(bbox);
    }

    if let Some(ref temporal) = extracted.temporal_extent {
        ident.extent.temporal_extent = Some(temporal.clone());
    }

    // Set keywords
    if !extracted.keywords.is_empty() {
        ident.keywords.push(
            extracted
                .keywords
                .iter()
                .map(|k| Keyword {
                    keyword: k.clone(),
                    thesaurus: None,
                })
                .collect(),
        );
    }

    iso.identification_info.push(ident);

    // Set reference system
    if let Some(ref crs) = extracted.crs {
        use crate::iso19115::reference_system::{Identifier, ReferenceSystem};
        iso.reference_system_info.push(ReferenceSystem {
            reference_system_identifier: Some(Identifier::new(crs)),
            reference_system_type: None,
        });
    }

    Ok(iso)
}

/// Auto-populate FGDC metadata from extracted metadata.
///
/// # Arguments
///
/// * `extracted` - Extracted metadata
///
/// # Returns
///
/// FGDC metadata populated with extracted information.
pub fn to_fgdc(extracted: &ExtractedMetadata) -> Result<crate::fgdc::FgdcMetadata> {
    use crate::fgdc::*;

    let mut fgdc = FgdcMetadata::default();

    // Set title
    if let Some(ref title) = extracted.title {
        fgdc.idinfo.citation.citeinfo.title = title.clone();
    } else {
        fgdc.idinfo.citation.citeinfo.title = "Untitled Dataset".to_string();
    }

    // Set abstract
    if let Some(ref abstract_text) = extracted.abstract_text {
        fgdc.idinfo.descript.abstract_text = abstract_text.clone();
    }

    // Set bounding box
    if let Some(bbox) = extracted.bbox {
        fgdc.idinfo.spdom.bounding = bbox;
    }

    // Set keywords
    if !extracted.keywords.is_empty() {
        fgdc.idinfo.keywords.push(Keywords {
            theme: Some("General".to_string()),
            theme_key: extracted.keywords.clone(),
            place: Vec::new(),
            temporal: Vec::new(),
        });
    }

    Ok(fgdc)
}

/// Metadata extractor with configurable options.
pub struct MetadataExtractor {
    /// Whether to extract spatial metadata
    pub extract_spatial: bool,
    /// Whether to extract temporal metadata
    pub extract_temporal: bool,
    /// Whether to extract attributes
    pub extract_attributes: bool,
    /// Maximum number of keywords to extract
    pub max_keywords: usize,
}

impl Default for MetadataExtractor {
    fn default() -> Self {
        Self {
            extract_spatial: true,
            extract_temporal: true,
            extract_attributes: true,
            max_keywords: 20,
        }
    }
}

impl MetadataExtractor {
    /// Create a new extractor with default settings.
    pub fn new() -> Self {
        Self::default()
    }

    /// Set whether to extract spatial metadata.
    pub fn with_spatial(mut self, extract: bool) -> Self {
        self.extract_spatial = extract;
        self
    }

    /// Set whether to extract temporal metadata.
    pub fn with_temporal(mut self, extract: bool) -> Self {
        self.extract_temporal = extract;
        self
    }

    /// Set whether to extract attributes.
    pub fn with_attributes(mut self, extract: bool) -> Self {
        self.extract_attributes = extract;
        self
    }

    /// Set maximum number of keywords.
    pub fn with_max_keywords(mut self, max: usize) -> Self {
        self.max_keywords = max;
        self
    }

    /// Extract metadata from file.
    pub fn extract<P: AsRef<Path>>(&self, path: P) -> Result<ExtractedMetadata> {
        let mut metadata = extract_metadata(path)?;

        // Apply extractor options
        if !self.extract_spatial {
            metadata.bbox = None;
            metadata.crs = None;
            metadata.spatial_resolution = None;
        }

        if !self.extract_temporal {
            metadata.temporal_extent = None;
        }

        if !self.extract_attributes {
            metadata.attributes.clear();
        }

        // Limit keywords
        if metadata.keywords.len() > self.max_keywords {
            metadata.keywords.truncate(self.max_keywords);
        }

        Ok(metadata)
    }
}

/// Extract metadata from multiple files in batch.
///
/// # Arguments
///
/// * `paths` - Iterator of file paths
///
/// # Returns
///
/// Vector of extracted metadata results.
pub fn batch_extract<I, P>(paths: I) -> Vec<Result<ExtractedMetadata>>
where
    I: IntoIterator<Item = P>,
    P: AsRef<Path>,
{
    paths.into_iter().map(extract_metadata).collect()
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_extractor_builder() {
        let extractor = MetadataExtractor::new()
            .with_spatial(true)
            .with_temporal(false)
            .with_max_keywords(10);

        assert!(extractor.extract_spatial);
        assert!(!extractor.extract_temporal);
        assert_eq!(extractor.max_keywords, 10);
    }

    #[test]
    fn test_extracted_metadata_default() {
        let metadata = ExtractedMetadata::default();
        assert!(metadata.title.is_none());
        assert!(metadata.bbox.is_none());
        assert!(metadata.keywords.is_empty());
    }
}