use crate::common::{BoundingBox, Keyword, TemporalExtent};
use crate::error::{MetadataError, Result};
use crate::iso19115::{DataIdentification, Iso19115Metadata};
use serde::{Deserialize, Serialize};
use std::path::Path;
/// Format-independent metadata gathered from a single dataset file.
///
/// Every field is optional or may be empty; `Default` yields a record with
/// nothing populated. Values of this type are produced by
/// [`extract_metadata`] and consumed by [`to_iso19115`] and [`to_fgdc`].
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct ExtractedMetadata {
/// Dataset title. The converters in this file substitute
/// `"Untitled Dataset"` when this is `None`.
pub title: Option<String>,
/// Abstract / free-text description of the dataset.
pub abstract_text: Option<String>,
/// Bounding box of the dataset; used as the geographic extent (ISO 19115)
/// and as the spatial-domain bounding (FGDC) by the converters in this file.
pub bbox: Option<BoundingBox>,
/// Time range covered by the dataset.
pub temporal_extent: Option<TemporalExtent>,
/// Reference system identifier string; handed to `Identifier::new` when
/// converting to ISO 19115.
pub crs: Option<String>,
/// Spatial resolution of the dataset.
/// NOTE(review): units are not established anywhere in this file — confirm.
pub spatial_resolution: Option<f64>,
/// Format label. The extractors in this file set it to `"GeoTIFF"`,
/// `"NetCDF"`, `"HDF5"` or `"STAC"`.
pub format: Option<String>,
/// Keywords describing the dataset.
pub keywords: Vec<String>,
/// Free-form key/value attributes. The extractors in this file store the
/// source path under the `"file_path"` key.
pub attributes: std::collections::HashMap<String, String>,
}
/// Extracts metadata from the file at `path`, picking an extractor from the
/// file extension.
///
/// The extension is lowercased before matching. Recognized extensions are
/// `tif`/`tiff`/`gtiff` (GeoTIFF), `nc`/`nc4`/`netcdf` (NetCDF),
/// `h5`/`hdf5`/`he5` (HDF5) and `json` (STAC).
///
/// # Errors
///
/// * [`MetadataError::InvalidFormat`] when `path` has no extension, or the
///   extension is not valid UTF-8.
/// * [`MetadataError::Unsupported`] when the extension is not listed above.
/// * Whatever error the selected extractor returns.
pub fn extract_metadata<P: AsRef<Path>>(path: P) -> Result<ExtractedMetadata> {
    let path = path.as_ref();

    let extension = match path.extension().and_then(|ext| ext.to_str()) {
        Some(ext) => ext,
        None => {
            return Err(MetadataError::InvalidFormat(
                "No file extension".to_string(),
            ));
        }
    };

    // Match on a lowercased copy, but keep the original spelling for the
    // error message below.
    let normalized = extension.to_lowercase();
    match normalized.as_str() {
        "tif" | "tiff" | "gtiff" => extract_from_geotiff(path),
        "nc" | "nc4" | "netcdf" => extract_from_netcdf(path),
        "h5" | "hdf5" | "he5" => extract_from_hdf5(path),
        "json" => extract_from_stac(path),
        _ => {
            let message = format!("File format not supported: {}", extension);
            Err(MetadataError::Unsupported(message))
        }
    }
}
/// Builds the metadata record for a GeoTIFF file.
///
/// The file itself is not opened here: the result carries only the format
/// label `"GeoTIFF"` and the (lossily UTF-8 converted) path under the
/// `"file_path"` attribute. All other fields keep their defaults.
fn extract_from_geotiff<P: AsRef<Path>>(path: P) -> Result<ExtractedMetadata> {
    let location = path.as_ref().to_string_lossy().into_owned();
    let attributes: std::collections::HashMap<String, String> =
        std::iter::once(("file_path".to_string(), location)).collect();

    Ok(ExtractedMetadata {
        format: Some("GeoTIFF".to_string()),
        attributes,
        ..Default::default()
    })
}
/// Builds the metadata record for a NetCDF file.
///
/// The file itself is not opened here: the result carries only the format
/// label `"NetCDF"` and the (lossily UTF-8 converted) path under the
/// `"file_path"` attribute. All other fields keep their defaults.
fn extract_from_netcdf<P: AsRef<Path>>(path: P) -> Result<ExtractedMetadata> {
    let location = path.as_ref().to_string_lossy().into_owned();
    let attributes: std::collections::HashMap<String, String> =
        std::iter::once(("file_path".to_string(), location)).collect();

    Ok(ExtractedMetadata {
        format: Some("NetCDF".to_string()),
        attributes,
        ..Default::default()
    })
}
/// Builds the metadata record for an HDF5 file.
///
/// The file itself is not opened here: the result carries only the format
/// label `"HDF5"` and the (lossily UTF-8 converted) path under the
/// `"file_path"` attribute. All other fields keep their defaults.
fn extract_from_hdf5<P: AsRef<Path>>(path: P) -> Result<ExtractedMetadata> {
    let location = path.as_ref().to_string_lossy().into_owned();
    let attributes: std::collections::HashMap<String, String> =
        std::iter::once(("file_path".to_string(), location)).collect();

    Ok(ExtractedMetadata {
        format: Some("HDF5".to_string()),
        attributes,
        ..Default::default()
    })
}
/// Builds the metadata record for a STAC JSON file.
///
/// The file itself is not opened here: the result carries only the format
/// label `"STAC"` and the (lossily UTF-8 converted) path under the
/// `"file_path"` attribute. All other fields keep their defaults.
fn extract_from_stac<P: AsRef<Path>>(path: P) -> Result<ExtractedMetadata> {
    let location = path.as_ref().to_string_lossy().into_owned();
    let attributes: std::collections::HashMap<String, String> =
        std::iter::once(("file_path".to_string(), location)).collect();

    Ok(ExtractedMetadata {
        format: Some("STAC".to_string()),
        attributes,
        ..Default::default()
    })
}
/// Converts extracted metadata into an ISO 19115 record.
///
/// The result contains exactly one identification entry built from
/// `extracted`:
///
/// * the citation title is the extracted title, or `"Untitled Dataset"` when
///   there is none;
/// * abstract, geographic extent and temporal extent are copied only when
///   present, otherwise the defaults are left in place;
/// * a non-empty keyword list is added as a single keyword group with no
///   thesaurus.
///
/// When a CRS string is present, one reference-system entry is appended for it.
///
/// # Errors
///
/// This implementation always returns `Ok`.
pub fn to_iso19115(extracted: &ExtractedMetadata) -> Result<Iso19115Metadata> {
    let mut ident = DataIdentification::default();

    ident.citation.title = extracted
        .title
        .clone()
        .unwrap_or_else(|| "Untitled Dataset".to_string());

    if let Some(text) = &extracted.abstract_text {
        ident.abstract_text = text.clone();
    }

    if let Some(bbox) = extracted.bbox {
        ident.extent.geographic_extent = Some(bbox);
    }

    if let Some(period) = &extracted.temporal_extent {
        ident.extent.temporal_extent = Some(period.clone());
    }

    if !extracted.keywords.is_empty() {
        // All keywords go into one group.
        ident.keywords.push(
            extracted
                .keywords
                .iter()
                .cloned()
                .map(|keyword| Keyword {
                    keyword,
                    thesaurus: None,
                })
                .collect(),
        );
    }

    let mut iso = Iso19115Metadata::default();
    iso.identification_info.push(ident);

    if let Some(crs) = &extracted.crs {
        use crate::iso19115::reference_system::{Identifier, ReferenceSystem};

        let reference_system = ReferenceSystem {
            reference_system_identifier: Some(Identifier::new(crs)),
            reference_system_type: None,
        };
        iso.reference_system_info.push(reference_system);
    }

    Ok(iso)
}
/// Converts extracted metadata into an FGDC record.
///
/// * The citation title is the extracted title, or `"Untitled Dataset"` when
///   there is none.
/// * Abstract and bounding box are copied only when present; otherwise the
///   defaults are left in place.
/// * A non-empty keyword list is added as one keyword group with theme
///   `"General"` and empty place/temporal lists.
///
/// # Errors
///
/// This implementation always returns `Ok`.
pub fn to_fgdc(extracted: &ExtractedMetadata) -> Result<crate::fgdc::FgdcMetadata> {
    use crate::fgdc::*;

    let mut record = FgdcMetadata::default();

    record.idinfo.citation.citeinfo.title = extracted
        .title
        .clone()
        .unwrap_or_else(|| "Untitled Dataset".to_string());

    if let Some(text) = &extracted.abstract_text {
        record.idinfo.descript.abstract_text = text.clone();
    }

    if let Some(bbox) = extracted.bbox {
        record.idinfo.spdom.bounding = bbox;
    }

    if !extracted.keywords.is_empty() {
        let theme_group = Keywords {
            theme: Some("General".to_string()),
            theme_key: extracted.keywords.clone(),
            place: Vec::new(),
            temporal: Vec::new(),
        };
        record.idinfo.keywords.push(theme_group);
    }

    Ok(record)
}
/// Options controlling which parts of the extracted metadata are kept.
///
/// The flags are applied by [`MetadataExtractor::extract`] after the
/// underlying extraction has run: disabled categories are cleared from the
/// result and the keyword list is cut to `max_keywords` entries.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MetadataExtractor {
    /// Keep the bounding box, CRS and spatial resolution.
    pub extract_spatial: bool,
    /// Keep the temporal extent.
    pub extract_temporal: bool,
    /// Keep the free-form attribute map.
    pub extract_attributes: bool,
    /// Maximum number of keywords retained in the result.
    pub max_keywords: usize,
}

impl Default for MetadataExtractor {
    /// Returns a configuration that keeps every category and at most 20 keywords.
    fn default() -> Self {
        Self {
            extract_spatial: true,
            extract_temporal: true,
            extract_attributes: true,
            max_keywords: 20,
        }
    }
}
impl MetadataExtractor {
    /// Creates an extractor with the default configuration
    /// (see the `Default` implementation).
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Sets whether spatial fields (bounding box, CRS, spatial resolution)
    /// are kept in the result.
    ///
    /// Consumes and returns the extractor; the returned value must be used.
    #[must_use]
    pub fn with_spatial(mut self, extract: bool) -> Self {
        self.extract_spatial = extract;
        self
    }

    /// Sets whether the temporal extent is kept in the result.
    ///
    /// Consumes and returns the extractor; the returned value must be used.
    #[must_use]
    pub fn with_temporal(mut self, extract: bool) -> Self {
        self.extract_temporal = extract;
        self
    }

    /// Sets whether the free-form attribute map is kept in the result.
    ///
    /// Consumes and returns the extractor; the returned value must be used.
    #[must_use]
    pub fn with_attributes(mut self, extract: bool) -> Self {
        self.extract_attributes = extract;
        self
    }

    /// Sets the maximum number of keywords kept in the result.
    ///
    /// Consumes and returns the extractor; the returned value must be used.
    #[must_use]
    pub fn with_max_keywords(mut self, max: usize) -> Self {
        self.max_keywords = max;
        self
    }

    /// Extracts metadata from `path` and filters it according to this
    /// configuration.
    ///
    /// Extraction itself is delegated to [`extract_metadata`]; afterwards the
    /// disabled categories are cleared and the keyword list is shortened to at
    /// most `max_keywords` entries.
    ///
    /// # Errors
    ///
    /// Returns any error produced by [`extract_metadata`].
    pub fn extract<P: AsRef<Path>>(&self, path: P) -> Result<ExtractedMetadata> {
        let mut metadata = extract_metadata(path)?;

        if !self.extract_spatial {
            metadata.bbox = None;
            metadata.crs = None;
            metadata.spatial_resolution = None;
        }
        if !self.extract_temporal {
            metadata.temporal_extent = None;
        }
        if !self.extract_attributes {
            metadata.attributes.clear();
        }

        // `truncate` leaves the vector untouched when it is already short
        // enough, so no explicit length check is needed.
        metadata.keywords.truncate(self.max_keywords);

        Ok(metadata)
    }
}
pub fn batch_extract<I, P>(paths: I) -> Vec<Result<ExtractedMetadata>>
where
I: IntoIterator<Item = P>,
P: AsRef<Path>,
{
paths.into_iter().map(extract_metadata).collect()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builder setters change the options they name.
    #[test]
    fn test_extractor_builder() {
        let MetadataExtractor {
            extract_spatial,
            extract_temporal,
            max_keywords,
            ..
        } = MetadataExtractor::new()
            .with_spatial(true)
            .with_temporal(false)
            .with_max_keywords(10);

        assert!(extract_spatial);
        assert!(!extract_temporal);
        assert_eq!(max_keywords, 10);
    }

    /// A default record has no title, no bounding box and no keywords.
    #[test]
    fn test_extracted_metadata_default() {
        let ExtractedMetadata {
            title,
            bbox,
            keywords,
            ..
        } = ExtractedMetadata::default();

        assert!(title.is_none());
        assert!(bbox.is_none());
        assert!(keywords.is_empty());
    }
}