Skip to main content

gtars_refget/store/
fhr_metadata.rs

//! FAIR Headers Reference genome (FHR) metadata for sequence collections.
//!
//! This module contains the FHR data types, sidecar JSON I/O functions,
//! and RefgetStore bridge methods for managing FHR metadata.
//!
//! See: <https://github.com/FAIR-bioHeaders/FHR-Specification>
7
8use super::*;
9use super::readonly::ReadonlyRefgetStore;
10use super::core::RefgetStore;
11
12use std::collections::HashMap;
13use std::fs;
14use std::path::Path;
15
16use anyhow::{Context, Result};
17use serde::{Deserialize, Serialize};
18
19use crate::hashkeyable::{DigestKey, HashKeyable, key_to_digest_string};
20
21// ============================================================================
22// Types
23// ============================================================================
24
25/// FAIR Headers Reference genome (FHR) metadata for a sequence collection.
26///
27/// All fields are optional to allow partial metadata. RefgetStore does not
28/// enforce FHR schema compliance -- that's the user's responsibility.
29#[derive(Clone, Debug, Serialize, Deserialize, Default)]
30#[serde(rename_all = "camelCase")]
31pub struct FhrMetadata {
32    /// URL to the FHR JSON schema
33    #[serde(default, skip_serializing_if = "Option::is_none")]
34    pub schema: Option<String>,
35
36    /// FHR schema version (numeric per spec, e.g. 1 or 1.0)
37    #[serde(default, skip_serializing_if = "Option::is_none")]
38    pub schema_version: Option<serde_json::Number>,
39
40    /// Genome name (e.g., "Homo sapiens")
41    #[serde(default, skip_serializing_if = "Option::is_none")]
42    pub genome: Option<String>,
43
44    /// Taxonomy information
45    #[serde(default, skip_serializing_if = "Option::is_none")]
46    pub taxon: Option<FhrTaxon>,
47
48    /// Genome version (e.g., "GRCh38.p14")
49    #[serde(default, skip_serializing_if = "Option::is_none")]
50    pub version: Option<String>,
51
52    /// Who created the metadata (ORCID URIs)
53    #[serde(default, skip_serializing_if = "Option::is_none")]
54    pub metadata_author: Option<Vec<FhrAuthor>>,
55
56    /// Who assembled the genome
57    #[serde(default, skip_serializing_if = "Option::is_none")]
58    pub assembly_author: Option<Vec<FhrAuthor>>,
59
60    /// Assembly creation date (ISO 8601)
61    #[serde(default, skip_serializing_if = "Option::is_none")]
62    pub date_created: Option<String>,
63
64    /// Description of the physical sample
65    #[serde(default, skip_serializing_if = "Option::is_none")]
66    pub voucher_specimen: Option<String>,
67
68    /// Masking type
69    #[serde(default, skip_serializing_if = "Option::is_none")]
70    pub masking: Option<String>,
71
72    /// File-level checksum (SHA2-512/256 per FHR spec)
73    #[serde(default, skip_serializing_if = "Option::is_none")]
74    pub checksum: Option<String>,
75
76    /// Alternative common names for this genome
77    #[serde(default, skip_serializing_if = "Option::is_none")]
78    pub genome_synonym: Option<Vec<String>>,
79
80    /// Database accession identifier (single object per spec)
81    #[serde(
82        default,
83        skip_serializing_if = "Option::is_none",
84        rename = "accessionID"
85    )]
86    pub accession_id: Option<FhrIdentifier>,
87
88    /// Sequencing instruments used
89    #[serde(default, skip_serializing_if = "Option::is_none")]
90    pub instrument: Option<Vec<String>>,
91
92    /// DOI or scholarly article reference (single string per spec)
93    #[serde(default, skip_serializing_if = "Option::is_none")]
94    pub scholarly_article: Option<String>,
95
96    /// Documentation about the genome
97    #[serde(default, skip_serializing_if = "Option::is_none")]
98    pub documentation: Option<String>,
99
100    /// Identifiers of the genome (namespace:value format)
101    #[serde(default, skip_serializing_if = "Option::is_none")]
102    pub identifier: Option<Vec<String>>,
103
104    /// License information
105    #[serde(default, skip_serializing_if = "Option::is_none")]
106    pub license: Option<String>,
107
108    /// Related URLs
109    #[serde(default, skip_serializing_if = "Option::is_none")]
110    pub related_link: Option<Vec<String>>,
111
112    /// Funding information (single string per spec)
113    #[serde(default, skip_serializing_if = "Option::is_none")]
114    pub funding: Option<String>,
115
116    /// General statistics about the genome assembly
117    #[serde(default, skip_serializing_if = "Option::is_none")]
118    pub vital_stats: Option<FhrVitalStats>,
119
120    /// Seqcol digest (added by RefgetStore, not part of FHR 1.0)
121    #[serde(skip)]
122    pub seqcol_digest: Option<String>,
123
124    /// Catch-all for any other FHR fields or custom extensions
125    #[serde(flatten)]
126    pub extra: HashMap<String, serde_json::Value>,
127}
128
129/// General statistics about a genome assembly.
130#[derive(Clone, Debug, Serialize, Deserialize, Default)]
131#[serde(rename_all = "camelCase")]
132pub struct FhrVitalStats {
133    #[serde(default, skip_serializing_if = "Option::is_none", rename = "L50")]
134    pub l50: Option<i64>,
135    #[serde(default, skip_serializing_if = "Option::is_none", rename = "N50")]
136    pub n50: Option<i64>,
137    #[serde(default, skip_serializing_if = "Option::is_none", rename = "L90")]
138    pub l90: Option<i64>,
139    #[serde(default, skip_serializing_if = "Option::is_none")]
140    pub total_base_pairs: Option<i64>,
141    #[serde(default, skip_serializing_if = "Option::is_none")]
142    pub number_contigs: Option<i64>,
143    #[serde(default, skip_serializing_if = "Option::is_none")]
144    pub number_scaffolds: Option<i64>,
145    #[serde(default, skip_serializing_if = "Option::is_none")]
146    pub read_technology: Option<String>,
147}
148
149#[derive(Clone, Debug, Serialize, Deserialize)]
150pub struct FhrTaxon {
151    #[serde(default, skip_serializing_if = "Option::is_none")]
152    pub name: Option<String>,
153    #[serde(default, skip_serializing_if = "Option::is_none")]
154    pub uri: Option<String>,
155}
156
157#[derive(Clone, Debug, Serialize, Deserialize)]
158pub struct FhrAuthor {
159    #[serde(default, skip_serializing_if = "Option::is_none")]
160    pub name: Option<String>,
161    #[serde(default, skip_serializing_if = "Option::is_none")]
162    pub uri: Option<String>,
163}
164
165#[derive(Clone, Debug, Serialize, Deserialize)]
166pub struct FhrIdentifier {
167    #[serde(default, skip_serializing_if = "Option::is_none")]
168    pub name: Option<String>,
169    #[serde(default, skip_serializing_if = "Option::is_none")]
170    pub url: Option<String>,
171}
172
173// ============================================================================
174// Disk I/O helpers -- called by RefgetStore, no dependency on it
175// ============================================================================
176
/// File-name suffix shared by all FHR sidecar files; a sidecar is named
/// `<collection digest>` followed by this suffix.
pub(crate) const SIDECAR_EXTENSION: &str = ".fhr.json";
178
179/// Load all FHR sidecar files from the FHR directory.
180///
181/// Scans for `*.fhr.json` files, parses each one, and returns a map
182/// keyed by collection digest. Malformed files are skipped with a warning to stderr.
183pub fn load_sidecars(fhr_dir: &Path) -> HashMap<DigestKey, FhrMetadata> {
184    let mut map = HashMap::new();
185    if !fhr_dir.exists() {
186        return map;
187    }
188    let entries = match fs::read_dir(fhr_dir) {
189        Ok(e) => e,
190        Err(e) => {
191            eprintln!(
192                "Warning: could not read FHR sidecar directory {}: {}",
193                fhr_dir.display(),
194                e
195            );
196            return map;
197        }
198    };
199    for entry in entries.flatten() {
200        let path = entry.path();
201        if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
202            if name.ends_with(SIDECAR_EXTENSION) {
203                let digest_str = &name[..name.len() - SIDECAR_EXTENSION.len()];
204                let key = digest_str.to_key();
205                match fs::read_to_string(&path) {
206                    Ok(json) => {
207                        match serde_json::from_str::<FhrMetadata>(&json) {
208                            Ok(fhr) => {
209                                map.insert(key, fhr);
210                            }
211                            Err(e) => {
212                                eprintln!(
213                                    "Warning: skipping malformed FHR sidecar {}: {}",
214                                    path.display(),
215                                    e
216                                );
217                            }
218                        }
219                    }
220                    Err(e) => {
221                        eprintln!(
222                            "Warning: could not read FHR sidecar {}: {}",
223                            path.display(),
224                            e
225                        );
226                    }
227                }
228            }
229        }
230    }
231    map
232}
233
234/// Write all FHR sidecar files to the FHR directory.
235pub fn write_sidecars(
236    fhr_dir: &Path,
237    metadata: &HashMap<DigestKey, FhrMetadata>,
238) -> Result<()> {
239    for (key, fhr) in metadata {
240        let digest_str = key_to_digest_string(key);
241        let path = fhr_dir.join(format!("{}{}", digest_str, SIDECAR_EXTENSION));
242        write_sidecar(&path, fhr)?;
243    }
244    Ok(())
245}
246
247/// Write a single FHR sidecar JSON file.
248pub fn write_sidecar(path: &Path, metadata: &FhrMetadata) -> Result<()> {
249    if let Some(parent) = path.parent() {
250        fs::create_dir_all(parent)?;
251    }
252    let json = serde_json::to_string_pretty(metadata)?;
253    fs::write(path, json)?;
254    Ok(())
255}
256
257/// Remove a single FHR sidecar file (if it exists). Returns quietly if missing.
258pub fn remove_sidecar(fhr_dir: &Path, digest_str: &str) {
259    let path = fhr_dir.join(format!("{}{}", digest_str, SIDECAR_EXTENSION));
260    let _ = fs::remove_file(path);
261}
262
263/// Build the sidecar file path for a given digest.
264pub fn sidecar_path(fhr_dir: &Path, digest_str: &str) -> std::path::PathBuf {
265    fhr_dir.join(format!("{}{}", digest_str, SIDECAR_EXTENSION))
266}
267
268/// Load FHR metadata from a standalone JSON file.
269pub fn load_from_json(path: &str) -> Result<FhrMetadata> {
270    let json = fs::read_to_string(path)
271        .context(format!("Failed to read FHR metadata from {}", path))?;
272    serde_json::from_str(&json).context("Failed to parse FHR JSON")
273}
274
275// ============================================================================
276// ReadonlyRefgetStore FHR bridge methods
277// ============================================================================
278
279impl ReadonlyRefgetStore {
280    /// Set FHR metadata for a collection.
281    pub fn set_fhr_metadata(
282        &mut self,
283        collection_digest: &str,
284        metadata: FhrMetadata,
285    ) -> Result<()> {
286        let key = collection_digest.to_key();
287        if !self.collections.contains_key(&key) {
288            return Err(anyhow::anyhow!("Collection not found: {}", collection_digest));
289        }
290        if self.persist_to_disk {
291            if let Some(ref local_path) = self.local_path {
292                let path = sidecar_path(
293                    &local_path.join("fhr"),
294                    collection_digest,
295                );
296                write_sidecar(&path, &metadata)?;
297            }
298        }
299        self.fhr_metadata.insert(key, metadata);
300        Ok(())
301    }
302
303    /// Get FHR metadata for a collection. Returns None if missing.
304    pub fn get_fhr_metadata(&self, collection_digest: &str) -> Option<&FhrMetadata> {
305        let key = collection_digest.to_key();
306        self.fhr_metadata.get(&key)
307    }
308
309    /// Remove FHR metadata for a collection.
310    pub fn remove_fhr_metadata(&mut self, collection_digest: &str) -> bool {
311        let key = collection_digest.to_key();
312        if self.persist_to_disk {
313            if let Some(ref local_path) = self.local_path {
314                remove_sidecar(
315                    &local_path.join("fhr"),
316                    collection_digest,
317                );
318            }
319        }
320        self.fhr_metadata.remove(&key).is_some()
321    }
322
323    /// List all collection digests that have FHR metadata.
324    pub fn list_fhr_metadata(&self) -> Vec<String> {
325        self.fhr_metadata
326            .keys()
327            .map(|key| key_to_digest_string(key))
328            .collect()
329    }
330
331    /// Load FHR metadata from a JSON file and attach it to a collection.
332    pub fn load_fhr_metadata(&mut self, collection_digest: &str, path: &str) -> Result<()> {
333        let metadata = load_from_json(path)?;
334        self.set_fhr_metadata(collection_digest, metadata)
335    }
336}
337
338// ============================================================================
339// RefgetStore FHR wrapper delegates
340// ============================================================================
341
342impl RefgetStore {
343    /// Set FHR metadata for a collection.
344    pub fn set_fhr_metadata(&mut self, collection_digest: &str, metadata: FhrMetadata) -> Result<()> {
345        self.inner.set_fhr_metadata(collection_digest, metadata)
346    }
347
348    /// Remove FHR metadata for a collection.
349    pub fn remove_fhr_metadata(&mut self, collection_digest: &str) -> bool {
350        self.inner.remove_fhr_metadata(collection_digest)
351    }
352
353    /// Load FHR metadata from a JSON file.
354    pub fn load_fhr_metadata(&mut self, collection_digest: &str, path: &str) -> Result<()> {
355        self.inner.load_fhr_metadata(collection_digest, path)
356    }
357
358    /// Pull FHR metadata sidecars from the remote store.
359    ///
360    /// If `digest` is Some, pulls only that collection's FHR.
361    /// If `digest` is None, pulls FHR for all known collections.
362    pub fn pull_fhr(
363        &mut self,
364        digest: Option<&str>,
365        strategy: SyncStrategy,
366    ) -> Result<PullResult> {
367        let mut result = PullResult::default();
368
369        let digests: Vec<String> = match digest {
370            Some(d) => vec![d.to_string()],
371            None => self
372                .inner
373                .collections
374                .values()
375                .map(|r| r.metadata().digest.to_string())
376                .collect(),
377        };
378
379        for digest_str in &digests {
380            let relative_path = format!("fhr/{}.fhr.json", digest_str);
381
382            match strategy {
383                SyncStrategy::KeepOurs => {
384                    let was_local = self
385                        .inner
386                        .local_path
387                        .as_ref()
388                        .map(|p| p.join(&relative_path).exists())
389                        .unwrap_or(false);
390                    match ReadonlyRefgetStore::fetch_file(
391                        &self.inner.local_path,
392                        &self.inner.remote_source,
393                        &relative_path,
394                        self.inner.persist_to_disk,
395                        false,
396                    ) {
397                        Ok(data) => {
398                            if was_local {
399                                result.skipped += 1;
400                            } else {
401                                if let Ok(fhr) = serde_json::from_slice::<FhrMetadata>(&data) {
402                                    let key = digest_str.to_key();
403                                    self.inner.fhr_metadata.insert(key, fhr);
404                                }
405                                result.pulled += 1;
406                            }
407                        }
408                        Err(_) => {
409                            result.not_found += 1;
410                        }
411                    }
412                }
413                SyncStrategy::KeepTheirs => {
414                    match ReadonlyRefgetStore::fetch_file(
415                        &self.inner.local_path,
416                        &self.inner.remote_source,
417                        &relative_path,
418                        self.inner.persist_to_disk,
419                        true,
420                    ) {
421                        Ok(data) => {
422                            if let Ok(fhr) = serde_json::from_slice::<FhrMetadata>(&data) {
423                                let key = digest_str.to_key();
424                                self.inner.fhr_metadata.insert(key, fhr);
425                            }
426                            result.pulled += 1;
427                        }
428                        Err(_) => {
429                            result.not_found += 1;
430                        }
431                    }
432                }
433                SyncStrategy::Notify => {
434                    let local_exists = self
435                        .inner
436                        .local_path
437                        .as_ref()
438                        .map(|p| p.join(&relative_path).exists())
439                        .unwrap_or(false);
440
441                    if local_exists {
442                        match ReadonlyRefgetStore::fetch_file(
443                            &None,
444                            &self.inner.remote_source,
445                            &relative_path,
446                            false,
447                            false,
448                        ) {
449                            Ok(remote_data) => {
450                                let local_path = self
451                                    .inner
452                                    .local_path
453                                    .as_ref()
454                                    .unwrap()
455                                    .join(&relative_path);
456                                let local_data = fs::read(&local_path)?;
457                                if local_data != remote_data {
458                                    result.conflicts.push(relative_path);
459                                } else {
460                                    result.skipped += 1;
461                                }
462                            }
463                            Err(_) => {
464                                result.not_found += 1;
465                            }
466                        }
467                    } else {
468                        match ReadonlyRefgetStore::fetch_file(
469                            &None,
470                            &self.inner.remote_source,
471                            &relative_path,
472                            false,
473                            false,
474                        ) {
475                            Ok(_) => {
476                                result.conflicts.push(relative_path);
477                            }
478                            Err(_) => {
479                                result.not_found += 1;
480                            }
481                        }
482                    }
483                }
484            }
485        }
486
487        Ok(result)
488    }
489}
490
491// ============================================================================
492// Tests -- serialization, roundtripping, disk I/O, and store integration
493// ============================================================================
494
495#[cfg(test)]
496mod tests {
497    use super::*;
498    use tempfile::tempdir;
499
500    #[test]
501    fn test_json_roundtrip() {
502        let fhr = FhrMetadata {
503            schema: Some("https://raw.githubusercontent.com/FAIR-bioHeaders/FHR-Specification/main/fhr.json".to_string()),
504            schema_version: Some(serde_json::Number::from_f64(1.0).unwrap()),
505            genome: Some("Homo sapiens".to_string()),
506            taxon: Some(FhrTaxon {
507                name: Some("Homo sapiens".to_string()),
508                uri: Some("https://identifiers.org/taxonomy:9606".to_string()),
509            }),
510            version: Some("GRCh38.p14".to_string()),
511            masking: Some("soft-masked".to_string()),
512            genome_synonym: Some(vec!["hg38".to_string()]),
513            scholarly_article: Some("10.1371/journal.pntd.0008755".to_string()),
514            funding: Some("NIH R01".to_string()),
515            accession_id: Some(FhrIdentifier {
516                name: Some("GCA_000001405.29".to_string()),
517                url: Some("https://www.ncbi.nlm.nih.gov/assembly/GCA_000001405.29".to_string()),
518            }),
519            ..Default::default()
520        };
521
522        let json = serde_json::to_string_pretty(&fhr).unwrap();
523        let roundtripped: FhrMetadata = serde_json::from_str(&json).unwrap();
524        assert_eq!(roundtripped.genome, fhr.genome);
525        assert_eq!(roundtripped.taxon.as_ref().unwrap().name, fhr.taxon.as_ref().unwrap().name);
526        assert_eq!(roundtripped.genome_synonym, fhr.genome_synonym);
527        assert_eq!(roundtripped.scholarly_article, Some("10.1371/journal.pntd.0008755".to_string()));
528        assert_eq!(roundtripped.funding, Some("NIH R01".to_string()));
529        assert!(roundtripped.accession_id.is_some());
530    }
531
532    #[test]
533    fn test_extra_fields_preserved() {
534        let json = r#"{
535            "genome": "Test",
536            "customField": "custom_value",
537            "anotherCustom": [1, 2, 3]
538        }"#;
539        let fhr: FhrMetadata = serde_json::from_str(json).unwrap();
540        assert_eq!(fhr.genome, Some("Test".to_string()));
541        assert!(fhr.extra.contains_key("customField"));
542
543        let json_out = serde_json::to_string(&fhr).unwrap();
544        assert!(json_out.contains("customField"));
545        assert!(json_out.contains("custom_value"));
546    }
547
548    #[test]
549    fn test_camel_case_serialization() {
550        let fhr = FhrMetadata {
551            schema_version: Some(serde_json::Number::from_f64(1.0).unwrap()),
552            genome_synonym: Some(vec!["hg38".to_string()]),
553            date_created: Some("2024-01-01".to_string()),
554            ..Default::default()
555        };
556        let json = serde_json::to_string(&fhr).unwrap();
557        assert!(json.contains("schemaVersion"));
558        assert!(json.contains("genomeSynonym"));
559        assert!(json.contains("dateCreated"));
560        assert!(!json.contains("schema_version"));
561        assert!(!json.contains("genome_synonym"));
562    }
563
564    #[test]
565    fn test_default_is_empty() {
566        let fhr = FhrMetadata::default();
567        let json = serde_json::to_string(&fhr).unwrap();
568        assert_eq!(json, "{}");
569    }
570
571    #[test]
572    fn test_write_and_load_sidecar() {
573        let dir = tempdir().unwrap();
574        let path = dir.path().join("test.fhr.json");
575
576        let fhr = FhrMetadata {
577            genome: Some("Test".to_string()),
578            version: Some("1.0".to_string()),
579            ..Default::default()
580        };
581
582        write_sidecar(&path, &fhr).unwrap();
583        assert!(path.exists());
584
585        let loaded = load_from_json(path.to_str().unwrap()).unwrap();
586        assert_eq!(loaded.genome, Some("Test".to_string()));
587        assert_eq!(loaded.version, Some("1.0".to_string()));
588    }
589
590    #[test]
591    fn test_load_sidecars_empty_dir() {
592        let dir = tempdir().unwrap();
593        let map = load_sidecars(dir.path());
594        assert!(map.is_empty());
595    }
596
597    #[test]
598    fn test_load_sidecars_nonexistent_dir() {
599        let map = load_sidecars(Path::new("/nonexistent/path"));
600        assert!(map.is_empty());
601    }
602
603    #[test]
604    fn test_remove_sidecar_missing_is_ok() {
605        let dir = tempdir().unwrap();
606        remove_sidecar(dir.path(), "nonexistent_digest");
607    }
608
609    #[test]
610    fn test_accession_id_casing() {
611        let fhr = FhrMetadata {
612            accession_id: Some(FhrIdentifier {
613                name: Some("GCA_000001405.29".to_string()),
614                url: Some("https://ncbi.nlm.nih.gov".to_string()),
615            }),
616            ..Default::default()
617        };
618        let json = serde_json::to_string(&fhr).unwrap();
619        assert!(json.contains("accessionID"));
620        assert!(!json.contains("accessionId"));
621    }
622
623    #[test]
624    fn test_schema_version_as_number() {
625        let json = r#"{"schemaVersion": 1}"#;
626        let fhr: FhrMetadata = serde_json::from_str(json).unwrap();
627        assert!(fhr.schema_version.is_some());
628        let ver = fhr.schema_version.unwrap();
629        assert_eq!(ver.to_string(), "1");
630
631        let json = r#"{"schemaVersion": 1.0}"#;
632        let fhr: FhrMetadata = serde_json::from_str(json).unwrap();
633        assert!(fhr.schema_version.is_some());
634        let ver = fhr.schema_version.unwrap();
635        assert_eq!(ver.to_string(), "1.0");
636    }
637
638    #[test]
639    fn test_vital_stats_roundtrip() {
640        let fhr = FhrMetadata {
641            vital_stats: Some(FhrVitalStats {
642                l50: Some(42),
643                n50: Some(1_000_000),
644                l90: Some(100),
645                total_base_pairs: Some(3_000_000_000),
646                number_contigs: Some(500),
647                number_scaffolds: Some(24),
648                read_technology: Some("hifi".to_string()),
649            }),
650            ..Default::default()
651        };
652        let json = serde_json::to_string_pretty(&fhr).unwrap();
653        assert!(json.contains("\"L50\""));
654        assert!(json.contains("\"N50\""));
655        assert!(json.contains("\"L90\""));
656        assert!(json.contains("\"totalBasePairs\""));
657        assert!(json.contains("\"numberContigs\""));
658        let roundtripped: FhrMetadata = serde_json::from_str(&json).unwrap();
659        let stats = roundtripped.vital_stats.unwrap();
660        assert_eq!(stats.l50, Some(42));
661        assert_eq!(stats.n50, Some(1_000_000));
662        assert_eq!(stats.read_technology, Some("hifi".to_string()));
663    }
664
665    #[test]
666    fn test_spec_example_roundtrip() {
667        let json = r#"{
668            "schema":"https://raw.githubusercontent.com/FAIR-bioHeaders/FHR-Specification/main/fhr.jso",
669            "schemaVersion": 1.0,
670            "taxon": {"name":"Bombas huntii", "uri": "https://identifiers.org/taxonomy:9606"},
671            "genome": "Bombas huntii",
672            "genomeSynonym": ["B. huntii"],
673            "version": "0.0.1",
674            "metadataAuthor": [{"name":"Adam Wright", "uri":"https://orcid.org/0000-0002-5719-4024"}],
675            "assemblyAuthor": [{"name":"David Molik", "url":"https://orcid.org/0000-0003-3192-6538"}],
676            "dateCreated":"2022-03-21",
677            "accessionID": {"name":"PBARC", "url":"https://www.ars.usda.gov/pacific-west-area/hilo-hi/daniel-k-inouye-us-pacific-basin-agricultural-research-center/"},
678            "instrument": ["Sequel IIe", "Nanopore"],
679            "voucherSpecimen":"Located in Freezer 33, Drawer 137",
680            "scholarlyArticle":"10.1371/journal.pntd.0008755",
681            "assemblySoftware":"HiFiASM",
682            "funding":"funding",
683            "reuseConditions":"public domain",
684            "documentation":"Built assembly from... ",
685            "masking":"soft-masked",
686            "identifier": ["beetlebase:TC010103"],
687            "relatedLink": ["http://wfleabase.org/genome/Daphnia_pulex/dpulex_jgi060905/fasta/"],
688            "checksum":"md5:7582b26fcb0a9775b87c38f836e97c42"
689        }"#;
690        let fhr: FhrMetadata = serde_json::from_str(json).unwrap();
691        assert_eq!(fhr.genome, Some("Bombas huntii".to_string()));
692        assert_eq!(fhr.voucher_specimen, Some("Located in Freezer 33, Drawer 137".to_string()));
693        assert_eq!(fhr.documentation, Some("Built assembly from... ".to_string()));
694        assert_eq!(fhr.scholarly_article, Some("10.1371/journal.pntd.0008755".to_string()));
695        assert_eq!(fhr.funding, Some("funding".to_string()));
696        assert_eq!(fhr.identifier, Some(vec!["beetlebase:TC010103".to_string()]));
697        assert!(fhr.accession_id.is_some());
698        assert_eq!(fhr.accession_id.as_ref().unwrap().name, Some("PBARC".to_string()));
699        assert!(fhr.extra.contains_key("assemblySoftware"));
700        assert!(fhr.extra.contains_key("reuseConditions"));
701    }
702
703    #[test]
704    fn test_seqcol_digest_skipped_in_json() {
705        let mut fhr = FhrMetadata {
706            genome: Some("Test".to_string()),
707            ..Default::default()
708        };
709        fhr.seqcol_digest = Some("abc123".to_string());
710        let json = serde_json::to_string(&fhr).unwrap();
711        assert!(!json.contains("seqcolDigest"));
712        assert!(!json.contains("seqcol_digest"));
713        assert!(!json.contains("abc123"));
714    }
715
716    #[test]
717    fn test_new_fields_present() {
718        let fhr = FhrMetadata {
719            voucher_specimen: Some("Freezer 33".to_string()),
720            documentation: Some("Assembly notes".to_string()),
721            identifier: Some(vec!["ncbi:GCA_000001405".to_string()]),
722            ..Default::default()
723        };
724        let json = serde_json::to_string(&fhr).unwrap();
725        assert!(json.contains("voucherSpecimen"));
726        assert!(json.contains("documentation"));
727        assert!(json.contains("identifier"));
728    }
729
730    #[test]
731    fn test_load_sidecars_skips_malformed_json() {
732        let dir = tempdir().unwrap();
733        let bad_path = dir.path().join("baddigest.fhr.json");
734        fs::write(&bad_path, "{ not valid json }").unwrap();
735        let map = load_sidecars(dir.path());
736        assert!(map.is_empty());
737    }
738
739    #[test]
740    fn test_load_sidecars_loads_valid_skips_invalid() {
741        let dir = tempdir().unwrap();
742
743        let valid_fhr = FhrMetadata {
744            genome: Some("ValidGenome".to_string()),
745            ..Default::default()
746        };
747        write_sidecar(&dir.path().join("validdigest.fhr.json"), &valid_fhr).unwrap();
748
749        fs::write(dir.path().join("baddigest.fhr.json"), "not json at all").unwrap();
750
751        let map = load_sidecars(dir.path());
752        assert_eq!(map.len(), 1);
753    }
754
755    // =========================================================================
756    // Store-level FHR integration tests
757    // =========================================================================
758
759    #[test]
760    fn test_fhr_metadata_empty_by_default() {
761        let mut store = RefgetStore::in_memory();
762
763        let (meta, _) = store
764            .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa", FastaImportOptions::new())
765            .unwrap();
766
767        assert!(store.get_fhr_metadata(&meta.digest).is_none());
768        assert!(store.list_fhr_metadata().is_empty());
769    }
770
771    #[test]
772    fn test_fhr_metadata_set_get() {
773        let mut store = RefgetStore::in_memory();
774        let (meta, _) = store
775            .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa", FastaImportOptions::new())
776            .unwrap();
777
778        let mut fhr = FhrMetadata::default();
779        fhr.genome = Some("Test genome".to_string());
780        fhr.version = Some("1.0".to_string());
781        fhr.masking = Some("not-masked".to_string());
782
783        store.set_fhr_metadata(&meta.digest, fhr.clone()).unwrap();
784
785        let retrieved = store.get_fhr_metadata(&meta.digest).unwrap();
786        assert_eq!(retrieved.genome, Some("Test genome".to_string()));
787        assert_eq!(retrieved.version, Some("1.0".to_string()));
788    }
789
790    #[test]
791    fn test_fhr_metadata_nonexistent_collection() {
792        let mut store = RefgetStore::in_memory();
793        let fhr = FhrMetadata::default();
794        assert!(store.set_fhr_metadata("nonexistent_digest", fhr).is_err());
795    }
796
797    #[test]
798    fn test_fhr_metadata_remove() {
799        let mut store = RefgetStore::in_memory();
800        let (meta, _) = store
801            .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa", FastaImportOptions::new())
802            .unwrap();
803
804        let fhr = FhrMetadata {
805            genome: Some("Test".to_string()),
806            ..Default::default()
807        };
808        store.set_fhr_metadata(&meta.digest, fhr).unwrap();
809
810        assert!(store.get_fhr_metadata(&meta.digest).is_some());
811        assert!(store.remove_fhr_metadata(&meta.digest));
812        assert!(store.get_fhr_metadata(&meta.digest).is_none());
813    }
814
815    #[test]
816    fn test_fhr_metadata_persistence() {
817        let dir = tempdir().unwrap();
818        let store_path = dir.path().join("store");
819        let digest: String;
820
821        {
822            let mut store = RefgetStore::on_disk(&store_path).unwrap();
823            let (meta, _) = store
824                .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa", FastaImportOptions::new())
825                .unwrap();
826            digest = meta.digest.clone();
827
828            let fhr = FhrMetadata {
829                genome: Some("Homo sapiens".to_string()),
830                version: Some("GRCh38".to_string()),
831                masking: Some("soft-masked".to_string()),
832                ..Default::default()
833            };
834            store.set_fhr_metadata(&digest, fhr).unwrap();
835        }
836
837        {
838            let store = RefgetStore::open_local(&store_path).unwrap();
839            let fhr = store.get_fhr_metadata(&digest).unwrap();
840            assert_eq!(fhr.genome, Some("Homo sapiens".to_string()));
841            assert_eq!(fhr.version, Some("GRCh38".to_string()));
842            assert_eq!(fhr.masking, Some("soft-masked".to_string()));
843        }
844    }
845
846    #[test]
847    fn test_fhr_list() {
848        let mut store = RefgetStore::in_memory();
849        assert!(store.list_fhr_metadata().is_empty());
850
851        let (meta, _) = store
852            .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa", FastaImportOptions::new())
853            .unwrap();
854        let fhr = FhrMetadata {
855            genome: Some("Test".to_string()),
856            ..Default::default()
857        };
858        store.set_fhr_metadata(&meta.digest, fhr).unwrap();
859
860        let list = store.list_fhr_metadata();
861        assert_eq!(list.len(), 1);
862        assert!(list.contains(&meta.digest));
863    }
864
865    #[test]
866    fn test_remove_collection_cleans_up_fhr_metadata() {
867        let dir = tempdir().unwrap();
868        let fasta = dir.path().join("test.fa");
869        std::fs::write(&fasta, ">chr1\nACGT\n").unwrap();
870
871        let mut store = RefgetStore::in_memory();
872        let (meta, _) = store
873            .add_sequence_collection_from_fasta(&fasta, FastaImportOptions::new())
874            .unwrap();
875        let digest = meta.digest;
876
877        let fhr = FhrMetadata::default();
878        store.set_fhr_metadata(&digest, fhr).unwrap();
879        assert!(store.get_fhr_metadata(&digest).is_some());
880
881        store.remove_collection(&digest, false).unwrap();
882
883        assert!(store.get_fhr_metadata(&digest).is_none());
884    }
885
886    // -----------------------------------------------------------------------
887    // KeepOurs sync strategy tests (regression test for was_local ordering bug)
888    // -----------------------------------------------------------------------
889
890    /// Spin up a minimal HTTP server serving files from `serve_dir`.
891    /// Returns `(base_url, shutdown_fn)`.
892    fn start_file_server(serve_dir: std::path::PathBuf) -> (String, impl FnOnce()) {
893        use std::io::{Read as _, Write as _};
894        use std::net::TcpListener;
895        use std::sync::{Arc, atomic::{AtomicBool, Ordering}};
896
897        let listener = TcpListener::bind("127.0.0.1:0").expect("bind");
898        let port = listener.local_addr().unwrap().port();
899        let base_url = format!("http://127.0.0.1:{}", port);
900        let stop = Arc::new(AtomicBool::new(false));
901        let stop_clone = Arc::clone(&stop);
902
903        std::thread::spawn(move || {
904            listener.set_nonblocking(false).ok();
905            while !stop_clone.load(Ordering::Relaxed) {
906                match listener.accept() {
907                    Ok((mut stream, _)) => {
908                        let mut buf = [0u8; 4096];
909                        let n = stream.read(&mut buf).unwrap_or(0);
910                        let request = std::str::from_utf8(&buf[..n]).unwrap_or("");
911                        let path = request
912                            .lines()
913                            .next()
914                            .and_then(|l| l.split_whitespace().nth(1))
915                            .unwrap_or("/");
916                        let rel = path.trim_start_matches('/');
917                        let file_path = serve_dir.join(rel);
918                        if file_path.exists() && file_path.is_file() {
919                            let data = fs::read(&file_path).unwrap_or_default();
920                            let header = format!(
921                                "HTTP/1.1 200 OK\r\nContent-Length: {}\r\nConnection: close\r\n\r\n",
922                                data.len()
923                            );
924                            let _ = stream.write_all(header.as_bytes());
925                            let _ = stream.write_all(&data);
926                        } else {
927                            let body = b"Not Found";
928                            let header = format!(
929                                "HTTP/1.1 404 Not Found\r\nContent-Length: {}\r\nConnection: close\r\n\r\n",
930                                body.len()
931                            );
932                            let _ = stream.write_all(header.as_bytes());
933                            let _ = stream.write_all(body);
934                        }
935                    }
936                    Err(_) => break,
937                }
938            }
939        });
940
941        let shutdown = move || {
942            stop.store(true, Ordering::Relaxed);
943            let _ = std::net::TcpStream::connect(format!("127.0.0.1:{}", port));
944        };
945
946        (base_url, shutdown)
947    }
948
949    /// Pull an FHR sidecar that does NOT exist locally yet.
950    /// KeepOurs: first pull should count as `pulled`, second pull as `skipped`.
951    #[test]
952    fn test_keep_ours_fhr_first_pull_counts_as_pulled() {
953        // "Remote" store: directory with a pre-built FHR JSON sidecar.
954        let remote_dir = tempdir().unwrap();
955        let collections_dir = remote_dir.path().join("fhr");
956        fs::create_dir_all(&collections_dir).unwrap();
957
958        // We need a fake digest string to use as the collection identity.
959        let fake_digest = "SQ.aaaaaaaaaaaaaaaaaaaaaaaa";
960        let sidecar_name = format!("{}.fhr.json", fake_digest);
961        let fhr = FhrMetadata {
962            genome: Some("TestGenome".to_string()),
963            ..Default::default()
964        };
965        let sidecar_json = serde_json::to_string(&fhr).unwrap();
966        fs::write(collections_dir.join(&sidecar_name), &sidecar_json).unwrap();
967
968        // Start HTTP server.
969        let (base_url, shutdown) = start_file_server(remote_dir.path().to_path_buf());
970
971        // "Local" store: disk-backed, with a stub collection so pull_fhr has a digest.
972        let local_dir = tempdir().unwrap();
973        let local_store_path = local_dir.path().join("store");
974
975        let mut store = RefgetStore::on_disk(&local_store_path).unwrap();
976        store.inner.remote_source = Some(base_url);
977
978        // Inject a minimal stub collection so pull_fhr iterates over it.
979        use crate::hashkeyable::HashKeyable;
980        use crate::digest::SequenceCollectionRecord;
981        let key = fake_digest.to_key();
982        let stub = crate::digest::SequenceCollectionMetadata {
983            digest: fake_digest.to_string(),
984            n_sequences: 0,
985            names_digest: String::new(),
986            sequences_digest: String::new(),
987            lengths_digest: String::new(),
988            name_length_pairs_digest: None,
989            sorted_name_length_pairs_digest: None,
990            sorted_sequences_digest: None,
991            file_path: None,
992        };
993        store.inner.collections.insert(key, SequenceCollectionRecord::Stub(stub));
994
995        // First pull: FHR sidecar not yet local → should be pulled.
996        let result = store.pull_fhr(Some(fake_digest), SyncStrategy::KeepOurs).unwrap();
997        assert_eq!(result.pulled, 1, "first pull should count as pulled, not skipped");
998        assert_eq!(result.skipped, 0, "first pull should not be skipped");
999        assert_eq!(result.not_found, 0);
1000
1001        // Second pull: sidecar now on disk → should be skipped.
1002        let result2 = store.pull_fhr(Some(fake_digest), SyncStrategy::KeepOurs).unwrap();
1003        assert_eq!(result2.skipped, 1, "second pull should be skipped (file already local)");
1004        assert_eq!(result2.pulled, 0, "second pull should not count as pulled");
1005
1006        shutdown();
1007    }
1008}