// anno_core/core/provenance.rs
1//! Unified provenance tracking for the annotation pipeline.
2//!
3//! Provenance captures the full lineage of annotations:
4//! - **Source**: Where did the document come from? (URL, file, API)
5//! - **Ingest**: How was it converted to text? (pandoc, pdftotext, direct)
6//! - **Preprocessing**: What transformations were applied? (normalization, chunking)
7//! - **Extraction**: Which models produced annotations? (StackedNER, GLiNER)
8//! - **Mentions**: Individual entity extractions with per-entity provenance
9//! - **Tracks**: Coreference chains linking mentions
10//!
11//! This enables:
12//! - Reproducibility (re-run the same pipeline)
13//! - Debugging (trace errors back to source)
14//! - Auditing (who/what/when for compliance)
15//! - Confidence calibration (different pipelines have different accuracy)
16//!
17//! # Example
18//!
19//! ```rust
20//! use anno_core::core::provenance::{DocumentProvenance, SourceInfo, IngestInfo};
21//!
22//! let provenance = DocumentProvenance::builder()
23//!     .source(SourceInfo::url("https://example.com/doc.pdf"))
24//!     .ingest(IngestInfo::converter("pdftotext", "0.86.1"))
25//!     .preprocessor(&["whitespace_normalized", "unicode_nfc"])
26//!     .build();
27//! ```
28
29use serde::{Deserialize, Serialize};
30use std::borrow::Cow;
31use std::collections::HashMap;
32use std::time::SystemTime;
33
/// Complete provenance for a document annotation pipeline.
///
/// Construct via [`DocumentProvenance::builder`]; the builder fills in
/// `id`, `started_at`, and `tool` automatically.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct DocumentProvenance {
    /// Unique identifier for this provenance chain (format: `prov:<hex>`,
    /// generated by the builder from the source plus the current time).
    pub id: String,
    /// Source of the document
    pub source: SourceInfo,
    /// How the document was ingested/converted
    pub ingest: IngestInfo,
    /// Preprocessing steps applied
    pub preprocessing: Vec<PreprocessingStep>,
    /// Extraction pipelines used
    pub extraction: Vec<ExtractionPipeline>,
    /// When annotation started (RFC 3339 timestamp; set by the builder)
    pub started_at: Option<String>,
    /// When annotation completed (RFC 3339; set by [`Self::complete`])
    pub completed_at: Option<String>,
    /// Tool and version that produced this record
    pub tool: ToolInfo,
    /// Additional free-form metadata (omitted from JSON when empty)
    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
    pub metadata: HashMap<String, String>,
}
57
/// Source information - where did the document come from?
///
/// Serialized with an internal `"type"` tag (serde adjacently-tagged enum).
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(tag = "type")]
pub enum SourceInfo {
    /// Document from a URL
    Url {
        /// The URL of the document.
        url: String,
        /// ISO 8601 timestamp when the document was fetched.
        fetched_at: Option<String>,
        /// HTTP status code returned when fetching.
        http_status: Option<u16>,
        /// Content-Type header from response.
        content_type: Option<String>,
        /// ETag header for caching/validation.
        etag: Option<String>,
    },
    /// Document from local file
    File {
        /// Path to the file on disk.
        path: String,
        /// ISO 8601 timestamp when the file was last modified.
        modified_at: Option<String>,
        /// Size of the file in bytes.
        size_bytes: Option<u64>,
        /// Checksum (e.g., SHA256) for integrity validation.
        checksum: Option<String>,
    },
    /// Document from API
    Api {
        /// API endpoint URL.
        endpoint: String,
        /// Unique request identifier for tracing.
        request_id: Option<String>,
        /// Response time in milliseconds.
        response_time_ms: Option<u64>,
    },
    /// Raw text input (no external source)
    Raw {
        /// Length of the raw text in bytes (UTF-8 length via `str::len`,
        /// not a character count — see [`SourceInfo::raw`]).
        length: usize,
        /// Checksum (e.g., SHA256) for integrity validation.
        checksum: Option<String>,
    },
    /// Dataset sample
    Dataset {
        /// Name of the dataset (e.g., "conll2003").
        name: String,
        /// Split of the dataset (e.g., "train", "test", "dev").
        split: String,
        /// Index of the sample within the split.
        index: Option<usize>,
    },
    /// Unknown/unspecified source
    Unknown,
}
114
115impl SourceInfo {
116    /// Create URL source.
117    #[must_use]
118    pub fn url(url: impl Into<String>) -> Self {
119        Self::Url {
120            url: url.into(),
121            fetched_at: None,
122            http_status: None,
123            content_type: None,
124            etag: None,
125        }
126    }
127
128    /// Create file source.
129    #[must_use]
130    pub fn file(path: impl Into<String>) -> Self {
131        Self::File {
132            path: path.into(),
133            modified_at: None,
134            size_bytes: None,
135            checksum: None,
136        }
137    }
138
139    /// Create raw text source.
140    #[must_use]
141    pub fn raw(text: &str) -> Self {
142        Self::Raw {
143            length: text.len(),
144            checksum: None,
145        }
146    }
147
148    /// Create dataset source.
149    #[must_use]
150    pub fn dataset(name: impl Into<String>, split: impl Into<String>) -> Self {
151        Self::Dataset {
152            name: name.into(),
153            split: split.into(),
154            index: None,
155        }
156    }
157}
158
/// Ingest information - how was the document converted to text?
///
/// Serialized with an internal `"method"` tag.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(tag = "method")]
pub enum IngestInfo {
    /// Direct text (no conversion needed)
    Direct,
    /// Converted via external tool
    Converter {
        /// Name of the conversion tool (e.g., "pdftotext", "pandoc").
        tool: String,
        /// Version of the tool used.
        version: Option<String>,
        /// Input format/MIME type.
        input_format: Option<String>,
        /// Output format identifier. The [`IngestInfo::converter`]
        /// constructor sets this to `"plain_text"`.
        output_format: String,
    },
    /// Converted via library
    Library {
        /// Name of the library (e.g., "pdf-extract", "docx-rust").
        name: String,
        /// Version of the library used.
        version: Option<String>,
    },
    /// Custom/unknown conversion
    Custom {
        /// Human-readable description of the conversion method.
        description: String,
    },
}
189
190impl IngestInfo {
191    /// Create converter ingest info.
192    #[must_use]
193    pub fn converter(tool: impl Into<String>, version: impl Into<String>) -> Self {
194        Self::Converter {
195            tool: tool.into(),
196            version: Some(version.into()),
197            input_format: None,
198            output_format: "plain_text".to_string(),
199        }
200    }
201
202    /// Create direct ingest (no conversion).
203    #[must_use]
204    pub fn direct() -> Self {
205        Self::Direct
206    }
207}
208
/// A preprocessing step applied to the document.
///
/// Steps are ordered by the `order` field; see
/// [`ProvenanceBuilder::preprocessor`] for bulk creation.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct PreprocessingStep {
    /// Name of the step
    pub name: String,
    /// Parameters/configuration (omitted from JSON when empty)
    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
    pub params: HashMap<String, String>,
    /// Order in the pipeline (0 = first)
    pub order: u32,
}
220
221impl PreprocessingStep {
222    /// Create a new preprocessing step.
223    #[must_use]
224    pub fn new(name: impl Into<String>, order: u32) -> Self {
225        Self {
226            name: name.into(),
227            params: HashMap::new(),
228            order,
229        }
230    }
231
232    /// Add a parameter.
233    #[must_use]
234    pub fn with_param(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
235        self.params.insert(key.into(), value.into());
236        self
237    }
238}
239
/// An extraction pipeline used to produce annotations.
///
/// Built incrementally via [`ExtractionPipeline::new`] and the `with_*`
/// builder methods.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ExtractionPipeline {
    /// Pipeline identifier (e.g., "ner", "coref", "rel")
    pub task: String,
    /// Model/backend name
    pub model: String,
    /// Model version
    pub version: Option<String>,
    /// Entity/relation types extracted
    pub types_extracted: Vec<String>,
    /// Configuration parameters (omitted from JSON when empty)
    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
    pub config: HashMap<String, String>,
    /// Sub-backends (for stacked/ensemble models; omitted when empty)
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub sub_backends: Vec<String>,
}
258
259impl ExtractionPipeline {
260    /// Create a new extraction pipeline.
261    #[must_use]
262    pub fn new(task: impl Into<String>, model: impl Into<String>) -> Self {
263        Self {
264            task: task.into(),
265            model: model.into(),
266            version: None,
267            types_extracted: Vec::new(),
268            config: HashMap::new(),
269            sub_backends: Vec::new(),
270        }
271    }
272
273    /// Set version.
274    #[must_use]
275    pub fn with_version(mut self, version: impl Into<String>) -> Self {
276        self.version = Some(version.into());
277        self
278    }
279
280    /// Add extracted type.
281    #[must_use]
282    pub fn with_type(mut self, entity_type: impl Into<String>) -> Self {
283        self.types_extracted.push(entity_type.into());
284        self
285    }
286
287    /// Add sub-backend.
288    #[must_use]
289    pub fn with_sub_backend(mut self, backend: impl Into<String>) -> Self {
290        self.sub_backends.push(backend.into());
291        self
292    }
293}
294
/// Tool information.
///
/// The `Default` impl identifies this crate (`anno`) at its build-time
/// version; `Cow<'static, str>` lets those defaults avoid allocation.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ToolInfo {
    /// Tool name
    pub name: Cow<'static, str>,
    /// Tool version (defaults to `CARGO_PKG_VERSION` at build time)
    pub version: Cow<'static, str>,
    /// Git commit hash (from the `GIT_HASH` build-time env var, if set)
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub commit: Option<String>,
}
306
307impl Default for ToolInfo {
308    fn default() -> Self {
309        Self {
310            name: Cow::Borrowed("anno"),
311            version: Cow::Borrowed(env!("CARGO_PKG_VERSION")),
312            commit: option_env!("GIT_HASH").map(String::from),
313        }
314    }
315}
316
/// Builder for [`DocumentProvenance`].
///
/// Obtain via [`DocumentProvenance::builder`] (or `ProvenanceBuilder::new`),
/// chain setters, then call `build()`.
#[derive(Debug, Default)]
pub struct ProvenanceBuilder {
    /// Document source; defaults to `SourceInfo::Unknown` at build time.
    source: Option<SourceInfo>,
    /// Ingest method; defaults to `IngestInfo::Direct` at build time.
    ingest: Option<IngestInfo>,
    /// Preprocessing steps accumulated so far.
    preprocessing: Vec<PreprocessingStep>,
    /// Extraction pipelines accumulated so far.
    extraction: Vec<ExtractionPipeline>,
    /// Free-form metadata key/value pairs.
    metadata: HashMap<String, String>,
}
326
327impl ProvenanceBuilder {
328    /// Create a new builder.
329    #[must_use]
330    pub fn new() -> Self {
331        Self::default()
332    }
333
334    /// Set the source.
335    #[must_use]
336    pub fn source(mut self, source: SourceInfo) -> Self {
337        self.source = Some(source);
338        self
339    }
340
341    /// Set the ingest method.
342    #[must_use]
343    pub fn ingest(mut self, ingest: IngestInfo) -> Self {
344        self.ingest = Some(ingest);
345        self
346    }
347
348    /// Add a preprocessing step.
349    #[must_use]
350    pub fn preprocessing(mut self, step: PreprocessingStep) -> Self {
351        self.preprocessing.push(step);
352        self
353    }
354
355    /// Add preprocessing steps by name.
356    #[must_use]
357    pub fn preprocessor(mut self, names: &[&str]) -> Self {
358        for (i, name) in names.iter().enumerate() {
359            self.preprocessing
360                .push(PreprocessingStep::new(*name, i as u32));
361        }
362        self
363    }
364
365    /// Add an extraction pipeline.
366    #[must_use]
367    pub fn extraction(mut self, pipeline: ExtractionPipeline) -> Self {
368        self.extraction.push(pipeline);
369        self
370    }
371
372    /// Add metadata.
373    #[must_use]
374    pub fn metadata(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
375        self.metadata.insert(key.into(), value.into());
376        self
377    }
378
379    /// Build the provenance.
380    #[must_use]
381    pub fn build(self) -> DocumentProvenance {
382        use std::collections::hash_map::DefaultHasher;
383        use std::hash::{Hash, Hasher};
384
385        // Generate ID from content
386        let mut hasher = DefaultHasher::new();
387        if let Some(ref source) = self.source {
388            format!("{:?}", source).hash(&mut hasher);
389        }
390        SystemTime::now()
391            .duration_since(SystemTime::UNIX_EPOCH)
392            .map(|d| d.as_nanos())
393            .unwrap_or(0)
394            .hash(&mut hasher);
395
396        let id = format!("prov:{:x}", hasher.finish());
397        let now = chrono::Utc::now().to_rfc3339();
398
399        DocumentProvenance {
400            id,
401            source: self.source.unwrap_or(SourceInfo::Unknown),
402            ingest: self.ingest.unwrap_or(IngestInfo::Direct),
403            preprocessing: self.preprocessing,
404            extraction: self.extraction,
405            started_at: Some(now.clone()),
406            completed_at: None,
407            tool: ToolInfo::default(),
408            metadata: self.metadata,
409        }
410    }
411}
412
413impl DocumentProvenance {
414    /// Create a new builder.
415    #[must_use]
416    pub fn builder() -> ProvenanceBuilder {
417        ProvenanceBuilder::new()
418    }
419
420    /// Mark as completed.
421    pub fn complete(&mut self) {
422        self.completed_at = Some(chrono::Utc::now().to_rfc3339());
423    }
424
425    /// Add extraction pipeline.
426    pub fn add_extraction(&mut self, pipeline: ExtractionPipeline) {
427        self.extraction.push(pipeline);
428    }
429}
430
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_provenance_builder() {
        // Assemble the extraction pipeline separately, then feed it
        // through the builder chain.
        let ner = ExtractionPipeline::new("ner", "stacked")
            .with_sub_backend("pattern")
            .with_sub_backend("heuristic");

        let built = DocumentProvenance::builder()
            .source(SourceInfo::url("https://example.com/doc.pdf"))
            .ingest(IngestInfo::converter("pdftotext", "0.86.1"))
            .preprocessor(&["whitespace", "unicode_nfc"])
            .extraction(ner)
            .build();

        assert!(built.id.starts_with("prov:"));
        assert!(matches!(built.source, SourceInfo::Url { .. }));
        assert!(matches!(built.ingest, IngestInfo::Converter { .. }));
        assert_eq!(built.preprocessing.len(), 2);
        assert_eq!(built.extraction.len(), 1);
    }

    #[test]
    fn test_source_info_variants() {
        assert!(matches!(
            SourceInfo::url("https://example.com"),
            SourceInfo::Url { .. }
        ));
        assert!(matches!(
            SourceInfo::file("/path/to/doc.txt"),
            SourceInfo::File { .. }
        ));
        // "Hello, world!" is 13 bytes; raw() records byte length.
        assert!(matches!(
            SourceInfo::raw("Hello, world!"),
            SourceInfo::Raw { length: 13, .. }
        ));
    }

    #[test]
    fn test_serde_roundtrip() {
        let original = DocumentProvenance::builder()
            .source(SourceInfo::url("https://example.com"))
            .ingest(IngestInfo::direct())
            .metadata("key", "value")
            .build();

        let json = serde_json::to_string(&original).expect("serialize DocumentProvenance");
        let parsed: DocumentProvenance =
            serde_json::from_str(&json).expect("deserialize DocumentProvenance");

        assert_eq!(original.id, parsed.id);
        assert_eq!(original.metadata.get("key"), parsed.metadata.get("key"));
    }

    #[test]
    fn test_extraction_pipeline_builder() {
        let p = ExtractionPipeline::new("ner", "stacked")
            .with_version("0.2.0")
            .with_type("PER")
            .with_type("ORG")
            .with_sub_backend("pattern")
            .with_sub_backend("heuristic");

        assert_eq!(p.task, "ner");
        assert_eq!(p.model, "stacked");
        assert_eq!(p.version, Some("0.2.0".to_string()));
        assert_eq!(p.types_extracted, vec!["PER", "ORG"]);
        assert_eq!(p.sub_backends, vec!["pattern", "heuristic"]);
    }
}
499
500// =============================================================================
501// Property Tests
502// =============================================================================
503
#[cfg(test)]
mod proptests {
    use super::*;
    use proptest::prelude::*;

    // Strategy for generating URLs (http/https, lowercase host + path).
    fn arb_url() -> impl Strategy<Value = String> {
        prop::string::string_regex("https?://[a-z]+\\.[a-z]{2,4}/[a-z0-9/]*")
            .expect("valid URL regex for proptest")
            .prop_filter("valid url", |s| !s.is_empty())
    }

    // Strategy for generating absolute file paths ending in ".txt".
    fn arb_path() -> impl Strategy<Value = String> {
        prop::string::string_regex("/[a-z]+(/[a-z0-9]+)*\\.txt")
            .expect("valid path regex for proptest")
            .prop_filter("valid path", |s| !s.is_empty())
    }

    // Strategy for generating alphanumeric identifiers (1-21 chars,
    // starting with a letter).
    fn arb_identifier() -> impl Strategy<Value = String> {
        prop::string::string_regex("[a-zA-Z][a-zA-Z0-9_]{0,20}")
            .expect("valid identifier regex for proptest")
    }

    // -------------------------------------------------------------------------
    // SourceInfo Properties
    // -------------------------------------------------------------------------

    proptest! {
        /// URL source round-trips through serde
        #[test]
        fn prop_source_url_roundtrip(url in arb_url()) {
            let source = SourceInfo::url(&url);
            let json = serde_json::to_string(&source).expect("serialize SourceInfo::Url");
            let recovered: SourceInfo = serde_json::from_str(&json).expect("deserialize SourceInfo");

            if let SourceInfo::Url { url: recovered_url, .. } = recovered {
                prop_assert_eq!(url, recovered_url);
            } else {
                prop_assert!(false, "Expected Url variant");
            }
        }

        /// File source round-trips through serde
        #[test]
        fn prop_source_file_roundtrip(path in arb_path()) {
            let source = SourceInfo::file(&path);
            let json = serde_json::to_string(&source).expect("serialize SourceInfo::File");
            let recovered: SourceInfo = serde_json::from_str(&json).expect("deserialize SourceInfo");

            if let SourceInfo::File { path: recovered_path, .. } = recovered {
                prop_assert_eq!(path, recovered_path);
            } else {
                prop_assert!(false, "Expected File variant");
            }
        }

        /// Raw source captures correct length
        /// (byte length, matching `SourceInfo::raw`'s use of `str::len`)
        #[test]
        fn prop_source_raw_length(text in ".*") {
            let source = SourceInfo::raw(&text);
            if let SourceInfo::Raw { length, .. } = source {
                prop_assert_eq!(length, text.len());
            } else {
                prop_assert!(false, "Expected Raw variant");
            }
        }

        /// Dataset source round-trips
        #[test]
        fn prop_source_dataset_roundtrip(
            name in arb_identifier(),
            split in prop::sample::select(vec!["train", "test", "dev"])
        ) {
            let source = SourceInfo::dataset(&name, split);
            let json = serde_json::to_string(&source).expect("serialize SourceInfo::Dataset");
            let recovered: SourceInfo = serde_json::from_str(&json).expect("deserialize SourceInfo");

            if let SourceInfo::Dataset { name: n, split: _, .. } = recovered {
                prop_assert_eq!(name, n);
            } else {
                prop_assert!(false, "Expected Dataset variant");
            }
        }
    }

    // -------------------------------------------------------------------------
    // IngestInfo Properties
    // -------------------------------------------------------------------------

    proptest! {
        /// Converter ingest round-trips
        #[test]
        fn prop_ingest_converter_roundtrip(tool in arb_identifier(), version in "[0-9]+\\.[0-9]+\\.[0-9]+") {
            let ingest = IngestInfo::converter(&tool, &version);
            let json = serde_json::to_string(&ingest).expect("serialize IngestInfo::Converter");
            let recovered: IngestInfo = serde_json::from_str(&json).expect("deserialize IngestInfo");

            if let IngestInfo::Converter { tool: t, version: v, .. } = recovered {
                prop_assert_eq!(tool, t);
                prop_assert_eq!(Some(version), v);
            } else {
                prop_assert!(false, "Expected Converter variant");
            }
        }

        /// Direct ingest round-trips
        #[test]
        fn prop_ingest_direct_roundtrip(_unused in Just(())) {
            let ingest = IngestInfo::direct();
            let json = serde_json::to_string(&ingest).expect("serialize IngestInfo::Direct");
            let recovered: IngestInfo = serde_json::from_str(&json).expect("deserialize IngestInfo");
            prop_assert!(matches!(recovered, IngestInfo::Direct));
        }
    }

    // -------------------------------------------------------------------------
    // PreprocessingStep Properties
    // -------------------------------------------------------------------------

    proptest! {
        /// Preprocessing step round-trips
        #[test]
        fn prop_preprocessing_roundtrip(name in arb_identifier(), order in 0u32..100) {
            let step = PreprocessingStep::new(&name, order);
            let json = serde_json::to_string(&step).expect("serialize PreprocessingStep");
            let recovered: PreprocessingStep =
                serde_json::from_str(&json).expect("deserialize PreprocessingStep");

            prop_assert_eq!(name, recovered.name);
            prop_assert_eq!(order, recovered.order);
        }

        /// Preprocessing step with params round-trips
        #[test]
        fn prop_preprocessing_with_params_roundtrip(
            name in arb_identifier(),
            order in 0u32..100,
            param_key in arb_identifier(),
            param_value in arb_identifier()
        ) {
            let step = PreprocessingStep::new(&name, order)
                .with_param(&param_key, &param_value);
            let json = serde_json::to_string(&step).expect("serialize PreprocessingStep");
            let recovered: PreprocessingStep =
                serde_json::from_str(&json).expect("deserialize PreprocessingStep");

            prop_assert_eq!(step.params.get(&param_key), recovered.params.get(&param_key));
        }
    }

    // -------------------------------------------------------------------------
    // ExtractionPipeline Properties
    // -------------------------------------------------------------------------

    proptest! {
        /// Extraction pipeline round-trips
        #[test]
        fn prop_extraction_roundtrip(task in arb_identifier(), model in arb_identifier()) {
            let pipeline = ExtractionPipeline::new(&task, &model);
            let json = serde_json::to_string(&pipeline).expect("serialize ExtractionPipeline");
            let recovered: ExtractionPipeline =
                serde_json::from_str(&json).expect("deserialize ExtractionPipeline");

            prop_assert_eq!(task, recovered.task);
            prop_assert_eq!(model, recovered.model);
        }

        /// Pipeline with version round-trips
        #[test]
        fn prop_extraction_version_roundtrip(
            task in arb_identifier(),
            model in arb_identifier(),
            version in "[0-9]+\\.[0-9]+\\.[0-9]+"
        ) {
            let pipeline = ExtractionPipeline::new(&task, &model)
                .with_version(&version);
            let json = serde_json::to_string(&pipeline).expect("serialize ExtractionPipeline");
            let recovered: ExtractionPipeline =
                serde_json::from_str(&json).expect("deserialize ExtractionPipeline");

            prop_assert_eq!(Some(version), recovered.version);
        }

        /// Pipeline with types round-trips
        #[test]
        fn prop_extraction_types_roundtrip(
            task in arb_identifier(),
            model in arb_identifier(),
            types in prop::collection::vec(arb_identifier(), 0..5)
        ) {
            let mut pipeline = ExtractionPipeline::new(&task, &model);
            for t in &types {
                pipeline = pipeline.with_type(t);
            }
            let json = serde_json::to_string(&pipeline).expect("serialize ExtractionPipeline");
            let recovered: ExtractionPipeline =
                serde_json::from_str(&json).expect("deserialize ExtractionPipeline");

            prop_assert_eq!(types, recovered.types_extracted);
        }
    }

    // -------------------------------------------------------------------------
    // DocumentProvenance Properties
    // -------------------------------------------------------------------------

    proptest! {
        /// Full provenance round-trips
        #[test]
        fn prop_provenance_roundtrip(url in arb_url()) {
            let prov = DocumentProvenance::builder()
                .source(SourceInfo::url(&url))
                .ingest(IngestInfo::direct())
                .build();

            let json = serde_json::to_string(&prov).expect("serialize DocumentProvenance");
            let recovered: DocumentProvenance =
                serde_json::from_str(&json).expect("deserialize DocumentProvenance");

            prop_assert_eq!(prov.id, recovered.id);

            if let (SourceInfo::Url { url: u1, .. }, SourceInfo::Url { url: u2, .. }) =
                (&prov.source, &recovered.source)
            {
                prop_assert_eq!(u1, u2);
            }
        }

        /// Provenance ID starts with "prov:"
        #[test]
        fn prop_provenance_id_prefix(url in arb_url()) {
            let prov = DocumentProvenance::builder()
                .source(SourceInfo::url(&url))
                .build();

            prop_assert!(prov.id.starts_with("prov:"));
        }

        /// Provenance has started_at timestamp
        #[test]
        fn prop_provenance_has_timestamp(url in arb_url()) {
            let prov = DocumentProvenance::builder()
                .source(SourceInfo::url(&url))
                .build();

            prop_assert!(prov.started_at.is_some());
        }

        /// Metadata round-trips correctly
        #[test]
        fn prop_provenance_metadata(
            key in arb_identifier(),
            value in arb_identifier()
        ) {
            let prov = DocumentProvenance::builder()
                .source(SourceInfo::Unknown)
                .metadata(&key, &value)
                .build();

            let json = serde_json::to_string(&prov).expect("serialize DocumentProvenance");
            let recovered: DocumentProvenance =
                serde_json::from_str(&json).expect("deserialize DocumentProvenance");

            prop_assert_eq!(prov.metadata.get(&key), recovered.metadata.get(&key));
        }
    }
}