1use serde::{Deserialize, Serialize};
30use std::borrow::Cow;
31use std::collections::HashMap;
32use std::time::SystemTime;
33
/// Complete provenance record for a processed document.
///
/// Captures where the document came from ([`SourceInfo`]), how it entered
/// the system ([`IngestInfo`]), the preprocessing and extraction steps
/// applied, timestamps, and the tool that produced the record.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct DocumentProvenance {
    /// Unique identifier in the form `prov:<hex>` (assigned by `ProvenanceBuilder::build`).
    pub id: String,
    /// Origin of the document (URL, file, API, raw text, dataset, or unknown).
    pub source: SourceInfo,
    /// Method by which the document was ingested.
    pub ingest: IngestInfo,
    /// Preprocessing steps applied before extraction, in order.
    pub preprocessing: Vec<PreprocessingStep>,
    /// Extraction pipelines run over the document.
    pub extraction: Vec<ExtractionPipeline>,
    /// RFC 3339 timestamp set when the record is built.
    pub started_at: Option<String>,
    /// RFC 3339 timestamp set by [`DocumentProvenance::complete`]; `None` until then.
    pub completed_at: Option<String>,
    /// Name/version/commit of the tool that produced this record.
    pub tool: ToolInfo,
    /// Free-form key/value metadata; omitted from JSON when empty.
    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
    pub metadata: HashMap<String, String>,
}
57
/// Origin of a document, serialized as an internally tagged enum
/// (JSON carries a `"type"` discriminator field).
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(tag = "type")]
pub enum SourceInfo {
    /// Document fetched from a URL.
    Url {
        url: String,
        /// RFC 3339 fetch time, if recorded.
        fetched_at: Option<String>,
        /// HTTP status code of the fetch, if recorded.
        http_status: Option<u16>,
        /// `Content-Type` header value, if recorded.
        content_type: Option<String>,
        /// `ETag` header value, if recorded.
        etag: Option<String>,
    },
    /// Document read from the local filesystem.
    File {
        path: String,
        /// Last-modified time, if recorded.
        modified_at: Option<String>,
        size_bytes: Option<u64>,
        checksum: Option<String>,
    },
    /// Document obtained via an API call.
    Api {
        endpoint: String,
        request_id: Option<String>,
        response_time_ms: Option<u64>,
    },
    /// Raw in-memory text; only its byte length (and optional checksum) is kept.
    Raw {
        length: usize,
        checksum: Option<String>,
    },
    /// Item drawn from a named dataset split.
    Dataset {
        name: String,
        split: String,
        /// Index within the split, if known.
        index: Option<usize>,
    },
    /// Origin not recorded (the builder's fallback when no source is set).
    Unknown,
}
114
115impl SourceInfo {
116 #[must_use]
118 pub fn url(url: impl Into<String>) -> Self {
119 Self::Url {
120 url: url.into(),
121 fetched_at: None,
122 http_status: None,
123 content_type: None,
124 etag: None,
125 }
126 }
127
128 #[must_use]
130 pub fn file(path: impl Into<String>) -> Self {
131 Self::File {
132 path: path.into(),
133 modified_at: None,
134 size_bytes: None,
135 checksum: None,
136 }
137 }
138
139 #[must_use]
141 pub fn raw(text: &str) -> Self {
142 Self::Raw {
143 length: text.len(),
144 checksum: None,
145 }
146 }
147
148 #[must_use]
150 pub fn dataset(name: impl Into<String>, split: impl Into<String>) -> Self {
151 Self::Dataset {
152 name: name.into(),
153 split: split.into(),
154 index: None,
155 }
156 }
157}
158
/// How a document entered the system, serialized as an internally tagged
/// enum (JSON carries a `"method"` discriminator field).
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(tag = "method")]
pub enum IngestInfo {
    /// Text used as-is, with no conversion (the builder's fallback).
    Direct,
    /// An external conversion tool produced the text (e.g. pdftotext; see tests).
    Converter {
        tool: String,
        version: Option<String>,
        /// Format of the input the converter consumed, if known.
        input_format: Option<String>,
        /// Format the converter emitted (e.g. `"plain_text"`).
        output_format: String,
    },
    /// An in-process library performed the conversion.
    Library {
        name: String,
        version: Option<String>,
    },
    /// Any other ingest path, described free-form.
    Custom {
        description: String,
    },
}
189
190impl IngestInfo {
191 #[must_use]
193 pub fn converter(tool: impl Into<String>, version: impl Into<String>) -> Self {
194 Self::Converter {
195 tool: tool.into(),
196 version: Some(version.into()),
197 input_format: None,
198 output_format: "plain_text".to_string(),
199 }
200 }
201
202 #[must_use]
204 pub fn direct() -> Self {
205 Self::Direct
206 }
207}
208
/// A single named preprocessing step and its position in the pipeline.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct PreprocessingStep {
    /// Step name, e.g. `"whitespace"` or `"unicode_nfc"` (see tests).
    pub name: String,
    /// Step parameters; omitted from JSON when empty.
    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
    pub params: HashMap<String, String>,
    /// Zero-based execution order within the pipeline.
    pub order: u32,
}
220
221impl PreprocessingStep {
222 #[must_use]
224 pub fn new(name: impl Into<String>, order: u32) -> Self {
225 Self {
226 name: name.into(),
227 params: HashMap::new(),
228 order,
229 }
230 }
231
232 #[must_use]
234 pub fn with_param(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
235 self.params.insert(key.into(), value.into());
236 self
237 }
238}
239
/// Record of one extraction pipeline run: the task, the model used,
/// and what it was configured to extract.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ExtractionPipeline {
    /// Task name, e.g. `"ner"` (see tests).
    pub task: String,
    /// Model or backend name, e.g. `"stacked"` (see tests).
    pub model: String,
    /// Model/pipeline version, if known.
    pub version: Option<String>,
    /// Entity/annotation types the pipeline extracts (e.g. `"PER"`, `"ORG"`).
    pub types_extracted: Vec<String>,
    /// Pipeline configuration; omitted from JSON when empty.
    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
    pub config: HashMap<String, String>,
    /// Sub-backends composing the model; omitted from JSON when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub sub_backends: Vec<String>,
}
258
259impl ExtractionPipeline {
260 #[must_use]
262 pub fn new(task: impl Into<String>, model: impl Into<String>) -> Self {
263 Self {
264 task: task.into(),
265 model: model.into(),
266 version: None,
267 types_extracted: Vec::new(),
268 config: HashMap::new(),
269 sub_backends: Vec::new(),
270 }
271 }
272
273 #[must_use]
275 pub fn with_version(mut self, version: impl Into<String>) -> Self {
276 self.version = Some(version.into());
277 self
278 }
279
280 #[must_use]
282 pub fn with_type(mut self, entity_type: impl Into<String>) -> Self {
283 self.types_extracted.push(entity_type.into());
284 self
285 }
286
287 #[must_use]
289 pub fn with_sub_backend(mut self, backend: impl Into<String>) -> Self {
290 self.sub_backends.push(backend.into());
291 self
292 }
293}
294
/// Identity of the tool that produced a provenance record.
///
/// `Cow<'static, str>` lets the default values borrow compile-time
/// constants (no allocation) while still allowing owned strings after
/// deserialization.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ToolInfo {
    pub name: Cow<'static, str>,
    pub version: Cow<'static, str>,
    /// Git commit hash, when known at build time; omitted from JSON when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub commit: Option<String>,
}
306
307impl Default for ToolInfo {
308 fn default() -> Self {
309 Self {
310 name: Cow::Borrowed("anno"),
311 version: Cow::Borrowed(env!("CARGO_PKG_VERSION")),
312 commit: option_env!("GIT_HASH").map(String::from),
313 }
314 }
315}
316
/// Fluent builder for [`DocumentProvenance`]; finish with `build()`.
#[derive(Debug, Default)]
pub struct ProvenanceBuilder {
    // Falls back to SourceInfo::Unknown at build() if never set.
    source: Option<SourceInfo>,
    // Falls back to IngestInfo::Direct at build() if never set.
    ingest: Option<IngestInfo>,
    preprocessing: Vec<PreprocessingStep>,
    extraction: Vec<ExtractionPipeline>,
    metadata: HashMap<String, String>,
}
326
327impl ProvenanceBuilder {
328 #[must_use]
330 pub fn new() -> Self {
331 Self::default()
332 }
333
334 #[must_use]
336 pub fn source(mut self, source: SourceInfo) -> Self {
337 self.source = Some(source);
338 self
339 }
340
341 #[must_use]
343 pub fn ingest(mut self, ingest: IngestInfo) -> Self {
344 self.ingest = Some(ingest);
345 self
346 }
347
348 #[must_use]
350 pub fn preprocessing(mut self, step: PreprocessingStep) -> Self {
351 self.preprocessing.push(step);
352 self
353 }
354
355 #[must_use]
357 pub fn preprocessor(mut self, names: &[&str]) -> Self {
358 for (i, name) in names.iter().enumerate() {
359 self.preprocessing
360 .push(PreprocessingStep::new(*name, i as u32));
361 }
362 self
363 }
364
365 #[must_use]
367 pub fn extraction(mut self, pipeline: ExtractionPipeline) -> Self {
368 self.extraction.push(pipeline);
369 self
370 }
371
372 #[must_use]
374 pub fn metadata(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
375 self.metadata.insert(key.into(), value.into());
376 self
377 }
378
379 #[must_use]
381 pub fn build(self) -> DocumentProvenance {
382 use std::collections::hash_map::DefaultHasher;
383 use std::hash::{Hash, Hasher};
384
385 let mut hasher = DefaultHasher::new();
387 if let Some(ref source) = self.source {
388 format!("{:?}", source).hash(&mut hasher);
389 }
390 SystemTime::now()
391 .duration_since(SystemTime::UNIX_EPOCH)
392 .map(|d| d.as_nanos())
393 .unwrap_or(0)
394 .hash(&mut hasher);
395
396 let id = format!("prov:{:x}", hasher.finish());
397 let now = chrono::Utc::now().to_rfc3339();
398
399 DocumentProvenance {
400 id,
401 source: self.source.unwrap_or(SourceInfo::Unknown),
402 ingest: self.ingest.unwrap_or(IngestInfo::Direct),
403 preprocessing: self.preprocessing,
404 extraction: self.extraction,
405 started_at: Some(now.clone()),
406 completed_at: None,
407 tool: ToolInfo::default(),
408 metadata: self.metadata,
409 }
410 }
411}
412
413impl DocumentProvenance {
414 #[must_use]
416 pub fn builder() -> ProvenanceBuilder {
417 ProvenanceBuilder::new()
418 }
419
420 pub fn complete(&mut self) {
422 self.completed_at = Some(chrono::Utc::now().to_rfc3339());
423 }
424
425 pub fn add_extraction(&mut self, pipeline: ExtractionPipeline) {
427 self.extraction.push(pipeline);
428 }
429}
430
#[cfg(test)]
mod tests {
    use super::*;

    /// End-to-end builder: every section of the record is populated.
    #[test]
    fn test_provenance_builder() {
        let record = DocumentProvenance::builder()
            .source(SourceInfo::url("https://example.com/doc.pdf"))
            .ingest(IngestInfo::converter("pdftotext", "0.86.1"))
            .preprocessor(&["whitespace", "unicode_nfc"])
            .extraction(
                ExtractionPipeline::new("ner", "stacked")
                    .with_sub_backend("pattern")
                    .with_sub_backend("heuristic"),
            )
            .build();

        assert!(record.id.starts_with("prov:"));
        assert!(matches!(record.source, SourceInfo::Url { .. }));
        assert!(matches!(record.ingest, IngestInfo::Converter { .. }));
        assert_eq!(record.preprocessing.len(), 2);
        assert_eq!(record.extraction.len(), 1);
    }

    /// Each convenience constructor yields its matching variant.
    #[test]
    fn test_source_info_variants() {
        let from_url = SourceInfo::url("https://example.com");
        assert!(matches!(from_url, SourceInfo::Url { .. }));

        let from_file = SourceInfo::file("/path/to/doc.txt");
        assert!(matches!(from_file, SourceInfo::File { .. }));

        // "Hello, world!" is 13 bytes.
        let from_raw = SourceInfo::raw("Hello, world!");
        assert!(matches!(from_raw, SourceInfo::Raw { length: 13, .. }));
    }

    /// JSON serialization round-trips the id and metadata.
    #[test]
    fn test_serde_roundtrip() {
        let original = DocumentProvenance::builder()
            .source(SourceInfo::url("https://example.com"))
            .ingest(IngestInfo::direct())
            .metadata("key", "value")
            .build();

        let json = serde_json::to_string(&original).expect("serialize DocumentProvenance");
        let parsed: DocumentProvenance =
            serde_json::from_str(&json).expect("deserialize DocumentProvenance");

        assert_eq!(original.id, parsed.id);
        assert_eq!(original.metadata.get("key"), parsed.metadata.get("key"));
    }

    /// The with_* builder methods accumulate in call order.
    #[test]
    fn test_extraction_pipeline_builder() {
        let built = ExtractionPipeline::new("ner", "stacked")
            .with_version("0.2.0")
            .with_type("PER")
            .with_type("ORG")
            .with_sub_backend("pattern")
            .with_sub_backend("heuristic");

        assert_eq!(built.task, "ner");
        assert_eq!(built.model, "stacked");
        assert_eq!(built.version, Some("0.2.0".to_string()));
        assert_eq!(built.types_extracted, vec!["PER", "ORG"]);
        assert_eq!(built.sub_backends, vec!["pattern", "heuristic"]);
    }
}
499
#[cfg(test)]
mod proptests {
    use super::*;
    use proptest::prelude::*;

    /// Strategy producing http(s) URLs with a lowercase host and path.
    fn arb_url() -> impl Strategy<Value = String> {
        prop::string::string_regex("https?://[a-z]+\\.[a-z]{2,4}/[a-z0-9/]*")
            .expect("valid URL regex for proptest")
            .prop_filter("valid url", |s| !s.is_empty())
    }

    /// Strategy producing absolute `.txt` file paths.
    fn arb_path() -> impl Strategy<Value = String> {
        prop::string::string_regex("/[a-z]+(/[a-z0-9]+)*\\.txt")
            .expect("valid path regex for proptest")
            .prop_filter("valid path", |s| !s.is_empty())
    }

    /// Strategy producing short ASCII identifiers (letter first).
    fn arb_identifier() -> impl Strategy<Value = String> {
        prop::string::string_regex("[a-zA-Z][a-zA-Z0-9_]{0,20}")
            .expect("valid identifier regex for proptest")
    }

    proptest! {
        #[test]
        fn prop_source_url_roundtrip(url in arb_url()) {
            let source = SourceInfo::url(&url);
            let json = serde_json::to_string(&source).expect("serialize SourceInfo::Url");
            let recovered: SourceInfo = serde_json::from_str(&json).expect("deserialize SourceInfo");

            if let SourceInfo::Url { url: recovered_url, .. } = recovered {
                prop_assert_eq!(url, recovered_url);
            } else {
                prop_assert!(false, "Expected Url variant");
            }
        }

        #[test]
        fn prop_source_file_roundtrip(path in arb_path()) {
            let source = SourceInfo::file(&path);
            let json = serde_json::to_string(&source).expect("serialize SourceInfo::File");
            let recovered: SourceInfo = serde_json::from_str(&json).expect("deserialize SourceInfo");

            if let SourceInfo::File { path: recovered_path, .. } = recovered {
                prop_assert_eq!(path, recovered_path);
            } else {
                prop_assert!(false, "Expected File variant");
            }
        }

        #[test]
        fn prop_source_raw_length(text in ".*") {
            let source = SourceInfo::raw(&text);
            if let SourceInfo::Raw { length, .. } = source {
                // raw() records byte length, not char count.
                prop_assert_eq!(length, text.len());
            } else {
                prop_assert!(false, "Expected Raw variant");
            }
        }

        #[test]
        fn prop_source_dataset_roundtrip(
            name in arb_identifier(),
            split in prop::sample::select(vec!["train", "test", "dev"])
        ) {
            let source = SourceInfo::dataset(&name, split);
            let json = serde_json::to_string(&source).expect("serialize SourceInfo::Dataset");
            let recovered: SourceInfo = serde_json::from_str(&json).expect("deserialize SourceInfo");

            if let SourceInfo::Dataset { name: n, .. } = recovered {
                prop_assert_eq!(name, n);
            } else {
                prop_assert!(false, "Expected Dataset variant");
            }
        }
    }

    proptest! {
        #[test]
        fn prop_ingest_converter_roundtrip(tool in arb_identifier(), version in "[0-9]+\\.[0-9]+\\.[0-9]+") {
            let ingest = IngestInfo::converter(&tool, &version);
            let json = serde_json::to_string(&ingest).expect("serialize IngestInfo::Converter");
            let recovered: IngestInfo = serde_json::from_str(&json).expect("deserialize IngestInfo");

            if let IngestInfo::Converter { tool: t, version: v, .. } = recovered {
                prop_assert_eq!(tool, t);
                prop_assert_eq!(Some(version), v);
            } else {
                prop_assert!(false, "Expected Converter variant");
            }
        }

        #[test]
        fn prop_ingest_direct_roundtrip(_unused in Just(())) {
            let ingest = IngestInfo::direct();
            let json = serde_json::to_string(&ingest).expect("serialize IngestInfo::Direct");
            let recovered: IngestInfo = serde_json::from_str(&json).expect("deserialize IngestInfo");
            prop_assert!(matches!(recovered, IngestInfo::Direct));
        }
    }

    proptest! {
        #[test]
        fn prop_preprocessing_roundtrip(name in arb_identifier(), order in 0u32..100) {
            let step = PreprocessingStep::new(&name, order);
            let json = serde_json::to_string(&step).expect("serialize PreprocessingStep");
            let recovered: PreprocessingStep =
                serde_json::from_str(&json).expect("deserialize PreprocessingStep");

            prop_assert_eq!(name, recovered.name);
            prop_assert_eq!(order, recovered.order);
        }

        #[test]
        fn prop_preprocessing_with_params_roundtrip(
            name in arb_identifier(),
            order in 0u32..100,
            param_key in arb_identifier(),
            param_value in arb_identifier()
        ) {
            // Fixed: `&param_key` / `&param_value` had been corrupted to
            // `¶m_key` / `¶m_value` (an `&para;` HTML-entity mangle),
            // which does not compile.
            let step = PreprocessingStep::new(&name, order)
                .with_param(&param_key, &param_value);
            let json = serde_json::to_string(&step).expect("serialize PreprocessingStep");
            let recovered: PreprocessingStep =
                serde_json::from_str(&json).expect("deserialize PreprocessingStep");

            prop_assert_eq!(step.params.get(&param_key), recovered.params.get(&param_key));
        }
    }

    proptest! {
        #[test]
        fn prop_extraction_roundtrip(task in arb_identifier(), model in arb_identifier()) {
            let pipeline = ExtractionPipeline::new(&task, &model);
            let json = serde_json::to_string(&pipeline).expect("serialize ExtractionPipeline");
            let recovered: ExtractionPipeline =
                serde_json::from_str(&json).expect("deserialize ExtractionPipeline");

            prop_assert_eq!(task, recovered.task);
            prop_assert_eq!(model, recovered.model);
        }

        #[test]
        fn prop_extraction_version_roundtrip(
            task in arb_identifier(),
            model in arb_identifier(),
            version in "[0-9]+\\.[0-9]+\\.[0-9]+"
        ) {
            let pipeline = ExtractionPipeline::new(&task, &model)
                .with_version(&version);
            let json = serde_json::to_string(&pipeline).expect("serialize ExtractionPipeline");
            let recovered: ExtractionPipeline =
                serde_json::from_str(&json).expect("deserialize ExtractionPipeline");

            prop_assert_eq!(Some(version), recovered.version);
        }

        #[test]
        fn prop_extraction_types_roundtrip(
            task in arb_identifier(),
            model in arb_identifier(),
            types in prop::collection::vec(arb_identifier(), 0..5)
        ) {
            let mut pipeline = ExtractionPipeline::new(&task, &model);
            for t in &types {
                pipeline = pipeline.with_type(t);
            }
            let json = serde_json::to_string(&pipeline).expect("serialize ExtractionPipeline");
            let recovered: ExtractionPipeline =
                serde_json::from_str(&json).expect("deserialize ExtractionPipeline");

            prop_assert_eq!(types, recovered.types_extracted);
        }
    }

    proptest! {
        #[test]
        fn prop_provenance_roundtrip(url in arb_url()) {
            let prov = DocumentProvenance::builder()
                .source(SourceInfo::url(&url))
                .ingest(IngestInfo::direct())
                .build();

            let json = serde_json::to_string(&prov).expect("serialize DocumentProvenance");
            let recovered: DocumentProvenance =
                serde_json::from_str(&json).expect("deserialize DocumentProvenance");

            prop_assert_eq!(prov.id, recovered.id);

            if let (SourceInfo::Url { url: u1, .. }, SourceInfo::Url { url: u2, .. }) =
                (&prov.source, &recovered.source)
            {
                prop_assert_eq!(u1, u2);
            }
        }

        #[test]
        fn prop_provenance_id_prefix(url in arb_url()) {
            let prov = DocumentProvenance::builder()
                .source(SourceInfo::url(&url))
                .build();

            prop_assert!(prov.id.starts_with("prov:"));
        }

        #[test]
        fn prop_provenance_has_timestamp(url in arb_url()) {
            let prov = DocumentProvenance::builder()
                .source(SourceInfo::url(&url))
                .build();

            prop_assert!(prov.started_at.is_some());
        }

        #[test]
        fn prop_provenance_metadata(
            key in arb_identifier(),
            value in arb_identifier()
        ) {
            let prov = DocumentProvenance::builder()
                .source(SourceInfo::Unknown)
                .metadata(&key, &value)
                .build();

            let json = serde_json::to_string(&prov).expect("serialize DocumentProvenance");
            let recovered: DocumentProvenance =
                serde_json::from_str(&json).expect("deserialize DocumentProvenance");

            prop_assert_eq!(prov.metadata.get(&key), recovered.metadata.get(&key));
        }
    }
}