use serde::{Deserialize, Serialize};
use std::borrow::Cow;
use std::collections::HashMap;
use std::time::{SystemTime, UNIX_EPOCH};
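/// Formats the current system time as an RFC 3339 UTC timestamp, e.g.
/// `2024-05-01T12:34:56Z`. Computed directly from the Unix epoch; leap
/// seconds are ignored, and a clock set before the epoch falls back to
/// the epoch itself via `unwrap_or_default`.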
fn now_rfc3339() -> String {
let dur = SystemTime::now()
.duration_since(UNIX_EPOCH)
.unwrap_or_default();
let secs = dur.as_secs();
let days = secs / 86400;
let rem = secs % 86400;
let hours = rem / 3600;
let mins = (rem % 3600) / 60;
let s = rem % 60;
let (year, month, day) = days_to_ymd(days);
format!("{year:04}-{month:02}-{day:02}T{hours:02}:{mins:02}:{s:02}Z")
}
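/// Converts a count of days since the Unix epoch (1970-01-01) into a
/// `(year, month, day)` triple using Howard Hinnant's `civil_from_days`
/// algorithm, which works in 400-year Gregorian eras of 146,097 days.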
fn days_to_ymd(days: u64) -> (u64, u64, u64) {
    // Shift the epoch from 1970-01-01 to 0000-03-01 so each era starts
    // on March 1 and leap days fall at the end of the year.
    let z = days + 719_468;
    let era = z / 146_097;
    // Day-of-era in [0, 146_096].
    let doe = z - era * 146_097;
    // Year-of-era in [0, 399], correcting for the 4-, 100-, and
    // 400-year leap rules.
    let yoe = (doe - doe / 1_460 + doe / 36_524 - doe / 146_096) / 365;
    let y = yoe + era * 400;
    // Day-of-year in [0, 365], counted from March 1.
    let doy = doe - (365 * yoe + yoe / 4 - yoe / 100);
    // Month index in [0, 11], where 0 = March.
    let mp = (5 * doy + 2) / 153;
    let d = doy - (153 * mp + 2) / 5 + 1;
    let m = if mp < 10 { mp + 3 } else { mp - 9 };
    // January and February belong to the following civil year.
    let y = if m <= 2 { y + 1 } else { y };
(y, m, d)
}
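/// Complete provenance record for a processed document: where the text
/// came from, how it was ingested, which preprocessing steps ran, and
/// which extraction pipelines produced annotations. Timestamps are
/// RFC 3339 strings.
///
/// Build one with [`DocumentProvenance::builder`]; a sketch (the
/// `anno::provenance` module path is an assumption about how this crate
/// exposes the type):
///
/// ```ignore
/// use anno::provenance::{DocumentProvenance, IngestInfo, SourceInfo};
///
/// let mut prov = DocumentProvenance::builder()
///     .source(SourceInfo::url("https://example.com/doc.pdf"))
///     .ingest(IngestInfo::converter("pdftotext", "0.86.1"))
///     .build();
/// // ... run extraction ...
/// prov.complete(); // records `completed_at`
/// ```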
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct DocumentProvenance {
pub id: String,
pub source: SourceInfo,
pub ingest: IngestInfo,
pub preprocessing: Vec<PreprocessingStep>,
pub extraction: Vec<ExtractionPipeline>,
pub started_at: Option<String>,
pub completed_at: Option<String>,
pub tool: ToolInfo,
#[serde(default, skip_serializing_if = "HashMap::is_empty")]
pub metadata: HashMap<String, String>,
}
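/// Where a document's text originated. Serialized as an internally
/// tagged enum, so a URL source becomes JSON of the form:
///
/// ```json
/// {"type":"Url","url":"https://example.com","fetched_at":null,
///  "http_status":null,"content_type":null,"etag":null}
/// ```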
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(tag = "type")]
pub enum SourceInfo {
Url {
url: String,
fetched_at: Option<String>,
http_status: Option<u16>,
content_type: Option<String>,
etag: Option<String>,
},
File {
path: String,
modified_at: Option<String>,
size_bytes: Option<u64>,
checksum: Option<String>,
},
Api {
endpoint: String,
request_id: Option<String>,
response_time_ms: Option<u64>,
},
Raw {
length: usize,
checksum: Option<String>,
},
Dataset {
name: String,
split: String,
index: Option<usize>,
},
Unknown,
}
impl SourceInfo {
#[must_use]
pub fn url(url: impl Into<String>) -> Self {
Self::Url {
url: url.into(),
fetched_at: None,
http_status: None,
content_type: None,
etag: None,
}
}
#[must_use]
pub fn file(path: impl Into<String>) -> Self {
Self::File {
path: path.into(),
modified_at: None,
size_bytes: None,
checksum: None,
}
}
#[must_use]
pub fn raw(text: &str) -> Self {
Self::Raw {
            // Length in Unicode scalar values (`chars`), not bytes.
            length: text.chars().count(),
checksum: None,
}
}
#[must_use]
pub fn dataset(name: impl Into<String>, split: impl Into<String>) -> Self {
Self::Dataset {
name: name.into(),
split: split.into(),
index: None,
}
}
}
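/// How the raw source was turned into the text that was annotated.
/// Serialized with an internal `"method"` tag, mirroring [`SourceInfo`].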
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(tag = "method")]
pub enum IngestInfo {
Direct,
Converter {
tool: String,
version: Option<String>,
input_format: Option<String>,
output_format: String,
},
Library {
name: String,
version: Option<String>,
},
Custom {
description: String,
},
}
impl IngestInfo {
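    /// Convenience constructor for an external conversion tool. The
    /// output format defaults to `"plain_text"`; build an
    /// [`IngestInfo::Converter`] directly to set `input_format`.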
#[must_use]
pub fn converter(tool: impl Into<String>, version: impl Into<String>) -> Self {
Self::Converter {
tool: tool.into(),
version: Some(version.into()),
input_format: None,
output_format: "plain_text".to_string(),
}
}
#[must_use]
pub fn direct() -> Self {
Self::Direct
}
}
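/// A single text-normalization step, ordered by `order` within the
/// preprocessing chain. An empty `params` map is omitted from JSON.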
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct PreprocessingStep {
pub name: String,
#[serde(default, skip_serializing_if = "HashMap::is_empty")]
pub params: HashMap<String, String>,
pub order: u32,
}
impl PreprocessingStep {
#[must_use]
pub fn new(name: impl Into<String>, order: u32) -> Self {
Self {
name: name.into(),
params: HashMap::new(),
order,
}
}
#[must_use]
pub fn with_param(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
self.params.insert(key.into(), value.into());
self
}
}
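/// One extraction run over the document: the task (e.g. `"ner"`), the
/// model or backend that performed it, and the annotation types it
/// emitted. For stacked backends, `sub_backends` lists the components.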
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ExtractionPipeline {
pub task: String,
pub model: String,
pub version: Option<String>,
pub types_extracted: Vec<String>,
#[serde(default, skip_serializing_if = "HashMap::is_empty")]
pub config: HashMap<String, String>,
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub sub_backends: Vec<String>,
}
impl ExtractionPipeline {
#[must_use]
pub fn new(task: impl Into<String>, model: impl Into<String>) -> Self {
Self {
task: task.into(),
model: model.into(),
version: None,
types_extracted: Vec::new(),
config: HashMap::new(),
sub_backends: Vec::new(),
}
}
#[must_use]
pub fn with_version(mut self, version: impl Into<String>) -> Self {
self.version = Some(version.into());
self
}
#[must_use]
pub fn with_type(mut self, entity_type: impl Into<String>) -> Self {
self.types_extracted.push(entity_type.into());
self
}
#[must_use]
pub fn with_sub_backend(mut self, backend: impl Into<String>) -> Self {
self.sub_backends.push(backend.into());
self
}
}
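/// Identifies the tool that produced a provenance record. The
/// `Cow<'static, str>` fields let the default borrow compile-time
/// constants without allocating; `commit` comes from a `GIT_HASH`
/// environment variable, assumed to be injected at build time (e.g. by
/// a build script).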
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ToolInfo {
pub name: Cow<'static, str>,
pub version: Cow<'static, str>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub commit: Option<String>,
}
impl Default for ToolInfo {
fn default() -> Self {
Self {
name: Cow::Borrowed("anno"),
version: Cow::Borrowed(env!("CARGO_PKG_VERSION")),
commit: option_env!("GIT_HASH").map(String::from),
}
}
}
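/// Step-by-step builder for [`DocumentProvenance`]. Unset fields fall
/// back to [`SourceInfo::Unknown`] and [`IngestInfo::Direct`];
/// `started_at` is stamped when [`ProvenanceBuilder::build`] runs.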
#[derive(Debug, Default)]
pub struct ProvenanceBuilder {
source: Option<SourceInfo>,
ingest: Option<IngestInfo>,
preprocessing: Vec<PreprocessingStep>,
extraction: Vec<ExtractionPipeline>,
metadata: HashMap<String, String>,
}
impl ProvenanceBuilder {
#[must_use]
pub fn new() -> Self {
Self::default()
}
#[must_use]
pub fn source(mut self, source: SourceInfo) -> Self {
self.source = Some(source);
self
}
#[must_use]
pub fn ingest(mut self, ingest: IngestInfo) -> Self {
self.ingest = Some(ingest);
self
}
#[must_use]
pub fn preprocessing(mut self, step: PreprocessingStep) -> Self {
self.preprocessing.push(step);
self
}
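    /// Adds one step per name, assigning `order` from the slice index.
    /// Indices restart at zero on each call, so pass the full chain at
    /// once.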
#[must_use]
pub fn preprocessor(mut self, names: &[&str]) -> Self {
for (i, name) in names.iter().enumerate() {
self.preprocessing
.push(PreprocessingStep::new(*name, i as u32));
}
self
}
#[must_use]
pub fn extraction(mut self, pipeline: ExtractionPipeline) -> Self {
self.extraction.push(pipeline);
self
}
#[must_use]
pub fn metadata(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
self.metadata.insert(key.into(), value.into());
self
}
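    /// Finalizes the record. The `id` is `prov:` plus a hex hash of the
    /// source's `Debug` form and a nanosecond timestamp; `DefaultHasher`
    /// is not stable across Rust releases, so ids are effectively
    /// unique but not reproducible.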
#[must_use]
pub fn build(self) -> DocumentProvenance {
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
let mut hasher = DefaultHasher::new();
        if let Some(source) = &self.source {
            format!("{source:?}").hash(&mut hasher);
        }
SystemTime::now()
.duration_since(UNIX_EPOCH)
.map(|d| d.as_nanos())
.unwrap_or(0)
.hash(&mut hasher);
let id = format!("prov:{:x}", hasher.finish());
let now = now_rfc3339();
DocumentProvenance {
id,
source: self.source.unwrap_or(SourceInfo::Unknown),
ingest: self.ingest.unwrap_or(IngestInfo::Direct),
preprocessing: self.preprocessing,
extraction: self.extraction,
            started_at: Some(now),
completed_at: None,
tool: ToolInfo::default(),
metadata: self.metadata,
}
}
}
impl DocumentProvenance {
#[must_use]
pub fn builder() -> ProvenanceBuilder {
ProvenanceBuilder::new()
}
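    /// Stamps `completed_at` with the current time; call this once
    /// extraction has finished.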
pub fn complete(&mut self) {
self.completed_at = Some(now_rfc3339());
}
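    /// Appends an extraction pipeline that ran after the record was built.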
pub fn add_extraction(&mut self, pipeline: ExtractionPipeline) {
self.extraction.push(pipeline);
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_provenance_builder() {
let prov = DocumentProvenance::builder()
.source(SourceInfo::url("https://example.com/doc.pdf"))
.ingest(IngestInfo::converter("pdftotext", "0.86.1"))
.preprocessor(&["whitespace", "unicode_nfc"])
.extraction(
ExtractionPipeline::new("ner", "stacked")
.with_sub_backend("pattern")
.with_sub_backend("heuristic"),
)
.build();
assert!(prov.id.starts_with("prov:"));
assert!(matches!(prov.source, SourceInfo::Url { .. }));
assert!(matches!(prov.ingest, IngestInfo::Converter { .. }));
assert_eq!(prov.preprocessing.len(), 2);
assert_eq!(prov.extraction.len(), 1);
}
#[test]
fn test_source_info_variants() {
let url = SourceInfo::url("https://example.com");
assert!(matches!(url, SourceInfo::Url { .. }));
let file = SourceInfo::file("/path/to/doc.txt");
assert!(matches!(file, SourceInfo::File { .. }));
let raw = SourceInfo::raw("Hello, world!");
assert!(matches!(raw, SourceInfo::Raw { length: 13, .. }));
}
#[test]
fn test_serde_roundtrip() {
let prov = DocumentProvenance::builder()
.source(SourceInfo::url("https://example.com"))
.ingest(IngestInfo::direct())
.metadata("key", "value")
.build();
let json = serde_json::to_string(&prov).expect("serialize DocumentProvenance");
let recovered: DocumentProvenance =
serde_json::from_str(&json).expect("deserialize DocumentProvenance");
assert_eq!(prov.id, recovered.id);
assert_eq!(prov.metadata.get("key"), recovered.metadata.get("key"));
}
#[test]
fn test_extraction_pipeline_builder() {
let pipeline = ExtractionPipeline::new("ner", "stacked")
.with_version("0.2.0")
.with_type("PER")
.with_type("ORG")
.with_sub_backend("pattern")
.with_sub_backend("heuristic");
assert_eq!(pipeline.task, "ner");
assert_eq!(pipeline.model, "stacked");
assert_eq!(pipeline.version, Some("0.2.0".to_string()));
assert_eq!(pipeline.types_extracted, vec!["PER", "ORG"]);
assert_eq!(pipeline.sub_backends, vec!["pattern", "heuristic"]);
}
}
#[cfg(test)]
mod proptests {
use super::*;
use proptest::prelude::*;
    fn arb_url() -> impl Strategy<Value = String> {
        // The regex always yields a scheme, host, and path separator, so
        // no non-empty filter is needed.
        prop::string::string_regex("https?://[a-z]+\\.[a-z]{2,4}/[a-z0-9/]*")
            .expect("valid URL regex for proptest")
    }
    fn arb_path() -> impl Strategy<Value = String> {
        prop::string::string_regex("/[a-z]+(/[a-z0-9]+)*\\.txt")
            .expect("valid path regex for proptest")
    }
fn arb_identifier() -> impl Strategy<Value = String> {
prop::string::string_regex("[a-zA-Z][a-zA-Z0-9_]{0,20}")
.expect("valid identifier regex for proptest")
}
proptest! {
#[test]
fn prop_source_url_roundtrip(url in arb_url()) {
let source = SourceInfo::url(&url);
let json = serde_json::to_string(&source).expect("serialize SourceInfo::Url");
let recovered: SourceInfo = serde_json::from_str(&json).expect("deserialize SourceInfo");
if let SourceInfo::Url { url: recovered_url, .. } = recovered {
prop_assert_eq!(url, recovered_url);
} else {
prop_assert!(false, "Expected Url variant");
}
}
#[test]
fn prop_source_file_roundtrip(path in arb_path()) {
let source = SourceInfo::file(&path);
let json = serde_json::to_string(&source).expect("serialize SourceInfo::File");
let recovered: SourceInfo = serde_json::from_str(&json).expect("deserialize SourceInfo");
if let SourceInfo::File { path: recovered_path, .. } = recovered {
prop_assert_eq!(path, recovered_path);
} else {
prop_assert!(false, "Expected File variant");
}
}
#[test]
fn prop_source_raw_length(text in ".*") {
let source = SourceInfo::raw(&text);
if let SourceInfo::Raw { length, .. } = source {
prop_assert_eq!(length, text.chars().count());
} else {
prop_assert!(false, "Expected Raw variant");
}
}
#[test]
fn prop_source_dataset_roundtrip(
name in arb_identifier(),
split in prop::sample::select(vec!["train", "test", "dev"])
) {
let source = SourceInfo::dataset(&name, split);
let json = serde_json::to_string(&source).expect("serialize SourceInfo::Dataset");
let recovered: SourceInfo = serde_json::from_str(&json).expect("deserialize SourceInfo");
            if let SourceInfo::Dataset { name: n, split: s, .. } = recovered {
                prop_assert_eq!(name, n);
                prop_assert_eq!(split, s);
} else {
prop_assert!(false, "Expected Dataset variant");
}
}
}
proptest! {
#[test]
fn prop_ingest_converter_roundtrip(tool in arb_identifier(), version in "[0-9]+\\.[0-9]+\\.[0-9]+") {
let ingest = IngestInfo::converter(&tool, &version);
let json = serde_json::to_string(&ingest).expect("serialize IngestInfo::Converter");
let recovered: IngestInfo = serde_json::from_str(&json).expect("deserialize IngestInfo");
if let IngestInfo::Converter { tool: t, version: v, .. } = recovered {
prop_assert_eq!(tool, t);
prop_assert_eq!(Some(version), v);
} else {
prop_assert!(false, "Expected Converter variant");
}
}
        // `Just(())` is a placeholder strategy so this plain round-trip
        // check can live inside the `proptest!` block.
        #[test]
        fn prop_ingest_direct_roundtrip(_unused in Just(())) {
let ingest = IngestInfo::direct();
let json = serde_json::to_string(&ingest).expect("serialize IngestInfo::Direct");
let recovered: IngestInfo = serde_json::from_str(&json).expect("deserialize IngestInfo");
prop_assert!(matches!(recovered, IngestInfo::Direct));
}
}
proptest! {
#[test]
fn prop_preprocessing_roundtrip(name in arb_identifier(), order in 0u32..100) {
let step = PreprocessingStep::new(&name, order);
let json = serde_json::to_string(&step).expect("serialize PreprocessingStep");
let recovered: PreprocessingStep =
serde_json::from_str(&json).expect("deserialize PreprocessingStep");
prop_assert_eq!(name, recovered.name);
prop_assert_eq!(order, recovered.order);
}
#[test]
fn prop_preprocessing_with_params_roundtrip(
name in arb_identifier(),
order in 0u32..100,
param_key in arb_identifier(),
param_value in arb_identifier()
) {
let step = PreprocessingStep::new(&name, order)
                .with_param(&param_key, &param_value);
let json = serde_json::to_string(&step).expect("serialize PreprocessingStep");
let recovered: PreprocessingStep =
serde_json::from_str(&json).expect("deserialize PreprocessingStep");
            prop_assert_eq!(step.params.get(&param_key), recovered.params.get(&param_key));
}
}
proptest! {
#[test]
fn prop_extraction_roundtrip(task in arb_identifier(), model in arb_identifier()) {
let pipeline = ExtractionPipeline::new(&task, &model);
let json = serde_json::to_string(&pipeline).expect("serialize ExtractionPipeline");
let recovered: ExtractionPipeline =
serde_json::from_str(&json).expect("deserialize ExtractionPipeline");
prop_assert_eq!(task, recovered.task);
prop_assert_eq!(model, recovered.model);
}
#[test]
fn prop_extraction_version_roundtrip(
task in arb_identifier(),
model in arb_identifier(),
version in "[0-9]+\\.[0-9]+\\.[0-9]+"
) {
let pipeline = ExtractionPipeline::new(&task, &model)
.with_version(&version);
let json = serde_json::to_string(&pipeline).expect("serialize ExtractionPipeline");
let recovered: ExtractionPipeline =
serde_json::from_str(&json).expect("deserialize ExtractionPipeline");
prop_assert_eq!(Some(version), recovered.version);
}
#[test]
fn prop_extraction_types_roundtrip(
task in arb_identifier(),
model in arb_identifier(),
types in prop::collection::vec(arb_identifier(), 0..5)
) {
let mut pipeline = ExtractionPipeline::new(&task, &model);
for t in &types {
pipeline = pipeline.with_type(t);
}
let json = serde_json::to_string(&pipeline).expect("serialize ExtractionPipeline");
let recovered: ExtractionPipeline =
serde_json::from_str(&json).expect("deserialize ExtractionPipeline");
prop_assert_eq!(types, recovered.types_extracted);
}
}
proptest! {
#[test]
fn prop_provenance_roundtrip(url in arb_url()) {
let prov = DocumentProvenance::builder()
.source(SourceInfo::url(&url))
.ingest(IngestInfo::direct())
.build();
let json = serde_json::to_string(&prov).expect("serialize DocumentProvenance");
let recovered: DocumentProvenance =
serde_json::from_str(&json).expect("deserialize DocumentProvenance");
prop_assert_eq!(prov.id, recovered.id);
if let (SourceInfo::Url { url: u1, .. }, SourceInfo::Url { url: u2, .. }) =
(&prov.source, &recovered.source)
{
prop_assert_eq!(u1, u2);
}
}
#[test]
fn prop_provenance_id_prefix(url in arb_url()) {
let prov = DocumentProvenance::builder()
.source(SourceInfo::url(&url))
.build();
prop_assert!(prov.id.starts_with("prov:"));
}
#[test]
fn prop_provenance_has_timestamp(url in arb_url()) {
let prov = DocumentProvenance::builder()
.source(SourceInfo::url(&url))
.build();
prop_assert!(prov.started_at.is_some());
}
#[test]
fn prop_provenance_metadata(
key in arb_identifier(),
value in arb_identifier()
) {
let prov = DocumentProvenance::builder()
.source(SourceInfo::Unknown)
.metadata(&key, &value)
.build();
let json = serde_json::to_string(&prov).expect("serialize DocumentProvenance");
let recovered: DocumentProvenance =
serde_json::from_str(&json).expect("deserialize DocumentProvenance");
prop_assert_eq!(prov.metadata.get(&key), recovered.metadata.get(&key));
}
}
}