Skip to main content

objects/object/
state_provenance.rs

// SPDX-License-Identifier: Apache-2.0
//! Line-level provenance for text files.
3
4use chrono::{DateTime, Utc};
5use serde::{Deserialize, Serialize};
6
7use super::{Attribution, ChangeId, ContentHash};
8
9#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
10pub struct FileProvenance {
11    pub format_version: u8,
12    pub file_blob: ContentHash,
13    pub line_count: u32,
14    pub origins: Vec<Origin>,
15    pub origin_sets: Vec<OriginSet>,
16    pub spans: Vec<LineSpan>,
17}
18
19#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
20pub struct Origin {
21    pub state_id: ChangeId,
22    pub attribution: Attribution,
23    /// Committer time — when the state object came into being. Stable
24    /// across re-imports because it's part of the state hash.
25    pub created_at: DateTime<Utc>,
26    /// Authoring time, when distinct from `created_at`. Populated by
27    /// the git-ingest importer from the commit's `authored_at` so
28    /// blame can match git's default of showing author time. Native
29    /// heddle commits leave this `None` and blame falls back to
30    /// `created_at`. Tail-only optional field for forward compat.
31    #[serde(default)]
32    pub authored_at: Option<DateTime<Utc>>,
33}
34
35#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
36pub struct OriginSet {
37    pub origin_indexes: Vec<u32>,
38}
39
40#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
41pub struct LineSpan {
42    pub start_line: u32,
43    pub line_len: u32,
44    pub origin_set_index: u32,
45}
46
47#[derive(Debug, Clone, PartialEq, Eq, thiserror::Error)]
48pub enum ProvenanceError {
49    #[error("unsupported provenance format version {0}")]
50    UnsupportedVersion(u8),
51    #[error("line spans do not cover the file exactly")]
52    InvalidCoverage,
53    #[error("invalid origin set index {0}")]
54    InvalidOriginSetIndex(u32),
55    #[error("invalid origin index {0}")]
56    InvalidOriginIndex(u32),
57    #[error("provenance file blob mismatch")]
58    BlobMismatch,
59    #[error("provenance line count mismatch")]
60    LineCountMismatch,
61}
62
63impl FileProvenance {
64    pub const FORMAT_VERSION: u8 = 1;
65
66    pub fn new(
67        file_blob: ContentHash,
68        line_count: u32,
69        spans: Vec<LineSpan>,
70        origins: Vec<Origin>,
71        origin_sets: Vec<OriginSet>,
72    ) -> Self {
73        Self {
74            format_version: Self::FORMAT_VERSION,
75            file_blob,
76            line_count,
77            origins,
78            origin_sets,
79            spans,
80        }
81    }
82
83    pub fn validate(&self) -> Result<(), ProvenanceError> {
84        if self.format_version != Self::FORMAT_VERSION {
85            return Err(ProvenanceError::UnsupportedVersion(self.format_version));
86        }
87
88        let mut next_line = 0u32;
89        for span in &self.spans {
90            if span.start_line != next_line || span.line_len == 0 {
91                return Err(ProvenanceError::InvalidCoverage);
92            }
93            let Some(origin_set) = self.origin_sets.get(span.origin_set_index as usize) else {
94                return Err(ProvenanceError::InvalidOriginSetIndex(
95                    span.origin_set_index,
96                ));
97            };
98            for origin_index in &origin_set.origin_indexes {
99                if self.origins.get(*origin_index as usize).is_none() {
100                    return Err(ProvenanceError::InvalidOriginIndex(*origin_index));
101                }
102            }
103            next_line = next_line.saturating_add(span.line_len);
104        }
105
106        if next_line != self.line_count {
107            return Err(ProvenanceError::InvalidCoverage);
108        }
109
110        Ok(())
111    }
112
113    pub fn line_origin_set_indexes(&self) -> Result<Vec<u32>, ProvenanceError> {
114        self.validate()?;
115        let mut out = Vec::with_capacity(self.line_count as usize);
116        for span in &self.spans {
117            for _ in 0..span.line_len {
118                out.push(span.origin_set_index);
119            }
120        }
121        Ok(out)
122    }
123}