Skip to main content

converge_knowledge/ingest/
source.rs

1//! Shared source provenance contracts for ingestion pipelines.
2//!
3//! These types are intentionally backend-agnostic and are used by upcoming
4//! Phase 2 (Apple ecosystem) and Phase 3 (rich media) ingestion paths.
5
6use chrono::{DateTime, Utc};
7use serde::{Deserialize, Serialize};
8use std::collections::{BTreeMap, HashMap};
9
10/// The high-level source type an ingested artifact came from.
11#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
12#[serde(rename_all = "snake_case")]
13pub enum SourceKind {
14    /// Apple Notes exported content.
15    AppleNote,
16    /// Screenshot image files.
17    Screenshot,
18    /// Photo image files.
19    Photo,
20    /// Video media files.
21    Video,
22    /// Audio media files.
23    Audio,
24    /// PDF documents.
25    Pdf,
26    /// Markdown documents.
27    Markdown,
28    /// Any other or not-yet-classified source.
29    Unknown,
30}
31
32impl SourceKind {
33    /// String form used in metadata keys and idempotency keys.
34    pub fn as_str(self) -> &'static str {
35        match self {
36            Self::AppleNote => "apple_note",
37            Self::Screenshot => "screenshot",
38            Self::Photo => "photo",
39            Self::Video => "video",
40            Self::Audio => "audio",
41            Self::Pdf => "pdf",
42            Self::Markdown => "markdown",
43            Self::Unknown => "unknown",
44        }
45    }
46}
47
48/// Provenance metadata shared by all ingesters.
49///
50/// This supports consistent source attribution and deterministic re-ingestion.
51#[derive(Debug, Clone, Serialize, Deserialize)]
52pub struct SourceProvenance {
53    /// What kind of source produced the content.
54    pub source_kind: SourceKind,
55    /// Stable source URI/path (e.g., `file:///...`, `notes://...`).
56    pub source_uri: String,
57    /// Optional upstream source identifier (e.g., Apple Note ID, Photos asset ID).
58    pub origin_id: Option<String>,
59    /// Optional content fingerprint (checksum, hash, opaque digest).
60    pub fingerprint: Option<String>,
61    /// When the source artifact was originally created/captured, if known.
62    pub captured_at: Option<DateTime<Utc>>,
63    /// When this provenance record was created during import.
64    pub imported_at: DateTime<Utc>,
65    /// Arbitrary source-specific metadata.
66    pub metadata: HashMap<String, String>,
67}
68
69impl SourceProvenance {
70    /// Create a new provenance record.
71    pub fn new(source_kind: SourceKind, source_uri: impl Into<String>) -> Self {
72        Self {
73            source_kind,
74            source_uri: source_uri.into(),
75            origin_id: None,
76            fingerprint: None,
77            captured_at: None,
78            imported_at: Utc::now(),
79            metadata: HashMap::new(),
80        }
81    }
82
83    /// Set the upstream origin ID.
84    pub fn with_origin_id(mut self, origin_id: impl Into<String>) -> Self {
85        self.origin_id = Some(origin_id.into());
86        self
87    }
88
89    /// Set the source fingerprint.
90    pub fn with_fingerprint(mut self, fingerprint: impl Into<String>) -> Self {
91        self.fingerprint = Some(fingerprint.into());
92        self
93    }
94
95    /// Set the capture timestamp.
96    pub fn with_captured_at(mut self, captured_at: DateTime<Utc>) -> Self {
97        self.captured_at = Some(captured_at);
98        self
99    }
100
101    /// Add a source-specific metadata key/value pair.
102    pub fn with_metadata(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
103        self.metadata.insert(key.into(), value.into());
104        self
105    }
106
107    /// Generate a deterministic idempotency key for de-duplication.
108    ///
109    /// Preference order:
110    /// 1. `origin_id` (best external identity)
111    /// 2. `fingerprint` (content identity)
112    /// 3. `source_uri` (path identity fallback)
113    pub fn idempotency_key(&self) -> String {
114        if let Some(origin_id) = &self.origin_id {
115            return format!("{}:origin:{}", self.source_kind.as_str(), origin_id);
116        }
117
118        if let Some(fingerprint) = &self.fingerprint {
119            return format!("{}:fingerprint:{}", self.source_kind.as_str(), fingerprint);
120        }
121
122        format!("{}:uri:{}", self.source_kind.as_str(), self.source_uri)
123    }
124
125    /// Export normalized metadata pairs for attaching to knowledge entries.
126    ///
127    /// Keys are namespaced under `source.*`.
128    pub fn metadata_pairs(&self) -> Vec<(String, String)> {
129        let mut out = BTreeMap::new();
130        out.insert(
131            "source.kind".to_string(),
132            self.source_kind.as_str().to_string(),
133        );
134        out.insert("source.uri".to_string(), self.source_uri.clone());
135        out.insert("source.idempotency_key".to_string(), self.idempotency_key());
136
137        if let Some(origin_id) = &self.origin_id {
138            out.insert("source.origin_id".to_string(), origin_id.clone());
139        }
140        if let Some(fingerprint) = &self.fingerprint {
141            out.insert("source.fingerprint".to_string(), fingerprint.clone());
142        }
143        if let Some(captured_at) = self.captured_at {
144            out.insert("source.captured_at".to_string(), captured_at.to_rfc3339());
145        }
146        out.insert(
147            "source.imported_at".to_string(),
148            self.imported_at.to_rfc3339(),
149        );
150
151        for (key, value) in &self.metadata {
152            out.insert(format!("source.meta.{key}"), value.clone());
153        }
154
155        out.into_iter().collect()
156    }
157}
158
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::TimeZone;

    /// Looks up the value stored under `key` in exported metadata pairs.
    fn value_of<'a>(pairs: &'a [(String, String)], key: &str) -> Option<&'a str> {
        pairs
            .iter()
            .find(|(k, _)| k == key)
            .map(|(_, v)| v.as_str())
    }

    #[test]
    fn idempotency_key_prefers_origin_id() {
        let provenance = SourceProvenance::new(SourceKind::AppleNote, "notes://abc")
            .with_fingerprint("sha256:deadbeef")
            .with_origin_id("note-123");

        assert_eq!(provenance.idempotency_key(), "apple_note:origin:note-123");
    }

    #[test]
    fn idempotency_key_falls_back_to_fingerprint_then_uri() {
        // No origin ID: the fingerprint is the next-best identity.
        let with_fingerprint = SourceProvenance::new(SourceKind::Pdf, "file:///docs/a.pdf")
            .with_fingerprint("sha256:deadbeef");
        assert_eq!(
            with_fingerprint.idempotency_key(),
            "pdf:fingerprint:sha256:deadbeef"
        );

        // Neither origin ID nor fingerprint: the URI is the last resort.
        let uri_only = SourceProvenance::new(SourceKind::Pdf, "file:///docs/a.pdf");
        assert_eq!(uri_only.idempotency_key(), "pdf:uri:file:///docs/a.pdf");
    }

    #[test]
    fn metadata_pairs_are_namespaced_and_sorted() {
        let captured = Utc.with_ymd_and_hms(2025, 1, 2, 3, 4, 5).unwrap();
        let mut provenance = SourceProvenance::new(SourceKind::Screenshot, "/tmp/shot.png")
            .with_fingerprint("abc123")
            .with_captured_at(captured)
            .with_metadata("window_title", "Mail")
            .with_metadata("app", "Mail");
        // Pin the import time so the record is fully deterministic.
        provenance.imported_at = Utc.with_ymd_and_hms(2025, 1, 2, 10, 11, 12).unwrap();

        let pairs = provenance.metadata_pairs();
        let keys: Vec<&str> = pairs.iter().map(|(k, _)| k.as_str()).collect();

        // Sorted by key.
        assert!(keys.windows(2).all(|w| w[0] <= w[1]));
        // Every key lives under the `source.` namespace.
        assert!(keys.iter().all(|k| k.starts_with("source.")));

        assert!(keys.contains(&"source.kind"));
        assert!(keys.contains(&"source.uri"));
        assert!(keys.contains(&"source.imported_at"));
        assert!(keys.contains(&"source.captured_at"));
        assert!(keys.contains(&"source.meta.window_title"));
        assert!(keys.contains(&"source.idempotency_key"));
        // No origin ID was set, so its key must be absent.
        assert!(!keys.contains(&"source.origin_id"));

        assert_eq!(value_of(&pairs, "source.kind"), Some("screenshot"));
        assert_eq!(value_of(&pairs, "source.uri"), Some("/tmp/shot.png"));
        assert_eq!(value_of(&pairs, "source.fingerprint"), Some("abc123"));
        assert_eq!(
            value_of(&pairs, "source.idempotency_key"),
            Some("screenshot:fingerprint:abc123")
        );
        assert_eq!(value_of(&pairs, "source.meta.app"), Some("Mail"));
    }
}