// ceres_core/sync.rs

1//! Sync service layer for portal synchronization logic.
2//!
3//! This module provides pure business logic for delta detection and sync statistics,
4//! decoupled from I/O operations and CLI orchestration.
5
/// Outcome of processing a single dataset during sync.
///
/// Fieldless and `Copy`; also `Hash` so outcomes can be used as map/set keys.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SyncOutcome {
    /// Dataset content hash matches existing - no changes needed
    Unchanged,
    /// Dataset content changed - embedding regenerated
    Updated,
    /// New dataset - first time seeing this dataset
    Created,
    /// Processing failed for this dataset
    Failed,
}

/// Statistics for a portal sync operation.
///
/// One counter per [`SyncOutcome`] variant. Use [`SyncStats::record`] to
/// tally outcomes, and [`SyncStats::total`] / [`SyncStats::successful`]
/// for aggregates. All fields are plain counters, so the type is `Copy`
/// and comparable.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct SyncStats {
    /// Datasets whose content hash matched the stored hash (skipped).
    pub unchanged: usize,
    /// Datasets whose content changed and were reprocessed.
    pub updated: usize,
    /// Datasets seen for the first time.
    pub created: usize,
    /// Datasets whose processing failed.
    pub failed: usize,
}

impl SyncStats {
    /// Creates a new empty stats tracker (all counters zero).
    pub fn new() -> Self {
        Self::default()
    }

    /// Records an outcome, incrementing the appropriate counter.
    pub fn record(&mut self, outcome: SyncOutcome) {
        match outcome {
            SyncOutcome::Unchanged => self.unchanged += 1,
            SyncOutcome::Updated => self.updated += 1,
            SyncOutcome::Created => self.created += 1,
            SyncOutcome::Failed => self.failed += 1,
        }
    }

    /// Returns the total number of processed datasets (including failures).
    pub fn total(&self) -> usize {
        self.unchanged + self.updated + self.created + self.failed
    }

    /// Returns the number of successfully processed datasets
    /// (everything except `failed`).
    pub fn successful(&self) -> usize {
        self.unchanged + self.updated + self.created
    }
}
54
55/// Result of delta detection for a dataset.
56#[derive(Debug, Clone, PartialEq, Eq)]
57pub struct ReprocessingDecision {
58    /// Whether embedding needs to be regenerated
59    pub needs_embedding: bool,
60    /// The outcome classification for this dataset
61    pub outcome: SyncOutcome,
62    /// Human-readable reason for the decision
63    pub reason: &'static str,
64}
65
66impl ReprocessingDecision {
67    /// Returns true if this is a legacy record update (existing record without hash).
68    pub fn is_legacy(&self) -> bool {
69        self.reason == "legacy record without hash"
70    }
71}
72
73/// Determines if a dataset needs reprocessing based on content hash comparison.
74///
75/// # Arguments
76/// * `existing_hash` - The stored content hash for this dataset (None if new dataset)
77/// * `new_hash` - The computed content hash from the portal data
78///
79/// # Returns
80/// A `ReprocessingDecision` indicating whether embedding regeneration is needed
81/// and the classification of this sync operation.
82pub fn needs_reprocessing(
83    existing_hash: Option<&Option<String>>,
84    new_hash: &str,
85) -> ReprocessingDecision {
86    match existing_hash {
87        Some(Some(hash)) if hash == new_hash => {
88            // Hash matches - content unchanged
89            ReprocessingDecision {
90                needs_embedding: false,
91                outcome: SyncOutcome::Unchanged,
92                reason: "content hash matches",
93            }
94        }
95        Some(Some(_)) => {
96            // Hash exists but differs - content updated
97            ReprocessingDecision {
98                needs_embedding: true,
99                outcome: SyncOutcome::Updated,
100                reason: "content hash changed",
101            }
102        }
103        Some(None) => {
104            // Exists but no hash (legacy data) - treat as update
105            ReprocessingDecision {
106                needs_embedding: true,
107                outcome: SyncOutcome::Updated,
108                reason: "legacy record without hash",
109            }
110        }
111        None => {
112            // Not in existing data - new dataset
113            ReprocessingDecision {
114                needs_embedding: true,
115                outcome: SyncOutcome::Created,
116                reason: "new dataset",
117            }
118        }
119    }
120}
121
122// =============================================================================
123// Batch Harvest Types
124// =============================================================================
125
126/// Result of harvesting a single portal in batch mode.
127#[derive(Debug, Clone)]
128pub struct PortalHarvestResult {
129    /// Portal name identifier.
130    pub portal_name: String,
131    /// Portal URL.
132    pub portal_url: String,
133    /// Sync statistics for this portal.
134    pub stats: SyncStats,
135    /// Error message if harvest failed, None if successful.
136    pub error: Option<String>,
137}
138
139impl PortalHarvestResult {
140    /// Creates a successful harvest result.
141    pub fn success(name: String, url: String, stats: SyncStats) -> Self {
142        Self {
143            portal_name: name,
144            portal_url: url,
145            stats,
146            error: None,
147        }
148    }
149
150    /// Creates a failed harvest result.
151    pub fn failure(name: String, url: String, error: String) -> Self {
152        Self {
153            portal_name: name,
154            portal_url: url,
155            stats: SyncStats::default(),
156            error: Some(error),
157        }
158    }
159
160    /// Returns true if the harvest was successful.
161    pub fn is_success(&self) -> bool {
162        self.error.is_none()
163    }
164}
165
166/// Aggregated results from batch harvesting multiple portals.
167#[derive(Debug, Clone, Default)]
168pub struct BatchHarvestSummary {
169    /// Results for each portal.
170    pub results: Vec<PortalHarvestResult>,
171}
172
173impl BatchHarvestSummary {
174    /// Creates a new empty summary.
175    pub fn new() -> Self {
176        Self::default()
177    }
178
179    /// Adds a portal harvest result.
180    pub fn add(&mut self, result: PortalHarvestResult) {
181        self.results.push(result);
182    }
183
184    /// Returns the count of successful harvests.
185    pub fn successful_count(&self) -> usize {
186        self.results.iter().filter(|r| r.is_success()).count()
187    }
188
189    /// Returns the count of failed harvests.
190    pub fn failed_count(&self) -> usize {
191        self.results.iter().filter(|r| !r.is_success()).count()
192    }
193
194    /// Returns the total number of datasets across all successful portals.
195    pub fn total_datasets(&self) -> usize {
196        self.results.iter().map(|r| r.stats.total()).sum()
197    }
198
199    /// Returns the total number of portals processed.
200    pub fn total_portals(&self) -> usize {
201        self.results.len()
202    }
203}
204
#[cfg(test)]
mod tests {
    use super::*;

    // Pure-logic unit tests: no I/O, everything runs in-memory.

    #[test]
    fn test_sync_stats_default() {
        let stats = SyncStats::new();
        assert_eq!(stats.unchanged, 0);
        assert_eq!(stats.updated, 0);
        assert_eq!(stats.created, 0);
        assert_eq!(stats.failed, 0);
    }

    #[test]
    fn test_sync_stats_record() {
        let mut stats = SyncStats::new();
        stats.record(SyncOutcome::Unchanged);
        stats.record(SyncOutcome::Updated);
        stats.record(SyncOutcome::Created);
        stats.record(SyncOutcome::Failed);

        assert_eq!(stats.unchanged, 1);
        assert_eq!(stats.updated, 1);
        assert_eq!(stats.created, 1);
        assert_eq!(stats.failed, 1);
    }

    #[test]
    fn test_sync_stats_total() {
        let mut stats = SyncStats::new();
        stats.unchanged = 10;
        stats.updated = 5;
        stats.created = 3;
        stats.failed = 2;

        assert_eq!(stats.total(), 20);
    }

    #[test]
    fn test_sync_stats_successful() {
        let mut stats = SyncStats::new();
        stats.unchanged = 10;
        stats.updated = 5;
        stats.created = 3;
        stats.failed = 2;

        // successful() excludes the `failed` counter.
        assert_eq!(stats.successful(), 18);
    }

    #[test]
    fn test_needs_reprocessing_unchanged() {
        let hash = "abc123".to_string();
        let existing = Some(Some(hash.clone()));
        let decision = needs_reprocessing(existing.as_ref(), &hash);

        assert!(!decision.needs_embedding);
        assert_eq!(decision.outcome, SyncOutcome::Unchanged);
        assert_eq!(decision.reason, "content hash matches");
    }

    #[test]
    fn test_needs_reprocessing_updated() {
        let old_hash = "abc123".to_string();
        let new_hash = "def456";
        let existing = Some(Some(old_hash));
        let decision = needs_reprocessing(existing.as_ref(), new_hash);

        assert!(decision.needs_embedding);
        assert_eq!(decision.outcome, SyncOutcome::Updated);
        assert_eq!(decision.reason, "content hash changed");
    }

    #[test]
    fn test_needs_reprocessing_legacy() {
        // Some(None): record exists but predates content-hash tracking.
        let existing: Option<Option<String>> = Some(None);
        let decision = needs_reprocessing(existing.as_ref(), "new_hash");

        assert!(decision.needs_embedding);
        assert_eq!(decision.outcome, SyncOutcome::Updated);
        assert_eq!(decision.reason, "legacy record without hash");
    }

    #[test]
    fn test_needs_reprocessing_new() {
        let decision = needs_reprocessing(None, "new_hash");

        assert!(decision.needs_embedding);
        assert_eq!(decision.outcome, SyncOutcome::Created);
        assert_eq!(decision.reason, "new dataset");
    }

    #[test]
    fn test_is_legacy_true() {
        let existing: Option<Option<String>> = Some(None);
        let decision = needs_reprocessing(existing.as_ref(), "new_hash");

        assert!(decision.is_legacy());
    }

    #[test]
    fn test_is_legacy_false() {
        let decision = needs_reprocessing(None, "new_hash");
        assert!(!decision.is_legacy());

        let hash = "abc123".to_string();
        let existing = Some(Some(hash.clone()));
        let decision = needs_reprocessing(existing.as_ref(), &hash);
        assert!(!decision.is_legacy());
    }

    // =========================================================================
    // PortalHarvestResult tests
    // =========================================================================

    #[test]
    fn test_portal_harvest_result_success() {
        let stats = SyncStats {
            unchanged: 5,
            updated: 3,
            created: 2,
            failed: 0,
        };
        let result = PortalHarvestResult::success(
            "test".to_string(),
            "https://example.com".to_string(),
            stats,
        );
        assert!(result.is_success());
        assert!(result.error.is_none());
        assert_eq!(result.stats.total(), 10);
        assert_eq!(result.portal_name, "test");
        assert_eq!(result.portal_url, "https://example.com");
    }

    #[test]
    fn test_portal_harvest_result_failure() {
        let result = PortalHarvestResult::failure(
            "test".to_string(),
            "https://example.com".to_string(),
            "Connection timeout".to_string(),
        );
        assert!(!result.is_success());
        assert_eq!(result.error, Some("Connection timeout".to_string()));
        // failure() always attaches empty stats.
        assert_eq!(result.stats.total(), 0);
    }

    // =========================================================================
    // BatchHarvestSummary tests
    // =========================================================================

    #[test]
    fn test_batch_harvest_summary_empty() {
        let summary = BatchHarvestSummary::new();
        assert_eq!(summary.successful_count(), 0);
        assert_eq!(summary.failed_count(), 0);
        assert_eq!(summary.total_datasets(), 0);
        assert_eq!(summary.total_portals(), 0);
    }

    #[test]
    fn test_batch_harvest_summary_mixed_results() {
        let mut summary = BatchHarvestSummary::new();

        let stats1 = SyncStats {
            unchanged: 10,
            updated: 5,
            created: 3,
            failed: 2,
        };
        summary.add(PortalHarvestResult::success(
            "a".into(),
            "https://a.com".into(),
            stats1,
        ));

        summary.add(PortalHarvestResult::failure(
            "b".into(),
            "https://b.com".into(),
            "error".into(),
        ));

        let stats2 = SyncStats {
            unchanged: 20,
            updated: 0,
            created: 0,
            failed: 0,
        };
        summary.add(PortalHarvestResult::success(
            "c".into(),
            "https://c.com".into(),
            stats2,
        ));

        assert_eq!(summary.total_portals(), 3);
        assert_eq!(summary.successful_count(), 2);
        assert_eq!(summary.failed_count(), 1);
        assert_eq!(summary.total_datasets(), 40); // 20 + 20 + 0 (failed portal has 0)
    }

    #[test]
    fn test_batch_harvest_summary_all_successful() {
        let mut summary = BatchHarvestSummary::new();

        let stats = SyncStats {
            unchanged: 5,
            updated: 0,
            created: 5,
            failed: 0,
        };
        summary.add(PortalHarvestResult::success(
            "portal1".into(),
            "https://portal1.com".into(),
            stats,
        ));

        assert_eq!(summary.successful_count(), 1);
        assert_eq!(summary.failed_count(), 0);
        assert_eq!(summary.total_datasets(), 10);
    }

    #[test]
    fn test_batch_harvest_summary_all_failed() {
        let mut summary = BatchHarvestSummary::new();

        summary.add(PortalHarvestResult::failure(
            "portal1".into(),
            "https://portal1.com".into(),
            "error1".into(),
        ));
        summary.add(PortalHarvestResult::failure(
            "portal2".into(),
            "https://portal2.com".into(),
            "error2".into(),
        ));

        assert_eq!(summary.successful_count(), 0);
        assert_eq!(summary.failed_count(), 2);
        assert_eq!(summary.total_datasets(), 0);
        assert_eq!(summary.total_portals(), 2);
    }
}