1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
//! RM-B Task 6: cross-language wire-format parity for the SP-1 sync
//! primitives (SyncMode + per-source cursor JSON shapes).
//!
//! The big "vectors are byte-identical across implementations" promise
//! still holds — it's covered by `cross_language_sqlite_parity.rs` and the
//! embedding parity fixtures. This test focuses on what RM-B *added*:
//! YAML enum spelling for SyncMode and the JSON shape of each Source's
//! cursor type.
//!
//! Assertions are against hard-coded reference strings that match Python's
//! `json.dumps` / pydantic-serialized output. Anyone changing those wire
//! formats must update both ends.
use std::collections::BTreeMap;
use chunkshop::sources::base::SyncMode;
use chunkshop::sources::http::HttpUrlCursor;
use chunkshop::sources::pg_table::PgTableCursor;
#[test]
fn sync_mode_yaml_spelling_matches_python() {
    // Wire contract: each variant must serialize to the same snake_case
    // token as the Python enum's `.value` (for example
    // `SyncMode.FULL_RESYNC.value == "full_resync"`), so that a config
    // written by one implementation loads in the other.
    let expectations = [
        (SyncMode::FullResync, "full_resync"),
        (SyncMode::Cursor, "cursor"),
        (SyncMode::Fingerprint, "fingerprint"),
    ];

    for (mode, py_repr) in expectations {
        let encoded = serde_json::to_string(&mode).unwrap();
        // serde_json wraps the variant name in double quotes; drop them so
        // only the bare spelling is compared.
        let raw = encoded.trim_matches('"');
        assert_eq!(raw, py_repr, "SyncMode::{mode:?} → {py_repr}");
    }
}
#[test]
fn pg_table_cursor_json_shape_matches_python_dict() {
    // Python: `{"after_ts": "2026-05-25T12:00:00+00:00", "after_id": "c1"}`
    let cursor = PgTableCursor {
        after_ts: Some("2026-05-25T12:00:00+00:00".into()),
        after_id: Some("c1".into()),
    };
    let json = serde_json::to_string(&cursor).unwrap();

    // Compare the parsed document as a whole rather than probing individual
    // keys: `Value` equality ignores key order and whitespace, but it fails
    // if Rust emits any key that the Python dict does not contain (or omits
    // one that it does), which is exactly the drift this suite must catch.
    let back: serde_json::Value = serde_json::from_str(&json).unwrap();
    assert_eq!(
        back,
        serde_json::json!({
            "after_ts": "2026-05-25T12:00:00+00:00",
            "after_id": "c1",
        })
    );

    // Empty cursor → `{}`. Python's `empty_cursor()` returns `{}` literally,
    // so `{}` must both parse into an all-`None` cursor and be what an
    // all-`None` cursor serializes to.
    let empty: PgTableCursor = serde_json::from_str("{}").unwrap();
    assert!(empty.after_ts.is_none() && empty.after_id.is_none());
    assert_eq!(serde_json::to_string(&empty).unwrap(), "{}");
}
#[test]
fn s3_cursor_json_shape_matches_python_dict() {
    // Python: `{"k1": "\"etag1\"", "k2": "\"etag2\""}`
    // The values keep their own embedded double quotes, so they must show
    // up escaped inside the JSON string values.
    let cursor: BTreeMap<String, String> = [("k1", "\"etag1\""), ("k2", "\"etag2\"")]
        .iter()
        .map(|(key, etag)| (String::from(*key), String::from(*etag)))
        .collect();

    // A BTreeMap is walked in ascending key order, which makes the encoded
    // text stable enough for an exact string comparison.
    let encoded = serde_json::to_string(&cursor).unwrap();
    assert_eq!(encoded, r#"{"k1":"\"etag1\"","k2":"\"etag2\""}"#);

    // Decoding the encoded text must give back the original map.
    let decoded: BTreeMap<String, String> = serde_json::from_str(&encoded).unwrap();
    assert_eq!(decoded, cursor);
}
#[test]
fn http_cursor_json_shape_matches_python_nested_dict() {
    // Python:
    // {"https://a.test/": {"etag": "\"e1\"",
    //                      "last_modified": "Mon, 25 May 2026 12:00:00 GMT"}}
    let mut cursor: BTreeMap<String, HttpUrlCursor> = BTreeMap::new();
    cursor.insert(
        "https://a.test/".into(),
        HttpUrlCursor {
            etag: Some("\"e1\"".into()),
            last_modified: Some("Mon, 25 May 2026 12:00:00 GMT".into()),
        },
    );
    let json = serde_json::to_string(&cursor).unwrap();

    // Compare the whole nested document, not just the two leaf values, so
    // that an unexpected extra URL entry or an extra per-entry key on the
    // Rust side fails the test. `Value` equality is independent of key
    // order.
    let back: serde_json::Value = serde_json::from_str(&json).unwrap();
    assert_eq!(
        back,
        serde_json::json!({
            "https://a.test/": {
                "etag": "\"e1\"",
                "last_modified": "Mon, 25 May 2026 12:00:00 GMT",
            },
        })
    );

    // An entry with both etag and last_modified == None serializes to `{}`
    // (skip_serializing_if = "Option::is_none"), and `{}` parses back to the
    // default entry.
    let empty = HttpUrlCursor::default();
    assert_eq!(serde_json::to_string(&empty).unwrap(), "{}");
    let restored: HttpUrlCursor = serde_json::from_str("{}").unwrap();
    assert_eq!(restored, HttpUrlCursor::default());
}
#[test]
fn raw_store_local_layout_is_python_byte_identical() {
    // The path layout uses sha256 hex hashes — Python and Rust must produce
    // the same path for the same doc_id. This is verified at the unit level
    // in raw_store.rs::local_layout_matches_python_sha256, but assert it
    // here too as part of the parity suite so a regression on either side
    // is obvious in this consolidated test.
    use sha2::{Digest, Sha256};

    // Hash the UTF-8 bytes of `input` and render the digest the same way
    // for every check below.
    let sha256_hex = |input: &str| -> String {
        let mut h = Sha256::new();
        h.update(input.as_bytes());
        format!("{:x}", h.finalize())
    };

    // Known-answer check: the standard SHA-256 test vector for "abc", which
    // is also what Python's `hashlib.sha256(b"abc").hexdigest()` returns.
    // Without a reference value the shape checks below would pass for any
    // 64-character lowercase hex string, whatever was actually hashed.
    assert_eq!(
        sha256_hex("abc"),
        "ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad"
    );

    for doc_id in ["doc::1", "s3://bucket/key", "https://example.test/path?q=1"] {
        let rust_hex = sha256_hex(doc_id);
        // We're not running Python here; these assert our own hex format
        // (lowercase, no separators, full 64 chars), which is the format of
        // Python's hexdigest().
        assert_eq!(rust_hex.len(), 64);
        assert!(rust_hex.chars().all(|c| c.is_ascii_hexdigit()));
        assert_eq!(rust_hex.to_lowercase(), rust_hex);
        eprintln!("doc_id={doc_id:?} → {rust_hex}");
    }
}