chunkshop-rs 0.9.1

Standalone ingest-to-pgvector: source -> chunker -> embedder -> extractor -> table. int8 BGE by default; bakeoff matrix evaluator built in. Cross-language wire-format compatible with the Python `chunkshop` package.
Documentation
//! RM-B Task 6: cross-language wire-format parity for the SP-1 sync
//! primitives (SyncMode + per-source cursor JSON shapes).
//!
//! The big "vectors are byte-identical across implementations" promise
//! still holds — it's covered by `cross_language_sqlite_parity.rs` and the
//! embedding parity fixtures. This test focuses on what RM-B *added*:
//! YAML enum spelling for SyncMode and the JSON shape of each Source's
//! cursor type.
//!
//! Assertions are against hard-coded reference strings that match Python's
//! `json.dumps` / pydantic-serialized output. Anyone changing those wire
//! formats must update both ends.

use std::collections::BTreeMap;

use chunkshop::sources::base::SyncMode;
use chunkshop::sources::http::HttpUrlCursor;
use chunkshop::sources::pg_table::PgTableCursor;

#[test]
fn sync_mode_yaml_spelling_matches_python() {
    // Each variant is paired with the string Python exposes as the enum's
    // value (e.g. `SyncMode.FULL_RESYNC.value == "full_resync"`). Both
    // implementations have to agree on these snake_case spellings, otherwise
    // a YAML config written by one side would not load in the other.
    let cases = [
        (SyncMode::FullResync, "full_resync"),
        (SyncMode::Cursor, "cursor"),
        (SyncMode::Fingerprint, "fingerprint"),
    ];

    for (mode, py_repr) in cases {
        let encoded = serde_json::to_string(&mode).unwrap();
        // serde_json wraps the variant in double quotes; drop them so only
        // the bare spelling is compared against the Python value.
        let spelling = encoded.trim_matches('"');
        assert_eq!(spelling, py_repr, "SyncMode::{mode:?} → {py_repr}");
    }
}

#[test]
fn pg_table_cursor_json_shape_matches_python_dict() {
    // Reference shape on the Python side:
    //   {"after_ts": "2026-05-25T12:00:00+00:00", "after_id": "c1"}
    let populated = PgTableCursor {
        after_ts: Some("2026-05-25T12:00:00+00:00".into()),
        after_id: Some("c1".into()),
    };
    let encoded = serde_json::to_string(&populated).unwrap();

    // Parse the output back into a generic JSON value and check each key on
    // its own, so the assertions do not depend on the order in which the
    // keys were written.
    let parsed: serde_json::Value = serde_json::from_str(&encoded).unwrap();
    assert_eq!(parsed["after_ts"].as_str(), Some("2026-05-25T12:00:00+00:00"));
    assert_eq!(parsed["after_id"].as_str(), Some("c1"));

    // The empty cursor is the literal `{}` (what Python's `empty_cursor()`
    // returns): it must load with both fields unset and write back as `{}`.
    let blank: PgTableCursor = serde_json::from_str("{}").unwrap();
    assert!(blank.after_ts.is_none());
    assert!(blank.after_id.is_none());
    assert_eq!(serde_json::to_string(&blank).unwrap(), "{}");
}

#[test]
fn s3_cursor_json_shape_matches_python_dict() {
    // Reference shape on the Python side:
    //   {"k1": "\"etag1\"", "k2": "\"etag2\""}
    // The values carry their own embedded double quotes.
    let cursor: BTreeMap<String, String> = BTreeMap::from([
        ("k1".to_owned(), "\"etag1\"".to_owned()),
        ("k2".to_owned(), "\"etag2\"".to_owned()),
    ]);

    // A BTreeMap iterates in sorted key order, so the serialized text is
    // deterministic and can be compared as an exact string.
    let encoded = serde_json::to_string(&cursor).unwrap();
    assert_eq!(encoded, r#"{"k1":"\"etag1\"","k2":"\"etag2\""}"#);

    // Decoding the text must reproduce the original map.
    let decoded: BTreeMap<String, String> = serde_json::from_str(&encoded).unwrap();
    assert_eq!(decoded, cursor);
}

#[test]
fn http_cursor_json_shape_matches_python_nested_dict() {
    // Reference shape on the Python side (URL -> per-URL validators):
    //   {"https://a.test/": {"etag": "\"e1\"",
    //                        "last_modified": "Mon, 25 May 2026 12:00:00 GMT"}}
    let validators = HttpUrlCursor {
        etag: Some("\"e1\"".into()),
        last_modified: Some("Mon, 25 May 2026 12:00:00 GMT".into()),
    };
    let cursor: BTreeMap<String, HttpUrlCursor> =
        BTreeMap::from([("https://a.test/".to_owned(), validators)]);

    // Serialize, then inspect the nested object through a generic JSON value.
    let encoded = serde_json::to_string(&cursor).unwrap();
    let parsed: serde_json::Value = serde_json::from_str(&encoded).unwrap();
    let per_url = &parsed["https://a.test/"];
    assert_eq!(per_url["etag"], "\"e1\"");
    assert_eq!(per_url["last_modified"], "Mon, 25 May 2026 12:00:00 GMT");

    // With neither validator set the entry must serialize to `{}` (the
    // `None` fields are skipped), and `{}` must load back as the default.
    let blank = HttpUrlCursor::default();
    assert_eq!(serde_json::to_string(&blank).unwrap(), "{}");
    let decoded: HttpUrlCursor = serde_json::from_str("{}").unwrap();
    assert_eq!(decoded, HttpUrlCursor::default());
}

#[test]
fn raw_store_local_layout_is_python_byte_identical() {
    // The path layout uses sha256 hex hashes — Python and Rust must produce
    // the same path for the same doc_id. This is verified at the unit level
    // in raw_store.rs::local_layout_matches_python_sha256, but assert it
    // here too as part of the parity suite so a regression on either side
    // is obvious in this consolidated test.
    use sha2::{Digest, Sha256};

    // Hash `data` and render the digest the way this test expects paths to
    // spell it: via the `{:x}` (lower-hex) formatter.
    let sha256_hex = |data: &[u8]| -> String {
        let mut hasher = Sha256::new();
        hasher.update(data);
        format!("{:x}", hasher.finalize())
    };

    // Known-answer check. We're not running Python here, so pin the hex
    // rendering against the standard SHA-256 test vector for "abc", which is
    // exactly what `hashlib.sha256(b"abc").hexdigest()` returns.
    assert_eq!(
        sha256_hex(b"abc"),
        "ba7816bf8f01cfea414140de5dae2223b00361a396177a9cbb410ff61f20015ad",
        "sha256 hex rendering diverged from the reference vector",
    );

    for doc_id in ["doc::1", "s3://bucket/key", "https://example.test/path?q=1"] {
        let rust_hex = sha256_hex(doc_id.as_bytes());
        // Format contract shared with Python's hexdigest(): lowercase,
        // no separators, full 64 characters.
        assert_eq!(rust_hex.len(), 64);
        assert!(rust_hex.chars().all(|c| c.is_ascii_hexdigit()));
        assert_eq!(rust_hex.to_lowercase(), rust_hex);
        // Diagnostic only (shown on failure or with --nocapture); the arrow
        // keeps the doc_id and its digest visually separate.
        eprintln!("doc_id={doc_id:?} → {rust_hex}");
    }
}