zer-lib 1.0.0

Zero-shot entity resolution pipeline for Dutch-centric data, with GPU acceleration and neural judging
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
//! `zer-lib`, unified entity resolution library.
//!
//! Provides [`Comparator`], [`Scorer`], and a [`Backend`] abstraction that can
//! run on an accelerated device when compiled with the `cuda`, `vulkan`, or
//! `avx2` features.  The backend is chosen from the `--target=<name>` process
//! argument (see [`Backend::auto_detect`]); without that argument, or without
//! those features, the crate runs entirely on CPU via `zer-compare`.
//!
//! # Quick start
//!
//! ```rust,no_run
//! use zer_lib::prelude::*;
//!
//! let schema = SchemaBuilder::new()
//!     .field("naam",  FieldKind::Name)
//!     .field("datum", FieldKind::Date)
//!     .build().unwrap();
//!
//! let backend    = Backend::auto_detect();        // honours --target=<name>; CPU when absent
//! let comparator = Comparator::new(&schema, &backend);
//! let scorer     = Scorer::new(&backend);
//! ```
//!
//! # Feature flags
//!
//! **Compute backends** (mutually exclusive in practice; pick one):
//!
//! | Flag             | Description                                                              |
//! |------------------|--------------------------------------------------------------------------|
//! | `cuda`           | NVIDIA CUDA via `zer-compute`, requires CUDA Toolkit 13.1+ and `nvcc`   |
//! | `vulkan`         | Vulkan 1.3 compute via `zer-compute`, requires `slangc` on `PATH`        |
//! | `avx2`           | x86_64 AVX2 SIMD via `zer-compute`, no external toolchain required       |
//! | `cpu`            | Explicit scalar CPU path via `zer-compute` (Rayon parallel)              |
//! | `debug-shaders`  | Embed debug info in CUDA kernels for `cuda-gdb` / Nsight (needs `cuda`) |
//!
//! **Pipeline integration:**
//!
//! | Flag       | Description                                                              |
//! |------------|--------------------------------------------------------------------------|
//! | `pipeline` | Enable `Pipeline`, `Ingester`, and related types from `zer-pipeline`     |
//!
//! **Neural judge ORT execution providers** (independent of compute backend):
//!
//! | Flag             | Description                                                              |
//! |------------------|--------------------------------------------------------------------------|
//! | `judge_cpu`      | Scalar CPU execution provider for ORT (no extra dependencies)            |
//! | `judge_cuda`     | NVIDIA CUDA execution provider for ORT                                   |
//! | `judge_rocm`     | AMD ROCm execution provider for ORT                                      |
//! | `judge_directml` | Windows DirectML execution provider for ORT                              |
//! | `judge_openvino` | Intel OpenVINO execution provider for ORT                                |
//!
//! # CPU-only usage
//!
//! Users who never need GPU can depend on `zer-compare` directly and never
//! import this crate.  `zer_compare::FieldComparator` and
//! `zer_compare::FellegiSunterScorer` are the raw CPU implementations.

#[cfg(any(feature = "cuda", feature = "avx2", feature = "vulkan"))]
use std::sync::Arc;

use zer_core::{
    comparison::{ComparisonBatch, ComparisonVector},
    record::Record,
    record_pool::RecordPool,
    schema::Schema,
    scoring::{ModelParams, ScoredPair},
    traits::{Comparator as ComparatorTrait, Result as ZerResult, Scorer as ScorerTrait},
};

// ── Backend ───────────────────────────────────────────────────────────────────

/// Private storage behind [`Backend`]: which execution path the handle selects.
enum BackendInner {
    /// Plain CPU path; carries no device state.
    Cpu,
    /// Shared handle to a `zer_compute` device.
    ///
    /// Only compiled in when one of the `cuda`, `avx2`, or `vulkan` features is
    /// enabled, so despite the variant name this is not strictly GPU-only.
    #[cfg(any(feature = "cuda", feature = "avx2", feature = "vulkan"))]
    Gpu(Arc<zer_compute::DeviceBackend>),
}

/// Opaque compute backend handle.
///
/// Create once and share between [`Comparator`] and [`Scorer`] so both use the
/// same underlying GPU device.
///
/// ```rust,no_run
/// use zer_lib::prelude::*;
///
/// let schema     = SchemaBuilder::new().field("naam", FieldKind::Name).build().unwrap();
/// let backend    = Backend::auto_detect();
/// let comparator = Comparator::new(&schema, &backend);
/// let scorer     = Scorer::new(&backend);
/// ```
pub struct Backend {
    // Selected execution path (CPU, or a shared device handle).
    inner: BackendInner,
    // Label returned verbatim by `Backend::name`.
    name:  &'static str,
}

impl Backend {
    /// Read `--target=<name>` from process args and return the matching backend.
    ///
    /// Falls back to CPU when the flag is absent; no hardware probing is done
    /// in that case.  Pass `--target=auto` to restore the hardware-detection
    /// order (CUDA → Vulkan → AVX2 → CPU).
    ///
    /// Arguments that are not valid Unicode are ignored rather than causing a
    /// panic, so an unrelated non-UTF-8 argument (for example a file path)
    /// cannot abort backend selection.
    ///
    /// Like [`Backend::from_target`], this exits the process when the
    /// requested target cannot be honoured.
    pub fn auto_detect() -> Self {
        // `std::env::args()` panics while iterating if *any* argument is not
        // valid Unicode; `args_os()` never does, so filter through `to_str()`.
        let requested = std::env::args_os().find_map(|arg| {
            arg.to_str()
                .and_then(|s| s.strip_prefix("--target="))
                .map(str::to_owned)
        });

        match requested.as_deref() {
            Some(t) => Self::from_target(t),
            None    => Self::cpu(),
        }
    }

    /// Force the CPU backend regardless of available hardware.
    pub fn cpu() -> Self {
        Self { inner: BackendInner::Cpu, name: "cpu" }
    }

    /// Select a backend by name, called by `auto_detect()` to resolve `--target=<name>`.
    ///
    /// Accepted values: `"auto"` (hardware-detect), `"cpu"`, `"cuda"`, `"avx2"`, `"vulkan"`.
    /// When the crate is built without any of the `cuda`, `avx2`, or `vulkan`
    /// features only `"auto"` and `"cpu"` are accepted, and both yield the CPU
    /// backend.
    ///
    /// Exits with a diagnostic if the target is unknown, not compiled in, or hardware init fails.
    pub fn from_target(target: &str) -> Self {
        // "cpu" is valid in every build configuration, so handle it up front.
        if target == "cpu" {
            return Self::cpu();
        }

        #[cfg(any(feature = "cuda", feature = "avx2", feature = "vulkan"))]
        {
            let pref = match target {
                "auto"   => zer_compute::BackendPreference::Auto,
                "cuda"   => zer_compute::BackendPreference::Cuda,
                "vulkan" => zer_compute::BackendPreference::Vulkan,
                "avx2"   => zer_compute::BackendPreference::Avx2,
                other => {
                    tracing::error!(target = other, "unknown --target; valid: auto, cpu, avx2, cuda, vulkan");
                    std::process::exit(1);
                }
            };
            return match zer_compute::DeviceBackend::from_preference(pref) {
                Ok(dev) => {
                    let name = dev.name();
                    if dev.is_accelerated() {
                        Self { inner: BackendInner::Gpu(Arc::new(dev)), name }
                    } else {
                        // A device that does not report itself as accelerated
                        // is discarded and the plain CPU path is used instead.
                        Self { inner: BackendInner::Cpu, name: "cpu" }
                    }
                }
                Err(e) => {
                    tracing::error!(target, error = %e, "--target unavailable");
                    std::process::exit(1);
                }
            };
        }

        // Only reachable when no accelerator feature is compiled in; with such
        // a feature the block above always returns, hence the lint allowance.
        #[allow(unreachable_code)]
        {
            if target == "auto" {
                return Self::cpu();
            }
            tracing::error!(target, "unknown --target; valid values when built without GPU features: auto, cpu");
            std::process::exit(1);
        }
    }

    /// Human-readable name of the active backend.
    ///
    /// `"cpu"` for the CPU path; for an accelerated device this is whatever
    /// `zer_compute::DeviceBackend::name` reported when the backend was
    /// created (presumably `"cuda"`, `"vulkan"`, or `"avx2"`).
    pub fn name(&self) -> &'static str {
        self.name
    }

    /// `true` when the backend is anything other than the plain CPU path.
    ///
    /// NOTE(review): this also covers an `avx2` device if `zer_compute`
    /// reports it as accelerated, so "GPU" here means "non-CPU-fallback".
    pub fn is_gpu(&self) -> bool {
        !matches!(self.inner, BackendInner::Cpu)
    }
}

// ── Comparator ────────────────────────────────────────────────────────────────

/// Private storage behind [`Comparator`]: the concrete comparator in use.
enum ComparatorInner {
    /// CPU implementation from `zer_compare`.
    Cpu(zer_compare::FieldComparator),
    /// Device-backed implementation from `zer_compute`; only compiled in when
    /// one of the `cuda`, `avx2`, or `vulkan` features is enabled.
    #[cfg(any(feature = "cuda", feature = "avx2", feature = "vulkan"))]
    Gpu(zer_compute::DeviceComparator),
}

/// Pairwise record comparator with automatic GPU/CPU selection.
///
/// Wraps `FieldComparator` (CPU) or `DeviceComparator` (GPU) depending on the
/// [`Backend`].  Implements [`ComparatorTrait`] identically in both cases.
pub struct Comparator {
    // Concrete implementation, fixed when the comparator is constructed.
    inner: ComparatorInner,
}

impl Comparator {
    /// Wrap an already-constructed [`zer_compare::FieldComparator`] directly.
    ///
    /// Use this when you want to override default similarity functions via
    /// [`zer_compare::FieldComparator::with_fns`] before creating the comparator.
    /// Always uses the CPU path; GPU acceleration is not available this way.
    pub fn from_cpu(fc: zer_compare::FieldComparator) -> Self {
        Self { inner: ComparatorInner::Cpu(fc) }
    }

    /// Build a comparator from a schema and backend.
    ///
    /// A CPU backend yields a `FieldComparator` built from `schema`; a device
    /// backend yields a `DeviceComparator` sharing the backend's device handle.
    ///
    /// # Panics
    ///
    /// Panics if the backend holds a device and `zer_compute` fails to
    /// construct a `DeviceComparator` for it.  The panic message includes the
    /// underlying error.
    pub fn new(schema: &Schema, backend: &Backend) -> Self {
        let inner = match &backend.inner {
            BackendInner::Cpu => {
                ComparatorInner::Cpu(zer_compare::FieldComparator::from_schema(schema))
            }
            #[cfg(any(feature = "cuda", feature = "avx2", feature = "vulkan"))]
            BackendInner::Gpu(dev) => ComparatorInner::Gpu(
                // The signature cannot report errors, so at least give the
                // panic a message that says which step failed.
                zer_compute::DeviceComparator::new(Arc::clone(dev), schema)
                    .expect("zer-lib: failed to build device comparator for the selected backend"),
            ),
        };
        Self { inner }
    }

    /// Name of the active backend, for diagnostics.
    ///
    /// Always `"cpu"` for the CPU path; otherwise the name reported by the
    /// wrapped device comparator.
    pub fn backend_name(&self) -> &'static str {
        match &self.inner {
            ComparatorInner::Cpu(_) => "cpu",
            #[cfg(any(feature = "cuda", feature = "avx2", feature = "vulkan"))]
            ComparatorInner::Gpu(c) => c.backend_name(),
        }
    }

    /// Primary hot-path: pool-native batch comparison.
    ///
    /// `pool` is a `RecordPool` built from the candidate records; `pair_indices`
    /// holds `(i, j)` pairs where `i` and `j` are indices into the pool.
    /// Avoids all `Record::clone()` and `HashMap` lookups, the fastest path for
    /// large BRP-style jobs where records are already loaded into a pool.
    ///
    /// The call is forwarded unchanged to the wrapped CPU or device comparator.
    pub fn compare_batch_from_pool(
        &self,
        pool:         &RecordPool,
        pair_indices: &[(usize, usize)],
        schema:       &Schema,
    ) -> ComparisonBatch {
        match &self.inner {
            ComparatorInner::Cpu(c) => c.compare_batch_from_pool(pool, pair_indices, schema),
            #[cfg(any(feature = "cuda", feature = "avx2", feature = "vulkan"))]
            ComparatorInner::Gpu(c) => c.compare_batch_from_pool(pool, pair_indices, schema),
        }
    }

    /// Convenience wrapper: builds a pool from a flat `records` slice and compares
    /// the `pair_indices` pairs.  No `Record::clone()`.
    ///
    /// A fresh `RecordPool` is built on every call; callers comparing against
    /// the same records repeatedly should build the pool once and use
    /// [`Comparator::compare_batch_from_pool`] instead.
    pub fn compare_batch_indexed(
        &self,
        records:      &[Record],
        pair_indices: &[(usize, usize)],
        schema:       &Schema,
    ) -> ComparisonBatch {
        let pool = RecordPool::from_records(records, schema);
        self.compare_batch_from_pool(&pool, pair_indices, schema)
    }
}

impl ComparatorTrait for Comparator {
    fn compare(&self, a: &Record, b: &Record, schema: &Schema) -> ComparisonVector {
        match &self.inner {
            ComparatorInner::Cpu(c) => c.compare(a, b, schema),
            #[cfg(any(feature = "cuda", feature = "avx2", feature = "vulkan"))]
            ComparatorInner::Gpu(c) => c.compare(a, b, schema),
        }
    }

    fn compare_batch_from_pool(
        &self,
        pool:    &RecordPool,
        indices: &[(usize, usize)],
        schema:  &Schema,
    ) -> ComparisonBatch {
        self.compare_batch_from_pool(pool, indices, schema)
    }
}

// ── Scorer ────────────────────────────────────────────────────────────────────

/// Private storage behind [`Scorer`]: the concrete scorer in use.
enum ScorerInner {
    /// CPU implementation from `zer_compare`.
    Cpu(zer_compare::FellegiSunterScorer),
    /// Device-backed implementation from `zer_compute`; only compiled in when
    /// one of the `cuda`, `avx2`, or `vulkan` features is enabled.
    #[cfg(any(feature = "cuda", feature = "avx2", feature = "vulkan"))]
    Gpu(zer_compute::DeviceScorer),
}

/// Fellegi-Sunter scorer with automatic GPU/CPU EM acceleration.
///
/// `score` / `score_batch` always run on CPU, no kernel overhead for small
/// operations.  `estimate_params` uses the GPU EM kernel when the backend is
/// GPU and the batch exceeds the transfer break-even threshold; otherwise it
/// falls back to `zer_compare::run_em` on the CPU.
pub struct Scorer {
    // Concrete implementation, fixed when the scorer is constructed.
    inner: ScorerInner,
}

impl Scorer {
    /// Construct a scorer that matches `backend`.
    ///
    /// A CPU backend yields the `zer_compare` Fellegi-Sunter scorer; a device
    /// backend yields a `DeviceScorer` sharing the backend's device handle.
    pub fn new(backend: &Backend) -> Self {
        let inner = match &backend.inner {
            #[cfg(any(feature = "cuda", feature = "avx2", feature = "vulkan"))]
            BackendInner::Gpu(device) => {
                ScorerInner::Gpu(zer_compute::DeviceScorer::new(Arc::clone(device)))
            }
            BackendInner::Cpu => ScorerInner::Cpu(zer_compare::FellegiSunterScorer),
        };
        Self { inner }
    }

    /// Diagnostic label for the backend this scorer runs on.
    ///
    /// Always `"cpu"` for the CPU path; otherwise the name reported by the
    /// wrapped device scorer.
    pub fn backend_name(&self) -> &'static str {
        match &self.inner {
            #[cfg(any(feature = "cuda", feature = "avx2", feature = "vulkan"))]
            ScorerInner::Gpu(device_scorer) => device_scorer.backend_name(),
            ScorerInner::Cpu(_) => "cpu",
        }
    }
}

impl ScorerTrait for Scorer {
    /// Score a single comparison vector by delegating to the wrapped scorer.
    fn score(&self, vector: &ComparisonVector, params: &ModelParams) -> ScoredPair {
        match &self.inner {
            #[cfg(any(feature = "cuda", feature = "avx2", feature = "vulkan"))]
            ScorerInner::Gpu(device_scorer) => device_scorer.score(vector, params),
            ScorerInner::Cpu(cpu_scorer) => cpu_scorer.score(vector, params),
        }
    }

    /// Score every vector in `batch` by delegating to the wrapped scorer.
    fn score_batch(
        &self,
        batch:  &ComparisonBatch,
        params: &ModelParams,
    ) -> Vec<ScoredPair> {
        match &self.inner {
            #[cfg(any(feature = "cuda", feature = "avx2", feature = "vulkan"))]
            ScorerInner::Gpu(device_scorer) => device_scorer.score_batch(batch, params),
            ScorerInner::Cpu(cpu_scorer) => cpu_scorer.score_batch(batch, params),
        }
    }

    /// Estimate model parameters from `batch`, optionally starting from
    /// `init`, running at most `max_iter` iterations; the work is forwarded
    /// to the wrapped scorer and its result returned unchanged.
    fn estimate_params(
        &self,
        batch:    &ComparisonBatch,
        init:     Option<ModelParams>,
        max_iter: usize,
    ) -> ZerResult<ModelParams> {
        match &self.inner {
            #[cfg(any(feature = "cuda", feature = "avx2", feature = "vulkan"))]
            ScorerInner::Gpu(device_scorer) => device_scorer.estimate_params(batch, init, max_iter),
            ScorerInner::Cpu(cpu_scorer) => cpu_scorer.estimate_params(batch, init, max_iter),
        }
    }
}

// ── Low-level kernel access for power users ───────────────────────────────────

/// Raw GPU kernel dispatch, for users writing custom kernels.
///
/// Requires the `cuda`, `avx2`, or `vulkan` feature.  Most users should use
/// [`Comparator`] and [`Scorer`] instead.
///
/// This module only re-exports items from `zer_compute`; it defines nothing
/// of its own.
///
/// # Writing a custom kernel
///
/// 1. Define a zero-sized marker struct and `impl Kernel for It`.
/// 2. `impl KernelDispatch<It> for zer_compute::backend::cpu::CpuDevice`, CPU fallback.
/// 3. `impl KernelDispatch<It> for zer_compute::backend::cuda::CudaDevice`, CUDA path.
/// 4. Add the `impl KernelDispatch<It> for DeviceBackend` match in
///    `zer_compute::backend::mod`.
/// 5. Access the raw device via `zer_lib::compute::DeviceBackend` (also
///    re-exported here as `kernel::DeviceBackend`).
#[cfg(any(feature = "cuda", feature = "avx2", feature = "vulkan"))]
pub mod kernel {
    pub use zer_compute::{
        backend::DeviceBackend,
        error::GpuError,
        kernel::{Kernel, KernelDispatch},
    };
}

// ── Crate re-exports ──────────────────────────────────────────────────────────

pub use zer_blocking as blocking;
pub use zer_compare  as compare;
pub use zer_core     as core;
pub use zer_schema   as schema;
pub use zer_cluster  as cluster;

#[cfg(feature = "pipeline")]
pub use zer_pipeline as pipeline;

#[cfg(any(feature = "cuda", feature = "avx2", feature = "vulkan"))]
pub use zer_compute as compute;

// ── Prelude ───────────────────────────────────────────────────────────────────

/// Convenience re-exports: `use zer_lib::prelude::*;` brings the crate's
/// concrete [`Backend`](crate::Backend), [`Comparator`](crate::Comparator),
/// and [`Scorer`](crate::Scorer) into scope together with the core data
/// types, blocking keys, CPU implementations, schema registry, and clustering
/// types listed below.
///
/// The `Comparator` and `Scorer` traits from `zer_core` are exported under
/// the names `ComparatorTrait` and `ScorerTrait` so they do not clash with
/// the concrete structs of the same name.
pub mod prelude {
    // Concrete auto-detecting types, primary user-facing API
    pub use crate::{Backend, Comparator, Scorer};

    // Core data types
    pub use zer_core::{
        comparison::{ComparisonBatch, ComparisonLevel, ComparisonVector},
        entity::{Entity, EntityId, EntityMember, ResolutionMethod},
        error::ZerError,
        record::{FieldValue, Record, RecordId},
        record_pool::RecordPool,
        schema::{FieldKind, Schema, SchemaBuilder},
        scoring::{MatchBand, ModelParams, ScoredPair},
        traits::{
            BlockIndex, Blocker, Clusterer, EntityStore, Judge, JudgeVerdict, RecordStore,
            // Renamed to avoid shadowing the concrete Comparator / Scorer structs above
            Comparator as ComparatorTrait,
            Scorer as ScorerTrait,
        },
        VecRecordStore,
    };

    // Blocking
    pub use zer_blocking::{
        BlockerFactory, CompositeBlocker, InvertedIndex, SchemaCategory,
        keys::{
            AddressInitialKey, AliasPhoneticKey, CameraTimeWindowKey, DateFragmentKey,
            DateGranularity, DocumentDigitSuffixKey, DocumentSuffixKey, ExactFieldKey,
            FuzzyYearKey, GeoGridKey, LicensePlateNormKey, PhoneticAlgo, PhoneticNameDobKey,
            PlateOCRFuzzyKey, SuffixKey, TransliteratedPhoneticKey,
        },
    };

    // CPU implementations, available directly for users who want the raw types
    pub use zer_compare::{
        FellegiSunterScorer, FieldComparator, LevelThresholds, SimilarityFn,
        JaroWinklerSimilarity, PhoneticEqualitySimilarity, TokenOverlapSimilarity,
        AddressTokenOverlap, StreetNumberEditDistance,
    };

    // Schema registry and artifact management (Phase 6)
    pub use zer_schema::{ModelArtifact, SchemaFingerprint, SchemaInferrer, SchemaRegistry, StartupMode};

    // Clustering and entity store (Phase 6)
    pub use zer_cluster::{ClusterConfig, ConnectedComponentsClusterer, ZalEntityStore};

    // Pipeline types, available with the `pipeline` feature (no polars required)
    #[cfg(feature = "pipeline")]
    pub use zer_pipeline::{
        BatchReport, ClusterIter, ClusterView, IngestResult, Ingester,
        Pipeline, PipelineBuilder, PipelineConfig, RateConfig,
    };

}