Skip to main content

difi/
types.rs

1//! Core data types for difi.
2//!
3//! Defines the `ObservationTable` and `LinkageMemberTable` traits as the
4//! compile-time contract for data access. Downstream consumers (e.g. thor-rust)
5//! implement these traits for their own types. difi also provides built-in
6//! implementations (`Observations`, `LinkageMembers`) for standalone use.
7//!
8//! Internally, algorithms work on `ObservationSlices` / `LinkageMemberSlices`
9//! (concrete borrowed slice bundles) so generics stay at the API boundary.
10//!
11//! # Memory efficiency
12//!
13//! - `object_id` uses a sentinel value (`NO_OBJECT = u64::MAX`) instead of
14//!   `Option<u64>`, saving 8 bytes per observation due to alignment.
15//! - Sorted ID indices enable O(log N) binary search lookups, eliminating
16//!   the need for O(N × 50B) HashMaps at survey scale.
17
18// ---------------------------------------------------------------------------
19// Constants
20// ---------------------------------------------------------------------------
21
/// Sentinel object ID meaning "no associated object".
///
/// A plain `u64` sentinel is used in place of `Option<u64>` so that each row
/// avoids the 8 bytes of alignment padding the `Option` wrapper would add.
pub const NO_OBJECT: u64 = u64::MAX;
25
26// ---------------------------------------------------------------------------
27// Traits — the public contract
28// ---------------------------------------------------------------------------
29
/// Read-only, column-oriented view of a set of observations.
///
/// Each accessor returns one column. Implementors must keep all columns the
/// same length so that index `i` refers to the same observation in every
/// column. String identifiers should already be interned to `u64` by the
/// caller, and observations without a known object carry `NO_OBJECT` in the
/// object-ID column.
pub trait ObservationTable: Sync {
    /// Number of observations (rows).
    fn len(&self) -> usize;
    /// Observation IDs.
    fn ids(&self) -> &[u64];
    /// The `times_mjd` column: one time value per observation.
    fn times_mjd(&self) -> &[f64];
    /// The `ra` column (presumably right ascension; units are not fixed by
    /// this trait).
    fn ra(&self) -> &[f64];
    /// The `dec` column (presumably declination; units are not fixed by this
    /// trait).
    fn dec(&self) -> &[f64];
    /// Night value of each observation.
    fn nights(&self) -> &[i64];
    /// Object ID of each observation; `NO_OBJECT` (`u64::MAX`) marks an
    /// observation with no association.
    fn object_ids(&self) -> &[u64];
    /// Observatory code of each observation.
    fn observatory_codes(&self) -> &[u32];

    /// Returns `true` when the table has no rows, as reported by [`len`](Self::len).
    fn is_empty(&self) -> bool {
        matches!(self.len(), 0)
    }
}
50
/// Read-only view of linkage membership rows.
///
/// Each row pairs a linkage ID with an observation ID; both are expected to
/// be interned to `u64`.
pub trait LinkageMemberTable: Sync {
    /// Number of membership rows.
    fn len(&self) -> usize;
    /// Linkage ID of each row.
    fn linkage_ids(&self) -> &[u64];
    /// Observation ID of each row.
    fn obs_ids(&self) -> &[u64];

    /// Returns `true` when there are no rows, as reported by [`len`](Self::len).
    fn is_empty(&self) -> bool {
        matches!(self.len(), 0)
    }
}
63
64// ---------------------------------------------------------------------------
65// Internal slice bundles — concrete types for algorithm internals
66// ---------------------------------------------------------------------------
67
68/// Borrowed slice view into observation data.
69///
70/// Created once at API entry points from an `&impl ObservationTable`,
71/// then passed to all internal functions. This keeps generics out of
72/// the algorithm code.
73pub struct ObservationSlices<'a> {
74    pub ids: &'a [u64],
75    pub times_mjd: &'a [f64],
76    pub ra: &'a [f64],
77    pub dec: &'a [f64],
78    pub nights: &'a [i64],
79    pub object_ids: &'a [u64],
80    pub observatory_codes: &'a [u32],
81}
82
83impl<'a> ObservationSlices<'a> {
84    /// Extract slices from any `ObservationTable` implementor.
85    pub fn from_table(table: &'a impl ObservationTable) -> Self {
86        Self {
87            ids: table.ids(),
88            times_mjd: table.times_mjd(),
89            ra: table.ra(),
90            dec: table.dec(),
91            nights: table.nights(),
92            object_ids: table.object_ids(),
93            observatory_codes: table.observatory_codes(),
94        }
95    }
96
97    pub fn len(&self) -> usize {
98        self.ids.len()
99    }
100
101    pub fn is_empty(&self) -> bool {
102        self.ids.is_empty()
103    }
104}
105
106/// Borrowed slice view into linkage member data.
107pub struct LinkageMemberSlices<'a> {
108    pub linkage_ids: &'a [u64],
109    pub obs_ids: &'a [u64],
110}
111
112impl<'a> LinkageMemberSlices<'a> {
113    /// Extract slices from any `LinkageMemberTable` implementor.
114    pub fn from_table(table: &'a impl LinkageMemberTable) -> Self {
115        Self {
116            linkage_ids: table.linkage_ids(),
117            obs_ids: table.obs_ids(),
118        }
119    }
120
121    pub fn len(&self) -> usize {
122        self.linkage_ids.len()
123    }
124
125    pub fn is_empty(&self) -> bool {
126        self.linkage_ids.is_empty()
127    }
128}
129
130// ---------------------------------------------------------------------------
131// Utility
132// ---------------------------------------------------------------------------
133
134/// Build a sorted index over nights for O(log n) partition filtering.
135/// Uses Rayon parallel sort for large arrays.
136pub fn compute_night_sorted_indices(nights: &[i64]) -> Vec<usize> {
137    use rayon::prelude::*;
138    let mut indices: Vec<usize> = (0..nights.len()).collect();
139    indices.par_sort_unstable_by_key(|&i| nights[i]);
140    indices
141}
142
143/// Build a sorted index over IDs for O(log n) lookups by observation ID.
144/// Uses Rayon parallel sort for large arrays.
145pub fn compute_id_sorted_indices(ids: &[u64]) -> Vec<usize> {
146    use rayon::prelude::*;
147    let mut indices: Vec<usize> = (0..ids.len()).collect();
148    indices.par_sort_unstable_by_key(|&i| ids[i]);
149    indices
150}
151
/// Finds the original row index of the observation whose ID equals `target`.
///
/// `id_sorted_indices` must list row indices into `ids` in ascending ID
/// order, because the lookup is a binary search over that ordering. Returns
/// `None` when no row carries `target`. If several rows share the ID, any one
/// of them may be returned.
///
/// # Panics
///
/// Panics if an index visited during the search is out of bounds for `ids`.
pub fn lookup_by_id(ids: &[u64], id_sorted_indices: &[usize], target: u64) -> Option<usize> {
    match id_sorted_indices.binary_search_by(|&row| ids[row].cmp(&target)) {
        Ok(slot) => Some(id_sorted_indices[slot]),
        Err(_) => None,
    }
}
160
/// Returns the sub-slice of `sorted_indices` whose nights fall within
/// `[start_night, end_night]` (both ends inclusive).
///
/// `sorted_indices` must list row indices into `nights` in ascending night
/// order; both bounds are located by binary search. When nothing matches,
/// including when `start_night > end_night`, the returned slice is empty.
///
/// # Panics
///
/// Panics if an index visited during the search is out of bounds for `nights`.
pub fn indices_in_partition<'a>(
    nights: &[i64],
    sorted_indices: &'a [usize],
    start_night: i64,
    end_night: i64,
) -> &'a [usize] {
    let lo = sorted_indices.partition_point(|&i| nights[i] < start_night);
    // Look for the upper bound only in the part at or after `lo`. Every entry
    // there has night >= start_night, so an inverted range matches nothing
    // and yields an empty slice instead of an out-of-order `lo..hi` range.
    let tail = &sorted_indices[lo..];
    let count = tail.partition_point(|&i| nights[i] <= end_night);
    &tail[..count]
}
173
174// ---------------------------------------------------------------------------
175// Built-in implementations — for standalone / Python use
176// ---------------------------------------------------------------------------
177
/// Owned, struct-of-arrays storage for observations.
///
/// Each field is one column; row `i` of every column describes the same
/// observation.
#[derive(Debug, Clone)]
pub struct Observations {
    /// Observation IDs.
    pub id: Vec<u64>,
    /// The `time_mjd` column: one time value per observation.
    pub time_mjd: Vec<f64>,
    /// The `ra` column.
    pub ra: Vec<f64>,
    /// The `dec` column.
    pub dec: Vec<f64>,
    /// Observatory code per observation.
    pub observatory_code: Vec<u32>,
    /// Object ID per observation. `NO_OBJECT` means no association.
    pub object_id: Vec<u64>,
    /// Night value per observation.
    pub night: Vec<i64>,
}

impl Observations {
    /// Assembles an `Observations` table from its columns.
    ///
    /// The vectors are stored as given; their lengths are not checked
    /// against each other.
    pub fn new(
        id: Vec<u64>,
        time_mjd: Vec<f64>,
        ra: Vec<f64>,
        dec: Vec<f64>,
        observatory_code: Vec<u32>,
        object_id: Vec<u64>,
        night: Vec<i64>,
    ) -> Self {
        Observations {
            id,
            time_mjd,
            ra,
            dec,
            observatory_code,
            object_id,
            night,
        }
    }
}
212
213impl ObservationTable for Observations {
214    fn len(&self) -> usize {
215        self.id.len()
216    }
217    fn ids(&self) -> &[u64] {
218        &self.id
219    }
220    fn times_mjd(&self) -> &[f64] {
221        &self.time_mjd
222    }
223    fn ra(&self) -> &[f64] {
224        &self.ra
225    }
226    fn dec(&self) -> &[f64] {
227        &self.dec
228    }
229    fn nights(&self) -> &[i64] {
230        &self.night
231    }
232    fn object_ids(&self) -> &[u64] {
233        &self.object_id
234    }
235    fn observatory_codes(&self) -> &[u32] {
236        &self.observatory_code
237    }
238}
239
/// Owned, struct-of-arrays storage for linkage membership rows.
///
/// Row `i` pairs `linkage_id[i]` with `obs_id[i]`.
#[derive(Debug, Clone)]
pub struct LinkageMembers {
    /// Linkage ID of each membership row.
    pub linkage_id: Vec<u64>,
    /// Observation ID of each membership row.
    pub obs_id: Vec<u64>,
}
246
247impl LinkageMemberTable for LinkageMembers {
248    fn len(&self) -> usize {
249        self.linkage_id.len()
250    }
251    fn linkage_ids(&self) -> &[u64] {
252        &self.linkage_id
253    }
254    fn obs_ids(&self) -> &[u64] {
255        &self.obs_id
256    }
257}
258
259// ---------------------------------------------------------------------------
260// Output types — owned, not behind traits
261// ---------------------------------------------------------------------------
262
/// Classification record for a single linkage.
///
/// This is the row form of [`AllLinkages`]: every field here has a column of
/// the same name there.
#[derive(Debug, Clone)]
pub struct LinkageSummary {
    pub linkage_id: u64,
    pub partition_id: u64,
    pub linked_object_id: u64,
    pub num_obs: i64,
    pub num_obs_outside_partition: i64,
    pub num_members: i64,
    pub pure: bool,
    pub pure_complete: bool,
    pub contaminated: bool,
    pub contamination: f64,
    pub mixed: bool,
    pub found_pure: bool,
    pub found_contaminated: bool,
}

/// Column-oriented collection of linkage classifications.
///
/// Row `i` of every column belongs to the same linkage. Rows are appended
/// with [`AllLinkages::push`].
#[derive(Debug, Clone, Default)]
pub struct AllLinkages {
    pub linkage_id: Vec<u64>,
    pub partition_id: Vec<u64>,
    /// `NO_OBJECT` means no linked object (mixed linkage).
    pub linked_object_id: Vec<u64>,
    pub num_obs: Vec<i64>,
    pub num_obs_outside_partition: Vec<i64>,
    pub num_members: Vec<i64>,
    pub pure: Vec<bool>,
    pub pure_complete: Vec<bool>,
    pub contaminated: Vec<bool>,
    pub contamination: Vec<f64>,
    pub mixed: Vec<bool>,
    pub found_pure: Vec<bool>,
    pub found_contaminated: Vec<bool>,
}

impl AllLinkages {
    /// Number of rows, taken from the `linkage_id` column.
    pub fn len(&self) -> usize {
        self.linkage_id.len()
    }

    /// Returns `true` when no rows have been stored.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Appends one row, pushing each field of `s` onto its column.
    pub fn push(&mut self, s: LinkageSummary) {
        // Destructuring makes the compiler flag any field left unhandled.
        let LinkageSummary {
            linkage_id,
            partition_id,
            linked_object_id,
            num_obs,
            num_obs_outside_partition,
            num_members,
            pure,
            pure_complete,
            contaminated,
            contamination,
            mixed,
            found_pure,
            found_contaminated,
        } = s;

        self.linkage_id.push(linkage_id);
        self.partition_id.push(partition_id);
        self.linked_object_id.push(linked_object_id);
        self.num_obs.push(num_obs);
        self.num_obs_outside_partition.push(num_obs_outside_partition);
        self.num_members.push(num_members);
        self.pure.push(pure);
        self.pure_complete.push(pure_complete);
        self.contaminated.push(contaminated);
        self.contamination.push(contamination);
        self.mixed.push(mixed);
        self.found_pure.push(found_pure);
        self.found_contaminated.push(found_contaminated);
    }
}
326
/// Why a linkage was left out of a partition's `AllLinkages` table.
///
/// Linkages that would otherwise produce phantom rows (e.g. classified
/// against a partition where none of their observations fall) are recorded
/// with one of these reasons instead, so the rest of difi's output remains
/// aggregation-safe.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum IgnoredLinkageReason {
    /// The linkage has observations, but none of them fall within the
    /// partition's `[start_night, end_night]` range. In a multi-partition run
    /// this is a soft signal (the linkage belongs to a different partition);
    /// across all partitions it is a hard user-error signal (wrong linkage
    /// file).
    NoObservationsInPartition,
}

impl IgnoredLinkageReason {
    /// Returns the fixed snake_case string label for this reason.
    pub fn as_str(self) -> &'static str {
        match self {
            Self::NoObservationsInPartition => "no_observations_in_partition",
        }
    }
}

/// One (linkage, partition) pair that was excluded from the partition's
/// `AllLinkages` table, together with the reason for the exclusion.
///
/// This is the row form of [`IgnoredLinkages`].
#[derive(Debug, Clone)]
pub struct IgnoredLinkage {
    pub linkage_id: u64,
    pub partition_id: u64,
    pub reason: IgnoredLinkageReason,
    pub num_obs: i64,
    pub num_members: i64,
}

/// Column-oriented collection of ignored linkages.
///
/// Row `i` of every column belongs to the same (linkage, partition) pair.
#[derive(Debug, Clone, Default)]
pub struct IgnoredLinkages {
    pub linkage_id: Vec<u64>,
    pub partition_id: Vec<u64>,
    pub reason: Vec<IgnoredLinkageReason>,
    pub num_obs: Vec<i64>,
    pub num_members: Vec<i64>,
}

impl IgnoredLinkages {
    /// Number of rows, taken from the `linkage_id` column.
    pub fn len(&self) -> usize {
        self.linkage_id.len()
    }

    /// Returns `true` when no rows have been stored.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Appends one row, pushing each field of `i` onto its column.
    pub fn push(&mut self, i: IgnoredLinkage) {
        let IgnoredLinkage {
            linkage_id,
            partition_id,
            reason,
            num_obs,
            num_members,
        } = i;

        self.linkage_id.push(linkage_id);
        self.partition_id.push(partition_id);
        self.reason.push(reason);
        self.num_obs.push(num_obs);
        self.num_members.push(num_members);
    }

    /// Appends all rows of `other` after the rows already stored, column by
    /// column, consuming `other`.
    pub fn extend(&mut self, other: IgnoredLinkages) {
        let IgnoredLinkages {
            linkage_id,
            partition_id,
            reason,
            num_obs,
            num_members,
        } = other;

        self.linkage_id.extend(linkage_id);
        self.partition_id.extend(partition_id);
        self.reason.extend(reason);
        self.num_obs.extend(num_obs);
        self.num_members.extend(num_members);
    }
}
395
/// Summary of linkage results for a single object.
///
/// This is the row form of [`AllObjects`]: every field here has a column of
/// the same name there.
#[derive(Debug, Clone)]
pub struct ObjectSummary {
    pub object_id: u64,
    pub partition_id: u64,
    pub mjd_min: f64,
    pub mjd_max: f64,
    pub arc_length: f64,
    pub num_obs: i64,
    pub num_observatories: i64,
    pub findable: Option<bool>,
    pub found_pure: i64,
    pub found_contaminated: i64,
    pub pure: i64,
    pub pure_complete: i64,
    pub contaminated: i64,
    pub contaminant: i64,
    pub mixed: i64,
    pub obs_in_pure: i64,
    pub obs_in_pure_complete: i64,
    pub obs_in_contaminated: i64,
    pub obs_as_contaminant: i64,
    pub obs_in_mixed: i64,
}

/// Column-oriented collection of object summaries.
///
/// Row `i` of every column belongs to the same object. Rows are appended
/// with [`AllObjects::push`].
#[derive(Debug, Clone, Default)]
pub struct AllObjects {
    pub object_id: Vec<u64>,
    pub partition_id: Vec<u64>,
    pub mjd_min: Vec<f64>,
    pub mjd_max: Vec<f64>,
    pub arc_length: Vec<f64>,
    pub num_obs: Vec<i64>,
    pub num_observatories: Vec<i64>,
    pub findable: Vec<Option<bool>>,
    pub found_pure: Vec<i64>,
    pub found_contaminated: Vec<i64>,
    pub pure: Vec<i64>,
    pub pure_complete: Vec<i64>,
    pub contaminated: Vec<i64>,
    pub contaminant: Vec<i64>,
    pub mixed: Vec<i64>,
    pub obs_in_pure: Vec<i64>,
    pub obs_in_pure_complete: Vec<i64>,
    pub obs_in_contaminated: Vec<i64>,
    pub obs_as_contaminant: Vec<i64>,
    pub obs_in_mixed: Vec<i64>,
}

impl AllObjects {
    /// Number of rows, taken from the `object_id` column.
    pub fn len(&self) -> usize {
        self.object_id.len()
    }

    /// Returns `true` when no rows have been stored.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Appends one row, pushing each field of `s` onto its column.
    pub fn push(&mut self, s: ObjectSummary) {
        // Destructuring makes the compiler flag any field left unhandled.
        let ObjectSummary {
            object_id,
            partition_id,
            mjd_min,
            mjd_max,
            arc_length,
            num_obs,
            num_observatories,
            findable,
            found_pure,
            found_contaminated,
            pure,
            pure_complete,
            contaminated,
            contaminant,
            mixed,
            obs_in_pure,
            obs_in_pure_complete,
            obs_in_contaminated,
            obs_as_contaminant,
            obs_in_mixed,
        } = s;

        self.object_id.push(object_id);
        self.partition_id.push(partition_id);
        self.mjd_min.push(mjd_min);
        self.mjd_max.push(mjd_max);
        self.arc_length.push(arc_length);
        self.num_obs.push(num_obs);
        self.num_observatories.push(num_observatories);
        self.findable.push(findable);
        self.found_pure.push(found_pure);
        self.found_contaminated.push(found_contaminated);
        self.pure.push(pure);
        self.pure_complete.push(pure_complete);
        self.contaminated.push(contaminated);
        self.contaminant.push(contaminant);
        self.mixed.push(mixed);
        self.obs_in_pure.push(obs_in_pure);
        self.obs_in_pure_complete.push(obs_in_pure_complete);
        self.obs_in_contaminated.push(obs_in_contaminated);
        self.obs_as_contaminant.push(obs_as_contaminant);
        self.obs_in_mixed.push(obs_in_mixed);
    }
}
478
/// Observations that satisfy the findability criteria for a given partition.
///
/// This is the row form of [`FindableObservations`].
#[derive(Debug, Clone)]
pub struct FindableObservation {
    pub partition_id: u64,
    pub object_id: u64,
    /// Optional discovery night.
    pub discovery_night: Option<i64>,
    /// Optional list of observation IDs.
    pub obs_ids: Option<Vec<u64>>,
}

/// Column-oriented collection of findable-observation records.
///
/// Row `i` of every column belongs to the same record.
#[derive(Debug, Clone, Default)]
pub struct FindableObservations {
    pub partition_id: Vec<u64>,
    pub object_id: Vec<u64>,
    pub discovery_night: Vec<Option<i64>>,
    pub obs_ids: Vec<Option<Vec<u64>>>,
}

impl FindableObservations {
    /// Number of rows, taken from the `partition_id` column.
    pub fn len(&self) -> usize {
        self.partition_id.len()
    }

    /// Returns `true` when no rows have been stored.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Appends one row, pushing each field of `f` onto its column.
    pub fn push(&mut self, f: FindableObservation) {
        let FindableObservation {
            partition_id,
            object_id,
            discovery_night,
            obs_ids,
        } = f;

        self.partition_id.push(partition_id);
        self.object_id.push(object_id);
        self.discovery_night.push(discovery_night);
        self.obs_ids.push(obs_ids);
    }
}
513
514/// Maps string IDs to interned integer IDs and back.
515#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize)]
516pub struct StringInterner {
517    to_id: std::collections::HashMap<String, u64>,
518    to_string: Vec<String>,
519}
520
521impl StringInterner {
522    pub fn new() -> Self {
523        Self::default()
524    }
525
526    /// Intern a string, returning its integer ID.
527    /// If already interned, returns the existing ID.
528    pub fn intern(&mut self, s: &str) -> u64 {
529        if let Some(&id) = self.to_id.get(s) {
530            return id;
531        }
532        let id = self.to_string.len() as u64;
533        self.to_string.push(s.to_owned());
534        self.to_id.insert(s.to_owned(), id);
535        id
536    }
537
538    /// Look up the string for an interned ID.
539    pub fn resolve(&self, id: u64) -> Option<&str> {
540        if id == NO_OBJECT {
541            return None;
542        }
543        self.to_string.get(id as usize).map(|s| s.as_str())
544    }
545
546    pub fn len(&self) -> usize {
547        self.to_string.len()
548    }
549
550    pub fn is_empty(&self) -> bool {
551        self.to_string.is_empty()
552    }
553}