Skip to main content

indexkit/
types.rs

//! Core domain types -- [`Constituent`], [`IndexSnapshot`], [`DailySnapshot`],
//! [`IndexId`], [`DataSource`], [`Resolution`], [`Sector`].
3
4use crate::date::YearMonth;
5use chrono::NaiveDate;
6use serde::{Deserialize, Serialize};
7
8/// Which upstream source produced a given row.
9///
10/// Rows written by different sources for the same `(index, identity, date)`
11/// are coalesced by the [`crate::coalesce`] layer with a priority ladder
12/// (highest first):
13///
14/// | Priority | Variant                    | Coverage             | Fields       |
15/// |----------|----------------------------|----------------------|--------------|
16/// | 5        | `IsharesCdn`, `InvescoCdn`, `SpdrCdn` | forward, daily    | full         |
17/// | 4        | `GithubFja05680`           | 1996-present, daily  | ticker only  |
18/// | 3        | `GithubYfiua { month }`    | ~2018-present, monthly | ticker only |
19/// | 3        | `GithubHanshof`            | 1996-present, daily  | ticker only  |
20/// | 2        | `Wayback(date)`            | 2019+, sparse        | varies       |
21/// | 1        | `SecNport`                 | 2019-11-present, monthly | full (no ticker) |
22#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
23#[serde(rename_all = "snake_case")]
24pub enum DataSource {
25    /// Live sponsor CDN (iShares, Invesco, State Street).
26    ///
27    /// Payload is the ETF issuer's own public holdings file, which they
28    /// refresh daily. These CDN endpoints are covered by each sponsor's
29    /// terms of service; indexkit treats them as best-effort and always
30    /// keeps a Wayback snapshot as a fallback.
31    IsharesCdn,
32    InvescoCdn,
33    SpdrCdn,
34    /// Internet Archive's Wayback Machine. Snapshots of sponsor pages
35    /// captured by `archive.org` on a specific date.
36    ///
37    /// `YYYYMMDD` encodes the snapshot date. Coverage is patchy
38    /// (typically 40-60 % of trading days).
39    Wayback(String),
40    /// SEC EDGAR N-PORT filing -- monthly baseline, guaranteed from
41    /// 2019-11 onwards.
42    SecNport,
43    /// fja05680/sp500 GitHub mirror (MIT license).
44    ///
45    /// Daily S&P 500 component changes from 1996-01-02 onwards. Rows
46    /// provide ticker only (no CUSIP / LEI / weight / shares). Maintained
47    /// by Farrell J. Aultman.
48    ///
49    /// Source repo: <https://github.com/fja05680/sp500>.
50    GithubFja05680,
51    /// yfiua/index-constituents GitHub mirror (Apache-2.0 license).
52    ///
53    /// Monthly snapshots of major index constituents (S&P 500, Nasdaq-100,
54    /// Dow Jones, etc.) from 2018 onwards. Rows provide ticker only.
55    ///
56    /// `month` is the `YYYY-MM` directory the row was sourced from.
57    ///
58    /// Source repo: <https://github.com/yfiua/index-constituents>.
59    GithubYfiua {
60        /// Year-month the yfiua snapshot belongs to.
61        month: YearMonth,
62    },
63    /// hanshof/sp500_constituents GitHub mirror (MIT license).
64    ///
65    /// Daily S&P 500 historical components, 1996-present. Same shape as
66    /// `GithubFja05680` but maintained independently; used as a cross-
67    /// check layer.
68    ///
69    /// Source repo: <https://github.com/hanshof/sp500_constituents>.
70    GithubHanshof,
71}
72
73impl DataSource {
74    /// Short string tag stored in the parquet `source` column.
75    pub fn tag(&self) -> String {
76        match self {
77            DataSource::IsharesCdn => "ishares_cdn".into(),
78            DataSource::InvescoCdn => "invesco_cdn".into(),
79            DataSource::SpdrCdn => "spdr_cdn".into(),
80            DataSource::Wayback(yyyymmdd) => format!("wayback_{yyyymmdd}"),
81            DataSource::SecNport => "sec_nport".into(),
82            DataSource::GithubFja05680 => "github_fja05680".into(),
83            DataSource::GithubYfiua { month } => format!("github_yfiua_{month}"),
84            DataSource::GithubHanshof => "github_hanshof".into(),
85        }
86    }
87
88    /// Parse a `source` tag back into a [`DataSource`].
89    pub fn from_tag(s: &str) -> Option<Self> {
90        match s {
91            "ishares_cdn" => Some(DataSource::IsharesCdn),
92            "invesco_cdn" => Some(DataSource::InvescoCdn),
93            "spdr_cdn" => Some(DataSource::SpdrCdn),
94            "sec_nport" => Some(DataSource::SecNport),
95            "github_fja05680" => Some(DataSource::GithubFja05680),
96            "github_hanshof" => Some(DataSource::GithubHanshof),
97            tag if tag.starts_with("wayback_") => Some(DataSource::Wayback(tag[8..].to_string())),
98            tag if tag.starts_with("github_yfiua_") => {
99                let rest = &tag[13..];
100                rest.parse::<YearMonth>()
101                    .ok()
102                    .map(|month| DataSource::GithubYfiua { month })
103            }
104            _ => None,
105        }
106    }
107
108    /// Priority weight. Higher wins when multiple sources cover the same
109    /// `(index, identity, date)` key during coalesce.
110    pub fn priority(&self) -> u8 {
111        match self {
112            DataSource::IsharesCdn | DataSource::InvescoCdn | DataSource::SpdrCdn => 5,
113            DataSource::GithubFja05680 => 4,
114            DataSource::GithubYfiua { .. } => 3,
115            DataSource::GithubHanshof => 3,
116            DataSource::Wayback(_) => 2,
117            DataSource::SecNport => 1,
118        }
119    }
120}
121
122/// Confidence tier of the data available for a given `(index, month)`.
123#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
124#[serde(rename_all = "lowercase")]
125pub enum Resolution {
126    /// Every trading day in the month has at least one row (from CDN or
127    /// Wayback).
128    Daily,
129    /// Some trading days are covered, others are not.
130    Sparse,
131    /// Only one row per month (N-PORT baseline).
132    Monthly,
133    /// No data.
134    None,
135}
136
137/// One security held by an index ETF on a specific date.
138///
139/// # Field coverage by source
140///
141/// Different upstream sources populate different fields. Always present:
142/// `name` (may be empty for ticker-only mirrors), `as_of`, `source`.
143///
144/// | Field        | CDN / Wayback | N-PORT (1) | GitHub mirrors (fja05680, yfiua, hanshof) |
145/// |--------------|---------------|------------|-------------------------------------------|
146/// | `ticker`     | ~99 % present | `None`     | always `Some(t)`                          |
147/// | `cusip`      | present       | present    | empty string (`""`) -- unknown            |
148/// | `lei`        | optional      | present    | `None`                                    |
149/// | `shares`     | present       | present    | `0.0` -- unknown                          |
150/// | `market_value_usd` | present | present    | `0.0` -- unknown                          |
151/// | `weight`     | fraction of NAV | fraction | `f64::NAN` -- unknown, use [`weight_opt`][Self::weight_opt] |
152///
153/// (1) SEC N-PORT has no ticker column; every N-PORT `Constituent::ticker`
154/// is `None`. Use `cusip` as the join key when N-PORT rows are in play.
155///
156/// # Primary join keys
157///
158/// - **CUSIP** -- preferred, always present for CDN / Wayback / N-PORT rows.
159/// - **Ticker** -- preferred when joining GitHub mirror rows (cusip is empty).
160/// - **LEI** -- available for most US issuers, joinable against GLEIF data.
161///
162/// # Missing weights
163///
164/// GitHub mirror rows ([`DataSource::GithubFja05680`],
165/// [`DataSource::GithubYfiua`], [`DataSource::GithubHanshof`]) are
166/// ticker-only -- they carry no weight, shares, or market value. The
167/// `weight` field is set to `f64::NAN` for these rows as a sentinel.
168/// Prefer [`weight_opt`][Self::weight_opt] for consumer code that needs
169/// to branch on presence.
170#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
171pub struct Constituent {
172    /// Ticker symbol.
173    pub ticker: Option<String>,
174    /// Security name as reported on the source file (issuer + share class).
175    pub name: String,
176    /// CUSIP (9-char). Primary join key for CDN / Wayback / N-PORT rows.
177    /// Empty string for GitHub mirror rows (ticker-only sources).
178    pub cusip: String,
179    /// Legal Entity Identifier (20-char) -- ISO 17442 issuer ID.
180    pub lei: Option<String>,
181    /// Shares held (floating point: allows fractional shares for some ETFs).
182    /// `0.0` when unknown (GitHub mirror sources).
183    pub shares: f64,
184    /// Fair value in USD as reported on the source file.
185    /// `0.0` when unknown (GitHub mirror sources).
186    pub market_value_usd: f64,
187    /// Weight as fraction of NAV in `[0.0, 1.0]`.
188    ///
189    /// `f64::NAN` when the source does not carry weight data (GitHub mirror
190    /// sources). Use [`weight_opt`][Self::weight_opt] for an `Option<f64>`
191    /// that returns `None` on `NaN`.
192    pub weight: f64,
193    /// SEC CIK of the issuer, if identifiable. Usually `None`.
194    pub issuer_cik: Option<String>,
195    /// GICS / SIC sector. Reserved for v1.1; currently always `None`.
196    pub sector: Option<Sector>,
197    /// Date this row represents (the business day as of which the
198    /// holdings are priced). For monthly-only rows from N-PORT this is
199    /// the last business day of the reporting period.
200    pub as_of: NaiveDate,
201    /// Upstream that produced this row.
202    pub source: DataSource,
203}
204
205impl Constituent {
206    /// Weight as [`Option<f64>`].
207    ///
208    /// Returns `None` when [`weight`][Self::weight] is `NaN` (the sentinel
209    /// used by ticker-only GitHub mirror sources) or a subnormal/infinite
210    /// value. Otherwise returns `Some(weight)`.
211    ///
212    /// # Example
213    ///
214    /// ```
215    /// use indexkit::{Constituent, DataSource};
216    /// use chrono::NaiveDate;
217    ///
218    /// let row = Constituent {
219    ///     ticker: Some("AAPL".into()),
220    ///     name: "".into(),
221    ///     cusip: "".into(),
222    ///     lei: None,
223    ///     shares: 0.0,
224    ///     market_value_usd: 0.0,
225    ///     weight: f64::NAN,
226    ///     issuer_cik: None,
227    ///     sector: None,
228    ///     as_of: NaiveDate::from_ymd_opt(2024, 1, 15).unwrap(),
229    ///     source: DataSource::GithubFja05680,
230    /// };
231    /// assert_eq!(row.weight_opt(), None);
232    /// ```
233    pub fn weight_opt(&self) -> Option<f64> {
234        if self.weight.is_finite() {
235            Some(self.weight)
236        } else {
237            None
238        }
239    }
240}
241
242/// GICS sector placeholder.
243///
244/// Reserved for a v1.1 feature. N-PORT does not include GICS sector. A
245/// future `indexkit-gics` module will derive sector from SEC SIC codes via
246/// a SIC -> GICS cross-walk. Currently every [`Constituent::sector`] field
247/// is `None`.
248#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
249pub enum Sector {
250    CommunicationServices,
251    ConsumerDiscretionary,
252    ConsumerStaples,
253    Energy,
254    Financials,
255    HealthCare,
256    Industrials,
257    InformationTechnology,
258    Materials,
259    RealEstate,
260    Utilities,
261}
262
263/// A full snapshot of index constituents for a given month.
264///
265/// `constituents` may contain rows from multiple calendar dates within the
266/// month (when daily-resolution data exists) or just one row per holding
267/// (when only monthly N-PORT data is available). Rows are sorted by
268/// `(as_of, -weight)`.
269#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
270pub struct IndexSnapshot {
271    /// The index this snapshot represents.
272    pub index: IndexId,
273    /// Month of the snapshot.
274    pub year_month: YearMonth,
275    /// Holdings. Multi-date if daily data is available.
276    pub constituents: Vec<Constituent>,
277}
278
279impl IndexSnapshot {
280    /// Whether this snapshot carries weight data.
281    ///
282    /// Returns `true` if at least one row has a finite `weight` value
283    /// (i.e. it came from a CDN, Wayback, or N-PORT source). Returns
284    /// `false` if every row is ticker-only (all GitHub mirror sources)
285    /// or the snapshot is empty.
286    ///
287    /// Useful as a quick gate for analytics code: a snapshot with
288    /// `has_weights() == false` is a ticker universe only, not a weight
289    /// vector.
290    pub fn has_weights(&self) -> bool {
291        self.constituents.iter().any(|c| c.weight.is_finite())
292    }
293}
294
295/// Single-day snapshot -- every holding as of a specific date.
296#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
297pub struct DailySnapshot {
298    /// The index this snapshot represents.
299    pub index: IndexId,
300    /// Date of the snapshot.
301    pub date: NaiveDate,
302    /// Holdings sorted by descending weight.
303    pub constituents: Vec<Constituent>,
304    /// Source that produced this snapshot.
305    pub source: DataSource,
306}
307
308/// Supported index identifiers.
309///
310/// Strings: `"sp500"`, `"sp400"`, `"sp600"`, `"ndx"`, `"dji"`.
311#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
312#[serde(rename_all = "lowercase")]
313pub enum IndexId {
314    /// S&P 500 (via IVV -- iShares Core S&P 500 ETF).
315    Sp500,
316    /// S&P MidCap 400 (via IJH).
317    Sp400,
318    /// S&P SmallCap 600 (via IJR).
319    Sp600,
320    /// Nasdaq-100 (via QQQ).
321    Ndx,
322    /// Dow Jones Industrial Average (via DIA).
323    Dji,
324}
325
326impl IndexId {
327    /// All five indices.
328    pub const ALL: [IndexId; 5] = [
329        IndexId::Sp500,
330        IndexId::Sp400,
331        IndexId::Sp600,
332        IndexId::Ndx,
333        IndexId::Dji,
334    ];
335
336    /// Parse from short string id.
337    pub fn from_str_id(s: &str) -> Option<Self> {
338        match s.to_ascii_lowercase().as_str() {
339            "sp500" => Some(IndexId::Sp500),
340            "sp400" => Some(IndexId::Sp400),
341            "sp600" => Some(IndexId::Sp600),
342            "ndx" | "nasdaq100" | "nasdaq-100" => Some(IndexId::Ndx),
343            "dji" | "djia" | "dow" => Some(IndexId::Dji),
344            _ => None,
345        }
346    }
347
348    /// Short string id used for parquet file prefixes.
349    pub fn as_str(self) -> &'static str {
350        match self {
351            IndexId::Sp500 => "sp500",
352            IndexId::Sp400 => "sp400",
353            IndexId::Sp600 => "sp600",
354            IndexId::Ndx => "ndx",
355            IndexId::Dji => "dji",
356        }
357    }
358}
359
360impl std::fmt::Display for IndexId {
361    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
362        f.write_str(self.as_str())
363    }
364}
365
366impl std::str::FromStr for IndexId {
367    type Err = String;
368    fn from_str(s: &str) -> Result<Self, Self::Err> {
369        IndexId::from_str_id(s).ok_or_else(|| format!("unknown index id: {s:?}"))
370    }
371}
372
#[cfg(test)]
mod tests {
    //! Unit tests for id/tag round-trips, the priority ladder, and the
    //! weight-presence helpers.

    use super::*;

    #[test]
    fn indexid_roundtrip() {
        for &id in &IndexId::ALL {
            let s = id.as_str();
            assert_eq!(IndexId::from_str_id(s), Some(id));
        }
    }

    #[test]
    fn indexid_aliases() {
        assert_eq!(IndexId::from_str_id("nasdaq100"), Some(IndexId::Ndx));
        assert_eq!(IndexId::from_str_id("djia"), Some(IndexId::Dji));
        assert_eq!(IndexId::from_str_id("SP500"), Some(IndexId::Sp500));
    }

    #[test]
    fn indexid_unknown() {
        assert_eq!(IndexId::from_str_id("totally-fake"), None);
    }

    /// `Display` and `FromStr` must agree with `as_str` / `from_str_id`.
    #[test]
    fn indexid_display_and_fromstr() {
        for &id in &IndexId::ALL {
            assert_eq!(id.to_string(), id.as_str());
            assert_eq!(id.as_str().parse::<IndexId>(), Ok(id));
        }
        assert!("totally-fake".parse::<IndexId>().is_err());
    }

    #[test]
    fn data_source_tag_roundtrip_core() {
        for ds in [
            DataSource::IsharesCdn,
            DataSource::InvescoCdn,
            DataSource::SpdrCdn,
            DataSource::SecNport,
            DataSource::GithubFja05680,
            DataSource::GithubHanshof,
            DataSource::Wayback("20240315".into()),
            DataSource::GithubYfiua {
                month: YearMonth::new(2024, 3).unwrap(),
            },
        ] {
            let tag = ds.tag();
            let back = DataSource::from_tag(&tag).expect("parseable");
            assert_eq!(back, ds, "tag {tag} did not round-trip");
        }
    }

    /// Tags that match neither a fixed literal nor a known prefix are rejected.
    #[test]
    fn data_source_from_tag_rejects_unknown() {
        assert_eq!(DataSource::from_tag("totally-fake"), None);
        assert_eq!(DataSource::from_tag(""), None);
        // A prefix without its trailing underscore is not a match.
        assert_eq!(DataSource::from_tag("wayback"), None);
        assert_eq!(DataSource::from_tag("github_yfiua"), None);
    }

    #[test]
    fn data_source_priority_ladder() {
        assert_eq!(DataSource::IsharesCdn.priority(), 5);
        assert_eq!(DataSource::InvescoCdn.priority(), 5);
        assert_eq!(DataSource::SpdrCdn.priority(), 5);
        assert_eq!(DataSource::GithubFja05680.priority(), 4);
        assert_eq!(
            DataSource::GithubYfiua {
                month: YearMonth::new(2024, 3).unwrap()
            }
            .priority(),
            3
        );
        assert_eq!(DataSource::GithubHanshof.priority(), 3);
        assert_eq!(DataSource::Wayback("20240315".into()).priority(), 2);
        assert_eq!(DataSource::SecNport.priority(), 1);
    }

    /// Build a row shaped like a ticker-only GitHub mirror row: empty
    /// identifiers, zero quantities, and the `NaN` weight sentinel.
    fn ticker_only_row(ticker: &str, date: NaiveDate, src: DataSource) -> Constituent {
        Constituent {
            ticker: Some(ticker.into()),
            name: String::new(),
            cusip: String::new(),
            lei: None,
            shares: 0.0,
            market_value_usd: 0.0,
            weight: f64::NAN,
            issuer_cik: None,
            sector: None,
            as_of: date,
            source: src,
        }
    }

    #[test]
    fn weight_opt_nan_is_none() {
        let d = NaiveDate::from_ymd_opt(2024, 1, 15).unwrap();
        let row = ticker_only_row("AAPL", d, DataSource::GithubFja05680);
        assert_eq!(row.weight_opt(), None);
    }

    #[test]
    fn weight_opt_finite_is_some() {
        let d = NaiveDate::from_ymd_opt(2024, 1, 15).unwrap();
        let mut row = ticker_only_row("AAPL", d, DataSource::IsharesCdn);
        row.weight = 0.072;
        assert_eq!(row.weight_opt(), Some(0.072));
    }

    #[test]
    fn weight_opt_infinity_is_none() {
        let d = NaiveDate::from_ymd_opt(2024, 1, 15).unwrap();
        let mut row = ticker_only_row("AAPL", d, DataSource::IsharesCdn);
        row.weight = f64::INFINITY;
        assert_eq!(row.weight_opt(), None);
    }

    #[test]
    fn snapshot_has_weights_true_when_any_finite() {
        let d = NaiveDate::from_ymd_opt(2024, 1, 15).unwrap();
        let mut with_weight = ticker_only_row("AAPL", d, DataSource::IsharesCdn);
        with_weight.weight = 0.05;
        let nan_row = ticker_only_row("MSFT", d, DataSource::GithubFja05680);
        let s = IndexSnapshot {
            index: IndexId::Sp500,
            year_month: YearMonth::new(2024, 1).unwrap(),
            constituents: vec![with_weight, nan_row],
        };
        assert!(s.has_weights());
    }

    #[test]
    fn snapshot_has_weights_false_when_all_nan() {
        let d = NaiveDate::from_ymd_opt(2024, 1, 15).unwrap();
        let row1 = ticker_only_row("AAPL", d, DataSource::GithubFja05680);
        let row2 = ticker_only_row("MSFT", d, DataSource::GithubHanshof);
        let s = IndexSnapshot {
            index: IndexId::Sp500,
            year_month: YearMonth::new(2024, 1).unwrap(),
            constituents: vec![row1, row2],
        };
        assert!(!s.has_weights());
    }

    #[test]
    fn snapshot_has_weights_false_when_empty() {
        let s = IndexSnapshot {
            index: IndexId::Sp500,
            year_month: YearMonth::new(2024, 1).unwrap(),
            constituents: Vec::new(),
        };
        assert!(!s.has_weights());
    }
}