// indexkit/types.rs
//! Core domain types -- [`Constituent`], [`IndexSnapshot`], [`DailySnapshot`],
//! [`IndexId`], [`DataSource`], [`Resolution`], [`Sector`].
3
4use crate::date::YearMonth;
5use chrono::NaiveDate;
6use serde::{Deserialize, Serialize};
7
8/// Which upstream source produced a given row.
9///
10/// Rows written by different sources for the same `(index, identity, date)`
11/// are coalesced by the [`crate::coalesce`] layer with a priority ladder
12/// (highest first):
13///
14/// | Priority | Variant | Coverage | Fields |
15/// |----------|----------------------------|----------------------|--------------|
16/// | 5 | `IsharesCdn`, `InvescoCdn`, `SpdrCdn` | forward, daily | full |
17/// | 4 | `GithubFja05680` | 1996-present, daily | ticker only |
18/// | 3 | `GithubYfiua { month }` | ~2018-present, monthly | ticker only |
19/// | 3 | `GithubHanshof` | 1996-present, daily | ticker only |
20/// | 2 | `Wayback(date)` | 2019+, sparse | varies |
21/// | 1 | `SecNport` | 2019-11-present, monthly | full (no ticker) |
22#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
23#[serde(rename_all = "snake_case")]
24pub enum DataSource {
25 /// Live sponsor CDN (iShares, Invesco, State Street).
26 ///
27 /// Payload is the ETF issuer's own public holdings file, which they
28 /// refresh daily. These CDN endpoints are covered by each sponsor's
29 /// terms of service; indexkit treats them as best-effort and always
30 /// keeps a Wayback snapshot as a fallback.
31 IsharesCdn,
32 InvescoCdn,
33 SpdrCdn,
34 /// Internet Archive's Wayback Machine. Snapshots of sponsor pages
35 /// captured by `archive.org` on a specific date.
36 ///
37 /// `YYYYMMDD` encodes the snapshot date. Coverage is patchy
38 /// (typically 40-60 % of trading days).
39 Wayback(String),
40 /// SEC EDGAR N-PORT filing -- monthly baseline, guaranteed from
41 /// 2019-11 onwards.
42 SecNport,
43 /// fja05680/sp500 GitHub mirror (MIT license).
44 ///
45 /// Daily S&P 500 component changes from 1996-01-02 onwards. Rows
46 /// provide ticker only (no CUSIP / LEI / weight / shares). Maintained
47 /// by Farrell J. Aultman.
48 ///
49 /// Source repo: <https://github.com/fja05680/sp500>.
50 GithubFja05680,
51 /// yfiua/index-constituents GitHub mirror (Apache-2.0 license).
52 ///
53 /// Monthly snapshots of major index constituents (S&P 500, Nasdaq-100,
54 /// Dow Jones, etc.) from 2018 onwards. Rows provide ticker only.
55 ///
56 /// `month` is the `YYYY-MM` directory the row was sourced from.
57 ///
58 /// Source repo: <https://github.com/yfiua/index-constituents>.
59 GithubYfiua {
60 /// Year-month the yfiua snapshot belongs to.
61 month: YearMonth,
62 },
63 /// hanshof/sp500_constituents GitHub mirror (MIT license).
64 ///
65 /// Daily S&P 500 historical components, 1996-present. Same shape as
66 /// `GithubFja05680` but maintained independently; used as a cross-
67 /// check layer.
68 ///
69 /// Source repo: <https://github.com/hanshof/sp500_constituents>.
70 GithubHanshof,
71}
72
73impl DataSource {
74 /// Short string tag stored in the parquet `source` column.
75 pub fn tag(&self) -> String {
76 match self {
77 DataSource::IsharesCdn => "ishares_cdn".into(),
78 DataSource::InvescoCdn => "invesco_cdn".into(),
79 DataSource::SpdrCdn => "spdr_cdn".into(),
80 DataSource::Wayback(yyyymmdd) => format!("wayback_{yyyymmdd}"),
81 DataSource::SecNport => "sec_nport".into(),
82 DataSource::GithubFja05680 => "github_fja05680".into(),
83 DataSource::GithubYfiua { month } => format!("github_yfiua_{month}"),
84 DataSource::GithubHanshof => "github_hanshof".into(),
85 }
86 }
87
88 /// Parse a `source` tag back into a [`DataSource`].
89 pub fn from_tag(s: &str) -> Option<Self> {
90 match s {
91 "ishares_cdn" => Some(DataSource::IsharesCdn),
92 "invesco_cdn" => Some(DataSource::InvescoCdn),
93 "spdr_cdn" => Some(DataSource::SpdrCdn),
94 "sec_nport" => Some(DataSource::SecNport),
95 "github_fja05680" => Some(DataSource::GithubFja05680),
96 "github_hanshof" => Some(DataSource::GithubHanshof),
97 tag if tag.starts_with("wayback_") => Some(DataSource::Wayback(tag[8..].to_string())),
98 tag if tag.starts_with("github_yfiua_") => {
99 let rest = &tag[13..];
100 rest.parse::<YearMonth>()
101 .ok()
102 .map(|month| DataSource::GithubYfiua { month })
103 }
104 _ => None,
105 }
106 }
107
108 /// Priority weight. Higher wins when multiple sources cover the same
109 /// `(index, identity, date)` key during coalesce.
110 pub fn priority(&self) -> u8 {
111 match self {
112 DataSource::IsharesCdn | DataSource::InvescoCdn | DataSource::SpdrCdn => 5,
113 DataSource::GithubFja05680 => 4,
114 DataSource::GithubYfiua { .. } => 3,
115 DataSource::GithubHanshof => 3,
116 DataSource::Wayback(_) => 2,
117 DataSource::SecNport => 1,
118 }
119 }
120}
121
122/// Confidence tier of the data available for a given `(index, month)`.
123#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
124#[serde(rename_all = "lowercase")]
125pub enum Resolution {
126 /// Every trading day in the month has at least one row (from CDN or
127 /// Wayback).
128 Daily,
129 /// Some trading days are covered, others are not.
130 Sparse,
131 /// Only one row per month (N-PORT baseline).
132 Monthly,
133 /// No data.
134 None,
135}
136
137/// One security held by an index ETF on a specific date.
138///
139/// # Field coverage by source
140///
141/// Different upstream sources populate different fields. Always present:
142/// `name` (may be empty for ticker-only mirrors), `as_of`, `source`.
143///
144/// | Field | CDN / Wayback | N-PORT (1) | GitHub mirrors (fja05680, yfiua, hanshof) |
145/// |--------------|---------------|------------|-------------------------------------------|
146/// | `ticker` | ~99 % present | `None` | always `Some(t)` |
147/// | `cusip` | present | present | empty string (`""`) -- unknown |
148/// | `lei` | optional | present | `None` |
149/// | `shares` | present | present | `0.0` -- unknown |
150/// | `market_value_usd` | present | present | `0.0` -- unknown |
151/// | `weight` | fraction of NAV | fraction | `f64::NAN` -- unknown, use [`weight_opt`][Self::weight_opt] |
152///
153/// (1) SEC N-PORT has no ticker column; every N-PORT `Constituent::ticker`
154/// is `None`. Use `cusip` as the join key when N-PORT rows are in play.
155///
156/// # Primary join keys
157///
158/// - **CUSIP** -- preferred, always present for CDN / Wayback / N-PORT rows.
159/// - **Ticker** -- preferred when joining GitHub mirror rows (cusip is empty).
160/// - **LEI** -- available for most US issuers, joinable against GLEIF data.
161///
162/// # Missing weights
163///
164/// GitHub mirror rows ([`DataSource::GithubFja05680`],
165/// [`DataSource::GithubYfiua`], [`DataSource::GithubHanshof`]) are
166/// ticker-only -- they carry no weight, shares, or market value. The
167/// `weight` field is set to `f64::NAN` for these rows as a sentinel.
168/// Prefer [`weight_opt`][Self::weight_opt] for consumer code that needs
169/// to branch on presence.
170#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
171pub struct Constituent {
172 /// Ticker symbol.
173 pub ticker: Option<String>,
174 /// Security name as reported on the source file (issuer + share class).
175 pub name: String,
176 /// CUSIP (9-char). Primary join key for CDN / Wayback / N-PORT rows.
177 /// Empty string for GitHub mirror rows (ticker-only sources).
178 pub cusip: String,
179 /// Legal Entity Identifier (20-char) -- ISO 17442 issuer ID.
180 pub lei: Option<String>,
181 /// Shares held (floating point: allows fractional shares for some ETFs).
182 /// `0.0` when unknown (GitHub mirror sources).
183 pub shares: f64,
184 /// Fair value in USD as reported on the source file.
185 /// `0.0` when unknown (GitHub mirror sources).
186 pub market_value_usd: f64,
187 /// Weight as fraction of NAV in `[0.0, 1.0]`.
188 ///
189 /// `f64::NAN` when the source does not carry weight data (GitHub mirror
190 /// sources). Use [`weight_opt`][Self::weight_opt] for an `Option<f64>`
191 /// that returns `None` on `NaN`.
192 pub weight: f64,
193 /// SEC CIK of the issuer, if identifiable. Usually `None`.
194 pub issuer_cik: Option<String>,
195 /// GICS / SIC sector. Reserved for v1.1; currently always `None`.
196 pub sector: Option<Sector>,
197 /// Date this row represents (the business day as of which the
198 /// holdings are priced). For monthly-only rows from N-PORT this is
199 /// the last business day of the reporting period.
200 pub as_of: NaiveDate,
201 /// Upstream that produced this row.
202 pub source: DataSource,
203}
204
205impl Constituent {
206 /// Weight as [`Option<f64>`].
207 ///
208 /// Returns `None` when [`weight`][Self::weight] is `NaN` (the sentinel
209 /// used by ticker-only GitHub mirror sources) or a subnormal/infinite
210 /// value. Otherwise returns `Some(weight)`.
211 ///
212 /// # Example
213 ///
214 /// ```
215 /// use indexkit::{Constituent, DataSource};
216 /// use chrono::NaiveDate;
217 ///
218 /// let row = Constituent {
219 /// ticker: Some("AAPL".into()),
220 /// name: "".into(),
221 /// cusip: "".into(),
222 /// lei: None,
223 /// shares: 0.0,
224 /// market_value_usd: 0.0,
225 /// weight: f64::NAN,
226 /// issuer_cik: None,
227 /// sector: None,
228 /// as_of: NaiveDate::from_ymd_opt(2024, 1, 15).unwrap(),
229 /// source: DataSource::GithubFja05680,
230 /// };
231 /// assert_eq!(row.weight_opt(), None);
232 /// ```
233 pub fn weight_opt(&self) -> Option<f64> {
234 if self.weight.is_finite() {
235 Some(self.weight)
236 } else {
237 None
238 }
239 }
240}
241
242/// GICS sector placeholder.
243///
244/// Reserved for a v1.1 feature. N-PORT does not include GICS sector. A
245/// future `indexkit-gics` module will derive sector from SEC SIC codes via
246/// a SIC -> GICS cross-walk. Currently every [`Constituent::sector`] field
247/// is `None`.
248#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
249pub enum Sector {
250 CommunicationServices,
251 ConsumerDiscretionary,
252 ConsumerStaples,
253 Energy,
254 Financials,
255 HealthCare,
256 Industrials,
257 InformationTechnology,
258 Materials,
259 RealEstate,
260 Utilities,
261}
262
263/// A full snapshot of index constituents for a given month.
264///
265/// `constituents` may contain rows from multiple calendar dates within the
266/// month (when daily-resolution data exists) or just one row per holding
267/// (when only monthly N-PORT data is available). Rows are sorted by
268/// `(as_of, -weight)`.
269#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
270pub struct IndexSnapshot {
271 /// The index this snapshot represents.
272 pub index: IndexId,
273 /// Month of the snapshot.
274 pub year_month: YearMonth,
275 /// Holdings. Multi-date if daily data is available.
276 pub constituents: Vec<Constituent>,
277}
278
279impl IndexSnapshot {
280 /// Whether this snapshot carries weight data.
281 ///
282 /// Returns `true` if at least one row has a finite `weight` value
283 /// (i.e. it came from a CDN, Wayback, or N-PORT source). Returns
284 /// `false` if every row is ticker-only (all GitHub mirror sources)
285 /// or the snapshot is empty.
286 ///
287 /// Useful as a quick gate for analytics code: a snapshot with
288 /// `has_weights() == false` is a ticker universe only, not a weight
289 /// vector.
290 pub fn has_weights(&self) -> bool {
291 self.constituents.iter().any(|c| c.weight.is_finite())
292 }
293}
294
295/// Single-day snapshot -- every holding as of a specific date.
296#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
297pub struct DailySnapshot {
298 /// The index this snapshot represents.
299 pub index: IndexId,
300 /// Date of the snapshot.
301 pub date: NaiveDate,
302 /// Holdings sorted by descending weight.
303 pub constituents: Vec<Constituent>,
304 /// Source that produced this snapshot.
305 pub source: DataSource,
306}
307
308/// Supported index identifiers.
309///
310/// Strings: `"sp500"`, `"sp400"`, `"sp600"`, `"ndx"`, `"dji"`.
311#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
312#[serde(rename_all = "lowercase")]
313pub enum IndexId {
314 /// S&P 500 (via IVV -- iShares Core S&P 500 ETF).
315 Sp500,
316 /// S&P MidCap 400 (via IJH).
317 Sp400,
318 /// S&P SmallCap 600 (via IJR).
319 Sp600,
320 /// Nasdaq-100 (via QQQ).
321 Ndx,
322 /// Dow Jones Industrial Average (via DIA).
323 Dji,
324}
325
326impl IndexId {
327 /// All five indices.
328 pub const ALL: [IndexId; 5] = [
329 IndexId::Sp500,
330 IndexId::Sp400,
331 IndexId::Sp600,
332 IndexId::Ndx,
333 IndexId::Dji,
334 ];
335
336 /// Parse from short string id.
337 pub fn from_str_id(s: &str) -> Option<Self> {
338 match s.to_ascii_lowercase().as_str() {
339 "sp500" => Some(IndexId::Sp500),
340 "sp400" => Some(IndexId::Sp400),
341 "sp600" => Some(IndexId::Sp600),
342 "ndx" | "nasdaq100" | "nasdaq-100" => Some(IndexId::Ndx),
343 "dji" | "djia" | "dow" => Some(IndexId::Dji),
344 _ => None,
345 }
346 }
347
348 /// Short string id used for parquet file prefixes.
349 pub fn as_str(self) -> &'static str {
350 match self {
351 IndexId::Sp500 => "sp500",
352 IndexId::Sp400 => "sp400",
353 IndexId::Sp600 => "sp600",
354 IndexId::Ndx => "ndx",
355 IndexId::Dji => "dji",
356 }
357 }
358}
359
360impl std::fmt::Display for IndexId {
361 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
362 f.write_str(self.as_str())
363 }
364}
365
366impl std::str::FromStr for IndexId {
367 type Err = String;
368 fn from_str(s: &str) -> Result<Self, Self::Err> {
369 IndexId::from_str_id(s).ok_or_else(|| format!("unknown index id: {s:?}"))
370 }
371}
372
#[cfg(test)]
mod tests {
    use super::*;

    /// 2024-01-15, the as-of date shared by every fixture row.
    fn jan_15() -> NaiveDate {
        NaiveDate::from_ymd_opt(2024, 1, 15).unwrap()
    }

    /// A yfiua source tagged with March 2024.
    fn yfiua_march_2024() -> DataSource {
        DataSource::GithubYfiua {
            month: YearMonth::new(2024, 3).unwrap(),
        }
    }

    /// A row shaped like the ticker-only GitHub mirrors produce: no
    /// identifiers besides the ticker, and `NaN` as the weight sentinel.
    fn ticker_only_row(ticker: &str, date: NaiveDate, src: DataSource) -> Constituent {
        Constituent {
            ticker: Some(ticker.into()),
            name: String::new(),
            cusip: String::new(),
            lei: None,
            shares: 0.0,
            market_value_usd: 0.0,
            weight: f64::NAN,
            issuer_cik: None,
            sector: None,
            as_of: date,
            source: src,
        }
    }

    /// An iShares CDN row for `ticker` carrying the given weight.
    fn weighted_row(ticker: &str, weight: f64) -> Constituent {
        let mut row = ticker_only_row(ticker, jan_15(), DataSource::IsharesCdn);
        row.weight = weight;
        row
    }

    /// An S&P 500 snapshot for January 2024 holding `rows`.
    fn sp500_snapshot(rows: Vec<Constituent>) -> IndexSnapshot {
        IndexSnapshot {
            index: IndexId::Sp500,
            year_month: YearMonth::new(2024, 1).unwrap(),
            constituents: rows,
        }
    }

    #[test]
    fn indexid_roundtrip() {
        for id in IndexId::ALL.iter().copied() {
            assert_eq!(IndexId::from_str_id(id.as_str()), Some(id));
        }
    }

    #[test]
    fn indexid_aliases() {
        let cases = [
            ("nasdaq100", IndexId::Ndx),
            ("djia", IndexId::Dji),
            ("SP500", IndexId::Sp500),
        ];
        for (input, expected) in cases.iter().copied() {
            assert_eq!(IndexId::from_str_id(input), Some(expected));
        }
    }

    #[test]
    fn indexid_unknown() {
        assert_eq!(IndexId::from_str_id("totally-fake"), None);
    }

    #[test]
    fn data_source_tag_roundtrip_core() {
        let sources = vec![
            DataSource::IsharesCdn,
            DataSource::InvescoCdn,
            DataSource::SpdrCdn,
            DataSource::SecNport,
            DataSource::GithubFja05680,
            DataSource::GithubHanshof,
            DataSource::Wayback("20240315".into()),
            yfiua_march_2024(),
        ];
        for ds in sources {
            let tag = ds.tag();
            let back = DataSource::from_tag(&tag).expect("parseable");
            assert_eq!(back, ds, "tag {tag} did not round-trip");
        }
    }

    #[test]
    fn data_source_priority_ladder() {
        let ladder = vec![
            (DataSource::IsharesCdn, 5),
            (DataSource::InvescoCdn, 5),
            (DataSource::SpdrCdn, 5),
            (DataSource::GithubFja05680, 4),
            (yfiua_march_2024(), 3),
            (DataSource::GithubHanshof, 3),
            (DataSource::Wayback("20240315".into()), 2),
            (DataSource::SecNport, 1),
        ];
        for (source, expected) in ladder {
            assert_eq!(source.priority(), expected);
        }
    }

    #[test]
    fn weight_opt_nan_is_none() {
        let row = ticker_only_row("AAPL", jan_15(), DataSource::GithubFja05680);
        assert_eq!(row.weight_opt(), None);
    }

    #[test]
    fn weight_opt_finite_is_some() {
        assert_eq!(weighted_row("AAPL", 0.072).weight_opt(), Some(0.072));
    }

    #[test]
    fn weight_opt_infinity_is_none() {
        assert_eq!(weighted_row("AAPL", f64::INFINITY).weight_opt(), None);
    }

    #[test]
    fn snapshot_has_weights_true_when_any_finite() {
        let rows = vec![
            weighted_row("AAPL", 0.05),
            ticker_only_row("MSFT", jan_15(), DataSource::GithubFja05680),
        ];
        assert!(sp500_snapshot(rows).has_weights());
    }

    #[test]
    fn snapshot_has_weights_false_when_all_nan() {
        let rows = vec![
            ticker_only_row("AAPL", jan_15(), DataSource::GithubFja05680),
            ticker_only_row("MSFT", jan_15(), DataSource::GithubHanshof),
        ];
        assert!(!sp500_snapshot(rows).has_weights());
    }

    #[test]
    fn snapshot_has_weights_false_when_empty() {
        assert!(!sp500_snapshot(Vec::new()).has_weights());
    }
}