use crate::date::YearMonth;
use crate::error::{Error, Result};
use crate::types::{Constituent, DataSource, IndexId};
use chrono::NaiveDate;
use std::time::Duration;
/// `User-Agent` value installed on the HTTP client built by `http_client`.
pub const GITHUB_USER_AGENT: &str = "indexkit/1.0.1 (+https://github.com/userFRM/indexkit)";
/// Raw-GitHub URL of the `fja05680/sp500` CSV whose file name carries the
/// `(01-17-2026)` date stamp. `fetch_fja05680_sp500` requests this URL first.
pub const FJA05680_CSV_URL: &str = "https://raw.githubusercontent.com/fja05680/sp500/master/\
S%26P%20500%20Historical%20Components%20%26%20Changes(01-17-2026).csv";
/// Same repository and file name as [`FJA05680_CSV_URL`] but without the date
/// stamp. `fetch_fja05680_sp500` requests it only when the first fetch fails.
pub const FJA05680_CSV_URL_LEGACY: &str =
"https://raw.githubusercontent.com/fja05680/sp500/master/\
S%26P%20500%20Historical%20Components%20%26%20Changes.csv";
/// Raw-GitHub URL of the `hanshof/sp500_constituents` CSV fetched by
/// `fetch_hanshof_sp500`.
pub const HANSHOF_CSV_URL: &str =
"https://raw.githubusercontent.com/hanshof/sp500_constituents/master/\
sp_500_historical_components.csv";
/// Builds the raw-GitHub URL of the `yfiua/index-constituents` CSV for `index`
/// in the month `ym`.
///
/// The path has the form `docs/YYYY/MM/constituents-<code>.csv`, with the year
/// zero-padded to four digits and the month to two.
///
/// Returns `None` when [`github_yfiua_index_code`] has no code for `index`.
pub fn yfiua_url(index: IndexId, ym: YearMonth) -> Option<String> {
    github_yfiua_index_code(index).map(|code| {
        format!(
            "https://raw.githubusercontent.com/yfiua/index-constituents/master/\
             docs/{:04}/{:02}/constituents-{code}.csv",
            ym.year(),
            ym.month(),
        )
    })
}
/// Maps an [`IndexId`] to the file-name code used in yfiua URLs.
///
/// Returns `None` for `Sp400` and `Sp600`, which have no code here.
pub fn github_yfiua_index_code(id: IndexId) -> Option<&'static str> {
    // Exhaustive on purpose: a new `IndexId` variant must be classified here.
    let code = match id {
        IndexId::Sp400 | IndexId::Sp600 => return None,
        IndexId::Sp500 => "sp500",
        IndexId::Ndx => "nasdaq100",
        IndexId::Dji => "dowjones",
    };
    Some(code)
}
/// Creates the `reqwest` client used for every download in this module.
///
/// The client sends [`GITHUB_USER_AGENT`] and applies a 120-second timeout.
///
/// # Errors
/// Propagates the builder's error if the client cannot be constructed.
fn http_client() -> Result<reqwest::Client> {
    let request_timeout = Duration::from_secs(120);
    let client = reqwest::Client::builder()
        .timeout(request_timeout)
        .user_agent(GITHUB_USER_AGENT)
        .build()?;
    Ok(client)
}
pub async fn fetch_fja05680_sp500() -> Result<Vec<(NaiveDate, Vec<String>)>> {
let http = http_client()?;
let body = match fetch_text(&http, FJA05680_CSV_URL).await {
Ok(b) => b,
Err(_) => fetch_text(&http, FJA05680_CSV_URL_LEGACY).await?,
};
parse_fja05680_csv(&body)
}
/// Downloads [`HANSHOF_CSV_URL`] and parses it with [`parse_hanshof_csv`].
///
/// # Errors
/// Returns an error if the HTTP client cannot be built, the download fails, or
/// the parser rejects the body.
pub async fn fetch_hanshof_sp500() -> Result<Vec<(NaiveDate, Vec<String>)>> {
    let client = http_client()?;
    fetch_text(&client, HANSHOF_CSV_URL)
        .await
        .and_then(|csv| parse_hanshof_csv(&csv))
}
pub async fn fetch_yfiua(index: IndexId, ym: YearMonth) -> Result<Vec<String>> {
let url = yfiua_url(index, ym).ok_or_else(|| {
Error::Other(format!(
"yfiua does not publish {index}; supported: sp500, ndx, dji"
))
})?;
let http = http_client()?;
let body = fetch_text(&http, &url).await?;
Ok(parse_yfiua_csv(&body))
}
/// Downloads the yfiua constituents CSV for every month from `start` to `end`
/// and returns the non-empty symbol lists keyed by month.
///
/// `start` defaults to July 2018 and `end` to `YearMonth::current_utc()`.
/// A month whose download fails is logged at debug level and skipped. A month
/// whose CSV yields no symbols is skipped without logging.
///
/// # Errors
/// Returns `Error::Other` when `start` is later than `end` or when
/// [`yfiua_url`] has no URL for `index`, and propagates HTTP-client
/// construction errors.
pub async fn fetch_yfiua_full(
    index: IndexId,
    start: Option<YearMonth>,
    end: Option<YearMonth>,
) -> Result<Vec<(YearMonth, Vec<String>)>> {
    let start = match start {
        Some(ym) => ym,
        None => YearMonth::new(2018, 7).unwrap(),
    };
    let end = match end {
        Some(ym) => ym,
        None => YearMonth::current_utc(),
    };
    if start > end {
        return Err(Error::Other(format!(
            "yfiua_full: start {start} > end {end}"
        )));
    }

    let client = http_client()?;
    // Materialise the month list up front, as the original does, so the loop
    // below iterates a plain `Vec` across its `.await` points.
    let months: Vec<YearMonth> = start.iter_to(end).collect();
    let mut snapshots = Vec::new();
    for ym in months {
        let url = match yfiua_url(index, ym) {
            Some(url) => url,
            None => {
                return Err(Error::Other(format!(
                    "yfiua does not publish {index}; supported: sp500, ndx, dji"
                )));
            }
        };
        let body = match fetch_text(&client, &url).await {
            Ok(body) => body,
            Err(e) => {
                // Best-effort: a failed month is skipped, not fatal.
                tracing::debug!(%index, %ym, "yfiua month not available: {e}");
                continue;
            }
        };
        let tickers = parse_yfiua_csv(&body);
        if tickers.is_empty() {
            continue;
        }
        snapshots.push((ym, tickers));
    }
    Ok(snapshots)
}
/// Expands dated snapshots into one entry per calendar day.
///
/// The output covers every day from the date of the first element of
/// `changes` through the date of the last element, inclusive. Each day carries
/// a copy of the ticker list from the most recent snapshot dated on or before
/// it. An empty input yields an empty output.
///
/// The first and last elements are used as the range bounds and the cursor
/// only moves forward, so `changes` is assumed to be sorted by ascending date.
// NOTE(review): this `allow` looks like a leftover — nothing in the function
// appears to need it. Confirm and remove.
#[allow(clippy::needless_collect)]
pub fn forward_fill(changes: &[(NaiveDate, Vec<String>)]) -> Vec<(NaiveDate, Vec<String>)> {
    let (Some(head), Some(tail)) = (changes.first(), changes.last()) else {
        return Vec::new();
    };
    let final_day = tail.0;
    // Snapshots after the first one, consumed as their dates are reached.
    let mut upcoming = changes[1..].iter().peekable();
    let mut active: &[String] = &head.1;
    let mut filled = Vec::new();
    let mut cursor = Some(head.0);
    // `succ_opt` returns `None` at the calendar's upper bound, which ends the loop.
    while let Some(today) = cursor.filter(|day| *day <= final_day) {
        while let Some(snapshot) = upcoming.next_if(|entry| entry.0 <= today) {
            active = &snapshot.1;
        }
        filled.push((today, active.to_vec()));
        cursor = today.succ_opt();
    }
    filled
}
/// Converts bare ticker strings into [`Constituent`] rows stamped with `as_of`
/// and `source`.
///
/// Each ticker is trimmed, and tickers that are empty after trimming are
/// dropped. Only `ticker`, `as_of` and `source` carry real data. The remaining
/// fields are placeholders: `name` and `cusip` are empty strings, `shares` and
/// `market_value_usd` are `0.0`, `weight` is `NaN`, and `lei`, `issuer_cik`
/// and `sector` are `None`.
pub fn tickers_to_constituents(
    tickers: &[String],
    as_of: NaiveDate,
    source: DataSource,
) -> Vec<Constituent> {
    let mut rows = Vec::new();
    for raw in tickers {
        let symbol = raw.trim();
        if symbol.is_empty() {
            continue;
        }
        rows.push(Constituent {
            ticker: Some(symbol.to_string()),
            name: String::new(),
            cusip: String::new(),
            lei: None,
            shares: 0.0,
            market_value_usd: 0.0,
            weight: f64::NAN,
            issuer_cik: None,
            sector: None,
            as_of,
            source: source.clone(),
        });
    }
    rows
}
/// Parses the fja05680 CSV body into `(date, tickers)` rows.
///
/// The first line must start with `date`. Every later line is split at its
/// first comma into a `%Y-%m-%d` date and a ticker list. Each ticker has a
/// trailing `-NNNNNN` stamp removed by `strip_fja05680_suffix`, and empty
/// tickers are dropped. Blank lines, lines without a comma and lines whose
/// date does not parse are skipped silently.
///
/// # Errors
/// Returns `Error::Other` when the first line does not start with `date`.
pub fn parse_fja05680_csv(body: &str) -> Result<Vec<(NaiveDate, Vec<String>)>> {
    let mut lines = body.lines();
    // An entirely empty body has no header to validate and yields no rows.
    if let Some(line) = lines.next() {
        if !line.starts_with("date") {
            return Err(Error::Other(format!(
                "fja05680: unexpected header {line:?}"
            )));
        }
    }
    let rows: Vec<(NaiveDate, Vec<String>)> = lines
        .filter(|row| !row.trim().is_empty())
        .filter_map(|row| {
            let (date_s, tickers_s) = split_date_tickers(row)?;
            let date = NaiveDate::parse_from_str(date_s.trim(), "%Y-%m-%d").ok()?;
            let tickers: Vec<String> = parse_ticker_list(&tickers_s)
                .iter()
                .map(|ticker| strip_fja05680_suffix(ticker))
                .filter(|ticker| !ticker.is_empty())
                .collect();
            Some((date, tickers))
        })
        .collect();
    Ok(rows)
}
/// Parses the hanshof CSV body into `(date, tickers)` rows.
///
/// The first line must start with `date`. Every later line is split at its
/// first comma into a `%Y-%m-%d` date and a ticker list, and empty tickers are
/// dropped. Blank lines, lines without a comma and lines whose date does not
/// parse are skipped silently.
///
/// # Errors
/// Returns `Error::Other` when the first line does not start with `date`.
pub fn parse_hanshof_csv(body: &str) -> Result<Vec<(NaiveDate, Vec<String>)>> {
    let mut lines = body.lines();
    match lines.next() {
        Some(line) if !line.starts_with("date") => {
            return Err(Error::Other(format!("hanshof: unexpected header {line:?}")));
        }
        // Either a valid header or an empty body; both continue to the rows.
        _ => {}
    }

    let mut rows = Vec::new();
    for row in lines {
        if row.trim().is_empty() {
            continue;
        }
        let Some((date_s, tickers_s)) = split_date_tickers(row) else {
            continue;
        };
        let date = match NaiveDate::parse_from_str(date_s.trim(), "%Y-%m-%d") {
            Ok(date) => date,
            Err(_) => continue,
        };
        let mut tickers = parse_ticker_list(&tickers_s);
        tickers.retain(|ticker| !ticker.is_empty());
        rows.push((date, tickers));
    }
    Ok(rows)
}
pub fn parse_yfiua_csv(body: &str) -> Vec<String> {
let mut out = Vec::new();
for (i, line) in body.lines().enumerate() {
if i == 0 {
continue;
}
let trimmed = line.trim();
if trimmed.is_empty() {
continue;
}
let sym = match split_once_csv(trimmed) {
Some((s, _)) => s.trim().trim_matches('"').to_string(),
None => continue,
};
if !sym.is_empty() {
out.push(sym);
}
}
out
}
/// Splits a CSV row at its first comma into `(date, tickers)`.
///
/// The date part is returned untouched. The remainder is trimmed, and one pair
/// of enclosing double quotes is removed only when both the opening and the
/// closing quote are present. Returns `None` when the row has no comma.
fn split_date_tickers(line: &str) -> Option<(String, String)> {
    let (date_part, remainder) = line.split_once(',')?;
    let remainder = remainder.trim();
    let unquoted = match remainder.strip_prefix('"') {
        // An opening quote with no closing quote leaves the text unchanged.
        Some(inner) => inner.strip_suffix('"').unwrap_or(remainder),
        None => remainder,
    };
    Some((date_part.to_owned(), unquoted.to_owned()))
}
/// Splits `s` on commas and returns each piece trimmed.
///
/// Empty pieces are kept, so an empty input yields a single empty string.
fn parse_ticker_list(s: &str) -> Vec<String> {
    let mut pieces = Vec::new();
    for piece in s.split(',') {
        pieces.push(piece.trim().to_owned());
    }
    pieces
}
/// Removes a trailing `-NNNNNN` stamp from a ticker.
///
/// The ticker is trimmed first. The text after the last `-` is dropped,
/// together with the `-`, only when it is exactly six ASCII digits. Any other
/// ticker is returned trimmed but otherwise unchanged.
fn strip_fja05680_suffix(raw: &str) -> String {
    let symbol = raw.trim();
    let kept = match symbol.rsplit_once('-') {
        Some((base, stamp)) if stamp.len() == 6 && stamp.bytes().all(|b| b.is_ascii_digit()) => {
            base
        }
        _ => symbol,
    };
    kept.to_owned()
}
/// Splits `line` at its first comma that is not inside double quotes.
///
/// Returns the text before and after that comma, with any quotes left in
/// place. Returns `None` when there is no unquoted comma.
fn split_once_csv(line: &str) -> Option<(String, String)> {
    let mut quoted = false;
    // `"` and `,` are single-byte ASCII and never occur inside a multi-byte
    // UTF-8 sequence, so a byte scan finds the same index as a char scan.
    let comma_at = line.bytes().position(|byte| match byte {
        b'"' => {
            quoted = !quoted;
            false
        }
        b',' => !quoted,
        _ => false,
    })?;
    let head = &line[..comma_at];
    let tail = &line[comma_at + 1..];
    Some((head.to_owned(), tail.to_owned()))
}
async fn fetch_text(http: &reqwest::Client, url: &str) -> Result<String> {
let resp = http.get(url).send().await?;
if !resp.status().is_success() {
return Err(Error::Other(format!(
"github_mirror fetch {url}: HTTP {}",
resp.status().as_u16()
)));
}
Ok(resp.text().await?)
}
// Unit tests for the pure helpers in this file: URL construction, CSV
// parsing, suffix stripping, forward-filling and constituent conversion.
// None of them performs a network request.
#[cfg(test)]
mod tests {
use super::*;
// A trailing `-` plus exactly six digits is removed; tickers without such a
// tail (including dotted share classes) pass through unchanged.
#[test]
fn strip_fja05680_suffix_removes_six_digits() {
assert_eq!(strip_fja05680_suffix("AAL-199702"), "AAL");
assert_eq!(strip_fja05680_suffix("AZA.A-200106"), "AZA.A");
assert_eq!(strip_fja05680_suffix("BF.B"), "BF.B");
assert_eq!(strip_fja05680_suffix("AAPL"), "AAPL");
assert_eq!(strip_fja05680_suffix("BRK.B"), "BRK.B");
}
// Tails of five digits, seven digits, or letters are left in place.
#[test]
fn strip_fja05680_suffix_keeps_short_tails() {
assert_eq!(strip_fja05680_suffix("FOO-12345"), "FOO-12345");
assert_eq!(strip_fja05680_suffix("FOO-1234567"), "FOO-1234567");
assert_eq!(strip_fja05680_suffix("FOO-BAR"), "FOO-BAR");
}
// Two data rows under a `date,tickers` header; the `AAL-199702` entry must
// come back as plain `AAL`.
#[test]
fn parse_fja05680_minimal() {
let csv = r#"date,tickers
1996-01-02,"AAL-199702,AAPL,MSFT,IBM"
2024-03-15,"AAPL,MSFT,NVDA,BRK.B"
"#;
let rows = parse_fja05680_csv(csv).unwrap();
assert_eq!(rows.len(), 2);
assert_eq!(rows[0].0, NaiveDate::from_ymd_opt(1996, 1, 2).unwrap());
assert_eq!(rows[0].1, vec!["AAL", "AAPL", "MSFT", "IBM"]);
assert_eq!(rows[1].1, vec!["AAPL", "MSFT", "NVDA", "BRK.B"]);
}
// Same row shape as the fja05680 test, without any suffix stripping.
#[test]
fn parse_hanshof_minimal() {
let csv = r#"date,tickers
1996-01-02,"AAL,AAPL,MSFT,IBM"
2019-01-11,"AAPL,MSFT,IBM"
"#;
let rows = parse_hanshof_csv(csv).unwrap();
assert_eq!(rows.len(), 2);
assert_eq!(rows[0].0, NaiveDate::from_ymd_opt(1996, 1, 2).unwrap());
assert_eq!(rows[0].1, vec!["AAL", "AAPL", "MSFT", "IBM"]);
}
// Only the first column is returned; the quoted name containing a comma on
// the META row must not affect the extracted symbol.
#[test]
fn parse_yfiua_minimal() {
let csv = r#"Symbol,Name
AAPL,Apple Inc.
MSFT,Microsoft Corp
META,"Meta Platforms, Inc. Class A"
BRK.B,Berkshire Hathaway Class B
"#;
let syms = parse_yfiua_csv(csv);
assert_eq!(syms, vec!["AAPL", "MSFT", "META", "BRK.B"]);
}
// Pins the full URL for one index and the file-name suffix for the other
// two supported indices.
#[test]
fn yfiua_url_shape() {
let url = yfiua_url(IndexId::Sp500, YearMonth::new(2024, 3).unwrap()).unwrap();
assert_eq!(
url,
"https://raw.githubusercontent.com/yfiua/index-constituents/master/\
docs/2024/03/constituents-sp500.csv"
);
let url = yfiua_url(IndexId::Ndx, YearMonth::new(2024, 3).unwrap()).unwrap();
assert!(url.ends_with("constituents-nasdaq100.csv"));
let url = yfiua_url(IndexId::Dji, YearMonth::new(2024, 3).unwrap()).unwrap();
assert!(url.ends_with("constituents-dowjones.csv"));
}
// Indices without a yfiua code produce no URL.
#[test]
fn yfiua_not_available_for_sp400_sp600() {
assert!(yfiua_url(IndexId::Sp400, YearMonth::new(2024, 3).unwrap()).is_none());
assert!(yfiua_url(IndexId::Sp600, YearMonth::new(2024, 3).unwrap()).is_none());
}
// The empty ticker is dropped, and every produced row has a NaN weight, an
// empty CUSIP, and the supplied date and source.
#[test]
fn tickers_to_constituents_nan_weight() {
let d = NaiveDate::from_ymd_opt(2024, 3, 15).unwrap();
let ts = ["AAPL".to_string(), "MSFT".to_string(), "".to_string()];
let rows = tickers_to_constituents(&ts, d, DataSource::GithubFja05680);
assert_eq!(rows.len(), 2, "empty tickers must be filtered");
for r in &rows {
assert!(r.weight.is_nan());
assert!(r.cusip.is_empty());
assert_eq!(r.as_of, d);
assert_eq!(r.source, DataSource::GithubFja05680);
assert!(r.ticker.is_some());
}
}
// Snapshots on 1/2 and 1/4 must expand to three days, with 1/3 carrying the
// 1/2 composition and 1/4 reflecting the change.
#[test]
fn forward_fill_carries_composition() {
let d = |y, m, day| NaiveDate::from_ymd_opt(y, m, day).unwrap();
let changes = vec![
(d(2024, 1, 2), vec!["A".to_string(), "B".to_string()]),
(d(2024, 1, 4), vec!["A".to_string(), "C".to_string()]),
];
let ff = forward_fill(&changes);
assert_eq!(ff.len(), 3); assert_eq!(ff[0].0, d(2024, 1, 2));
assert_eq!(ff[0].1, vec!["A", "B"]);
assert_eq!(ff[1].0, d(2024, 1, 3));
assert_eq!(ff[1].1, vec!["A", "B"], "must carry forward to 1/3");
assert_eq!(ff[2].0, d(2024, 1, 4));
assert_eq!(ff[2].1, vec!["A", "C"], "must reflect change on 1/4");
}
// Empty input yields empty output.
#[test]
fn forward_fill_empty() {
let empty: Vec<(NaiveDate, Vec<String>)> = Vec::new();
assert!(forward_fill(&empty).is_empty());
}
// Pins the code returned for each of the five `IndexId` variants.
#[test]
fn github_yfiua_index_code_mapping() {
assert_eq!(github_yfiua_index_code(IndexId::Sp500), Some("sp500"));
assert_eq!(github_yfiua_index_code(IndexId::Ndx), Some("nasdaq100"));
assert_eq!(github_yfiua_index_code(IndexId::Dji), Some("dowjones"));
assert_eq!(github_yfiua_index_code(IndexId::Sp400), None);
assert_eq!(github_yfiua_index_code(IndexId::Sp600), None);
}
// Unquoted row: split at the only comma.
#[test]
fn split_once_csv_basic() {
let r = split_once_csv(r#"AAPL,Apple Inc."#).unwrap();
assert_eq!(r.0, "AAPL");
assert_eq!(r.1, "Apple Inc.");
}
// The comma inside the quoted name is not a split point, and the quotes are
// kept in the second half.
#[test]
fn split_once_csv_quoted_comma() {
let r = split_once_csv(r#"META,"Meta Platforms, Inc. Class A""#).unwrap();
assert_eq!(r.0, "META");
assert_eq!(r.1, r#""Meta Platforms, Inc. Class A""#);
}
}