use std::path::PathBuf;
use crate::datasets::sec::catalog::{self, RATE_LIMIT_PER_SEC};
use crate::datasets::sec::client::{FetchMode, SecClient};
use crate::datasets::sec::error::{Result, SecError};
use crate::datasets::sec::layout::Workdir;
/// Inclusive range of calendar years used to enumerate `(year, quarter)` pairs.
#[derive(Debug, Clone, Copy)]
pub struct YearRange {
    /// First year of the range (inclusive).
    pub start: u16,
    /// Last year of the range (inclusive).
    pub end: u16,
}

impl YearRange {
    /// Earliest `(year, quarter)` that [`YearRange::quarters`] will yield.
    ///
    /// NOTE(review): presumably the first quarter for which an upstream
    /// quarterly index exists — confirm against the data source.
    const FIRST_QUARTER: (u16, u8) = (1993, 3);

    /// Builds a range covering `start..=end`.
    ///
    /// # Panics
    ///
    /// In debug builds, panics when `start > end`. In release builds such a
    /// range is accepted and simply yields no quarters.
    pub fn new(start: u16, end: u16) -> Self {
        debug_assert!(start <= end);
        Self { start, end }
    }

    /// Iterates over every `(year, quarter)` in the range in chronological
    /// order, with quarters numbered `1..=4`.
    ///
    /// Every pair earlier than 1993 Q3 is skipped. That covers 1993 Q1–Q2 and
    /// *all* quarters of any year before 1993, so a range that starts early
    /// does not produce quarters preceding the cutoff.
    pub fn quarters(self) -> impl Iterator<Item = (u16, u8)> {
        (self.start..=self.end)
            .flat_map(|year| (1u8..=4u8).map(move |quarter| (year, quarter)))
            // Tuples compare lexicographically: year first, then quarter.
            .filter(|&year_quarter| year_quarter >= Self::FIRST_QUARTER)
    }
}
/// Fetches the quarterly master index file for every quarter in `range`.
///
/// The quarter identified by `current_year` / `current_quarter` is requested
/// with [`FetchMode::Always`]; every other quarter uses
/// [`FetchMode::OnlyIfMissing`].
///
/// Returns `(downloaded, skipped)`. A quarter counts as downloaded when
/// `fetch_to_file` reports `Ok(true)`, and as skipped when it reports
/// `Ok(false)` or fails with an HTTP 404 status.
///
/// # Errors
///
/// Fails if the working directories cannot be prepared, or on the first fetch
/// error other than a 404.
pub async fn fetch_quarterly_master_idx(
    client: &SecClient,
    workdir: &Workdir,
    range: YearRange,
    current_year: u16,
    current_quarter: u8,
) -> Result<(usize, usize)> {
    workdir.ensure_dirs(None)?;

    let (mut fetched, mut passed_over) = (0usize, 0usize);
    for (year, quarter) in range.quarters() {
        let url = catalog::quarterly_master_idx_url(year, quarter);
        let target = workdir.raw_master_idx(year, quarter);

        // Only the in-progress quarter is always re-requested.
        let mode = match (year, quarter) == (current_year, current_quarter) {
            true => FetchMode::Always,
            false => FetchMode::OnlyIfMissing,
        };

        let outcome = client.fetch_to_file(&url, &target, mode).await;
        match outcome {
            Ok(true) => fetched += 1,
            // A missing remote index is tolerated and tallied as skipped.
            Ok(false) | Err(SecError::BadStatus { status: 404, .. }) => passed_over += 1,
            Err(other) => return Err(other),
        }
    }

    Ok((fetched, passed_over))
}
/// Downloads the bulk submissions archive unless a sufficiently fresh copy is
/// already on disk.
///
/// * `staleness_hours` — an existing file whose age (time since its
///   modification timestamp) is below this many hours is kept as-is.
/// * `force_refetch` — when `true`, the freshness check is bypassed.
///
/// Returns `Ok(false)` when the existing file was kept, `Ok(true)` after a
/// fetch was performed.
///
/// # Errors
///
/// Fails if the working directories cannot be prepared, if the existing
/// file's metadata cannot be read, or if the fetch fails.
pub async fn fetch_submissions_bulk(
    client: &SecClient,
    workdir: &Workdir,
    staleness_hours: u64,
    force_refetch: bool,
) -> Result<bool> {
    workdir.ensure_dirs(None)?;
    let path = workdir.raw_submissions_zip();

    if !force_refetch && path.is_file() {
        let metadata = std::fs::metadata(&path)?;
        // If the modification time is unavailable or lies in the future, treat
        // the file as infinitely old so it is refetched.
        let age_seconds = metadata
            .modified()
            .ok()
            .and_then(|t| t.elapsed().ok())
            .map(|d| d.as_secs())
            .unwrap_or(u64::MAX);
        // Saturate rather than overflow: a plain `* 3600` panics in debug
        // builds and wraps in release builds for very large thresholds.
        let stale_seconds = staleness_hours.saturating_mul(3600);
        if age_seconds < stale_seconds {
            return Ok(false);
        }
    }

    let url = catalog::submissions_bulk_url();
    client.fetch_to_file(url, &path, FetchMode::Always).await?;
    Ok(true)
}
/// Fetches the company tickers JSON into the working directory.
///
/// With `force_refetch` set the request uses [`FetchMode::Always`]; otherwise
/// it uses [`FetchMode::OnlyIfMissing`]. The `bool` reported by
/// `fetch_to_file` is returned unchanged.
///
/// # Errors
///
/// Fails if the working directories cannot be prepared or the fetch fails.
pub async fn fetch_company_tickers(
    client: &SecClient,
    workdir: &Workdir,
    force_refetch: bool,
) -> Result<bool> {
    workdir.ensure_dirs(None)?;

    let target = workdir.raw_company_tickers_json();
    let mode = match force_refetch {
        true => FetchMode::Always,
        false => FetchMode::OnlyIfMissing,
    };
    let url = catalog::company_tickers_url();

    client.fetch_to_file(url, &target, mode).await
}
/// Estimates the seconds needed to request one file per quarter in `range`:
/// the number of quarters divided by `RATE_LIMIT_PER_SEC`.
pub fn rate_limit_cost_seconds(range: YearRange) -> f64 {
    let request_count = range.quarters().count() as f64;
    let requests_per_second = RATE_LIMIT_PER_SEC as f64;
    request_count / requests_per_second
}
/// Fetches the company-facts JSON for `cik` into
/// `companyfacts_CIK<10-digit zero-padded cik>.json` under the raw financials
/// directory.
///
/// Returns `Ok(false)` without any request when the file already exists and
/// `force_refetch` is `false`, and also when the server answers with HTTP 404.
/// Otherwise the `bool` reported by `fetch_to_file` is returned.
///
/// # Errors
///
/// Fails if the working directories cannot be prepared, or on any fetch error
/// other than a 404.
pub async fn fetch_company_facts(
    client: &SecClient,
    workdir: &Workdir,
    cik: u64,
    force_refetch: bool,
) -> Result<bool> {
    workdir.ensure_dirs(None)?;

    let file_name = format!("companyfacts_CIK{cik:010}.json");
    let target = workdir.raw_financials_dir().join(file_name);

    let must_fetch = force_refetch || !target.is_file();
    if !must_fetch {
        return Ok(false);
    }

    let url = catalog::companyfacts_url(cik);
    match client.fetch_to_file(&url, &target, FetchMode::Always).await {
        // No facts published for this CIK: report "nothing fetched".
        Err(SecError::BadStatus { status: 404, .. }) => Ok(false),
        outcome => outcome,
    }
}
/// Fetches the per-company submissions JSON for `cik` into
/// `CIK<10-digit zero-padded cik>.json` under the raw submissions directory.
///
/// Returns `Ok(false)` without any request when the file already exists and
/// `force_refetch` is `false`, and also when the server answers with HTTP 404.
/// Otherwise the `bool` reported by `fetch_to_file` is returned.
///
/// # Errors
///
/// Fails if the working directories cannot be prepared, or on any fetch error
/// other than a 404.
pub async fn fetch_company_submission(
    client: &SecClient,
    workdir: &Workdir,
    cik: u64,
    force_refetch: bool,
) -> Result<bool> {
    workdir.ensure_dirs(None)?;

    let file_name = format!("CIK{cik:010}.json");
    let target = workdir.raw_submissions_dir().join(file_name);

    let must_fetch = force_refetch || !target.is_file();
    if !must_fetch {
        return Ok(false);
    }

    let url = catalog::submissions_cik_url(cik);
    match client.fetch_to_file(&url, &target, FetchMode::Always).await {
        // Unknown CIK upstream: report "nothing fetched".
        Err(SecError::BadStatus { status: 404, .. }) => Ok(false),
        outcome => outcome,
    }
}
/// Locates and downloads the information-table XML of a filing, storing it as
/// `13f-infotable.xml` under `<raw filings>/<issuer_cik>/<accession>/`.
///
/// The filing's `index.json` is fetched and its `directory.item` array is
/// scanned for entries whose `name` ends in `.xml`. The first entry whose
/// `type` equals `INFORMATION TABLE` (ASCII case-insensitive), or whose
/// lowercased name contains `infotable` or `info_table`, is chosen. If none
/// matches, the first `.xml` entry not named `primary_doc.xml` is used.
///
/// Returns `Ok(false)` without any request when the destination file already
/// exists; otherwise returns the `bool` reported by `fetch_to_file`.
///
/// # Errors
///
/// Returns [`SecError::Decode`] when `index.json` is not valid JSON, lacks
/// `directory.item`, or lists no usable XML document. Fetch errors are
/// propagated.
pub async fn fetch_13f_info_table(
    client: &SecClient,
    workdir: &Workdir,
    issuer_cik: u64,
    accession_dashed: &str,
) -> Result<bool> {
    /// Whether an index entry looks like the information table.
    fn looks_like_info_table(kind: &str, name: &str) -> bool {
        if kind.eq_ignore_ascii_case("INFORMATION TABLE") {
            return true;
        }
        let lowered = name.to_ascii_lowercase();
        lowered.contains("infotable") || lowered.contains("info_table")
    }

    let accession_no_dashes = catalog::accession_no_dashes(accession_dashed);
    let dest = workdir
        .raw_filings_dir()
        .join(issuer_cik.to_string())
        .join(&accession_no_dashes)
        .join("13f-infotable.xml");
    if dest.is_file() {
        return Ok(false);
    }

    let base_url = catalog::filing_index_url(issuer_cik, &accession_no_dashes);
    let index_url = format!("{base_url}index.json");

    let raw_index = client.fetch_bytes(&index_url).await?;
    let index: serde_json::Value = serde_json::from_slice(&raw_index)
        .map_err(|e| SecError::Decode(format!("filing index.json: {e}")))?;
    let entries = index
        .get("directory")
        .and_then(|dir| dir.get("item"))
        .and_then(|items| items.as_array())
        .ok_or_else(|| SecError::Decode("filing index: missing directory.item".into()))?;

    // (type, name) of every listed `.xml` document, in index order. Missing
    // or non-string fields are treated as empty strings.
    let xml_entries: Vec<(&str, &str)> = entries
        .iter()
        .map(|entry| {
            let kind = entry.get("type").and_then(|t| t.as_str()).unwrap_or("");
            let name = entry.get("name").and_then(|n| n.as_str()).unwrap_or("");
            (kind, name)
        })
        .filter(|entry| entry.1.ends_with(".xml"))
        .collect();

    // Prefer an explicit information table; otherwise fall back to the first
    // XML document that is not the primary document.
    let chosen = xml_entries
        .iter()
        .find(|entry| looks_like_info_table(entry.0, entry.1))
        .or_else(|| {
            xml_entries
                .iter()
                .find(|entry| entry.1 != "primary_doc.xml")
        })
        .map(|entry| entry.1.to_string());

    let Some(fname) = chosen else {
        return Err(SecError::Decode(format!(
            "no info-table XML in {accession_dashed}"
        )));
    };

    let url = format!("{base_url}{fname}");
    client
        .fetch_to_file(&url, &dest, FetchMode::OnlyIfMissing)
        .await
}
/// Downloads a filing's XML document and stores it as `form4.xml` under
/// `<raw filings>/<issuer_cik>/<accession>/`.
///
/// `primary_document` may carry a directory prefix; only the component after
/// the last `/` is requested from the filing's directory.
///
/// Returns `Ok(false)` without any request when that file name is empty
/// (an empty `primary_document` or one ending in `/`). This matches how
/// `fetch_filing_primary_doc` treats an empty document name, and avoids
/// saving the filing's directory URL response as `form4.xml`. Otherwise
/// returns the `bool` reported by `fetch_to_file` in
/// [`FetchMode::OnlyIfMissing`] mode.
///
/// # Errors
///
/// Propagates any error from the fetch.
pub async fn fetch_form4_filing(
    client: &SecClient,
    workdir: &Workdir,
    issuer_cik: u64,
    accession_dashed: &str,
    primary_document: &str,
) -> Result<bool> {
    // Strip any leading directory components from the document name.
    let xml_filename = primary_document
        .rsplit_once('/')
        .map(|(_, name)| name)
        .unwrap_or(primary_document);
    if xml_filename.is_empty() {
        // Without a file name the URL would point at the filing directory.
        return Ok(false);
    }

    let accession_no_dashes = catalog::accession_no_dashes(accession_dashed);
    let url = format!(
        "{}{}",
        catalog::filing_index_url(issuer_cik, &accession_no_dashes),
        xml_filename
    );
    let path = workdir
        .raw_filings_dir()
        .join(issuer_cik.to_string())
        .join(&accession_no_dashes)
        .join("form4.xml");

    client
        .fetch_to_file(&url, &path, FetchMode::OnlyIfMissing)
        .await
}
/// Downloads a filing's primary document to
/// `<raw filings>/<issuer_cik>/<accession>/<primary_document>`.
///
/// Returns `Ok(false)` immediately, without any request, when
/// `primary_document` is empty. Otherwise returns the `bool` reported by
/// `fetch_to_file` in [`FetchMode::OnlyIfMissing`] mode.
///
/// # Errors
///
/// Propagates any error from the fetch.
pub async fn fetch_filing_primary_doc(
    client: &SecClient,
    workdir: &Workdir,
    issuer_cik: u64,
    accession_dashed: &str,
    primary_document: &str,
) -> Result<bool> {
    match primary_document {
        "" => Ok(false),
        document => {
            let accession_no_dashes = catalog::accession_no_dashes(accession_dashed);
            let base_url = catalog::filing_index_url(issuer_cik, &accession_no_dashes);
            let url = format!("{}{}", base_url, document);

            // NOTE(review): `document` is joined verbatim. `Path::join`
            // replaces the base for an absolute argument and keeps `..`
            // components — confirm callers only pass plain relative names.
            let target = workdir
                .raw_filings_dir()
                .join(issuer_cik.to_string())
                .join(&accession_no_dashes)
                .join(document);

            client
                .fetch_to_file(&url, &target, FetchMode::OnlyIfMissing)
                .await
        }
    }
}
/// Downloads every attachment of a filing whose name is accepted by
/// `is_exhibit21_attachment_name`, storing each under
/// `<raw filings>/<issuer_cik>/<accession>/<name>`.
///
/// The filing's `index.json` is fetched and its `directory.item` array
/// supplies the candidate names. Each matching attachment is requested in
/// [`FetchMode::OnlyIfMissing`] mode.
///
/// Returns the number of matching attachments whose fetch call succeeded.
/// Both `Ok(true)` and `Ok(false)` from `fetch_to_file` are counted, so the
/// value is not limited to newly downloaded files. Attachments whose fetch
/// fails are skipped silently (best effort).
///
/// # Errors
///
/// Returns [`SecError::Decode`] when `index.json` is not valid JSON or lacks
/// `directory.item`; an error fetching `index.json` itself is propagated.
pub async fn fetch_exhibit21_attachment(
    client: &SecClient,
    workdir: &Workdir,
    issuer_cik: u64,
    accession_dashed: &str,
) -> Result<usize> {
    let accession_no_dashes = catalog::accession_no_dashes(accession_dashed);
    let base_url = catalog::filing_index_url(issuer_cik, &accession_no_dashes);
    let index_url = format!("{base_url}index.json");

    let raw_index = client.fetch_bytes(&index_url).await?;
    let index: serde_json::Value = serde_json::from_slice(&raw_index)
        .map_err(|e| SecError::Decode(format!("filing index.json: {e}")))?;
    let entries = index
        .get("directory")
        .and_then(|dir| dir.get("item"))
        .and_then(|items| items.as_array())
        .ok_or_else(|| SecError::Decode("filing index: missing directory.item".into()))?;

    let filing_dir = workdir
        .raw_filings_dir()
        .join(issuer_cik.to_string())
        .join(&accession_no_dashes);

    let mut succeeded = 0usize;
    for entry in entries {
        // Entries without a string `name` become "" and never match.
        let name = entry.get("name").and_then(|n| n.as_str()).unwrap_or("");
        if is_exhibit21_attachment_name(name) {
            let url = format!("{base_url}{name}");
            let target = filing_dir.join(name);
            let outcome = client
                .fetch_to_file(&url, &target, FetchMode::OnlyIfMissing)
                .await;
            // Best effort: a failed attachment is ignored, not propagated.
            if outcome.is_ok() {
                succeeded += 1;
            }
        }
    }

    Ok(succeeded)
}
/// Returns `true` when `name` is a bare file name that looks like an
/// Exhibit 21 attachment.
///
/// Matching is ASCII case-insensitive. The name must end in `.htm`, `.html`
/// or `.txt`, and contain one of `ex21`, `exhibit21`, `ex-21`, `exhibit-21`.
///
/// Names containing a path separator (`/` or `\`) are rejected. Accepted
/// names are joined onto a local directory by the caller, so a value such as
/// `../../ex21.htm` must not be able to escape that directory.
fn is_exhibit21_attachment_name(name: &str) -> bool {
    const EXTENSIONS: [&str; 3] = [".htm", ".html", ".txt"];
    const MARKERS: [&str; 4] = ["ex21", "exhibit21", "ex-21", "exhibit-21"];

    // Only plain file names are acceptable; anything path-like is refused.
    if name.contains(|c: char| c == '/' || c == '\\') {
        return false;
    }

    let lowered = name.to_ascii_lowercase();
    EXTENSIONS.iter().any(|ext| lowered.ends_with(ext))
        && MARKERS.iter().any(|marker| lowered.contains(marker))
}
/// Crate-internal shortcut returning `workdir.raw_master_idx(year, q)`: the
/// path of the raw master index file for the given year and quarter.
#[allow(dead_code)]
pub(crate) fn raw_master_idx_path(workdir: &Workdir, year: u16, q: u8) -> PathBuf {
    let index_path = workdir.raw_master_idx(year, q);
    index_path
}
#[cfg(test)]
mod tests {
    use super::*;

    /// 1993 Q1 and Q2 are omitted while 1993 Q3 onward is produced.
    #[test]
    fn year_range_quarters_skips_pre_1993_q3() {
        let quarters: Vec<_> = YearRange::new(1992, 1994).quarters().collect();

        for expected in [(1993u16, 3u8), (1993, 4), (1994, 1)] {
            assert!(quarters.contains(&expected));
        }
        for omitted in [(1993u16, 1u8), (1993, 2)] {
            assert!(!quarters.contains(&omitted));
        }
    }

    /// A single-year range yields its four quarters in order.
    #[test]
    fn year_range_quarters_one_year() {
        let quarters: Vec<_> = YearRange::new(2024, 2024).quarters().collect();
        assert_eq!(quarters, vec![(2024, 1), (2024, 2), (2024, 3), (2024, 4)]);
    }

    /// Five full years (20 quarters) are expected to cost 2.0 seconds.
    #[test]
    fn rate_limit_cost_is_quarters_div_ten() {
        let cost = rate_limit_cost_seconds(YearRange::new(2020, 2024));
        assert!((cost - 2.0).abs() < 1e-9);
    }

    /// Checks accepted and rejected attachment file names.
    #[test]
    fn is_exhibit21_attachment_name_matches_known_patterns() {
        let accepted = [
            "ex21.htm",
            "aapl-20240928-ex21.htm",
            "exhibit-21.htm",
            "Exhibit21.HTML",
            "aapl_ex21.txt",
        ];
        for name in accepted {
            assert!(is_exhibit21_attachment_name(name));
        }

        let rejected = ["ex21.pdf", "ex22.htm", "aapl-10k.htm", ""];
        for name in rejected {
            assert!(!is_exhibit21_attachment_name(name));
        }
    }

    /// An empty primary document name returns `Ok(false)`.
    #[test]
    fn fetch_filing_primary_doc_skips_empty_filename() {
        let runtime = tokio::runtime::Builder::new_current_thread()
            .enable_all()
            .build()
            .unwrap();
        let scratch = tempfile::tempdir().unwrap();
        let workdir = Workdir::new(scratch.path().to_path_buf());
        let client = SecClient::new("test agent test@example.com").unwrap();

        let outcome = runtime.block_on(fetch_filing_primary_doc(
            &client,
            &workdir,
            320193,
            "0001-23-456",
            "",
        ));

        assert!(matches!(outcome, Ok(false)));
    }
}