use chrono::SecondsFormat;
/// Source-tracking record attached to extracted data: which filing document a value came
/// from and, optionally, where inside that document.
///
/// The fields map one-to-one, in declaration order, onto the columns named in
/// `Provenance::HEADER` and the cells produced by `Provenance::as_cells`.
#[derive(Debug, Clone)]
pub struct Provenance {
/// Form identifier exactly as supplied by the caller (the tests use `"4"` and `"DEF 14A"`).
pub form: String,
/// Accession identifier, stored as given; dashes are only removed when building `url`.
pub accession: String,
/// Name of the document within the filing (e.g. `form4.xml` in the tests).
pub document: String,
/// Site-relative archives path; `for_filing` fills it via `build_archives_url`.
pub url: String,
/// Optional lot number; rendered as an empty cell when `None`.
// NOTE(review): unclear from this file whether the value is 0- or 1-based — confirm with callers.
pub lot: Option<usize>,
/// Optional page number; rendered as an empty cell when `None`.
pub page: Option<usize>,
/// Optional paragraph number; rendered as an empty cell when `None`.
pub paragraph: Option<usize>,
/// Extraction timestamp as an opaque string; it is not parsed or validated here.
// NOTE(review): presumably produced by `now_iso` (RFC 3339, UTC) — confirm against callers.
pub extracted_at: String,
}
impl Provenance {
    /// Column names for the provenance cells, listed in the same order in which
    /// [`Provenance::as_cells`] emits its values.
    pub const HEADER: &'static [&'static str] = &[
        "source_form",
        "source_accession",
        "source_document",
        "source_url",
        "source_lot",
        "source_page",
        "source_paragraph",
        "source_extracted_at",
    ];

    /// Creates a record for one document of a filing.
    ///
    /// `form`, `accession`, `document` and `extracted_at` are copied verbatim. `filer_cik`
    /// is not stored; it is only used, together with `accession` and `document`, to derive
    /// `url` through `build_archives_url`. The lot, page and paragraph start out unset.
    pub fn for_filing(
        form: &str,
        accession: &str,
        filer_cik: &str,
        document: &str,
        extracted_at: &str,
    ) -> Self {
        let url = build_archives_url(filer_cik, accession, document);
        Self {
            form: form.to_owned(),
            accession: accession.to_owned(),
            document: document.to_owned(),
            url,
            lot: None,
            page: None,
            paragraph: None,
            extracted_at: extracted_at.to_owned(),
        }
    }

    /// Returns the record with its lot set to `lot`.
    pub fn with_lot(self, lot: usize) -> Self {
        Self {
            lot: Some(lot),
            ..self
        }
    }

    /// Returns the record with its page set to `page`.
    pub fn with_page(self, page: usize) -> Self {
        Self {
            page: Some(page),
            ..self
        }
    }

    /// Returns the record with its paragraph set to `paragraph`.
    pub fn with_paragraph(self, paragraph: usize) -> Self {
        Self {
            paragraph: Some(paragraph),
            ..self
        }
    }

    /// Renders the record as eight owned strings, one per [`Provenance::HEADER`] column.
    ///
    /// Unset optional positions become empty strings.
    pub fn as_cells(&self) -> [String; 8] {
        /// Formats an optional position, using an empty string for `None`.
        fn position_cell(value: Option<usize>) -> String {
            match value {
                Some(n) => n.to_string(),
                None => String::new(),
            }
        }

        [
            self.form.clone(),
            self.accession.clone(),
            self.document.clone(),
            self.url.clone(),
            position_cell(self.lot),
            position_cell(self.page),
            position_cell(self.paragraph),
            self.extracted_at.clone(),
        ]
    }
}
/// Builds the site-relative archives path for one document of a filing.
///
/// The result has the shape `/Archives/edgar/data/{cik}/{accession}/{document}`:
///
/// * `cik` has its leading zeros removed. A CIK made up only of zeros collapses to a
///   single `0` rather than an empty segment, so the path never contains `//` for a
///   non-empty CIK. An empty `cik` is passed through unchanged.
/// * `accession` has every `-` removed; an accession without dashes is accepted as is.
/// * `document` is inserted verbatim.
///
/// No scheme or host is prepended and no validation or escaping is performed.
pub fn build_archives_url(cik: &str, accession: &str, document: &str) -> String {
    let cik_unpadded = match cik.trim_start_matches('0') {
        // Only zeros were supplied: keep one so the path segment is not empty.
        "" if !cik.is_empty() => "0",
        trimmed => trimmed,
    };
    let acc_no_dashes = accession.replace('-', "");
    format!("/Archives/edgar/data/{cik_unpadded}/{acc_no_dashes}/{document}")
}
/// Returns the current UTC time as an RFC 3339 string.
///
/// The timestamp is formatted with whole-second precision, and the `true` flag asks
/// chrono to write the UTC offset as `Z`.
pub fn now_iso() -> String {
    let now = chrono::Utc::now();
    now.to_rfc3339_opts(SecondsFormat::Secs, true)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Path both `build_archives_url` tests expect for CIK 320193.
    const EXPECTED_DOC_URL: &str = "/Archives/edgar/data/320193/000110465925073753/doc.xml";

    /// Zero-padded CIK and dashed accession are both normalised.
    #[test]
    fn build_url_strips_leading_zeros_and_dashes() {
        let built = build_archives_url("0000320193", "0001104659-25-073753", "doc.xml");
        assert_eq!(built, EXPECTED_DOC_URL);
    }

    /// Already-normalised inputs pass through unchanged.
    #[test]
    fn build_url_accepts_already_no_dash_accession() {
        let built = build_archives_url("320193", "000110465925073753", "doc.xml");
        assert_eq!(built, EXPECTED_DOC_URL);
    }

    /// The header advertises exactly eight columns.
    #[test]
    fn provenance_header_has_eight_columns() {
        let column_count = Provenance::HEADER.len();
        assert_eq!(column_count, 8);
    }

    /// Every cell of a record with only the lot set matches its expected text.
    #[test]
    fn provenance_as_cells_emits_eight_strings() {
        let record = Provenance::for_filing(
            "4",
            "0001104659-25-073753",
            "0001318605",
            "form4.xml",
            "2026-05-20T10:00:00Z",
        )
        .with_lot(3);

        let expected = [
            "4",
            "0001104659-25-073753",
            "form4.xml",
            "/Archives/edgar/data/1318605/000110465925073753/form4.xml",
            "3",
            "",
            "",
            "2026-05-20T10:00:00Z",
        ];

        let cells = record.as_cells();
        assert_eq!(cells.len(), 8);
        for (actual, wanted) in cells.iter().zip(expected) {
            assert_eq!(actual.as_str(), wanted);
        }
    }

    /// Lot, page and paragraph render as empty cells when never set.
    #[test]
    fn provenance_lot_page_paragraph_optional() {
        let cells = Provenance::for_filing("DEF 14A", "x", "1", "p.htm", "t").as_cells();
        for cell in &cells[4..=6] {
            assert_eq!(cell.as_str(), "");
        }
    }
}