use std::io::BufRead;
/// One parsed row of a filing index, as produced by `parse_master_idx`.
///
/// All textual fields are stored trimmed of surrounding whitespace.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FilingEntry {
    /// Numeric company identifier (CIK), taken from the first column of the row.
    pub cik: u64,
    /// Company name, taken from the second column.
    pub company_name: String,
    /// Form type (for example `10-K`), taken from the third column.
    pub form_type: String,
    /// Filing date text from the fourth column; the parser only accepts `YYYY-MM-DD`.
    pub date_filed: String,
    /// Path of the filing document, taken from the remainder of the row.
    pub file_path: String,
}

impl FilingEntry {
    /// Returns the accession number embedded in the last component of `file_path`.
    ///
    /// The last component is everything after the final `/`, or the whole path when it
    /// contains no `/`. A trailing `.txt` is removed if present; otherwise a trailing
    /// `-index.htm` is removed. The remaining stem is returned unvalidated.
    ///
    /// Returns `None` when the file name ends with neither suffix.
    pub fn accession_number(&self) -> Option<&str> {
        // Checked in order: the first suffix that matches wins.
        const SUFFIXES: [&str; 2] = [".txt", "-index.htm"];

        let file_name = match self.file_path.rsplit_once('/') {
            Some((_, tail)) => tail,
            None => self.file_path.as_str(),
        };

        SUFFIXES
            .iter()
            .find_map(|&suffix| file_name.strip_suffix(suffix))
    }
}
/// Errors yielded by the index parser, one per failing line.
#[derive(Debug, thiserror::Error)]
pub enum ParseError {
/// The underlying reader failed while fetching a line; wraps the original I/O error.
#[error("io error: {0}")]
Io(#[from] std::io::Error),
/// A line after the header could not be decoded: it had fewer than five
/// `|`-separated fields, a first field that is not a `u64`, or a fourth field
/// rejected by `is_iso_date`. `line` holds the offending line, already trimmed.
#[error("malformed entry: {line}")]
Malformed { line: String },
}
/// Lazily parses a filing index from `reader`, yielding one result per data line.
///
/// Every line up to and including the first one that (after trimming) starts with
/// `----` is treated as header and skipped. If no such line exists, no entries are
/// produced. After the header, blank lines are skipped and every other line yields
/// either a parsed [`FilingEntry`] or a [`ParseError::Malformed`].
///
/// Read failures are surfaced as [`ParseError::Io`] wherever they occur, including
/// inside the header. The iterator does not stop on errors; callers decide whether
/// to continue.
pub fn parse_master_idx<R: BufRead>(
    reader: R,
) -> impl Iterator<Item = Result<FilingEntry, ParseError>> {
    let lines = reader.lines();
    let past_header = false;
    MasterIdxParser { lines, past_header }
}
/// Iterator state behind `parse_master_idx`: a line source plus header progress.
struct MasterIdxParser<L> {
/// Source of raw lines; the `Iterator` impl requires items of `std::io::Result<String>`.
lines: L,
/// Set to `true` once a line starting with `----` has been consumed; entries are
/// only parsed after that point.
past_header: bool,
}
impl<L> Iterator for MasterIdxParser<L>
where
L: Iterator<Item = std::io::Result<String>>,
{
type Item = Result<FilingEntry, ParseError>;
fn next(&mut self) -> Option<Self::Item> {
loop {
let raw = match self.lines.next()? {
Ok(line) => line,
Err(e) => return Some(Err(e.into())),
};
let trimmed = raw.trim();
if !self.past_header {
if trimmed.starts_with("----") {
self.past_header = true;
}
continue;
}
if trimmed.is_empty() {
continue;
}
return Some(parse_entry(trimmed));
}
}
}
/// Decodes one `|`-separated index line into a [`FilingEntry`].
///
/// The line is split into at most five fields, so any further `|` characters stay
/// in the last field (the file path). Each field is trimmed before use.
///
/// # Errors
///
/// Returns [`ParseError::Malformed`] carrying a copy of `line` when there are fewer
/// than five fields, the first field does not parse as `u64`, or the fourth field
/// is rejected by `is_iso_date`.
fn parse_entry(line: &str) -> Result<FilingEntry, ParseError> {
    // Every failure reports the same error value, so build it in one place.
    let malformed = || ParseError::Malformed {
        line: line.to_string(),
    };

    let mut fields = line.splitn(5, '|').map(str::trim);
    let (cik, company_name, form_type, date_filed, file_path) = match (
        fields.next(),
        fields.next(),
        fields.next(),
        fields.next(),
        fields.next(),
    ) {
        (Some(cik), Some(name), Some(form), Some(date), Some(path)) => {
            (cik, name, form, date, path)
        }
        _ => return Err(malformed()),
    };

    let cik = cik.parse::<u64>().map_err(|_| malformed())?;
    if !is_iso_date(date_filed) {
        return Err(malformed());
    }

    Ok(FilingEntry {
        cik,
        company_name: company_name.to_owned(),
        form_type: form_type.to_owned(),
        date_filed: date_filed.to_owned(),
        file_path: file_path.to_owned(),
    })
}
/// Returns `true` when `s` is a real calendar date written as `YYYY-MM-DD`.
///
/// The input must be exactly ten bytes: four ASCII digits, `-`, two digits, `-`,
/// two digits. Beyond that shape, the month must be `01`-`12` and the day must exist
/// in that month, with February 29 accepted only in leap years (divisible by 4, and
/// either not divisible by 100 or divisible by 400). Any four-digit year is accepted.
///
/// A shape-only check would let impossible values such as `2024-13-45` through as
/// dates, so both the shape and the calendar are validated here.
fn is_iso_date(s: &str) -> bool {
    /// Reads a run of ASCII digits as a number; `None` if any byte is not a digit.
    fn number(digits: &[u8]) -> Option<u32> {
        digits.iter().try_fold(0u32, |acc, &byte| {
            (byte as char).to_digit(10).map(|digit| acc * 10 + digit)
        })
    }

    // Work on bytes so that non-ASCII input can never hit a char-boundary slice.
    let bytes = s.as_bytes();
    if bytes.len() != 10 || bytes[4] != b'-' || bytes[7] != b'-' {
        return false;
    }

    let (year, month, day) = match (
        number(&bytes[..4]),
        number(&bytes[5..7]),
        number(&bytes[8..]),
    ) {
        (Some(year), Some(month), Some(day)) => (year, month, day),
        _ => return false,
    };

    let is_leap_year = year % 4 == 0 && (year % 100 != 0 || year % 400 == 0);
    let days_in_month = match month {
        1 | 3 | 5 | 7 | 8 | 10 | 12 => 31,
        4 | 6 | 9 | 11 => 30,
        2 if is_leap_year => 29,
        2 => 28,
        _ => return false,
    };

    (1..=days_in_month).contains(&day)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a `FilingEntry` from borrowed parts so each test states only its data.
    fn entry(
        cik: u64,
        company_name: &str,
        form_type: &str,
        date_filed: &str,
        file_path: &str,
    ) -> FilingEntry {
        FilingEntry {
            cik,
            company_name: company_name.to_string(),
            form_type: form_type.to_string(),
            date_filed: date_filed.to_string(),
            file_path: file_path.to_string(),
        }
    }

    #[test]
    fn is_iso_date_validates() {
        for accepted in &["2024-11-01", "1993-01-01"] {
            assert!(is_iso_date(accepted), "{:?} should be accepted", accepted);
        }
        for rejected in &["2024-1-01", "24-11-01", "", "not-a-date"] {
            assert!(!is_iso_date(rejected), "{:?} should be rejected", rejected);
        }
    }

    #[test]
    fn accession_number_extracted_index_htm() {
        let filing = entry(
            320193,
            "APPLE INC",
            "10-K",
            "2024-11-01",
            "edgar/data/320193/0000320193-24-000123-index.htm",
        );
        assert_eq!(filing.accession_number(), Some("0000320193-24-000123"));
    }

    #[test]
    fn accession_number_extracted_txt() {
        let filing = entry(
            1000045,
            "NICHOLAS FINANCIAL INC",
            "10-Q",
            "2024-02-13",
            "edgar/data/1000045/0000950170-24-014566.txt",
        );
        assert_eq!(filing.accession_number(), Some("0000950170-24-014566"));
    }

    #[test]
    fn accession_number_none_for_unexpected_shape() {
        let filing = entry(
            1,
            "X",
            "10-K",
            "2024-01-01",
            "edgar/data/1/something-else.pdf",
        );
        assert_eq!(filing.accession_number(), None);
    }
}