/// One parsed data row of a Summary Compensation Table: a single person in a
/// single fiscal year.
///
/// Monetary fields hold the numeric value parsed from the cell text; `None`
/// means the column could not be attributed for this row.
#[derive(Debug, Clone, PartialEq)]
pub struct CompensationRow {
/// Text of the name cell before its first comma. Rows whose name cell is
/// blank inherit the name of the closest preceding named row.
pub person_name: String,
/// Text of the name cell after its first comma; empty when the cell has no
/// comma. Inherited together with `person_name` on blank-name rows.
pub position_title: String,
/// The four-digit year cell, trimmed, kept as text.
pub fiscal_year: String,
/// First monetary value found after the year cell.
pub salary: Option<f64>,
/// Second monetary value; only set when the row has exactly eight monetary
/// values after the year cell.
pub bonus: Option<f64>,
/// Third monetary value; only set for eight-value rows.
pub stock_awards: Option<f64>,
/// Fourth monetary value; only set for eight-value rows.
pub option_awards: Option<f64>,
/// Fifth monetary value; only set for eight-value rows.
pub non_equity_incentive: Option<f64>,
/// Sixth monetary value; only set for eight-value rows.
pub pension_change: Option<f64>,
/// Seventh monetary value; only set for eight-value rows.
pub other_compensation: Option<f64>,
/// Last monetary value found after the year cell.
pub total: Option<f64>,
}
/// Extracts the data rows of the Summary Compensation Table contained in `html`.
///
/// The table is located with `find_table_segment`, split into rows with
/// `table_rows`, and each row is interpreted by `parse_data_row`; rows that do
/// not parse (headers, prose, spacer rows) are dropped.
///
/// Multi-year tables usually print the executive's name only on the first of
/// their rows. A parsed row with an empty name therefore inherits the name and
/// title of the most recent named row; if no named row has been seen yet, the
/// row is discarded.
///
/// Returns an empty vector when no table segment is found.
pub fn extract_summary_compensation(html: &str) -> Vec<CompensationRow> {
    let segment = match find_table_segment(html) {
        Some(found) => found,
        None => return Vec::new(),
    };

    // (name, title) of the latest row that carried its own name cell.
    let mut carried: Option<(String, String)> = None;
    let mut rows = Vec::new();

    for cells in table_rows(segment) {
        let Some(mut row) = parse_data_row(&cells) else {
            continue;
        };

        if row.person_name.is_empty() {
            match &carried {
                Some((name, title)) => {
                    row.person_name = name.clone();
                    row.position_title = title.clone();
                }
                // A nameless row before any named one cannot be attributed.
                None => continue,
            }
        } else {
            carried = Some((row.person_name.clone(), row.position_title.clone()));
        }

        rows.push(row);
    }

    rows
}
/// Finds the slice of `html` that starts at the Summary Compensation Table heading.
///
/// Every case-insensitive occurrence of the text "summary compensation table"
/// is a candidate. A candidate is accepted when the tag-stripped text of the
/// following 6,000 bytes mentions both "salary" and "total"; this is what
/// rejects table-of-contents entries and passing references. The first
/// accepted candidate wins.
///
/// Returns up to 40,000 bytes of `html` beginning at the heading, or `None`
/// when no candidate is accepted. Both windows are byte counts and are shrunk
/// to the nearest preceding UTF-8 character boundary, so documents containing
/// multi-byte characters never cause a slicing panic.
fn find_table_segment(html: &str) -> Option<&str> {
    const NEEDLE: &str = "summary compensation table";
    // Bytes after the heading that are inspected for the column-header signature.
    const SIGNATURE_WINDOW: usize = 6_000;
    // Maximum number of bytes returned to the caller.
    const SEGMENT_WINDOW: usize = 40_000;

    // Largest char boundary of `s` that is `<= idx` (with `idx` clamped to `s.len()`).
    fn floor_boundary(s: &str, idx: usize) -> usize {
        let mut end = idx.min(s.len());
        // Offset 0 is always a boundary, so this terminates.
        while !s.is_char_boundary(end) {
            end -= 1;
        }
        end
    }

    // ASCII lowercasing keeps byte offsets identical between `lower` and `html`.
    let lower = html.to_ascii_lowercase();
    let mut from = 0;
    while let Some(rel) = lower[from..].find(NEEDLE) {
        let idx = from + rel;
        let sig_end = floor_boundary(html, idx + SIGNATURE_WINDOW);
        let sig = strip_tags(&html[idx..sig_end]).to_ascii_lowercase();
        if sig.contains("salary") && sig.contains("total") {
            let seg_end = floor_boundary(html, idx + SEGMENT_WINDOW);
            return Some(&html[idx..seg_end]);
        }
        from = idx + NEEDLE.len();
    }
    None
}
/// Splits a table segment into rows, each row being its list of cell texts.
///
/// A row ends at every closing `</tr` tag (matched case-insensitively). The
/// text between the previous row end, or the start of `segment`, and that
/// closing tag is passed to `split_cells`, so anything preceding the first
/// `<tr>` becomes part of the first row. Text remaining after the last closing
/// tag is emitted as one final row.
///
/// If a closing tag is cut off before its `>` (the segment may have been
/// truncated mid-tag), its row is emitted once and scanning stops, so the
/// truncated row is not reported a second time as trailing text.
pub(crate) fn table_rows(segment: &str) -> Vec<Vec<String>> {
    const ROW_CLOSE: &str = "</tr";

    // ASCII lowercasing keeps byte offsets identical between `lower` and `segment`.
    let lower = segment.to_ascii_lowercase();
    let mut rows: Vec<Vec<String>> = Vec::new();
    let mut cursor = 0;

    while let Some(rel) = lower[cursor..].find(ROW_CLOSE) {
        let close = cursor + rel;
        rows.push(split_cells(&segment[cursor..close]));
        match lower[close..].find('>') {
            Some(gt) => cursor = close + gt + 1,
            None => {
                // Unterminated closing tag: everything left belongs to it.
                cursor = segment.len();
                break;
            }
        }
    }

    if cursor < segment.len() {
        rows.push(split_cells(&segment[cursor..]));
    }
    rows
}
/// Splits the HTML of one table row into the plain text of its non-empty cells.
///
/// A cell ends at each closing tag whose name starts with `td` or `th`
/// (matched case-insensitively). The markup between the previous cell end and
/// that tag is reduced to text with `strip_tags`. Text after the last closing
/// tag is treated as one more cell. Cells whose text is empty are dropped, so
/// the returned indices do not necessarily match the table's column positions.
///
/// If a closing tag is cut off before its `>` (truncated input), its cell is
/// emitted once and scanning stops, so the cell is not repeated as trailing
/// text.
fn split_cells(row_html: &str) -> Vec<String> {
    // ASCII lowercasing keeps byte offsets identical between `lower` and `row_html`.
    let lower = row_html.to_ascii_lowercase();
    let bytes = lower.as_bytes();
    let mut cells: Vec<String> = Vec::new();
    let mut start = 0;

    for (close, _) in lower.match_indices("</t") {
        // Skip matches inside a closing tag that was already consumed, and
        // closing tags that are neither `</td…` nor `</th…`.
        let is_cell_close = matches!(bytes.get(close + 3), Some(b'd' | b'h'));
        if close < start || !is_cell_close {
            continue;
        }
        cells.push(strip_tags(&row_html[start..close]));
        match lower[close..].find('>') {
            Some(gt) => start = close + gt + 1,
            None => {
                // Unterminated closing tag: nothing usable follows it.
                start = row_html.len();
                break;
            }
        }
    }

    if start < row_html.len() {
        cells.push(strip_tags(&row_html[start..]));
    }
    cells.retain(|cell| !cell.is_empty());
    cells
}
/// Reduces an HTML fragment to plain text.
///
/// * Everything from a `<` up to the next `>` is dropped; each `>` contributes
///   a space so that adjacent elements do not fuse into one word.
/// * A small set of character references is decoded: `amp` becomes `&`;
///   `nbsp` and `#160` become a space; `mdash`, `ndash`, `#8212` and `#8211`
///   become an ASCII hyphen. A reference is only recognised when the name is
///   immediately followed by `;`.
/// * Any other ampersand, including a bare one as in "AT&T", is copied through
///   unchanged together with the text that follows it.
/// * Runs of whitespace are collapsed to single spaces and the result is trimmed.
fn strip_tags(html: &str) -> String {
    // Decodes the reference whose text (after the `&`) starts `after_amp`.
    // Returns the replacement char and how many chars after the `&` it spans.
    fn decode_entity(after_amp: &str) -> Option<(char, usize)> {
        // Longest recognised name, in bytes; bounds the search for `;`.
        const MAX_NAME_LEN: usize = 5;
        let semi = after_amp
            .bytes()
            .take(MAX_NAME_LEN + 1)
            .position(|b| b == b';')?;
        // `;` is ASCII, so `semi` is a char boundary.
        let replacement = match &after_amp[..semi] {
            "amp" => '&',
            "nbsp" | "#160" => ' ',
            "mdash" | "ndash" | "#8212" | "#8211" => '-',
            _ => return None,
        };
        // Recognised names are ASCII, so bytes and chars coincide.
        Some((replacement, semi + 1))
    }

    let mut out = String::with_capacity(html.len());
    let mut in_tag = false;
    let mut chars = html.char_indices();
    while let Some((i, c)) = chars.next() {
        match c {
            '<' => in_tag = true,
            '>' => {
                in_tag = false;
                out.push(' ');
            }
            _ if in_tag => {}
            '&' => match decode_entity(&html[i + 1..]) {
                Some((replacement, skip)) => {
                    out.push(replacement);
                    // Consume the name and its terminating `;`.
                    chars.nth(skip - 1);
                }
                None => out.push('&'),
            },
            _ => out.push(c),
        }
    }
    out.split_whitespace().collect::<Vec<_>>().join(" ")
}
/// Interprets one row of cell texts as a compensation record.
///
/// The row must have at least three cells and contain a fiscal-year cell (see
/// `is_fiscal_year`); otherwise `None` is returned.
///
/// * Name and title come from the first cell before the year that
///   `looks_like_name`, split by `split_name_title`. Both stay empty when no
///   such cell exists.
/// * Every cell after the year that `clean_money` can parse is collected as a
///   monetary value; a row with none yields `None`.
/// * With exactly eight values the first seven map, in order, to salary,
///   bonus, stock awards, option awards, non-equity incentive, pension change
///   and other compensation. With any other count only `salary` (the first
///   value) is set.
/// * `total` is always the last value.
fn parse_data_row(cells: &[String]) -> Option<CompensationRow> {
    if cells.len() < 3 {
        return None;
    }

    let year_idx = cells.iter().position(|cell| is_fiscal_year(cell))?;
    // `from_year[0]` is the year cell itself.
    let (before_year, from_year) = cells.split_at(year_idx);
    let fiscal_year = from_year[0].trim().to_string();

    let (person_name, position_title) = before_year
        .iter()
        .find(|cell| looks_like_name(cell.as_str()))
        .map(|cell| split_name_title(cell))
        .unwrap_or_default();

    let amounts: Vec<f64> = from_year[1..]
        .iter()
        .filter_map(|cell| clean_money(cell))
        .collect();
    if amounts.is_empty() {
        return None;
    }

    // Only a full eight-value row can be mapped column by column.
    let columns: [Option<f64>; 7] = match *amounts.as_slice() {
        [a, b, c, d, e, f, g, _] => [a, b, c, d, e, f, g].map(Some),
        _ => [amounts.first().copied(), None, None, None, None, None, None],
    };
    let [salary, bonus, stock_awards, option_awards, non_equity_incentive, pension_change, other_compensation] =
        columns;

    Some(CompensationRow {
        person_name,
        position_title,
        fiscal_year,
        salary,
        bonus,
        stock_awards,
        option_awards,
        non_equity_incentive,
        pension_change,
        other_compensation,
        total: amounts.last().copied(),
    })
}
/// Returns `true` when `cell`, ignoring surrounding whitespace, is exactly four
/// ASCII digits forming a year between 1995 and 2099 inclusive.
fn is_fiscal_year(cell: &str) -> bool {
    let candidate = cell.trim();
    if candidate.len() != 4 || !candidate.bytes().all(|b| b.is_ascii_digit()) {
        return false;
    }
    matches!(candidate.parse::<u16>(), Ok(1995..=2099))
}
/// Heuristic test for a "Name, Title" cell.
///
/// The text before the first comma (the whole cell when there is none) must
/// contain between two and six whitespace-separated words, and the whole cell
/// must contain at least four ASCII letters and at most one ASCII digit.
fn looks_like_name(cell: &str) -> bool {
    let before_comma = match cell.split_once(',') {
        Some((head, _)) => head,
        None => cell,
    };
    let word_count = before_comma.split_whitespace().count();
    if !(2..=6).contains(&word_count) {
        return false;
    }

    // Letters and digits are counted over the entire cell, not just the head.
    let (letters, digits) = cell.chars().fold((0usize, 0usize), |(l, d), ch| {
        if ch.is_ascii_alphabetic() {
            (l + 1, d)
        } else if ch.is_ascii_digit() {
            (l, d + 1)
        } else {
            (l, d)
        }
    });
    letters >= 4 && digits <= 1
}
/// Splits a "Name, Title" cell at its first comma.
///
/// Returns `(name, title)`, both trimmed. When the cell has no comma the whole
/// trimmed cell is the name and the title is empty. Later commas stay inside
/// the title.
fn split_name_title(cell: &str) -> (String, String) {
    let (name, title) = cell.split_once(',').unwrap_or((cell, ""));
    (name.trim().to_string(), title.trim().to_string())
}
/// Parses the text of a monetary table cell.
///
/// * Trailing footnote references, i.e. parenthesised groups of at most two
///   digits such as `(1)` or `(12)`, are removed first, so "283,269 (1)"
///   parses as 283269 rather than having the footnote digit appended. A cell
///   consisting only of such references yields `None`.
/// * A lone hyphen, em dash or en dash means "no amount" and yields `Some(0.0)`.
/// * Otherwise `$`, `,`, spaces, parentheses and the footnote symbols `*`, `†`,
///   `‡` are ignored and the remaining digits and dots are parsed as `f64`.
///   Parentheses do not negate the value.
/// * Any other character, an empty cell, or an unparsable digit string yields
///   `None`.
fn clean_money(cell: &str) -> Option<f64> {
    let mut text = cell.trim();

    // Peel footnote references off the end, one group at a time.
    while let Some(open) = text.rfind('(') {
        let Some(inner) = text[open + 1..].strip_suffix(')') else {
            break;
        };
        if inner.len() > 2 || !inner.bytes().all(|b| b.is_ascii_digit()) {
            break;
        }
        text = text[..open].trim_end();
    }

    if text.is_empty() {
        return None;
    }
    if matches!(text, "-" | "—" | "–") {
        return Some(0.0);
    }

    let mut digits = String::with_capacity(text.len());
    for c in text.chars() {
        match c {
            '0'..='9' | '.' => digits.push(c),
            // Currency formatting and footnote symbols carry no value.
            '$' | ',' | ' ' | '(' | ')' | '*' | '†' | '‡' => {}
            _ => return None,
        }
    }
    if digits.is_empty() {
        return None;
    }
    digits.parse::<f64>().ok()
}
// Unit tests for the Summary Compensation Table extractor.
#[cfg(test)]
mod tests {
use super::*;
// Minimal filing fragment: a heading, an introductory paragraph, and a table
// with a ten-column header row followed by three data rows. The last data row
// has an empty name cell and a different year (2019) than the row above it.
const SAMPLE_HTML: &str = r#"<html><body>
<h2>Executive Compensation</h2>
<p>See the discussion below.</p>
<h3>Summary Compensation Table</h3>
<p>The following table sets forth compensation for our named executive officers.</p>
<table>
<tr><th>Name and Principal Position</th><th>Year</th><th>Salary ($)</th><th>Bonus ($)</th>
<th>Stock Awards ($)</th><th>Option Awards ($)</th><th>Non-Equity Incentive ($)</th>
<th>Change in Pension ($)</th><th>All Other ($)</th><th>Total ($)</th></tr>
<tr><td>Elon Musk, Chief Executive Officer</td><td>2020</td><td>0</td><td>0</td>
<td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td></tr>
<tr><td>Zachary Kirkhorn, Chief Financial Officer</td><td>2020</td><td>283,269</td><td>0</td>
<td>0</td><td>0</td><td>0</td><td>0</td><td>1,000</td><td>284,269</td></tr>
<tr><td></td><td>2019</td><td>301,154</td><td>0</td>
<td>0</td><td>13,090,706</td><td>0</td><td>0</td><td>2,500</td><td>13,394,360</td></tr>
</table>
</body></html>"#;
// The header row is skipped and the three data rows are returned; the first
// row's name cell is split into name and title at the comma.
#[test]
fn extracts_named_executive_rows() {
let rows = extract_summary_compensation(SAMPLE_HTML);
assert_eq!(rows.len(), 3, "expected 3 comp rows, got {rows:?}");
assert_eq!(rows[0].person_name, "Elon Musk");
assert_eq!(rows[0].position_title, "Chief Executive Officer");
assert_eq!(rows[0].fiscal_year, "2020");
assert_eq!(rows[0].total, Some(0.0));
}
// A row with eight monetary values is mapped column by column.
#[test]
fn maps_full_eight_column_row() {
let rows = extract_summary_compensation(SAMPLE_HTML);
let cfo = &rows[1];
assert_eq!(cfo.person_name, "Zachary Kirkhorn");
assert_eq!(cfo.salary, Some(283_269.0));
assert_eq!(cfo.other_compensation, Some(1_000.0));
assert_eq!(cfo.total, Some(284_269.0));
}
// The row with an empty name cell inherits the name of the row above it.
#[test]
fn carries_name_forward_to_blank_year_row() {
let rows = extract_summary_compensation(SAMPLE_HTML);
assert_eq!(rows[2].person_name, "Zachary Kirkhorn");
assert_eq!(rows[2].fiscal_year, "2019");
assert_eq!(rows[2].option_awards, Some(13_090_706.0));
}
// Input without the table heading produces no rows.
#[test]
fn missing_table_returns_empty() {
assert!(extract_summary_compensation("<html>nothing here</html>").is_empty());
}
// A heading that is not followed by "salary" and "total" (here a
// table-of-contents style link with a page number) is not treated as the table.
#[test]
fn toc_entry_without_signature_is_skipped() {
let html = "<html><body><a>Summary Compensation Table</a> 46</body></html>";
assert!(extract_summary_compensation(html).is_empty());
}
// Spot checks for `clean_money`: currency formatting, an em dash as zero, a
// short parenthesised number rejected, and non-numeric text rejected.
#[test]
fn clean_money_handles_formats() {
assert_eq!(clean_money("$1,234,567"), Some(1_234_567.0));
assert_eq!(clean_money("—"), Some(0.0));
assert_eq!(clean_money("(5)"), None);
assert_eq!(clean_money("n/a"), None);
}
}