use std::collections::BTreeSet;
/// One row extracted from a beneficial-ownership table.
///
/// Values are produced by `parse_row`; the two `source_*` fields are filled in
/// afterwards by `extract_beneficial_ownership`.
#[derive(Debug, Clone, PartialEq)]
pub struct BeneficialOwner {
/// Holder name: the text of the row that precedes the share count.
pub name: String,
/// Category label assigned by `classify_holder`: `"group_total"`,
/// `"5pct_holder"` or `"director_officer"`.
pub holder_type: String,
/// Number of shares read from the row. The parser in this file only emits
/// rows for which a share count was found, so it always stores `Some`.
pub shares: Option<u64>,
/// Percentage taken from a token containing `%`, or `None` when the row has
/// no such token or it does not parse as a number.
pub percent_of_class: Option<f64>,
/// Approximate page number: one plus the number of `<<PAGE>>` markers seen
/// before the row.
pub source_page: usize,
/// Position of the row relative to the section heading. The extractor stores
/// the number of text lines between the heading and the row here.
// NOTE(review): despite the name this is a line offset, not a paragraph count.
pub source_paragraph: usize,
}
/// Extracts beneficial-ownership rows from the HTML of a filing.
///
/// The HTML is flattened to text with `strip_html`, a section heading is
/// located with `find_heading`, and the lines that follow are parsed with
/// `parse_row` until the next section heading (or a fixed line budget) is hit.
///
/// A heading phrase can appear more than once, for example in a table of
/// contents, where it is followed directly by another section title. When the
/// lines after one occurrence yield no rows, the next occurrence is tried, so
/// such an early mention does not hide the real table.
///
/// Returns an empty vector when no heading is found or no occurrence yields a
/// row. Rows with the same name and share count are reported once.
///
/// For every row, `source_page` is the approximate page number and
/// `source_paragraph` is the number of text lines between the heading and the
/// row.
pub fn extract_beneficial_ownership(html: &str) -> Vec<BeneficialOwner> {
    // Upper bound on how many lines after a heading are examined.
    const MAX_LINES_AFTER_HEADING: usize = 500;

    // Parses the rows that follow the heading at `heading_idx`.
    fn rows_after_heading(lines: &[&str], heading_idx: usize) -> Vec<BeneficialOwner> {
        let mut rows = Vec::new();
        let mut seen_keys: BTreeSet<String> = BTreeSet::new();
        let mut page = approx_page(lines, heading_idx);

        let candidates = lines
            .iter()
            .enumerate()
            .skip(heading_idx + 1)
            .take(MAX_LINES_AFTER_HEADING);
        for (idx, raw_line) in candidates {
            let line = raw_line.trim();
            if line.is_empty() {
                continue;
            }
            if is_next_section_heading(line) {
                break;
            }
            if is_page_break_marker(line) {
                page = page.saturating_add(1);
                continue;
            }
            let Some(row) = parse_row(line) else {
                continue;
            };
            // The same holder is often repeated (e.g. in a footnote); keep the first.
            let key = format!("{}|{}", row.name, row.shares.unwrap_or(0));
            if !seen_keys.insert(key) {
                continue;
            }
            rows.push(BeneficialOwner {
                source_page: page,
                // `idx` is always greater than `heading_idx` because of the `skip` above.
                source_paragraph: idx - heading_idx,
                ..row
            });
        }
        rows
    }

    let text = strip_html(html);
    let lines: Vec<&str> = text.lines().collect();

    let mut search_from = 0;
    while let Some(rel) = find_heading(&lines[search_from..]) {
        let heading_idx = search_from + rel;
        let rows = rows_after_heading(&lines, heading_idx);
        if !rows.is_empty() {
            return rows;
        }
        // Nothing usable after this occurrence; look for a later one.
        search_from = heading_idx + 1;
    }
    Vec::new()
}
/// Flattens HTML into plain text suitable for line-by-line parsing.
///
/// * Every tag is removed. Closing `tr`, `p`, `div`, `h1`-`h3` tags and `br`
///   tags become a newline, `hr` tags become a `<<PAGE>>` marker on its own
///   line, and every other tag becomes a single space. Tag names are compared
///   as ASCII-case-insensitive prefixes.
/// * A small set of character entities is decoded: `amp`, `nbsp` (to a plain
///   space), `quot`, `apos`, `lt`, `gt` and their common numeric forms. Any
///   other ampersand is copied through unchanged.
/// * All other text, including non-ASCII characters, is copied verbatim.
///
/// An unterminated tag swallows the rest of the input.
fn strip_html(html: &str) -> String {
    // Tag-name prefixes that end a line of text.
    const LINE_BREAK_TAGS: &[&str] = &["/tr", "/p", "br", "/div", "/h1", "/h2", "/h3"];
    // Tag-name prefixes that are treated as a page break.
    const PAGE_BREAK_TAGS: &[&str] = &["hr"];
    // Longest entity body (the text between the ampersand and the semicolon) considered.
    const MAX_ENTITY_BYTES: usize = 8;

    // True when `tag` starts with one of `prefixes`, ignoring ASCII case.
    fn tag_starts_with_any(tag: &str, prefixes: &[&str]) -> bool {
        prefixes.iter().any(|p| {
            tag.as_bytes()
                .get(..p.len())
                .map_or(false, |head| head.eq_ignore_ascii_case(p.as_bytes()))
        })
    }

    // Maps an entity body (without the ampersand and semicolon) to its character.
    fn decode_entity(body: &str) -> Option<char> {
        match body {
            "amp" => Some('&'),
            // Non-breaking spaces are folded into ordinary spaces so that
            // whitespace tokenisation downstream treats them as separators.
            "nbsp" | "#160" | "#xA0" | "#xa0" => Some(' '),
            "quot" | "#34" => Some('"'),
            "apos" | "#39" | "#039" => Some('\''),
            "lt" => Some('<'),
            "gt" => Some('>'),
            _ => None,
        }
    }

    let mut out = String::with_capacity(html.len());
    let mut rest = html;
    // Walk the input as `char`s so multi-byte UTF-8 sequences stay intact.
    while let Some(c) = rest.chars().next() {
        match c {
            '<' => {
                let after = &rest[1..];
                let Some(end) = after.find('>') else {
                    // Unterminated tag: nothing after it is text.
                    break;
                };
                let tag = &after[..end];
                let separator = if tag_starts_with_any(tag, LINE_BREAK_TAGS) {
                    "\n"
                } else if tag_starts_with_any(tag, PAGE_BREAK_TAGS) {
                    "\n<<PAGE>>\n"
                } else {
                    " "
                };
                out.push_str(separator);
                rest = &after[end + 1..];
            }
            '&' => {
                let after = &rest[1..];
                // Look for the closing semicolon close by; `;` is ASCII, so its
                // byte index is always a valid slicing boundary.
                let decoded = after
                    .bytes()
                    .take(MAX_ENTITY_BYTES + 1)
                    .position(|b| b == b';')
                    .and_then(|semi| decode_entity(&after[..semi]).map(|ch| (ch, semi)));
                match decoded {
                    Some((ch, semi)) => {
                        out.push(ch);
                        rest = &after[semi + 1..];
                    }
                    None => {
                        out.push('&');
                        rest = after;
                    }
                }
            }
            _ => {
                out.push(c);
                rest = &rest[c.len_utf8()..];
            }
        }
    }
    out
}
/// Returns the index of the first line that contains one of the known
/// ownership-section heading phrases, compared ASCII-case-insensitively.
///
/// Returns `None` when no line matches.
fn find_heading(lines: &[&str]) -> Option<usize> {
    const HEADING_PHRASES: &[&str] = &[
        "security ownership of certain beneficial owners",
        "beneficial ownership of common stock",
        "principal stockholders",
        "security ownership of management",
    ];

    lines.iter().position(|candidate| {
        let lowered = candidate.to_ascii_lowercase();
        HEADING_PHRASES
            .iter()
            .any(|&phrase| lowered.contains(phrase))
    })
}
/// Estimates the 1-based page number of the line at index `idx`.
///
/// The result is one plus the number of lines before `idx` that contain the
/// `<<PAGE>>` marker. An `idx` beyond the end of `lines` is clamped to the
/// slice length, so the whole slice is counted instead of panicking.
fn approx_page(lines: &[&str], idx: usize) -> usize {
    let breaks_before = lines
        .iter()
        .take(idx)
        .filter(|line| line.contains("<<PAGE>>"))
        .count();
    breaks_before + 1
}
/// Reports whether `line` carries the `<<PAGE>>` page-break marker anywhere
/// in its text.
fn is_page_break_marker(line: &str) -> bool {
    const PAGE_MARKER: &str = "<<PAGE>>";
    line.contains(PAGE_MARKER)
}
/// Heuristically decides whether `line` is the heading of a following section.
///
/// A line qualifies when it mentions one of a few heading keywords
/// (ASCII-case-insensitive substring match), is shorter than 120 bytes and
/// contains fewer than 6 ASCII digits. The digit limit keeps data rows, which
/// carry share counts, from being taken for headings.
fn is_next_section_heading(line: &str) -> bool {
    const KEYWORDS: &[&str] = &[
        "section",
        "item",
        "proposal",
        "executive compensation",
        "certain relationships",
    ];
    const MAX_HEADING_BYTES: usize = 120;
    const MAX_DIGITS: usize = 6;

    let lowered = line.to_ascii_lowercase();
    let mentions_keyword = KEYWORDS.iter().any(|&word| lowered.contains(word));
    let digit_count = line.bytes().filter(|b| b.is_ascii_digit()).count();

    mentions_keyword && line.len() < MAX_HEADING_BYTES && digit_count < MAX_DIGITS
}
/// Parses one line of flattened table text into a [`BeneficialOwner`].
///
/// The line is split on whitespace and scanned from right to left:
///
/// * a token containing `%` that parses as a number becomes
///   `percent_of_class` (the right-most such token wins);
/// * the first integer of at least 1,000 becomes `shares`, and scanning stops.
///
/// The holder name is the text to the left of the share count, with trailing
/// tokens that contain no letters removed. This keeps footnote markers such as
/// `(1)` and additional numeric columns out of the name.
///
/// Returns `None` when no share count is found, when no name remains, when the
/// name is shorter than 3 bytes, when it is one or two words without any
/// lowercase ASCII letter (typical of column headers), or when
/// `is_plausible_holder_name` rejects it.
///
/// `source_page` and `source_paragraph` are set to 0; the caller fills them in.
fn parse_row(line: &str) -> Option<BeneficialOwner> {
    // Smaller integers are far more likely to be footnote numbers or years' parts
    // than holdings, so they are not accepted as a share count.
    const MIN_SHARES: u64 = 1_000;
    const MIN_NAME_BYTES: usize = 3;

    let tokens: Vec<&str> = line.split_whitespace().collect();
    if tokens.len() < 2 {
        return None;
    }

    let mut percent: Option<f64> = None;
    let mut shares_at: Option<(usize, u64)> = None;
    for (i, tok) in tokens.iter().enumerate().rev() {
        let cleaned = clean_numeric(tok);
        if cleaned.is_empty() {
            continue;
        }
        if percent.is_none() && tok.contains('%') {
            percent = cleaned.parse::<f64>().ok();
            continue;
        }
        if let Ok(n) = cleaned.parse::<u64>() {
            if n >= MIN_SHARES {
                shares_at = Some((i, n));
                break;
            }
        }
    }
    let (shares_idx, shares_value) = shares_at?;

    // The name ends at the last token that contains a letter; anything after it
    // (footnote markers, currency signs, other numeric columns) is not part of it.
    let name_len = tokens[..shares_idx]
        .iter()
        .rposition(|tok| tok.chars().any(char::is_alphabetic))?
        + 1;
    let name = tokens[..name_len].join(" ");

    if name.len() < MIN_NAME_BYTES {
        return None;
    }
    let has_lowercase = name.bytes().any(|b| b.is_ascii_lowercase());
    if !has_lowercase && name_len <= 2 {
        return None;
    }
    if !is_plausible_holder_name(&name) {
        return None;
    }

    let holder_type = classify_holder(&name);
    Some(BeneficialOwner {
        name,
        holder_type,
        shares: Some(shares_value),
        percent_of_class: percent,
        source_page: 0,
        source_paragraph: 0,
    })
}
/// Reduces a table token to its digits and decimal points.
///
/// Thousands separators, currency and percent signs, spaces, parentheses and
/// footnote symbols (`*`, dagger, double dagger) are dropped. If the token
/// contains any other character it is not numeric and an empty string is
/// returned.
fn clean_numeric(s: &str) -> String {
    // '\u{2020}' is the dagger and '\u{2021}' the double dagger.
    const IGNORED: &[char] = &[',', '$', '%', ' ', '*', '\u{2020}', '\u{2021}', '(', ')'];

    let mut digits = String::with_capacity(s.len());
    for ch in s.chars() {
        if ch.is_ascii_digit() || ch == '.' {
            digits.push(ch);
        } else if !IGNORED.contains(&ch) {
            return String::new();
        }
    }
    digits
}
/// Filters out text that was picked up as a name but is clearly something else.
///
/// A candidate is rejected when it
/// * is longer than 70 bytes,
/// * contains one of the boilerplate phrases found in table captions and
///   footnotes (ASCII-case-insensitive), or
/// * ends in a comma followed by exactly two uppercase ASCII letters, the
///   shape of a "City, ST" address line.
fn is_plausible_holder_name(name: &str) -> bool {
    const MAX_NAME_BYTES: usize = 70;
    const NOISE: &[&str] = &[
        "based solely",
        "respect to",
        "the table",
        "number of shares",
        "beneficially owned",
        "common stock",
        "schedule 13",
        "footnote",
        "following table",
        "as of december",
        "pursuant to",
        "shares of",
    ];

    if name.len() > MAX_NAME_BYTES {
        return false;
    }

    let lowered = name.to_ascii_lowercase();
    let is_boilerplate = NOISE.iter().any(|&phrase| lowered.contains(phrase));

    let looks_like_address = name.rsplit_once(',').map_or(false, |(_, tail)| {
        let suffix = tail.trim();
        suffix.len() == 2 && suffix.bytes().all(|b| b.is_ascii_uppercase())
    });

    !is_boilerplate && !looks_like_address
}
/// Assigns a holder category based on the wording of `name`.
///
/// * `"group_total"` when the name contains a phrase describing directors
///   and officers collectively (ASCII-case-insensitive);
/// * `"5pct_holder"` when it contains a typical institutional marker such as
///   `LLC`, `Inc` or `Capital` (case-sensitive substring match);
/// * `"director_officer"` otherwise.
///
/// The group check runs first, so a group row is never classified as an
/// institution.
fn classify_holder(name: &str) -> String {
    const GROUP_PHRASES: &[&str] = &[
        "all directors",
        "directors and executive",
        "all executive officers",
        "officers as a group",
        "named executives",
    ];
    const INSTITUTIONAL_MARKERS: &[&str] = &[
        "LLC",
        "L.L.C.",
        "Inc",
        "Inc.",
        "Corp",
        "Corp.",
        "L.P.",
        " LP",
        "Capital",
        "Asset",
        "Holdings",
        "Management",
        "Investors",
        "Trust",
        "Partners",
        "Group",
        "Fund",
        "Advisors",
    ];

    let lowered = name.to_ascii_lowercase();
    let label = if GROUP_PHRASES.iter().any(|&phrase| lowered.contains(phrase)) {
        "group_total"
    } else if INSTITUTIONAL_MARKERS
        .iter()
        .any(|&marker| name.contains(marker))
    {
        "5pct_holder"
    } else {
        "director_officer"
    };
    label.to_string()
}
// Unit tests for the ownership-table extractor and its helper predicates.
#[cfg(test)]
mod tests {
use super::*;
// Minimal filing fragment: a heading, an ownership table with two
// institutional holders, two individuals and a group-total row, followed by
// the heading of the next section.
const SAMPLE_HTML: &str = r#"<html><body>
<h2>SECURITY OWNERSHIP OF CERTAIN BENEFICIAL OWNERS AND MANAGEMENT</h2>
<p>The following table sets forth ...</p>
<table>
<tr><th>Name</th><th>Shares</th><th>Percent</th></tr>
<tr><td>The Vanguard Group, Inc.</td><td>5,234,567</td><td>8.2%</td></tr>
<tr><td>BlackRock, Inc.</td><td>4,123,456</td><td>6.5%</td></tr>
<tr><td>Tim Cook</td><td>3,386,335</td><td>0.02%</td></tr>
<tr><td>Arthur D. Levinson</td><td>1,150,103</td><td>0.007%</td></tr>
<tr><td>All directors and executive officers as a group</td><td>5,500,000</td><td>0.03%</td></tr>
</table>
<h2>Item 12 — Executive Compensation</h2>
</body></html>"#;
// End-to-end check: institutional, individual and group rows are all found,
// and the group row carries the expected share count.
#[test]
fn extracts_typical_ownership_table() {
let owners = extract_beneficial_ownership(SAMPLE_HTML);
let names: Vec<String> = owners.iter().map(|o| o.name.clone()).collect();
assert!(owners.len() >= 4, "expected ≥4 owners, got {names:?}");
assert!(
names
.iter()
.any(|n| n.contains("Vanguard") || n.contains("BlackRock")),
"expected institutional holder, got {names:?}"
);
assert!(
names
.iter()
.any(|n| n.contains("Cook") || n.contains("Levinson")),
"expected individual holder, got {names:?}"
);
let group = owners
.iter()
.find(|o| o.holder_type == "group_total")
.expect("expected a group_total row");
assert_eq!(group.shares, Some(5_500_000));
}
// A row given as plain paragraph text (no table markup) whose name contains
// institutional markers is classified as a 5% holder.
#[test]
fn institutional_classification_recognises_llc() {
let owners = extract_beneficial_ownership(
r#"<html><body>
<h2>Beneficial Ownership of Common Stock</h2>
<p>Acme Capital LLC 1,000,000 5.0%</p>
</body></html>"#,
);
let acme = owners
.iter()
.find(|o| o.name.contains("Acme"))
.expect("Acme row");
assert_eq!(acme.holder_type, "5pct_holder");
}
// Without a recognised section heading nothing is extracted.
#[test]
fn missing_heading_returns_empty() {
assert!(
extract_beneficial_ownership("<html><body>Nothing relevant.</body></html>").is_empty()
);
}
// Currency signs, thousands separators and percent signs are stripped;
// tokens containing other characters yield an empty string.
#[test]
fn clean_numeric_handles_dollar_and_commas() {
assert_eq!(clean_numeric("$1,234,567"), "1234567");
assert_eq!(clean_numeric("3.45%"), "3.45");
assert_eq!(clean_numeric("not-a-number"), "");
}
// Genuine person, institution and group names pass the plausibility filter.
#[test]
fn plausible_holder_name_accepts_real_names() {
assert!(is_plausible_holder_name("Arthur D. Levinson"));
assert!(is_plausible_holder_name("The Vanguard Group, Inc."));
assert!(is_plausible_holder_name(
"All directors and executive officers as a group"
));
}
// Footnote boilerplate, table captions and "City, ST" address lines are
// rejected by the plausibility filter.
#[test]
fn plausible_holder_name_rejects_footnote_and_address_noise() {
assert!(!is_plausible_holder_name(
"Based solely on the Schedule 13G/A reporting ownership as of December 31"
));
assert!(!is_plausible_holder_name(
"The table below reports the number of shares of common stock"
));
assert!(!is_plausible_holder_name("Malvern, PA"));
assert!(!is_plausible_holder_name("New York, NY"));
}
}