use std::collections::BTreeSet;
/// One item heading found in an 8-K filing, e.g. `Item 5.02 Departure of ...`.
///
/// Field order matters: the derived `Ord` compares `item_code` first and
/// `description` second, which is what sorting a list of items relies on.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct EightKItem {
    /// Item number in `N.NN` form, such as `"5.02"`.
    pub item_code: String,
    /// Heading text that follows the item number.
    pub description: String,
}
/// Spellings of a non-breaking space that are normalised to a plain ASCII space
/// before scanning: the named entity, decimal and hex numeric entities, and the
/// raw U+00A0 character (written as an escape so it is visible in source).
const NBSP_SPELLINGS: [&str; 5] = ["&nbsp;", "&#160;", "&#xa0;", "&#xA0;", "\u{a0}"];

/// Replaces every non-breaking-space spelling in `text` with an ordinary space.
fn normalize_nbsp(text: &str) -> String {
    NBSP_SPELLINGS
        .iter()
        .fold(text.to_owned(), |acc, nbsp| acc.replace(*nbsp, " "))
}

/// Splits `chunk` into a leading `N.NN` item code and the text that follows it.
///
/// Returns `None` unless the chunk starts with digits, a literal `.`, and more
/// digits that together satisfy `is_item_code`.
fn split_leading_item_code(chunk: &str) -> Option<(&str, &str)> {
    let digits_len = |s: &str| s.find(|c: char| !c.is_ascii_digit()).unwrap_or(s.len());
    let major_len = digits_len(chunk);
    // Require a real decimal point; whitespace between the digit groups is not a code.
    let minor = chunk[major_len..].strip_prefix('.')?;
    // Everything up to `code_len` is ASCII, so this is always a char boundary.
    let code_len = major_len + 1 + digits_len(minor);
    let (code, rest) = chunk.split_at(code_len);
    is_item_code(code).then_some((code, rest))
}

/// Extracts the distinct 8-K item headings (`Item N.NN Title`) found in `text`.
///
/// `text` may be plain text or HTML; tags are removed with `strip_html` and
/// non-breaking spaces (entity or raw character) are turned into ordinary
/// spaces before scanning.
///
/// A heading is recognised when the literal `"Item "` is immediately followed
/// by a code accepted by `is_item_code`, and the first character after the code
/// (ignoring whitespace and `.`) is an ASCII uppercase letter. The uppercase
/// requirement filters out mid-sentence references such as
/// "as described under Item 5.02 of our prior report".
///
/// Each code is reported once, using its first qualifying occurrence. The
/// description is the text after the code up to the first `.` or newline,
/// trimmed. The result is sorted by item code.
pub fn extract_8k_items(text: &str) -> Vec<EightKItem> {
    let normalized = normalize_nbsp(&strip_html(text));
    let mut seen: BTreeSet<&str> = BTreeSet::new();
    let mut items: Vec<EightKItem> = Vec::new();

    // `split` yields the text *before* the first "Item " as its first element.
    // That preamble was never introduced by an "Item " marker, so skip it.
    for chunk in normalized.split("Item ").skip(1) {
        let Some((code, rest)) = split_leading_item_code(chunk) else {
            continue;
        };

        // Tolerate "Item 1.01. Title" and any spacing between code and title.
        let after = rest.trim_start_matches(|c: char| c.is_whitespace() || c == '.');
        if !after.chars().next().is_some_and(|c| c.is_ascii_uppercase()) {
            continue;
        }

        // Checked only after the heading test, so an earlier mid-sentence
        // reference cannot suppress a later real heading with the same code.
        if !seen.insert(code) {
            continue;
        }

        let description = after
            .split(['.', '\n'])
            .next()
            .unwrap_or("")
            .trim()
            .to_string();
        items.push(EightKItem {
            item_code: code.to_string(),
            description,
        });
    }

    items.sort();
    items
}
/// Returns `true` when `s` is exactly an 8-K item code of the form `N.NN`:
/// one ASCII digit, a dot, then two ASCII digits, with nothing else around it.
fn is_item_code(s: &str) -> bool {
    // Every accepted code is exactly four ASCII bytes, so matching on the raw
    // bytes covers the length, the single dot and the digit checks at once.
    match s.as_bytes() {
        [major, b'.', tens, ones] => {
            major.is_ascii_digit() && tens.is_ascii_digit() && ones.is_ascii_digit()
        }
        _ => false,
    }
}
/// Removes markup from `html` by dropping everything from a `<` up to and
/// including the next `>`.
///
/// This is a deliberately small character scanner, not an HTML parser: it does
/// not decode entities, does not insert whitespace where a tag was removed, and
/// treats any `<` as the start of a tag. A `>` that appears outside a tag is
/// ordinary text and is preserved.
fn strip_html(html: &str) -> String {
    let mut out = String::with_capacity(html.len());
    let mut in_tag = false;
    for c in html.chars() {
        match c {
            '<' => in_tag = true,
            // Only a `>` that closes an open tag is markup; a stray one falls
            // through to the text arm below instead of being dropped.
            '>' if in_tag => in_tag = false,
            _ if !in_tag => out.push(c),
            _ => {}
        }
    }
    out
}
// Unit tests for `extract_8k_items`, covering HTML input, de-duplication,
// mid-sentence references, malformed codes, empty input and description cut-off.
#[cfg(test)]
mod tests {
use super::*;
// Fixture: a small 8-K-style HTML body with three item headings (1.01, 5.02, 9.01).
// The first two have the "Item N.NN" part wrapped in <b> tags; the third is plain.
const SAMPLE_HTML: &str = r#"<html><body>
<p>This Current Report on Form 8-K is being filed to report:</p>
<p><b>Item 1.01</b> Entry into a Material Definitive Agreement.</p>
<p>On July 30, 2024, ABC Corp entered into ...</p>
<p><b>Item 5.02</b> Departure of Directors or Certain Officers; Election of Directors; Appointment of Certain Officers; Compensatory Arrangements of Certain Officers.</p>
<p>Item 9.01 Financial Statements and Exhibits.</p>
<p>(d) Exhibits</p>
</body></html>"#;
// All three headings in the HTML fixture are found and returned in code order.
#[test]
fn extracts_three_items_from_sample_html() {
let items = extract_8k_items(SAMPLE_HTML);
let codes: Vec<&str> = items.iter().map(|i| i.item_code.as_str()).collect();
assert_eq!(codes, vec!["1.01", "5.02", "9.01"]);
}
// The same code appearing twice yields a single item.
#[test]
fn deduplicates_repeated_codes() {
let s = "Item 5.02 Departure of Officers. Item 5.02 Election of Directors.";
let items = extract_8k_items(s);
assert_eq!(items.len(), 1);
assert_eq!(items[0].item_code, "5.02");
}
// Items given out of order come back sorted by code.
// NOTE(review): the test name refers to nbsp entities, but the fixture as shown
// appears to separate "Item" and the code with ordinary spaces — confirm whether
// it was meant to contain `&nbsp;` / U+00A0 and got normalised somewhere.
#[test]
fn decodes_nbsp_entity_separated_codes() {
let s = "Item 5.07 Submission of Matters. Item 1.01 Entry into Agreement.";
let items = extract_8k_items(s);
let codes: Vec<&str> = items.iter().map(|i| i.item_code.as_str()).collect();
assert_eq!(codes, vec!["1.01", "5.07"]);
}
// "Item 5.02 of our prior report" is followed by a lowercase word, so it is
// treated as a reference inside a sentence rather than a heading.
#[test]
fn skips_mid_sentence_item_references() {
let s = "Item 8.01 Other Events. As described under Item 5.02 of our prior report.";
let items = extract_8k_items(s);
let codes: Vec<&str> = items.iter().map(|i| i.item_code.as_str()).collect();
assert_eq!(codes, vec!["8.01"]);
}
// Codes with no fractional part, one fractional digit, or three fractional
// digits are all rejected.
#[test]
fn ignores_invalid_item_shapes() {
let s = "Item 1 (bare) and Item 5.0 (short) and Item 5.020 (long).";
let items = extract_8k_items(s);
assert_eq!(items.len(), 0);
}
// Empty text and markup-only input both produce no items.
#[test]
fn empty_input_yields_no_items() {
assert_eq!(extract_8k_items("").len(), 0);
assert_eq!(extract_8k_items("<html></html>").len(), 0);
}
// The description ends at the first '.', even when the next sentence follows
// with no space after the period ("Condition.On April ...").
#[test]
fn description_stops_at_first_sentence_on_inline_xbrl() {
let s = "Item 2.02  Results of Operations and Financial \
Condition.On April 2, 2026, the registrant published ...";
let items = extract_8k_items(s);
assert_eq!(items.len(), 1);
assert_eq!(items[0].item_code, "2.02");
assert_eq!(
items[0].description,
"Results of Operations and Financial Condition"
);
}
}