Skip to main content

ixhtml_scan/
lib.rs

1//! Inline HTML scanning primitives.
2
3use serde::{Deserialize, Serialize};
4use std::collections::BTreeMap;
5
/// One `ix:` element as recorded by [`scan_inline_fragments`].
///
/// The four `Option` fields are convenience copies of well-known attributes;
/// the same values (when present) also remain in [`InlineFragment::attributes`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
pub struct InlineFragment {
    /// Tag name exactly as written in the markup, including the `ix:` prefix.
    pub element_name: String,
    /// Value of the `name` attribute, if the start tag carries one.
    pub fact_name: Option<String>,
    /// Value of the `contextRef` attribute, if the start tag carries one.
    pub context_ref: Option<String>,
    /// Value of the `unitRef` attribute, if the start tag carries one.
    pub unit_ref: Option<String>,
    /// Value of the `decimals` attribute, kept as the raw attribute text.
    pub decimals: Option<String>,
    /// Text between the start tag and its end tag with nested markup removed and
    /// surrounding whitespace trimmed. The scanner leaves this empty for
    /// self-closing elements and when no matching end tag is found.
    pub value: String,
    /// Every attribute found on the start tag, keyed by attribute name. An
    /// attribute written without a value is stored with an empty string.
    /// Deserialization falls back to an empty map when the field is missing.
    #[serde(default)]
    pub attributes: BTreeMap<String, String>,
}
17
18#[must_use]
19pub fn scan_inline_fragments(html: &str) -> Vec<InlineFragment> {
20    let mut fragments = Vec::new();
21    let mut offset = 0;
22    while let Some(relative_start) = html[offset..].find("<ix:") {
23        let start = offset + relative_start;
24        let Some(tag_end) = find_tag_end(html, start) else {
25            break;
26        };
27        let tag_body = &html[start + 1..tag_end];
28        if tag_body.starts_with('/') {
29            offset = tag_end + 1;
30            continue;
31        }
32        let self_closing = tag_body.trim_end().ends_with('/');
33        let trimmed_tag = tag_body.trim_end_matches('/').trim();
34        let (element_name, attributes) = parse_start_tag(trimmed_tag);
35        if !element_name.starts_with("ix:") {
36            offset = tag_end + 1;
37            continue;
38        }
39        let (value, next_offset) = if self_closing {
40            (String::new(), tag_end + 1)
41        } else {
42            let close_tag = format!("</{element_name}>");
43            if let Some(relative_close) = html[tag_end + 1..].find(&close_tag) {
44                let value_end = tag_end + 1 + relative_close;
45                (
46                    strip_tags(&html[tag_end + 1..value_end]).trim().to_string(),
47                    value_end + close_tag.len(),
48                )
49            } else {
50                (String::new(), tag_end + 1)
51            }
52        };
53        fragments.push(InlineFragment {
54            element_name: element_name.to_string(),
55            fact_name: attributes.get("name").cloned(),
56            context_ref: attributes.get("contextRef").cloned(),
57            unit_ref: attributes.get("unitRef").cloned(),
58            decimals: attributes.get("decimals").cloned(),
59            value,
60            attributes,
61        });
62        offset = next_offset;
63    }
64    fragments
65}
66
/// Returns the byte index of the `>` that terminates the tag beginning at `start`.
///
/// A `>` inside a quoted attribute value does not end the tag. Both `"…"` and
/// `'…'` delimit a value; a quote of the other kind inside a quoted value is
/// ordinary text and does not affect the quoting state.
///
/// Returns `None` when no unquoted `>` follows `start`, or when `start` is not a
/// valid slice position in `html` (past the end, or inside a multi-byte
/// character).
fn find_tag_end(html: &str, start: usize) -> Option<usize> {
    // `get` avoids the panic that direct slicing would raise on a bad `start`.
    let tail = html.get(start..)?;
    // Holds the quote character that opened the attribute value being skipped.
    let mut open_quote: Option<char> = None;
    for (relative_index, ch) in tail.char_indices() {
        match (open_quote, ch) {
            (Some(quote), _) if ch == quote => open_quote = None,
            (Some(_), _) => {}
            (None, '"' | '\'') => open_quote = Some(ch),
            (None, '>') => return Some(start + relative_index),
            (None, _) => {}
        }
    }
    None
}
78
79fn parse_start_tag(tag_body: &str) -> (&str, BTreeMap<String, String>) {
80    let mut split_index = tag_body.len();
81    for (index, ch) in tag_body.char_indices() {
82        if ch.is_whitespace() {
83            split_index = index;
84            break;
85        }
86    }
87    let element_name = &tag_body[..split_index];
88    let attributes = parse_attributes(&tag_body[split_index..]);
89    (element_name, attributes)
90}
91
/// Parses the attribute portion of a start tag into a name → value map.
///
/// Accepted forms, separated by ASCII whitespace:
/// - `key="value"` and `key='value'` — the value runs to the matching quote (or
///   to the end of input when the quote is never closed) and may contain
///   whitespace and the other quote character;
/// - `key=value` — an unquoted value runs to the next whitespace;
/// - `key` — a bare attribute, stored with an empty value.
///
/// Whitespace around `=` is tolerated. A value with no key (input such as
/// `="x"`) is discarded. When a key repeats, the last occurrence wins.
fn parse_attributes(raw: &str) -> BTreeMap<String, String> {
    let bytes = raw.as_bytes();
    let skip_whitespace = |mut index: usize| -> usize {
        while index < bytes.len() && bytes[index].is_ascii_whitespace() {
            index += 1;
        }
        index
    };

    let mut attributes = BTreeMap::new();
    let mut index = 0;
    loop {
        index = skip_whitespace(index);
        if index >= bytes.len() {
            break;
        }

        // The key ends at whitespace or `=`, both ASCII, so slicing is boundary-safe.
        let key_start = index;
        while index < bytes.len() && !bytes[index].is_ascii_whitespace() && bytes[index] != b'=' {
            index += 1;
        }
        let key = &raw[key_start..index];

        index = skip_whitespace(index);
        let value = if bytes.get(index) == Some(&b'=') {
            index = skip_whitespace(index + 1);
            match bytes.get(index).copied() {
                // Quoted value: read up to the same quote character that opened it.
                Some(quote @ (b'"' | b'\'')) => {
                    let value_start = index + 1;
                    let value_end = bytes[value_start..]
                        .iter()
                        .position(|&byte| byte == quote)
                        .map_or(bytes.len(), |offset| value_start + offset);
                    // Step past the closing quote when there is one.
                    index = (value_end + 1).min(bytes.len());
                    &raw[value_start..value_end]
                }
                // Unquoted (or missing) value: read up to the next whitespace.
                _ => {
                    let value_start = index;
                    while index < bytes.len() && !bytes[index].is_ascii_whitespace() {
                        index += 1;
                    }
                    &raw[value_start..index]
                }
            }
        } else {
            ""
        };

        if !key.is_empty() {
            attributes.insert(key.to_string(), value.to_string());
        }
    }
    attributes
}
143
/// Returns `value` with markup removed: everything from a `<` through the next
/// `>` is dropped and all other characters are kept in order.
///
/// A `>` that appears in text outside any tag is kept as ordinary text, since
/// only `<` needs escaping there. Entity references such as `&amp;` are copied
/// through unchanged.
fn strip_tags(value: &str) -> String {
    let mut text = String::with_capacity(value.len());
    let mut in_tag = false;
    for ch in value.chars() {
        match ch {
            '<' => in_tag = true,
            // Only a `>` that actually closes a tag is markup.
            '>' if in_tag => in_tag = false,
            _ if !in_tag => text.push(ch),
            _ => {}
        }
    }
    text
}
157
#[cfg(test)]
mod tests {
    use super::scan_inline_fragments;

    /// A lone `ix:nonNumeric` fact produces one fragment carrying its element
    /// name, `name`/`contextRef` attributes, and text content.
    #[test]
    fn parses_inline_fact_attributes_and_value() {
        let document = r#"<html><body><ix:nonNumeric name="dei:DocumentType" contextRef="c1">10-K</ix:nonNumeric></body></html>"#;
        let scanned = scan_inline_fragments(document);

        assert_eq!(scanned.len(), 1);
        let fact = &scanned[0];
        assert_eq!(fact.element_name, "ix:nonNumeric");
        assert_eq!(fact.fact_name.as_deref(), Some("dei:DocumentType"));
        assert_eq!(fact.context_ref.as_deref(), Some("c1"));
        assert_eq!(fact.value, "10-K");
    }

    /// Attributes outside the well-known set (here `xml:base`) stay available in
    /// the raw attribute map so later rule checks can inspect them.
    #[test]
    fn captures_banned_attributes_for_rule_checks() {
        let document = r#"<html><body><ix:nonFraction name="us-gaap:Assets" contextRef="c1" xml:base="https://example.com">100</ix:nonFraction></body></html>"#;
        let scanned = scan_inline_fragments(document);

        assert_eq!(scanned.len(), 1);
        let base = scanned[0].attributes.get("xml:base").map(String::as_str);
        assert_eq!(base, Some("https://example.com"));
    }
}