1use serde::{Deserialize, Serialize};
4use std::collections::BTreeMap;
5
6#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
7pub struct InlineFragment {
8 pub element_name: String,
9 pub fact_name: Option<String>,
10 pub context_ref: Option<String>,
11 pub unit_ref: Option<String>,
12 pub decimals: Option<String>,
13 pub value: String,
14 #[serde(default)]
15 pub attributes: BTreeMap<String, String>,
16}
17
18#[must_use]
19pub fn scan_inline_fragments(html: &str) -> Vec<InlineFragment> {
20 let mut fragments = Vec::new();
21 let mut offset = 0;
22 while let Some(relative_start) = html[offset..].find("<ix:") {
23 let start = offset + relative_start;
24 let Some(tag_end) = find_tag_end(html, start) else {
25 break;
26 };
27 let tag_body = &html[start + 1..tag_end];
28 if tag_body.starts_with('/') {
29 offset = tag_end + 1;
30 continue;
31 }
32 let self_closing = tag_body.trim_end().ends_with('/');
33 let trimmed_tag = tag_body.trim_end_matches('/').trim();
34 let (element_name, attributes) = parse_start_tag(trimmed_tag);
35 if !element_name.starts_with("ix:") {
36 offset = tag_end + 1;
37 continue;
38 }
39 let (value, next_offset) = if self_closing {
40 (String::new(), tag_end + 1)
41 } else {
42 let close_tag = format!("</{element_name}>");
43 if let Some(relative_close) = html[tag_end + 1..].find(&close_tag) {
44 let value_end = tag_end + 1 + relative_close;
45 (
46 strip_tags(&html[tag_end + 1..value_end]).trim().to_string(),
47 value_end + close_tag.len(),
48 )
49 } else {
50 (String::new(), tag_end + 1)
51 }
52 };
53 fragments.push(InlineFragment {
54 element_name: element_name.to_string(),
55 fact_name: attributes.get("name").cloned(),
56 context_ref: attributes.get("contextRef").cloned(),
57 unit_ref: attributes.get("unitRef").cloned(),
58 decimals: attributes.get("decimals").cloned(),
59 value,
60 attributes,
61 });
62 offset = next_offset;
63 }
64 fragments
65}
66
/// Returns the byte index of the `>` that terminates the tag starting at
/// byte offset `start`, or `None` when no such `>` exists.
///
/// A `>` inside a quoted attribute value does not terminate the tag. Both
/// `"` and `'` are recognised as delimiters, and a quoted value only ends at
/// the same character that opened it, so `title="it's"` is handled correctly.
///
/// # Panics
///
/// Panics if `start` is past the end of `html` or not on a character
/// boundary, because `html` is sliced at `start`.
fn find_tag_end(html: &str, start: usize) -> Option<usize> {
    // Delimiter of the attribute value currently being skipped, if any.
    let mut open_quote: Option<char> = None;
    for (relative_index, ch) in html[start..].char_indices() {
        match (open_quote, ch) {
            (Some(quote), _) if ch == quote => open_quote = None,
            (Some(_), _) => {}
            (None, '"' | '\'') => open_quote = Some(ch),
            (None, '>') => return Some(start + relative_index),
            (None, _) => {}
        }
    }
    None
}
78
79fn parse_start_tag(tag_body: &str) -> (&str, BTreeMap<String, String>) {
80 let mut split_index = tag_body.len();
81 for (index, ch) in tag_body.char_indices() {
82 if ch.is_whitespace() {
83 split_index = index;
84 break;
85 }
86 }
87 let element_name = &tag_body[..split_index];
88 let attributes = parse_attributes(&tag_body[split_index..]);
89 (element_name, attributes)
90}
91
/// Parses the attribute portion of a start tag into a name → value map.
///
/// Accepted forms, separated by ASCII whitespace:
///
/// * `key="value"` and `key='value'` — the value runs to the matching quote
///   character (or to the end of input if the quote is never closed);
/// * `key=value` — an unquoted value runs to the next ASCII whitespace;
/// * `key` — a bare key is stored with an empty value.
///
/// Whitespace around `=` is tolerated. A value with no key in front of it is
/// discarded, and when a key repeats the last occurrence wins.
fn parse_attributes(raw: &str) -> BTreeMap<String, String> {
    // Scanning is done on bytes. Every slice boundary used below sits next to
    // an ASCII byte or at the end of the input, so it is always a valid
    // character boundary in `raw`.
    let bytes = raw.as_bytes();
    let skip_whitespace = |mut at: usize| {
        while at < bytes.len() && bytes[at].is_ascii_whitespace() {
            at += 1;
        }
        at
    };

    let mut attributes = BTreeMap::new();
    let mut index = 0;
    loop {
        index = skip_whitespace(index);
        if index >= bytes.len() {
            break;
        }

        let key_start = index;
        while index < bytes.len() && !bytes[index].is_ascii_whitespace() && bytes[index] != b'=' {
            index += 1;
        }
        let key = raw[key_start..index].trim();
        index = skip_whitespace(index);

        let value = if bytes.get(index) == Some(&b'=') {
            index = skip_whitespace(index + 1);
            match bytes.get(index).copied() {
                // Quoted value: ends at the same delimiter that opened it.
                Some(quote @ (b'"' | b'\'')) => {
                    let value_start = index + 1;
                    let value_end = bytes[value_start..]
                        .iter()
                        .position(|&byte| byte == quote)
                        .map_or(bytes.len(), |found| value_start + found);
                    // Step over the closing quote when there is one.
                    index = (value_end + 1).min(bytes.len());
                    &raw[value_start..value_end]
                }
                // Unquoted (or missing) value: runs to the next whitespace.
                _ => {
                    let value_start = index;
                    while index < bytes.len() && !bytes[index].is_ascii_whitespace() {
                        index += 1;
                    }
                    &raw[value_start..index]
                }
            }
        } else {
            ""
        };

        if !key.is_empty() {
            attributes.insert(key.to_string(), value.to_string());
        }
    }
    attributes
}
143
/// Returns `value` with markup removed, keeping only the text between tags.
///
/// Everything from a `<` up to and including the next `>` is dropped. A `>`
/// that appears outside of any tag is ordinary text and is preserved, so
/// content such as `a > b` survives intact. Entity references are left as
/// written. An unterminated `<` discards the rest of the input.
fn strip_tags(value: &str) -> String {
    let mut text = String::with_capacity(value.len());
    let mut in_tag = false;
    for ch in value.chars() {
        match ch {
            '<' => in_tag = true,
            // Only a `>` that closes an open tag is markup.
            '>' if in_tag => in_tag = false,
            _ if !in_tag => text.push(ch),
            _ => {}
        }
    }
    text
}
157
#[cfg(test)]
mod tests {
    use super::scan_inline_fragments;

    /// A single non-numeric fact exposes its element name, the well-known
    /// attributes, and its text content.
    #[test]
    fn parses_inline_fact_attributes_and_value() {
        let document = r#"<html><body><ix:nonNumeric name="dei:DocumentType" contextRef="c1">10-K</ix:nonNumeric></body></html>"#;
        let parsed = scan_inline_fragments(document);

        assert_eq!(parsed.len(), 1);
        let fact = &parsed[0];
        assert_eq!(fact.element_name, "ix:nonNumeric");
        assert_eq!(fact.fact_name.as_deref(), Some("dei:DocumentType"));
        assert_eq!(fact.context_ref.as_deref(), Some("c1"));
        assert_eq!(fact.value, "10-K");
    }

    /// Attributes without a dedicated field remain available through the
    /// raw attribute map so later rule checks can inspect them.
    #[test]
    fn captures_banned_attributes_for_rule_checks() {
        let document = r#"<html><body><ix:nonFraction name="us-gaap:Assets" contextRef="c1" xml:base="https://example.com">100</ix:nonFraction></body></html>"#;
        let parsed = scan_inline_fragments(document);

        assert_eq!(parsed.len(), 1);
        let base = parsed[0].attributes.get("xml:base").map(String::as_str);
        assert_eq!(base, Some("https://example.com"));
    }
}