1use std::collections::HashSet;
2
/// Result of recognizing the opening tag of a paired marker at the start of a string.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PairedMarkerOpen {
    /// Tag name as written: a legacy name (`m`, `keep`) or a short name such as `m12`.
    pub tag_name: String,
    /// Marker id: the `id` attribute value for legacy tags, the tag name itself for short tags.
    pub id: String,
    /// Byte length of the opening tag, counted from `<` through the closing `>`.
    pub len: usize,
}
9
/// Result of recognizing a self-closing (`.../>`) marker at the start of a string.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct EmptyMarker {
    /// Marker id: the `id` attribute value for legacy tags, the tag name itself for short tags.
    pub id: String,
    /// Byte length of the tag, counted from `<` through the closing `>`.
    pub len: usize,
}
15
/// Result of recognizing a closing marker tag (`</...>`) at the start of a string.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MarkerClose {
    /// Tag name between `</` and `>`: a legacy name (`m`, `keep`) or a short name such as `m12`.
    pub tag_name: String,
    /// Byte length of the closing tag, counted from `<` through `>`.
    pub len: usize,
}
21
22pub fn marker_ids_in_text(text: &str) -> Vec<String> {
23 let mut ids = Vec::new();
24 let mut rest = text;
25
26 while let Some(index) = rest.find('<') {
27 let tag = &rest[index..];
28 if let Some(open) = parse_paired_marker_open(tag) {
29 ids.push(open.id);
30 rest = &tag[open.len..];
31 } else if let Some(empty) = parse_empty_marker(tag) {
32 ids.push(empty.id);
33 rest = &tag[empty.len..];
34 } else if let Some(close) = parse_marker_close(tag) {
35 rest = &tag[close.len..];
36 } else {
37 rest = &tag[1..];
38 }
39 }
40
41 ids
42}
43
44pub fn extract_marker_id(tag: &str) -> Option<String> {
45 extract_marker_id_attr(tag).or_else(|| short_marker_name(tag).map(ToString::to_string))
46}
47
/// Extracts the quoted value of the `id` attribute from `tag`.
///
/// Only an `id=` that sits at an attribute boundary (at the very start of the
/// string or directly after whitespace) is considered, so attributes whose
/// names merely end in `id` (`data-id=`, `grid=`, `uuid=`) are not mistaken for
/// the marker id. The value must be wrapped in matching single or double quotes.
///
/// Returns `None` when there is no such attribute, the value is unquoted, or
/// the closing quote is missing.
fn extract_marker_id_attr(tag: &str) -> Option<String> {
    const ATTR: &str = "id=";

    // First `id=` occurrence that is a standalone attribute name.
    let attr_start = tag
        .match_indices(ATTR)
        .map(|(index, _)| index)
        .find(|&index| {
            tag[..index]
                .chars()
                .next_back()
                .map_or(true, char::is_whitespace)
        })?;

    let after_eq = &tag[attr_start + ATTR.len()..];
    let quote = after_eq.chars().next()?;
    if quote != '"' && quote != '\'' {
        return None;
    }

    // The value runs up to the next occurrence of the same quote character.
    let value = &after_eq[quote.len_utf8()..];
    let value_end = value.find(quote)?;
    Some(value[..value_end].to_string())
}
58
59pub fn parse_paired_marker_open(text: &str) -> Option<PairedMarkerOpen> {
60 if !text.starts_with('<') {
61 return None;
62 }
63 for tag_name in ["m", "keep"] {
64 let prefix = format!("<{tag_name} ");
65 if !text.starts_with(&prefix) {
66 continue;
67 }
68 let open_end = text.find('>')?;
69 if text[..open_end].ends_with('/') {
70 return None;
71 }
72 let id = extract_marker_id_attr(&text[..=open_end])?;
73 return Some(PairedMarkerOpen {
74 tag_name: tag_name.to_string(),
75 id,
76 len: open_end + 1,
77 });
78 }
79
80 let open_end = text.find('>')?;
81 if open_end == 0 {
82 return None;
83 }
84 if text[..open_end].ends_with('/') {
85 return None;
86 }
87 let name = &text[1..open_end];
88 if is_short_paired_marker_name(name) {
89 return Some(PairedMarkerOpen {
90 tag_name: name.to_string(),
91 id: name.to_string(),
92 len: open_end + 1,
93 });
94 }
95
96 None
97}
98
99pub fn parse_empty_marker(text: &str) -> Option<EmptyMarker> {
100 if !text.starts_with('<') {
101 return None;
102 }
103 for tag_name in ["ref", "m", "keep"] {
104 let prefix = format!("<{tag_name} ");
105 if !text.starts_with(&prefix) {
106 continue;
107 }
108 let end = text.find('>')?;
109 let tag = &text[..=end];
110 if !tag.ends_with("/>") {
111 return None;
112 }
113 let id = extract_marker_id_attr(tag)?;
114 return Some(EmptyMarker { id, len: end + 1 });
115 }
116
117 let end = text.find('>')?;
118 if end < 2 {
119 return None;
120 }
121 let tag = &text[..=end];
122 if !tag.ends_with("/>") {
123 return None;
124 }
125 let name = &text[1..end - 1];
126 if is_short_empty_marker_name(name) || is_short_paired_marker_name(name) {
127 return Some(EmptyMarker {
128 id: name.to_string(),
129 len: end + 1,
130 });
131 }
132
133 None
134}
135
136pub fn parse_marker_close(text: &str) -> Option<MarkerClose> {
137 if !text.starts_with("</") {
138 return None;
139 }
140 for tag_name in ["m", "keep"] {
141 let close = format!("</{tag_name}>");
142 if text.starts_with(&close) {
143 return Some(MarkerClose {
144 tag_name: tag_name.to_string(),
145 len: close.len(),
146 });
147 }
148 }
149
150 let end = text.find('>')?;
151 let name = &text[2..end];
152 if is_short_paired_marker_name(name) {
153 return Some(MarkerClose {
154 tag_name: name.to_string(),
155 len: end + 1,
156 });
157 }
158
159 None
160}
161
162pub fn is_marker_token(text: &str) -> bool {
163 let text = text.trim();
164 parse_paired_marker_open(text).is_some_and(|marker| marker.len == text.len())
165 || parse_empty_marker(text).is_some_and(|marker| marker.len == text.len())
166 || parse_marker_close(text).is_some_and(|marker| marker.len == text.len())
167}
168
169pub fn strip_marker_tokens(text: &str) -> String {
170 let mut output = String::new();
171 let mut rest = text;
172
173 while let Some(index) = rest.find('<') {
174 output.push_str(&rest[..index]);
175 let tag = &rest[index..];
176
177 if let Some(open) = parse_paired_marker_open(tag) {
178 rest = &tag[open.len..];
179 } else if let Some(empty) = parse_empty_marker(tag) {
180 rest = &tag[empty.len..];
181 } else if let Some(close) = parse_marker_close(tag) {
182 rest = &tag[close.len..];
183 } else {
184 output.push('<');
185 rest = &tag[1..];
186 }
187 }
188
189 output.push_str(rest);
190 output
191}
192
193fn short_marker_name(tag: &str) -> Option<&str> {
194 if let Some(open) = tag.strip_prefix("</") {
195 let name = open.strip_suffix('>')?;
196 return is_short_paired_marker_name(name).then_some(name);
197 }
198 let body = tag.strip_prefix('<')?.strip_suffix('>')?;
199 let name = body.strip_suffix('/').unwrap_or(body);
200 (is_short_paired_marker_name(name) || is_short_empty_marker_name(name)).then_some(name)
201}
202
/// True when `name` is the letter `m` followed by one or more ASCII digits (e.g. `m12`).
fn is_short_paired_marker_name(name: &str) -> bool {
    match name.strip_prefix('m') {
        Some(digits) if !digits.is_empty() => digits.bytes().all(|byte| byte.is_ascii_digit()),
        _ => false,
    }
}
207
/// True when `name` is the letter `r` followed by one or more ASCII digits (e.g. `r3`).
fn is_short_empty_marker_name(name: &str) -> bool {
    match name.strip_prefix('r') {
        Some(digits) if !digits.is_empty() => digits.bytes().all(|byte| byte.is_ascii_digit()),
        _ => false,
    }
}
212
213pub fn has_markers_in_expected_set(text: &str, expected: &HashSet<String>) -> bool {
214 let actual_set: HashSet<String> = marker_ids_in_text(text).into_iter().collect();
215 actual_set == *expected
216}
217
/// Reports whether every string in `required` occurs somewhere in `text`.
///
/// This is a plain substring check; the entries are not parsed as markers.
/// An empty `required` slice yields `true`.
pub fn all_markers_present(text: &str, required: &[String]) -> bool {
    for needle in required {
        if !text.contains(needle.as_str()) {
            return false;
        }
    }
    true
}
221
#[cfg(test)]
mod tests {
    use super::*;

    /// Short markers and legacy `id`-attribute markers both contribute ids, in order.
    #[test]
    fn marker_ids_include_short_and_legacy_markers() {
        let ids =
            marker_ids_in_text(r#"A <m1>bold <r1/> text</m1> and <m id="m000000_000">old</m>."#);

        assert_eq!(ids, vec!["m1", "r1", "m000000_000"]);
    }

    /// Each short token form is parsed with the right name, id and byte length.
    #[test]
    fn parses_short_marker_tokens() {
        let open = parse_paired_marker_open("<m12>text</m12>").expect("short paired marker");
        assert_eq!(open.tag_name, "m12");
        assert_eq!(open.id, "m12");
        assert_eq!(open.len, "<m12>".len());

        let empty = parse_empty_marker("<r3/>tail").expect("short empty marker");
        assert_eq!(empty.id, "r3");
        assert_eq!(empty.len, "<r3/>".len());

        let close = parse_marker_close("</m12>").expect("short close marker");
        assert_eq!(close.tag_name, "m12");
        assert_eq!(close.len, "</m12>".len());
    }

    #[test]
    fn strips_short_and_legacy_marker_tokens() {
        let stripped = strip_marker_tokens(
            r#"Hello <m1>wide <ref id="r000000_000"/> world</m1> and <m id="m000000_000">old</m>."#,
        );

        // Only the tokens are removed: the spaces on both sides of the `<ref .../>`
        // token survive, leaving two consecutive spaces between "wide" and "world".
        assert_eq!(stripped, "Hello wide  world and old.");
    }

    /// A `<` that does not start a marker is copied through untouched.
    #[test]
    fn strip_keeps_non_marker_angle_brackets() {
        let stripped = strip_marker_tokens("a < b <i>c</i> <m2>d</m2>");

        assert_eq!(stripped, "a < b <i>c</i> d");
    }

    /// Only input that is exactly one marker token (after trimming) qualifies.
    #[test]
    fn recognizes_whole_marker_tokens_only() {
        assert!(is_marker_token(" <m1> "));
        assert!(is_marker_token("</keep>"));
        assert!(is_marker_token(r#"<ref id="r1"/>"#));

        assert!(!is_marker_token("<m1>text"));
        assert!(!is_marker_token("<b>"));
        assert!(!is_marker_token(""));
    }
}