1use std::collections::HashSet;
2
/// Opening half of a paired inline marker, as returned by
/// [`parse_paired_marker_open`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PairedMarkerOpen {
    /// Tag name as written: `m` or `keep` for attribute-style tags, or the
    /// whole short name (for example `m12`) for short tags.
    pub tag_name: String,
    /// Marker id: the quoted `id` attribute value for attribute-style tags,
    /// or the short name itself for short tags.
    pub id: String,
    /// Byte length of the opening tag, from `<` through `>` inclusive.
    pub len: usize,
}
9
/// Self-closing inline marker, as returned by [`parse_empty_marker`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct EmptyMarker {
    /// Marker id: the quoted `id` attribute value for attribute-style tags,
    /// or the short name (for example `r3`) for short tags.
    pub id: String,
    /// Byte length of the tag, from `<` through `>` inclusive.
    pub len: usize,
}
15
/// Closing half of a paired inline marker, as returned by
/// [`parse_marker_close`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MarkerClose {
    /// Tag name found between `</` and `>`.
    pub tag_name: String,
    /// Byte length of the closing tag, from `<` through `>` inclusive.
    pub len: usize,
}
21
22pub fn marker_ids_in_text(text: &str) -> Vec<String> {
23 let mut ids = Vec::new();
24 let mut rest = text;
25
26 while let Some(index) = rest.find('<') {
27 let tag = &rest[index..];
28 if let Some(open) = parse_paired_marker_open(tag) {
29 ids.push(open.id);
30 rest = &tag[open.len..];
31 } else if let Some(empty) = parse_empty_marker(tag) {
32 ids.push(empty.id);
33 rest = &tag[empty.len..];
34 } else if let Some(close) = parse_marker_close(tag) {
35 rest = &tag[close.len..];
36 } else {
37 rest = &tag[1..];
38 }
39 }
40
41 ids
42}
43
44pub fn marker_structure_error(text: &str) -> Option<String> {
49 let mut stack = Vec::<PairedMarkerOpen>::new();
50 let mut rest = text;
51
52 while let Some(index) = rest.find('<') {
53 let tag = &rest[index..];
54 if let Some(open) = parse_paired_marker_open(tag) {
55 let len = open.len;
56 stack.push(open);
57 rest = &tag[len..];
58 } else if let Some(empty) = parse_empty_marker(tag) {
59 rest = &tag[empty.len..];
60 } else if let Some(close) = parse_marker_close(tag) {
61 let Some(open) = stack.pop() else {
62 return Some(format!(
63 "unexpected inline marker close </{}>",
64 close.tag_name
65 ));
66 };
67 if open.tag_name != close.tag_name {
68 return Some(format!(
69 "inline marker <{}> is closed by </{}>",
70 open.tag_name, close.tag_name
71 ));
72 }
73 rest = &tag[close.len..];
74 } else {
75 rest = &tag[1..];
76 }
77 }
78
79 stack.last().map(|open| {
80 format!(
81 "inline marker <{}> is missing closing tag </{}>",
82 open.tag_name, open.tag_name
83 )
84 })
85}
86
87pub fn extract_marker_id(tag: &str) -> Option<String> {
88 extract_marker_id_attr(tag).or_else(|| short_marker_name(tag).map(ToString::to_string))
89}
90
/// Extracts the quoted value of the `id` attribute from a marker tag.
///
/// The attribute must begin at the start of `tag` or directly after a
/// whitespace character, so attributes whose names merely end in `id`
/// (`uid=`, `data-id=`, `valid=`) are not mistaken for it. The value must be
/// wrapped in a matching pair of single or double quotes.
///
/// Returns `None` when there is no such attribute, when its value is not
/// quoted, or when the closing quote is missing.
fn extract_marker_id_attr(tag: &str) -> Option<String> {
    const ATTR: &str = "id=";

    // A plain substring search would also hit `data-id=` or `valid=`; only
    // accept an occurrence that sits on an attribute-name boundary.
    let (attr_start, _) = tag.match_indices(ATTR).find(|&(pos, _)| {
        let before = &tag[..pos];
        before.is_empty() || before.ends_with(char::is_whitespace)
    })?;

    let after_eq = &tag[attr_start + ATTR.len()..];
    let quote = after_eq
        .chars()
        .next()
        .filter(|ch| matches!(*ch, '"' | '\''))?;
    let value = &after_eq[quote.len_utf8()..];
    // The value ends at the next occurrence of the same quote character.
    let value_len = value.find(quote)?;
    Some(value[..value_len].to_string())
}
101
102pub fn parse_paired_marker_open(text: &str) -> Option<PairedMarkerOpen> {
103 if !text.starts_with('<') {
104 return None;
105 }
106 for tag_name in ["m", "keep"] {
107 let prefix = format!("<{tag_name} ");
108 if !text.starts_with(&prefix) {
109 continue;
110 }
111 let open_end = text.find('>')?;
112 if text[..open_end].ends_with('/') {
113 return None;
114 }
115 let id = extract_marker_id_attr(&text[..=open_end])?;
116 return Some(PairedMarkerOpen {
117 tag_name: tag_name.to_string(),
118 id,
119 len: open_end + 1,
120 });
121 }
122
123 let open_end = text.find('>')?;
124 if open_end == 0 {
125 return None;
126 }
127 if text[..open_end].ends_with('/') {
128 return None;
129 }
130 let name = &text[1..open_end];
131 if is_short_paired_marker_name(name) {
132 return Some(PairedMarkerOpen {
133 tag_name: name.to_string(),
134 id: name.to_string(),
135 len: open_end + 1,
136 });
137 }
138
139 None
140}
141
142pub fn parse_empty_marker(text: &str) -> Option<EmptyMarker> {
143 if !text.starts_with('<') {
144 return None;
145 }
146 for tag_name in ["ref", "m", "keep"] {
147 let prefix = format!("<{tag_name} ");
148 if !text.starts_with(&prefix) {
149 continue;
150 }
151 let end = text.find('>')?;
152 let tag = &text[..=end];
153 if !tag.ends_with("/>") {
154 return None;
155 }
156 let id = extract_marker_id_attr(tag)?;
157 return Some(EmptyMarker { id, len: end + 1 });
158 }
159
160 let end = text.find('>')?;
161 if end < 2 {
162 return None;
163 }
164 let tag = &text[..=end];
165 if !tag.ends_with("/>") {
166 return None;
167 }
168 let name = &text[1..end - 1];
169 if is_short_empty_marker_name(name) || is_short_paired_marker_name(name) {
170 return Some(EmptyMarker {
171 id: name.to_string(),
172 len: end + 1,
173 });
174 }
175
176 None
177}
178
179pub fn parse_marker_close(text: &str) -> Option<MarkerClose> {
180 if !text.starts_with("</") {
181 return None;
182 }
183 for tag_name in ["m", "keep"] {
184 let close = format!("</{tag_name}>");
185 if text.starts_with(&close) {
186 return Some(MarkerClose {
187 tag_name: tag_name.to_string(),
188 len: close.len(),
189 });
190 }
191 }
192
193 let end = text.find('>')?;
194 let name = &text[2..end];
195 if is_short_paired_marker_name(name) {
196 return Some(MarkerClose {
197 tag_name: name.to_string(),
198 len: end + 1,
199 });
200 }
201
202 None
203}
204
205pub fn is_marker_token(text: &str) -> bool {
206 let text = text.trim();
207 parse_paired_marker_open(text).is_some_and(|marker| marker.len == text.len())
208 || parse_empty_marker(text).is_some_and(|marker| marker.len == text.len())
209 || parse_marker_close(text).is_some_and(|marker| marker.len == text.len())
210}
211
212pub fn strip_marker_tokens(text: &str) -> String {
213 let mut output = String::new();
214 let mut rest = text;
215
216 while let Some(index) = rest.find('<') {
217 output.push_str(&rest[..index]);
218 let tag = &rest[index..];
219
220 if let Some(open) = parse_paired_marker_open(tag) {
221 rest = &tag[open.len..];
222 } else if let Some(empty) = parse_empty_marker(tag) {
223 rest = &tag[empty.len..];
224 } else if let Some(close) = parse_marker_close(tag) {
225 rest = &tag[close.len..];
226 } else {
227 output.push('<');
228 rest = &tag[1..];
229 }
230 }
231
232 output.push_str(rest);
233 output
234}
235
236fn short_marker_name(tag: &str) -> Option<&str> {
237 if let Some(open) = tag.strip_prefix("</") {
238 let name = open.strip_suffix('>')?;
239 return is_short_paired_marker_name(name).then_some(name);
240 }
241 let body = tag.strip_prefix('<')?.strip_suffix('>')?;
242 let name = body.strip_suffix('/').unwrap_or(body);
243 (is_short_paired_marker_name(name) || is_short_empty_marker_name(name)).then_some(name)
244}
245
/// Returns `true` when `name` is `m` followed by one or more ASCII digits
/// (for example `m1` or `m042`).
fn is_short_paired_marker_name(name: &str) -> bool {
    match name.as_bytes() {
        [b'm', digits @ ..] => !digits.is_empty() && digits.iter().all(u8::is_ascii_digit),
        _ => false,
    }
}
250
/// Returns `true` when `name` is `r` followed by one or more ASCII digits
/// (for example `r1` or `r042`).
fn is_short_empty_marker_name(name: &str) -> bool {
    match name.as_bytes() {
        [b'r', digits @ ..] => !digits.is_empty() && digits.iter().all(u8::is_ascii_digit),
        _ => false,
    }
}
255
256pub fn has_markers_in_expected_set(text: &str, expected: &HashSet<String>) -> bool {
257 let actual_set: HashSet<String> = marker_ids_in_text(text).into_iter().collect();
258 actual_set == *expected
259}
260
/// Returns `true` when every string in `required` occurs somewhere in `text`.
///
/// This is a plain substring check; `text` is not parsed into marker tokens,
/// so `m1` is considered present in `<m12>`. An empty `required` slice
/// yields `true`.
pub fn all_markers_present(text: &str, required: &[String]) -> bool {
    for needle in required {
        if !text.contains(needle.as_str()) {
            return false;
        }
    }
    true
}
264
#[cfg(test)]
mod tests {
    use super::*;

    /// Ids are reported in order of appearance for both short and
    /// attribute-style markers; closing tags contribute nothing.
    #[test]
    fn marker_ids_include_short_and_legacy_markers() {
        let ids =
            marker_ids_in_text(r#"A <m1>bold <r1/> text</m1> and <m id="m000000_000">old</m>."#);

        assert_eq!(ids, vec!["m1", "r1", "m000000_000"]);
    }

    /// Each short form parses, and `len` covers only the tag itself.
    #[test]
    fn parses_short_marker_tokens() {
        let open = parse_paired_marker_open("<m12>text</m12>").expect("short paired marker");
        assert_eq!(open.tag_name, "m12");
        assert_eq!(open.id, "m12");
        assert_eq!(open.len, "<m12>".len());

        let empty = parse_empty_marker("<r3/>tail").expect("short empty marker");
        assert_eq!(empty.id, "r3");
        assert_eq!(empty.len, "<r3/>".len());

        let close = parse_marker_close("</m12>").expect("short close marker");
        assert_eq!(close.tag_name, "m12");
        assert_eq!(close.len, "</m12>".len());
    }

    #[test]
    fn strips_short_and_legacy_marker_tokens() {
        let stripped = strip_marker_tokens(
            r#"Hello <m1>wide <ref id="r000000_000"/> world</m1> and <m id="m000000_000">old</m>."#,
        );

        // Stripping removes only the tokens: the space on each side of the
        // removed `<ref .../>` survives, leaving two spaces after "wide".
        assert_eq!(stripped, "Hello wide  world and old.");
    }

    #[test]
    fn marker_structure_accepts_balanced_nested_and_empty_markers() {
        assert_eq!(
            marker_structure_error("<m1>outer <m2>inner</m2><r1/></m1>"),
            None
        );
        assert_eq!(marker_structure_error(r#"<m id="legacy">text</m>"#), None);
    }

    #[test]
    fn marker_structure_rejects_missing_mismatched_and_orphan_closes() {
        assert!(
            marker_structure_error("<m1>text")
                .expect("missing close should fail")
                .contains("missing closing tag")
        );
        assert!(
            marker_structure_error("<m1><m2>text</m1></m2>")
                .expect("mis-nesting should fail")
                .contains("closed by")
        );
        assert!(
            marker_structure_error("text</m1>")
                .expect("orphan close should fail")
                .contains("unexpected")
        );
    }
}