Skip to main content

bookforge_core/
marker.rs

1use std::collections::HashSet;
2
/// Opening tag of a paired inline marker, as recognised by
/// `parse_paired_marker_open`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct PairedMarkerOpen {
    /// Tag name: `m` or `keep` for the attribute form (`<m id="…">`), or the
    /// short name itself (for example `m12`) for the short form (`<m12>`).
    pub tag_name: String,
    /// Marker id: the value of the `id` attribute, or the short name.
    pub id: String,
    /// Length in bytes of the opening tag, from `<` through the first `>`.
    pub len: usize,
}
9
/// Self-closing inline marker (`<r3/>`, `<ref id="…"/>`, …), as recognised by
/// `parse_empty_marker`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct EmptyMarker {
    /// Marker id: the value of the `id` attribute, or the short name.
    pub id: String,
    /// Length in bytes of the whole tag, from `<` through the closing `>`.
    pub len: usize,
}
15
/// Closing tag of a paired inline marker (`</m12>`, `</m>`, `</keep>`), as
/// recognised by `parse_marker_close`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct MarkerClose {
    /// Name found between `</` and `>`.
    pub tag_name: String,
    /// Length in bytes of the closing tag, from `<` through `>`.
    pub len: usize,
}
21
22pub fn marker_ids_in_text(text: &str) -> Vec<String> {
23    let mut ids = Vec::new();
24    let mut rest = text;
25
26    while let Some(index) = rest.find('<') {
27        let tag = &rest[index..];
28        if let Some(open) = parse_paired_marker_open(tag) {
29            ids.push(open.id);
30            rest = &tag[open.len..];
31        } else if let Some(empty) = parse_empty_marker(tag) {
32            ids.push(empty.id);
33            rest = &tag[empty.len..];
34        } else if let Some(close) = parse_marker_close(tag) {
35            rest = &tag[close.len..];
36        } else {
37            rest = &tag[1..];
38        }
39    }
40
41    ids
42}
43
44/// Return a deterministic error when paired inline markers are unbalanced,
45/// mis-nested, or closed by the wrong marker tag. ID-presence checks alone
46/// cannot detect `<m1>text` with a missing `</m1>`, which is valid prose JSON
47/// but cannot be reassembled into the original inline structure.
48pub fn marker_structure_error(text: &str) -> Option<String> {
49    let mut stack = Vec::<PairedMarkerOpen>::new();
50    let mut rest = text;
51
52    while let Some(index) = rest.find('<') {
53        let tag = &rest[index..];
54        if let Some(open) = parse_paired_marker_open(tag) {
55            let len = open.len;
56            stack.push(open);
57            rest = &tag[len..];
58        } else if let Some(empty) = parse_empty_marker(tag) {
59            rest = &tag[empty.len..];
60        } else if let Some(close) = parse_marker_close(tag) {
61            let Some(open) = stack.pop() else {
62                return Some(format!(
63                    "unexpected inline marker close </{}>",
64                    close.tag_name
65                ));
66            };
67            if open.tag_name != close.tag_name {
68                return Some(format!(
69                    "inline marker <{}> is closed by </{}>",
70                    open.tag_name, close.tag_name
71                ));
72            }
73            rest = &tag[close.len..];
74        } else {
75            rest = &tag[1..];
76        }
77    }
78
79    stack.last().map(|open| {
80        format!(
81            "inline marker <{}> is missing closing tag </{}>",
82            open.tag_name, open.tag_name
83        )
84    })
85}
86
87pub fn extract_marker_id(tag: &str) -> Option<String> {
88    extract_marker_id_attr(tag).or_else(|| short_marker_name(tag).map(ToString::to_string))
89}
90
/// Extracts the value of the quoted `id` attribute from `tag`.
///
/// The attribute is located by searching for `id=` that is not the tail of a
/// longer attribute name, so `grid="…"`, `data-id="…"` or `valid="…"` are not
/// mistaken for the id. The value must be wrapped in matching single or
/// double quotes.
///
/// Returns `None` when there is no such attribute, when the value is not
/// quoted, or when the closing quote is missing.
fn extract_marker_id_attr(tag: &str) -> Option<String> {
    const KEY: &str = "id=";
    // Characters that may precede `id=` only if it is part of a longer name.
    let continues_attr_name =
        |ch: char| ch.is_alphanumeric() || matches!(ch, '-' | '_' | ':' | '.');

    let key_start = tag
        .match_indices(KEY)
        .map(|(index, _)| index)
        .find(|&index| {
            !tag[..index]
                .chars()
                .next_back()
                .is_some_and(continues_attr_name)
        })?;

    let mut after_key = tag[key_start + KEY.len()..].chars();
    let quote = after_key.next().filter(|&ch| ch == '"' || ch == '\'')?;
    // The value runs up to the next occurrence of the same quote character.
    let value = after_key.as_str();
    let value_len = value.find(quote)?;
    Some(value[..value_len].to_string())
}
101
102pub fn parse_paired_marker_open(text: &str) -> Option<PairedMarkerOpen> {
103    if !text.starts_with('<') {
104        return None;
105    }
106    for tag_name in ["m", "keep"] {
107        let prefix = format!("<{tag_name} ");
108        if !text.starts_with(&prefix) {
109            continue;
110        }
111        let open_end = text.find('>')?;
112        if text[..open_end].ends_with('/') {
113            return None;
114        }
115        let id = extract_marker_id_attr(&text[..=open_end])?;
116        return Some(PairedMarkerOpen {
117            tag_name: tag_name.to_string(),
118            id,
119            len: open_end + 1,
120        });
121    }
122
123    let open_end = text.find('>')?;
124    if open_end == 0 {
125        return None;
126    }
127    if text[..open_end].ends_with('/') {
128        return None;
129    }
130    let name = &text[1..open_end];
131    if is_short_paired_marker_name(name) {
132        return Some(PairedMarkerOpen {
133            tag_name: name.to_string(),
134            id: name.to_string(),
135            len: open_end + 1,
136        });
137    }
138
139    None
140}
141
142pub fn parse_empty_marker(text: &str) -> Option<EmptyMarker> {
143    if !text.starts_with('<') {
144        return None;
145    }
146    for tag_name in ["ref", "m", "keep"] {
147        let prefix = format!("<{tag_name} ");
148        if !text.starts_with(&prefix) {
149            continue;
150        }
151        let end = text.find('>')?;
152        let tag = &text[..=end];
153        if !tag.ends_with("/>") {
154            return None;
155        }
156        let id = extract_marker_id_attr(tag)?;
157        return Some(EmptyMarker { id, len: end + 1 });
158    }
159
160    let end = text.find('>')?;
161    if end < 2 {
162        return None;
163    }
164    let tag = &text[..=end];
165    if !tag.ends_with("/>") {
166        return None;
167    }
168    let name = &text[1..end - 1];
169    if is_short_empty_marker_name(name) || is_short_paired_marker_name(name) {
170        return Some(EmptyMarker {
171            id: name.to_string(),
172            len: end + 1,
173        });
174    }
175
176    None
177}
178
179pub fn parse_marker_close(text: &str) -> Option<MarkerClose> {
180    if !text.starts_with("</") {
181        return None;
182    }
183    for tag_name in ["m", "keep"] {
184        let close = format!("</{tag_name}>");
185        if text.starts_with(&close) {
186            return Some(MarkerClose {
187                tag_name: tag_name.to_string(),
188                len: close.len(),
189            });
190        }
191    }
192
193    let end = text.find('>')?;
194    let name = &text[2..end];
195    if is_short_paired_marker_name(name) {
196        return Some(MarkerClose {
197            tag_name: name.to_string(),
198            len: end + 1,
199        });
200    }
201
202    None
203}
204
205pub fn is_marker_token(text: &str) -> bool {
206    let text = text.trim();
207    parse_paired_marker_open(text).is_some_and(|marker| marker.len == text.len())
208        || parse_empty_marker(text).is_some_and(|marker| marker.len == text.len())
209        || parse_marker_close(text).is_some_and(|marker| marker.len == text.len())
210}
211
212pub fn strip_marker_tokens(text: &str) -> String {
213    let mut output = String::new();
214    let mut rest = text;
215
216    while let Some(index) = rest.find('<') {
217        output.push_str(&rest[..index]);
218        let tag = &rest[index..];
219
220        if let Some(open) = parse_paired_marker_open(tag) {
221            rest = &tag[open.len..];
222        } else if let Some(empty) = parse_empty_marker(tag) {
223            rest = &tag[empty.len..];
224        } else if let Some(close) = parse_marker_close(tag) {
225            rest = &tag[close.len..];
226        } else {
227            output.push('<');
228            rest = &tag[1..];
229        }
230    }
231
232    output.push_str(rest);
233    output
234}
235
236fn short_marker_name(tag: &str) -> Option<&str> {
237    if let Some(open) = tag.strip_prefix("</") {
238        let name = open.strip_suffix('>')?;
239        return is_short_paired_marker_name(name).then_some(name);
240    }
241    let body = tag.strip_prefix('<')?.strip_suffix('>')?;
242    let name = body.strip_suffix('/').unwrap_or(body);
243    (is_short_paired_marker_name(name) || is_short_empty_marker_name(name)).then_some(name)
244}
245
/// Returns `true` for short paired marker names: `m` followed by one or more
/// ASCII digits (`m1`, `m42`, …).
fn is_short_paired_marker_name(name: &str) -> bool {
    match name.strip_prefix('m') {
        Some(digits) => !digits.is_empty() && digits.bytes().all(|byte| byte.is_ascii_digit()),
        None => false,
    }
}
250
/// Returns `true` for short empty marker names: `r` followed by one or more
/// ASCII digits (`r1`, `r42`, …).
fn is_short_empty_marker_name(name: &str) -> bool {
    let Some(digits) = name.strip_prefix('r') else {
        return false;
    };
    !digits.is_empty() && digits.bytes().all(|byte| byte.is_ascii_digit())
}
255
256pub fn has_markers_in_expected_set(text: &str, expected: &HashSet<String>) -> bool {
257    let actual_set: HashSet<String> = marker_ids_in_text(text).into_iter().collect();
258    actual_set == *expected
259}
260
/// Returns `true` when every string in `required` occurs somewhere in `text`.
///
/// This is a plain substring test, not a marker parse: an entry such as `m1`
/// is also considered present in text that only contains `<m12>`. An empty
/// `required` slice always yields `true`.
pub fn all_markers_present(text: &str, required: &[String]) -> bool {
    for marker in required {
        if !text.contains(marker.as_str()) {
            return false;
        }
    }
    true
}
264
#[cfg(test)]
mod tests {
    use super::*;

    /// Short and attribute-form ids are reported in the order they appear.
    #[test]
    fn marker_ids_include_short_and_legacy_markers() {
        let input = r#"A <m1>bold <r1/> text</m1> and <m id="m000000_000">old</m>."#;

        assert_eq!(marker_ids_in_text(input), ["m1", "r1", "m000000_000"]);
    }

    /// Each parser reports the name, id and byte length of a short marker.
    #[test]
    fn parses_short_marker_tokens() {
        let PairedMarkerOpen { tag_name, id, len } =
            parse_paired_marker_open("<m12>text</m12>").expect("short paired marker");
        assert_eq!(tag_name, "m12");
        assert_eq!(id, "m12");
        assert_eq!(len, "<m12>".len());

        let EmptyMarker { id, len } = parse_empty_marker("<r3/>tail").expect("short empty marker");
        assert_eq!(id, "r3");
        assert_eq!(len, "<r3/>".len());

        let MarkerClose { tag_name, len } =
            parse_marker_close("</m12>").expect("short close marker");
        assert_eq!(tag_name, "m12");
        assert_eq!(len, "</m12>".len());
    }

    /// Marker tags disappear while the surrounding text is kept verbatim.
    #[test]
    fn strips_short_and_legacy_marker_tokens() {
        let input =
            r#"Hello <m1>wide <ref id="r000000_000"/> world</m1> and <m id="m000000_000">old</m>."#;

        assert_eq!(strip_marker_tokens(input), "Hello wide  world and old.");
    }

    /// Balanced input, including nesting and self-closing markers, has no error.
    #[test]
    fn marker_structure_accepts_balanced_nested_and_empty_markers() {
        let balanced_inputs = [
            "<m1>outer <m2>inner</m2><r1/></m1>",
            r#"<m id="legacy">text</m>"#,
        ];

        for input in balanced_inputs {
            assert_eq!(marker_structure_error(input), None);
        }
    }

    /// Each kind of structural damage produces its own error message.
    #[test]
    fn marker_structure_rejects_missing_mismatched_and_orphan_closes() {
        // (input, message used if no error is reported, expected fragment)
        let cases = [
            ("<m1>text", "missing close should fail", "missing closing tag"),
            ("<m1><m2>text</m1></m2>", "mis-nesting should fail", "closed by"),
            ("text</m1>", "orphan close should fail", "unexpected"),
        ];

        for (input, why, fragment) in cases {
            let message = marker_structure_error(input).expect(why);
            assert!(message.contains(fragment));
        }
    }
}
329}