Skip to main content

bookforge_core/
marker.rs

1use std::collections::HashSet;
2
/// An opening tag of a paired marker, as recognised by
/// `parse_paired_marker_open`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PairedMarkerOpen {
    /// Tag name: `m` / `keep` for the attribute form, or the short name itself
    /// (for example `m12`) for the short form.
    pub tag_name: String,
    /// Marker id: the `id` attribute value for the attribute form, or the
    /// short name for the short form.
    pub id: String,
    /// Length in bytes of the opening tag, from `<` through `>` inclusive.
    pub len: usize,
}
9
/// A self-closing (`…/>`) marker, as recognised by `parse_empty_marker`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct EmptyMarker {
    /// Marker id: the `id` attribute value for the attribute form, or the
    /// short name (for example `r3`) for the short form.
    pub id: String,
    /// Length in bytes of the tag, from `<` through `>` inclusive.
    pub len: usize,
}
15
/// A closing tag of a paired marker, as recognised by `parse_marker_close`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MarkerClose {
    /// Tag name: `m` / `keep`, or a short paired name such as `m12`.
    pub tag_name: String,
    /// Length in bytes of the closing tag, from `</` through `>` inclusive.
    pub len: usize,
}
21
22pub fn marker_ids_in_text(text: &str) -> Vec<String> {
23    let mut ids = Vec::new();
24    let mut rest = text;
25
26    while let Some(index) = rest.find('<') {
27        let tag = &rest[index..];
28        if let Some(open) = parse_paired_marker_open(tag) {
29            ids.push(open.id);
30            rest = &tag[open.len..];
31        } else if let Some(empty) = parse_empty_marker(tag) {
32            ids.push(empty.id);
33            rest = &tag[empty.len..];
34        } else if let Some(close) = parse_marker_close(tag) {
35            rest = &tag[close.len..];
36        } else {
37            rest = &tag[1..];
38        }
39    }
40
41    ids
42}
43
44pub fn extract_marker_id(tag: &str) -> Option<String> {
45    extract_marker_id_attr(tag).or_else(|| short_marker_name(tag).map(ToString::to_string))
46}
47
/// Extracts the value of the quoted `id` attribute from `tag`.
///
/// The attribute must be written as `id="…"` or `id='…'`, and `id=` must begin
/// at an attribute boundary: at the start of `tag`, or after a character that
/// cannot be part of an attribute name. This stops attributes whose name merely
/// ends in `id` (`grid=`, `data-id=`, `xml:id=`) from being mistaken for it.
///
/// Returns `None` when there is no such attribute, when its value is not
/// quoted, or when the closing quote is missing.
///
/// This is a lightweight scan, not a full attribute parser: an `id=` that
/// appears after whitespace inside another attribute's quoted value is still
/// matched.
fn extract_marker_id_attr(tag: &str) -> Option<String> {
    const ATTR: &str = "id=";

    // Characters that may continue an attribute name; `id=` directly after one
    // of these is the tail of a longer name, not the `id` attribute itself.
    let continues_name = |ch: char| ch.is_alphanumeric() || matches!(ch, '_' | '-' | ':' | '.');

    let attr_start = tag
        .match_indices(ATTR)
        .map(|(index, _)| index)
        .find(|&index| !tag[..index].chars().next_back().is_some_and(continues_name))?;

    let mut after_eq = tag[attr_start + ATTR.len()..].chars();
    let quote = after_eq.next().filter(|&ch| ch == '"' || ch == '\'')?;

    // `as_str` yields everything after the opening quote.
    let value_and_rest = after_eq.as_str();
    let value_len = value_and_rest.find(quote)?;
    Some(value_and_rest[..value_len].to_string())
}
58
59pub fn parse_paired_marker_open(text: &str) -> Option<PairedMarkerOpen> {
60    if !text.starts_with('<') {
61        return None;
62    }
63    for tag_name in ["m", "keep"] {
64        let prefix = format!("<{tag_name} ");
65        if !text.starts_with(&prefix) {
66            continue;
67        }
68        let open_end = text.find('>')?;
69        if text[..open_end].ends_with('/') {
70            return None;
71        }
72        let id = extract_marker_id_attr(&text[..=open_end])?;
73        return Some(PairedMarkerOpen {
74            tag_name: tag_name.to_string(),
75            id,
76            len: open_end + 1,
77        });
78    }
79
80    let open_end = text.find('>')?;
81    if open_end == 0 {
82        return None;
83    }
84    if text[..open_end].ends_with('/') {
85        return None;
86    }
87    let name = &text[1..open_end];
88    if is_short_paired_marker_name(name) {
89        return Some(PairedMarkerOpen {
90            tag_name: name.to_string(),
91            id: name.to_string(),
92            len: open_end + 1,
93        });
94    }
95
96    None
97}
98
99pub fn parse_empty_marker(text: &str) -> Option<EmptyMarker> {
100    if !text.starts_with('<') {
101        return None;
102    }
103    for tag_name in ["ref", "m", "keep"] {
104        let prefix = format!("<{tag_name} ");
105        if !text.starts_with(&prefix) {
106            continue;
107        }
108        let end = text.find('>')?;
109        let tag = &text[..=end];
110        if !tag.ends_with("/>") {
111            return None;
112        }
113        let id = extract_marker_id_attr(tag)?;
114        return Some(EmptyMarker { id, len: end + 1 });
115    }
116
117    let end = text.find('>')?;
118    if end < 2 {
119        return None;
120    }
121    let tag = &text[..=end];
122    if !tag.ends_with("/>") {
123        return None;
124    }
125    let name = &text[1..end - 1];
126    if is_short_empty_marker_name(name) || is_short_paired_marker_name(name) {
127        return Some(EmptyMarker {
128            id: name.to_string(),
129            len: end + 1,
130        });
131    }
132
133    None
134}
135
136pub fn parse_marker_close(text: &str) -> Option<MarkerClose> {
137    if !text.starts_with("</") {
138        return None;
139    }
140    for tag_name in ["m", "keep"] {
141        let close = format!("</{tag_name}>");
142        if text.starts_with(&close) {
143            return Some(MarkerClose {
144                tag_name: tag_name.to_string(),
145                len: close.len(),
146            });
147        }
148    }
149
150    let end = text.find('>')?;
151    let name = &text[2..end];
152    if is_short_paired_marker_name(name) {
153        return Some(MarkerClose {
154            tag_name: name.to_string(),
155            len: end + 1,
156        });
157    }
158
159    None
160}
161
162pub fn is_marker_token(text: &str) -> bool {
163    let text = text.trim();
164    parse_paired_marker_open(text).is_some_and(|marker| marker.len == text.len())
165        || parse_empty_marker(text).is_some_and(|marker| marker.len == text.len())
166        || parse_marker_close(text).is_some_and(|marker| marker.len == text.len())
167}
168
169pub fn strip_marker_tokens(text: &str) -> String {
170    let mut output = String::new();
171    let mut rest = text;
172
173    while let Some(index) = rest.find('<') {
174        output.push_str(&rest[..index]);
175        let tag = &rest[index..];
176
177        if let Some(open) = parse_paired_marker_open(tag) {
178            rest = &tag[open.len..];
179        } else if let Some(empty) = parse_empty_marker(tag) {
180            rest = &tag[empty.len..];
181        } else if let Some(close) = parse_marker_close(tag) {
182            rest = &tag[close.len..];
183        } else {
184            output.push('<');
185            rest = &tag[1..];
186        }
187    }
188
189    output.push_str(rest);
190    output
191}
192
193fn short_marker_name(tag: &str) -> Option<&str> {
194    if let Some(open) = tag.strip_prefix("</") {
195        let name = open.strip_suffix('>')?;
196        return is_short_paired_marker_name(name).then_some(name);
197    }
198    let body = tag.strip_prefix('<')?.strip_suffix('>')?;
199    let name = body.strip_suffix('/').unwrap_or(body);
200    (is_short_paired_marker_name(name) || is_short_empty_marker_name(name)).then_some(name)
201}
202
/// Returns `true` when `name` is `m` followed by one or more ASCII digits
/// (for example `m1` or `m042`).
fn is_short_paired_marker_name(name: &str) -> bool {
    match name.strip_prefix('m') {
        Some(digits) if !digits.is_empty() => digits.bytes().all(|byte| byte.is_ascii_digit()),
        _ => false,
    }
}
207
/// Returns `true` when `name` is `r` followed by one or more ASCII digits
/// (for example `r1` or `r042`).
fn is_short_empty_marker_name(name: &str) -> bool {
    match name.strip_prefix('r') {
        Some(digits) if !digits.is_empty() => digits.bytes().all(|byte| byte.is_ascii_digit()),
        _ => false,
    }
}
212
213pub fn has_markers_in_expected_set(text: &str, expected: &HashSet<String>) -> bool {
214    let actual_set: HashSet<String> = marker_ids_in_text(text).into_iter().collect();
215    actual_set == *expected
216}
217
/// Reports whether every entry of `required` occurs in `text`.
///
/// Each entry is matched as a plain substring; `text` is not parsed for
/// markers. An empty `required` slice yields `true`.
pub fn all_markers_present(text: &str, required: &[String]) -> bool {
    for marker in required {
        if !text.contains(marker.as_str()) {
            return false;
        }
    }
    true
}
221
#[cfg(test)]
mod tests {
    use super::*;

    /// Short and attribute-form markers both yield ids, in document order;
    /// closing tags yield none.
    #[test]
    fn marker_ids_include_short_and_legacy_markers() {
        let input = r#"A <m1>bold <r1/> text</m1> and <m id="m000000_000">old</m>."#;

        let ids = marker_ids_in_text(input);

        assert_eq!(ids, vec!["m1", "r1", "m000000_000"]);
    }

    /// Each short-form parser consumes exactly its own tag and nothing more.
    #[test]
    fn parses_short_marker_tokens() {
        let open = parse_paired_marker_open("<m12>text</m12>").expect("short paired marker");
        assert_eq!(
            open,
            PairedMarkerOpen {
                tag_name: "m12".to_string(),
                id: "m12".to_string(),
                len: "<m12>".len(),
            }
        );

        let empty = parse_empty_marker("<r3/>tail").expect("short empty marker");
        assert_eq!(
            empty,
            EmptyMarker {
                id: "r3".to_string(),
                len: "<r3/>".len(),
            }
        );

        let close = parse_marker_close("</m12>").expect("short close marker");
        assert_eq!(
            close,
            MarkerClose {
                tag_name: "m12".to_string(),
                len: "</m12>".len(),
            }
        );
    }

    /// Stripping removes the tags but leaves the text around them untouched,
    /// including the two spaces that surrounded the removed `<ref …/>`.
    #[test]
    fn strips_short_and_legacy_marker_tokens() {
        let input =
            r#"Hello <m1>wide <ref id="r000000_000"/> world</m1> and <m id="m000000_000">old</m>."#;

        let stripped = strip_marker_tokens(input);

        assert_eq!(stripped, "Hello wide  world and old.");
    }
}
258}