braze_sync/values/
correlation.rs

1//! Remote-body correlation primitives for `export` (RFC §2.5).
2//!
3//! These functions inspect a *remote* body (HTML, plaintext, subject,
4//! preheader) and return the per-occurrence lid / cb_id values together
5//! with the anchor used to correlate them back to the values file
6//! entries.
7//!
8//! - HTML lid: anchor = the URL of the immediately-preceding
9//!   `<a href="...">`. Multiple `<a>`s with the same URL fall back to
10//!   appearance order (RFC §2.5 "Key 対応の曖昧性").
11//! - Plaintext lid: anchor = the raw URL (`https?://…`) immediately
12//!   preceding the `| lid: '…'` token; trailing punctuation is trimmed
13//!   (RFC §5 Edge case for `]`/`)` etc.).
14//! - subject / preheader lid: anchor = adjacent Liquid identifiers
15//!   inside the same `{{…}}` block. Phase 3 first cut covers the URL
16//!   variants; the anchor-only variant is supported by carrying the
17//!   anchor string verbatim from the existing values entry.
18//! - cb_id: anchor = the `${NAME}` inside the same Liquid token as
19//!   `| id: 'cbN'`. NAME is the source for the slug-derived key.
20
21use regex_lite::Regex;
22use std::sync::OnceLock;
23
/// Reduce a URL to the form used for anchor comparison (RFC §2.2):
/// everything from the first `?` or `#` onward is discarded, leaving
/// `scheme://host/path`.
///
/// The cut is purely textual — the input is not validated as a URL —
/// so a string containing neither delimiter comes back unchanged, and
/// normalizing an already-normalized value is a no-op.
pub fn normalize_url(url: &str) -> String {
    match url.find(['?', '#']) {
        Some(cut) => url[..cut].to_owned(),
        None => url.to_owned(),
    }
}
34
35fn href_re() -> &'static Regex {
36    static RE: OnceLock<Regex> = OnceLock::new();
37    RE.get_or_init(|| {
38        // Tolerant of attribute order and either quote style. The href
39        // value runs up to the matching quote — Braze-issued anchor
40        // tags do not nest quotes inside the URL.
41        Regex::new(r#"(?i)<a\b[^>]*?\bhref\s*=\s*(?:"([^"]*)"|'([^']*)')"#)
42            .expect("href regex is valid")
43    })
44}
45
46fn lid_value_re() -> &'static Regex {
47    static RE: OnceLock<Regex> = OnceLock::new();
48    RE.get_or_init(|| {
49        // The pipe anchor (`|`) prevents false matches on hash literals
50        // or unrelated keyword args that happen to spell `lid:`. Matches
51        // both quote styles, and the value class matches the built-in
52        // shape check (`^[a-z0-9]{8,}$`).
53        Regex::new(r#"\|\s*lid:\s*(?:"([a-z0-9]{8,})"|'([a-z0-9]{8,})')"#)
54            .expect("lid value regex is valid")
55    })
56}
57
58fn plaintext_url_re() -> &'static Regex {
59    static RE: OnceLock<Regex> = OnceLock::new();
60    RE.get_or_init(|| {
61        // Greedy `[^\s<>"]` runs up to whitespace or a quote/angle —
62        // good enough for Braze plaintext where URLs aren't routinely
63        // wrapped in markup. Trailing punctuation is trimmed post-hoc
64        // (see `trim_trailing_punctuation`).
65        Regex::new(r#"https?://[^\s<>"']+"#).expect("plaintext URL regex is valid")
66    })
67}
68
69fn cb_id_include_re() -> &'static Regex {
70    static RE: OnceLock<Regex> = OnceLock::new();
71    RE.get_or_init(|| {
72        // Captures `${NAME}` (group 1) and `cbN` (group 2) from
73        //   {{content_blocks.${NAME} | id: 'cbN'}}
74        // Matches existing dependency-graph regex in
75        // src/diff/content_block_order.rs but tightened to require
76        // `| id: '…'` form (we need the cbN value, not just NAME).
77        Regex::new(
78            r#"\{\{\s*content_blocks\.\$\{\s*([^\s}|]+)\s*\}\s*\|\s*id:\s*(?:"(cb[0-9]+)"|'(cb[0-9]+)')\s*\}\}"#,
79        )
80        .expect("cb_id include regex is valid")
81    })
82}
83
/// Strip trailing punctuation that a greedy URL match would otherwise
/// swallow (RFC §5 Edge case).
///
/// - `.`, `,`, `;`, `:`, `!`, `?`, `>` are *always* removed from the end.
/// - The closers `)` and `]` are removed *only* when `preceded_by` is
///   the corresponding opener (`(` or `[`) — Markdown-style
///   `[text](https://…)` is the motivating case — **and** the closer
///   has no partner opener inside the URL itself. The second condition
///   keeps a wrapped URL that legitimately ends in a closer intact:
///   `(https://en.wikipedia.org/wiki/Foo_(bar))` yields
///   `https://en.wikipedia.org/wiki/Foo_(bar)`, not `…/Foo_(bar`.
///
/// `preceded_by` is the character immediately before the URL in the
/// surrounding text, or `None` when the URL starts the text.
///
/// Returns a sub-slice of `url` (possibly empty); nothing is allocated.
fn trim_trailing_punctuation(url: &str, preceded_by: Option<char>) -> &str {
    // `<` needs no entry: its closer `>` is in the always-trimmed set.
    let wrapper = match preceded_by {
        Some('(') => Some(('(', ')')),
        Some('[') => Some(('[', ']')),
        _ => None,
    };

    let mut rest = url;
    while let Some(last) = rest.chars().next_back() {
        let always_dropped = matches!(last, '.' | ',' | ';' | ':' | '!' | '?' | '>');
        // A wrapper closer is dropped only while it is unbalanced, i.e.
        // it belongs to the surrounding text rather than to the URL.
        let dangling_closer = match wrapper {
            Some((open, close)) => last == close && !closes_inner_pair(rest, open, close),
            None => false,
        };
        if !(always_dropped || dangling_closer) {
            break;
        }
        rest = &rest[..rest.len() - last.len_utf8()];
    }
    rest
}

/// Report whether the last `close` character in `text` is matched by an
/// earlier, still-open `open` character within `text`.
fn closes_inner_pair(text: &str, open: char, close: char) -> bool {
    let mut depth = 0usize;
    let mut paired = false;
    for c in text.chars() {
        if c == open {
            depth += 1;
        } else if c == close {
            paired = depth > 0;
            if paired {
                depth -= 1;
            }
        }
    }
    paired
}
112
/// A single lid correlation point found in a remote body: the URL
/// anchor and the lid value paired with it. Extraction produces these
/// in the order the anchors appear in the field.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct LidCorrelation {
    /// The anchor URL, already normalized.
    pub url: String,
    /// The value taken from the `| lid: '…'` token.
    pub value: String,
    /// Byte offset at which the `<a href>` tag (HTML) or the raw URL
    /// (plaintext) starts. Handy for ordering and for reporting
    /// ambiguous anchors.
    pub url_offset: usize,
}
126
127/// Extract `(url, lid_value)` pairs from an HTML field by pairing each
128/// `<a href="…">` with the next `| lid: '…'` that follows it before
129/// the next `<a href>` or end of string. Unpaired anchors are skipped.
130pub fn extract_html_lid_values(body: &str) -> Vec<LidCorrelation> {
131    pair_urls_with_lids(href_iter(body), body)
132}
133
134/// Extract `(url, lid_value)` pairs from a plaintext field. Same
135/// pairing rule as HTML but URLs come from raw `https?://…` matches.
136pub fn extract_plaintext_lid_values(body: &str) -> Vec<LidCorrelation> {
137    pair_urls_with_lids(plaintext_url_iter(body), body)
138}
139
140fn href_iter(body: &str) -> Vec<(usize, String)> {
141    href_re()
142        .captures_iter(body)
143        .filter_map(|cap| {
144            let whole = cap.get(0)?;
145            let url = cap
146                .get(1)
147                .or(cap.get(2))
148                .map(|m| m.as_str())
149                .unwrap_or_default();
150            Some((whole.start(), normalize_url(url)))
151        })
152        .collect()
153}
154
155fn plaintext_url_iter(body: &str) -> Vec<(usize, String)> {
156    plaintext_url_re()
157        .find_iter(body)
158        .map(|m| {
159            let raw = m.as_str();
160            let preceded_by = if m.start() > 0 {
161                body[..m.start()].chars().last()
162            } else {
163                None
164            };
165            let trimmed = trim_trailing_punctuation(raw, preceded_by);
166            (m.start(), normalize_url(trimmed))
167        })
168        .collect()
169}
170
171fn pair_urls_with_lids(urls: Vec<(usize, String)>, body: &str) -> Vec<LidCorrelation> {
172    let lids: Vec<(usize, String)> = lid_value_re()
173        .captures_iter(body)
174        .filter_map(|cap| {
175            let whole = cap.get(0)?;
176            let value = cap.get(1).or(cap.get(2)).map(|m| m.as_str().to_string())?;
177            Some((whole.start(), value))
178        })
179        .collect();
180
181    let mut out = Vec::new();
182    for (i, (url_off, url)) in urls.iter().enumerate() {
183        let next_url_off = urls.get(i + 1).map(|(o, _)| *o).unwrap_or(body.len());
184        if let Some((_, value)) = lids
185            .iter()
186            .find(|(off, _)| *off > *url_off && *off < next_url_off)
187        {
188            out.push(LidCorrelation {
189                url: url.clone(),
190                value: value.clone(),
191                url_offset: *url_off,
192            });
193        }
194    }
195    out
196}
197
/// A single `cb_id` include found in a remote body, together with the
/// RFC §3 Q3 slug key derived from its `${NAME}`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct CbIdCorrelation {
    /// The content_block name exactly as written inside `${…}`.
    pub name: String,
    /// The id in `cbN` form, e.g. `cb42`.
    pub value: String,
    /// The slug-form key (RFC §3 Q3).
    pub key: String,
}
209
210/// Extract every `{{content_blocks.${NAME} | id: 'cbN'}}` from `body`.
211pub fn extract_cb_id_values(body: &str) -> Vec<CbIdCorrelation> {
212    cb_id_include_re()
213        .captures_iter(body)
214        .filter_map(|cap| {
215            let name = cap.get(1)?.as_str().to_string();
216            let value = cap.get(2).or(cap.get(3)).map(|m| m.as_str().to_string())?;
217            let key = slug_for_cb_id(&name);
218            Some(CbIdCorrelation { name, value, key })
219        })
220        .collect()
221}
222
223/// Slug a content_block name for use as a `cb_id` key per RFC §3 Q3.
224pub fn slug_for_cb_id(name: &str) -> String {
225    let base = slug_core(name);
226    if base.is_empty() || base.starts_with(|c: char| c.is_ascii_digit()) {
227        format!("cb_{base}")
228    } else {
229        base
230    }
231}
232
233/// Slug a URL path tail or arbitrary anchor for use as a `lid` key.
234/// `link_` prefix is applied when the source produces no meaningful
235/// ASCII content (RFC §3 Q3).
236pub fn slug_for_lid(source: &str) -> String {
237    let base = slug_core(source);
238    if base.is_empty() || base.starts_with(|c: char| c.is_ascii_digit()) {
239        format!("link_{base}")
240    } else {
241        base
242    }
243}
244
/// Shared slug routine: lowercase the ASCII alphanumeric runs of `s`
/// and join them with single underscores.
///
/// Every other character (including `_` itself and all non-ASCII
/// characters) acts as a separator, so runs of separators collapse to
/// one `_` and the result never starts or ends with `_`. A source with
/// no ASCII alphanumerics yields an empty string.
fn slug_core(s: &str) -> String {
    s.split(|c: char| !c.is_ascii_alphanumeric())
        .filter(|run| !run.is_empty())
        .map(|run| run.to_ascii_lowercase())
        .collect::<Vec<_>>()
        .join("_")
}
267
#[cfg(test)]
mod tests {
    use super::*;

    /// Anchor URLs of `pairs`, in order.
    fn urls(pairs: &[LidCorrelation]) -> Vec<&str> {
        pairs.iter().map(|p| p.url.as_str()).collect()
    }

    /// Lid values of `pairs`, in order.
    fn lids(pairs: &[LidCorrelation]) -> Vec<&str> {
        pairs.iter().map(|p| p.value.as_str()).collect()
    }

    #[test]
    fn normalize_strips_query_and_fragment() {
        let inputs = [
            "https://example.com/x?utm=1",
            "https://example.com/x#frag",
            "https://example.com/x",
        ];
        for input in inputs {
            assert_eq!(normalize_url(input), "https://example.com/x");
        }
    }

    #[test]
    fn html_lid_pairs_each_anchor_with_following_value() {
        let body = r#"<p>
<a href="https://example.com/a">{{ x | lid: 'lidvalueaa1' }}A</a>
<a href="https://example.com/b">{{ x | lid: 'lidvaluebb2' }}B</a>
</p>"#;
        let pairs = extract_html_lid_values(body);
        assert_eq!(
            urls(&pairs),
            ["https://example.com/a", "https://example.com/b"]
        );
        assert_eq!(lids(&pairs), ["lidvalueaa1", "lidvaluebb2"]);
    }

    #[test]
    fn html_lid_unpaired_anchor_is_skipped() {
        let body = r#"<a href="https://example.com/a">no lid here</a>
<a href="https://example.com/b">{{ x | lid: 'lidvaluebb2' }}B</a>"#;
        let pairs = extract_html_lid_values(body);
        assert_eq!(urls(&pairs), ["https://example.com/b"]);
    }

    #[test]
    fn html_lid_handles_both_quote_styles_and_query_string() {
        let body = r#"<a href='https://example.com/x?utm=foo'>{{ x | lid: "lidvaluexyz1" }}X</a>"#;
        let pairs = extract_html_lid_values(body);
        assert_eq!(urls(&pairs), ["https://example.com/x"]);
        assert_eq!(lids(&pairs), ["lidvaluexyz1"]);
    }

    #[test]
    fn plaintext_lid_trims_trailing_punctuation() {
        // Markdown-style link: the closing `)` has to go because the URL
        // is preceded by `(`. A `| lid:` token after a plaintext URL is
        // unusual, but Braze does emit it.
        let body = "Visit (https://example.com/cta) | lid: 'lidplain01a' for the deal.";
        let pairs = extract_plaintext_lid_values(body);
        assert_eq!(urls(&pairs), ["https://example.com/cta"]);
        assert_eq!(lids(&pairs), ["lidplain01a"]);
    }

    #[test]
    fn plaintext_lid_trims_sentence_period() {
        let body = "See https://example.com/end. | lid: 'lidplain02b'";
        let pairs = extract_plaintext_lid_values(body);
        assert_eq!(urls(&pairs), ["https://example.com/end"]);
    }

    #[test]
    fn cb_id_extracts_name_and_value() {
        // Liquid variable names inside `${...}` carry no whitespace by
        // construction — same assumption as the dep-graph regex in
        // src/diff/content_block_order.rs.
        let body = "before {{content_blocks.${promo_banner} | id: 'cb42'}} after";
        let expected = CbIdCorrelation {
            name: "promo_banner".to_string(),
            value: "cb42".to_string(),
            key: "promo_banner".to_string(),
        };
        assert_eq!(extract_cb_id_values(body), vec![expected]);
    }

    #[test]
    fn cb_id_handles_multiple_includes() {
        let body = "{{content_blocks.${alpha} | id: 'cb1'}} {{content_blocks.${beta} | id: 'cb2'}}";
        let pairs = extract_cb_id_values(body);
        let names: Vec<&str> = pairs.iter().map(|p| p.name.as_str()).collect();
        let values: Vec<&str> = pairs.iter().map(|p| p.value.as_str()).collect();
        assert_eq!(names, ["alpha", "beta"]);
        assert_eq!(values, ["cb1", "cb2"]);
        assert_eq!(pairs[0].key, "alpha");
    }

    #[test]
    fn cb_id_slug_uses_cb_prefix_for_empty_or_digit_start() {
        let cases = [
            ("2024_summer", "cb_2024_summer"),
            ("", "cb_"),
            ("My Promo Banner", "my_promo_banner"),
            ("cb_promo_image", "cb_promo_image"),
        ];
        for (name, expected) in cases {
            assert_eq!(slug_for_cb_id(name), expected);
        }
    }

    #[test]
    fn lid_slug_uses_link_prefix_for_empty_or_digit_start() {
        let cases = [
            ("/spring-sale", "spring_sale"),
            ("/", "link_"),
            ("123", "link_123"),
            // Non-ASCII source collapses to empty per RFC §3 Q3 Unicode rule.
            ("プロモ", "link_"),
        ];
        for (source, expected) in cases {
            assert_eq!(slug_for_lid(source), expected);
        }
    }

    #[test]
    fn slug_collapses_multiple_separators() {
        let cases = [("foo//bar--baz", "foo_bar_baz"), ("--leading", "leading")];
        for (source, expected) in cases {
            assert_eq!(slug_for_lid(source), expected);
        }
    }
}
braze_sync/values/correlation.rs

braze_sync/values/
correlation.rs