braze_sync/values/
correlation.rs

1//! Remote-body correlation primitives.
2//!
3//! Extract lid / cb_id values from a remote body together with the
4//! anchor used to pair them with template placeholders.
5//!
6//! - HTML lid: anchor = the URL attribute of the enclosing element.
7//!   Same-URL occurrences are matched by appearance order.
8//! - Plaintext lid: anchor = the raw `https?://…` preceding the lid.
9//! - cb_id: anchor = `${NAME}` in the same Liquid include.
10
11use regex_lite::Regex;
12use std::sync::OnceLock;
13
/// Normalize a URL for anchor comparison by cutting it at the first
/// `?` or `#`, which drops the query string and the fragment.
///
/// No scheme or host validation is performed: whatever precedes the
/// first `?`/`#` is returned, so an input such as `#top` normalizes to
/// the empty string. Input containing neither character is returned
/// as an owned copy. The operation is idempotent — normalizing an
/// already-normalized value is a no-op.
pub fn normalize_url(url: &str) -> String {
    let kept = match url.find(['?', '#']) {
        Some(cut) => &url[..cut],
        None => url,
    };
    kept.to_owned()
}
24
25fn href_re() -> &'static Regex {
26    static RE: OnceLock<Regex> = OnceLock::new();
27    RE.get_or_init(|| {
28        // Tolerant of attribute order and either quote style. Matches
29        // `href`, `src`, `action` — with an optional namespace prefix
30        // like `xlink:` or `v:` — on any element, not just `<a>`. This
31        // mirrors templatize::url_attr_re so VML / SVG CTAs whose lid
32        // sits inside a non-anchor element's href round-trip through
33        // apply/diff resolution. Leading `\s` (not `\b`) prevents
34        // `data-href`-style custom attributes from tail-matching.
35        Regex::new(
36            r#"(?i)<[a-z][a-z0-9_.:-]*\b[^>]*?\s(?:[a-z][a-z0-9_-]*:)?(?:href|src|action)\s*=\s*(?:"([^"]*)"|'([^']*)')"#,
37        )
38        .expect("href regex is valid")
39    })
40}
41
42fn lid_value_re() -> &'static Regex {
43    static RE: OnceLock<Regex> = OnceLock::new();
44    RE.get_or_init(|| {
45        // The pipe anchor (`|`) prevents false matches on hash literals
46        // or unrelated keyword args that happen to spell `lid:`. Matches
47        // both quote styles, and the value class matches the built-in
48        // shape check (`^[a-z0-9]{8,}$`).
49        Regex::new(r#"\|\s*lid:\s*(?:"([a-z0-9]{8,})"|'([a-z0-9]{8,})')"#)
50            .expect("lid value regex is valid")
51    })
52}
53
54fn plaintext_url_re() -> &'static Regex {
55    static RE: OnceLock<Regex> = OnceLock::new();
56    RE.get_or_init(|| {
57        // Greedy `[^\s<>"]` runs up to whitespace or a quote/angle —
58        // good enough for Braze plaintext where URLs aren't routinely
59        // wrapped in markup. Trailing punctuation is trimmed post-hoc
60        // (see `trim_trailing_punctuation`).
61        Regex::new(r#"https?://[^\s<>"']+"#).expect("plaintext URL regex is valid")
62    })
63}
64
65fn cb_id_include_re() -> &'static Regex {
66    static RE: OnceLock<Regex> = OnceLock::new();
67    RE.get_or_init(|| {
68        // Captures `${NAME}` (group 1) and `cbN` (group 2) from
69        //   {{content_blocks.${NAME} | id: 'cbN'}}
70        // Matches existing dependency-graph regex in
71        // src/diff/content_block_order.rs but tightened to require
72        // `| id: '…'` form (we need the cbN value, not just NAME).
73        Regex::new(
74            r#"\{\{\s*content_blocks\.\$\{\s*([^\s}|]+)\s*\}\s*\|\s*id:\s*(?:"(cb[0-9]+)"|'(cb[0-9]+)')\s*\}\}"#,
75        )
76        .expect("cb_id include regex is valid")
77    })
78}
79
/// Strip the trailing punctuation that a greedy URL match tends to
/// swallow and return the shortened slice of `url`.
///
/// - `.`, `,`, `;`, `:`, `!`, `?` and `>` are removed unconditionally.
/// - A closing `)` or `]` is removed only when `preceded_by` (the
///   character immediately before the URL in the surrounding text) is
///   the matching opener `(` or `[`. Markdown's `[text](https://…)` is
///   the motivating case; without an opener, URLs that genuinely end
///   in `)`/`]` (e.g. Wikipedia disambiguation pages) are preserved.
/// - A preceding `<` pairs with `>`, which is trimmed regardless.
///
/// Trimming repeats until the last character is not in the removable
/// set, so a run such as `).` after an opener is removed entirely.
fn trim_trailing_punctuation(url: &str, preceded_by: Option<char>) -> &str {
    let paired_closer = preceded_by.and_then(|opener| match opener {
        '(' => Some(')'),
        '[' => Some(']'),
        '<' => Some('>'),
        _ => None,
    });
    url.trim_end_matches(|c: char| {
        matches!(c, '.' | ',' | ';' | ':' | '!' | '?' | '>') || paired_closer == Some(c)
    })
}
108
/// A single remote-side correlation point: a lid value together with
/// the URL anchor whose scope it was found in.
///
/// Extractors emit these in anchor order (ascending byte offset within
/// the field).
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct LidCorrelation {
    /// URL anchor after query/fragment removal.
    pub url: String,
    /// The value taken from a `| lid: '…'` filter argument.
    pub value: String,
    /// Byte offset in the field at which the anchor starts: the opening
    /// `<` of the matched element for HTML, or the first byte of the
    /// raw URL for plaintext. Handy for ordering and for reporting
    /// ambiguous matches.
    pub url_offset: usize,
}
122
123/// Extract `(url, lid_value)` pairs from an HTML field by pairing each
124/// `<a href="…">` with the next `| lid: '…'` that follows it before
125/// the next `<a href>` or end of string. Unpaired anchors are skipped.
126pub fn extract_html_lid_values(body: &str) -> Vec<LidCorrelation> {
127    pair_urls_with_lids(href_iter(body), body)
128}
129
130/// Extract `(url, lid_value)` pairs from a plaintext field. Same
131/// pairing rule as HTML but URLs come from raw `https?://…` matches.
132pub fn extract_plaintext_lid_values(body: &str) -> Vec<LidCorrelation> {
133    pair_urls_with_lids(plaintext_url_iter(body), body)
134}
135
136/// Extract raw lid values in field appearance order without any URL
137/// anchoring. Used for subject / preheader where no anchor exists; the
138/// caller matches template placeholders to remote values positionally.
139pub fn extract_lid_values_unanchored(body: &str) -> Vec<String> {
140    lid_value_re()
141        .captures_iter(body)
142        .filter_map(|c| c.get(1).or(c.get(2)).map(|m| m.as_str().to_string()))
143        .collect()
144}
145
146fn href_iter(body: &str) -> Vec<(usize, String)> {
147    href_re()
148        .captures_iter(body)
149        .filter_map(|cap| {
150            let whole = cap.get(0)?;
151            let url = cap
152                .get(1)
153                .or(cap.get(2))
154                .map(|m| m.as_str())
155                .unwrap_or_default();
156            Some((whole.start(), normalize_url(url)))
157        })
158        .collect()
159}
160
161fn plaintext_url_iter(body: &str) -> Vec<(usize, String)> {
162    plaintext_url_re()
163        .find_iter(body)
164        .map(|m| {
165            let raw = m.as_str();
166            let preceded_by = if m.start() > 0 {
167                body[..m.start()].chars().last()
168            } else {
169                None
170            };
171            let trimmed = trim_trailing_punctuation(raw, preceded_by);
172            (m.start(), normalize_url(trimmed))
173        })
174        .collect()
175}
176
177fn pair_urls_with_lids(urls: Vec<(usize, String)>, body: &str) -> Vec<LidCorrelation> {
178    let lids: Vec<(usize, String)> = lid_value_re()
179        .captures_iter(body)
180        .filter_map(|cap| {
181            let whole = cap.get(0)?;
182            let value = cap.get(1).or(cap.get(2)).map(|m| m.as_str().to_string())?;
183            Some((whole.start(), value))
184        })
185        .collect();
186
187    let mut out = Vec::new();
188    for (i, (url_off, url)) in urls.iter().enumerate() {
189        let next_url_off = urls.get(i + 1).map(|(o, _)| *o).unwrap_or(body.len());
190        for (_, value) in lids
191            .iter()
192            .filter(|(off, _)| *off > *url_off && *off < next_url_off)
193        {
194            out.push(LidCorrelation {
195                url: url.clone(),
196                value: value.clone(),
197                url_offset: *url_off,
198            });
199        }
200    }
201    out
202}
203
/// A single content-block include found in a remote body, i.e. one
/// `{{content_blocks.${NAME} | id: 'cbN'}}` occurrence.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct CbIdCorrelation {
    /// The content_block name exactly as written inside `${…}`.
    pub name: String,
    /// The id in `cbN` form, e.g. `cb42`.
    pub value: String,
    /// Slug-form key derived from `name`.
    pub key: String,
}
215
216/// Extract every `{{content_blocks.${NAME} | id: 'cbN'}}` from `body`.
217pub fn extract_cb_id_values(body: &str) -> Vec<CbIdCorrelation> {
218    cb_id_include_re()
219        .captures_iter(body)
220        .filter_map(|cap| {
221            let name = cap.get(1)?.as_str().to_string();
222            let value = cap.get(2).or(cap.get(3)).map(|m| m.as_str().to_string())?;
223            let key = slug_for_cb_id(&name);
224            Some(CbIdCorrelation { name, value, key })
225        })
226        .collect()
227}
228
229/// Slug a content_block name for use as a `cb_id` key.
230///
231/// Keys never end in `_` — a trailing underscore followed by the `__`
232/// envelope close produces ambiguous triple-underscores in templates.
233pub fn slug_for_cb_id(name: &str) -> String {
234    let base = slug_core(name);
235    if base.is_empty() {
236        "cb".to_string()
237    } else if base.starts_with(|c: char| c.is_ascii_digit()) {
238        format!("cb_{base}")
239    } else {
240        base
241    }
242}
243
244/// Slug a URL path tail or arbitrary anchor for use as a `lid` key.
245/// `link` prefix is applied when the source produces no meaningful
246/// ASCII content. Keys never end in `_` — see
247/// [`slug_for_cb_id`] for the rationale.
248pub fn slug_for_lid(source: &str) -> String {
249    let base = slug_core(source);
250    if base.is_empty() {
251        "link".to_string()
252    } else if base.starts_with(|c: char| c.is_ascii_digit()) {
253        format!("link_{base}")
254    } else {
255        base
256    }
257}
258
/// Shared slugging step: lowercase the ASCII-alphanumeric runs of `s`
/// and join them with single underscores.
///
/// Every other character (including `_` itself and all non-ASCII
/// characters) acts as a separator. Consecutive separators collapse,
/// and leading/trailing separators vanish, so the result never starts
/// or ends with `_` and is empty when `s` has no ASCII alphanumerics.
fn slug_core(s: &str) -> String {
    s.split(|c: char| !c.is_ascii_alphanumeric())
        .filter(|run| !run.is_empty())
        .map(str::to_ascii_lowercase)
        .collect::<Vec<_>>()
        .join("_")
}
281
#[cfg(test)]
mod tests {
    use super::*;

    /// Project lid correlations onto `(url, value)` tuples so a whole
    /// result can be compared in one assertion.
    fn url_value_pairs(found: &[LidCorrelation]) -> Vec<(&str, &str)> {
        found
            .iter()
            .map(|c| (c.url.as_str(), c.value.as_str()))
            .collect()
    }

    /// Project lid correlations onto their URLs only.
    fn urls_of(found: &[LidCorrelation]) -> Vec<&str> {
        found.iter().map(|c| c.url.as_str()).collect()
    }

    #[test]
    fn normalize_strips_query_and_fragment() {
        let cases = [
            ("https://example.com/x?utm=1", "https://example.com/x"),
            ("https://example.com/x#frag", "https://example.com/x"),
            ("https://example.com/x", "https://example.com/x"),
        ];
        for (input, expected) in cases {
            assert_eq!(normalize_url(input), expected, "input: {input}");
        }
    }

    #[test]
    fn html_lid_pairs_each_anchor_with_following_value() {
        let body = r#"<p>
<a href="https://example.com/a">{{ x | lid: 'lidvalueaa1' }}A</a>
<a href="https://example.com/b">{{ x | lid: 'lidvaluebb2' }}B</a>
</p>"#;
        let found = extract_html_lid_values(body);
        assert_eq!(
            url_value_pairs(&found),
            [
                ("https://example.com/a", "lidvalueaa1"),
                ("https://example.com/b", "lidvaluebb2"),
            ]
        );
    }

    #[test]
    fn html_lid_unpaired_anchor_is_skipped() {
        let body = r#"<a href="https://example.com/a">no lid here</a>
<a href="https://example.com/b">{{ x | lid: 'lidvaluebb2' }}B</a>"#;
        let found = extract_html_lid_values(body);
        assert_eq!(urls_of(&found), ["https://example.com/b"]);
    }

    #[test]
    fn html_lid_handles_both_quote_styles_and_query_string() {
        let body = r#"<a href='https://example.com/x?utm=foo'>{{ x | lid: "lidvaluexyz1" }}X</a>"#;
        let found = extract_html_lid_values(body);
        assert_eq!(
            url_value_pairs(&found),
            [("https://example.com/x", "lidvaluexyz1")]
        );
    }

    #[test]
    fn plaintext_lid_trims_trailing_punctuation() {
        // Markdown-style link: the closing `)` has to go because the
        // URL is preceded by `(`. A `| lid:` filter in plaintext is
        // unusual, but Braze does emit it.
        let body = "Visit (https://example.com/cta) | lid: 'lidplain01a' for the deal.";
        let found = extract_plaintext_lid_values(body);
        assert_eq!(
            url_value_pairs(&found),
            [("https://example.com/cta", "lidplain01a")]
        );
    }

    #[test]
    fn plaintext_lid_trims_sentence_period() {
        let body = "See https://example.com/end. | lid: 'lidplain02b'";
        let found = extract_plaintext_lid_values(body);
        assert_eq!(urls_of(&found), ["https://example.com/end"]);
    }

    #[test]
    fn cb_id_extracts_name_and_value() {
        // Liquid variable names inside `${...}` carry no whitespace by
        // construction — same assumption as the dep-graph regex in
        // src/diff/content_block_order.rs.
        let body = "before {{content_blocks.${promo_banner} | id: 'cb42'}} after";
        let found = extract_cb_id_values(body);
        let expected = CbIdCorrelation {
            name: "promo_banner".to_string(),
            value: "cb42".to_string(),
            key: "promo_banner".to_string(),
        };
        assert_eq!(found, [expected]);
    }

    #[test]
    fn cb_id_handles_multiple_includes() {
        let body = "{{content_blocks.${alpha} | id: 'cb1'}} {{content_blocks.${beta} | id: 'cb2'}}";
        let found = extract_cb_id_values(body);
        let names_and_ids: Vec<(&str, &str)> = found
            .iter()
            .map(|c| (c.name.as_str(), c.value.as_str()))
            .collect();
        assert_eq!(names_and_ids, [("alpha", "cb1"), ("beta", "cb2")]);
        assert_eq!(found[0].key, "alpha");
    }

    #[test]
    fn cb_id_slug_uses_cb_prefix_for_empty_or_digit_start() {
        let cases = [
            ("2024_summer", "cb_2024_summer"),
            ("", "cb"),
            ("My Promo Banner", "my_promo_banner"),
            ("cb_promo_image", "cb_promo_image"),
        ];
        for (name, expected) in cases {
            assert_eq!(slug_for_cb_id(name), expected, "name: {name:?}");
        }
    }

    #[test]
    fn lid_slug_uses_link_prefix_for_empty_or_digit_start() {
        let cases = [
            ("/spring-sale", "spring_sale"),
            ("/", "link"),
            ("123", "link_123"),
            // Non-ASCII input slugs to empty, so the `link` fallback
            // applies.
            ("プロモ", "link"),
        ];
        for (source, expected) in cases {
            assert_eq!(slug_for_lid(source), expected, "source: {source:?}");
        }
    }

    #[test]
    fn slug_collapses_multiple_separators() {
        let cases = [("foo//bar--baz", "foo_bar_baz"), ("--leading", "leading")];
        for (source, expected) in cases {
            assert_eq!(slug_for_lid(source), expected, "source: {source:?}");
        }
    }
}
braze_sync/values/correlation.rs

braze_sync/values/
correlation.rs