Skip to main content

braze_sync/values/
correlation.rs

1//! Remote-body correlation primitives for `export` (RFC §2.5).
2//!
3//! These functions inspect a *remote* body (HTML, plaintext, subject,
4//! preheader) and return the per-occurrence lid / cb_id values together
5//! with the anchor used to correlate them back to the values file
6//! entries.
7//!
8//! - HTML lid: anchor = the URL of the immediately-preceding
9//!   `<a href="...">`. Multiple `<a>`s with the same URL fall back to
10//!   appearance order (RFC §2.5 "Key 対応の曖昧性").
11//! - Plaintext lid: anchor = the raw URL (`https?://…`) immediately
12//!   preceding the `| lid: '…'` token; trailing punctuation is trimmed
13//!   (RFC §5 Edge case for `]`/`)` etc.).
14//! - subject / preheader lid: anchor = adjacent Liquid identifiers
15//!   inside the same `{{…}}` block. Phase 3 first cut covers the URL
16//!   variants; the anchor-only variant is supported by carrying the
17//!   anchor string verbatim from the existing values entry.
18//! - cb_id: anchor = the `${NAME}` inside the same Liquid token as
19//!   `| id: 'cbN'`. NAME is the source for the slug-derived key.
20
21use regex_lite::Regex;
22use std::sync::OnceLock;
23
/// Normalize a URL for anchor comparison (RFC §2.2): everything from
/// the first `?` or `#` onward is discarded, leaving
/// `scheme://host/path` for a well-formed URL.
///
/// No scheme detection is performed — the cut is applied to whatever
/// string is passed in, so a scheme-less input containing `?` or `#`
/// is truncated as well. Applying the function to its own output is a
/// no-op, because the result never contains `?` or `#`.
pub fn normalize_url(url: &str) -> String {
    let kept = match url.find(|c: char| matches!(c, '?' | '#')) {
        Some(cut) => &url[..cut],
        None => url,
    };
    String::from(kept)
}
34
35fn href_re() -> &'static Regex {
36    static RE: OnceLock<Regex> = OnceLock::new();
37    RE.get_or_init(|| {
38        // Tolerant of attribute order and either quote style. The href
39        // value runs up to the matching quote — Braze-issued anchor
40        // tags do not nest quotes inside the URL.
41        Regex::new(r#"(?i)<a\b[^>]*?\bhref\s*=\s*(?:"([^"]*)"|'([^']*)')"#)
42            .expect("href regex is valid")
43    })
44}
45
46fn lid_value_re() -> &'static Regex {
47    static RE: OnceLock<Regex> = OnceLock::new();
48    RE.get_or_init(|| {
49        // The pipe anchor (`|`) prevents false matches on hash literals
50        // or unrelated keyword args that happen to spell `lid:`. Matches
51        // both quote styles, and the value class matches the built-in
52        // shape check (`^[a-z0-9]{8,}$`).
53        Regex::new(r#"\|\s*lid:\s*(?:"([a-z0-9]{8,})"|'([a-z0-9]{8,})')"#)
54            .expect("lid value regex is valid")
55    })
56}
57
58fn plaintext_url_re() -> &'static Regex {
59    static RE: OnceLock<Regex> = OnceLock::new();
60    RE.get_or_init(|| {
61        // Greedy `[^\s<>"]` runs up to whitespace or a quote/angle —
62        // good enough for Braze plaintext where URLs aren't routinely
63        // wrapped in markup. Trailing punctuation is trimmed post-hoc
64        // (see `trim_trailing_punctuation`).
65        Regex::new(r#"https?://[^\s<>"']+"#).expect("plaintext URL regex is valid")
66    })
67}
68
69fn cb_id_include_re() -> &'static Regex {
70    static RE: OnceLock<Regex> = OnceLock::new();
71    RE.get_or_init(|| {
72        // Captures `${NAME}` (group 1) and `cbN` (group 2) from
73        //   {{content_blocks.${NAME} | id: 'cbN'}}
74        // Matches existing dependency-graph regex in
75        // src/diff/content_block_order.rs but tightened to require
76        // `| id: '…'` form (we need the cbN value, not just NAME).
77        Regex::new(
78            r#"\{\{\s*content_blocks\.\$\{\s*([^\s}|]+)\s*\}\s*\|\s*id:\s*(?:"(cb[0-9]+)"|'(cb[0-9]+)')\s*\}\}"#,
79        )
80        .expect("cb_id include regex is valid")
81    })
82}
83
/// Trim trailing punctuation that a greedy URL match would otherwise
/// swallow.
///
/// Per RFC §5 Edge case, the following are *always* trimmed from the
/// end: `.`, `,`, `;`, `:`, `!`, `?`, `>`.
///
/// The closers `)` and `]` are trimmed *only* when the URL is preceded
/// by the corresponding opener (`(` or `[`) — Markdown-style
/// `[text](https://…)` is the motivating case — and only while the
/// URL contains more of that closer than of its opener. The balance
/// check keeps a closer that pairs with an opener *inside* the URL, so
/// `(https://en.wikipedia.org/wiki/Foo_(bar))` yields
/// `https://en.wikipedia.org/wiki/Foo_(bar)` rather than losing the
/// URL's own `)`. With no opener before the URL, a trailing `)`/`]` is
/// never touched.
///
/// * `url` — the raw matched URL text.
/// * `preceded_by` — the character immediately before the URL in the
///   surrounding text, or `None` when the URL starts the text.
///
/// Returns a prefix of `url`; it may be empty if every character was
/// trimmable punctuation.
fn trim_trailing_punctuation(url: &str, preceded_by: Option<char>) -> &str {
    // `<` needs no entry here: its closer `>` is already in the
    // always-trimmed set below.
    let pair = match preceded_by {
        Some('(') => Some(('(', ')')),
        Some('[') => Some(('[', ']')),
        _ => None,
    };

    let mut rest = url;
    while let Some(last) = rest.chars().next_back() {
        let always_dropped = matches!(last, '.' | ',' | ';' | ':' | '!' | '?' | '>');
        // A closer is surplus only if it has no opener inside the URL
        // to pair with; counts are taken over what is left so far.
        let surplus_closer = pair.is_some_and(|(open, close)| {
            last == close && rest.matches(close).count() > rest.matches(open).count()
        });
        if !(always_dropped || surplus_closer) {
            break;
        }
        rest = &rest[..rest.len() - last.len_utf8()];
    }
    rest
}
112
/// A single remote-side correlation point: the URL anchor found in a
/// field, together with the lid value that follows it inside the same
/// anchor scope. Entries are produced in byte-offset order of the
/// anchors.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct LidCorrelation {
    /// The anchor URL after normalization (query and fragment removed).
    pub url: String,
    /// The lid value taken from the `| lid: '…'` filter.
    pub value: String,
    /// Byte offset at which the anchor starts: the `<a` of the tag for
    /// HTML, the first byte of the raw URL for plaintext. Handy for
    /// ordering and for reporting ambiguous matches.
    pub url_offset: usize,
}
126
127/// Extract `(url, lid_value)` pairs from an HTML field by pairing each
128/// `<a href="…">` with the next `| lid: '…'` that follows it before
129/// the next `<a href>` or end of string. Unpaired anchors are skipped.
130pub fn extract_html_lid_values(body: &str) -> Vec<LidCorrelation> {
131    pair_urls_with_lids(href_iter(body), body)
132}
133
134/// Extract `(url, lid_value)` pairs from a plaintext field. Same
135/// pairing rule as HTML but URLs come from raw `https?://…` matches.
136pub fn extract_plaintext_lid_values(body: &str) -> Vec<LidCorrelation> {
137    pair_urls_with_lids(plaintext_url_iter(body), body)
138}
139
140fn href_iter(body: &str) -> Vec<(usize, String)> {
141    href_re()
142        .captures_iter(body)
143        .filter_map(|cap| {
144            let whole = cap.get(0)?;
145            let url = cap
146                .get(1)
147                .or(cap.get(2))
148                .map(|m| m.as_str())
149                .unwrap_or_default();
150            Some((whole.start(), normalize_url(url)))
151        })
152        .collect()
153}
154
155fn plaintext_url_iter(body: &str) -> Vec<(usize, String)> {
156    plaintext_url_re()
157        .find_iter(body)
158        .map(|m| {
159            let raw = m.as_str();
160            let preceded_by = if m.start() > 0 {
161                body[..m.start()].chars().last()
162            } else {
163                None
164            };
165            let trimmed = trim_trailing_punctuation(raw, preceded_by);
166            (m.start(), normalize_url(trimmed))
167        })
168        .collect()
169}
170
171fn pair_urls_with_lids(urls: Vec<(usize, String)>, body: &str) -> Vec<LidCorrelation> {
172    let lids: Vec<(usize, String)> = lid_value_re()
173        .captures_iter(body)
174        .filter_map(|cap| {
175            let whole = cap.get(0)?;
176            let value = cap.get(1).or(cap.get(2)).map(|m| m.as_str().to_string())?;
177            Some((whole.start(), value))
178        })
179        .collect();
180
181    let mut out = Vec::new();
182    for (i, (url_off, url)) in urls.iter().enumerate() {
183        let next_url_off = urls.get(i + 1).map(|(o, _)| *o).unwrap_or(body.len());
184        if let Some((_, value)) = lids
185            .iter()
186            .find(|(off, _)| *off > *url_off && *off < next_url_off)
187        {
188            out.push(LidCorrelation {
189                url: url.clone(),
190                value: value.clone(),
191                url_offset: *url_off,
192            });
193        }
194    }
195    out
196}
197
/// A single `cb_id` include found in a remote body, carrying both the
/// raw content-block name and the key slugged from it (RFC §3 Q3).
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct CbIdCorrelation {
    /// The content-block name exactly as written inside `${…}`.
    pub name: String,
    /// The id in `cbN` form, e.g. `cb42`.
    pub value: String,
    /// The slug-form key derived from `name` per RFC §3 Q3.
    pub key: String,
}
209
210/// Extract every `{{content_blocks.${NAME} | id: 'cbN'}}` from `body`.
211pub fn extract_cb_id_values(body: &str) -> Vec<CbIdCorrelation> {
212    cb_id_include_re()
213        .captures_iter(body)
214        .filter_map(|cap| {
215            let name = cap.get(1)?.as_str().to_string();
216            let value = cap.get(2).or(cap.get(3)).map(|m| m.as_str().to_string())?;
217            let key = slug_for_cb_id(&name);
218            Some(CbIdCorrelation { name, value, key })
219        })
220        .collect()
221}
222
223/// Slug a content_block name for use as a `cb_id` key per RFC §3 Q3.
224///
225/// Keys never end in `_`: when the input slugifies to empty the result
226/// is the bare prefix (`cb`), not `cb_`. A trailing `_` followed by the
227/// placeholder envelope `__` produces three consecutive underscores in
228/// the rendered template, which is ambiguous to parse — the resolver
229/// recovers it but operators tripped over the ambiguity (see CHANGELOG
230/// for v0.14.3).
231pub fn slug_for_cb_id(name: &str) -> String {
232    let base = slug_core(name);
233    if base.is_empty() {
234        "cb".to_string()
235    } else if base.starts_with(|c: char| c.is_ascii_digit()) {
236        format!("cb_{base}")
237    } else {
238        base
239    }
240}
241
242/// Slug a URL path tail or arbitrary anchor for use as a `lid` key.
243/// `link` prefix is applied when the source produces no meaningful
244/// ASCII content (RFC §3 Q3). Keys never end in `_` — see
245/// [`slug_for_cb_id`] for the rationale.
246pub fn slug_for_lid(source: &str) -> String {
247    let base = slug_core(source);
248    if base.is_empty() {
249        "link".to_string()
250    } else if base.starts_with(|c: char| c.is_ascii_digit()) {
251        format!("link_{base}")
252    } else {
253        base
254    }
255}
256
/// Shared slugging step: lowercase the ASCII-alphanumeric runs of `s`
/// and join them with single underscores.
///
/// Every other character (including all non-ASCII text) acts as a
/// separator; consecutive separators collapse, and separators at
/// either end leave no leading or trailing `_`. An input without any
/// ASCII alphanumerics yields an empty string.
fn slug_core(s: &str) -> String {
    let mut slug = String::with_capacity(s.len());
    let words = s
        .split(|c: char| !c.is_ascii_alphanumeric())
        .filter(|word| !word.is_empty());
    for word in words {
        if !slug.is_empty() {
            slug.push('_');
        }
        slug.extend(word.chars().map(|c| c.to_ascii_lowercase()));
    }
    slug
}
279
#[cfg(test)]
mod tests {
    use super::*;

    /// Query strings and fragments are cut; a bare URL passes through.
    #[test]
    fn normalize_strips_query_and_fragment() {
        let cases = [
            ("https://example.com/x?utm=1", "https://example.com/x"),
            ("https://example.com/x#frag", "https://example.com/x"),
            ("https://example.com/x", "https://example.com/x"),
        ];
        for (input, expected) in cases {
            assert_eq!(normalize_url(input), expected);
        }
    }

    /// Two anchors, each followed by its own lid, yield two pairs in
    /// document order.
    #[test]
    fn html_lid_pairs_each_anchor_with_following_value() {
        let body = r#"<p>
<a href="https://example.com/a">{{ x | lid: 'lidvalueaa1' }}A</a>
<a href="https://example.com/b">{{ x | lid: 'lidvaluebb2' }}B</a>
</p>"#;
        let pairs = extract_html_lid_values(body);
        let [first, second] = pairs.as_slice() else {
            panic!("expected 2 pairs, got {}", pairs.len());
        };
        assert_eq!(first.url, "https://example.com/a");
        assert_eq!(first.value, "lidvalueaa1");
        assert_eq!(second.url, "https://example.com/b");
        assert_eq!(second.value, "lidvaluebb2");
    }

    /// An anchor with no lid before the next anchor produces no entry.
    #[test]
    fn html_lid_unpaired_anchor_is_skipped() {
        let body = r#"<a href="https://example.com/a">no lid here</a>
<a href="https://example.com/b">{{ x | lid: 'lidvaluebb2' }}B</a>"#;
        let pairs = extract_html_lid_values(body);
        let [only] = pairs.as_slice() else {
            panic!("expected 1 pair, got {}", pairs.len());
        };
        assert_eq!(only.url, "https://example.com/b");
    }

    /// Single-quoted href plus double-quoted lid, with a query string
    /// that normalization removes.
    #[test]
    fn html_lid_handles_both_quote_styles_and_query_string() {
        let body = r#"<a href='https://example.com/x?utm=foo'>{{ x | lid: "lidvaluexyz1" }}X</a>"#;
        let pairs = extract_html_lid_values(body);
        let [only] = pairs.as_slice() else {
            panic!("expected 1 pair, got {}", pairs.len());
        };
        assert_eq!(only.url, "https://example.com/x");
        assert_eq!(only.value, "lidvaluexyz1");
    }

    /// Markdown-style link: the closing `)` must be trimmed because the
    /// URL was preceded by `(`. `| lid:` syntax in plaintext is unusual
    /// but Braze does emit it.
    #[test]
    fn plaintext_lid_trims_trailing_punctuation() {
        let body = "Visit (https://example.com/cta) | lid: 'lidplain01a' for the deal.";
        let pairs = extract_plaintext_lid_values(body);
        let [only] = pairs.as_slice() else {
            panic!("expected 1 pair, got {}", pairs.len());
        };
        assert_eq!(only.url, "https://example.com/cta");
        assert_eq!(only.value, "lidplain01a");
    }

    /// A sentence-ending period right after the URL is not part of it.
    #[test]
    fn plaintext_lid_trims_sentence_period() {
        let body = "See https://example.com/end. | lid: 'lidplain02b'";
        let pairs = extract_plaintext_lid_values(body);
        let [only] = pairs.as_slice() else {
            panic!("expected 1 pair, got {}", pairs.len());
        };
        assert_eq!(only.url, "https://example.com/end");
    }

    /// Liquid variable names inside `${...}` carry no whitespace by
    /// construction — matches the dep-graph regex in
    /// src/diff/content_block_order.rs.
    #[test]
    fn cb_id_extracts_name_and_value() {
        let body = "before {{content_blocks.${promo_banner} | id: 'cb42'}} after";
        let pairs = extract_cb_id_values(body);
        let [only] = pairs.as_slice() else {
            panic!("expected 1 include, got {}", pairs.len());
        };
        assert_eq!(only.name, "promo_banner");
        assert_eq!(only.value, "cb42");
        assert_eq!(only.key, "promo_banner");
    }

    /// Several includes in one body are all reported, in order.
    #[test]
    fn cb_id_handles_multiple_includes() {
        let body = "{{content_blocks.${alpha} | id: 'cb1'}} {{content_blocks.${beta} | id: 'cb2'}}";
        let pairs = extract_cb_id_values(body);
        let [first, second] = pairs.as_slice() else {
            panic!("expected 2 includes, got {}", pairs.len());
        };
        assert_eq!(first.name, "alpha");
        assert_eq!(first.value, "cb1");
        assert_eq!(first.key, "alpha");
        assert_eq!(second.name, "beta");
        assert_eq!(second.value, "cb2");
    }

    /// `cb` stands in for an empty slug and prefixes digit-leading
    /// slugs; ordinary names are just slugged.
    #[test]
    fn cb_id_slug_uses_cb_prefix_for_empty_or_digit_start() {
        let cases = [
            ("2024_summer", "cb_2024_summer"),
            ("", "cb"),
            ("My Promo Banner", "my_promo_banner"),
            ("cb_promo_image", "cb_promo_image"),
        ];
        for (name, expected) in cases {
            assert_eq!(slug_for_cb_id(name), expected);
        }
    }

    /// `link` stands in for an empty slug and prefixes digit-leading
    /// slugs. Non-ASCII sources collapse to empty per the RFC §3 Q3
    /// Unicode rule.
    #[test]
    fn lid_slug_uses_link_prefix_for_empty_or_digit_start() {
        let cases = [
            ("/spring-sale", "spring_sale"),
            ("/", "link"),
            ("123", "link_123"),
            ("プロモ", "link"),
        ];
        for (source, expected) in cases {
            assert_eq!(slug_for_lid(source), expected);
        }
    }

    /// Runs of separators collapse to one `_`; leading ones vanish.
    #[test]
    fn slug_collapses_multiple_separators() {
        let cases = [("foo//bar--baz", "foo_bar_baz"), ("--leading", "leading")];
        for (source, expected) in cases {
            assert_eq!(slug_for_lid(source), expected);
        }
    }
}