braze_sync/values/
correlation.rs1use regex_lite::Regex;
12use std::sync::OnceLock;
13
/// Returns `url` with any query string and fragment removed.
///
/// Everything from the first `?` or `#` onward is dropped. A URL that
/// contains neither character is returned unchanged, as an owned copy.
pub fn normalize_url(url: &str) -> String {
    let base = match url.find(|c: char| matches!(c, '?' | '#')) {
        Some(cut) => &url[..cut],
        None => url,
    };
    base.to_owned()
}
24
25fn href_re() -> &'static Regex {
26 static RE: OnceLock<Regex> = OnceLock::new();
27 RE.get_or_init(|| {
28 Regex::new(
36 r#"(?i)<[a-z][a-z0-9_.:-]*\b[^>]*?\s(?:[a-z][a-z0-9_-]*:)?(?:href|src|action)\s*=\s*(?:"([^"]*)"|'([^']*)')"#,
37 )
38 .expect("href regex is valid")
39 })
40}
41
42fn lid_value_re() -> &'static Regex {
43 static RE: OnceLock<Regex> = OnceLock::new();
44 RE.get_or_init(|| {
45 Regex::new(r#"\|\s*lid:\s*(?:"([a-z0-9]{8,})"|'([a-z0-9]{8,})')"#)
50 .expect("lid value regex is valid")
51 })
52}
53
54fn plaintext_url_re() -> &'static Regex {
55 static RE: OnceLock<Regex> = OnceLock::new();
56 RE.get_or_init(|| {
57 Regex::new(r#"https?://[^\s<>"']+"#).expect("plaintext URL regex is valid")
62 })
63}
64
65fn cb_id_include_re() -> &'static Regex {
66 static RE: OnceLock<Regex> = OnceLock::new();
67 RE.get_or_init(|| {
68 Regex::new(
74 r#"\{\{\s*content_blocks\.\$\{\s*([^\s}|]+)\s*\}\s*\|\s*id:\s*(?:"(cb[0-9]+)"|'(cb[0-9]+)')\s*\}\}"#,
75 )
76 .expect("cb_id include regex is valid")
77 })
78}
79
/// Strips trailing punctuation that a plain-text URL match picked up from the
/// surrounding sentence.
///
/// * `url` — the raw URL text as matched.
/// * `preceded_by` — the character immediately before the URL in the source
///   text, or `None` when the URL starts the text.
///
/// The characters `.`, `,`, `;`, `:`, `!`, `?` and `>` are always removed from
/// the end. When the URL directly follows `(` or `[`, a trailing `)` or `]`
/// is removed too, but only while the URL holds more of that closer than of
/// its opener. A closer that balances an opener inside the URL is kept, so
/// `(https://en.wikipedia.org/wiki/Rust_(language))` yields
/// `https://en.wikipedia.org/wiki/Rust_(language)`.
///
/// Returns a sub-slice of `url`, which may be empty.
fn trim_trailing_punctuation(url: &str, preceded_by: Option<char>) -> &str {
    // The bracket pair opened by the character just before the URL, if any.
    let pair = match preceded_by {
        Some('(') => Some(('(', ')')),
        Some('[') => Some(('[', ']')),
        Some('<') => Some(('<', '>')),
        _ => None,
    };

    // Closers minus openers inside the URL. A positive value means that many
    // trailing closers belong to the surrounding text, not to the URL.
    let mut surplus_closers: isize = match pair {
        Some((open, close)) => url.chars().fold(0, |acc, c| {
            if c == close {
                acc + 1
            } else if c == open {
                acc - 1
            } else {
                acc
            }
        }),
        None => 0,
    };

    let mut rest = url;
    while let Some(last) = rest.chars().next_back() {
        let is_general = matches!(last, '.' | ',' | ';' | ':' | '!' | '?' | '>');
        let is_unmatched_closer =
            surplus_closers > 0 && matches!(pair, Some((_, close)) if close == last);
        if is_general {
            // Sentence punctuation is never treated as part of the URL.
        } else if is_unmatched_closer {
            surplus_closers -= 1;
        } else {
            break;
        }
        rest = &rest[..rest.len() - last.len_utf8()];
    }
    rest
}
108
/// A URL found in a message body, paired with a `lid` value that follows it.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct LidCorrelation {
    /// The URL after query string and fragment have been stripped.
    pub url: String,
    /// The quoted value captured from the `| lid:` filter.
    pub value: String,
    /// Byte offset in the body where the URL's match begins. For HTML this is
    /// the start of the enclosing tag match; for plain text it is the start
    /// of the URL itself.
    pub url_offset: usize,
}
122
123pub fn extract_html_lid_values(body: &str) -> Vec<LidCorrelation> {
127 pair_urls_with_lids(href_iter(body), body)
128}
129
130pub fn extract_plaintext_lid_values(body: &str) -> Vec<LidCorrelation> {
133 pair_urls_with_lids(plaintext_url_iter(body), body)
134}
135
136pub fn extract_lid_values_unanchored(body: &str) -> Vec<String> {
140 lid_value_re()
141 .captures_iter(body)
142 .filter_map(|c| c.get(1).or(c.get(2)).map(|m| m.as_str().to_string()))
143 .collect()
144}
145
146fn href_iter(body: &str) -> Vec<(usize, String)> {
147 href_re()
148 .captures_iter(body)
149 .filter_map(|cap| {
150 let whole = cap.get(0)?;
151 let url = cap
152 .get(1)
153 .or(cap.get(2))
154 .map(|m| m.as_str())
155 .unwrap_or_default();
156 Some((whole.start(), normalize_url(url)))
157 })
158 .collect()
159}
160
161fn plaintext_url_iter(body: &str) -> Vec<(usize, String)> {
162 plaintext_url_re()
163 .find_iter(body)
164 .map(|m| {
165 let raw = m.as_str();
166 let preceded_by = if m.start() > 0 {
167 body[..m.start()].chars().last()
168 } else {
169 None
170 };
171 let trimmed = trim_trailing_punctuation(raw, preceded_by);
172 (m.start(), normalize_url(trimmed))
173 })
174 .collect()
175}
176
177fn pair_urls_with_lids(urls: Vec<(usize, String)>, body: &str) -> Vec<LidCorrelation> {
178 let lids: Vec<(usize, String)> = lid_value_re()
179 .captures_iter(body)
180 .filter_map(|cap| {
181 let whole = cap.get(0)?;
182 let value = cap.get(1).or(cap.get(2)).map(|m| m.as_str().to_string())?;
183 Some((whole.start(), value))
184 })
185 .collect();
186
187 let mut out = Vec::new();
188 for (i, (url_off, url)) in urls.iter().enumerate() {
189 let next_url_off = urls.get(i + 1).map(|(o, _)| *o).unwrap_or(body.len());
190 for (_, value) in lids
191 .iter()
192 .filter(|(off, _)| *off > *url_off && *off < next_url_off)
193 {
194 out.push(LidCorrelation {
195 url: url.clone(),
196 value: value.clone(),
197 url_offset: *url_off,
198 });
199 }
200 }
201 out
202}
203
/// A content-block include found in a message body, with its `id` value.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct CbIdCorrelation {
    /// The block name exactly as written inside `${...}`.
    pub name: String,
    /// The `cb<digits>` identifier captured from the `| id:` filter.
    pub value: String,
    /// Slug derived from `name` via `slug_for_cb_id`.
    pub key: String,
}
215
216pub fn extract_cb_id_values(body: &str) -> Vec<CbIdCorrelation> {
218 cb_id_include_re()
219 .captures_iter(body)
220 .filter_map(|cap| {
221 let name = cap.get(1)?.as_str().to_string();
222 let value = cap.get(2).or(cap.get(3)).map(|m| m.as_str().to_string())?;
223 let key = slug_for_cb_id(&name);
224 Some(CbIdCorrelation { name, value, key })
225 })
226 .collect()
227}
228
229pub fn slug_for_cb_id(name: &str) -> String {
234 let base = slug_core(name);
235 if base.is_empty() {
236 "cb".to_string()
237 } else if base.starts_with(|c: char| c.is_ascii_digit()) {
238 format!("cb_{base}")
239 } else {
240 base
241 }
242}
243
244pub fn slug_for_lid(source: &str) -> String {
249 let base = slug_core(source);
250 if base.is_empty() {
251 "link".to_string()
252 } else if base.starts_with(|c: char| c.is_ascii_digit()) {
253 format!("link_{base}")
254 } else {
255 base
256 }
257}
258
/// Lowercases `s` and reduces it to ASCII alphanumeric words joined by single
/// underscores.
///
/// Every run of characters that are not ASCII letters or digits (including
/// `_` and all non-ASCII text) acts as one separator. Leading and trailing
/// separators are dropped, so the result may be empty.
fn slug_core(s: &str) -> String {
    s.split(|c: char| !c.is_ascii_alphanumeric())
        .filter(|word| !word.is_empty())
        .map(str::to_ascii_lowercase)
        .collect::<Vec<_>>()
        .join("_")
}
281
#[cfg(test)]
mod tests {
    use super::*;

    /// Query strings and fragments are removed; bare URLs pass through.
    #[test]
    fn normalize_strips_query_and_fragment() {
        let cases = [
            ("https://example.com/x?utm=1", "https://example.com/x"),
            ("https://example.com/x#frag", "https://example.com/x"),
            ("https://example.com/x", "https://example.com/x"),
        ];
        for (input, expected) in cases {
            assert_eq!(normalize_url(input), expected);
        }
    }

    /// Each anchor is matched with the lid value that follows it.
    #[test]
    fn html_lid_pairs_each_anchor_with_following_value() {
        let body = r#"<p>
<a href="https://example.com/a">{{ x | lid: 'lidvalueaa1' }}A</a>
<a href="https://example.com/b">{{ x | lid: 'lidvaluebb2' }}B</a>
</p>"#;
        let pairs = extract_html_lid_values(body);
        let seen: Vec<(&str, &str)> = pairs
            .iter()
            .map(|p| (p.url.as_str(), p.value.as_str()))
            .collect();
        assert_eq!(
            seen,
            [
                ("https://example.com/a", "lidvalueaa1"),
                ("https://example.com/b", "lidvaluebb2"),
            ]
        );
    }

    /// An anchor with no lid before the next anchor yields no entry.
    #[test]
    fn html_lid_unpaired_anchor_is_skipped() {
        let body = r#"<a href="https://example.com/a">no lid here</a>
<a href="https://example.com/b">{{ x | lid: 'lidvaluebb2' }}B</a>"#;
        let pairs = extract_html_lid_values(body);
        let urls: Vec<&str> = pairs.iter().map(|p| p.url.as_str()).collect();
        assert_eq!(urls, ["https://example.com/b"]);
    }

    /// Single-quoted hrefs and double-quoted lids both work; the query is stripped.
    #[test]
    fn html_lid_handles_both_quote_styles_and_query_string() {
        let body = r#"<a href='https://example.com/x?utm=foo'>{{ x | lid: "lidvaluexyz1" }}X</a>"#;
        let pairs = extract_html_lid_values(body);
        let seen: Vec<(&str, &str)> = pairs
            .iter()
            .map(|p| (p.url.as_str(), p.value.as_str()))
            .collect();
        assert_eq!(seen, [("https://example.com/x", "lidvaluexyz1")]);
    }

    /// A closing paren that belongs to the surrounding text is trimmed.
    #[test]
    fn plaintext_lid_trims_trailing_punctuation() {
        let body = "Visit (https://example.com/cta) | lid: 'lidplain01a' for the deal.";
        let pairs = extract_plaintext_lid_values(body);
        let seen: Vec<(&str, &str)> = pairs
            .iter()
            .map(|p| (p.url.as_str(), p.value.as_str()))
            .collect();
        assert_eq!(seen, [("https://example.com/cta", "lidplain01a")]);
    }

    /// A sentence-ending period is not part of the URL.
    #[test]
    fn plaintext_lid_trims_sentence_period() {
        let body = "See https://example.com/end. | lid: 'lidplain02b'";
        let pairs = extract_plaintext_lid_values(body);
        let urls: Vec<&str> = pairs.iter().map(|p| p.url.as_str()).collect();
        assert_eq!(urls, ["https://example.com/end"]);
    }

    /// A single include yields its name, id, and slug key.
    #[test]
    fn cb_id_extracts_name_and_value() {
        let body = "before {{content_blocks.${promo_banner} | id: 'cb42'}} after";
        let pairs = extract_cb_id_values(body);
        let expected = CbIdCorrelation {
            name: "promo_banner".to_string(),
            value: "cb42".to_string(),
            key: "promo_banner".to_string(),
        };
        assert_eq!(pairs, [expected]);
    }

    /// Several includes in one body are all returned, in order.
    #[test]
    fn cb_id_handles_multiple_includes() {
        let body = "{{content_blocks.${alpha} | id: 'cb1'}} {{content_blocks.${beta} | id: 'cb2'}}";
        let pairs = extract_cb_id_values(body);
        let seen: Vec<(&str, &str)> = pairs
            .iter()
            .map(|p| (p.name.as_str(), p.value.as_str()))
            .collect();
        assert_eq!(seen, [("alpha", "cb1"), ("beta", "cb2")]);
        assert_eq!(pairs[0].key, "alpha");
    }

    /// Empty and digit-leading slugs get the `cb` fallback or prefix.
    #[test]
    fn cb_id_slug_uses_cb_prefix_for_empty_or_digit_start() {
        let cases = [
            ("2024_summer", "cb_2024_summer"),
            ("", "cb"),
            ("My Promo Banner", "my_promo_banner"),
            ("cb_promo_image", "cb_promo_image"),
        ];
        for (input, expected) in cases {
            assert_eq!(slug_for_cb_id(input), expected);
        }
    }

    /// Empty and digit-leading slugs get the `link` fallback or prefix.
    #[test]
    fn lid_slug_uses_link_prefix_for_empty_or_digit_start() {
        let cases = [
            ("/spring-sale", "spring_sale"),
            ("/", "link"),
            ("123", "link_123"),
            ("プロモ", "link"),
        ];
        for (input, expected) in cases {
            assert_eq!(slug_for_lid(input), expected);
        }
    }

    /// Runs of separators collapse to one underscore; leading ones vanish.
    #[test]
    fn slug_collapses_multiple_separators() {
        let cases = [("foo//bar--baz", "foo_bar_baz"), ("--leading", "leading")];
        for (input, expected) in cases {
            assert_eq!(slug_for_lid(input), expected);
        }
    }
}