braze_sync/values/
correlation.rs1use regex_lite::Regex;
22use std::sync::OnceLock;
23
/// Returns `url` cut off just before its first `?` or `#`, i.e. with any
/// query string and/or fragment removed.
///
/// A URL containing neither character is returned unchanged (as an owned
/// copy). No other normalization (case, trailing slash, ...) is performed.
pub fn normalize_url(url: &str) -> String {
    let base = match url.find(['?', '#']) {
        Some(cut) => &url[..cut],
        None => url,
    };
    base.to_owned()
}
34
35fn href_re() -> &'static Regex {
36 static RE: OnceLock<Regex> = OnceLock::new();
37 RE.get_or_init(|| {
38 Regex::new(r#"(?i)<a\b[^>]*?\bhref\s*=\s*(?:"([^"]*)"|'([^']*)')"#)
42 .expect("href regex is valid")
43 })
44}
45
46fn lid_value_re() -> &'static Regex {
47 static RE: OnceLock<Regex> = OnceLock::new();
48 RE.get_or_init(|| {
49 Regex::new(r#"\|\s*lid:\s*(?:"([a-z0-9]{8,})"|'([a-z0-9]{8,})')"#)
54 .expect("lid value regex is valid")
55 })
56}
57
58fn plaintext_url_re() -> &'static Regex {
59 static RE: OnceLock<Regex> = OnceLock::new();
60 RE.get_or_init(|| {
61 Regex::new(r#"https?://[^\s<>"']+"#).expect("plaintext URL regex is valid")
66 })
67}
68
69fn cb_id_include_re() -> &'static Regex {
70 static RE: OnceLock<Regex> = OnceLock::new();
71 RE.get_or_init(|| {
72 Regex::new(
78 r#"\{\{\s*content_blocks\.\$\{\s*([^\s}|]+)\s*\}\s*\|\s*id:\s*(?:"(cb[0-9]+)"|'(cb[0-9]+)')\s*\}\}"#,
79 )
80 .expect("cb_id include regex is valid")
81 })
82}
83
/// Strips trailing punctuation that a greedy plaintext URL match tends to
/// swallow, returning a sub-slice of `url`.
///
/// * The characters `.`, `,`, `;`, `:`, `!`, `?` and `>` are always removed
///   from the end.
/// * `preceded_by` is the character immediately before the URL in the
///   surrounding text. When it is an opening bracket (`(`, `[` or `<`), the
///   matching closing bracket is also removed from the end, but only as many
///   closers as are in surplus of the openers inside the URL itself. This
///   keeps URLs that legitimately contain balanced brackets intact:
///   `(https://en.wikipedia.org/wiki/Rust_(language))` yields
///   `https://en.wikipedia.org/wiki/Rust_(language)`, not `..._(language`.
///
/// Returns an empty slice if the input consists solely of trimmable
/// characters.
fn trim_trailing_punctuation(url: &str, preceded_by: Option<char>) -> &str {
    let pair = match preceded_by {
        Some('(') => Some(('(', ')')),
        Some('[') => Some(('[', ']')),
        Some('<') => Some(('<', '>')),
        _ => None,
    };
    let closer = pair.map(|(_, close)| close);
    // Closers beyond those balanced by an opener inside the URL are assumed
    // to belong to the bracket that wraps the URL in the surrounding text.
    let mut surplus_closers = pair.map_or(0, |(open, close)| {
        url.matches(close)
            .count()
            .saturating_sub(url.matches(open).count())
    });

    let mut end = url.len();
    while let Some(c) = url[..end].chars().next_back() {
        if surplus_closers > 0 && Some(c) == closer {
            surplus_closers -= 1;
        } else if !matches!(c, '.' | ',' | ';' | ':' | '!' | '?' | '>') {
            break;
        }
        end -= c.len_utf8();
    }
    &url[..end]
}
112
/// A URL found in a message body together with the `lid` value that was
/// paired with it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LidCorrelation {
    /// The URL, as produced by the extractor (query string and fragment
    /// already stripped by `normalize_url`).
    pub url: String,
    /// The captured `lid` value, without its surrounding quotes.
    pub value: String,
    /// Byte offset in the body at which the URL's match starts. For HTML
    /// bodies this is the start of the matched `<a` tag, for plaintext bodies
    /// the start of the URL itself.
    pub url_offset: usize,
}
126
127pub fn extract_html_lid_values(body: &str) -> Vec<LidCorrelation> {
131 pair_urls_with_lids(href_iter(body), body)
132}
133
134pub fn extract_plaintext_lid_values(body: &str) -> Vec<LidCorrelation> {
137 pair_urls_with_lids(plaintext_url_iter(body), body)
138}
139
140fn href_iter(body: &str) -> Vec<(usize, String)> {
141 href_re()
142 .captures_iter(body)
143 .filter_map(|cap| {
144 let whole = cap.get(0)?;
145 let url = cap
146 .get(1)
147 .or(cap.get(2))
148 .map(|m| m.as_str())
149 .unwrap_or_default();
150 Some((whole.start(), normalize_url(url)))
151 })
152 .collect()
153}
154
155fn plaintext_url_iter(body: &str) -> Vec<(usize, String)> {
156 plaintext_url_re()
157 .find_iter(body)
158 .map(|m| {
159 let raw = m.as_str();
160 let preceded_by = if m.start() > 0 {
161 body[..m.start()].chars().last()
162 } else {
163 None
164 };
165 let trimmed = trim_trailing_punctuation(raw, preceded_by);
166 (m.start(), normalize_url(trimmed))
167 })
168 .collect()
169}
170
171fn pair_urls_with_lids(urls: Vec<(usize, String)>, body: &str) -> Vec<LidCorrelation> {
172 let lids: Vec<(usize, String)> = lid_value_re()
173 .captures_iter(body)
174 .filter_map(|cap| {
175 let whole = cap.get(0)?;
176 let value = cap.get(1).or(cap.get(2)).map(|m| m.as_str().to_string())?;
177 Some((whole.start(), value))
178 })
179 .collect();
180
181 let mut out = Vec::new();
182 for (i, (url_off, url)) in urls.iter().enumerate() {
183 let next_url_off = urls.get(i + 1).map(|(o, _)| *o).unwrap_or(body.len());
184 if let Some((_, value)) = lids
185 .iter()
186 .find(|(off, _)| *off > *url_off && *off < next_url_off)
187 {
188 out.push(LidCorrelation {
189 url: url.clone(),
190 value: value.clone(),
191 url_offset: *url_off,
192 });
193 }
194 }
195 out
196}
197
/// A content-block include found in a body together with its `id` value.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CbIdCorrelation {
    /// Content-block name exactly as written inside `${...}`.
    pub name: String,
    /// The captured id (`cb` followed by digits), without its quotes.
    pub value: String,
    /// Slug derived from `name` via `slug_for_cb_id`.
    pub key: String,
}
209
210pub fn extract_cb_id_values(body: &str) -> Vec<CbIdCorrelation> {
212 cb_id_include_re()
213 .captures_iter(body)
214 .filter_map(|cap| {
215 let name = cap.get(1)?.as_str().to_string();
216 let value = cap.get(2).or(cap.get(3)).map(|m| m.as_str().to_string())?;
217 let key = slug_for_cb_id(&name);
218 Some(CbIdCorrelation { name, value, key })
219 })
220 .collect()
221}
222
223pub fn slug_for_cb_id(name: &str) -> String {
232 let base = slug_core(name);
233 if base.is_empty() {
234 "cb".to_string()
235 } else if base.starts_with(|c: char| c.is_ascii_digit()) {
236 format!("cb_{base}")
237 } else {
238 base
239 }
240}
241
242pub fn slug_for_lid(source: &str) -> String {
247 let base = slug_core(source);
248 if base.is_empty() {
249 "link".to_string()
250 } else if base.starts_with(|c: char| c.is_ascii_digit()) {
251 format!("link_{base}")
252 } else {
253 base
254 }
255}
256
/// Lower-cases ASCII letters and digits and replaces every run of other
/// characters (including non-ASCII ones) with a single `_`, then strips
/// leading and trailing underscores.
///
/// Returns an empty string when `s` contains no ASCII alphanumerics.
fn slug_core(s: &str) -> String {
    let mut slug = String::with_capacity(s.len());
    for ch in s.chars() {
        if ch.is_ascii_alphanumeric() {
            slug.push(ch.to_ascii_lowercase());
        } else if !slug.ends_with('_') {
            // First separator of a run; subsequent ones are collapsed.
            slug.push('_');
        }
    }
    slug.trim_matches('_').to_string()
}
279
#[cfg(test)]
mod tests {
    use super::*;

    /// Query strings and fragments are removed; a bare URL is left alone.
    #[test]
    fn normalize_strips_query_and_fragment() {
        let cases = [
            ("https://example.com/x?utm=1", "https://example.com/x"),
            ("https://example.com/x#frag", "https://example.com/x"),
            ("https://example.com/x", "https://example.com/x"),
        ];
        for (input, expected) in cases {
            assert_eq!(normalize_url(input), expected);
        }
    }

    /// Two anchors, each followed by its own `lid`, yield two pairs in order.
    #[test]
    fn html_lid_pairs_each_anchor_with_following_value() {
        let body = r#"<p>
<a href="https://example.com/a">{{ x | lid: 'lidvalueaa1' }}A</a>
<a href="https://example.com/b">{{ x | lid: 'lidvaluebb2' }}B</a>
</p>"#;
        let found = extract_html_lid_values(body);
        let got: Vec<(&str, &str)> = found
            .iter()
            .map(|c| (c.url.as_str(), c.value.as_str()))
            .collect();
        assert_eq!(
            got,
            [
                ("https://example.com/a", "lidvalueaa1"),
                ("https://example.com/b", "lidvaluebb2"),
            ]
        );
    }

    /// An anchor without a `lid` before the next anchor produces no pair.
    #[test]
    fn html_lid_unpaired_anchor_is_skipped() {
        let body = r#"<a href="https://example.com/a">no lid here</a>
<a href="https://example.com/b">{{ x | lid: 'lidvaluebb2' }}B</a>"#;
        let found = extract_html_lid_values(body);
        let urls: Vec<&str> = found.iter().map(|c| c.url.as_str()).collect();
        assert_eq!(urls, ["https://example.com/b"]);
    }

    /// Single-quoted `href`, double-quoted `lid`, and a query string to strip.
    #[test]
    fn html_lid_handles_both_quote_styles_and_query_string() {
        let body = r#"<a href='https://example.com/x?utm=foo'>{{ x | lid: "lidvaluexyz1" }}X</a>"#;
        let found = extract_html_lid_values(body);
        let got: Vec<(&str, &str)> = found
            .iter()
            .map(|c| (c.url.as_str(), c.value.as_str()))
            .collect();
        assert_eq!(got, [("https://example.com/x", "lidvaluexyz1")]);
    }

    /// A URL wrapped in parentheses loses the closing parenthesis.
    #[test]
    fn plaintext_lid_trims_trailing_punctuation() {
        let body = "Visit (https://example.com/cta) | lid: 'lidplain01a' for the deal.";
        let found = extract_plaintext_lid_values(body);
        let got: Vec<(&str, &str)> = found
            .iter()
            .map(|c| (c.url.as_str(), c.value.as_str()))
            .collect();
        assert_eq!(got, [("https://example.com/cta", "lidplain01a")]);
    }

    /// A sentence-ending period is not part of the URL.
    #[test]
    fn plaintext_lid_trims_sentence_period() {
        let body = "See https://example.com/end. | lid: 'lidplain02b'";
        let found = extract_plaintext_lid_values(body);
        let urls: Vec<&str> = found.iter().map(|c| c.url.as_str()).collect();
        assert_eq!(urls, ["https://example.com/end"]);
    }

    /// A single include yields its name, id and slug key.
    #[test]
    fn cb_id_extracts_name_and_value() {
        let body = "before {{content_blocks.${promo_banner} | id: 'cb42'}} after";
        let found = extract_cb_id_values(body);
        assert_eq!(found.len(), 1);
        let only = &found[0];
        assert_eq!(only.name, "promo_banner");
        assert_eq!(only.value, "cb42");
        assert_eq!(only.key, "promo_banner");
    }

    /// Several includes in one body are all reported, in order.
    #[test]
    fn cb_id_handles_multiple_includes() {
        let body = "{{content_blocks.${alpha} | id: 'cb1'}} {{content_blocks.${beta} | id: 'cb2'}}";
        let found = extract_cb_id_values(body);
        assert_eq!(found.len(), 2);
        let (first, second) = (&found[0], &found[1]);
        assert_eq!(first.name, "alpha");
        assert_eq!(first.value, "cb1");
        assert_eq!(first.key, "alpha");
        assert_eq!(second.name, "beta");
        assert_eq!(second.value, "cb2");
    }

    /// Empty and digit-leading slugs get the `cb` fallback/prefix.
    #[test]
    fn cb_id_slug_uses_cb_prefix_for_empty_or_digit_start() {
        let cases = [
            ("2024_summer", "cb_2024_summer"),
            ("", "cb"),
            ("My Promo Banner", "my_promo_banner"),
            ("cb_promo_image", "cb_promo_image"),
        ];
        for (input, expected) in cases {
            assert_eq!(slug_for_cb_id(input), expected);
        }
    }

    /// Empty and digit-leading slugs get the `link` fallback/prefix.
    #[test]
    fn lid_slug_uses_link_prefix_for_empty_or_digit_start() {
        let cases = [
            ("/spring-sale", "spring_sale"),
            ("/", "link"),
            ("123", "link_123"),
            ("プロモ", "link"),
        ];
        for (input, expected) in cases {
            assert_eq!(slug_for_lid(input), expected);
        }
    }

    /// Runs of separators collapse to one underscore; edges are trimmed.
    #[test]
    fn slug_collapses_multiple_separators() {
        let cases = [("foo//bar--baz", "foo_bar_baz"), ("--leading", "leading")];
        for (input, expected) in cases {
            assert_eq!(slug_for_lid(input), expected);
        }
    }
}