braze_sync/values/
correlation.rs1use regex_lite::Regex;
22use std::sync::OnceLock;
23
/// Returns `url` with any query string and fragment removed.
///
/// Everything from the first `?` or `#` onward is dropped. A URL containing
/// neither character is returned unchanged (as an owned copy).
pub fn normalize_url(url: &str) -> String {
    let stop = url.find(['?', '#']).unwrap_or(url.len());
    url[..stop].to_string()
}
34
35fn href_re() -> &'static Regex {
36 static RE: OnceLock<Regex> = OnceLock::new();
37 RE.get_or_init(|| {
38 Regex::new(r#"(?i)<a\b[^>]*?\bhref\s*=\s*(?:"([^"]*)"|'([^']*)')"#)
42 .expect("href regex is valid")
43 })
44}
45
46fn lid_value_re() -> &'static Regex {
47 static RE: OnceLock<Regex> = OnceLock::new();
48 RE.get_or_init(|| {
49 Regex::new(r#"\|\s*lid:\s*(?:"([a-z0-9]{8,})"|'([a-z0-9]{8,})')"#)
54 .expect("lid value regex is valid")
55 })
56}
57
58fn plaintext_url_re() -> &'static Regex {
59 static RE: OnceLock<Regex> = OnceLock::new();
60 RE.get_or_init(|| {
61 Regex::new(r#"https?://[^\s<>"']+"#).expect("plaintext URL regex is valid")
66 })
67}
68
69fn cb_id_include_re() -> &'static Regex {
70 static RE: OnceLock<Regex> = OnceLock::new();
71 RE.get_or_init(|| {
72 Regex::new(
78 r#"\{\{\s*content_blocks\.\$\{\s*([^\s}|]+)\s*\}\s*\|\s*id:\s*(?:"(cb[0-9]+)"|'(cb[0-9]+)')\s*\}\}"#,
79 )
80 .expect("cb_id include regex is valid")
81 })
82}
83
/// Strips trailing punctuation that a greedy URL match picked up from the
/// surrounding prose.
///
/// * Sentence punctuation (`.`, `,`, `;`, `:`, `!`, `?`, `>`) is always removed
///   from the end.
/// * If the URL was immediately preceded by an opening `(`, `[` or `<`
///   (`preceded_by`), the matching closer is removed from the end too, but
///   only while the URL holds more closers than openers. A closer that
///   balances an opener inside the URL (e.g. `.../Rust_(language)`) belongs to
///   the URL and is kept.
///
/// Returns a sub-slice of `url`; it may be empty if `url` is all punctuation.
fn trim_trailing_punctuation(url: &str, preceded_by: Option<char>) -> &str {
    let pair = match preceded_by {
        Some('(') => Some(('(', ')')),
        Some('[') => Some(('[', ']')),
        Some('<') => Some(('<', '>')),
        _ => None,
    };
    let closer = pair.map(|(_, close)| close);
    // Closers with no opener inside the URL: these are the ones that pair
    // with the bracket sitting in front of the URL in the surrounding text.
    let mut surplus = pair.map_or(0, |(open, close)| {
        url.matches(close)
            .count()
            .saturating_sub(url.matches(open).count())
    });

    let mut rest = url;
    while let Some(c) = rest.chars().next_back() {
        let is_sentence_punct = matches!(c, '.' | ',' | ';' | ':' | '!' | '?' | '>');
        let is_unmatched_closer = !is_sentence_punct && Some(c) == closer && surplus > 0;
        if !(is_sentence_punct || is_unmatched_closer) {
            break;
        }
        if is_unmatched_closer {
            surplus -= 1;
        }
        // `c` is the last char of `rest`, so this lands on a char boundary.
        rest = &rest[..rest.len() - c.len_utf8()];
    }
    rest
}
112
/// A URL found in a message body together with the `lid` value that follows it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LidCorrelation {
    /// The URL with its query string and fragment stripped.
    pub url: String,
    /// The quoted value captured from the `| lid: '...'` expression.
    pub value: String,
    /// Byte offset in the body where the URL match (for HTML, the `<a` tag) starts.
    pub url_offset: usize,
}
126
127pub fn extract_html_lid_values(body: &str) -> Vec<LidCorrelation> {
131 pair_urls_with_lids(href_iter(body), body)
132}
133
134pub fn extract_plaintext_lid_values(body: &str) -> Vec<LidCorrelation> {
137 pair_urls_with_lids(plaintext_url_iter(body), body)
138}
139
140fn href_iter(body: &str) -> Vec<(usize, String)> {
141 href_re()
142 .captures_iter(body)
143 .filter_map(|cap| {
144 let whole = cap.get(0)?;
145 let url = cap
146 .get(1)
147 .or(cap.get(2))
148 .map(|m| m.as_str())
149 .unwrap_or_default();
150 Some((whole.start(), normalize_url(url)))
151 })
152 .collect()
153}
154
155fn plaintext_url_iter(body: &str) -> Vec<(usize, String)> {
156 plaintext_url_re()
157 .find_iter(body)
158 .map(|m| {
159 let raw = m.as_str();
160 let preceded_by = if m.start() > 0 {
161 body[..m.start()].chars().last()
162 } else {
163 None
164 };
165 let trimmed = trim_trailing_punctuation(raw, preceded_by);
166 (m.start(), normalize_url(trimmed))
167 })
168 .collect()
169}
170
171fn pair_urls_with_lids(urls: Vec<(usize, String)>, body: &str) -> Vec<LidCorrelation> {
172 let lids: Vec<(usize, String)> = lid_value_re()
173 .captures_iter(body)
174 .filter_map(|cap| {
175 let whole = cap.get(0)?;
176 let value = cap.get(1).or(cap.get(2)).map(|m| m.as_str().to_string())?;
177 Some((whole.start(), value))
178 })
179 .collect();
180
181 let mut out = Vec::new();
182 for (i, (url_off, url)) in urls.iter().enumerate() {
183 let next_url_off = urls.get(i + 1).map(|(o, _)| *o).unwrap_or(body.len());
184 if let Some((_, value)) = lids
185 .iter()
186 .find(|(off, _)| *off > *url_off && *off < next_url_off)
187 {
188 out.push(LidCorrelation {
189 url: url.clone(),
190 value: value.clone(),
191 url_offset: *url_off,
192 });
193 }
194 }
195 out
196}
197
/// A content-block include found in a body together with its `id` value.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CbIdCorrelation {
    /// The block name exactly as written inside `${...}`.
    pub name: String,
    /// The quoted `cb<digits>` value captured from the `| id: '...'` expression.
    pub value: String,
    /// Slug derived from `name` via `slug_for_cb_id`.
    pub key: String,
}
209
210pub fn extract_cb_id_values(body: &str) -> Vec<CbIdCorrelation> {
212 cb_id_include_re()
213 .captures_iter(body)
214 .filter_map(|cap| {
215 let name = cap.get(1)?.as_str().to_string();
216 let value = cap.get(2).or(cap.get(3)).map(|m| m.as_str().to_string())?;
217 let key = slug_for_cb_id(&name);
218 Some(CbIdCorrelation { name, value, key })
219 })
220 .collect()
221}
222
223pub fn slug_for_cb_id(name: &str) -> String {
225 let base = slug_core(name);
226 if base.is_empty() || base.starts_with(|c: char| c.is_ascii_digit()) {
227 format!("cb_{base}")
228 } else {
229 base
230 }
231}
232
233pub fn slug_for_lid(source: &str) -> String {
237 let base = slug_core(source);
238 if base.is_empty() || base.starts_with(|c: char| c.is_ascii_digit()) {
239 format!("link_{base}")
240 } else {
241 base
242 }
243}
244
/// Lowercases ASCII letters and digits and replaces every run of other
/// characters (including non-ASCII ones and literal `_`) with a single `_`.
///
/// The result never starts or ends with `_`; it is empty when `s` contains no
/// ASCII alphanumerics.
fn slug_core(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    for ch in s.chars() {
        if ch.is_ascii_alphanumeric() {
            out.push(ch.to_ascii_lowercase());
        } else if !out.is_empty() && !out.ends_with('_') {
            // One separator per run, and none before the first real character.
            out.push('_');
        }
    }
    // At most one separator can be left dangling at the end.
    if out.ends_with('_') {
        out.pop();
    }
    out
}
267
#[cfg(test)]
mod tests {
    use super::*;

    // Query strings and fragments are dropped; plain URLs pass through.
    #[test]
    fn normalize_strips_query_and_fragment() {
        assert_eq!(
            normalize_url("https://example.com/x?utm=1"),
            "https://example.com/x"
        );
        assert_eq!(
            normalize_url("https://example.com/x#frag"),
            "https://example.com/x"
        );
        assert_eq!(
            normalize_url("https://example.com/x"),
            "https://example.com/x"
        );
    }

    // Each anchor is paired with the lid value that follows it.
    #[test]
    fn html_lid_pairs_each_anchor_with_following_value() {
        let body = r#"<p>
<a href="https://example.com/a">{{ x | lid: 'lidvalueaa1' }}A</a>
<a href="https://example.com/b">{{ x | lid: 'lidvaluebb2' }}B</a>
</p>"#;
        let pairs = extract_html_lid_values(body);
        assert_eq!(pairs.len(), 2);
        assert_eq!(pairs[0].url, "https://example.com/a");
        assert_eq!(pairs[0].value, "lidvalueaa1");
        assert_eq!(pairs[1].url, "https://example.com/b");
        assert_eq!(pairs[1].value, "lidvaluebb2");
    }

    // An anchor with no lid before the next anchor produces no entry.
    #[test]
    fn html_lid_unpaired_anchor_is_skipped() {
        let body = r#"<a href="https://example.com/a">no lid here</a>
<a href="https://example.com/b">{{ x | lid: 'lidvaluebb2' }}B</a>"#;
        let pairs = extract_html_lid_values(body);
        assert_eq!(pairs.len(), 1);
        assert_eq!(pairs[0].url, "https://example.com/b");
    }

    // Single-quoted href, double-quoted lid, and a query string to strip.
    #[test]
    fn html_lid_handles_both_quote_styles_and_query_string() {
        let body = r#"<a href='https://example.com/x?utm=foo'>{{ x | lid: "lidvaluexyz1" }}X</a>"#;
        let pairs = extract_html_lid_values(body);
        assert_eq!(pairs.len(), 1);
        assert_eq!(pairs[0].url, "https://example.com/x");
        assert_eq!(pairs[0].value, "lidvaluexyz1");
    }

    // A `)` closing a parenthesis opened right before the URL is trimmed.
    #[test]
    fn plaintext_lid_trims_trailing_punctuation() {
        let body = "Visit (https://example.com/cta) | lid: 'lidplain01a' for the deal.";
        let pairs = extract_plaintext_lid_values(body);
        assert_eq!(pairs.len(), 1);
        assert_eq!(pairs[0].url, "https://example.com/cta");
        assert_eq!(pairs[0].value, "lidplain01a");
    }

    // A sentence-ending period is not part of the URL.
    #[test]
    fn plaintext_lid_trims_sentence_period() {
        let body = "See https://example.com/end. | lid: 'lidplain02b'";
        let pairs = extract_plaintext_lid_values(body);
        assert_eq!(pairs.len(), 1);
        assert_eq!(pairs[0].url, "https://example.com/end");
    }

    #[test]
    fn cb_id_extracts_name_and_value() {
        let body = "before {{content_blocks.${promo_banner} | id: 'cb42'}} after";
        let pairs = extract_cb_id_values(body);
        assert_eq!(pairs.len(), 1);
        assert_eq!(pairs[0].name, "promo_banner");
        assert_eq!(pairs[0].value, "cb42");
        assert_eq!(pairs[0].key, "promo_banner");
    }

    #[test]
    fn cb_id_handles_multiple_includes() {
        let body = "{{content_blocks.${alpha} | id: 'cb1'}} {{content_blocks.${beta} | id: 'cb2'}}";
        let pairs = extract_cb_id_values(body);
        assert_eq!(pairs.len(), 2);
        assert_eq!(pairs[0].name, "alpha");
        assert_eq!(pairs[0].value, "cb1");
        assert_eq!(pairs[0].key, "alpha");
        assert_eq!(pairs[1].name, "beta");
        assert_eq!(pairs[1].value, "cb2");
    }

    #[test]
    fn cb_id_slug_uses_cb_prefix_for_empty_or_digit_start() {
        assert_eq!(slug_for_cb_id("2024_summer"), "cb_2024_summer");
        assert_eq!(slug_for_cb_id(""), "cb_");
        assert_eq!(slug_for_cb_id("My Promo Banner"), "my_promo_banner");
        assert_eq!(slug_for_cb_id("cb_promo_image"), "cb_promo_image");
    }

    #[test]
    fn lid_slug_uses_link_prefix_for_empty_or_digit_start() {
        assert_eq!(slug_for_lid("/spring-sale"), "spring_sale");
        assert_eq!(slug_for_lid("/"), "link_");
        assert_eq!(slug_for_lid("123"), "link_123");
        // Non-ASCII input has no ASCII alphanumerics, so the slug core is empty.
        assert_eq!(slug_for_lid("プロモ"), "link_");
    }

    #[test]
    fn slug_collapses_multiple_separators() {
        assert_eq!(slug_for_lid("foo//bar--baz"), "foo_bar_baz");
        assert_eq!(slug_for_lid("--leading"), "leading");
    }
}