use regex_lite::Regex;
use std::sync::OnceLock;
/// Returns `url` with its query string and fragment removed.
///
/// Everything from the first `?` or `#` (whichever comes first) to the end of
/// the input is dropped. A URL containing neither character is returned
/// unchanged, as an owned copy.
pub fn normalize_url(url: &str) -> String {
    // `split` always yields at least one piece, so the fallback is never
    // actually taken; it only avoids an `unwrap`.
    url.split(['?', '#']).next().unwrap_or(url).to_owned()
}
/// Returns the shared regex that finds `<a ... href=...>` attributes.
///
/// The pattern is case-insensitive and accepts a double- or single-quoted
/// attribute value: capture group 1 holds a double-quoted URL, group 2 a
/// single-quoted one. The regex is compiled on first use and reused afterwards.
fn href_re() -> &'static Regex {
    const PATTERN: &str = r#"(?i)<a\b[^>]*?\bhref\s*=\s*(?:"([^"]*)"|'([^']*)')"#;
    static COMPILED: OnceLock<Regex> = OnceLock::new();
    COMPILED.get_or_init(|| Regex::new(PATTERN).expect("href regex is valid"))
}
/// Returns the shared regex that finds `| lid: "<value>"` filter arguments.
///
/// The value must be at least eight characters drawn from `a-z` and `0-9`.
/// Capture group 1 holds a double-quoted value, group 2 a single-quoted one.
/// The regex is compiled on first use and reused afterwards.
fn lid_value_re() -> &'static Regex {
    const PATTERN: &str = r#"\|\s*lid:\s*(?:"([a-z0-9]{8,})"|'([a-z0-9]{8,})')"#;
    static COMPILED: OnceLock<Regex> = OnceLock::new();
    COMPILED.get_or_init(|| Regex::new(PATTERN).expect("lid value regex is valid"))
}
/// Returns the shared regex that finds bare `http://` / `https://` URLs.
///
/// A match runs from the scheme up to (but not including) the first
/// whitespace, `<`, `>`, `"` or `'`, so it may still end in sentence
/// punctuation. The regex is compiled on first use and reused afterwards.
fn plaintext_url_re() -> &'static Regex {
    const PATTERN: &str = r#"https?://[^\s<>"']+"#;
    static COMPILED: OnceLock<Regex> = OnceLock::new();
    COMPILED.get_or_init(|| Regex::new(PATTERN).expect("plaintext URL regex is valid"))
}
/// Returns the shared regex that finds content-block includes carrying an id,
/// i.e. `{{content_blocks.${NAME} | id: 'cbN'}}`.
///
/// Capture group 1 is the block name (a run without whitespace, `}` or `|`).
/// The id is `cb` followed by digits and lands in group 2 when double-quoted
/// or group 3 when single-quoted. The regex is compiled on first use and
/// reused afterwards.
fn cb_id_include_re() -> &'static Regex {
    const PATTERN: &str = r#"\{\{\s*content_blocks\.\$\{\s*([^\s}|]+)\s*\}\s*\|\s*id:\s*(?:"(cb[0-9]+)"|'(cb[0-9]+)')\s*\}\}"#;
    static COMPILED: OnceLock<Regex> = OnceLock::new();
    COMPILED.get_or_init(|| Regex::new(PATTERN).expect("cb_id include regex is valid"))
}
/// Strips sentence punctuation, and any unmatched wrapper-closing bracket,
/// from the end of a URL found in running text.
///
/// * `url` - the raw URL match, possibly ending in punctuation that belongs
///   to the surrounding sentence rather than to the URL.
/// * `preceded_by` - the character immediately before the URL in the text, if
///   any. When it is `(`, `[` or `<`, the URL is treated as wrapped and the
///   corresponding closer (`)`, `]`, `>`) becomes strippable.
///
/// The characters `. , ; : ! ? >` are always removed from the end. A wrapper
/// closer is removed only while the URL contains more closers than openers,
/// so brackets that are balanced inside the URL itself are kept
/// (`(https://en.wikipedia.org/wiki/Rust_(language))` keeps `_(language)`).
///
/// Returns a sub-slice of `url`; nothing is allocated.
fn trim_trailing_punctuation(url: &str, preceded_by: Option<char>) -> &str {
    let wrapper = match preceded_by {
        Some('(') => Some(('(', ')')),
        Some('[') => Some(('[', ']')),
        Some('<') => Some(('<', '>')),
        _ => None,
    };
    let closer = wrapper.map(|(_, close)| close);

    // Number of closers that have no opener inside the URL. Only these can
    // belong to the surrounding text. Openers are never trimmed (the loop
    // stops on them), so counting once up front is enough.
    let mut unmatched_closers = wrapper.map_or(0, |(open, close)| {
        let opens = url.matches(open).count();
        let closes = url.matches(close).count();
        closes.saturating_sub(opens)
    });

    let mut rest = url;
    while let Some(last) = rest.chars().next_back() {
        if matches!(last, '.' | ',' | ';' | ':' | '!' | '?' | '>') {
            // Sentence punctuation never ends a URL we want to keep.
        } else if Some(last) == closer && unmatched_closers > 0 {
            unmatched_closers -= 1;
        } else {
            break;
        }
        // Step back by the character's encoded width to stay on a char boundary.
        rest = &rest[..rest.len() - last.len_utf8()];
    }
    rest
}
/// A URL paired with the `lid` value that follows it in the same body.
///
/// Instances are produced by `pair_urls_with_lids`, which matches each URL
/// with the first `| lid: ...` occurrence located after it and before the
/// next URL.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LidCorrelation {
/// The URL as yielded by the URL scanner; both scanners in this file pass it
/// through `normalize_url`, so it carries no query string or fragment.
pub url: String,
/// The quoted value captured from the `lid:` filter argument.
pub value: String,
/// Byte offset in the scanned body at which the URL match starts (for HTML
/// input this is the start of the `<a` tag, not of the `href` value).
pub url_offset: usize,
}
/// Pairs every `<a href=...>` URL in an HTML `body` with the `lid` value that
/// follows it.
///
/// Anchors with no `lid` between them and the next anchor (or the end of the
/// body) are omitted from the result.
pub fn extract_html_lid_values(body: &str) -> Vec<LidCorrelation> {
    let anchors = href_iter(body);
    pair_urls_with_lids(anchors, body)
}
/// Pairs every bare `http(s)://` URL in a plaintext `body` with the `lid`
/// value that follows it.
///
/// URLs with no `lid` between them and the next URL (or the end of the body)
/// are omitted from the result.
pub fn extract_plaintext_lid_values(body: &str) -> Vec<LidCorrelation> {
    let links = plaintext_url_iter(body);
    pair_urls_with_lids(links, body)
}
/// Collects `(offset, url)` for every anchor `href` in `body`, in document order.
///
/// `offset` is the byte position where the `<a` tag match begins; `url` is the
/// attribute value after `normalize_url`.
fn href_iter(body: &str) -> Vec<(usize, String)> {
    let mut found = Vec::new();
    for caps in href_re().captures_iter(body) {
        let Some(anchor) = caps.get(0) else {
            continue;
        };
        // Group 1 is the double-quoted form, group 2 the single-quoted one.
        // If neither took part in the match, fall back to an empty URL.
        let raw = caps
            .get(1)
            .or_else(|| caps.get(2))
            .map_or("", |value| value.as_str());
        found.push((anchor.start(), normalize_url(raw)));
    }
    found
}
/// Collects `(offset, url)` for every bare URL in `body`, in document order.
///
/// `offset` is the byte position where the raw match begins. `url` has its
/// trailing punctuation trimmed first and is then passed through
/// `normalize_url`.
fn plaintext_url_iter(body: &str) -> Vec<(usize, String)> {
    let mut found = Vec::new();
    for hit in plaintext_url_re().find_iter(body) {
        let start = hit.start();
        // The character just before the URL tells the trimmer whether the URL
        // is wrapped in brackets. An empty prefix yields `None`.
        let before = body[..start].chars().next_back();
        let cleaned = trim_trailing_punctuation(hit.as_str(), before);
        found.push((start, normalize_url(cleaned)));
    }
    found
}
/// Matches each URL with the first `lid` value that appears after it and
/// before the next URL.
///
/// * `urls` - `(byte offset, url)` pairs in the order they occur in `body`.
/// * `body` - the text the offsets refer to; it is scanned for `| lid: ...`.
///
/// A URL's search window runs from just after its own offset up to the next
/// URL's offset, or to the end of `body` for the last URL. URLs whose window
/// contains no `lid` are left out of the result.
fn pair_urls_with_lids(urls: Vec<(usize, String)>, body: &str) -> Vec<LidCorrelation> {
    // Every `lid` occurrence as (offset of the whole `| lid: ...` match, value).
    let mut lids: Vec<(usize, String)> = Vec::new();
    for caps in lid_value_re().captures_iter(body) {
        let Some(whole) = caps.get(0) else {
            continue;
        };
        let Some(quoted) = caps.get(1).or_else(|| caps.get(2)) else {
            continue;
        };
        lids.push((whole.start(), quoted.as_str().to_string()));
    }

    // Exclusive upper bound of each URL's window: the following URL's offset,
    // with the body length closing the final window.
    let upper_bounds = urls
        .iter()
        .skip(1)
        .map(|(offset, _)| *offset)
        .chain(std::iter::once(body.len()));

    urls.iter()
        .zip(upper_bounds)
        .filter_map(|((url_offset, url), upper)| {
            let (_, value) = lids
                .iter()
                .find(|(lid_offset, _)| url_offset < lid_offset && *lid_offset < upper)?;
            Some(LidCorrelation {
                url: url.clone(),
                value: value.clone(),
                url_offset: *url_offset,
            })
        })
        .collect()
}
/// A content-block include together with the id attached to it.
///
/// Instances are produced by `extract_cb_id_values` from
/// `{{content_blocks.${NAME} | id: 'cbN'}}` occurrences.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CbIdCorrelation {
/// The content-block name exactly as written inside `${...}`.
pub name: String,
/// The quoted id value (`cb` followed by digits).
pub value: String,
/// Slug derived from `name` via `slug_for_cb_id`.
pub key: String,
}
/// Finds every content-block include in `body` that carries an `id:` filter.
///
/// Returns one [`CbIdCorrelation`] per match, in document order, holding the
/// block name, the id value and a slug key derived from the name.
pub fn extract_cb_id_values(body: &str) -> Vec<CbIdCorrelation> {
    let mut found = Vec::new();
    for caps in cb_id_include_re().captures_iter(body) {
        let Some(block) = caps.get(1) else {
            continue;
        };
        // Group 2 is the double-quoted id, group 3 the single-quoted one.
        let Some(id) = caps.get(2).or_else(|| caps.get(3)) else {
            continue;
        };
        let name = block.as_str().to_string();
        let value = id.as_str().to_string();
        let key = slug_for_cb_id(&name);
        found.push(CbIdCorrelation { name, value, key });
    }
    found
}
/// Builds a slug key for a content-block name.
///
/// The name is slugified with `slug_core`. If the result is empty or begins
/// with an ASCII digit it is prefixed with `cb_` (so an empty slug becomes
/// `"cb_"`); otherwise it is returned as is.
pub fn slug_for_cb_id(name: &str) -> String {
    let slug = slug_core(name);
    match slug.chars().next() {
        Some(first) if !first.is_ascii_digit() => slug,
        // Empty, or starts with a digit.
        _ => format!("cb_{slug}"),
    }
}
/// Builds a slug key for a link from `source`.
///
/// The input is slugified with `slug_core`. If the result is empty or begins
/// with an ASCII digit it is prefixed with `link_` (so an empty slug becomes
/// `"link_"`); otherwise it is returned as is.
pub fn slug_for_lid(source: &str) -> String {
    let slug = slug_core(source);
    match slug.chars().next() {
        Some(first) if !first.is_ascii_digit() => slug,
        // Empty, or starts with a digit.
        _ => format!("link_{slug}"),
    }
}
/// Lower-cases `s` and reduces it to ASCII alphanumeric words joined by `_`.
///
/// Every run of characters that are not ASCII letters or digits (including
/// `_` itself and all non-ASCII text) acts as a single separator. Leading and
/// trailing separators produce nothing, so the result never starts or ends
/// with `_` and is empty when `s` has no ASCII alphanumerics.
fn slug_core(s: &str) -> String {
    s.split(|ch: char| !ch.is_ascii_alphanumeric())
        // Adjacent, leading and trailing separators show up as empty pieces.
        .filter(|word| !word.is_empty())
        .map(|word| word.to_ascii_lowercase())
        .collect::<Vec<String>>()
        .join("_")
}
// Unit tests for URL normalization, lid/URL pairing, content-block id
// extraction and slug generation.
#[cfg(test)]
mod tests {
use super::*;
// A query string or a fragment is removed; a URL with neither is unchanged.
#[test]
fn normalize_strips_query_and_fragment() {
assert_eq!(
normalize_url("https://example.com/x?utm=1"),
"https://example.com/x"
);
assert_eq!(
normalize_url("https://example.com/x#frag"),
"https://example.com/x"
);
assert_eq!(
normalize_url("https://example.com/x"),
"https://example.com/x"
);
}
// Two anchors, each followed by its own lid, yield two pairs in document order.
#[test]
fn html_lid_pairs_each_anchor_with_following_value() {
let body = r#"<p>
<a href="https://example.com/a">{{ x | lid: 'lidvalueaa1' }}A</a>
<a href="https://example.com/b">{{ x | lid: 'lidvaluebb2' }}B</a>
</p>"#;
let pairs = extract_html_lid_values(body);
assert_eq!(pairs.len(), 2);
assert_eq!(pairs[0].url, "https://example.com/a");
assert_eq!(pairs[0].value, "lidvalueaa1");
assert_eq!(pairs[1].url, "https://example.com/b");
assert_eq!(pairs[1].value, "lidvaluebb2");
}
// An anchor with no lid before the next anchor must not borrow that
// anchor's lid; it is dropped from the result instead.
#[test]
fn html_lid_unpaired_anchor_is_skipped() {
let body = r#"<a href="https://example.com/a">no lid here</a>
<a href="https://example.com/b">{{ x | lid: 'lidvaluebb2' }}B</a>"#;
let pairs = extract_html_lid_values(body);
assert_eq!(pairs.len(), 1);
assert_eq!(pairs[0].url, "https://example.com/b");
}
// Single-quoted href with a double-quoted lid; the query string is stripped
// from the reported URL.
#[test]
fn html_lid_handles_both_quote_styles_and_query_string() {
let body = r#"<a href='https://example.com/x?utm=foo'>{{ x | lid: "lidvaluexyz1" }}X</a>"#;
let pairs = extract_html_lid_values(body);
assert_eq!(pairs.len(), 1);
assert_eq!(pairs[0].url, "https://example.com/x");
assert_eq!(pairs[0].value, "lidvaluexyz1");
}
// A URL wrapped in parentheses loses the closing `)` that the URL regex
// swallowed.
#[test]
fn plaintext_lid_trims_trailing_punctuation() {
let body = "Visit (https://example.com/cta) | lid: 'lidplain01a' for the deal.";
let pairs = extract_plaintext_lid_values(body);
assert_eq!(pairs.len(), 1);
assert_eq!(pairs[0].url, "https://example.com/cta");
assert_eq!(pairs[0].value, "lidplain01a");
}
// A sentence-ending period directly after the URL is not part of the URL.
#[test]
fn plaintext_lid_trims_sentence_period() {
let body = "See https://example.com/end. | lid: 'lidplain02b'";
let pairs = extract_plaintext_lid_values(body);
assert_eq!(pairs.len(), 1);
assert_eq!(pairs[0].url, "https://example.com/end");
}
// A single include yields its name, its id value and a key slugged from the name.
#[test]
fn cb_id_extracts_name_and_value() {
let body = "before {{content_blocks.${promo_banner} | id: 'cb42'}} after";
let pairs = extract_cb_id_values(body);
assert_eq!(pairs.len(), 1);
assert_eq!(pairs[0].name, "promo_banner");
assert_eq!(pairs[0].value, "cb42");
assert_eq!(pairs[0].key, "promo_banner");
}
// Several includes on one line are all reported, in order.
#[test]
fn cb_id_handles_multiple_includes() {
let body = "{{content_blocks.${alpha} | id: 'cb1'}} {{content_blocks.${beta} | id: 'cb2'}}";
let pairs = extract_cb_id_values(body);
assert_eq!(pairs.len(), 2);
assert_eq!(pairs[0].name, "alpha");
assert_eq!(pairs[0].value, "cb1");
assert_eq!(pairs[0].key, "alpha");
assert_eq!(pairs[1].name, "beta");
assert_eq!(pairs[1].value, "cb2");
}
// The `cb_` prefix is added only when the slug is empty or starts with a
// digit; a name that already starts with `cb_` is not double-prefixed.
#[test]
fn cb_id_slug_uses_cb_prefix_for_empty_or_digit_start() {
assert_eq!(slug_for_cb_id("2024_summer"), "cb_2024_summer");
assert_eq!(slug_for_cb_id(""), "cb_");
assert_eq!(slug_for_cb_id("My Promo Banner"), "my_promo_banner");
assert_eq!(slug_for_cb_id("cb_promo_image"), "cb_promo_image");
}
// The `link_` prefix is added when the slug is empty or starts with a digit.
// Non-ASCII input slugs to nothing, so it becomes the bare prefix.
#[test]
fn lid_slug_uses_link_prefix_for_empty_or_digit_start() {
assert_eq!(slug_for_lid("/spring-sale"), "spring_sale");
assert_eq!(slug_for_lid("/"), "link_");
assert_eq!(slug_for_lid("123"), "link_123");
assert_eq!(slug_for_lid("プロモ"), "link_");
}
// Runs of separators collapse to one `_`, and leading separators are dropped.
#[test]
fn slug_collapses_multiple_separators() {
assert_eq!(slug_for_lid("foo//bar--baz"), "foo_bar_baz");
assert_eq!(slug_for_lid("--leading"), "leading");
}
}