use crate::config;
/// Values extracted from an HTML page by [`parse_html`].
///
/// Every field defaults to an empty string when the corresponding piece of markup is not found.
#[derive(Debug, Default)]
pub struct Document {
    /// Not assigned by the builder in this file; stays at its default (empty) value.
    pub data_collection_events: String,
    /// The recorded `<script>` tag that contains the `__EDGEE_SDK__` marker.
    pub sdk_full_tag: String,
    /// Value of the `src` attribute found in the SDK script tag.
    pub sdk_src: String,
    /// SDK source obtained for `sdk_src` when the SDK tag is inlined.
    pub inlined_sdk: String,
    /// Text collected after the opening tag that contains `__EDGEE_DATA_LAYER__`,
    /// with `</script>` removed.
    pub data_layer: String,
    /// Text of the `<title>` element.
    pub title: String,
    /// Value of the `href` attribute of the tag containing `rel="canonical"`.
    pub canonical: String,
    /// Value of the `content` attribute of the tag containing `name="keywords"`.
    pub keywords: String,
}
/// Accumulates the fields of a [`Document`] while the HTML is being scanned.
///
/// A field is `None` until the parser has decided on its value (which may be an empty string).
#[derive(Debug, Default)]
struct DocumentBuilder {
    /// Recorded SDK `<script>` tag, once seen.
    sdk_full_tag: Option<String>,
    /// `src` attribute of the SDK tag, once seen.
    sdk_src: Option<String>,
    /// Inlined SDK source, once obtained.
    inlined_sdk: Option<String>,
    /// Data layer payload, once seen (or known to be absent).
    data_layer: Option<String>,
    /// Page title, once seen.
    title: Option<String>,
    /// Canonical URL, once seen.
    canonical: Option<String>,
    /// Keywords, once seen.
    keywords: Option<String>,
}
impl DocumentBuilder {
fn is_complete(&self) -> bool {
matches!(
*self,
DocumentBuilder {
sdk_full_tag: Some(_),
sdk_src: Some(_),
inlined_sdk: Some(_),
data_layer: Some(_),
title: Some(_),
canonical: Some(_),
keywords: Some(_),
}
)
}
fn build(self) -> Document {
Document {
sdk_full_tag: self.sdk_full_tag.unwrap_or_default(),
sdk_src: self.sdk_src.unwrap_or_default(),
inlined_sdk: self.inlined_sdk.unwrap_or_default(),
data_layer: self.data_layer.unwrap_or_default(),
title: self.title.unwrap_or_default(),
canonical: self.canonical.unwrap_or_default(),
keywords: self.keywords.unwrap_or_default(),
..Default::default()
}
}
}
// Assigns `$field` on `$builder` and then checks whether the builder is complete.
//
// NOTE: when the builder is complete, the expansion executes `return $builder.build();`,
// i.e. it returns from the *function in which the macro is invoked*. It can therefore only
// be used inside a function whose return type matches what `build()` returns.
macro_rules! set_document_field {
// Plain form: `$value` is the field's inner value; it is wrapped in `Some` and forwarded to
// the `?` form below.
($builder:expr, $field:ident, $value:expr) => {
set_document_field!($builder, $field, ?Some($value));
};
// `?` form: `$value` is already an `Option` and is stored as-is, so a `None` result
// (e.g. from an attribute lookup that found nothing) leaves the field unset.
($builder:expr, $field:ident, ?$value:expr) => {
$builder.$field = $value;
// Stop scanning as early as possible once nothing is left to find.
if $builder.is_complete() {
return $builder.build();
}
};
}
/// Appends characters from `chars` to `buf` until a `>` has just been pushed and `buf` ends
/// with `terminator`, or until `chars` is exhausted.
///
/// The terminator itself is consumed from `chars` and kept in `buf`.
fn read_through(chars: &mut impl Iterator<Item = char>, buf: &mut String, terminator: &str) {
    for ch in chars.by_ref() {
        buf.push(ch);
        if ch == '>' && buf.ends_with(terminator) {
            break;
        }
    }
}

/// Scans `html` once, character by character, and extracts the values stored in a [`Document`].
///
/// Only opening tags whose name starts with `script`, `title`, `meta` or `link` are recorded
/// (the match is a case-sensitive prefix match). A recorded tag is inspected when its first
/// `>` is reached:
///
/// - a tag containing `__EDGEE_SDK__` provides `sdk_full_tag`; when it is not self-closing
///   (`/>`), its `src` attribute provides `sdk_src` and, unless the tag contains
///   `data-inline="false"`, a successful `edgee_sdk::get_sdk` call provides `inlined_sdk`;
/// - a tag containing `__EDGEE_DATA_LAYER__` provides `data_layer` (the text up to the closing
///   `</script>`);
/// - a bare `<title>` provides `title`;
/// - tags containing `rel="canonical"` / `name="keywords"` provide `canonical` / `keywords`.
///
/// `<!-- ... -->` comments are skipped. Scanning stops as soon as every field is known;
/// otherwise whatever was collected by the end of the input is returned, with missing fields
/// left empty.
///
/// `host` is only forwarded to `edgee_sdk::get_sdk`.
pub fn parse_html(html: &str, host: &str) -> Document {
    static RECORDED_TAGS: &[&str] = &["script", "title", "meta", "link"];

    let mut builder = DocumentBuilder::default();
    // Without the marker there is no data layer to wait for, so settle the field right away
    // and let the parser return early once everything else has been found.
    if !html.contains("__EDGEE_DATA_LAYER__") {
        builder.data_layer = Some(String::new());
    }

    // Buffer holding the tag (and, for some tags, the element content) currently recorded.
    let mut temp = String::new();
    let mut recording = false;
    let mut chars = html.chars().peekable();

    while let Some(c) = chars.next() {
        match c {
            '<' if chars.peek() == Some(&'!') => {
                chars.next();
                // Only `<!--` opens a comment. Anything else starting with `<!`
                // (e.g. a doctype) is left to the main loop.
                if chars.peek() == Some(&'-') {
                    chars.next();
                    if chars.peek() == Some(&'-') {
                        chars.next();
                        // Swallow the comment so markup inside it is never interpreted.
                        read_through(&mut chars, &mut temp, "-->");
                        temp.clear();
                    }
                }
            }
            '<' => {
                // Look ahead without consuming: 6 characters is the longest name we match.
                let next_chars: String = chars.clone().take(6).collect();

                if !recording && RECORDED_TAGS.iter().any(|tag| next_chars.starts_with(*tag)) {
                    recording = true;
                    temp.clear();
                }

                // `</head` must be followed by something that ends the tag name; a plain
                // prefix test would also fire on `</header>` and similar names.
                let closes_head = match next_chars.strip_prefix("/head") {
                    Some(rest) => {
                        !rest.starts_with(|ch: char| ch.is_ascii_alphanumeric() || ch == '-')
                    }
                    None => false,
                };
                if closes_head {
                    // Head-only fields cannot show up anymore: settle them as empty so the
                    // parser can stop as soon as the remaining fields are known.
                    builder.title.get_or_insert_with(String::new);
                    builder.canonical.get_or_insert_with(String::new);
                    builder.keywords.get_or_insert_with(String::new);
                    if builder.is_complete() {
                        return builder.build();
                    }
                }

                temp.push(c);
            }
            '>' if recording => {
                temp.push(c);

                // NOTE: each `set_document_field!` below returns from `parse_html` as soon
                // as the builder is complete.
                if temp.contains("__EDGEE_SDK__") {
                    if temp.ends_with("/>") {
                        set_document_field!(builder, sdk_full_tag, temp.clone());
                    } else {
                        // Extend the recording up to the closing `</script>`.
                        read_through(&mut chars, &mut temp, "script>");
                        set_document_field!(builder, sdk_src, ?extract_src_value(&temp));
                        let inline = !temp.contains(r#"data-inline="false""#);
                        if let (true, Some(sdk_url)) = (inline, &builder.sdk_src) {
                            // A failing SDK lookup is not fatal: the field simply stays unset.
                            if let Ok(inlined_sdk) = edgee_sdk::get_sdk(
                                sdk_url,
                                host,
                                config::get().compute.autocapture.clone(),
                                config::get().compute.cookie_name.clone().as_str(),
                            ) {
                                set_document_field!(builder, inlined_sdk, inlined_sdk);
                            }
                        }
                        set_document_field!(builder, sdk_full_tag, temp.clone());
                    }
                } else if temp.contains("__EDGEE_DATA_LAYER__") {
                    // Drop the opening tag: only the element content is kept.
                    temp.clear();
                    read_through(&mut chars, &mut temp, "script>");
                    set_document_field!(builder, data_layer, temp.replace("</script>", ""));
                } else if temp == "<title>" {
                    // The title text may itself contain `>`, hence the explicit terminator.
                    read_through(&mut chars, &mut temp, "title>");
                    set_document_field!(
                        builder,
                        title,
                        temp.replace("</title>", "").replace("<title>", "")
                    );
                } else if temp.contains(r#"rel="canonical""#) {
                    set_document_field!(builder, canonical, ?extract_href_value(&temp));
                } else if temp.contains(r#"name="keywords""#) {
                    set_document_field!(builder, keywords, ?extract_content_value(&temp));
                }

                recording = false;
                temp.clear();
            }
            _ if recording => {
                temp.push(c);
            }
            _ => {}
        }
    }

    builder.build()
}
/// Returns the double-quoted value of the attribute named `attribute` in `tag`.
///
/// The name has to start at an attribute-name boundary, so looking for `src` does not pick up
/// `data-src`. Candidates that are not followed by `="` or whose value has no closing quote
/// are skipped. Returns `None` when no candidate yields a value.
fn extract_attribute_value(tag: &str, attribute: &str) -> Option<String> {
    tag.match_indices(attribute).find_map(|(start, _)| {
        // A preceding name character means we matched the tail of a longer attribute name.
        let inside_longer_name = matches!(
            tag[..start].chars().next_back(),
            Some(prev) if prev.is_alphanumeric() || matches!(prev, '-' | '_' | ':' | '.')
        );
        if inside_longer_name {
            return None;
        }

        let value = tag[start + attribute.len()..].strip_prefix("=\"")?;
        let end_quote = value.find('"')?;
        Some(value[..end_quote].to_string())
    })
}

/// Returns the value of the `href="..."` attribute of `tag`, if any.
fn extract_href_value(tag: &str) -> Option<String> {
    extract_attribute_value(tag, "href")
}

/// Returns the value of the `src="..."` attribute of `tag`, if any.
fn extract_src_value(tag: &str) -> Option<String> {
    extract_attribute_value(tag, "src")
}

/// Returns the value of the `content="..."` attribute of `tag`, if any.
fn extract_content_value(tag: &str) -> Option<String> {
    extract_attribute_value(tag, "content")
}
// Unit tests for `parse_html`. The HTML fixtures are multi-line string literals; their exact
// content (including line breaks) is part of the test input.
#[cfg(test)]
mod tests {
use super::*;
use pretty_assertions::assert_eq;
// Fixture: complete head whose title and data layer JSON both contain a `>`, followed by a
// self-closing SDK script tag.
fn sample_html_full_minimal() -> String {
String::from(
"<html>
<head>
<title>ABC > DEF</title>
<!-- LEGACY STUFF HERE -->
<link rel=\"canonical\" href=\"https://test.com/test\"/>
<meta name=\"keywords\" content=\"k1, k2, k3\"/>
<script type=\"json\" id=\"__EDGEE_DATA_LAYER__\">{
\"data_collection\": {
\"events\": [
{
\"type\": \"track\",
\"data\": {\"name\": \"Event > name\"}
}
]
}
}</script>
<script type=\"javascript\" id=\"__EDGEE_SDK__\" src=\"/_edgee/sdk.js\"/>
</head>
<body></body>
</html>",
)
}
// Fixture: no data layer; the SDK script tag is placed in the body, after `</head>`.
fn sample_html_full_sdk_in_body() -> String {
String::from(
"<html>
<head>
<title>ABC</title>
<!-- LEGACY STUFF HERE -->
<link rel=\"canonical\" href=\"https://test.com/test\"/>
<meta name=\"keywords\" content=\"k1, k2, k3\"/>
</head>
<body>
<script type=\"javascript\" id=\"__EDGEE_SDK__\" src=\"/_edgee/sdk.js\"></script>
</body>
</html>",
)
}
// Fixture: SDK script tag carrying `data-inline="false"` and an attribute value that
// contains a `>`.
fn sample_html_full_sdk_inline_false() -> String {
String::from("<html>
<head>
<title>ABC</title>
<!-- LEGACY STUFF HERE -->
<link rel=\"canonical\" href=\"https://test.com/test\"/>
<meta name=\"keywords\" content=\"k1, k2, k3\"/>
<script type=\"json\" id=\"__EDGEE_DATA_LAYER__\">{}</script>
<script data-inline=\"false\" data-random=\"ok>ko\" type=\"javascript\" id=\"__EDGEE_SDK__\" src=\"/_edgee/sdk.js\"></script>
</head>
<body></body>
</html>")
}
// Fixture: SDK script tag in the head, but no data layer script.
fn sample_html_without_data_layer() -> String {
String::from(
"<html>
<head>
<title>ABC</title>
<link rel=\"canonical\" href=\"https://test.com/test\"/>
<meta name=\"keywords\" content=\"k1, k2, k3\"/>
<script type=\"javascript\" id=\"__EDGEE_SDK__\" src=\"/_edgee/sdk.js\"></script>
</head>
<body>
</body>
</html>",
)
}
// Fixture: neither an SDK script tag nor a data layer script.
fn sample_html_without_sdk() -> String {
String::from(
"<html>
<head>
<title>ABC</title>
<link rel=\"canonical\" href=\"https://test.com/test\"/>
<meta name=\"keywords\" content=\"k1, k2, k3\"/>
</head>
<body>
</body>
</html>",
)
}
// Title (with an embedded `>`), canonical URL and keywords are all extracted.
#[test]
fn parse_html_creates_a_document() {
let document = parse_html(&sample_html_full_minimal(), "test.com");
assert_eq!(document.title, "ABC > DEF");
assert_eq!(document.canonical, "https://test.com/test");
assert_eq!(document.keywords, "k1, k2, k3");
}
// The title is still extracted when the page has no data layer.
#[test]
fn parse_html_without_data_layer() {
// Presumably required because the non-self-closing SDK tag makes `parse_html` read
// `config::get()` - TODO confirm.
crate::config::init_test_config();
let document = parse_html(&sample_html_without_data_layer(), "test.com");
assert_eq!(document.title, "ABC");
}
// The title is still extracted when the page has no SDK tag.
#[test]
fn parse_html_without_sdk() {
let document = parse_html(&sample_html_without_sdk(), "test.com");
assert_eq!(document.title, "ABC");
}
// The title is still extracted when the SDK tag lives in the body.
#[test]
fn parse_html_with_sdk_in_body() {
crate::config::init_test_config();
let document = parse_html(&sample_html_full_sdk_in_body(), "test.com");
assert_eq!(document.title, "ABC");
}
// The title is still extracted when the SDK tag opts out of inlining.
#[test]
fn parse_html_with_sdk_inline_false() {
let document = parse_html(&sample_html_full_sdk_inline_false(), "test.com");
assert_eq!(document.title, "ABC");
}
// A malformed comment opener (`<!-` with a single dash) must not break parsing.
#[test]
fn parse_html_doesnt_break_if_broken_html() {
let html = "<!- LEGACY STUFF HERE ->"; let document = parse_html(html, "test.com");
assert_eq!(document.title, "");
}
// A `<!` that is not followed by any dash must not break parsing either.
#[test]
fn parse_html_doesnt_break_if_broken_html2() {
let html = "<! LEGACY STUFF HERE >"; let document = parse_html(html, "test.com");
assert_eq!(document.title, "");
}
// An SDK `src` with an unexpected file name must not break parsing.
#[test]
fn parse_html_doesnt_break_if_invalid_sdk_version() {
crate::config::init_test_config();
let html = "<script type=\"javascript\" id=\"__EDGEE_SDK__\" src=\"/_edgee/edgee.v99.js.js\"></script>"; let document = parse_html(html, "test.com");
assert_eq!(document.title, "");
}
}