mod spa;
pub use spa::helpers::{find_content_by_key, find_longest_string};
pub use spa::inline::{
extract_balanced_json, extract_inline_script_json, unwrap_api_response_bodies,
};
pub use spa::jsonld::extract_jsonld_content;
pub use spa::nextjs::{
discover_nextjs_content_chunks, extract_jsx_text_content, extract_nextjs_content,
is_nextjs_metadata_only, resolve_content_chunk_urls, resolve_content_chunk_urls_for_slug,
};
/// Extracts embedded SPA payload content from a server-rendered HTML page.
///
/// The HTML is parsed once and a fixed sequence of extractors is tried; the
/// result of the first extractor that yields `Some` is returned and later
/// extractors are not run. The order is:
///
/// 1. `script#__NEXT_DATA__`
/// 2. `script#__NUXT_DATA__`, then `script#__nuxt-data`
/// 3. SvelteKit data
/// 4. Gatsby data
/// 5. Angular Universal state
/// 6. JSON-LD content
/// 7. JSON found in inline scripts
/// 8. JSON found in hidden code elements
///
/// Returns `None` when every extractor comes back empty.
pub fn extract_spa_data(html: &str) -> Option<String> {
    let document = scraper::Html::parse_document(html);

    // Each `or_else` closure is evaluated only when everything before it
    // produced `None`, so the priority order above is preserved exactly.
    spa::nextjs::try_extract_script_json(&document, "script#__NEXT_DATA__")
        .or_else(|| {
            ["script#__NUXT_DATA__", "script#__nuxt-data"]
                .iter()
                .find_map(|selector| spa::nextjs::try_extract_script_json(&document, selector))
        })
        .or_else(|| spa::sveltekit::extract_sveltekit_data(&document))
        .or_else(|| spa::gatsby::extract_gatsby_data(&document, html))
        .or_else(|| spa::angular::extract_angular_universal_state(&document))
        .or_else(|| spa::jsonld::extract_jsonld_content(&document))
        .or_else(|| spa::inline::extract_inline_script_json(html))
        .or_else(|| spa::inline::extract_hidden_code_json(&document))
}
#[cfg(test)]
mod tests {
    //! End-to-end checks that individual framework extractors are reachable
    //! through `extract_spa_data`.
    //!
    //! NOTE(review): the fixture texts are deliberately long; they mention a
    //! two-hundred-character minimum, which is presumably enforced inside the
    //! `spa` extractors rather than in this file — confirm there.
    use super::*;

    /// Runs `extract_spa_data` over `html` and asserts that it produced
    /// content containing `expected_fragment`.
    fn assert_extracted_contains(html: &str, expected_fragment: &str) {
        let extracted = extract_spa_data(html);
        assert!(extracted.is_some());
        assert!(extracted.unwrap().contains(expected_fragment));
    }

    /// A SvelteKit `data-sveltekit-fetched` script is picked up by the main entry point.
    #[test]
    fn extract_spa_data_detects_sveltekit() {
        let body_text = "SvelteKit integration test: this content must be long enough to pass the \
                         minimum two hundred character threshold applied by extract_spa_data so that \
                         the SvelteKit extractor is exercised end-to-end via the main entry point.";
        let fetched = serde_json::json!({"status": 200, "body": {"content": body_text}});
        let fetched_json = serde_json::to_string(&fetched).unwrap();
        let html = format!(
            r#"<html><body>
<script type="application/json" data-sveltekit-fetched data-url="/api/post">
{payload}
</script>
</body></html>"#,
            payload = fetched_json
        );

        assert_extracted_contains(&html, "SvelteKit integration test");
    }

    /// A Gatsby `data-gatsby-ssr` script is picked up by the main entry point.
    #[test]
    fn extract_spa_data_detects_gatsby() {
        let body_text = "Gatsby end-to-end integration test: this content is deliberately over two \
                         hundred characters so that the Gatsby extractor path inside extract_spa_data \
                         is exercised and we confirm the framework is wired into the main try-chain.";
        let page_data = serde_json::json!({"result": {"data": {"body": body_text}}});
        let page_data_json = serde_json::to_string(&page_data).unwrap();
        let html = format!(
            r#"<html><body>
<script type="application/json" data-gatsby-ssr>
{payload}
</script>
</body></html>"#,
            payload = page_data_json
        );

        assert_extracted_contains(&html, "Gatsby end-to-end integration test");
    }

    /// An Angular Universal `serverApp-state` script is picked up by the main entry point.
    #[test]
    fn extract_spa_data_detects_angular_universal() {
        let body_text = "Angular Universal end-to-end integration test: this content body is \
                         intentionally long enough to exceed the two hundred character minimum so \
                         that the Angular Universal extractor path within extract_spa_data is \
                         exercised and we confirm it is wired into the main try-chain correctly.";
        let transfer_state = serde_json::json!({
            "cache.key": {"status": 200, "body": {"content": body_text}}
        });
        let transfer_state_json = serde_json::to_string(&transfer_state).unwrap();
        let html = format!(
            r#"<html><body>
<script id="serverApp-state" type="application/json">
{state}
</script>
</body></html>"#,
            state = transfer_state_json
        );

        assert_extracted_contains(&html, "Angular Universal end-to-end");
    }
}