use std::fmt::Write as _;
use anyhow::{Result, bail};
use scraper::{Html, Selector};
use super::helpers::{build_full_name, decode_html_entities, kind_label, strip_html_comment};
use super::types::VoyagerProfileResponse;
use super::url::LinkedInUrlKind;
use crate::site::{SiteContent, SiteMetadata};
/// Fetches a LinkedIn page at `url` using the caller's session `cookies` and
/// parses the response into [`SiteContent`].
///
/// Thin public entry point; all work is delegated to
/// [`fetch_authenticated_html`].
pub(super) async fn fetch_authenticated(
    url: &str,
    cookies: &str,
    kind: LinkedInUrlKind,
) -> Result<SiteContent> {
    fetch_authenticated_html(url, cookies, kind).await
}
/// Fetches `url` with the browser-impersonating client and parses the body.
///
/// Recognizes two LinkedIn failure modes before parsing: the non-standard
/// HTTP 999 bot-detection status, and redirects to the login page (either a
/// 3xx status or a body that mentions both "login" and "session_redirect").
/// Both produce actionable error messages suggesting cookie refresh.
async fn fetch_authenticated_html(
    url: &str,
    cookies: &str,
    kind: LinkedInUrlKind,
) -> Result<SiteContent> {
    use crate::impersonate_client;
    let response = impersonate_client::fetch_impersonated(url, Some(cookies), None).await?;
    let status = response.status.as_u16();

    // LinkedIn's non-standard bot-detection response.
    if status == 999 {
        bail!(
            "LinkedIn returned HTTP 999 (bot detection).\n\
            Your session cookies may have expired. Try:\n\
            1. Log into LinkedIn in your browser\n\
            2. Retry: nab fetch {url} --cookies brave"
        );
    }

    let is_redirect = (300..400).contains(&status);
    let body_looks_like_login =
        response.body.contains("login") && response.body.contains("session_redirect");
    if is_redirect || body_looks_like_login {
        bail!(
            "LinkedIn redirected to login. Cookies missing or expired.\n\
            Use: nab fetch {url} --cookies brave"
        );
    }

    if !response.status.is_success() {
        bail!("LinkedIn returned HTTP {status} for {url}");
    }

    parse_linkedin_html(&response.body, url, kind)
}
/// Parses a LinkedIn HTML document into [`SiteContent`].
///
/// Tries extraction strategies from richest to weakest: embedded `<code>`
/// JSON blobs first, then JSON-LD script tags, and finally raw CSS-selector
/// scraping (the only strategy that can fail with an error).
pub(super) fn parse_linkedin_html(
    html: &str,
    url: &str,
    kind: LinkedInUrlKind,
) -> Result<SiteContent> {
    let document = Html::parse_document(html);
    extract_code_json(&document, url, kind)
        .or_else(|| extract_json_ld(&document, url, kind))
        .map_or_else(|| extract_from_selectors(&document, url, kind), Ok)
}
/// Scans every `<code>` element for embedded JSON and harvests profile data
/// and post texts from it, returning rendered content when anything useful
/// was found.
fn extract_code_json(document: &Html, url: &str, kind: LinkedInUrlKind) -> Option<SiteContent> {
    let selector = Selector::parse("code").ok()?;
    let mut profile: Option<VoyagerProfileResponse> = None;
    let mut posts: Vec<String> = Vec::new();

    for element in document.select(&selector) {
        let raw = element.inner_html();
        let json_str = strip_html_comment(raw.trim());
        if json_str.is_empty() {
            continue;
        }
        let Ok(value) = serde_json::from_str::<serde_json::Value>(json_str) else {
            continue;
        };
        scan_json_value(&value, &mut profile, &mut posts);
        // Some blobs are response envelopes of the form
        // {"status": 200, "body": "<json string>"}; scan the inner payload too.
        if let Some(inner) = unwrap_response_envelope(&value) {
            scan_json_value(&inner, &mut profile, &mut posts);
        }
    }

    build_code_json_content(url, kind, profile.as_ref(), &posts)
}

/// Parses the `"body"` of a successful `{"status": 200, "body": "..."}`
/// envelope, if `value` has that shape; `None` otherwise.
fn unwrap_response_envelope(value: &serde_json::Value) -> Option<serde_json::Value> {
    let obj = value.as_object()?;
    let status = obj.get("status")?.as_u64()?;
    let body = obj.get("body")?.as_str()?;
    if status == 200 && !body.is_empty() {
        serde_json::from_str(body).ok()
    } else {
        None
    }
}
/// Recursively walks `value`, keeping the profile candidate with the most
/// populated fields and collecting each distinct post text (in encounter
/// order) into `posts`.
fn scan_json_value(
    value: &serde_json::Value,
    profile: &mut Option<VoyagerProfileResponse>,
    posts: &mut Vec<String>,
) {
    if let Some(map) = value.as_object() {
        if looks_like_profile(map) {
            let candidate = extract_profile_manual(map);
            // Prefer the candidate only if it fills strictly more fields
            // than whatever we have so far.
            let best_so_far = profile.as_ref().map_or(0, count_profile_fields);
            if count_profile_fields(&candidate) > best_so_far {
                *profile = Some(candidate);
            }
        }
        if let Some(text) = extract_post_text(map) {
            if !posts.contains(&text) {
                posts.push(text);
            }
        }
        for child in map.values() {
            scan_json_value(child, profile, posts);
        }
    } else if let Some(items) = value.as_array() {
        for child in items {
            scan_json_value(child, profile, posts);
        }
    }
}
/// Number of populated (`Some`) fields on a profile; used to rank competing
/// profile candidates during JSON scanning.
fn count_profile_fields(p: &VoyagerProfileResponse) -> usize {
    let present = [
        p.first_name.is_some(),
        p.last_name.is_some(),
        p.headline.is_some(),
        p.summary.is_some(),
        p.location_name.is_some(),
        p.industry_name.is_some(),
    ];
    present.into_iter().filter(|&is_set| is_set).count()
}
/// Builds a [`VoyagerProfileResponse`] from a raw JSON object, preferring the
/// direct field and falling back to the first non-empty value in the
/// corresponding `multiLocale*` map. All values are HTML-entity decoded.
fn extract_profile_manual(
    map: &serde_json::Map<String, serde_json::Value>,
) -> VoyagerProfileResponse {
    /// Non-empty string at `key`, else the first non-empty string value
    /// inside the `multi_key` locale object.
    fn get_str(
        map: &serde_json::Map<String, serde_json::Value>,
        key: &str,
        multi_key: &str,
    ) -> Option<String> {
        let direct = map
            .get(key)
            .and_then(serde_json::Value::as_str)
            .filter(|s| !s.is_empty());
        if let Some(s) = direct {
            return Some(decode_html_entities(s));
        }
        map.get(multi_key)
            .and_then(serde_json::Value::as_object)
            .into_iter()
            .flat_map(serde_json::Map::values)
            .filter_map(serde_json::Value::as_str)
            .find(|s| !s.is_empty())
            .map(decode_html_entities)
    }

    VoyagerProfileResponse {
        first_name: get_str(map, "firstName", "multiLocaleFirstName"),
        last_name: get_str(map, "lastName", "multiLocaleLastName"),
        headline: get_str(map, "headline", "multiLocaleHeadline"),
        summary: get_str(map, "summary", "multiLocaleSummary"),
        // Location may live under either key; take the first string hit.
        location_name: ["geoLocationName", "locationName"]
            .iter()
            .find_map(|k| map.get(*k).and_then(serde_json::Value::as_str))
            .map(decode_html_entities),
        industry_name: map
            .get("industryName")
            .and_then(serde_json::Value::as_str)
            .map(decode_html_entities),
    }
}
/// Heuristic: a JSON object is treated as a profile when it carries at least
/// two of the canonical profile keys.
pub(super) fn looks_like_profile(map: &serde_json::Map<String, serde_json::Value>) -> bool {
    let mut hits = 0;
    for key in ["firstName", "lastName", "headline", "summary"] {
        if map.contains_key(key) {
            hits += 1;
        }
    }
    hits >= 2
}
/// Pulls a post's text out of a JSON object's `commentary` field, handling
/// the three shapes LinkedIn emits: `commentary.text.text` (nested object),
/// `commentary.text` (string), and bare `commentary` (string). Returns the
/// trimmed text, or `None` when absent or empty.
pub(super) fn extract_post_text(
    map: &serde_json::Map<String, serde_json::Value>,
) -> Option<String> {
    /// Trimmed copy of `text`, or `None` if nothing remains.
    fn non_empty(text: &str) -> Option<String> {
        let trimmed = text.trim();
        (!trimmed.is_empty()).then(|| trimmed.to_string())
    }

    let commentary = map.get("commentary")?;

    if let Some(obj) = commentary.as_object() {
        // Nested shape: commentary.text.text
        let nested = obj
            .get("text")
            .and_then(|t| t.as_object())
            .and_then(|t| t.get("text"))
            .and_then(|t| t.as_str())
            .and_then(non_empty);
        if nested.is_some() {
            return nested;
        }
        // Flat shape: commentary.text
        let flat = obj.get("text").and_then(|t| t.as_str()).and_then(non_empty);
        if flat.is_some() {
            return flat;
        }
    }

    // Bare shape: commentary is itself a string.
    commentary.as_str().and_then(non_empty)
}
/// Renders the profile and posts harvested from embedded `<code>` JSON into
/// markdown [`SiteContent`], returning `None` when nothing renderable was
/// collected (so the caller can fall through to another strategy).
///
/// Layout: profile header (name, headline, location, industry, About), then
/// up to 10 posts (under a "Recent Activity" heading only when a profile
/// precedes them), then a link back to `url`.
fn build_code_json_content(
    url: &str,
    kind: LinkedInUrlKind,
    profile: Option<&VoyagerProfileResponse>,
    posts: &[String],
) -> Option<SiteContent> {
    let mut md = String::new();
    // Both metadata author and title are the person's full name (if known).
    let (author, title) = if let Some(p) = profile {
        let name = build_full_name(p.first_name.as_deref(), p.last_name.as_deref());
        if let Some(ref n) = name {
            let _ = writeln!(md, "## {n}\n");
        }
        if let Some(ref h) = p.headline {
            let _ = writeln!(md, "{h}\n");
        }
        // Location intentionally has no trailing blank line: the industry
        // branch (or the else below) closes the header block.
        if let Some(ref loc) = p.location_name {
            let _ = writeln!(md, "Location: {loc}");
        }
        if let Some(ref ind) = p.industry_name {
            let _ = writeln!(md, "Industry: {ind}\n");
        } else if name.is_some() {
            // No industry line: still end the header block with a blank line.
            md.push('\n');
        }
        if let Some(ref summary) = p.summary {
            let trimmed = summary.trim();
            if !trimmed.is_empty() {
                let _ = writeln!(md, "### About\n\n{trimmed}\n");
            }
        }
        (name.clone(), name)
    } else {
        (None, None)
    };
    if !posts.is_empty() {
        if profile.is_some() {
            let _ = writeln!(md, "### Recent Activity\n");
        }
        // Cap output at the first 10 collected posts.
        for post in posts.iter().take(10) {
            let _ = writeln!(md, "---\n\n{post}\n");
        }
    }
    // Neither profile nor posts produced text: nothing worth returning.
    if md.trim().is_empty() {
        return None;
    }
    let _ = writeln!(md, "[View on LinkedIn]({url})");
    Some(SiteContent {
        markdown: md,
        metadata: SiteMetadata {
            author,
            title,
            published: None,
            platform: format!("LinkedIn ({})", kind_label(kind)),
            canonical_url: url.to_string(),
            media_urls: vec![],
            engagement: None,
        },
    })
}
/// Extracts content from JSON-LD `<script type="application/ld+json">` tags,
/// typical of public (logged-out) LinkedIn pages. Returns the first script
/// that yields at least a name/headline or a description/articleBody.
fn extract_json_ld(document: &Html, url: &str, kind: LinkedInUrlKind) -> Option<SiteContent> {
    let selector = Selector::parse(r#"script[type="application/ld+json"]"#).ok()?;

    for element in document.select(&selector) {
        let json_text: String = element.text().collect();
        let Ok(ld) = serde_json::from_str::<serde_json::Value>(&json_text) else {
            continue;
        };

        let name = ld
            .get("name")
            .or_else(|| ld.get("headline"))
            .and_then(serde_json::Value::as_str)
            .map(String::from);
        let description = ld
            .get("description")
            .or_else(|| ld.get("articleBody"))
            .and_then(serde_json::Value::as_str)
            .map(String::from);
        // Without at least a name or description there is nothing to render.
        if name.is_none() && description.is_none() {
            continue;
        }

        // "author" may be an object with a "name" field or a bare string.
        let author = ld.get("author").and_then(|a| {
            a.get("name")
                .and_then(serde_json::Value::as_str)
                .or_else(|| a.as_str())
                .map(String::from)
        });
        // "image" may be a bare URL string or an object with a "url" field.
        let image = ld.get("image").and_then(|i| {
            i.as_str()
                .or_else(|| i.get("url").and_then(serde_json::Value::as_str))
                .map(String::from)
        });

        let mut md = String::new();
        if let Some(ref n) = name {
            let _ = writeln!(md, "## {n}\n");
        }
        if let Some(ref a) = author {
            let _ = writeln!(md, "by {a}\n");
        }
        if let Some(ref d) = description {
            let _ = writeln!(md, "{d}\n");
        }
        let _ = writeln!(md, "[View on LinkedIn]({url})");

        return Some(SiteContent {
            markdown: md,
            metadata: SiteMetadata {
                author,
                title: name,
                published: ld
                    .get("datePublished")
                    .and_then(serde_json::Value::as_str)
                    .map(String::from),
                platform: format!("LinkedIn ({})", kind_label(kind)),
                canonical_url: url.to_string(),
                media_urls: image.into_iter().collect(),
                engagement: None,
            },
        });
    }
    None
}
#[allow(clippy::too_many_lines)]
fn extract_from_selectors(
document: &Html,
url: &str,
kind: LinkedInUrlKind,
) -> Result<SiteContent> {
let mut md = String::new();
let mut title = None;
let mut author = None;
if let Ok(sel) = Selector::parse("h1")
&& let Some(el) = document.select(&sel).next()
{
let text = el.text().collect::<String>().trim().to_string();
if !text.is_empty() {
title = Some(text.clone());
let _ = writeln!(md, "## {text}\n");
}
}
for selector_str in &[
".text-body-medium", ".top-card-layout__headline", ".break-words", ] {
if let Ok(sel) = Selector::parse(selector_str)
&& let Some(el) = document.select(&sel).next()
{
let text = el.text().collect::<String>().trim().to_string();
if !text.is_empty() && Some(&text) != title.as_ref() {
let _ = writeln!(md, "{text}\n");
break;
}
}
}
for selector_str in &[
"#about ~ .display-flex .pv-shared-text-with-see-more span[aria-hidden=true]",
".pv-about__summary-text",
"section.summary .description",
] {
if let Ok(sel) = Selector::parse(selector_str)
&& let Some(el) = document.select(&sel).next()
{
let text = el.text().collect::<String>().trim().to_string();
if !text.is_empty() {
let _ = writeln!(md, "### About\n\n{text}\n");
break;
}
}
}
if let Ok(sel) = Selector::parse("#experience ~ .pvs-list__outer-container li") {
let items: Vec<_> = document.select(&sel).take(5).collect();
if !items.is_empty() {
let _ = writeln!(md, "### Experience\n");
for item in items {
let text = item.text().collect::<String>();
let clean: String = text.split_whitespace().collect::<Vec<_>>().join(" ");
if !clean.is_empty() {
let _ = writeln!(md, "- {clean}");
}
}
md.push('\n');
}
}
for selector_str in &[
".feed-shared-update-v2__description",
".feed-shared-text",
".update-components-text",
] {
if let Ok(sel) = Selector::parse(selector_str) {
for el in document.select(&sel).take(10) {
let text = el.text().collect::<String>().trim().to_string();
if !text.is_empty() {
let _ = writeln!(md, "---\n\n{text}\n");
}
}
}
}
if author.is_none()
&& let Ok(sel) = Selector::parse(r#"meta[name="author"]"#)
&& let Some(el) = document.select(&sel).next()
{
author = el.attr("content").map(String::from);
}
if title.is_none()
&& let Ok(sel) = Selector::parse("title")
&& let Some(el) = document.select(&sel).next()
{
let text = el.text().collect::<String>().trim().to_string();
title = Some(
text.strip_suffix(" | LinkedIn")
.unwrap_or(&text)
.to_string(),
);
}
if md.trim().is_empty() {
if let Ok(sel) = Selector::parse(r#"meta[property="og:description"]"#)
&& let Some(el) = document.select(&sel).next()
&& let Some(desc) = el.attr("content")
{
let _ = writeln!(md, "{desc}\n");
}
}
if md.trim().is_empty() {
bail!("Could not extract meaningful content from LinkedIn page: {url}");
}
let _ = writeln!(md, "[View on LinkedIn]({url})");
let metadata = SiteMetadata {
author,
title,
published: None,
platform: format!("LinkedIn ({})", kind_label(kind)),
canonical_url: url.to_string(),
media_urls: extract_og_image(document),
engagement: None,
};
Ok(SiteContent {
markdown: md,
metadata,
})
}
/// Returns the `og:image` URL as a one-element vec, or an empty vec when the
/// meta tag (or its `content` attribute) is absent.
fn extract_og_image(document: &Html) -> Vec<String> {
    let Ok(sel) = Selector::parse(r#"meta[property="og:image"]"#) else {
        return Vec::new();
    };
    document
        .select(&sel)
        .next()
        .and_then(|el| el.attr("content"))
        .map_or_else(Vec::new, |u| vec![u.to_string()])
}