use std::fmt::Write;
use scraper::Html;
use crate::dom;
use super::ExtractorResult;
use super::comments::{CommentData, build_comment_tree, build_content_html};
/// Reports whether `url` looks like a Lobsters story link.
///
/// A URL qualifies when it contains the substring `lobste.rs/s/`; a missing
/// URL never qualifies.
#[must_use]
pub fn is_lobsters(url: Option<&str>) -> bool {
    matches!(url, Some(candidate) if candidate.contains("lobste.rs/s/"))
}
/// Extracts a Lobsters story (and optionally its comments) from a page.
///
/// Returns `None` when `url` is not recognised by [`is_lobsters`]. Otherwise
/// the parsed page is scraped first. If the scrape produced content of at
/// least 10 words (as counted by `dom::count_words_html`) it is returned as
/// is; in every other case the JSON API is tried, and the scraped result (if
/// any) is used as the fallback when the API yields nothing.
#[must_use]
pub fn extract_lobsters(
    html: &Html,
    url: Option<&str>,
    include_replies: bool,
) -> Option<ExtractorResult> {
    if !is_lobsters(url) {
        return None;
    }

    let scraped = extract_from_html(html, include_replies);
    let rich_enough = scraped
        .as_ref()
        .is_some_and(|page| dom::count_words_html(&page.content) >= 10);

    if rich_enough {
        scraped
    } else {
        // Thin or missing page content: prefer the API, keep the scrape as backup.
        try_api_fetch(url, include_replies).or(scraped)
    }
}
/// Builds an [`ExtractorResult`] from an already-parsed Lobsters story page.
///
/// The post section is made of the story link (rendered as an anchor whose
/// text is the escaped URL) followed by the story text wrapped in a
/// `post-text` div. Returns `None` when the page has neither a link nor a
/// story text. Comments are only collected when `include_replies` is true.
fn extract_from_html(html: &Html, include_replies: bool) -> Option<ExtractorResult> {
    let title = extract_title(html);
    let story_body = extract_story_body(html);
    let story_link = extract_story_link(html);

    let has_link = !story_link.is_empty();
    let has_body = !story_body.is_empty();
    if !(has_link || has_body) {
        return None;
    }

    let mut post_html = String::new();
    if has_link {
        let escaped = dom::html_attr_escape(&story_link);
        // Writing into a `String` cannot fail, so the result is discarded.
        let _ = write!(
            post_html,
            "<p><a href=\"{escaped}\" target=\"_blank\">{escaped}</a></p>"
        );
    }
    if has_body {
        // The story body is inner HTML taken from the page and is inserted as-is.
        let _ = write!(post_html, "<div class=\"post-text\">{story_body}</div>");
    }

    let comments_html = if !include_replies {
        String::new()
    } else {
        extract_comments(html)
    };

    Some(ExtractorResult {
        content: build_content_html("lobsters", &post_html, &comments_html),
        title: Some(title).filter(|t| !t.is_empty()),
        author: None,
        site: Some("Lobsters".to_string()),
        published: None,
        image: None,
        description: None,
    })
}
/// Returns the trimmed text of the first `.u-repost-of` element, or an empty
/// string when the page has no such element.
fn extract_title(html: &Html) -> String {
    let matches = dom::select_ids(html, ".u-repost-of");
    let Some(&node) = matches.first() else {
        return String::new();
    };
    dom::text_content(html, node).trim().to_string()
}
/// Returns the trimmed inner HTML of the first `.story_text` element, or an
/// empty string when the page has no such element.
fn extract_story_body(html: &Html) -> String {
    let matches = dom::select_ids(html, ".story_text");
    let Some(&node) = matches.first() else {
        return String::new();
    };
    dom::inner_html(html, node).trim().to_string()
}
/// Returns the `href` attribute of the first `.u-repost-of` element, or an
/// empty string when the element or the attribute is missing.
fn extract_story_link(html: &Html) -> String {
    let matches = dom::select_ids(html, ".u-repost-of");
    let Some(&anchor) = matches.first() else {
        return String::new();
    };
    dom::get_attr(html, anchor, "href").unwrap_or_default()
}
fn extract_comments(html: &Html) -> String {
let comment_ids = dom::select_ids(html, ".comment_text");
if comment_ids.is_empty() {
return String::new();
}
let mut comments = Vec::new();
for &cid in &comment_ids {
let body = dom::inner_html(html, cid).trim().to_string();
if body.is_empty() {
continue;
}
let container = find_comment_container(html, cid);
let (author, depth, score) = container.map_or_else(
|| (String::new(), 0, String::new()),
|c| extract_comment_meta(html, c),
);
comments.push(CommentData {
author,
date: String::new(),
content: body,
depth,
score: if score.is_empty() { None } else { Some(score) },
url: None,
});
}
if comments.is_empty() {
return String::new();
}
build_comment_tree(&comments)
}
/// Finds the closest element carrying the `comment` class, starting at
/// `node_id` itself and then walking up through its parent elements.
///
/// Returns `None` when the walk reaches the top without a match.
fn find_comment_container(html: &Html, node_id: ego_tree::NodeId) -> Option<ego_tree::NodeId> {
    std::iter::successors(Some(node_id), |&node| dom::parent_element(html, node))
        .find(|&node| dom::has_class(html, node, "comment"))
}
/// Reads `(author, depth, score)` for the comment rooted at `container`.
///
/// * author — trimmed text of the first `.comment_author a` match, or empty.
/// * depth — as computed by [`extract_comment_depth`].
/// * score — trimmed text of the first `.comment_score` match, or empty.
fn extract_comment_meta(html: &Html, container: ego_tree::NodeId) -> (String, usize, String) {
    let author = match dom::select_within(html, container, ".comment_author a").first() {
        Some(&link) => dom::text_content(html, link).trim().to_string(),
        None => String::new(),
    };

    let depth = extract_comment_depth(html, container);

    let score = match dom::select_within(html, container, ".comment_score").first() {
        Some(&badge) => dom::text_content(html, badge).trim().to_string(),
        None => String::new(),
    };

    (author, depth, score)
}
/// Derives a comment's nesting depth from the inline style of its subtree
/// wrapper.
///
/// Starting at `comment_id` and walking up, the first element with the
/// `comments_subtree` class is inspected. Its `style` attribute is scanned
/// for the first `margin-left` declaration; when that value is a number
/// followed by `px`, the depth is the non-negative rounded pixel count divided
/// by 18 (integer division). Every other situation yields depth 0.
// NOTE(review): presumably one indentation step is 18px wide — confirm
// against the site's markup.
fn extract_comment_depth(html: &Html, comment_id: ego_tree::NodeId) -> usize {
    let subtree = std::iter::successors(Some(comment_id), |&node| {
        dom::parent_element(html, node)
    })
    .find(|&node| dom::has_class(html, node, "comments_subtree"));

    let Some(subtree) = subtree else {
        return 0;
    };
    let Some(style) = dom::get_attr(html, subtree, "style") else {
        return 0;
    };

    // Only the first `margin-left` declaration counts, even if it is unusable.
    let margin_px = style
        .split(';')
        .filter_map(|decl| decl.trim().split_once(':'))
        .find(|(prop, _)| prop.trim() == "margin-left")
        .and_then(|(_, value)| value.trim().strip_suffix("px"))
        .and_then(|number| number.trim().parse::<f64>().ok());

    match margin_px {
        Some(px) => {
            // The value is clamped to be non-negative and rounded before the cast.
            #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
            let whole_px = px.max(0.0).round() as usize;
            whole_px / 18
        }
        None => 0,
    }
}
/// Extracts the story short id from a Lobsters story URL.
///
/// The id is the text that follows `lobste.rs/s/`, cut at the first `/`, `?`
/// or `#`. Stopping at `?` and `#` keeps a query string or fragment
/// (for example `https://lobste.rs/s/abc123?utm=1` or `.../abc123#c_xyz`) from
/// leaking into the id and, from there, into the API URL built from it.
///
/// Returns `None` when the URL does not contain `lobste.rs/s/` or when the id
/// segment is empty.
fn parse_short_id(url: &str) -> Option<&str> {
    let after_prefix = url.split("lobste.rs/s/").nth(1)?;
    let id = after_prefix
        .split(|c: char| matches!(c, '/' | '?' | '#'))
        .next()?;
    if id.is_empty() {
        None
    } else {
        Some(id)
    }
}
/// Maps a Lobsters story URL to the URL of its JSON representation
/// (`https://lobste.rs/s/<id>.json`), or `None` when no short id can be
/// parsed from `url`.
fn api_url(url: &str) -> Option<String> {
    parse_short_id(url).map(|id| format!("https://lobste.rs/s/{id}.json"))
}
fn try_api_fetch(url: Option<&str>, include_replies: bool) -> Option<ExtractorResult> {
let u = url?;
let api = api_url(u)?;
let json = fetch_lobsters_json(&api)?;
build_from_api(&json, include_replies)
}
/// Builds an [`ExtractorResult`] from a Lobsters story JSON document.
///
/// Fields read from `json`:
/// * `title` — becomes the result title (omitted when empty).
/// * `url` — rendered as an anchor whose text is the escaped URL.
/// * `description` — inserted verbatim inside a `post-text` div.
/// * `submitter_user` — becomes the author. Both an object carrying a
///   `username` key and a bare username string are accepted, so the author is
///   not silently dropped when the user is serialised as a plain string.
/// * `created_at` — the part before the first `T` becomes the published date.
/// * `comments` — rendered only when `include_replies` is true.
///
/// Returns `None` when neither a story URL nor a non-blank description is
/// present.
fn build_from_api(json: &serde_json::Value, include_replies: bool) -> Option<ExtractorResult> {
    let title = lobsters_json_str(json, "title");
    let description = json
        .get("description")
        .and_then(serde_json::Value::as_str)
        .unwrap_or("");
    let story_url = lobsters_json_str(json, "url");

    // Accept either `"submitter_user": "name"` or `"submitter_user": {"username": "name"}`.
    let submitter = json
        .get("submitter_user")
        .and_then(|user| {
            user.as_str()
                .or_else(|| user.get("username").and_then(serde_json::Value::as_str))
        })
        .unwrap_or("")
        .to_string();

    let mut post_html = String::new();
    if !story_url.is_empty() {
        let escaped = dom::html_attr_escape(&story_url);
        // Writing into a `String` cannot fail, so the result is discarded.
        let _ = write!(
            post_html,
            "<p><a href=\"{escaped}\" target=\"_blank\">{escaped}</a></p>"
        );
    }
    if !description.trim().is_empty() {
        // NOTE(review): the description is emitted unescaped — presumably the
        // API already returns rendered HTML here; confirm.
        let _ = write!(post_html, "<div class=\"post-text\">{description}</div>");
    }
    if post_html.is_empty() {
        return None;
    }

    let comments_html = if include_replies {
        build_api_comments(json)
    } else {
        String::new()
    };
    let content = build_content_html("lobsters", &post_html, &comments_html);

    // Keep only the date portion of the timestamp (text before the first `T`).
    let published = json
        .get("created_at")
        .and_then(serde_json::Value::as_str)
        .and_then(|stamp| stamp.split('T').next())
        .map(String::from);

    Some(ExtractorResult {
        content,
        title: if title.is_empty() { None } else { Some(title) },
        author: if submitter.is_empty() {
            None
        } else {
            Some(submitter)
        },
        site: Some("Lobsters".to_string()),
        published,
        image: None,
        description: None,
    })
}
/// Renders the `comments` array of a Lobsters story JSON document through
/// `build_comment_tree`.
///
/// For each entry:
/// * `comment` is the body; entries whose body is missing or blank are skipped.
/// * `commenting_user` is the author. Both an object carrying a `username`
///   key and a bare username string are accepted, so the author is not
///   silently dropped when the user is serialised as a plain string.
/// * `depth` is the nesting level (0 when missing or not an unsigned integer
///   that fits in `usize`).
/// * `score` is rendered as `"<n> points"` (0 when missing).
///
/// Returns an empty string when `comments` is absent, not an array, or
/// contains no usable entry.
fn build_api_comments(json: &serde_json::Value) -> String {
    let Some(entries) = json.get("comments").and_then(serde_json::Value::as_array) else {
        return String::new();
    };

    let mut comments = Vec::with_capacity(entries.len());
    for entry in entries {
        let body = entry
            .get("comment")
            .and_then(serde_json::Value::as_str)
            .unwrap_or("");
        if body.trim().is_empty() {
            continue;
        }

        // Accept either `"commenting_user": "name"` or
        // `"commenting_user": {"username": "name"}`.
        let author = entry
            .get("commenting_user")
            .and_then(|user| {
                user.as_str()
                    .or_else(|| user.get("username").and_then(serde_json::Value::as_str))
            })
            .unwrap_or("")
            .to_string();

        let depth = entry
            .get("depth")
            .and_then(serde_json::Value::as_u64)
            .and_then(|d| usize::try_from(d).ok())
            .unwrap_or(0);

        let score = entry
            .get("score")
            .and_then(serde_json::Value::as_i64)
            .unwrap_or(0);

        comments.push(CommentData {
            author,
            date: String::new(),
            content: body.to_string(),
            depth,
            score: Some(format!("{score} points")),
            url: None,
        });
    }

    if comments.is_empty() {
        return String::new();
    }
    build_comment_tree(&comments)
}
/// Downloads `url` via `crate::http::get` and parses the body as JSON.
///
/// Returns `None` when the request yields nothing or the body is not valid
/// JSON.
fn fetch_lobsters_json(url: &str) -> Option<serde_json::Value> {
    crate::http::get(url).and_then(|body| serde_json::from_str(&body).ok())
}
/// Returns the string stored under `key` in `json` as an owned `String`, or
/// an empty string when the key is missing or its value is not a string.
fn lobsters_json_str(json: &serde_json::Value, key: &str) -> String {
    match json.get(key).and_then(serde_json::Value::as_str) {
        Some(text) => text.to_string(),
        None => String::new(),
    }
}
/// Unit tests for the Lobsters extractor.
///
/// Everything here runs offline except `api_fetch_live`, which is marked
/// `#[ignore]` so the default test run never touches the network.
#[cfg(test)]
#[expect(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// Story URLs (with or without a title slug) are recognised.
    #[test]
    fn detect_lobsters_url() {
        assert!(is_lobsters(Some("https://lobste.rs/s/abc123/some_story")));
        assert!(is_lobsters(Some("https://lobste.rs/s/xyz")));
    }

    /// Other hosts, the Lobsters front page and a missing URL are rejected.
    #[test]
    fn reject_non_lobsters_url() {
        assert!(!is_lobsters(Some("https://example.com")));
        assert!(!is_lobsters(Some("https://lobste.rs/")));
        assert!(!is_lobsters(None));
    }

    /// The short id is the path segment right after `/s/`.
    #[test]
    fn parse_short_id_valid() {
        assert_eq!(
            parse_short_id("https://lobste.rs/s/abc123/some_title"),
            Some("abc123")
        );
        assert_eq!(parse_short_id("https://lobste.rs/s/xyz"), Some("xyz"));
    }

    /// URLs without the `/s/` prefix have no short id.
    #[test]
    fn parse_short_id_invalid() {
        assert!(parse_short_id("https://example.com").is_none());
    }

    /// The API URL is the story URL reduced to `<id>.json`.
    #[test]
    fn api_url_construction() {
        assert_eq!(
            api_url("https://lobste.rs/s/abc123/story"),
            Some("https://lobste.rs/s/abc123.json".to_string())
        );
    }

    /// Title, story link and comment text are pulled from a minimal page.
    #[test]
    fn extract_from_html_basic() {
        let html_str = r#"
<html>
<body>
<a class="u-repost-of" href="https://example.com/article">
A Cool Story
</a>
<div class="comment">
<span class="comment_author">
<a href="/u/alice">alice</a>
</span>
<div class="comment_text">
<p>This is a great article.</p>
</div>
</div>
</body>
</html>
"#;
        let html = Html::parse_document(html_str);
        let result = extract_from_html(&html, true).unwrap();
        assert_eq!(result.title.as_deref(), Some("A Cool Story"));
        assert!(result.content.contains("example.com/article"));
        assert!(result.content.contains("great article"));
    }

    /// Story metadata and nested comments are read from an API-shaped document.
    #[test]
    fn build_from_api_basic() {
        let json = serde_json::json!({
            "title": "Test Story",
            "description": "<p>Story description</p>",
            "url": "https://example.com/test",
            "score": 42,
            "submitter_user": {"username": "alice"},
            "tags": ["rust"],
            "created_at": "2025-01-15T12:00:00Z",
            "comments": [
                {
                    "comment": "<p>Nice!</p>",
                    "commenting_user": {"username": "bob"},
                    "depth": 0,
                    "score": 5,
                    "created_at": "2025-01-15T13:00:00Z"
                },
                {
                    "comment": "<p>Agreed</p>",
                    "commenting_user": {"username": "carol"},
                    "depth": 1,
                    "score": 2,
                    "created_at": "2025-01-15T14:00:00Z"
                }
            ]
        });
        let result = build_from_api(&json, true).unwrap();
        assert_eq!(result.title.as_deref(), Some("Test Story"));
        assert_eq!(result.author.as_deref(), Some("alice"));
        assert_eq!(result.published.as_deref(), Some("2025-01-15"));
        assert!(result.content.contains("example.com/test"));
        assert!(result.content.contains("Nice!"));
        assert!(result.content.contains("bob"));
        assert!(result.content.contains("Agreed"));
    }

    /// Smoke test: the API fetch path must not panic, whatever the outcome.
    ///
    /// Ignored by default because it issues a real HTTP request; run it
    /// explicitly with `cargo test -- --ignored`.
    #[test]
    #[ignore = "performs a live HTTP request to lobste.rs"]
    fn api_fetch_live() {
        let url = "https://lobste.rs/s/abc123";
        let result = try_api_fetch(Some(url), false);
        drop(result);
    }
}