use crate::aur::validation::validate_package_name;
use crate::cache::cache_key_comments;
use crate::client::{
ArchClient, extract_retry_after, is_archlinux_url, rate_limit_archlinux,
reset_archlinux_backoff, retry_with_policy,
};
use crate::error::{ArchToolkitError, Result};
use crate::types::AurComment;
use chrono::{DateTime, Local, NaiveDate, NaiveDateTime};
use reqwest::Client as ReqwestClient;
use reqwest::header::{ACCEPT, ACCEPT_LANGUAGE, HeaderMap, HeaderValue};
use scraper::{ElementRef, Html, Selector};
use tracing::debug;
/// Read-only state shared by the per-comment extraction helpers while one
/// package page is being parsed.
struct CommentExtractionContext<'a> {
    /// Parsed page; comment bodies are looked up in it by element id.
    document: &'a Html,
    /// Selector used to locate the date link inside a comment header.
    date_selector: &'a Selector,
    /// Package name, used for log fields and for building absolute URLs.
    pkgname: &'a str,
    /// Raw page text, searched for comment ids when deciding pinned status.
    html_text: &'a str,
    /// Whether a heading containing "Pinned Comments" was found on the page.
    has_pinned_section: bool,
    /// Byte offset of the "latest comments" marker, if the page has one.
    latest_comments_pos: Option<usize>,
}
/// Fetches and parses the comments shown on a package's AUR page.
///
/// The package name is validated first. When comment caching is enabled on
/// the client and a cached entry exists, it is returned without any request.
/// Otherwise the page is downloaded (through `retry_with_policy` when the
/// retry policy enables it for comments), parsed, and stored in the cache.
///
/// # Errors
///
/// Returns an error when validation fails, when the generated URL is not
/// accepted by `is_archlinux_url`, when the request fails, or when the page
/// cannot be parsed.
pub async fn comments(client: &ArchClient, pkgname: &str) -> Result<Vec<AurComment>> {
    validate_package_name(pkgname, Some(client.validation_config()))?;

    // Serve from cache when possible.
    if let Some(cfg) = client.cache_config()
        && cfg.enable_comments
        && let Some(store) = client.cache()
        && let Some(hit) = store.get::<Vec<AurComment>>(&cache_key_comments(pkgname))
    {
        debug!(pkgname = %pkgname, "cache hit for comments");
        return Ok(hit);
    }

    let url = format!("https://aur.archlinux.org/packages/{pkgname}");
    debug!(pkgname = %pkgname, url = %url, "fetching AUR comments");

    if !is_archlinux_url(&url) {
        return Err(ArchToolkitError::InvalidInput(format!(
            "Unexpected URL domain: {url}"
        )));
    }
    // Kept alive until the function returns.
    // NOTE(review): presumably a rate-limit permit that must span the request — confirm.
    let _permit = rate_limit_archlinux().await;

    let policy = client.retry_policy();
    let http = client.http_client();
    let html_text = if policy.enabled && policy.retry_comments {
        retry_with_policy(policy, "comments", pkgname, || async {
            perform_comments_request(http, &url, pkgname).await
        })
        .await?
    } else {
        perform_comments_request(http, &url, pkgname).await?
    };

    let parsed = parse_comments_html(&html_text, pkgname)?;

    // Best effort: the result of the cache write is deliberately ignored.
    if let Some(cfg) = client.cache_config()
        && cfg.enable_comments
        && let Some(store) = client.cache()
    {
        let _ = store.set(&cache_key_comments(pkgname), &parsed, cfg.comments_ttl);
    }

    Ok(parsed)
}
/// Sends one GET request for a package page and returns the response body.
///
/// Each failure (sending, non-success status, reading the body) is logged at
/// debug level and converted with `ArchToolkitError::comments_failed`, which
/// receives the package name.
async fn perform_comments_request(
    client: &ReqwestClient,
    url: &str,
    pkgname: &str,
) -> Result<String> {
    let mut request_headers = HeaderMap::new();
    request_headers.insert(
        ACCEPT,
        HeaderValue::from_static("text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"),
    );
    request_headers.insert(ACCEPT_LANGUAGE, HeaderValue::from_static("en-US,en;q=0.5"));

    let response = client
        .get(url)
        .headers(request_headers)
        .send()
        .await
        .map_err(|e| {
            debug!(error = %e, pkgname = %pkgname, "AUR comments request failed");
            ArchToolkitError::comments_failed(pkgname, e)
        })?;

    // Runs for every response that was received, before the status is checked.
    reset_archlinux_backoff();

    // The extracted value is not used here.
    let _retry_after = extract_retry_after(&response);

    let response = response.error_for_status().map_err(|e| {
        debug!(error = %e, pkgname = %pkgname, "AUR comments returned non-success status");
        ArchToolkitError::comments_failed(pkgname, e)
    })?;

    let body = response.text().await.map_err(|e| {
        debug!(error = %e, pkgname = %pkgname, "failed to read AUR comments response");
        ArchToolkitError::comments_failed(pkgname, e)
    })?;

    Ok(body)
}
/// Parses a package page and returns its comments, pinned comments first and
/// each group sorted newest-first.
///
/// Comment headers are the `h4.comment-header` elements. A header whose `id`
/// attribute has already been seen is skipped, so a comment that appears
/// twice on the page is returned once.
///
/// # Errors
///
/// Returns `ArchToolkitError::Parse` if one of the CSS selectors fails to
/// compile.
fn parse_comments_html(html_text: &str, pkgname: &str) -> Result<Vec<AurComment>> {
    let document = Html::parse_document(html_text);

    let comment_header_selector = Selector::parse("h4.comment-header").map_err(|e| {
        ArchToolkitError::Parse(format!("Failed to parse comment header selector: {e}"))
    })?;
    let date_selector = Selector::parse("a.date")
        .map_err(|e| ArchToolkitError::Parse(format!("Failed to parse date selector: {e}")))?;
    let heading_selector = Selector::parse("h3, h2, h4")
        .map_err(|e| ArchToolkitError::Parse(format!("Failed to parse heading selector: {e}")))?;

    let has_pinned_section = document
        .select(&heading_selector)
        .any(|heading| heading.text().collect::<String>().contains("Pinned Comments"));

    // ASCII-only lowercasing keeps every byte offset identical to `html_text`.
    // Full Unicode lowercasing can change the byte length of some characters,
    // which would shift this position relative to the offsets later found in
    // the original text.
    let latest_comments_pos = html_text.to_ascii_lowercase().find("latest comments");

    // Identical for every header, so it is built once.
    let context = CommentExtractionContext {
        document: &document,
        date_selector: &date_selector,
        pkgname,
        html_text,
        has_pinned_section,
        latest_comments_pos,
    };

    let mut seen_comment_ids = std::collections::HashSet::new();
    let mut comments = Vec::new();
    for (index, header) in document.select(&comment_header_selector).enumerate() {
        let comment_id = header.value().attr("id");
        if let Some(id) = comment_id
            && !seen_comment_ids.insert(id)
        {
            // Duplicate id: this comment was already extracted.
            continue;
        }
        if let Some(comment) = extract_comment_from_header(&header, comment_id, index, &context) {
            comments.push(comment);
        }
    }

    Ok(separate_and_sort_comments(comments))
}
fn extract_comment_from_header(
header: &ElementRef,
comment_id: Option<&str>,
index: usize,
context: &CommentExtractionContext,
) -> Option<AurComment> {
let header_text = header.text().collect::<String>();
let author = header_text.find(" commented on ").map_or_else(
|| {
header_text
.split_whitespace()
.next()
.unwrap_or("Unknown")
.to_string()
},
|pos| header_text[..pos].trim().to_string(),
);
let base_url = format!("https://aur.archlinux.org/packages/{}", context.pkgname);
let (date_text, date_url) = header.select(context.date_selector).next().map_or_else(
|| (String::new(), None),
|e| {
let text = e.text().collect::<String>().trim().to_string();
let url = e.value().attr("href").map(|href| {
if href.starts_with("http://") || href.starts_with("https://") {
href.to_string()
} else if href.starts_with('#') {
format!("{base_url}{href}")
} else {
format!("https://aur.archlinux.org{href}")
}
});
(text, url)
},
);
let comment_content = comment_id
.and_then(|id| id.strip_prefix("comment-"))
.and_then(|comment_id_str| {
Selector::parse(&format!("div#comment-{comment_id_str}-content")).ok()
})
.and_then(|content_id_selector| context.document.select(&content_id_selector).next())
.map_or_else(String::new, |div| {
html_to_formatted_text(div)
});
if comment_content.is_empty() && author == "Unknown" {
return None;
}
let date_timestamp = parse_date_to_timestamp(&date_text);
if date_timestamp.is_none() && !date_text.is_empty() {
debug!(
pkgname = %context.pkgname,
author = %author,
date_text = %date_text,
"Failed to parse comment date to timestamp"
);
}
let local_date = convert_utc_to_local_date(&date_text);
let is_pinned = determine_pinned_status(comment_id, index, context);
let stable_id = comment_id.map(str::to_string).or_else(|| date_url.clone());
Some(AurComment {
id: stable_id,
author,
date: local_date,
date_timestamp,
date_url,
content: comment_content,
pinned: is_pinned,
})
}
/// Decides whether a comment belongs to the pinned section of the page.
///
/// A comment is pinned only if the page has a pinned section and a
/// "latest comments" marker, and the comment's id first occurs in the raw
/// page text before that marker. When the id is missing or cannot be found,
/// the comment is treated as pinned if it is among the first ten headers.
fn determine_pinned_status(
    comment_id: Option<&str>,
    index: usize,
    context: &CommentExtractionContext,
) -> bool {
    if !context.has_pinned_section {
        return false;
    }
    let Some(latest_pos) = context.latest_comments_pos else {
        return false;
    };

    let positional_guess = index < 10;
    let Some(id) = comment_id else {
        return positional_guess;
    };

    let page_bytes = context.html_text.as_bytes();
    context
        .html_text
        .match_indices(id)
        // Skip occurrences that are only a prefix of a longer id: a plain
        // substring search for `comment-12` would also match `comment-123`.
        .find(|(start, matched)| {
            !page_bytes
                .get(start + matched.len())
                .is_some_and(u8::is_ascii_alphanumeric)
        })
        .map_or(positional_guess, |(comment_pos, _)| comment_pos < latest_pos)
}
/// Orders comments for display: pinned comments first, then the rest, with
/// each group sorted by `sort_comments_by_date`.
fn separate_and_sort_comments(comments: Vec<AurComment>) -> Vec<AurComment> {
    // One pass splits the input without cloning; relative order within each
    // group is preserved before sorting.
    let (mut pinned_comments, mut regular_comments): (Vec<AurComment>, Vec<AurComment>) =
        comments.into_iter().partition(|comment| comment.pinned);

    sort_comments_by_date(&mut pinned_comments);
    sort_comments_by_date(&mut regular_comments);

    pinned_comments.extend(regular_comments);
    pinned_comments
}
/// Sorts comments newest-first, in place.
///
/// Comments with a timestamp come before those without one. Comments that
/// both lack a timestamp are ordered by their `date` string, descending.
fn sort_comments_by_date(comments: &mut [AurComment]) {
    comments.sort_by(|lhs, rhs| match (lhs.date_timestamp, rhs.date_timestamp) {
        // No timestamps at all: fall back to the formatted date text.
        (None, None) => rhs.date.cmp(&lhs.date),
        // `None` orders below `Some`, so the reversed comparison puts newer
        // timestamps first and timestamp-less comments last.
        (lhs_ts, rhs_ts) => rhs_ts.cmp(&lhs_ts),
    });
}
/// Re-expresses a date string of the form `YYYY-MM-DD HH:MM (TZ)` in the
/// local timezone.
///
/// The part before the last `(` is parsed as a UTC date-time. On success the
/// result is `YYYY-MM-DD HH:MM (<abbr>)` in local time; in every other case
/// the trimmed input is returned unchanged.
fn convert_utc_to_local_date(utc_date_str: &str) -> String {
    let trimmed = utc_date_str.trim();

    let parsed = trimmed.rfind('(').and_then(|paren_at| {
        NaiveDateTime::parse_from_str(trimmed[..paren_at].trim(), "%Y-%m-%d %H:%M").ok()
    });

    match parsed {
        Some(naive) => {
            let local_dt = naive.and_utc().with_timezone(&Local);
            let tz_abbr = get_timezone_abbreviation(&local_dt);
            format!("{} ({tz_abbr})", local_dt.format("%Y-%m-%d %H:%M"))
        }
        None => trimmed.to_string(),
    }
}
fn get_timezone_abbreviation(local_dt: &DateTime<Local>) -> String {
let tz_from_format = local_dt.format("%Z").to_string();
if !tz_from_format.is_empty()
&& tz_from_format.len() >= 3
&& tz_from_format.len() <= 6
&& tz_from_format.chars().all(char::is_alphabetic)
&& !tz_from_format.starts_with("UTC")
{
return tz_from_format;
}
if let Ok(tz_env) = std::env::var("TZ") {
if let Some(tz_name) = tz_env.rsplit('/').next() {
if tz_name.len() >= 3
&& tz_name.len() <= 6
&& tz_name.chars().all(|c| c.is_uppercase() || c == '-')
{
let abbr = tz_name.split('-').next().unwrap_or(tz_name);
if abbr.len() >= 3 && abbr.chars().all(char::is_alphabetic) {
return abbr.to_string();
}
}
}
}
let offset_secs = local_dt.offset().local_minus_utc();
let hours = offset_secs / 3600;
let minutes = (offset_secs.abs() % 3600) / 60;
if offset_secs == 0 {
"UTC".to_string()
} else if minutes == 0 {
format!("UTC{hours:+}")
} else {
format!("UTC{hours:+}:{minutes:02}")
}
}
fn parse_date_to_timestamp(date_str: &str) -> Option<i64> {
let date_str = date_str.trim();
if date_str.is_empty() {
return None;
}
if let Some(tz_start) = date_str.rfind('(') {
let date_time_part = date_str[..tz_start].trim();
if let Ok(dt) = NaiveDateTime::parse_from_str(date_time_part, "%Y-%m-%d %H:%M") {
return Some(dt.and_utc().timestamp());
}
if let Ok(dt) = NaiveDateTime::parse_from_str(date_time_part, "%Y-%m-%d %H:%M:%S") {
return Some(dt.and_utc().timestamp());
}
}
if let Ok(dt) = NaiveDateTime::parse_from_str(date_str, "%Y-%m-%d %H:%M:%S") {
return Some(dt.and_utc().timestamp());
}
if let Ok(dt) = NaiveDateTime::parse_from_str(date_str, "%Y-%m-%dT%H:%M:%S") {
return Some(dt.and_utc().timestamp());
}
if let Ok(dt) = DateTime::parse_from_str(date_str, "%Y-%m-%dT%H:%M:%S%z") {
return Some(dt.timestamp());
}
if let Ok(d) = NaiveDate::parse_from_str(date_str, "%Y-%m-%d")
&& let Some(dt) = d.and_hms_opt(0, 0, 0)
{
return Some(dt.and_utc().timestamp());
}
if let Ok(ts) = date_str.parse::<i64>() {
if ts > 946_684_800 && ts < 4_102_444_800 {
return Some(ts);
}
}
None
}
/// Renders a comment body element as plain text with light markup.
///
/// If the element contains any `<p>` elements, each one is rendered with
/// `format_text_node` and the results are joined with a blank line. Content
/// that is not inside one of those `<p>` elements is not part of the output
/// in that case. Without any `<p>`, the element itself is rendered.
fn html_to_formatted_text(element: ElementRef) -> String {
    let Ok(paragraph_selector) = Selector::parse("p") else {
        return format_text_node(&element);
    };

    let rendered: Vec<String> = element
        .select(&paragraph_selector)
        .map(|paragraph| format_text_node(&paragraph))
        .collect();

    if rendered.is_empty() {
        format_text_node(&element)
    } else {
        rendered.join("\n\n")
    }
}
/// Escapes `&`, `<` and `>` so that `text` can be embedded in HTML and read
/// back unchanged after parsing.
fn escape_html_text(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            other => escaped.push(other),
        }
    }
    escaped
}

/// Converts an element to text, rewriting a few tags into markdown-like
/// markup: `<pre>` becomes a fenced block, `<a href>` becomes `[text](href)`,
/// `<strong>`/`<b>` become `**text**`, `<em>`/`<i>` become `*text*`, and
/// `<code>` becomes `` `text` ``.
///
/// The rewrite works on the element's serialized HTML: each matching
/// descendant's HTML is replaced by its markup form, then the result is
/// parsed again and its text is collected. Replacements are applied in the
/// order listed above, so a descendant nested inside an already replaced one
/// is left alone.
fn format_text_node(element: &ElementRef) -> String {
    let mut result = element.html();

    if let Ok(pre_selector) = Selector::parse("pre") {
        for pre in element.select(&pre_selector) {
            let text = pre.text().collect::<String>();
            // The text is decoded; it has to be escaped again because the
            // replacement is re-parsed as HTML below. Without this, code such
            // as `#include <stdio.h>` would lose `<stdio.h>` as an unknown tag.
            let replacement = format!("```\n{}\n```", escape_html_text(text.trim()));
            result = result.replace(&pre.html(), &replacement);
        }
    }

    if let Ok(link_selector) = Selector::parse("a") {
        for link in element.select(&link_selector) {
            let text = link.text().collect::<String>();
            if let Some(href) = link.value().attr("href") {
                let replacement = format!(
                    "[{}]({})",
                    escape_html_text(text.trim()),
                    escape_html_text(href)
                );
                result = result.replace(&link.html(), &replacement);
            }
        }
    }

    // Inline wrappers share one shape: trimmed, non-empty text surrounded by
    // the same marker on both sides.
    for (css, marker) in [("strong, b", "**"), ("em, i", "*"), ("code", "`")] {
        let Ok(selector) = Selector::parse(css) else {
            continue;
        };
        for node in element.select(&selector) {
            let text = node.text().collect::<String>();
            let text = text.trim();
            if !text.is_empty() {
                let replacement = format!("{marker}{}{marker}", escape_html_text(text));
                result = result.replace(&node.html(), &replacement);
            }
        }
    }

    let fragment = Html::parse_fragment(&result);
    let text_result = fragment.root_element().text().collect::<String>();

    // NOTE(review): these run on the extracted text, after `<br>` elements
    // have been consumed by the parser, so they only affect a literal "<br>"
    // that appears as text. Kept as-is to preserve existing output; confirm
    // whether converting `<br>` before parsing was intended.
    text_result
        .replace("<br>", "\n")
        .replace("<br/>", "\n")
        .replace("<br />", "\n")
}
#[cfg(test)]
mod tests {
    use crate::error::ArchToolkitError;

    /// The error built by `ArchToolkitError::comments_failed` must mention
    /// both the package name and the comments operation when displayed.
    #[test]
    fn test_comments_error_includes_package_context() {
        let package = "yay";

        // A `reqwest::Error` is needed as the source. It comes either from
        // rejecting the bogus PEM directly or, if that is accepted, from
        // building a client with it.
        // NOTE(review): nothing in this statement calls `unwrap`; the allow
        // below looks vestigial — confirm before removing.
        #[allow(clippy::unwrap_used)]
        let mock_error = match reqwest::Certificate::from_pem(b"invalid cert") {
            Err(pem_error) => pem_error,
            Ok(cert) => reqwest::Client::builder()
                .add_root_certificate(cert)
                .build()
                .expect_err("Should fail to build client with invalid cert"),
        };

        let error_msg = ArchToolkitError::comments_failed(package, mock_error).to_string();

        assert!(
            error_msg.contains(package),
            "Error message should include package name: {error_msg}"
        );
        assert!(
            error_msg.contains("AUR comments fetch failed"),
            "Error message should indicate comments operation: {error_msg}"
        );
    }
}