use anyhow::Result;
use std::path::{Path, PathBuf};
/// Returns the text of the first `<title>…</title>` element in `html`.
///
/// Only the literal opening tag `<title>` (no attributes) is recognised.
/// Markup nested inside the title is removed via `strip_tags` and the
/// result is trimmed. An empty string is returned when no complete title
/// element exists or when its text is blank.
pub(super) fn extract_title(html: &str) -> String {
    const OPEN: &str = "<title>";
    const CLOSE: &str = "</title>";

    html.find(OPEN)
        .map(|at| &html[at + OPEN.len()..])
        .and_then(|rest| rest.find(CLOSE).map(|end| strip_tags(&rest[..end])))
        .map(|text| text.trim().to_string())
        .filter(|text| !text.is_empty())
        .unwrap_or_default()
}
/// Builds a plain-text description from the main content of `html`.
///
/// The main content region is selected, `<script>`/`<style>` elements are
/// dropped, all remaining markup is stripped, and the text is shortened to
/// at most `max_len` bytes, cutting at a word boundary.
pub(super) fn extract_description(html: &str, max_len: usize) -> String {
    let main = extract_main_content(html);
    let plain = strip_tags(&strip_inline_tags(&main, &["script", "style"]));
    truncate_at_word_boundary(plain.trim(), max_len)
}
/// Picks the region of `html` most likely to hold the page's real content.
///
/// The inner markup of `<main>` is returned untouched when that element is
/// present. Otherwise the inner markup of `<body>` (or the whole document if
/// there is no `<body>`) is used, with script, style, nav, header and footer
/// elements removed.
fn extract_main_content(html: &str) -> String {
    extract_tag_inner(html, "main").unwrap_or_else(|| {
        let body = match extract_tag_inner(html, "body") {
            Some(inner) => inner,
            None => html.to_string(),
        };
        strip_inline_tags(&body, &["script", "style", "nav", "header", "footer"])
    })
}
/// Returns the markup between the first `<tag_name …>` and its closing tag.
///
/// The opening tag is matched by the prefix `<tag_name`, so attributes are
/// allowed. When no `</tag_name>` follows, everything after the opening tag
/// is returned. `None` is returned when the opening tag is missing or is
/// never terminated by `>`.
fn extract_tag_inner(html: &str, tag_name: &str) -> Option<String> {
    let opening = format!("<{tag_name}");
    let closing = format!("</{tag_name}>");

    let from_tag = &html[html.find(&opening)?..];
    let body = &from_tag[from_tag.find('>')? + 1..];
    // An unclosed element extends to the end of the input.
    let stop = body.find(&closing).unwrap_or(body.len());
    Some(body[..stop].to_string())
}
/// Removes every `<tag …>…</tag>` element (for each name in `tags`) from
/// `html`, replacing each removed element with a single space.
///
/// Tags are processed in the order given. For a given tag, removal stops as
/// soon as the first remaining opening tag has no closing tag after it; that
/// unclosed remainder is left in place.
fn strip_inline_tags(html: &str, tags: &[&str]) -> String {
    tags.iter().fold(html.to_string(), |mut buf, tag| {
        let opening = format!("<{tag}");
        let closing = format!("</{tag}>");
        loop {
            // Byte range of the first complete element, if any.
            let element = buf.find(&opening).and_then(|from| {
                buf[from..]
                    .find(&closing)
                    .map(|rel| from..from + rel + closing.len())
            });
            match element {
                Some(range) => buf.replace_range(range, " "),
                None => break,
            }
        }
        buf
    })
}
/// Shortens `text` to at most `max_len` bytes without splitting a character
/// and, where possible, without splitting a word.
///
/// Text that already fits is returned unchanged. Otherwise the text is cut at
/// the nearest character boundary at or below `max_len`, and then cut again
/// at the last space inside that prefix (if there is one).
fn truncate_at_word_boundary(text: &str, max_len: usize) -> String {
    if text.len() <= max_len {
        return text.to_string();
    }

    // Index 0 is always a boundary, so the search cannot come up empty.
    let cut = (0..=max_len)
        .rev()
        .find(|&idx| text.is_char_boundary(idx))
        .unwrap_or(0);
    let head = &text[..cut];
    let keep = head.rfind(' ').unwrap_or(head.len());
    head[..keep].to_string()
}
/// Removes markup from `html` and normalises whitespace.
///
/// Everything from a `<` up to the next `>` is discarded and each `>` is
/// replaced by a space, so text on either side of a tag stays separated.
/// Runs of whitespace are then collapsed to single spaces and the result has
/// no leading or trailing whitespace.
pub(super) fn strip_tags(html: &str) -> String {
    let mut text = String::with_capacity(html.len());
    let mut inside_tag = false;

    for c in html.chars() {
        if c == '<' {
            inside_tag = true;
        } else if c == '>' {
            inside_tag = false;
            text.push(' ');
        } else if !inside_tag {
            text.push(c);
        }
    }

    // Splitting on whitespace and re-joining collapses runs and trims ends.
    text.split_whitespace().collect::<Vec<_>>().join(" ")
}
/// Forwards `dir` to `crate::walk::walk_files` with `"html"` as the second
/// argument and returns that call's result unchanged.
///
/// NOTE(review): presumably yields the paths of files with an `.html`
/// extension found under `dir`; the traversal rules and error conditions are
/// defined by `walk_files` — confirm there.
pub(super) fn collect_html_files(dir: &Path) -> Result<Vec<PathBuf>> {
crate::walk::walk_files(dir, "html")
}
pub(super) fn escape_attr(s: &str) -> String {
s.replace('&', "&")
.replace('"', """)
.replace('<', "<")
.replace('>', ">")
}
/// Reports whether `html` contains a `<meta>` tag whose first attribute is
/// `property` or `name` with the value `attr`.
///
/// Both double- and single-quoted values are recognised. The match is a
/// literal substring test, so the attribute must directly follow `<meta `.
pub(super) fn has_meta_tag(html: &str, attr: &str) -> bool {
    ["property", "name"].iter().any(|key| {
        ['"', '\'']
            .iter()
            .any(|quote| html.contains(&format!("<meta {key}={quote}{attr}{quote}")))
    })
}
pub(super) fn extract_canonical(html: &str) -> String {
if let Some(pos) = html.find("rel=\"canonical\"") {
let region_start = pos.saturating_sub(200);
let region = &html[region_start..html.len().min(pos + 200)];
if let Some(href_start) = region.find("href=\"") {
let after = ®ion[href_start + 6..];
if let Some(end) = after.find('"') {
return after[..end].to_string();
}
}
}
String::new()
}
/// Returns the trimmed `content` value of a `<meta>` tag identified by
/// `attr`, or an empty string if none is found.
///
/// Four literal tag shapes are tried in order: `name` with double quotes,
/// `property` with double quotes, `name` with single quotes, `property` with
/// single quotes. For each shape only its first occurrence is examined; a
/// blank or unterminated value makes the search move on to the next shape.
pub(super) fn extract_existing_meta(html: &str, attr: &str) -> String {
    const SHAPES: [(&str, char); 4] = [
        ("name", '"'),
        ("property", '"'),
        ("name", '\''),
        ("property", '\''),
    ];

    SHAPES
        .iter()
        .find_map(|&(key, quote)| {
            let prefix = format!("<meta {key}={quote}{attr}{quote} content={quote}");
            let rest = &html[html.find(prefix.as_str())? + prefix.len()..];
            let value = rest[..rest.find(quote)?].trim();
            (!value.is_empty()).then(|| value.to_string())
        })
        .unwrap_or_default()
}
/// Returns the value of the `lang` attribute on the `<html>` tag, or an
/// empty string when the tag or the attribute is absent.
///
/// A double-quoted value is preferred; a single-quoted value is used only if
/// no terminated double-quoted one exists. If the `<html` tag is never
/// closed by `>`, the remainder of the document is treated as the tag
/// instead of slicing by a fixed length, so truncated input cannot cause an
/// out-of-bounds or mid-character slice.
pub(super) fn extract_html_lang(html: &str) -> String {
    /// Looks up `lang=<quote>…<quote>` inside `tag`.
    fn quoted_lang(tag: &str, quote: char) -> Option<&str> {
        let marker = format!("lang={quote}");
        let value = &tag[tag.find(&marker)? + marker.len()..];
        Some(&value[..value.find(quote)?])
    }

    let from_html = match html.find("<html") {
        Some(start) => &html[start..],
        None => return String::new(),
    };
    let tag = &from_html[..from_html.find('>').unwrap_or(from_html.len())];

    quoted_lang(tag, '"')
        .or_else(|| quoted_lang(tag, '\''))
        .unwrap_or_default()
        .to_string()
}
/// Returns the `src` of the first `<img>` that appears at or after the first
/// `<main` tag, or, if there is no `<main`, at or after the first `<article`
/// tag. Returns an empty string when neither container exists, no image
/// follows, or the image has no double-quoted `src`.
///
/// At most the first 500 bytes of the `<img` tag are inspected. The cut-off
/// is clamped to the available input and moved back to a character boundary,
/// so truncated markup or multi-byte text inside the tag cannot cause a
/// slicing panic.
pub(super) fn extract_first_content_image(html: &str) -> String {
    /// Upper bound, in bytes, on how much of an `<img` tag is scanned.
    const MAX_TAG_LEN: usize = 500;

    /// Option-returning core so that `?` can be used for each lookup.
    fn first_img_src(html: &str) -> Option<&str> {
        const SRC: &str = "src=\"";

        let region_start = html.find("<main").or_else(|| html.find("<article"))?;
        let region = &html[region_start..];
        let after_img = &region[region.find("<img")?..];

        let mut tag_end = after_img
            .find('>')
            .unwrap_or(after_img.len())
            .min(MAX_TAG_LEN);
        // The cap may land inside a multi-byte character; back off until the
        // index is sliceable (index 0 always is, so this terminates).
        while !after_img.is_char_boundary(tag_end) {
            tag_end -= 1;
        }
        let img_tag = &after_img[..tag_end];

        let value = &img_tag[img_tag.find(SRC)? + SRC.len()..];
        Some(&value[..value.find('"')?])
    }

    first_img_src(html).unwrap_or_default().to_string()
}
/// Determines the page author, returning an empty string if none is found.
///
/// A non-empty `author` meta tag takes precedence. Failing that, the text
/// immediately following the first `class="author">`, `class='author'>` or
/// `rel="author">` marker (tried in that order) is used, up to the next `<`.
/// The text is trimmed and a leading `by ` is dropped; a marker whose text
/// ends up blank is skipped.
pub(super) fn extract_meta_author(html: &str) -> String {
    const MARKERS: [&str; 3] = ["class=\"author\">", "class='author'>", "rel=\"author\">"];

    let declared = extract_existing_meta(html, "author");
    if !declared.is_empty() {
        return declared;
    }

    MARKERS
        .iter()
        .find_map(|marker| {
            let rest = &html[html.find(*marker)? + marker.len()..];
            let raw = rest[..rest.find('<')?].trim();
            let name = raw.strip_prefix("by ").unwrap_or(raw).trim();
            (!name.is_empty()).then(|| name.to_string())
        })
        .unwrap_or_default()
}
/// Extracts the string value of a JSON-style `"field":"value"` pair embedded
/// in `html`.
///
/// Only the exact form with no whitespace around the colon is matched, and
/// only the first occurrence is examined. Returns `None` when the pair is
/// missing, the value is unterminated, or the value is empty.
pub(super) fn extract_date_from_html(
    html: &str,
    field: &str,
) -> Option<String> {
    let key = format!("\"{field}\":\"");
    let rest = &html[html.find(&key)? + key.len()..];
    let value = &rest[..rest.find('"')?];
    if value.is_empty() {
        None
    } else {
        Some(value.to_string())
    }
}
/// Finds a publication date for the page.
///
/// A non-empty `article:published_time` meta tag is preferred. Otherwise the
/// value of the first double-quoted `datetime` attribute in the document is
/// used. Returns `None` when neither source yields a non-empty value.
pub(super) fn extract_meta_date(html: &str) -> Option<String> {
    const DATETIME_ATTR: &str = "datetime=\"";

    let published = extract_existing_meta(html, "article:published_time");
    if !published.is_empty() {
        return Some(published);
    }

    let rest = &html[html.find(DATETIME_ATTR)? + DATETIME_ATTR.len()..];
    let value = &rest[..rest.find('"')?];
    (!value.is_empty()).then(|| value.to_string())
}
/// Forwards `dir` to `crate::walk::walk_files` with `"html"` as the second
/// argument and returns that call's result unchanged.
///
/// NOTE(review): the body is the same call made by `collect_html_files`;
/// whether the traversal actually descends into subdirectories is decided by
/// `walk_files` — confirm there, and consider whether both wrappers are
/// still needed.
pub(super) fn collect_html_files_recursive(dir: &Path) -> Result<Vec<PathBuf>> {
crate::walk::walk_files(dir, "html")
}