use anyhow::Result;
use std::collections::HashMap;
use std::fs;
use std::path::Path;
/// Collapses runs of consecutive `/` in the path of an absolute URL.
///
/// Only the part between the `scheme://` prefix and the first `?` or `#` is
/// rewritten. The query string and fragment are copied verbatim so that URLs
/// embedded in them (e.g. `?next=https://other.org`) keep their `//`.
///
/// Input without a `://` before any `?`/`#` is returned unchanged.
pub(super) fn normalise_url(url: &str) -> String {
    // Split off the query/fragment first so a "://" inside it is never
    // mistaken for the scheme separator of the URL itself.
    let suffix_start = url
        .find(|c: char| c == '?' || c == '#')
        .unwrap_or(url.len());
    let (head, suffix) = url.split_at(suffix_start);

    let Some(scheme_end) = head.find("://") else {
        return url.to_string();
    };
    let (scheme, path) = head.split_at(scheme_end + 3);

    let mut out = String::with_capacity(url.len());
    out.push_str(scheme);
    let mut prev_slash = false;
    for ch in path.chars() {
        let is_slash = ch == '/';
        // Drop a slash only when it directly follows another slash.
        if !(is_slash && prev_slash) {
            out.push(ch);
        }
        prev_slash = is_slash;
    }
    out.push_str(suffix);
    out
}
pub(super) fn xml_escape(s: &str) -> String {
s.replace('&', "&")
.replace('<', "<")
.replace('>', ">")
.replace('"', """)
.replace('\'', "'")
}
/// Shortens `s` to at most `max_len` bytes of text, preferring to cut at a
/// space, and appends `...`.
///
/// * Strings of `max_len` bytes or fewer are returned unchanged.
/// * The cut never splits a UTF-8 character.
/// * If the cut lands exactly at the end of a word, that word is kept;
///   otherwise the partial last word is dropped.
/// * Spaces directly before the ellipsis are removed.
/// * If no space is available, the text is cut hard at the byte limit.
///
/// The returned string can be up to three bytes longer than `max_len`
/// because the ellipsis is appended after truncation.
pub(super) fn truncate_at_word(s: &str, max_len: usize) -> String {
    if s.len() <= max_len {
        return s.to_string();
    }

    // Largest char boundary not exceeding `max_len`; index 0 always qualifies.
    let end = (0..=max_len)
        .rev()
        .find(|&i| s.is_char_boundary(i))
        .unwrap_or(0);
    let head = &s[..end];

    let kept = if s[end..].starts_with(' ') {
        // The limit falls right after a complete word, so nothing is dropped.
        head
    } else {
        match head.rfind(' ') {
            Some(pos) => &head[..pos],
            None => head,
        }
    };

    format!("{}...", kept.trim_end_matches(' '))
}
/// Recursively collects every `*.meta.json` sidecar file below `site_dir`.
///
/// Each result pairs the sidecar's parent directory, expressed relative to
/// `site_dir` (an empty string for files directly in `site_dir`, or when the
/// prefix cannot be stripped), with the file's contents parsed as a flat
/// string-to-string JSON object.
///
/// Sidecars that cannot be read or do not parse as `HashMap<String, String>`
/// are skipped silently; collection is best-effort per file.
///
/// # Errors
///
/// Returns an error if a directory cannot be listed or one of its entries
/// cannot be read.
pub(super) fn read_meta_sidecars(
    site_dir: &Path,
) -> Result<Vec<(String, HashMap<String, String>)>> {
    let mut entries = Vec::new();
    // Explicit stack instead of recursion for the directory walk.
    let mut stack = vec![site_dir.to_path_buf()];

    while let Some(current) = stack.pop() {
        if !current.is_dir() {
            continue;
        }
        for entry in fs::read_dir(&current)? {
            let path = entry?.path();
            if path.is_dir() {
                stack.push(path);
                continue;
            }

            let is_sidecar = path
                .file_name()
                .is_some_and(|n| n.to_string_lossy().ends_with(".meta.json"));
            if !is_sidecar {
                continue;
            }

            // Unreadable or malformed sidecars are ignored on purpose.
            let Ok(content) = fs::read_to_string(&path) else {
                continue;
            };
            let Ok(meta) = serde_json::from_str::<HashMap<String, String>>(&content)
            else {
                continue;
            };

            let rel = path
                .parent()
                .and_then(|p| p.strip_prefix(site_dir).ok())
                .map(|p| p.to_string_lossy().into_owned())
                .unwrap_or_default();
            entries.push((rel, meta));
        }
    }

    Ok(entries)
}
/// Calendar fields extracted from an RFC 2822 style date string.
///
/// The fields are stored as given by whoever constructs the value; `tz` holds
/// the raw zone token (for example `+0000`, `-0700` or `GMT`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub(super) struct Rfc2822Date {
    pub year: u32,
    pub month: u32,
    pub day: u32,
    pub hour: u32,
    pub min: u32,
    pub sec: u32,
    pub tz: String,
}

impl Rfc2822Date {
    /// Formats the date part as `YYYY-MM-DD`.
    pub(super) fn to_iso_date(&self) -> String {
        format!("{:04}-{:02}-{:02}", self.year, self.month, self.day)
    }

    /// Formats the value as `YYYY-MM-DDTHH:MM:SS<offset>`.
    ///
    /// * `+0000`, `GMT` and `UTC` become `+00:00`.
    /// * A numeric `±HHMM` zone becomes `±HH:MM`.
    /// * Any other zone text is appended unchanged.
    pub(super) fn to_rfc3339(&self) -> String {
        let tz = self.tz.as_str();
        let bytes = tz.as_bytes();
        // Only reformat a real numeric offset. Checking the bytes first also
        // guarantees that the slicing below stays on ASCII boundaries.
        let is_numeric_offset = bytes.len() == 5
            && matches!(bytes[0], b'+' | b'-')
            && bytes[1..].iter().all(u8::is_ascii_digit);

        let offset = match tz {
            "+0000" | "GMT" | "UTC" => "+00:00".to_string(),
            _ if is_numeric_offset => format!("{}:{}", &tz[..3], &tz[3..]),
            _ => tz.to_string(),
        };

        format!(
            "{:04}-{:02}-{:02}T{:02}:{:02}:{:02}{}",
            self.year, self.month, self.day, self.hour, self.min, self.sec, offset
        )
    }
}
/// Parses an RFC 2822 style date such as `Thu, 11 Apr 2026 06:06:06 +0000`.
///
/// The parser is deliberately forgiving:
/// * the leading `Day, ` prefix is optional;
/// * fields may be separated by any amount of whitespace (so a space-padded
///   day like `Thu,  1 Apr 2026 ...` is accepted);
/// * the zone is optional and defaults to `+0000`; only the first token after
///   the time is used, so a trailing comment such as `(UTC)` is ignored.
///
/// Returns `None` when a field is missing or not numeric, the month is not an
/// English three-letter abbreviation, the time is not `HH:MM:SS`, or a value
/// is out of range (day 1-31, hour 0-23, minute 0-59, second 0-60, where 60
/// allows for a leap second).
pub(super) fn parse_rfc2822_lenient(rfc: &str) -> Option<Rfc2822Date> {
    // Skip the optional weekday prefix ("Thu, ").
    let rest = rfc.find(", ").map_or(rfc, |pos| &rfc[pos + 2..]);
    let mut fields = rest.split_whitespace();

    let day: u32 = fields.next()?.parse().ok()?;
    let month: u32 = match fields.next()? {
        "Jan" => 1,
        "Feb" => 2,
        "Mar" => 3,
        "Apr" => 4,
        "May" => 5,
        "Jun" => 6,
        "Jul" => 7,
        "Aug" => 8,
        "Sep" => 9,
        "Oct" => 10,
        "Nov" => 11,
        "Dec" => 12,
        _ => return None,
    };
    let year: u32 = fields.next()?.parse().ok()?;

    let mut clock = fields.next()?.split(':');
    let hour: u32 = clock.next()?.parse().ok()?;
    let min: u32 = clock.next()?.parse().ok()?;
    let sec: u32 = clock.next()?.parse().ok()?;
    if clock.next().is_some() {
        // More than three colon-separated components.
        return None;
    }

    let in_range = (1..=31).contains(&day) && hour <= 23 && min <= 59 && sec <= 60;
    if !in_range {
        return None;
    }

    let tz = fields.next().unwrap_or("+0000").to_string();

    Some(Rfc2822Date {
        year,
        month,
        day,
        hour,
        min,
        sec,
        tz,
    })
}
/// Converts an RFC 2822 style date string to `YYYY-MM-DD`.
///
/// Returns `None` when `parse_rfc2822_lenient` cannot parse the input.
pub(super) fn rfc2822_to_iso_date(rfc: &str) -> Option<String> {
    let parsed = parse_rfc2822_lenient(rfc)?;
    Some(parsed.to_iso_date())
}
/// Converts an RFC 2822 style date string to a full timestamp via
/// `Rfc2822Date::to_rfc3339`.
///
/// Input that `parse_rfc2822_lenient` cannot parse is returned unchanged.
pub(super) fn rfc2822_to_iso8601(rfc: &str) -> String {
    match parse_rfc2822_lenient(rfc) {
        Some(dt) => dt.to_rfc3339(),
        None => rfc.to_string(),
    }
}
/// Returns the trimmed text between the first `<tag>` and the next `</tag>`.
///
/// Matching is purely textual: the opening tag must appear exactly as
/// `<tag>` (no attributes). Yields `None` when the opening tag is absent,
/// no closing tag follows it, or the enclosed text is empty after trimming.
pub(super) fn extract_xml_value(xml: &str, tag: &str) -> Option<String> {
    let opening = format!("<{tag}>");
    let closing = format!("</{tag}>");

    let body_start = xml.find(&opening)? + opening.len();
    let body = &xml[body_start..];
    let body_end = body.find(&closing)?;

    let text = body[..body_end].trim();
    (!text.is_empty()).then(|| text.to_string())
}
/// Applies `normalise_url` to every `http://` / `https://` URL found in
/// `line` and returns the rewritten line.
///
/// A URL extends from its scheme up to (not including) the next `<`, `"`,
/// `'` or whitespace character, or to the end of the line. Text outside URLs
/// is copied through untouched.
pub(super) fn normalise_url_in_xml_line(line: &str) -> String {
    const SCHEMES: [&str; 2] = ["https://", "http://"];

    let mut out = String::with_capacity(line.len());
    let mut rest = line;

    // Scan left to right so each URL is handled once, including URLs that
    // follow one which needed no change.
    while let Some(start) = SCHEMES.iter().filter_map(|s| rest.find(*s)).min() {
        out.push_str(&rest[..start]);
        let tail = &rest[start..];

        // The scheme contains no delimiter, so `end` is always past it and
        // the scan makes progress on every iteration.
        let end = tail
            .find(|c: char| c == '<' || c == '"' || c == '\'' || c.is_whitespace())
            .unwrap_or(tail.len());

        out.push_str(&normalise_url(&tail[..end]));
        rest = &tail[end..];
    }

    out.push_str(rest);
    out
}
#[cfg(test)]
mod tests {
    //! Unit tests for the URL, XML and date helpers in this module.

    use super::*;

    // ---- normalise_url ---------------------------------------------------

    #[test]
    fn test_normalise_url_double_slash() {
        assert_eq!(
            normalise_url("https://example.com//index.html"),
            "https://example.com/index.html"
        );
    }

    #[test]
    fn test_normalise_url_preserves_scheme() {
        assert_eq!(
            normalise_url("https://example.com/path/to/file"),
            "https://example.com/path/to/file"
        );
    }

    #[test]
    fn test_normalise_url_multiple_slashes() {
        assert_eq!(
            normalise_url("https://example.com///a//b///c"),
            "https://example.com/a/b/c"
        );
    }

    #[test]
    fn test_normalise_url_no_scheme() {
        assert_eq!(normalise_url("example.com//path"), "example.com//path");
    }

    #[test]
    fn test_normalise_url_trailing_slash() {
        assert_eq!(
            normalise_url("https://example.com/"),
            "https://example.com/"
        );
    }

    #[test]
    fn test_normalise_url_http_scheme() {
        assert_eq!(
            normalise_url("http://example.com//a//b"),
            "http://example.com/a/b"
        );
    }

    // ---- xml_escape ------------------------------------------------------

    #[test]
    fn test_xml_escape_ampersand() {
        assert_eq!(xml_escape("a&b"), "a&amp;b");
    }

    #[test]
    fn test_xml_escape_lt() {
        assert_eq!(xml_escape("a<b"), "a&lt;b");
    }

    #[test]
    fn test_xml_escape_gt() {
        assert_eq!(xml_escape("a>b"), "a&gt;b");
    }

    #[test]
    fn test_xml_escape_quot() {
        assert_eq!(xml_escape("a\"b"), "a&quot;b");
    }

    #[test]
    fn test_xml_escape_apos() {
        assert_eq!(xml_escape("a'b"), "a&apos;b");
    }

    #[test]
    fn test_xml_escape_all_combined() {
        assert_eq!(
            xml_escape("<tag attr=\"a&b\" val='c'>"),
            "&lt;tag attr=&quot;a&amp;b&quot; val=&apos;c&apos;&gt;"
        );
    }

    // ---- truncate_at_word ------------------------------------------------

    #[test]
    fn test_truncate_at_word_short() {
        assert_eq!(truncate_at_word("hello", 10), "hello");
    }

    #[test]
    fn test_truncate_at_word_long() {
        let result = truncate_at_word("hello world foo bar", 12);
        assert_eq!(result, "hello world...");
    }

    #[test]
    fn test_truncate_at_word_no_spaces() {
        let result = truncate_at_word("abcdefghij", 5);
        assert_eq!(result, "abcde...");
    }

    #[test]
    fn test_truncate_at_word_exact_length() {
        let result = truncate_at_word("hello", 5);
        assert_eq!(result, "hello");
    }

    // ---- RFC 2822 conversion helpers -------------------------------------

    #[test]
    fn test_rfc2822_to_iso_date() {
        let result = rfc2822_to_iso_date("Thu, 11 Apr 2026 06:06:06 +0000");
        assert_eq!(result, Some("2026-04-11".to_string()));
    }

    #[test]
    fn test_rfc2822_to_iso8601() {
        let result = rfc2822_to_iso8601("Thu, 11 Apr 2026 06:06:06 +0000");
        assert!(result.starts_with("2026-04-11"));
        assert!(result.contains('T'));
    }

    #[test]
    fn test_rfc2822_to_iso8601_passthrough() {
        let input = "2026-04-11";
        assert_eq!(rfc2822_to_iso8601(input), input);
    }

    // ---- Rfc2822Date formatting ------------------------------------------

    #[test]
    fn test_rfc2822_date_to_iso_date() {
        let dt = Rfc2822Date {
            year: 2026,
            month: 4,
            day: 11,
            hour: 6,
            min: 6,
            sec: 6,
            tz: "+0000".to_string(),
        };
        assert_eq!(dt.to_iso_date(), "2026-04-11");
    }

    #[test]
    fn test_rfc2822_date_to_rfc3339_utc() {
        let dt = Rfc2822Date {
            year: 2026,
            month: 4,
            day: 11,
            hour: 6,
            min: 6,
            sec: 6,
            tz: "+0000".to_string(),
        };
        assert_eq!(dt.to_rfc3339(), "2026-04-11T06:06:06+00:00");
    }

    #[test]
    fn test_rfc2822_date_to_rfc3339_gmt() {
        let dt = Rfc2822Date {
            year: 2025,
            month: 1,
            day: 15,
            hour: 12,
            min: 0,
            sec: 0,
            tz: "GMT".to_string(),
        };
        assert_eq!(dt.to_rfc3339(), "2025-01-15T12:00:00+00:00");
    }

    #[test]
    fn test_rfc2822_date_to_rfc3339_utc_tz() {
        let dt = Rfc2822Date {
            year: 2025,
            month: 6,
            day: 1,
            hour: 0,
            min: 0,
            sec: 0,
            tz: "UTC".to_string(),
        };
        assert_eq!(dt.to_rfc3339(), "2025-06-01T00:00:00+00:00");
    }

    #[test]
    fn test_rfc2822_date_to_rfc3339_positive_offset() {
        let dt = Rfc2822Date {
            year: 2026,
            month: 12,
            day: 25,
            hour: 18,
            min: 30,
            sec: 45,
            tz: "+0530".to_string(),
        };
        assert_eq!(dt.to_rfc3339(), "2026-12-25T18:30:45+05:30");
    }

    #[test]
    fn test_rfc2822_date_to_rfc3339_negative_offset() {
        let dt = Rfc2822Date {
            year: 2026,
            month: 7,
            day: 4,
            hour: 9,
            min: 15,
            sec: 0,
            tz: "-0700".to_string(),
        };
        assert_eq!(dt.to_rfc3339(), "2026-07-04T09:15:00-07:00");
    }

    #[test]
    fn test_rfc2822_date_to_rfc3339_unknown_tz() {
        let dt = Rfc2822Date {
            year: 2026,
            month: 1,
            day: 1,
            hour: 0,
            min: 0,
            sec: 0,
            tz: "EST".to_string(),
        };
        assert_eq!(dt.to_rfc3339(), "2026-01-01T00:00:00EST");
    }

    // ---- parse_rfc2822_lenient -------------------------------------------

    #[test]
    fn test_parse_rfc2822_lenient_no_weekday() {
        let dt = parse_rfc2822_lenient("11 Apr 2026 06:06:06 +0000");
        assert!(dt.is_some());
        let dt = dt.unwrap();
        assert_eq!(dt.day, 11);
        assert_eq!(dt.month, 4);
        assert_eq!(dt.year, 2026);
    }

    #[test]
    fn test_parse_rfc2822_lenient_invalid() {
        assert!(parse_rfc2822_lenient("not a date").is_none());
    }

    #[test]
    fn test_parse_rfc2822_lenient_too_few_parts() {
        assert!(parse_rfc2822_lenient("11 Apr").is_none());
    }

    #[test]
    fn test_parse_rfc2822_lenient_bad_month() {
        assert!(parse_rfc2822_lenient("11 Xxx 2026 06:06:06 +0000").is_none());
    }

    #[test]
    fn test_parse_rfc2822_lenient_bad_time() {
        assert!(parse_rfc2822_lenient("11 Apr 2026 06:06 +0000").is_none());
    }

    #[test]
    fn test_parse_rfc2822_lenient_no_tz_defaults() {
        let dt = parse_rfc2822_lenient("11 Apr 2026 06:06:06");
        assert!(dt.is_some());
        let dt = dt.unwrap();
        assert_eq!(dt.tz, "+0000");
    }

    // ---- extract_xml_value -----------------------------------------------

    #[test]
    fn test_extract_xml_value() {
        let xml = "<channel><title>Hello</title><link>https://example.com</link></channel>";
        assert_eq!(extract_xml_value(xml, "title"), Some("Hello".to_string()));
        assert_eq!(
            extract_xml_value(xml, "link"),
            Some("https://example.com".to_string())
        );
        assert_eq!(extract_xml_value(xml, "missing"), None);
    }

    #[test]
    fn test_extract_xml_value_empty_value() {
        let xml = "<title></title>";
        assert_eq!(extract_xml_value(xml, "title"), None);
    }

    #[test]
    fn test_extract_xml_value_whitespace_only() {
        let xml = "<title> </title>";
        assert_eq!(extract_xml_value(xml, "title"), None);
    }

    // ---- normalise_url_in_xml_line ---------------------------------------

    #[test]
    fn test_normalise_url_in_xml_line() {
        let line = " <loc>https://example.com//page//index.html</loc>";
        let result = normalise_url_in_xml_line(line);
        assert_eq!(result, " <loc>https://example.com/page/index.html</loc>");
    }

    #[test]
    fn test_normalise_url_in_xml_line_no_url() {
        let line = " <lastmod>2025-09-01</lastmod>";
        let result = normalise_url_in_xml_line(line);
        assert_eq!(result, line, "Non-URL lines should be unchanged");
    }
}