use papaya::HashMap as ConcurrentHashMap;
use serde::{Deserialize, Serialize};
use std::sync::atomic::{AtomicUsize, Ordering};
use std::time::Instant;
/// A link that leaves a page and points at some target.
///
/// Serializable with serde; the `anchor` field is left out of the serialized
/// form when it is `None`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct OutboundLink {
/// Link target. `resolve_outbound_links` rewrites this field for links that
/// are marked `internal` and start with neither `/` nor `#`.
pub to: String,
/// Text carried by the link.
/// NOTE(review): presumably the visible link label — confirm with the producer.
pub text: String,
/// Optional anchor; skipped during serialization when `None`.
/// NOTE(review): presumably the fragment including its leading `#`, in the
/// shape returned by `split_url_anchor` — confirm with the producer.
#[serde(skip_serializing_if = "Option::is_none")]
pub anchor: Option<String>,
/// Whether the link is considered internal. Only internal links are touched
/// by `resolve_outbound_links`.
pub internal: bool,
}
/// A link that points at a page from somewhere else.
///
/// Serializable with serde; the `anchor` field is left out of the serialized
/// form when it is `None`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct InboundLink {
/// Origin of the link.
/// NOTE(review): presumably the URL path of the linking page — confirm.
pub from: String,
/// Text carried by the link.
/// NOTE(review): presumably the visible link label — confirm with the producer.
pub text: String,
/// Optional anchor; skipped during serialization when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub anchor: Option<String>,
}
/// The inbound and outbound links collected for a single page.
///
/// `Default` yields a value with both lists empty.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct PageLinks {
/// Links pointing at the page.
pub inbound: Vec<InboundLink>,
/// Links leaving the page.
pub outbound: Vec<OutboundLink>,
}
/// Splits `url` at its first `#` into a path and an optional anchor.
///
/// The returned anchor keeps its leading `#` (and anything after it, including
/// further `#` characters). When `url` contains no `#`, the whole input is
/// returned as the path and the anchor is `None`.
pub fn split_url_anchor(url: &str) -> (String, Option<String>) {
    match url.find('#') {
        Some(hash_idx) => {
            // `split_at` keeps the `#` on the right-hand (anchor) side.
            let (before, fragment) = url.split_at(hash_idx);
            (before.to_owned(), Some(fragment.to_owned()))
        }
        None => (url.to_owned(), None),
    }
}
/// Returns `true` when `url` should be treated as a link inside the site.
///
/// A URL is external when it starts with one of the known external prefixes:
/// `http://`, `https://`, protocol-relative `//`, `mailto:`, `tel:`,
/// `javascript:` or `data:`. Everything else (absolute paths, relative paths,
/// bare anchors, the empty string) is internal.
///
/// The prefix comparison is ASCII case-insensitive because URI schemes are
/// case-insensitive, so `HTTPS://example.com` and `Mailto:a@b` are external too.
pub fn is_internal_link(url: &str) -> bool {
    const EXTERNAL_PREFIXES: [&str; 7] = [
        "http://",
        "https://",
        "//",
        "mailto:",
        "tel:",
        "javascript:",
        "data:",
    ];

    // Compare on bytes: slicing the `str` at `prefix.len()` could land inside a
    // multi-byte character, while a byte slice can always be taken safely and
    // can never equal an all-ASCII prefix unless the URL really starts with it.
    let bytes = url.as_bytes();
    let is_external = EXTERNAL_PREFIXES.iter().any(|prefix| {
        bytes
            .get(..prefix.len())
            .is_some_and(|head| head.eq_ignore_ascii_case(prefix.as_bytes()))
    });
    !is_external
}
/// Normalizes a URL into a canonical path form.
///
/// Steps, in order:
/// 1. everything from the first `?` onward is dropped;
/// 2. the anchor (from the first `#`) is dropped via `split_url_anchor`;
/// 3. trailing `/` characters are removed;
/// 4. an empty result becomes `/`;
/// 5. if the last path segment contains a `.` and does not start with one, it
///    is treated as a file name and the path is returned without a trailing
///    slash; otherwise exactly one trailing `/` is appended.
pub fn normalize_url_path(url: &str) -> String {
    let without_query = match url.split_once('?') {
        Some((head, _query)) => head,
        None => url,
    };
    let (raw_path, _) = split_url_anchor(without_query);
    let trimmed = raw_path.trim_end_matches('/');
    if trimmed.is_empty() {
        return String::from("/");
    }

    // Last segment: the text after the final `/`, or the whole path if there is none.
    let leaf = trimmed
        .rfind('/')
        .map_or(trimmed, |slash_idx| &trimmed[slash_idx + 1..]);
    let looks_like_file = !leaf.starts_with('.') && leaf.contains('.');

    if looks_like_file {
        trimmed.to_owned()
    } else {
        format!("{}/", trimmed)
    }
}
/// Resolves `relative_url` against `base_url` and returns a slash-terminated path.
///
/// * A `relative_url` starting with `#` is returned unchanged.
/// * A `relative_url` starting with `/` is returned with its trailing slashes
///   collapsed to exactly one (`/` stays `/`).
/// * Anything else is resolved against the *parent* of `base_url`: the last
///   non-empty segment of `base_url` is dropped, then each `/`-separated part of
///   `relative_url` is applied — empty parts and `.` are skipped, `..` removes
///   the last segment (a no-op at the root), and any other part is appended.
///
/// The resolved result always starts and ends with `/`; an empty segment list
/// yields `/`.
pub fn resolve_relative_url(base_url: &str, relative_url: &str) -> String {
    if relative_url.starts_with('#') {
        return relative_url.to_owned();
    }
    if relative_url.starts_with('/') {
        return match relative_url.trim_end_matches('/') {
            "" => String::from("/"),
            rooted => format!("{}/", rooted),
        };
    }

    // Start from the base path's segments minus its final one.
    let mut stack: Vec<&str> = base_url
        .split('/')
        .filter(|piece| !piece.is_empty())
        .collect();
    stack.pop();

    for piece in relative_url.split('/') {
        if piece.is_empty() || piece == "." {
            continue;
        }
        if piece == ".." {
            stack.pop();
        } else {
            stack.push(piece);
        }
    }

    match stack.as_slice() {
        [] => String::from("/"),
        parts => format!("/{}/", parts.join("/")),
    }
}
/// Resolves the relative targets of internal links against `base_url`.
///
/// A link is rewritten only when it is marked `internal` and its `to` starts
/// with neither `/` nor `#`; its `to` is then replaced by
/// `resolve_relative_url(base_url, to)`. All other links, and all other fields,
/// are returned untouched and in their original order.
pub fn resolve_outbound_links(base_url: &str, links: Vec<OutboundLink>) -> Vec<OutboundLink> {
    let mut resolved = links;
    for link in resolved.iter_mut() {
        let already_rooted = link.to.starts_with('/') || link.to.starts_with('#');
        if !link.internal || already_rooted {
            continue;
        }
        link.to = resolve_relative_url(base_url, &link.to);
    }
    resolved
}
/// One value stored in `LinkCache`.
#[derive(Clone)]
struct LinkCacheEntry {
/// The cached outbound links; `LinkCache::get` returns a clone of this list.
links: Vec<OutboundLink>,
/// Moment the entry was created by `LinkCache::insert`; eviction removes
/// entries in ascending order of this timestamp.
inserted_at: Instant,
/// Estimated size of the entry in bytes, as computed by `LinkCache::insert`.
size_bytes: usize,
}
/// A cache of outbound links keyed by URL path, bounded by an estimated byte budget.
pub struct LinkCache {
/// Concurrent map from URL path to its cached entry.
cache: ConcurrentHashMap<String, LinkCacheEntry>,
/// Running counter of estimated bytes held by the cache; increased on insert
/// and decreased on eviction.
current_size: AtomicUsize,
/// Byte budget. A value of `0` disables the cache (`get` always returns `None`
/// and `insert` does nothing).
max_size: usize,
}
impl LinkCache {
    /// Creates a cache with a budget of `max_size_bytes` estimated bytes.
    ///
    /// A budget of `0` disables the cache: `get` always returns `None` and
    /// `insert` is a no-op.
    pub fn new(max_size_bytes: usize) -> Self {
        Self {
            cache: ConcurrentHashMap::new(),
            current_size: AtomicUsize::new(0),
            max_size: max_size_bytes,
        }
    }

    /// Returns a clone of the links cached under `url_path`, or `None` when the
    /// cache is disabled or holds no entry for that key.
    pub fn get(&self, url_path: &str) -> Option<Vec<OutboundLink>> {
        if self.max_size == 0 {
            return None;
        }
        let guard = self.cache.pin();
        guard.get(url_path).map(|entry| {
            tracing::debug!("link cache hit: {}", url_path);
            entry.links.clone()
        })
    }

    /// Stores `links` under `url_path`, replacing any previous entry for that key.
    ///
    /// The entry's size is estimated as the key length, plus for each link the
    /// lengths of its strings and a fixed 32-byte overhead, plus the size of the
    /// entry struct. If the running total then exceeds the budget, the oldest
    /// entries are evicted until the excess has been freed.
    ///
    /// Does nothing when the cache is disabled (`max_size == 0`).
    pub fn insert(&self, url_path: String, links: Vec<OutboundLink>) {
        if self.max_size == 0 {
            return;
        }

        let links_bytes: usize = links
            .iter()
            .map(|l| l.to.len() + l.text.len() + l.anchor.as_ref().map_or(0, String::len) + 32)
            .sum();
        let size_bytes = url_path.len() + links_bytes + std::mem::size_of::<LinkCacheEntry>();
        let entry = LinkCacheEntry {
            links,
            inserted_at: Instant::now(),
            size_bytes,
        };

        // Account for the entry *before* it becomes visible in the map, so that a
        // concurrent eviction can never subtract bytes that were not added yet
        // (which would wrap the unsigned counter).
        let mut total = self
            .current_size
            .fetch_add(size_bytes, Ordering::Relaxed)
            .saturating_add(size_bytes);

        // Logged ahead of the insert so the key can be moved into the map
        // instead of being cloned just for this message.
        tracing::debug!("link cached: {} ({} bytes)", url_path, size_bytes);

        // The map hands back the entry that was replaced, if any. Its bytes are no
        // longer held by the cache and must be released, otherwise re-inserting
        // the same key would inflate the counter and trigger needless evictions.
        let replaced_bytes = self
            .cache
            .pin()
            .insert(url_path, entry)
            .map(|previous| previous.size_bytes);
        if let Some(old_bytes) = replaced_bytes {
            total = self
                .current_size
                .fetch_sub(old_bytes, Ordering::Relaxed)
                .saturating_sub(old_bytes);
        }

        if total > self.max_size {
            self.evict_oldest(total - self.max_size);
        }
    }

    /// Removes entries, oldest `inserted_at` first, until at least `target_bytes`
    /// estimated bytes have been freed or no candidates remain.
    fn evict_oldest(&self, target_bytes: usize) {
        let guard = self.cache.pin();

        // Snapshot keys and timestamps; the map itself has no ordering.
        let mut candidates: Vec<(String, Instant)> = guard
            .iter()
            .map(|(key, entry)| (key.clone(), entry.inserted_at))
            .collect();
        candidates.sort_by_key(|(_, inserted_at)| *inserted_at);

        let mut freed = 0usize;
        let mut evict_count = 0usize;
        for (url, _) in candidates {
            if freed >= target_bytes {
                break;
            }
            // Use the size of the entry that was actually removed: it may have
            // been replaced (with a different size) since the snapshot was taken,
            // or already removed by someone else, in which case nothing is freed.
            if let Some(removed) = guard.remove(&url) {
                let size = removed.size_bytes;
                freed += size;
                evict_count += 1;
                self.current_size.fetch_sub(size, Ordering::Relaxed);
            }
        }

        if evict_count > 0 {
            tracing::debug!(
                "link cache evicted {} entries ({} bytes freed)",
                evict_count,
                freed
            );
        }
    }

    /// Current value of the estimated-bytes counter (test helper).
    #[cfg(test)]
    pub fn current_size(&self) -> usize {
        self.current_size.load(Ordering::Relaxed)
    }

    /// Number of entries currently in the cache (test helper).
    #[cfg(test)]
    pub fn len(&self) -> usize {
        self.cache.pin().len()
    }

    /// Whether the cache currently holds no entries (test helper).
    #[cfg(test)]
    pub fn is_empty(&self) -> bool {
        self.cache.pin().is_empty()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an internal link without an anchor, used as a cache fixture.
    fn plain_internal_link(to: &str, text: &str) -> OutboundLink {
        OutboundLink {
            to: to.to_string(),
            text: text.to_string(),
            anchor: None,
            internal: true,
        }
    }

    // --- split_url_anchor -------------------------------------------------

    #[test]
    fn test_split_url_anchor_with_anchor() {
        let expected = ("/docs/guide/".to_string(), Some("#section".to_string()));
        assert_eq!(split_url_anchor("/docs/guide/#section"), expected);
    }

    #[test]
    fn test_split_url_anchor_without_anchor() {
        let expected = ("/docs/guide/".to_string(), None);
        assert_eq!(split_url_anchor("/docs/guide/"), expected);
    }

    #[test]
    fn test_split_url_anchor_only_anchor() {
        let expected = (String::new(), Some("#section".to_string()));
        assert_eq!(split_url_anchor("#section"), expected);
    }

    // --- is_internal_link -------------------------------------------------

    #[test]
    fn test_is_internal_link_external_https() {
        for url in ["https://example.com", "http://example.com"] {
            assert!(!is_internal_link(url), "expected external: {}", url);
        }
    }

    #[test]
    fn test_is_internal_link_external_protocols() {
        let externals = [
            "mailto:test@example.com",
            "tel:+1234567890",
            "javascript:void(0)",
            "data:text/html,<h1>Hi</h1>",
            "//cdn.example.com/script.js",
        ];
        for url in externals {
            assert!(!is_internal_link(url), "expected external: {}", url);
        }
    }

    #[test]
    fn test_is_internal_link_internal() {
        let internals = [
            "/docs/guide/",
            "../other-page/",
            "./sibling/",
            "relative-path/",
            "#anchor",
        ];
        for url in internals {
            assert!(is_internal_link(url), "expected internal: {}", url);
        }
    }

    // --- normalize_url_path -----------------------------------------------

    #[test]
    fn test_normalize_url_path_trailing_slash() {
        for input in ["/docs/guide", "/docs/guide/"] {
            assert_eq!(normalize_url_path(input), "/docs/guide/");
        }
    }

    #[test]
    fn test_normalize_url_path_file() {
        // File-like paths come back unchanged.
        for input in ["/images/photo.jpg", "/docs/file.pdf"] {
            assert_eq!(normalize_url_path(input), input);
        }
    }

    #[test]
    fn test_normalize_url_path_with_query() {
        assert_eq!(normalize_url_path("/docs/guide/?foo=bar"), "/docs/guide/");
    }

    #[test]
    fn test_normalize_url_path_root() {
        for input in ["/", ""] {
            assert_eq!(normalize_url_path(input), "/");
        }
    }

    // --- LinkCache ----------------------------------------------------------

    #[test]
    fn test_link_cache_insert_and_get() {
        let cache = LinkCache::new(1024 * 1024);
        let stored = vec![plain_internal_link("/other/", "Other Page")];

        cache.insert("/docs/".to_string(), stored.clone());

        assert_eq!(cache.get("/docs/"), Some(stored));
    }

    #[test]
    fn test_link_cache_miss() {
        let cache = LinkCache::new(1024 * 1024);
        assert!(cache.get("/nonexistent/").is_none());
    }

    #[test]
    fn test_link_cache_disabled() {
        // A zero budget turns the cache off: inserts are dropped.
        let cache = LinkCache::new(0);
        cache.insert(
            "/docs/".to_string(),
            vec![plain_internal_link("/other/", "Other")],
        );
        assert!(cache.get("/docs/").is_none());
    }

    // --- serde --------------------------------------------------------------

    #[test]
    fn test_outbound_link_serialize() {
        let link = OutboundLink {
            to: "/docs/guide/".to_string(),
            text: "Guide".to_string(),
            anchor: Some("#intro".to_string()),
            internal: true,
        };
        let json = serde_json::to_string(&link).unwrap();

        let expected_fragments = [
            "\"to\":\"/docs/guide/\"",
            "\"text\":\"Guide\"",
            "\"anchor\":\"#intro\"",
            "\"internal\":true",
        ];
        for fragment in expected_fragments {
            assert!(json.contains(fragment), "missing {} in {}", fragment, json);
        }
    }

    #[test]
    fn test_outbound_link_serialize_no_anchor() {
        let json = serde_json::to_string(&plain_internal_link("/docs/", "Docs")).unwrap();
        assert!(!json.contains("anchor"));
    }

    #[test]
    fn test_page_links_default() {
        let PageLinks { inbound, outbound } = PageLinks::default();
        assert!(inbound.is_empty());
        assert!(outbound.is_empty());
    }
}