use std::collections::HashSet;
use regex::Regex;
use crate::extract::ExtractedDoc;
use crate::store::DocStore;
/// A single hyperlink found in a document, together with its classification.
#[derive(Debug)]
struct DocLink {
/// The link target as it appeared in the document text (already trimmed by the extractor).
url: String,
/// Classification label assigned by `classify_doc_link`:
/// one of `"confluence"`, `"jira"`, `"external"` or `"local"`.
link_type: String,
/// Identifier of the document this link points at, when one could be derived.
/// For `"local"` links this is the normalized path produced by `resolve_relative_path`;
/// for `"confluence"` links it is whatever `extract_confluence_page_id` returns;
/// `"jira"` and `"external"` links always carry `None`.
target_doc_id: Option<String>,
}
/// Re-computes the outgoing links of `doc` and records them in `store`.
///
/// All links previously stored for `doc.file` are removed first, then one link
/// is created for every extracted link whose resolved target
///
/// * is present in `all_doc_ids`, and
/// * is not the document itself (self-links are skipped).
///
/// The previous links are cleared even when the document no longer contains
/// any link at all; otherwise links removed from the text would linger in the
/// store forever.
///
/// The values returned by the store calls are discarded, i.e. persistence is
/// best-effort and this function never reports a failure to its caller.
pub fn extract_and_link_doc(store: &DocStore, doc: &ExtractedDoc, all_doc_ids: &HashSet<String>) {
    // Clear unconditionally: an empty extraction result must also wipe stale links.
    let _ = store.delete_links_from(&doc.file);

    let links = extract_links(&doc.text, &doc.file);
    for link in &links {
        if let Some(target) = &link.target_doc_id {
            // Only link to documents we actually know about, and never to ourselves.
            if all_doc_ids.contains(target) && target != &doc.file {
                let _ = store.create_link(&doc.file, target, &link.url, &link.link_type);
            }
        }
    }
}
/// Collects every hyperlink in `text` and classifies it relative to `doc_file`.
///
/// Two syntaxes are recognized, scanned in this order:
///
/// 1. Markdown inline links, `[label](target)`
/// 2. HTML anchors, `<a ... href="target">`
///
/// Targets are trimmed, and pure in-page anchors (targets starting with `#`)
/// are dropped. All Markdown links precede all HTML links in the result.
fn extract_links(text: &str, doc_file: &str) -> Vec<DocLink> {
    // (pattern, index of the capture group holding the link target)
    let sources: [(&str, usize); 2] = [
        (r"\[([^\]]*)\]\(([^)]+)\)", 2),
        (r#"<a\s[^>]*href=["']([^"']+)["']"#, 1),
    ];

    let mut found = Vec::new();
    for (pattern, url_group) in sources.iter() {
        let matcher = Regex::new(pattern).unwrap();
        let targets = matcher
            .captures_iter(text)
            // The target group is mandatory in both patterns, so it is always present.
            .filter_map(|caps| caps.get(*url_group))
            .map(|m| m.as_str().trim())
            .filter(|target| !target.starts_with('#'));
        found.extend(targets.map(|target| classify_doc_link(target, doc_file)));
    }
    found
}
/// Assigns a link type to `url` and, where possible, the ID of the document it targets.
///
/// The checks run in priority order; the first match wins:
///
/// 1. `"confluence"`: the URL contains `/wiki/`, `confluence` or `atlassian`;
///    the target is the result of `extract_confluence_page_id`.
/// 2. `"jira"`: the URL contains `/browse/` or `jira`; no target.
/// 3. `"external"`: the URL starts with `http://`, `https://` or `//`; no target.
/// 4. `"local"`: everything else; the target is the URL resolved against
///    `doc_file` by `resolve_relative_path`.
fn classify_doc_link(url: &str, doc_file: &str) -> DocLink {
    let mentions_any = |needles: &[&str]| needles.iter().any(|&needle| url.contains(needle));
    let begins_with_any = |prefixes: &[&str]| prefixes.iter().any(|&prefix| url.starts_with(prefix));

    let (kind, target) = if mentions_any(&["/wiki/", "confluence", "atlassian"]) {
        ("confluence", extract_confluence_page_id(url))
    } else if mentions_any(&["/browse/", "jira"]) {
        ("jira", None)
    } else if begins_with_any(&["http://", "https://", "//"]) {
        ("external", None)
    } else {
        ("local", Some(resolve_relative_path(url, doc_file)))
    };

    DocLink {
        url: url.to_string(),
        link_type: kind.to_string(),
        target_doc_id: target,
    }
}
/// Resolves a local link `url` against the document `doc_file` it appears in.
///
/// The fragment (`#...`) and query (`?...`) parts of `url` are discarded. If
/// nothing remains, the link refers to the document itself and `doc_file` is
/// returned unchanged. Otherwise the path is interpreted relative to the
/// directory of `doc_file` (or on its own when it starts with `/`), and the
/// result is normalized: empty and `.` segments are dropped, `..` removes the
/// preceding segment (and is ignored at the top level). The returned path
/// never has a leading slash.
fn resolve_relative_path(url: &str, doc_file: &str) -> String {
    // Cut the fragment first, then the query string.
    let without_fragment = match url.find('#') {
        Some(pos) => &url[..pos],
        None => url,
    };
    let target = match without_fragment.find('?') {
        Some(pos) => &without_fragment[..pos],
        None => without_fragment,
    };
    if target.is_empty() {
        return doc_file.to_string();
    }

    // Rooted targets ignore the directory of the containing document.
    let base_dir = if target.starts_with('/') {
        ""
    } else {
        doc_file.rsplit_once('/').map_or("", |(dir, _file)| dir)
    };

    let mut stack: Vec<&str> = Vec::new();
    for segment in base_dir.split('/').chain(target.split('/')) {
        if segment == ".." {
            stack.pop();
        } else if !segment.is_empty() && segment != "." {
            stack.push(segment);
        }
    }
    stack.join("/")
}
/// Extracts the numeric Confluence page ID from `url`, if it carries one.
///
/// Two URL shapes are understood, tried in this order:
///
/// * a path segment directly following a `pages` segment, e.g.
///   `.../spaces/ENG/pages/123456/Title`
/// * a `pageId=` query parameter, e.g. `.../viewpage.action?pageId=123456`
///
/// A candidate is accepted only if it is non-empty and consists solely of
/// ASCII digits. Any query string or fragment glued to the candidate is cut
/// off before that check. Returns `None` when no ID can be found.
fn extract_confluence_page_id(url: &str) -> Option<String> {
    // Validates a candidate ID; an empty string must not count as "all digits".
    fn numeric_id(candidate: &str) -> Option<String> {
        if !candidate.is_empty() && candidate.bytes().all(|b| b.is_ascii_digit()) {
            Some(candidate.to_string())
        } else {
            None
        }
    }

    // Keeps only the part of `s` before the first of the given delimiters.
    fn before_any<'a>(s: &'a str, delimiters: &[char]) -> &'a str {
        s.split(|c: char| delimiters.contains(&c)).next().unwrap_or(s)
    }

    // Shape 1: ".../pages/<id>/..." — the segment right after the first "pages".
    let mut segments = url.split('/');
    if segments.any(|segment| segment == "pages") {
        let from_path = segments
            .next()
            .map(|segment| before_any(segment, &['?', '#']))
            .and_then(numeric_id);
        if from_path.is_some() {
            return from_path;
        }
    }

    // Shape 2: "...?pageId=<id>&..." — the value of the first pageId parameter.
    url.split("pageId=")
        .nth(1)
        .map(|rest| before_any(rest, &['&', '#']))
        .and_then(numeric_id)
}