1use std::collections::HashSet;
2
3use regex::Regex;
4
5use crate::extract::ExtractedDoc;
6use crate::store::DocStore;
7
/// A single hyperlink found in a document, together with its classification.
#[derive(Debug, Clone, PartialEq, Eq)]
struct DocLink {
    /// The link target as written in the document (trimmed).
    url: String,
    /// Category assigned by the classifier: `"confluence"`, `"jira"`, `"external"` or `"local"`.
    link_type: String,
    /// Identifier of the document the link points at, when one could be derived from the URL.
    target_doc_id: Option<String>,
}
14
15pub fn extract_and_link_doc(store: &DocStore, doc: &ExtractedDoc, all_doc_ids: &HashSet<String>) {
16 let links = extract_links(&doc.text, &doc.file);
17 if links.is_empty() {
18 return;
19 }
20
21 let _ = store.delete_links_from(&doc.file);
22
23 for link in &links {
24 if let Some(ref target) = link.target_doc_id {
25 if all_doc_ids.contains(target) && target != &doc.file {
26 let _ = store.create_link(&doc.file, target, &link.url, &link.link_type);
27 }
28 }
29 }
30}
31
32fn extract_links(text: &str, doc_file: &str) -> Vec<DocLink> {
33 let mut links = Vec::new();
34
35 let md_link_re = Regex::new(r"\[([^\]]*)\]\(([^)]+)\)").unwrap();
37 for cap in md_link_re.captures_iter(text) {
38 let url = cap[2].trim();
39 if url.starts_with('#') {
40 continue; }
42 let classified = classify_doc_link(url, doc_file);
43 links.push(classified);
44 }
45
46 let html_link_re = Regex::new(r#"<a\s[^>]*href=["']([^"']+)["']"#).unwrap();
48 for cap in html_link_re.captures_iter(text) {
49 let url = cap[1].trim();
50 if url.starts_with('#') {
51 continue;
52 }
53 let classified = classify_doc_link(url, doc_file);
54 links.push(classified);
55 }
56
57 links
58}
59
60fn classify_doc_link(url: &str, doc_file: &str) -> DocLink {
61 if url.contains("/wiki/") || url.contains("confluence") || url.contains("atlassian") {
63 let page_id = extract_confluence_page_id(url);
64 return DocLink {
65 url: url.to_string(),
66 link_type: "confluence".to_string(),
67 target_doc_id: page_id,
68 };
69 }
70
71 if url.contains("/browse/") || url.contains("jira") {
73 return DocLink {
74 url: url.to_string(),
75 link_type: "jira".to_string(),
76 target_doc_id: None,
77 };
78 }
79
80 if url.starts_with("http://") || url.starts_with("https://") || url.starts_with("//") {
82 return DocLink {
83 url: url.to_string(),
84 link_type: "external".to_string(),
85 target_doc_id: None,
86 };
87 }
88
89 let target = resolve_relative_path(url, doc_file);
91 DocLink {
92 url: url.to_string(),
93 link_type: "local".to_string(),
94 target_doc_id: Some(target),
95 }
96}
97
/// Resolves the link target `url` against the document `doc_file` and normalizes the result.
///
/// Everything from the first `#` or `?` onwards is discarded. If nothing is left, the link
/// refers to the document itself and `doc_file` is returned unchanged. Otherwise the target
/// is interpreted relative to the directory part of `doc_file` (or on its own when it starts
/// with `/`), empty and `.` components are dropped, and each `..` removes the preceding
/// component (a `..` with nothing left to remove is ignored). The result has no leading
/// slash.
fn resolve_relative_path(url: &str, doc_file: &str) -> String {
    let is_suffix_marker = |c: char| c == '#' || c == '?';
    let target = url.split(is_suffix_marker).next().unwrap_or(url);
    if target.is_empty() {
        return doc_file.to_string();
    }

    // A target starting with `/` ignores the linking document's directory entirely.
    let base_dir = if target.starts_with('/') {
        ""
    } else {
        doc_file.rsplit_once('/').map_or("", |(parent, _)| parent)
    };

    // Walking the directory's components followed by the target's components is the same as
    // splitting "<dir>/<target>", without building the intermediate string.
    let mut normalized: Vec<&str> = Vec::new();
    for segment in base_dir.split('/').chain(target.split('/')) {
        if segment == ".." {
            normalized.pop();
        } else if !segment.is_empty() && segment != "." {
            normalized.push(segment);
        }
    }
    normalized.join("/")
}
135
/// Extracts the numeric Confluence page id from `url`, if it contains one.
///
/// Two URL shapes are recognized, tried in this order:
/// - a path segment following a `pages` segment, e.g. `.../pages/12345/Some+Title`;
/// - a `pageId=` query parameter, e.g. `.../viewpage.action?pageId=12345`.
///
/// Returns `None` when neither shape yields a non-empty, all-digit id.
fn extract_confluence_page_id(url: &str) -> Option<String> {
    /// Returns the candidate as an owned id if, once any trailing query string or fragment
    /// is cut off, it consists solely of ASCII digits.
    fn numeric_id(candidate: &str) -> Option<String> {
        let candidate = candidate
            .split(|c: char| c == '?' || c == '#' || c == '&')
            .next()
            .unwrap_or(candidate);
        // An empty string would pass `all` vacuously, so require at least one digit.
        let is_numeric = !candidate.is_empty() && candidate.bytes().all(|b| b.is_ascii_digit());
        is_numeric.then(|| candidate.to_string())
    }

    // The segment right after the first `pages` path segment.
    let from_path = url
        .split('/')
        .skip_while(|segment| *segment != "pages")
        .nth(1)
        .and_then(numeric_id);

    // Fall back to the value of the first `pageId=` parameter.
    from_path.or_else(|| url.split("pageId=").nth(1).and_then(numeric_id))
}
159}