// infigraph_docs/links.rs

use std::collections::HashSet;
use std::sync::OnceLock;

use regex::Regex;

use crate::extract::ExtractedDoc;
use crate::store::DocStore;
/// A hyperlink found in a document's text, together with its classification.
#[derive(Debug, Clone, PartialEq, Eq)]
struct DocLink {
    /// The link destination as it appeared in the document (surrounding whitespace trimmed).
    url: String,
    /// Category of the link: `"confluence"`, `"jira"`, `"external"` or `"local"`.
    link_type: String,
    /// Identifier of the linked document when the destination could be resolved to one,
    /// `None` otherwise.
    target_doc_id: Option<String>,
}
15pub fn extract_and_link_doc(store: &DocStore, doc: &ExtractedDoc, all_doc_ids: &HashSet<String>) {
16    let links = extract_links(&doc.text, &doc.file);
17    if links.is_empty() {
18        return;
19    }
20
21    let _ = store.delete_links_from(&doc.file);
22
23    for link in &links {
24        if let Some(ref target) = link.target_doc_id {
25            if all_doc_ids.contains(target) && target != &doc.file {
26                let _ = store.create_link(&doc.file, target, &link.url, &link.link_type);
27            }
28        }
29    }
30}
32fn extract_links(text: &str, doc_file: &str) -> Vec<DocLink> {
33    let mut links = Vec::new();
34
35    // Markdown links: [text](url)
36    let md_link_re = Regex::new(r"\[([^\]]*)\]\(([^)]+)\)").unwrap();
37    for cap in md_link_re.captures_iter(text) {
38        let url = cap[2].trim();
39        if url.starts_with('#') {
40            continue; // anchor-only
41        }
42        let classified = classify_doc_link(url, doc_file);
43        links.push(classified);
44    }
45
46    // HTML links: <a href="...">
47    let html_link_re = Regex::new(r#"<a\s[^>]*href=["']([^"']+)["']"#).unwrap();
48    for cap in html_link_re.captures_iter(text) {
49        let url = cap[1].trim();
50        if url.starts_with('#') {
51            continue;
52        }
53        let classified = classify_doc_link(url, doc_file);
54        links.push(classified);
55    }
56
57    links
58}
60fn classify_doc_link(url: &str, doc_file: &str) -> DocLink {
61    // Confluence page links
62    if url.contains("/wiki/") || url.contains("confluence") || url.contains("atlassian") {
63        let page_id = extract_confluence_page_id(url);
64        return DocLink {
65            url: url.to_string(),
66            link_type: "confluence".to_string(),
67            target_doc_id: page_id,
68        };
69    }
70
71    // JIRA links
72    if url.contains("/browse/") || url.contains("jira") {
73        return DocLink {
74            url: url.to_string(),
75            link_type: "jira".to_string(),
76            target_doc_id: None,
77        };
78    }
79
80    // External URLs
81    if url.starts_with("http://") || url.starts_with("https://") || url.starts_with("//") {
82        return DocLink {
83            url: url.to_string(),
84            link_type: "external".to_string(),
85            target_doc_id: None,
86        };
87    }
88
89    // Relative path — resolve against current doc's directory
90    let target = resolve_relative_path(url, doc_file);
91    DocLink {
92        url: url.to_string(),
93        link_type: "local".to_string(),
94        target_doc_id: Some(target),
95    }
96}
/// Resolves the link destination `url`, found in the document `doc_file`, to a normalized
/// slash-separated path without a leading `/`.
///
/// * The fragment (`#...`) and query string (`?...`) are ignored; if nothing else remains the
///   link refers to the current document and `doc_file` is returned unchanged.
/// * Percent-escapes in the path are decoded (`my%20doc.md` → `my doc.md`), because the result
///   is compared against file paths rather than URLs.
/// * A destination starting with `/` is taken relative to the root; anything else is taken
///   relative to the directory of `doc_file`.
/// * `.` and empty components are dropped and `..` removes the preceding component; a `..`
///   that would climb above the root is ignored.
fn resolve_relative_path(url: &str, doc_file: &str) -> String {
    // Strip fragment
    let path = url.split('#').next().unwrap_or(url);
    // Strip query string
    let path = path.split('?').next().unwrap_or(path);

    if path.is_empty() {
        return doc_file.to_string();
    }

    let path = percent_decode(path);

    // Directory of the current doc ("" when it lives at the root).
    let dir = doc_file.rsplit_once('/').map_or("", |(dir, _)| dir);

    let full = if path.starts_with('/') || dir.is_empty() {
        path
    } else {
        format!("{}/{}", dir, path)
    };

    // Normalize: collapse .. and .
    let mut parts: Vec<&str> = Vec::new();
    for component in full.split('/') {
        match component {
            "" | "." => {}
            ".." => {
                parts.pop();
            }
            other => parts.push(other),
        }
    }
    parts.join("/")
}

/// Decodes `%XX` escapes in `input`.
///
/// A `%` that is not followed by two hex digits is kept literally. If the decoded bytes are not
/// valid UTF-8 the input is returned unchanged.
fn percent_decode(input: &str) -> String {
    fn hex_value(byte: u8) -> Option<u8> {
        match byte {
            b'0'..=b'9' => Some(byte - b'0'),
            b'a'..=b'f' => Some(byte - b'a' + 10),
            b'A'..=b'F' => Some(byte - b'A' + 10),
            _ => None,
        }
    }

    if !input.contains('%') {
        return input.to_string();
    }

    let bytes = input.as_bytes();
    let mut decoded = Vec::with_capacity(bytes.len());
    let mut i = 0;
    while i < bytes.len() {
        if bytes[i] == b'%' {
            let hi = bytes.get(i + 1).copied().and_then(hex_value);
            let lo = bytes.get(i + 2).copied().and_then(hex_value);
            if let (Some(hi), Some(lo)) = (hi, lo) {
                decoded.push(hi * 16 + lo);
                i += 3;
                continue;
            }
        }
        decoded.push(bytes[i]);
        i += 1;
    }

    String::from_utf8(decoded).unwrap_or_else(|_| input.to_string())
}
/// Derives a Confluence document id from a Confluence page URL.
///
/// Confluence docs use the `confluence://SPACE/ID` format, so an id can only be built when the
/// URL carries both the space key and the numeric page id, i.e. for URLs of the form
/// `.../wiki/spaces/SPACE/pages/PAGEID/...`. Query string and fragment are ignored.
///
/// Returns `None` for every other URL shape, including `...?pageId=123` links, which do not
/// identify the space.
fn extract_confluence_page_id(url: &str) -> Option<String> {
    // Only the path can contain the spaces/SPACE/pages/ID sequence.
    let path = url
        .split(|c: char| c == '?' || c == '#')
        .next()
        .unwrap_or(url);
    let segments: Vec<&str> = path.split('/').collect();

    segments.windows(4).find_map(|window| match window {
        ["spaces", space, "pages", id]
            if !space.is_empty()
                && !id.is_empty()
                && id.bytes().all(|b| b.is_ascii_digit()) =>
        {
            Some(format!("confluence://{}/{}", space, id))
        }
        _ => None,
    })
}