use pulldown_cmark::{Event, Options, Parser, Tag};
use regex::Regex;
use crate::config::LinkPattern;
use crate::model::{Confidence, RawEdge};
/// Extracts outgoing link edges from a Markdown document body.
///
/// Two sources are scanned, and their edges are returned in this order:
///
/// 1. Markdown links reported by the `pulldown_cmark` parser (run with no
///    extension options) and accepted by `process_link_target`.
/// 2. Lines matching any of `custom_patterns`. For each pattern, the first
///    match on a line is used and its capture group 1 (trimmed) becomes the
///    edge target. Lines inside fenced code blocks are skipped.
///
/// Every edge's `location` is the 1-based line number formatted as `L<n>`.
/// Patterns that fail to compile are dropped by `compile_patterns`.
pub fn extract_links(body: &str, custom_patterns: &[LinkPattern]) -> Vec<RawEdge> {
    let compiled_patterns = compile_patterns(custom_patterns);
    let mut edges = Vec::new();

    let parser = Parser::new_ext(body, Options::empty());
    for (event, range) in parser.into_offset_iter() {
        if let Event::Start(Tag::Link { dest_url, .. }) = &event {
            // Count newlines only for link events; doing it for every parser
            // event rescans the document prefix for nothing.
            let line_num = body[..range.start].matches('\n').count() + 1;
            if let Some(raw_edge) = process_link_target(dest_url, line_num) {
                edges.push(raw_edge);
            }
        }
    }

    if compiled_patterns.is_empty() {
        return edges;
    }

    for (idx, (line, fenced)) in body.lines().zip(fenced_line_mask(body)).enumerate() {
        if fenced {
            continue;
        }
        for (regex, relation) in &compiled_patterns {
            if let Some(caps) = regex.captures(line)
                && let Some(m) = caps.get(1)
            {
                edges.push(RawEdge {
                    target_path: m.as_str().trim().to_string(),
                    relation: relation.clone(),
                    confidence: Confidence::Extracted,
                    location: format!("L{}", idx + 1),
                });
            }
        }
    }

    edges
}

/// Returns one flag per line of `body`: `true` when the line belongs to a
/// fenced code block, the opening and closing fence lines included.
///
/// A fence is a run of at least three backticks or tildes at the start of the
/// trimmed line. A block is closed only by a fence of the same character, at
/// least as long as the opening one, with nothing after it. A block that is
/// never closed extends to the end of the document.
fn fenced_line_mask(body: &str) -> Vec<bool> {
    // Marker character and run length of the currently open fence, if any.
    let mut open_fence: Option<(char, usize)> = None;
    let mut mask = Vec::new();

    for line in body.lines() {
        let trimmed = line.trim();
        let fence = ['`', '~'].into_iter().find_map(|marker| {
            let run = trimmed.chars().take_while(|&c| c == marker).count();
            // Both markers are ASCII, so `run` is also a valid byte offset.
            (run >= 3).then(|| (marker, run, &trimmed[run..]))
        });

        let fenced = match (open_fence, fence) {
            (None, None) => false,
            (None, Some((marker, run, info))) => {
                // A backtick fence may not carry backticks in its info string;
                // such a line is an inline code span, not a block opener.
                if marker == '`' && info.contains('`') {
                    false
                } else {
                    open_fence = Some((marker, run));
                    true
                }
            }
            (Some(_), None) => true,
            (Some((open_marker, open_run)), Some((marker, run, info))) => {
                if marker == open_marker && run >= open_run && info.is_empty() {
                    open_fence = None;
                }
                true
            }
        };
        mask.push(fenced);
    }

    mask
}
/// Converts a Markdown link destination into a `references` edge, or returns
/// `None` when the destination is not a local Markdown file.
///
/// Rejected destinations: empty strings, pure in-page anchors (`#...`),
/// `http://`, `https://` and `mailto:` URLs (scheme compared ignoring ASCII
/// case), and anything whose path part does not end in `.md`.
///
/// For accepted destinations the `#fragment` is removed and one leading `./`
/// is stripped. `line_num` is recorded in the edge location as `L<line_num>`.
fn process_link_target(dest: &str, line_num: usize) -> Option<RawEdge> {
    const EXTERNAL_PREFIXES: [&str; 3] = ["http://", "https://", "mailto:"];

    let dest = dest.trim();
    if dest.is_empty() || dest.starts_with('#') {
        return None;
    }

    // URI schemes are case-insensitive, so `HTTPS://...` is just as external
    // as `https://...`. `get` returns `None` when the destination is shorter
    // than the prefix or the cut would split a multi-byte character.
    let is_external = EXTERNAL_PREFIXES.iter().any(|&prefix| {
        dest.get(..prefix.len())
            .is_some_and(|head| head.eq_ignore_ascii_case(prefix))
    });
    if is_external {
        return None;
    }

    // Keep only the part before the first `#`.
    let path = dest.split('#').next().unwrap_or(dest);
    if !path.ends_with(".md") {
        return None;
    }

    let normalized = path.strip_prefix("./").unwrap_or(path);
    Some(RawEdge {
        target_path: normalized.to_string(),
        relation: "references".to_string(),
        confidence: Confidence::Extracted,
        location: format!("L{line_num}"),
    })
}
/// Compiles each configured link pattern into a `(Regex, relation)` pair.
///
/// A pattern that is not a valid regular expression is reported on stderr and
/// left out of the result; the remaining patterns keep their input order.
fn compile_patterns(patterns: &[LinkPattern]) -> Vec<(Regex, String)> {
    let mut compiled = Vec::with_capacity(patterns.len());
    for spec in patterns {
        match Regex::new(&spec.pattern) {
            Ok(regex) => compiled.push((regex, spec.relation.clone())),
            Err(e) => eprintln!("warning: invalid link pattern {:?}: {e}", spec.pattern),
        }
    }
    compiled
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Pattern set holding a single `@import <path>` rule that yields
    /// `imports` edges.
    fn import_patterns() -> Vec<LinkPattern> {
        vec![LinkPattern {
            pattern: r"^@import\s+(.+?)\s*$".to_string(),
            relation: "imports".to_string(),
        }]
    }

    /// Runs extraction and returns the single resulting edge, failing the
    /// test when the number of edges is not exactly one.
    fn sole_edge(body: &str, patterns: &[LinkPattern]) -> RawEdge {
        let mut found = extract_links(body, patterns);
        assert_eq!(found.len(), 1);
        found.remove(0)
    }

    // Local `.md` links are kept; external URLs are dropped.
    #[test]
    fn extract_markdown_links() {
        let body = "See [ADR 1](docs/decisions/0001-auth.md) for details.\n\
                    Also [external](https://example.com).";
        let edge = sole_edge(body, &[]);
        assert_eq!(edge.target_path, "docs/decisions/0001-auth.md");
        assert_eq!(edge.relation, "references");
    }

    // Link syntax inside a fenced block yields no edge.
    #[test]
    fn skip_links_in_code_blocks() {
        let edge = sole_edge("```\n[not a link](fake.md)\n```\n\n[real](real.md)", &[]);
        assert_eq!(edge.target_path, "real.md");
    }

    // The `#fragment` part is not part of the target path.
    #[test]
    fn strip_anchor_fragment() {
        let edge = sole_edge("[link](docs/guide.md#section-3)", &[]);
        assert_eq!(edge.target_path, "docs/guide.md");
    }

    // A custom pattern produces an edge with its configured relation.
    #[test]
    fn custom_import_pattern() {
        let patterns = import_patterns();
        let edge = sole_edge("@import scripts/docs/parse.py\n\nSome text.", &patterns);
        assert_eq!(edge.target_path, "scripts/docs/parse.py");
        assert_eq!(edge.relation, "imports");
    }

    // Custom patterns are not applied to lines inside a fenced block.
    #[test]
    fn custom_pattern_skipped_in_code_block() {
        let patterns = import_patterns();
        let edge = sole_edge(
            "```\n@import not/real.py\n```\n\n@import real/file.py",
            &patterns,
        );
        assert_eq!(edge.target_path, "real/file.py");
    }

    // A leading `./` is removed from the target path.
    #[test]
    fn normalize_leading_dot_slash() {
        let edge = sole_edge("[link](./relative/path.md)", &[]);
        assert_eq!(edge.target_path, "relative/path.md");
    }

    // Destinations that do not end in `.md` are ignored.
    #[test]
    fn skip_non_markdown() {
        let edge = sole_edge("[img](picture.png)\n[doc](file.md)", &[]);
        assert_eq!(edge.target_path, "file.md");
    }
}