Skip to main content

eure_mark/
check.rs

1//! Reference checking for eumd documents
2
3use std::collections::HashSet;
4
5use eure_document::document::{EureDocument, NodeId};
6use eure_document::map::Map;
7use regex::Regex;
8use std::sync::LazyLock;
9
10use crate::document::{EumdDocument, Section};
11use crate::error::{ReferenceError, ReferenceType};
12use crate::reference::extract_references;
13
14/// Result of reference checking
15#[derive(Debug, Default)]
16pub struct CheckResult {
17    /// List of reference errors
18    pub errors: Vec<ReferenceError>,
19}
20
21impl CheckResult {
22    /// Returns true if there are no errors
23    pub fn is_ok(&self) -> bool {
24        self.errors.is_empty()
25    }
26}
27
28/// Regex to extract BibTeX entry keys
29static BIBTEX_ENTRY_PATTERN: LazyLock<Regex> =
30    LazyLock::new(|| Regex::new(r"@\w+\{([^,\s]+)").expect("invalid bibtex entry regex"));
31
32/// Extract citation keys from BibTeX content
33fn extract_bibtex_keys(bibtex: &str) -> HashSet<String> {
34    BIBTEX_ENTRY_PATTERN
35        .captures_iter(bibtex)
36        .map(|cap| cap[1].to_string())
37        .collect()
38}
39
40/// Collect all section keys recursively
41fn collect_section_keys(sections: &Map<String, Section>, keys: &mut HashSet<String>) {
42    for (key, section) in sections.iter() {
43        keys.insert(key.clone());
44        collect_section_keys(&section.sections, keys);
45    }
46}
47
48/// Check all references in a document (basic version without spans)
49pub fn check_references(doc: &EumdDocument) -> CheckResult {
50    let mut result = CheckResult::default();
51
52    // Collect available keys
53    let cite_keys: HashSet<String> = doc
54        .cites
55        .as_ref()
56        .map(|c| extract_bibtex_keys(c))
57        .unwrap_or_default();
58
59    let footnote_keys: HashSet<String> = doc.footnotes.iter().map(|(k, _)| k.clone()).collect();
60
61    let mut section_keys = HashSet::new();
62    collect_section_keys(&doc.sections, &mut section_keys);
63
64    // Check references in all markdown content
65    check_content_simple(
66        doc.description.as_deref(),
67        "in description",
68        &cite_keys,
69        &footnote_keys,
70        &section_keys,
71        &mut result,
72    );
73
74    check_content_simple(
75        doc.intro.as_deref(),
76        "in intro",
77        &cite_keys,
78        &footnote_keys,
79        &section_keys,
80        &mut result,
81    );
82
83    // Check sections recursively
84    check_sections_simple(
85        &doc.sections,
86        "",
87        &cite_keys,
88        &footnote_keys,
89        &section_keys,
90        &mut result,
91    );
92
93    // Check footnote content
94    for (key, footnote) in doc.footnotes.iter() {
95        check_content_simple(
96            Some(&footnote.content),
97            &format!("in footnote '{key}'"),
98            &cite_keys,
99            &footnote_keys,
100            &section_keys,
101            &mut result,
102        );
103    }
104
105    result
106}
107
108fn check_content_simple(
109    content: Option<&str>,
110    location: &str,
111    cite_keys: &HashSet<String>,
112    footnote_keys: &HashSet<String>,
113    section_keys: &HashSet<String>,
114    result: &mut CheckResult,
115) {
116    let Some(content) = content else { return };
117
118    for reference in extract_references(content) {
119        let is_valid = match reference.ref_type {
120            ReferenceType::Cite => cite_keys.contains(&reference.key),
121            ReferenceType::Footnote => footnote_keys.contains(&reference.key),
122            ReferenceType::Section => section_keys.contains(&reference.key),
123        };
124
125        if !is_valid {
126            result.errors.push(ReferenceError::new(
127                reference.ref_type,
128                reference.key,
129                location.to_string(),
130            ));
131        }
132    }
133}
134
135fn check_sections_simple(
136    sections: &Map<String, Section>,
137    path: &str,
138    cite_keys: &HashSet<String>,
139    footnote_keys: &HashSet<String>,
140    section_keys: &HashSet<String>,
141    result: &mut CheckResult,
142) {
143    for (key, section) in sections.iter() {
144        let current_path = if path.is_empty() {
145            key.clone()
146        } else {
147            format!("{path}.{key}")
148        };
149
150        // Check header if present
151        check_content_simple(
152            section.header.as_deref(),
153            &format!("in section '{current_path}' header"),
154            cite_keys,
155            footnote_keys,
156            section_keys,
157            result,
158        );
159
160        // Check body
161        check_content_simple(
162            section.body.as_deref(),
163            &format!("in section '{current_path}'"),
164            cite_keys,
165            footnote_keys,
166            section_keys,
167            result,
168        );
169
170        // Recurse into nested sections
171        check_sections_simple(
172            &section.sections,
173            &current_path,
174            cite_keys,
175            footnote_keys,
176            section_keys,
177            result,
178        );
179    }
180}
181
182// ============================================================================
183// Advanced checking with span information
184// ============================================================================
185
186/// Context for checking with span information
187struct CheckContext<'a> {
188    raw_doc: &'a EureDocument,
189    cite_keys: HashSet<String>,
190    footnote_keys: HashSet<String>,
191    section_keys: HashSet<String>,
192    result: CheckResult,
193}
194
195impl<'a> CheckContext<'a> {
196    fn new(eumd_doc: &EumdDocument, raw_doc: &'a EureDocument) -> Self {
197        let cite_keys: HashSet<String> = eumd_doc
198            .cites
199            .as_ref()
200            .map(|c| extract_bibtex_keys(c))
201            .unwrap_or_default();
202
203        let footnote_keys: HashSet<String> =
204            eumd_doc.footnotes.iter().map(|(k, _)| k.clone()).collect();
205
206        let mut section_keys = HashSet::new();
207        collect_section_keys(&eumd_doc.sections, &mut section_keys);
208
209        CheckContext {
210            raw_doc,
211            cite_keys,
212            footnote_keys,
213            section_keys,
214            result: CheckResult::default(),
215        }
216    }
217
218    fn check_content(&mut self, content: &str, location: &str, node_id: NodeId) {
219        // Get the actual text content offset within the code block
220        let content_offset = get_code_block_content_offset(self.raw_doc, node_id);
221
222        for reference in extract_references(content) {
223            let is_valid = match reference.ref_type {
224                ReferenceType::Cite => self.cite_keys.contains(&reference.key),
225                ReferenceType::Footnote => self.footnote_keys.contains(&reference.key),
226                ReferenceType::Section => self.section_keys.contains(&reference.key),
227            };
228
229            if !is_valid {
230                self.result.errors.push(ReferenceError::with_span(
231                    reference.ref_type,
232                    reference.key,
233                    location.to_string(),
234                    node_id,
235                    content_offset + reference.offset,
236                    reference.len,
237                ));
238            }
239        }
240    }
241
242    fn check_sections(
243        &mut self,
244        sections: &Map<String, Section>,
245        path: &str,
246        sections_node_id: NodeId,
247    ) {
248        let sections_node = self.raw_doc.node(sections_node_id);
249        let Some(sections_map) = sections_node.as_map() else {
250            return;
251        };
252
253        for (key, section) in sections.iter() {
254            let current_path = if path.is_empty() {
255                key.clone()
256            } else {
257                format!("{path}.{key}")
258            };
259
260            let Some(section_node_id) = sections_map.get_node_id(&key.clone().into()) else {
261                continue;
262            };
263
264            let section_node = self.raw_doc.node(section_node_id);
265            let Some(section_map) = section_node.as_map() else {
266                continue;
267            };
268
269            // Check header if present
270            if let Some(ref header) = section.header
271                && let Some(header_node_id) = section_map.get_node_id(&"header".into())
272            {
273                self.check_content(
274                    header,
275                    &format!("in section '{current_path}' header"),
276                    header_node_id,
277                );
278            }
279
280            // Check body
281            if let Some(ref body) = section.body
282                && let Some(body_node_id) = section_map.get_node_id(&"body".into())
283            {
284                self.check_content(body, &format!("in section '{current_path}'"), body_node_id);
285            }
286
287            // Recurse into nested sections
288            if let Some(nested_sections_id) = section_map.get_node_id(&"sections".into()) {
289                self.check_sections(&section.sections, &current_path, nested_sections_id);
290            }
291        }
292    }
293}
294
295/// Get the byte offset of the code block content start within the node
296fn get_code_block_content_offset(_raw_doc: &EureDocument, _node_id: NodeId) -> u32 {
297    // For code blocks, we need to account for the opening ``` and language tag
298    // However, since we're using the node's span which points to the content,
299    // we can return 0 here. The actual offset calculation happens in report.rs
300    // when we compute the final span using OriginMap.
301    0
302}
303
304/// Check references with span information for better error reporting
305pub fn check_references_with_spans(eumd_doc: &EumdDocument, raw_doc: &EureDocument) -> CheckResult {
306    let mut ctx = CheckContext::new(eumd_doc, raw_doc);
307
308    let root_id = raw_doc.get_root_id();
309    let root = raw_doc.node(root_id);
310
311    let Some(map) = root.as_map() else {
312        return ctx.result;
313    };
314
315    // Check description
316    if let Some(ref content) = eumd_doc.description
317        && let Some(node_id) = map.get_node_id(&"description".into())
318    {
319        ctx.check_content(content, "in description", node_id);
320    }
321
322    // Check intro
323    if let Some(ref content) = eumd_doc.intro
324        && let Some(node_id) = map.get_node_id(&"intro".into())
325    {
326        ctx.check_content(content, "in intro", node_id);
327    }
328
329    // Check sections recursively
330    if let Some(sections_node_id) = map.get_node_id(&"sections".into()) {
331        ctx.check_sections(&eumd_doc.sections, "", sections_node_id);
332    }
333
334    // Check footnotes
335    if let Some(footnotes_node_id) = map.get_node_id(&"footnotes".into()) {
336        let footnotes_node = raw_doc.node(footnotes_node_id);
337        if let Some(footnotes_map) = footnotes_node.as_map() {
338            for (key, footnote) in eumd_doc.footnotes.iter() {
339                if let Some(footnote_node_id) = footnotes_map.get_node_id(&key.clone().into())
340                    && let Some(content_node_id) = raw_doc
341                        .node(footnote_node_id)
342                        .as_map()
343                        .and_then(|m| m.get_node_id(&"content".into()))
344                {
345                    ctx.check_content(
346                        &footnote.content,
347                        &format!("in footnote '{key}'"),
348                        content_node_id,
349                    );
350                }
351            }
352        }
353    }
354
355    ctx.result
356}
357
#[cfg(test)]
mod tests {
    use super::*;

    /// Two well-formed entries yield exactly their two keys.
    #[test]
    fn test_extract_bibtex_keys() {
        let bibtex = r#"
@article{knuth1984,
  author = "Donald Knuth",
  title = "Literate Programming"
}

@book{lamport1994,
  author = "Leslie Lamport"
}
"#;
        let expected: HashSet<String> = ["knuth1984", "lamport1994"]
            .into_iter()
            .map(String::from)
            .collect();

        // Set equality covers both membership and the exact count.
        assert_eq!(extract_bibtex_keys(bibtex), expected);
    }
}