Skip to main content

sem_core/parser/plugins/
latex.rs

1use regex::Regex;
2use std::collections::HashMap;
3
4use crate::model::entity::{build_entity_id, build_entity_id_disambiguated, SemanticEntity};
5use crate::parser::plugin::SemanticParserPlugin;
6use crate::utils::hash::content_hash;
7
/// Semantic parser plugin for LaTeX sources.
///
/// A stateless unit struct: all behaviour lives in its `SemanticParserPlugin`
/// implementation, which reports the id `"latex"` and extracts preamble,
/// command-definition, section and environment entities.
pub struct LatexParserPlugin;
9
/// Environment names that `extract_entities` reports as `environment` entities.
///
/// A `\begin{...}` whose name is not in this list is not tracked as an
/// environment of its own. Names are compared exactly (case-sensitive).
const SIGNIFICANT_ENVIRONMENTS: &[&str] = &[
    "theorem",
    "lemma",
    "corollary",
    "proposition",
    "definition",
    "proof",
    "example",
    "remark",
    "figure",
    "table",
    "listing",
    "algorithm",
    "abstract",
    "appendix",
];
26
/// Returns the nesting depth of a LaTeX sectioning command.
///
/// `cmd` is the command name without the leading backslash or trailing `*`.
/// Depths run from `0` (`part`) to `5` (`paragraph`); any other name yields
/// `None`.
fn section_level(cmd: &str) -> Option<usize> {
    // Ordered outermost to innermost, so the index doubles as the level.
    const HIERARCHY: [&str; 6] = [
        "part",
        "chapter",
        "section",
        "subsection",
        "subsubsection",
        "paragraph",
    ];
    HIERARCHY.iter().position(|&known| known == cmd)
}
39
/// Extracts the text inside the balanced brace group that opens at byte
/// offset `pos` of `s`.
///
/// Returns the content between the opening `{` and its matching `}` (both
/// excluded). Nested groups are kept verbatim. A backslash escapes the
/// character that follows it, so `\{`, `\}` and `\\` never open or close a
/// group.
///
/// Returns `None` when:
/// - `pos` is past the end of `s` or not on a UTF-8 character boundary,
/// - the character at `pos` is not `{`,
/// - the group is never closed within `s`.
fn extract_braced(s: &str, pos: usize) -> Option<String> {
    // `get` rather than `&s[pos..]`: a bad offset becomes `None`, not a panic.
    let inner = s.get(pos..)?.strip_prefix('{')?;

    let mut depth = 1usize;
    let mut chars = inner.char_indices();
    while let Some((idx, ch)) = chars.next() {
        match ch {
            // Control symbol such as `\{` or `\\`: the next character is
            // literal text and must not affect the group depth.
            '\\' => {
                chars.next();
            }
            '{' => depth += 1,
            '}' => {
                depth -= 1;
                if depth == 0 {
                    // `idx` comes from `char_indices`, so it is a boundary.
                    return Some(inner[..idx].to_string());
                }
            }
            _ => {}
        }
    }
    None
}
71
/// Reports whether `line` is a pure LaTeX comment, i.e. its first
/// non-whitespace character is `%`. Blank lines are not comments.
fn is_comment_line(line: &str) -> bool {
    line.chars().find(|ch| !ch.is_whitespace()) == Some('%')
}
76
77impl SemanticParserPlugin for LatexParserPlugin {
78    fn id(&self) -> &str {
79        "latex"
80    }
81
82    fn extensions(&self) -> &[&str] {
83        &[".tex", ".latex", ".cls", ".sty"]
84    }
85
86    fn extract_entities(&self, content: &str, file_path: &str) -> Vec<SemanticEntity> {
87        let mut entities = Vec::new();
88        let lines: Vec<&str> = content.lines().collect();
89        if lines.is_empty() {
90            return entities;
91        }
92
93        let section_re = Regex::new(
94            r"\\(part|chapter|section|subsection|subsubsection|paragraph)\*?\{",
95        )
96        .unwrap();
97        let begin_env_re = Regex::new(r"\\begin\{(\w+)\}").unwrap();
98        let end_env_re = Regex::new(r"\\end\{(\w+)\}").unwrap();
99        let label_re = Regex::new(r"\\label\{([^}]+)\}").unwrap();
100        let cmd_def_re =
101            Regex::new(r"\\(newcommand|renewcommand|DeclareMathOperator)\*?\{?\\(\w+)").unwrap();
102
103        // --- Locate \begin{document} and \end{document} ---
104        let mut doc_start: Option<usize> = None;
105        let mut doc_end: Option<usize> = None;
106        for (i, &line) in lines.iter().enumerate() {
107            if !is_comment_line(line) {
108                if doc_start.is_none() && line.contains(r"\begin{document}") {
109                    doc_start = Some(i);
110                } else if doc_start.is_some()
111                    && doc_end.is_none()
112                    && line.contains(r"\end{document}")
113                {
114                    doc_end = Some(i);
115                }
116            }
117        }
118
119        let body_start = doc_start.map_or(0, |s| s + 1);
120        let body_end = doc_end.unwrap_or(lines.len());
121
122        // --- Preamble ---
123        // For files with \begin{document}: everything before it is preamble.
124        // For .sty/.cls without \begin{document}: entire file is preamble.
125        let preamble_range: Option<(usize, usize)> = if let Some(ds) = doc_start {
126            if ds > 0 {
127                Some((0, ds))
128            } else {
129                None
130            }
131        } else if file_path.ends_with(".sty") || file_path.ends_with(".cls") {
132            Some((0, lines.len()))
133        } else {
134            None
135        };
136
137        if let Some((p_start, p_end)) = preamble_range {
138            let preamble_content = lines[p_start..p_end].join("\n").trim().to_string();
139            if !preamble_content.is_empty() {
140                let pid = build_entity_id(file_path, "preamble", "(preamble)", None);
141                entities.push(SemanticEntity {
142                    id: pid.clone(),
143                    file_path: file_path.to_string(),
144                    entity_type: "preamble".to_string(),
145                    name: "(preamble)".to_string(),
146                    parent_id: None,
147                    content_hash: content_hash(&preamble_content),
148                    structural_hash: None,
149                    content: preamble_content,
150                    start_line: p_start + 1,
151                    end_line: p_end,
152                    metadata: None,
153                });
154
155                // Extract command definitions from preamble
156                let preamble_lines = &lines[p_start..p_end];
157                let mut i = 0;
158                while i < preamble_lines.len() {
159                    let line = preamble_lines[i];
160                    if !is_comment_line(line) {
161                        if let Some(caps) = cmd_def_re.captures(line) {
162                            let cmd_name = format!("\\{}", &caps[2]);
163                            let def_start = i;
164                            let mut def_end = i;
165
166                            // Count braces to find multi-line definitions
167                            let mut depth: i32 = 0;
168                            for j in i..preamble_lines.len() {
169                                for ch in preamble_lines[j].chars() {
170                                    if ch == '{' {
171                                        depth += 1;
172                                    } else if ch == '}' {
173                                        depth -= 1;
174                                    }
175                                }
176                                def_end = j;
177                                if depth <= 0 {
178                                    break;
179                                }
180                            }
181
182                            let def_content = preamble_lines[def_start..=def_end]
183                                .join("\n")
184                                .trim()
185                                .to_string();
186                            let cmd_id = build_entity_id(
187                                file_path,
188                                "command_definition",
189                                &cmd_name,
190                                Some(&pid),
191                            );
192
193                            entities.push(SemanticEntity {
194                                id: cmd_id,
195                                file_path: file_path.to_string(),
196                                entity_type: "command_definition".to_string(),
197                                name: cmd_name,
198                                parent_id: Some(pid.clone()),
199                                content_hash: content_hash(&def_content),
200                                structural_hash: None,
201                                content: def_content,
202                                start_line: p_start + def_start + 1,
203                                end_line: p_start + def_end + 1,
204                                metadata: None,
205                            });
206
207                            i = def_end + 1;
208                            continue;
209                        }
210                    }
211                    i += 1;
212                }
213
214            }
215        }
216
217        // If entire file was treated as preamble (.sty/.cls), skip body parsing
218        if preamble_range.map_or(false, |(_, end)| end == lines.len()) {
219            return entities;
220        }
221
222        // --- Body: Pass 1 – Parse sections (like markdown headings) ---
223        struct Section {
224            level: usize,
225            name: String,
226            start_line: usize, // 1-based
227            lines: Vec<String>,
228            base_id: String,
229            parent_index: Option<usize>,
230        }
231
232        let mut sections: Vec<Section> = Vec::new();
233        let mut current_section: Option<usize> = None;
234        let mut section_stack: Vec<(usize, usize)> = Vec::new(); // (level, section index)
235
236        for i in body_start..body_end {
237            let line = lines[i];
238            let line_num = i + 1; // 1-based
239
240            if is_comment_line(line) {
241                if let Some(idx) = current_section {
242                    sections[idx].lines.push(line.to_string());
243                }
244                continue;
245            }
246
247            if let Some(m) = section_re.find(line) {
248                if let Some(caps) = section_re.captures(line) {
249                    let cmd = &caps[1];
250                    if let Some(level) = section_level(cmd) {
251                        // Use brace-counting to extract title (handles nested braces)
252                        let brace_pos = m.end() - 1; // byte offset of the `{`
253                        let name = extract_braced(line, brace_pos)
254                            .unwrap_or_else(|| cmd.to_string());
255
256                        // Pop stack until we find an ancestor with a strictly lower level
257                        while section_stack.last().map_or(false, |(l, _)| *l >= level) {
258                            section_stack.pop();
259                        }
260                        let parent_index = section_stack.last().map(|(_, idx)| *idx);
261
262                        sections.push(Section {
263                            level,
264                            name: name.clone(),
265                            start_line: line_num,
266                            lines: vec![line.to_string()],
267                            base_id: build_entity_id(file_path, "section", &name, None),
268                            parent_index,
269                        });
270                        let section_index = sections.len() - 1;
271                        current_section = Some(section_index);
272                        section_stack.push((level, section_index));
273                        continue;
274                    }
275                }
276            }
277
278            // Regular line: append to current section
279            if let Some(idx) = current_section {
280                sections[idx].lines.push(line.to_string());
281            }
282        }
283
284        // Disambiguate section IDs
285        let mut id_counts: HashMap<&str, usize> = HashMap::new();
286        for section in &sections {
287            *id_counts.entry(section.base_id.as_str()).or_default() += 1;
288        }
289
290        let section_ids: Vec<String> = sections
291            .iter()
292            .map(|section| {
293                if id_counts[section.base_id.as_str()] > 1 {
294                    build_entity_id_disambiguated(
295                        file_path,
296                        "section",
297                        &section.name,
298                        None,
299                        section.start_line,
300                    )
301                } else {
302                    section.base_id.clone()
303                }
304            })
305            .collect();
306
307        // Store section ranges for environment parent lookup
308        let section_ranges: Vec<(usize, usize, usize)> = sections
309            .iter()
310            .map(|s| (s.start_line, s.start_line + s.lines.len() - 1, s.level))
311            .collect();
312
313        for (index, section) in sections.iter().enumerate() {
314            let section_content = section.lines.join("\n").trim().to_string();
315            if section_content.is_empty() {
316                continue;
317            }
318
319            entities.push(SemanticEntity {
320                id: section_ids[index].clone(),
321                file_path: file_path.to_string(),
322                entity_type: "section".to_string(),
323                name: section.name.clone(),
324                parent_id: section
325                    .parent_index
326                    .map(|pi| section_ids[pi].clone()),
327                content_hash: content_hash(&section_content),
328                structural_hash: None,
329                content: section_content,
330                start_line: section.start_line,
331                end_line: section.start_line + section.lines.len() - 1,
332                metadata: None,
333            });
334        }
335
336        // --- Body: Pass 2 – Parse significant environments ---
337        struct EnvInfo {
338            env_type: String,
339            name: String,
340            start_line: usize, // 1-based
341            end_line: usize,   // 1-based
342            content: String,
343            base_id: String,
344        }
345
346        let mut env_entities: Vec<EnvInfo> = Vec::new();
347        // Stack: (env_type, start_line_1based, accumulated_lines)
348        let mut env_stack: Vec<(String, usize, Vec<String>)> = Vec::new();
349
350        for i in body_start..body_end {
351            let line = lines[i];
352            let line_num = i + 1;
353
354            if is_comment_line(line) {
355                if let Some((_, _, ref mut env_lines)) = env_stack.last_mut() {
356                    env_lines.push(line.to_string());
357                }
358                continue;
359            }
360
361            // Check for \begin{env}
362            if let Some(caps) = begin_env_re.captures(line) {
363                let env_name = caps[1].to_string();
364                if env_name != "document"
365                    && SIGNIFICANT_ENVIRONMENTS.contains(&env_name.as_str())
366                {
367                    env_stack.push((env_name, line_num, vec![line.to_string()]));
368                    continue;
369                }
370            }
371
372            // Check for \end{env}
373            if let Some(caps) = end_env_re.captures(line) {
374                let env_name = caps[1].to_string();
375                if let Some(pos) =
376                    env_stack.iter().rposition(|(name, _, _)| *name == env_name)
377                {
378                    let (env_type, start_line, mut env_lines) = env_stack.remove(pos);
379                    env_lines.push(line.to_string());
380
381                    // Try to find a \label inside the environment
382                    let label = env_lines
383                        .iter()
384                        .find_map(|l| label_re.captures(l).map(|c| c[1].to_string()));
385
386                    let name = label.unwrap_or_else(|| env_type.clone());
387                    let env_content = env_lines.join("\n").trim().to_string();
388
389                    env_entities.push(EnvInfo {
390                        env_type,
391                        name: name.clone(),
392                        start_line,
393                        end_line: line_num,
394                        content: env_content,
395                        base_id: build_entity_id(file_path, "environment", &name, None),
396                    });
397                    continue;
398                }
399            }
400
401            // Accumulate lines inside open environments
402            if let Some((_, _, ref mut env_lines)) = env_stack.last_mut() {
403                env_lines.push(line.to_string());
404            }
405        }
406
407        // Disambiguate environment IDs
408        let mut env_id_counts: HashMap<&str, usize> = HashMap::new();
409        for env in &env_entities {
410            *env_id_counts.entry(env.base_id.as_str()).or_default() += 1;
411        }
412
413        let env_ids: Vec<String> = env_entities
414            .iter()
415            .map(|env| {
416                if env_id_counts[env.base_id.as_str()] > 1 {
417                    build_entity_id_disambiguated(
418                        file_path,
419                        "environment",
420                        &env.name,
421                        None,
422                        env.start_line,
423                    )
424                } else {
425                    env.base_id.clone()
426                }
427            })
428            .collect();
429
430        for (index, env) in env_entities.iter().enumerate() {
431            // Find the deepest (highest-level number) section containing this environment
432            let parent_id = find_parent_section_id(
433                env.start_line,
434                &section_ranges,
435                &section_ids,
436            );
437
438            let mut metadata = HashMap::new();
439            metadata.insert("environment_type".to_string(), env.env_type.clone());
440
441            entities.push(SemanticEntity {
442                id: env_ids[index].clone(),
443                file_path: file_path.to_string(),
444                entity_type: "environment".to_string(),
445                name: env.name.clone(),
446                parent_id,
447                content_hash: content_hash(&env.content),
448                structural_hash: None,
449                content: env.content.clone(),
450                start_line: env.start_line,
451                end_line: env.end_line,
452                metadata: Some(metadata),
453            });
454        }
455
456        entities
457    }
458}
459
/// Returns the id of the deepest section whose line range contains `line`.
///
/// `section_ranges` holds one `(start, end, level)` triple per section, with
/// inclusive 1-based bounds, and `section_ids` holds the matching ids at the
/// same indices. Among containing sections the one with the largest `level`
/// wins; on equal levels the earliest entry is kept. Returns `None` when no
/// range contains `line`.
fn find_parent_section_id(
    line: usize,
    section_ranges: &[(usize, usize, usize)], // (start, end, level)
    section_ids: &[String],
) -> Option<String> {
    section_ranges
        .iter()
        .enumerate()
        .filter(|&(_, &(start, end, _))| (start..=end).contains(&line))
        .fold(None::<(usize, usize)>, |deepest, (idx, &(_, _, level))| {
            match deepest {
                // Only a strictly deeper section replaces the current pick.
                Some((_, best_level)) if level <= best_level => deepest,
                _ => Some((idx, level)),
            }
        })
        .map(|(idx, _)| section_ids[idx].clone())
}
476
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the plugin on `content` as if it were the file `paper.tex`.
    fn extract(content: &str) -> Vec<SemanticEntity> {
        LatexParserPlugin.extract_entities(content, "paper.tex")
    }

    /// Selects the entities of one `entity_type`, preserving their order.
    fn of_kind<'a>(entities: &'a [SemanticEntity], kind: &str) -> Vec<&'a SemanticEntity> {
        entities.iter().filter(|e| e.entity_type == kind).collect()
    }

    /// Reads the `environment_type` metadata value of an entity, if any.
    fn env_type_of(entity: &SemanticEntity) -> Option<&str> {
        entity
            .metadata
            .as_ref()
            .and_then(|meta| meta.get("environment_type"))
            .map(String::as_str)
    }

    #[test]
    fn basic_section_hierarchy() {
        let entities = extract(
            r"\begin{document}
\section{Introduction}
Some intro text.
\subsection{Background}
Background material.
\section{Methods}
Method details.
\end{document}
",
        );
        let sections = of_kind(&entities, "section");

        let names: Vec<&str> = sections.iter().map(|s| s.name.as_str()).collect();
        assert_eq!(names, ["Introduction", "Background", "Methods"]);

        // Only the subsection has a parent: the section right before it.
        assert!(sections[0].parent_id.is_none());
        assert_eq!(
            sections[1].parent_id.as_deref(),
            Some("paper.tex::section::Introduction")
        );
        assert!(sections[2].parent_id.is_none());
    }

    #[test]
    fn preamble_with_command_definitions() {
        let entities = extract(
            r"\documentclass{article}
\usepackage{amsmath}
\newcommand{\R}{\mathbb{R}}
\renewcommand{\vec}[1]{\mathbf{#1}}
\begin{document}
\section{Body}
Text.
\end{document}
",
        );

        let preamble = of_kind(&entities, "preamble");
        assert_eq!(preamble.len(), 1);
        for expected in [r"\documentclass{article}", r"\usepackage{amsmath}"] {
            assert!(preamble[0].content.contains(expected));
        }

        let cmds = of_kind(&entities, "command_definition");
        let names: Vec<&str> = cmds.iter().map(|c| c.name.as_str()).collect();
        assert_eq!(names, [r"\R", r"\vec"]);
        assert_eq!(
            cmds[0].parent_id.as_deref(),
            Some("paper.tex::preamble::(preamble)")
        );
    }

    #[test]
    fn environment_with_label() {
        let entities = extract(
            r"\begin{document}
\section{Results}
\begin{theorem}\label{thm:main}
Every even number greater than 2 is the sum of two primes.
\end{theorem}
\end{document}
",
        );
        let envs = of_kind(&entities, "environment");

        assert_eq!(envs.len(), 1);
        let theorem = envs[0];
        assert_eq!(theorem.name, "thm:main");
        assert_eq!(env_type_of(theorem), Some("theorem"));
        assert_eq!(
            theorem.parent_id.as_deref(),
            Some("paper.tex::section::Results")
        );
    }

    #[test]
    fn environment_without_label() {
        let entities = extract(
            r"\begin{document}
\section{Proofs}
\begin{proof}
Trivial.
\end{proof}
\end{document}
",
        );
        let envs = of_kind(&entities, "environment");

        assert_eq!(envs.len(), 1);
        // With no label to borrow, the environment type is the name.
        assert_eq!(envs[0].name, "proof");
        assert_eq!(envs[0].id, "paper.tex::environment::proof");
    }

    #[test]
    fn starred_sections() {
        let entities = extract(
            r"\begin{document}
\section*{Acknowledgments}
Thanks to everyone.
\end{document}
",
        );
        let sections = of_kind(&entities, "section");

        assert_eq!(sections.len(), 1);
        assert_eq!(sections[0].name, "Acknowledgments");
    }

    #[test]
    fn nested_braces_in_title() {
        let entities = extract(
            r"\begin{document}
\section{The $O(n^{2})$ Algorithm}
Details here.
\end{document}
",
        );
        let sections = of_kind(&entities, "section");

        assert_eq!(sections.len(), 1);
        assert_eq!(sections[0].name, "The $O(n^{2})$ Algorithm");
    }

    #[test]
    fn comments_skipped_for_sections() {
        let entities = extract(
            r"\begin{document}
% \section{Commented Out}
\section{Real Section}
Content here.
\end{document}
",
        );
        let sections = of_kind(&entities, "section");

        assert_eq!(sections.len(), 1);
        assert_eq!(sections[0].name, "Real Section");
    }

    #[test]
    fn empty_document_only_preamble() {
        let entities = extract(
            r"\documentclass{article}
\usepackage{amsmath}
\newcommand{\N}{\mathbb{N}}
\begin{document}
\end{document}
",
        );

        assert_eq!(of_kind(&entities, "preamble").len(), 1);

        let cmds = of_kind(&entities, "command_definition");
        assert_eq!(cmds.len(), 1);
        assert_eq!(cmds[0].name, r"\N");

        assert_eq!(of_kind(&entities, "section").len(), 0);
    }

    #[test]
    fn duplicate_section_names_disambiguated() {
        let entities = extract(
            r"\begin{document}
\section{Results}
First results.
\section{Results}
Second results.
\end{document}
",
        );
        let sections = of_kind(&entities, "section");

        assert_eq!(sections.len(), 2);
        assert_eq!(sections[0].id, "paper.tex::section::Results@L2");
        assert_eq!(sections[1].id, "paper.tex::section::Results@L4");
    }

    #[test]
    fn figure_environment() {
        let entities = extract(
            r"\begin{document}
\section{Experiments}
\begin{figure}
\centering
\includegraphics{plot.png}
\caption{Results}
\label{fig:results}
\end{figure}
\end{document}
",
        );
        let envs = of_kind(&entities, "environment");

        assert_eq!(envs.len(), 1);
        assert_eq!(envs[0].name, "fig:results");
        assert_eq!(env_type_of(envs[0]), Some("figure"));
    }

    #[test]
    fn nonsignificant_environments_not_extracted() {
        let entities = extract(
            r"\begin{document}
\section{List}
\begin{itemize}
\item One
\item Two
\end{itemize}
\end{document}
",
        );

        assert_eq!(of_kind(&entities, "environment").len(), 0);
    }

    #[test]
    fn sty_file_treated_as_preamble() {
        let source = r"\NeedsTeXFormat{LaTeX2e}
\ProvidesPackage{mypackage}
\newcommand{\foo}{bar}
";
        // Uses its own path: the `.sty` extension is what is under test.
        let entities = LatexParserPlugin.extract_entities(source, "mypackage.sty");

        assert_eq!(of_kind(&entities, "preamble").len(), 1);

        let cmds = of_kind(&entities, "command_definition");
        assert_eq!(cmds.len(), 1);
        assert_eq!(cmds[0].name, r"\foo");
    }

    #[test]
    fn multiline_command_definition() {
        let entities = extract(
            r"\newcommand{\mybox}[1]{%
  \fbox{%
    \parbox{0.9\textwidth}{#1}%
  }%
}
\begin{document}
\section{Body}
Text.
\end{document}
",
        );
        let cmds = of_kind(&entities, "command_definition");

        assert_eq!(cmds.len(), 1);
        let mybox = cmds[0];
        assert_eq!(mybox.name, r"\mybox");
        assert!(mybox.content.contains(r"\parbox"));
        assert_eq!(mybox.start_line, 1);
        assert_eq!(mybox.end_line, 5);
    }

    #[test]
    fn multiple_environments_disambiguated() {
        let entities = extract(
            r"\begin{document}
\section{Theorems}
\begin{theorem}
First theorem.
\end{theorem}
\begin{theorem}
Second theorem.
\end{theorem}
\end{document}
",
        );
        let envs = of_kind(&entities, "environment");

        assert_eq!(envs.len(), 2);
        // Neither has a label, so both are named "theorem" and the ids
        // carry the start line to stay unique.
        assert_eq!(envs[0].id, "paper.tex::environment::theorem@L3");
        assert_eq!(envs[1].id, "paper.tex::environment::theorem@L6");
    }

    #[test]
    fn abstract_before_sections() {
        let entities = extract(
            r"\begin{document}
\begin{abstract}
This paper presents results.
\end{abstract}
\section{Introduction}
Intro text.
\end{document}
",
        );
        let envs = of_kind(&entities, "environment");

        assert_eq!(envs.len(), 1);
        assert_eq!(envs[0].name, "abstract");
        // The abstract precedes every section, so it has no parent.
        assert!(envs[0].parent_id.is_none());
    }
}