1use regex::Regex;
2use std::collections::HashMap;
3
4use crate::model::entity::{build_entity_id, build_entity_id_disambiguated, SemanticEntity};
5use crate::parser::plugin::SemanticParserPlugin;
6use crate::utils::hash::content_hash;
7
/// Parser plugin that extracts semantic entities (preamble, command definitions,
/// sections and environments) from LaTeX source files.
pub struct LatexParserPlugin;
9
/// Environment names that `extract_entities` records as `environment` entities.
///
/// A `\begin{...}` whose name is not listed here is not turned into an entity.
const SIGNIFICANT_ENVIRONMENTS: &[&str] = &[
    // Theorem-like statements and their accompanying text.
    "theorem",
    "lemma",
    "corollary",
    "proposition",
    "definition",
    "proof",
    "example",
    "remark",
    // Figures, tables and code/algorithm blocks.
    "figure",
    "table",
    "listing",
    "algorithm",
    // Document-level parts.
    "abstract",
    "appendix",
];
26
/// Maps a sectioning command name (without the leading backslash) to its nesting
/// level: `part` is 0, `chapter` 1, `section` 2, `subsection` 3, `subsubsection` 4
/// and `paragraph` 5. Any other name yields `None`.
fn section_level(cmd: &str) -> Option<usize> {
    // Ordered from outermost to innermost; the index in this table is the level.
    const HIERARCHY: [&str; 6] = [
        "part",
        "chapter",
        "section",
        "subsection",
        "subsubsection",
        "paragraph",
    ];
    HIERARCHY.iter().position(|&known| known == cmd)
}
39
/// Returns the contents of the brace group that opens at byte offset `pos` in `s`.
///
/// The opening `{` at `pos` and its matching `}` are not part of the result; nested
/// groups are returned verbatim. A backslash escapes the character that follows it,
/// so `\{` and `\}` are copied through without affecting the nesting depth.
///
/// Returns `None` when `pos` is out of range or not on a character boundary, when
/// the text at `pos` does not start with `{`, or when the group is never closed.
fn extract_braced(s: &str, pos: usize) -> Option<String> {
    // `get` rather than slicing: a bad offset yields `None` instead of a panic.
    let mut chars = s.get(pos..)?.chars();
    if chars.next() != Some('{') {
        return None;
    }

    // Number of currently open groups; starts at 1 for the brace just consumed.
    let mut depth = 1usize;
    let mut result = String::new();

    while let Some(ch) = chars.next() {
        match ch {
            '\\' => {
                // Copy the escape and the escaped character without counting it,
                // so literal braces (`\{`, `\}`) do not open or close a group.
                result.push(ch);
                if let Some(escaped) = chars.next() {
                    result.push(escaped);
                }
            }
            '{' => {
                depth += 1;
                result.push(ch);
            }
            '}' => {
                depth -= 1;
                if depth == 0 {
                    return Some(result);
                }
                result.push(ch);
            }
            _ => result.push(ch),
        }
    }

    // Ran out of input before the group was closed.
    None
}
71
/// Reports whether the first non-whitespace character of `line` is `%`, i.e. the
/// whole line is a LaTeX comment.
fn is_comment_line(line: &str) -> bool {
    matches!(line.trim_start().chars().next(), Some('%'))
}
76
77impl SemanticParserPlugin for LatexParserPlugin {
78 fn id(&self) -> &str {
79 "latex"
80 }
81
82 fn extensions(&self) -> &[&str] {
83 &[".tex", ".latex", ".cls", ".sty"]
84 }
85
86 fn extract_entities(&self, content: &str, file_path: &str) -> Vec<SemanticEntity> {
87 let mut entities = Vec::new();
88 let lines: Vec<&str> = content.lines().collect();
89 if lines.is_empty() {
90 return entities;
91 }
92
93 let section_re = Regex::new(
94 r"\\(part|chapter|section|subsection|subsubsection|paragraph)\*?\{",
95 )
96 .unwrap();
97 let begin_env_re = Regex::new(r"\\begin\{(\w+)\}").unwrap();
98 let end_env_re = Regex::new(r"\\end\{(\w+)\}").unwrap();
99 let label_re = Regex::new(r"\\label\{([^}]+)\}").unwrap();
100 let cmd_def_re =
101 Regex::new(r"\\(newcommand|renewcommand|DeclareMathOperator)\*?\{?\\(\w+)").unwrap();
102
103 let mut doc_start: Option<usize> = None;
105 let mut doc_end: Option<usize> = None;
106 for (i, &line) in lines.iter().enumerate() {
107 if !is_comment_line(line) {
108 if doc_start.is_none() && line.contains(r"\begin{document}") {
109 doc_start = Some(i);
110 } else if doc_start.is_some()
111 && doc_end.is_none()
112 && line.contains(r"\end{document}")
113 {
114 doc_end = Some(i);
115 }
116 }
117 }
118
119 let body_start = doc_start.map_or(0, |s| s + 1);
120 let body_end = doc_end.unwrap_or(lines.len());
121
122 let preamble_range: Option<(usize, usize)> = if let Some(ds) = doc_start {
126 if ds > 0 {
127 Some((0, ds))
128 } else {
129 None
130 }
131 } else if file_path.ends_with(".sty") || file_path.ends_with(".cls") {
132 Some((0, lines.len()))
133 } else {
134 None
135 };
136
137 if let Some((p_start, p_end)) = preamble_range {
138 let preamble_content = lines[p_start..p_end].join("\n").trim().to_string();
139 if !preamble_content.is_empty() {
140 let pid = build_entity_id(file_path, "preamble", "(preamble)", None);
141 entities.push(SemanticEntity {
142 id: pid.clone(),
143 file_path: file_path.to_string(),
144 entity_type: "preamble".to_string(),
145 name: "(preamble)".to_string(),
146 parent_id: None,
147 content_hash: content_hash(&preamble_content),
148 structural_hash: None,
149 content: preamble_content,
150 start_line: p_start + 1,
151 end_line: p_end,
152 metadata: None,
153 });
154
155 let preamble_lines = &lines[p_start..p_end];
157 let mut i = 0;
158 while i < preamble_lines.len() {
159 let line = preamble_lines[i];
160 if !is_comment_line(line) {
161 if let Some(caps) = cmd_def_re.captures(line) {
162 let cmd_name = format!("\\{}", &caps[2]);
163 let def_start = i;
164 let mut def_end = i;
165
166 let mut depth: i32 = 0;
168 for j in i..preamble_lines.len() {
169 for ch in preamble_lines[j].chars() {
170 if ch == '{' {
171 depth += 1;
172 } else if ch == '}' {
173 depth -= 1;
174 }
175 }
176 def_end = j;
177 if depth <= 0 {
178 break;
179 }
180 }
181
182 let def_content = preamble_lines[def_start..=def_end]
183 .join("\n")
184 .trim()
185 .to_string();
186 let cmd_id = build_entity_id(
187 file_path,
188 "command_definition",
189 &cmd_name,
190 Some(&pid),
191 );
192
193 entities.push(SemanticEntity {
194 id: cmd_id,
195 file_path: file_path.to_string(),
196 entity_type: "command_definition".to_string(),
197 name: cmd_name,
198 parent_id: Some(pid.clone()),
199 content_hash: content_hash(&def_content),
200 structural_hash: None,
201 content: def_content,
202 start_line: p_start + def_start + 1,
203 end_line: p_start + def_end + 1,
204 metadata: None,
205 });
206
207 i = def_end + 1;
208 continue;
209 }
210 }
211 i += 1;
212 }
213
214 }
215 }
216
217 if preamble_range.map_or(false, |(_, end)| end == lines.len()) {
219 return entities;
220 }
221
222 struct Section {
224 level: usize,
225 name: String,
226 start_line: usize, lines: Vec<String>,
228 base_id: String,
229 parent_index: Option<usize>,
230 }
231
232 let mut sections: Vec<Section> = Vec::new();
233 let mut current_section: Option<usize> = None;
234 let mut section_stack: Vec<(usize, usize)> = Vec::new(); for i in body_start..body_end {
237 let line = lines[i];
238 let line_num = i + 1; if is_comment_line(line) {
241 if let Some(idx) = current_section {
242 sections[idx].lines.push(line.to_string());
243 }
244 continue;
245 }
246
247 if let Some(m) = section_re.find(line) {
248 if let Some(caps) = section_re.captures(line) {
249 let cmd = &caps[1];
250 if let Some(level) = section_level(cmd) {
251 let brace_pos = m.end() - 1; let name = extract_braced(line, brace_pos)
254 .unwrap_or_else(|| cmd.to_string());
255
256 while section_stack.last().map_or(false, |(l, _)| *l >= level) {
258 section_stack.pop();
259 }
260 let parent_index = section_stack.last().map(|(_, idx)| *idx);
261
262 sections.push(Section {
263 level,
264 name: name.clone(),
265 start_line: line_num,
266 lines: vec![line.to_string()],
267 base_id: build_entity_id(file_path, "section", &name, None),
268 parent_index,
269 });
270 let section_index = sections.len() - 1;
271 current_section = Some(section_index);
272 section_stack.push((level, section_index));
273 continue;
274 }
275 }
276 }
277
278 if let Some(idx) = current_section {
280 sections[idx].lines.push(line.to_string());
281 }
282 }
283
284 let mut id_counts: HashMap<&str, usize> = HashMap::new();
286 for section in §ions {
287 *id_counts.entry(section.base_id.as_str()).or_default() += 1;
288 }
289
290 let section_ids: Vec<String> = sections
291 .iter()
292 .map(|section| {
293 if id_counts[section.base_id.as_str()] > 1 {
294 build_entity_id_disambiguated(
295 file_path,
296 "section",
297 §ion.name,
298 None,
299 section.start_line,
300 )
301 } else {
302 section.base_id.clone()
303 }
304 })
305 .collect();
306
307 let section_ranges: Vec<(usize, usize, usize)> = sections
309 .iter()
310 .map(|s| (s.start_line, s.start_line + s.lines.len() - 1, s.level))
311 .collect();
312
313 for (index, section) in sections.iter().enumerate() {
314 let section_content = section.lines.join("\n").trim().to_string();
315 if section_content.is_empty() {
316 continue;
317 }
318
319 entities.push(SemanticEntity {
320 id: section_ids[index].clone(),
321 file_path: file_path.to_string(),
322 entity_type: "section".to_string(),
323 name: section.name.clone(),
324 parent_id: section
325 .parent_index
326 .map(|pi| section_ids[pi].clone()),
327 content_hash: content_hash(§ion_content),
328 structural_hash: None,
329 content: section_content,
330 start_line: section.start_line,
331 end_line: section.start_line + section.lines.len() - 1,
332 metadata: None,
333 });
334 }
335
336 struct EnvInfo {
338 env_type: String,
339 name: String,
340 start_line: usize, end_line: usize, content: String,
343 base_id: String,
344 }
345
346 let mut env_entities: Vec<EnvInfo> = Vec::new();
347 let mut env_stack: Vec<(String, usize, Vec<String>)> = Vec::new();
349
350 for i in body_start..body_end {
351 let line = lines[i];
352 let line_num = i + 1;
353
354 if is_comment_line(line) {
355 if let Some((_, _, ref mut env_lines)) = env_stack.last_mut() {
356 env_lines.push(line.to_string());
357 }
358 continue;
359 }
360
361 if let Some(caps) = begin_env_re.captures(line) {
363 let env_name = caps[1].to_string();
364 if env_name != "document"
365 && SIGNIFICANT_ENVIRONMENTS.contains(&env_name.as_str())
366 {
367 env_stack.push((env_name, line_num, vec![line.to_string()]));
368 continue;
369 }
370 }
371
372 if let Some(caps) = end_env_re.captures(line) {
374 let env_name = caps[1].to_string();
375 if let Some(pos) =
376 env_stack.iter().rposition(|(name, _, _)| *name == env_name)
377 {
378 let (env_type, start_line, mut env_lines) = env_stack.remove(pos);
379 env_lines.push(line.to_string());
380
381 let label = env_lines
383 .iter()
384 .find_map(|l| label_re.captures(l).map(|c| c[1].to_string()));
385
386 let name = label.unwrap_or_else(|| env_type.clone());
387 let env_content = env_lines.join("\n").trim().to_string();
388
389 env_entities.push(EnvInfo {
390 env_type,
391 name: name.clone(),
392 start_line,
393 end_line: line_num,
394 content: env_content,
395 base_id: build_entity_id(file_path, "environment", &name, None),
396 });
397 continue;
398 }
399 }
400
401 if let Some((_, _, ref mut env_lines)) = env_stack.last_mut() {
403 env_lines.push(line.to_string());
404 }
405 }
406
407 let mut env_id_counts: HashMap<&str, usize> = HashMap::new();
409 for env in &env_entities {
410 *env_id_counts.entry(env.base_id.as_str()).or_default() += 1;
411 }
412
413 let env_ids: Vec<String> = env_entities
414 .iter()
415 .map(|env| {
416 if env_id_counts[env.base_id.as_str()] > 1 {
417 build_entity_id_disambiguated(
418 file_path,
419 "environment",
420 &env.name,
421 None,
422 env.start_line,
423 )
424 } else {
425 env.base_id.clone()
426 }
427 })
428 .collect();
429
430 for (index, env) in env_entities.iter().enumerate() {
431 let parent_id = find_parent_section_id(
433 env.start_line,
434 §ion_ranges,
435 §ion_ids,
436 );
437
438 let mut metadata = HashMap::new();
439 metadata.insert("environment_type".to_string(), env.env_type.clone());
440
441 entities.push(SemanticEntity {
442 id: env_ids[index].clone(),
443 file_path: file_path.to_string(),
444 entity_type: "environment".to_string(),
445 name: env.name.clone(),
446 parent_id,
447 content_hash: content_hash(&env.content),
448 structural_hash: None,
449 content: env.content.clone(),
450 start_line: env.start_line,
451 end_line: env.end_line,
452 metadata: Some(metadata),
453 });
454 }
455
456 entities
457 }
458}
459
/// Picks the ID of the section that should parent something located at `line`.
///
/// `section_ranges` holds one `(first_line, last_line, level)` tuple per section
/// (both bounds inclusive) and `section_ids` holds the ID at the same index. Among
/// the sections whose range contains `line`, the one with the greatest level wins;
/// on equal levels the earliest entry is kept. Returns `None` when no range
/// contains `line`.
fn find_parent_section_id(
    line: usize,
    section_ranges: &[(usize, usize, usize)],
    section_ids: &[String],
) -> Option<String> {
    section_ranges
        .iter()
        .enumerate()
        .filter(|&(_, &(first, last, _))| first <= line && line <= last)
        .fold(
            None,
            |deepest: Option<(usize, usize)>, (idx, &(_, _, level))| match deepest {
                // Only a strictly deeper section replaces the current candidate.
                Some((_, deepest_level)) if level <= deepest_level => deepest,
                _ => Some((idx, level)),
            },
        )
        .map(|(idx, _)| section_ids[idx].clone())
}
476
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the plugin over `content` as if it were the file `path`.
    fn extract_from(content: &str, path: &str) -> Vec<SemanticEntity> {
        LatexParserPlugin.extract_entities(content, path)
    }

    /// Runs the plugin over `content` as if it were the file `paper.tex`.
    fn extract(content: &str) -> Vec<SemanticEntity> {
        extract_from(content, "paper.tex")
    }

    /// Returns, in extraction order, the entities whose `entity_type` is `kind`.
    fn of_kind<'a>(entities: &'a [SemanticEntity], kind: &str) -> Vec<&'a SemanticEntity> {
        entities
            .iter()
            .filter(|e| e.entity_type == kind)
            .collect()
    }

    #[test]
    fn basic_section_hierarchy() {
        let entities = extract(
            r"\begin{document}
\section{Introduction}
Some intro text.
\subsection{Background}
Background material.
\section{Methods}
Method details.
\end{document}
",
        );
        let sections = of_kind(&entities, "section");

        assert_eq!(sections.len(), 3);
        assert_eq!(sections[0].name, "Introduction");
        assert_eq!(sections[0].parent_id, None);
        assert_eq!(sections[1].name, "Background");
        assert_eq!(
            sections[1].parent_id.as_deref(),
            Some("paper.tex::section::Introduction")
        );
        assert_eq!(sections[2].name, "Methods");
        assert_eq!(sections[2].parent_id, None);
    }

    #[test]
    fn preamble_with_command_definitions() {
        let entities = extract(
            r"\documentclass{article}
\usepackage{amsmath}
\newcommand{\R}{\mathbb{R}}
\renewcommand{\vec}[1]{\mathbf{#1}}
\begin{document}
\section{Body}
Text.
\end{document}
",
        );

        let preamble = of_kind(&entities, "preamble");
        assert_eq!(preamble.len(), 1);
        assert!(preamble[0].content.contains(r"\documentclass{article}"));
        assert!(preamble[0].content.contains(r"\usepackage{amsmath}"));

        let cmds = of_kind(&entities, "command_definition");
        assert_eq!(cmds.len(), 2);
        assert_eq!(cmds[0].name, r"\R");
        assert_eq!(cmds[1].name, r"\vec");
        assert_eq!(
            cmds[0].parent_id.as_deref(),
            Some("paper.tex::preamble::(preamble)")
        );
    }

    #[test]
    fn environment_with_label() {
        let entities = extract(
            r"\begin{document}
\section{Results}
\begin{theorem}\label{thm:main}
Every even number greater than 2 is the sum of two primes.
\end{theorem}
\end{document}
",
        );
        let envs = of_kind(&entities, "environment");

        assert_eq!(envs.len(), 1);
        assert_eq!(envs[0].name, "thm:main");
        assert_eq!(
            envs[0].metadata.as_ref().unwrap().get("environment_type"),
            Some(&"theorem".to_string())
        );
        assert_eq!(
            envs[0].parent_id.as_deref(),
            Some("paper.tex::section::Results")
        );
    }

    #[test]
    fn environment_without_label() {
        let entities = extract(
            r"\begin{document}
\section{Proofs}
\begin{proof}
Trivial.
\end{proof}
\end{document}
",
        );
        let envs = of_kind(&entities, "environment");

        assert_eq!(envs.len(), 1);
        assert_eq!(envs[0].name, "proof");
        assert_eq!(envs[0].id, "paper.tex::environment::proof");
    }

    #[test]
    fn starred_sections() {
        let entities = extract(
            r"\begin{document}
\section*{Acknowledgments}
Thanks to everyone.
\end{document}
",
        );
        let sections = of_kind(&entities, "section");

        assert_eq!(sections.len(), 1);
        assert_eq!(sections[0].name, "Acknowledgments");
    }

    #[test]
    fn nested_braces_in_title() {
        let entities = extract(
            r"\begin{document}
\section{The $O(n^{2})$ Algorithm}
Details here.
\end{document}
",
        );
        let sections = of_kind(&entities, "section");

        assert_eq!(sections.len(), 1);
        assert_eq!(sections[0].name, "The $O(n^{2})$ Algorithm");
    }

    #[test]
    fn comments_skipped_for_sections() {
        let entities = extract(
            r"\begin{document}
% \section{Commented Out}
\section{Real Section}
Content here.
\end{document}
",
        );
        let sections = of_kind(&entities, "section");

        assert_eq!(sections.len(), 1);
        assert_eq!(sections[0].name, "Real Section");
    }

    #[test]
    fn empty_document_only_preamble() {
        let entities = extract(
            r"\documentclass{article}
\usepackage{amsmath}
\newcommand{\N}{\mathbb{N}}
\begin{document}
\end{document}
",
        );

        assert_eq!(of_kind(&entities, "preamble").len(), 1);

        let cmds = of_kind(&entities, "command_definition");
        assert_eq!(cmds.len(), 1);
        assert_eq!(cmds[0].name, r"\N");

        assert_eq!(of_kind(&entities, "section").len(), 0);
    }

    #[test]
    fn duplicate_section_names_disambiguated() {
        let entities = extract(
            r"\begin{document}
\section{Results}
First results.
\section{Results}
Second results.
\end{document}
",
        );
        let sections = of_kind(&entities, "section");

        assert_eq!(sections.len(), 2);
        assert_eq!(sections[0].id, "paper.tex::section::Results@L2");
        assert_eq!(sections[1].id, "paper.tex::section::Results@L4");
    }

    #[test]
    fn figure_environment() {
        let entities = extract(
            r"\begin{document}
\section{Experiments}
\begin{figure}
\centering
\includegraphics{plot.png}
\caption{Results}
\label{fig:results}
\end{figure}
\end{document}
",
        );
        let envs = of_kind(&entities, "environment");

        assert_eq!(envs.len(), 1);
        assert_eq!(envs[0].name, "fig:results");
        assert_eq!(
            envs[0].metadata.as_ref().unwrap().get("environment_type"),
            Some(&"figure".to_string())
        );
    }

    #[test]
    fn nonsignificant_environments_not_extracted() {
        let entities = extract(
            r"\begin{document}
\section{List}
\begin{itemize}
\item One
\item Two
\end{itemize}
\end{document}
",
        );

        assert_eq!(of_kind(&entities, "environment").len(), 0);
    }

    #[test]
    fn sty_file_treated_as_preamble() {
        let entities = extract_from(
            r"\NeedsTeXFormat{LaTeX2e}
\ProvidesPackage{mypackage}
\newcommand{\foo}{bar}
",
            "mypackage.sty",
        );

        assert_eq!(of_kind(&entities, "preamble").len(), 1);

        let cmds = of_kind(&entities, "command_definition");
        assert_eq!(cmds.len(), 1);
        assert_eq!(cmds[0].name, r"\foo");
    }

    #[test]
    fn multiline_command_definition() {
        let entities = extract(
            r"\newcommand{\mybox}[1]{%
 \fbox{%
 \parbox{0.9\textwidth}{#1}%
 }%
}
\begin{document}
\section{Body}
Text.
\end{document}
",
        );
        let cmds = of_kind(&entities, "command_definition");

        assert_eq!(cmds.len(), 1);
        assert_eq!(cmds[0].name, r"\mybox");
        assert!(cmds[0].content.contains(r"\parbox"));
        assert_eq!(cmds[0].start_line, 1);
        assert_eq!(cmds[0].end_line, 5);
    }

    #[test]
    fn multiple_environments_disambiguated() {
        let entities = extract(
            r"\begin{document}
\section{Theorems}
\begin{theorem}
First theorem.
\end{theorem}
\begin{theorem}
Second theorem.
\end{theorem}
\end{document}
",
        );
        let envs = of_kind(&entities, "environment");

        assert_eq!(envs.len(), 2);
        assert_eq!(envs[0].id, "paper.tex::environment::theorem@L3");
        assert_eq!(envs[1].id, "paper.tex::environment::theorem@L6");
    }

    #[test]
    fn abstract_before_sections() {
        let entities = extract(
            r"\begin{document}
\begin{abstract}
This paper presents results.
\end{abstract}
\section{Introduction}
Intro text.
\end{document}
",
        );
        let envs = of_kind(&entities, "environment");

        assert_eq!(envs.len(), 1);
        assert_eq!(envs[0].name, "abstract");
        assert_eq!(envs[0].parent_id, None);
    }
}