1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
//! The losslessness invariant: `reconstruct(text) == text`, byte-for-byte.
//! This is badness's foundational parser test (Tenet 4 / Core decision in
//! `AGENTS.md`).
use std::fs;
use std::path::Path;
use badness::parser::{LatexFlavor, LexConfig, parse_with_flavor, reconstruct};
/// Assert the losslessness invariant for `text`: feeding it through
/// `reconstruct` must give back exactly the same string.
fn assert_lossless(text: &str) {
    let rebuilt = reconstruct(text);
    assert_eq!(rebuilt, text);
}
/// Parse `text` with the docstrip (`.dtx`) lexer config switched on and render
/// the resulting syntax tree back to a string.
///
/// The round-trip guarantee is meant to be the same here as in the plain mode:
/// the two-layer parse only *re-parents* tokens (margins become trivia,
/// `macrocode` bodies become code) and must never drop a byte.
fn reconstruct_dtx(text: &str) -> String {
    let parsed = parse_with_flavor(
        text,
        LexConfig {
            flavor: LatexFlavor::Document,
            dtx: true,
        },
    );
    parsed.syntax().to_string()
}
/// Assert the losslessness invariant for `text` under the docstrip config:
/// `reconstruct_dtx(text)` must equal `text` exactly.
fn assert_lossless_dtx(text: &str) {
    let rebuilt = reconstruct_dtx(text);
    assert_eq!(rebuilt, text);
}
/// Hand-written inputs that must each survive `reconstruct` unchanged.
#[test]
fn roundtrip_units() {
    let inputs: &[&str] = &[
        "",
        "hello world",
        r"\section{Introduction}",
        r"$x^2 + y_i = \frac{1}{2}$",
        "a % comment\nb",
        r"\begin{itemize}\item one\end{itemize}",
        "line1\n\nline2\r\nline3\r",
        "unicode: café — naïve ∑∫ 𝕏",
        r"\\ \{ \} \% \, \;",
        "trailing backslash \\",
        "[opt] {req} & # ~ ^_",
        "no final newline",
        // Verbatim environments that take arguments: the arguments come before
        // the raw body, and the body contains characters the generic lexer would
        // otherwise (mis)read.
        "\\begin{lstlisting}[language=C]\nint a[3] = {1}; % literal\n\\end{lstlisting}",
        "\\begin{minted}[frame=single]{python}\nprint(\"$x$\")\n\\end{minted}",
        // User-defined verbatim environment (its begin-code others a catcode): the
        // two-pass parse sends the body down the opaque branch, and it still has
        // to round-trip.
        "\\newenvironment{shellenv}{\\@makeother\\$}{}\n\\begin{shellenv}\na_$b$ % literal\n\\end{shellenv}\n",
        // Leading comment-bind: comments attached *into* a command/environment
        // only get re-parented, so the bytes must come back unchanged.
        "% a doc comment\n\\section{Intro}\n",
        "% caption note\n\\begin{figure}\nbody\n\\end{figure}\n",
        "%a\n\n%b\n\\foo",
        // expl3 syntax mode: between the toggles `_`/`:` count as letters, so the
        // names lex as single control words. Token kind must not affect
        // losslessness.
        r"\ExplSyntaxOn\seq_new:N \g_@@_x_tl\ExplSyntaxOff\seq_new:N",
        // A `.ins` docstrip driver is plain `Document`-config code (no docstrip
        // mode): the `%<…>`-looking line and the commented-out `\generate` are
        // ordinary comments and must reconstruct byte-for-byte.
        "\\input docstrip.tex\n\\keepsilent\n%<*nonsense>\n\\generate{\\file{foo.sty}{\\from{foo.dtx}{package}}}\n% \\generate{\\file{x}{\\from{y}{z}}}\n\\endbatchfile\n",
    ];
    for &text in inputs {
        assert_lossless(text);
    }
}
/// Hand-written `.dtx`-shaped inputs that must round-trip under both the
/// docstrip config and the plain config.
#[test]
fn roundtrip_dtx_units() {
    // Typical `.dtx` surface shapes: a meta-comment header, a guarded driver
    // block, documentation prose behind `%` margins, and a `macrocode` block with
    // margin-less code lines. The docstrip config must stay lossless through
    // every milestone.
    let inputs: &[&str] = &[
        "% \\iffalse meta-comment\n%<*driver>\n\\documentclass{ltxdoc}\n\\begin{document}\n\\DocInput{foo.dtx}\n\\end{document}\n%</driver>\n% \\fi\n",
        "% \\section{Introduction}\n% Some prose about \\foo.\n% \\begin{macrocode}\n\\def\\foo{\\bar@baz}\n% \\end{macrocode}\n",
        // Doc line whose own content ends with a genuine trailing comment.
        "% prose with a real trailing comment % todo\n% \\DescribeMacro{\\foo}\n",
        // Blank line consisting of the margin only, between two doc paragraphs.
        "% first paragraph\n%\n% second paragraph\n",
        // Every line ends in CRLF.
        "% doc line\r\n% \\begin{macrocode}\r\n\\foo\r\n% \\end{macrocode}\r\n",
        // A macrocode block that is never closed still has to reconstruct.
        "% \\begin{macrocode}\n\\foo\n\\bar\n",
        // A `%<*pkg>` … `%</pkg>` guard pair around a single code line.
        "%<*pkg>\n\\RequirePackage{xcolor}\n%</pkg>\n",
        // Guard block with CRLF line endings (the `>` terminates before `\r`).
        "%<*driver>\r\n\\documentclass{ltxdoc}\r\n%</driver>\r\n",
        // Guard whose tag is a boolean expression.
        "%<*package|driver>\n\\foo\n%</package|driver>\n",
        // `macrocode` body containing nested groups (the formatter indents these
        // from a column-0 base; that must not matter for losslessness).
        "% \\begin{macrocode}\n\\def\\foo{%\n\\begingroup\n\\bar\n\\endgroup\n}\n% \\end{macrocode}\n",
        // Documentation-layer environment with its frames on margin lines.
        "% \\begin{itemize}\n% \\item first\n% \\item second\n% \\end{itemize}\n",
    ];
    for &text in inputs {
        assert_lossless_dtx(text);
        // Identical bytes must round-trip under the plain config as well:
        // dtx-ness changes structure only, never which bytes are kept.
        assert_lossless(text);
    }
}
/// Every `.dtx` file in `tests/corpus` must round-trip through
/// `reconstruct_dtx`.
///
/// The `.dtx` set is optional (e.g. files dropped in from CTAN): there is no
/// minimum-count assertion here, so a corpus with no `.dtx` files passes. The
/// corpus directory itself must still be readable.
///
/// # Panics
///
/// Fails the test if the directory, a directory entry, or a `.dtx` file cannot
/// be read, or if a `.dtx` file does not reconstruct to its original text.
#[test]
fn roundtrip_dtx_corpus() {
    let dir = Path::new(env!("CARGO_MANIFEST_DIR")).join("tests/corpus");
    let entries =
        fs::read_dir(&dir).unwrap_or_else(|err| panic!("read corpus dir {dir:?}: {err}"));
    for entry in entries {
        let path = entry
            .unwrap_or_else(|err| panic!("dir entry in {dir:?}: {err}"))
            .path();
        if path.extension().and_then(|ext| ext.to_str()) != Some("dtx") {
            continue;
        }
        // Name the file in the panic: without the path, an unreadable or
        // non-UTF-8 corpus file cannot be told apart from its siblings.
        let text = fs::read_to_string(&path)
            .unwrap_or_else(|err| panic!("read corpus file {path:?}: {err}"));
        assert_eq!(
            reconstruct_dtx(&text),
            text,
            "dtx losslessness failed for {path:?}"
        );
    }
}
/// Every `.tex` file in `tests/corpus` must round-trip through `reconstruct`,
/// and at least one such file must exist.
///
/// # Panics
///
/// Fails the test if the directory, a directory entry, or a `.tex` file cannot
/// be read, if a `.tex` file does not reconstruct to its original text, or if
/// no `.tex` file was found at all.
#[test]
fn roundtrip_corpus() {
    let dir = Path::new(env!("CARGO_MANIFEST_DIR")).join("tests/corpus");
    let entries =
        fs::read_dir(&dir).unwrap_or_else(|err| panic!("read corpus dir {dir:?}: {err}"));
    let mut count = 0_usize;
    for entry in entries {
        let path = entry
            .unwrap_or_else(|err| panic!("dir entry in {dir:?}: {err}"))
            .path();
        if path.extension().and_then(|ext| ext.to_str()) != Some("tex") {
            continue;
        }
        // Name the file in the panic: without the path, an unreadable or
        // non-UTF-8 corpus file cannot be told apart from its siblings.
        let text = fs::read_to_string(&path)
            .unwrap_or_else(|err| panic!("read corpus file {path:?}: {err}"));
        assert_eq!(reconstruct(&text), text, "losslessness failed for {path:?}");
        count += 1;
    }
    // An empty corpus would make this test pass vacuously, so require a file.
    assert!(count > 0, "no .tex corpus files found in {dir:?}");
}