1use std::{fmt::Display, sync::OnceLock};
18
19use regex::bytes::{CaptureMatches, Match, Regex, RegexSet};
20
21use crate::{statement::unescape, Span};
22
23#[derive(Debug, Clone, Copy, PartialEq, Eq)]
26pub enum CommentItem {
27 Text(Span),
32 LineBreak(usize),
36 StartMathMode(usize),
40 EndMathMode(usize),
44 MathToken(Span),
48 Label(usize, Span),
52 Url(usize, Span),
56 StartHtml(usize),
60 EndHtml(usize),
63 StartSubscript(usize),
65 EndSubscript(usize),
67 StartItalic(usize),
69 EndItalic(usize),
71 BibTag(Span),
73}
74
75#[inline]
78#[must_use]
79pub const fn is_text_escape(html_mode: bool, c: u8) -> bool {
80 matches!(c, b'`' | b'[' | b'~') || !html_mode && c == b'_'
81}
82
83#[inline]
87#[must_use]
88pub const fn is_label_escape(c: u8) -> bool {
89 matches!(c, b'`' | b'[' | b'~')
90}
91
92#[inline]
95#[must_use]
96pub const fn is_math_escape(c: u8) -> bool {
97 c == b'`'
98}
99
100impl CommentItem {
101 pub fn unescape_text(html_mode: bool, buf: &[u8], out: &mut Vec<u8>) {
104 unescape(buf, out, |c| is_text_escape(html_mode, c))
105 }
106
107 pub fn unescape_label(buf: &[u8], out: &mut Vec<u8>) {
110 unescape(buf, out, is_label_escape)
111 }
112
113 pub fn unescape_math(buf: &[u8], out: &mut Vec<u8>) {
116 unescape(buf, out, is_math_escape)
117 }
118
119 const fn token(math: bool, span: Span) -> Self {
120 if math {
121 Self::MathToken(span)
122 } else {
123 Self::Text(span)
124 }
125 }
126}
127
128#[derive(Debug, Clone)]
130pub struct CommentParser<'a> {
131 buf: &'a [u8],
132 pos: usize,
133 math_mode: bool,
134 html_mode: bool,
135 item: Option<CommentItem>,
136 end_italic: usize,
137 end_subscript: usize,
138}
139
140const CLOSING_PUNCTUATION: &[u8] = b".,;)?!:]'\"_-";
141
142impl<'a> CommentParser<'a> {
143 #[must_use]
147 pub fn new(buf: &'a [u8], span: Span) -> Self {
148 Self {
149 buf: &buf[..span.end as usize],
150 pos: span.start as _,
151 math_mode: false,
152 html_mode: false,
153 item: None,
154 end_italic: usize::MAX,
155 end_subscript: 0,
156 }
157 }
158
159 pub fn unescape_text(&self, span: Span, out: &mut Vec<u8>) {
162 CommentItem::unescape_text(self.html_mode, span.as_ref(self.buf), out)
163 }
164
165 pub fn unescape_label(&self, span: Span, out: &mut Vec<u8>) {
168 CommentItem::unescape_label(span.as_ref(self.buf), out)
169 }
170
171 pub fn unescape_math(&self, span: Span, out: &mut Vec<u8>) {
174 CommentItem::unescape_math(span.as_ref(self.buf), out)
175 }
176
177 fn parse_bib(&self) -> Option<Span> {
178 let start = self.pos + 1;
179 let mut end = start;
180 while let Some(&c) = self.buf.get(end) {
181 match c {
182 b']' => return Some(Span::new(start, end)),
183 b'`' if self.buf.get(end + 1) == Some(&c) => end += 2,
184 b'`' => break,
185 _ if c.is_ascii_whitespace() => break,
186 _ => end += 1,
187 }
188 }
189 None
190 }
191
192 fn parse_subscript(&self) -> Option<usize> {
193 let c = self.buf.get(self.pos + 1)?;
194 if c.is_ascii_whitespace() || CLOSING_PUNCTUATION.contains(c) {
195 return None;
196 }
197 let start = self.pos + 1;
198 let mut end = start;
199 while let Some(c) = self.buf.get(end) {
200 if c.is_ascii_whitespace() || CLOSING_PUNCTUATION.contains(c) {
201 break;
202 }
203 end += 1;
204 }
205 Some(end)
206 }
207
208 fn parse_italic(&self) -> Option<usize> {
209 if !self.buf.get(self.pos + 1)?.is_ascii_alphanumeric() {
210 return None;
211 }
212 let mut end = self.pos + 2;
213 loop {
214 if *self.buf.get(end)? == b'_' {
215 if self.buf.get(end + 1) == Some(&b'_') {
216 end += 2
217 } else {
218 break;
219 }
220 } else {
221 end += 1
222 }
223 }
224 if !self.buf[end - 1].is_ascii_alphanumeric()
225 || matches!(self.buf.get(end + 1), Some(c) if c.is_ascii_alphanumeric())
226 {
227 return None;
228 }
229 Some(end)
230 }
231
232 fn parse_underscore(&mut self) -> Option<(usize, CommentItem)> {
233 const OPENING_PUNCTUATION: &[u8] = b"(['\"";
234 let start = self.pos;
235 let is_italic = self.pos == self.end_subscript
236 || (self.pos.checked_sub(1).and_then(|pos| self.buf.get(pos))).map_or(true, |c| {
237 c.is_ascii_whitespace() || OPENING_PUNCTUATION.contains(c)
238 });
239 let item = if is_italic {
240 let it_end = self.parse_italic()?;
241 self.pos += 1;
242 self.end_italic = it_end;
243 CommentItem::StartItalic(start)
244 } else {
245 let sub_end = self.parse_subscript()?;
246 self.pos += 1;
247 self.end_subscript = sub_end;
248 CommentItem::StartSubscript(start)
249 };
250 Some((start, item))
251 }
252
253 fn parse_html(&mut self) -> Option<(usize, CommentItem)> {
254 if self.html_mode {
255 if self.buf[self.pos..].starts_with(b"</HTML>") {
256 self.html_mode = false;
257 let start = self.pos;
258 self.pos += 7;
259 Some((start, CommentItem::EndHtml(start)))
260 } else {
261 None
262 }
263 } else if self.buf[self.pos..].starts_with(b"<HTML>") {
264 self.html_mode = true;
265 let start = self.pos;
266 self.pos += 6;
267 Some((start, CommentItem::StartHtml(start)))
268 } else {
269 None
270 }
271 }
272
273 fn parse_math_delim(&mut self, at: usize) -> CommentItem {
274 if self.math_mode {
275 self.math_mode = false;
276 CommentItem::EndMathMode(at)
277 } else {
278 self.math_mode = true;
279 CommentItem::StartMathMode(at)
280 }
281 }
282
283 fn skip_whitespace(&mut self) {
284 while matches!(self.buf.get(self.pos), Some(c) if c.is_ascii_whitespace()) {
285 self.pos += 1;
286 }
287 }
288
289 fn parse_newline(&self, last_nl: Option<usize>) -> Option<()> {
290 self.buf[last_nl? + 1..self.pos]
291 .iter()
292 .all(u8::is_ascii_whitespace)
293 .then_some(())
294 }
295
296 fn parse_label(&mut self) -> CommentItem {
297 let tilde = self.pos;
298 self.pos += 1;
299 while matches!(self.buf.get(self.pos), Some(c) if c.is_ascii_whitespace()) {
300 self.pos += 1;
301 }
302 let label_start = self.pos;
303 while let Some(&c) = self.buf.get(self.pos) {
304 match c {
305 b'[' | b'`' if self.buf.get(self.pos + 1) == Some(&c) => self.pos += 2,
306 b' ' | b'\r' | b'\n' | b'`' => break,
307 b'[' if self.parse_bib().is_some() => break,
308 b'<' if self.buf[self.pos..].starts_with(b"<HTML>")
309 || self.buf[self.pos..].starts_with(b"</HTML>") =>
310 {
311 break
312 }
313 _ => self.pos += 1,
314 }
315 }
316 let label = &self.buf[label_start..self.pos];
317 if label.starts_with(b"http://")
318 || label.starts_with(b"https://")
319 || label.starts_with(b"mm")
320 {
321 CommentItem::Url(tilde, Span::new(label_start, self.pos))
322 } else {
323 CommentItem::Label(tilde, Span::new(label_start, self.pos))
324 }
325 }
326
327 fn trim_space_before_open(&self, lo: usize, pos: usize) -> bool {
328 lo + 2 <= pos
329 && self.buf[pos - 1] == b' '
330 && (b"([".contains(&self.buf[pos - 2])
331 || self.buf[pos - 2] == b'\"'
332 && pos
333 .checked_sub(3)
334 .and_then(|i| self.buf.get(i))
335 .map_or(true, u8::is_ascii_whitespace))
336 }
337
338 fn trim_space_after_close(&self) -> bool {
339 self.buf.get(self.pos) == Some(&b' ')
340 && self
341 .buf
342 .get(self.pos + 1)
343 .map_or(false, |c| CLOSING_PUNCTUATION.contains(c))
344 }
345}
346
347impl<'a> Iterator for CommentParser<'a> {
348 type Item = CommentItem;
349
350 #[allow(clippy::cognitive_complexity)]
351 fn next(&mut self) -> Option<Self::Item> {
352 if let Some(item) = self.item.take() {
353 return Some(item);
354 }
355 let math_token = self.math_mode;
356 if math_token {
357 self.skip_whitespace()
358 }
359 let start = self.pos;
360 let mut end = self.buf.len();
361 let mut last_nl = None;
362 while let Some(&c) = self.buf.get(self.pos) {
363 if c == b'`' {
364 if self.buf.get(self.pos + 1) == Some(&b'`') {
365 self.pos += 2;
366 } else {
367 end = self.pos;
368 self.pos += 1;
369 self.item = Some(self.parse_math_delim(end));
370 if self.math_mode && self.trim_space_before_open(start, end) {
371 end -= 1;
373 } else if !self.math_mode && self.trim_space_after_close() {
374 self.pos += 1;
376 }
377 break;
378 }
379 } else if math_token {
380 if matches!(self.buf.get(self.pos), Some(c) if !c.is_ascii_whitespace()) {
381 self.pos += 1;
382 } else if self.pos > start {
383 return Some(CommentItem::MathToken(Span::new(start, self.pos)));
384 } else {
385 return None;
386 }
387 } else if c == b'<' {
388 if let Some((start, item)) = self.parse_html() {
389 end = start;
390 self.item = Some(item);
391 break;
392 }
393 self.pos += 1;
394 } else if c == b'~' {
395 if self.buf.get(self.pos + 1) == Some(&b'~') {
396 self.pos += 2;
397 } else {
398 end = self.pos;
399 if self.trim_space_before_open(start, end) {
400 end -= 1;
402 }
403 self.item = Some(self.parse_label());
404 if self.trim_space_after_close() {
405 self.pos += 1;
407 }
408 break;
409 }
410 } else if c == b'[' {
411 if self.buf.get(self.pos + 1) == Some(&b'[') {
412 self.pos += 2;
413 } else if let Some(span) = self.parse_bib() {
414 end = self.pos;
415 self.pos = span.end as usize + 1;
416 self.item = Some(CommentItem::BibTag(span));
417 break;
418 } else {
419 self.pos += 1;
420 }
421 } else if self.html_mode {
422 self.pos += 1;
423 } else if c == b'\n' {
424 if self.parse_newline(last_nl.replace(self.pos)).is_some() {
425 end = self.pos;
426 self.item = Some(CommentItem::LineBreak(end));
427 break;
428 }
429 self.pos += 1;
430 } else if c == b'_' {
431 if self.buf.get(self.pos + 1) == Some(&b'_') {
432 self.pos += 2;
433 } else if self.end_italic == self.pos {
434 end = self.pos;
435 self.pos += 1;
436 self.item = Some(CommentItem::EndItalic(end));
437 break;
438 } else if let Some((pos, item)) = self.parse_underscore() {
439 end = pos;
440 self.item = Some(item);
441 break;
442 } else {
443 self.pos += 1;
444 }
445 } else {
446 self.pos += 1;
447 }
448 if self.pos == self.end_subscript {
449 end = self.pos;
450 self.item = Some(CommentItem::EndSubscript(end));
451 break;
452 }
453 }
454 if end > start {
455 return Some(CommentItem::token(math_token, Span::new(start, end)));
456 }
457 self.item.take()
458 }
459}
460
461#[derive(Default, Debug, Clone, Copy, PartialEq, Eq)]
463pub struct Discouragements {
464 pub modification_discouraged: bool,
466 pub usage_discouraged: bool,
468}
469
470impl Discouragements {
471 #[must_use]
474 pub fn new(buf: &[u8]) -> Self {
475 static MODIFICATION: OnceLock<RegexSet> = OnceLock::new();
476 let modification = MODIFICATION.get_or_init(|| {
477 RegexSet::new([
478 r"\(Proof[ \r\n]+modification[ \r\n]+is[ \r\n]+discouraged\.\)",
479 r"\(New[ \r\n]+usage[ \r\n]+is[ \r\n]+discouraged\.\)",
480 ])
481 .unwrap()
482 });
483 let m = modification.matches(buf);
484 Self {
485 modification_discouraged: m.matched(0),
486 usage_discouraged: m.matched(1),
487 }
488 }
489}
490
491#[derive(Debug, Clone, Copy, PartialEq, Eq)]
493pub enum Parenthetical {
494 ContributedBy {
496 author: Span,
498 date: Span,
501 },
502 RevisedBy {
504 author: Span,
506 date: Span,
509 },
510 ProofShortenedBy {
512 author: Span,
514 date: Span,
517 },
518 ProofModificationDiscouraged,
520 NewUsageDiscouraged,
522}
523
524#[derive(Debug)]
526pub struct ParentheticalIter<'a> {
527 matches: CaptureMatches<'static, 'a>,
528 off: u32,
529}
530
531impl<'a> ParentheticalIter<'a> {
532 #[must_use]
534 pub fn new(buf: &'a [u8], span: Span) -> Self {
535 static PARENTHETICALS: OnceLock<Regex> = OnceLock::new();
536 let parentheticals = PARENTHETICALS.get_or_init(|| {
537 Regex::new(concat!(
538 r"\((Contributed|Revised|Proof[ \r\n]+shortened)",
539 r"[ \r\n]+by[ \r\n]+([^,)]+),[ \r\n]+([0-9]{1,2}-[A-Z][a-z]{2}-[0-9]{4})\.\)|",
540 r"\((Proof[ \r\n]+modification|New[ \r\n]+usage)[ \r\n]+is[ \r\n]+discouraged\.\)",
541 ))
542 .unwrap()
543 });
544 Self {
545 matches: parentheticals.captures_iter(span.as_ref(buf)),
546 off: span.start,
547 }
548 }
549
550 fn to_span(&self, m: Match<'_>) -> Span {
551 Span {
552 start: self.off + m.start() as u32,
553 end: self.off + m.end() as u32,
554 }
555 }
556}
557
558impl<'a> Iterator for ParentheticalIter<'a> {
559 type Item = (Span, Parenthetical);
560
561 fn next(&mut self) -> Option<Self::Item> {
562 let groups = self.matches.next()?;
563 let all = groups.get(0).unwrap();
564 let item = if let Some(m) = groups.get(1) {
565 let author = self.to_span(groups.get(2).unwrap());
566 let date = self.to_span(groups.get(3).unwrap());
567 match m.as_bytes()[0] {
568 b'C' => Parenthetical::ContributedBy { author, date },
569 b'R' => Parenthetical::RevisedBy { author, date },
570 b'P' => Parenthetical::ProofShortenedBy { author, date },
571 _ => unreachable!(),
572 }
573 } else {
574 match all.as_bytes()[1] {
575 b'P' => Parenthetical::ProofModificationDiscouraged,
576 b'N' => Parenthetical::NewUsageDiscouraged,
577 _ => unreachable!(),
578 }
579 };
580 Some((self.to_span(all), item))
581 }
582}
583
584#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq)]
588pub struct Date {
589 pub year: u16,
591 pub month: u8,
593 pub day: u8,
595}
596
597impl Display for Date {
598 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
599 const MONTHS: [&str; 12] = [
600 "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
601 ];
602 write!(
603 f,
604 "{day}-{month}-{year:04}",
605 day = self.day,
606 month = MONTHS[(self.month - 1) as usize],
607 year = self.year
608 )
609 }
610}
611
612impl TryFrom<&[u8]> for Date {
613 type Error = ();
614
615 fn try_from(value: &[u8]) -> Result<Self, Self::Error> {
616 let (day, month, year) = match value.len() {
617 10 => (&value[..1], &value[2..5], &value[6..]),
618 11 => (&value[..2], &value[3..6], &value[7..]),
619 _ => return Err(()),
620 };
621 Ok(Date {
622 year: std::str::from_utf8(year)
623 .map_err(|_| ())?
624 .parse()
625 .map_err(|_| ())?,
626 month: match month {
627 b"Jan" => 1,
628 b"Feb" => 2,
629 b"Mar" => 3,
630 b"Apr" => 4,
631 b"May" => 5,
632 b"Jun" => 6,
633 b"Jul" => 7,
634 b"Aug" => 8,
635 b"Sep" => 9,
636 b"Oct" => 10,
637 b"Nov" => 11,
638 b"Dec" => 12,
639 _ => return Err(()),
640 },
641 day: std::str::from_utf8(day)
642 .map_err(|_| ())?
643 .parse()
644 .map_err(|_| ())?,
645 })
646 }
647}