1use std::collections::HashSet;
2use std::io::{self, BufRead, Write};
3
/// Output dialect for the generated permuted index.
///
/// Fieldless, so it is cheap to copy; `Copy`/`Eq` are derived in addition
/// to the original `Clone`/`PartialEq` (backward compatible).
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum OutputFormat {
    /// roff macro lines (`.xx "..." ...`), see `write_roff`.
    Roff,
    /// TeX macro lines (`\xx {...}...`), see `write_tex`.
    Tex,
    /// Aligned plain-text columns; the default (see `PtxConfig::default`).
    Plain,
}
14
/// Options controlling keyword selection, sorting, and output formatting.
#[derive(Clone, Debug)]
pub struct PtxConfig {
    /// Total output line width in columns (default 72).
    pub width: usize,
    /// Fold keywords to lowercase for filtering and sort order.
    pub ignore_case: bool,
    /// Generate "file:line" (or bare "line") references automatically.
    pub auto_reference: bool,
    /// NOTE(review): not read anywhere in this module — presumably handled
    /// by the caller; confirm before relying on it.
    pub traditional: bool,
    /// Output dialect (plain text, roff, or TeX).
    pub format: OutputFormat,
    /// Words never used as keywords.
    pub ignore_words: HashSet<String>,
    /// When set, ONLY these words become keywords (takes precedence over
    /// `ignore_words` — see `should_index`).
    pub only_words: Option<HashSet<String>>,
    /// Emit stored references alongside entries.
    pub references: bool,
    /// Spacing in columns between output fields (default 3).
    pub gap_size: usize,
    /// Print the reference at the right end of the line instead of the left.
    pub right_reference: bool,
    /// NOTE(review): accepted but unused in this module — confirm intended.
    pub sentence_regexp: Option<String>,
    /// NOTE(review): accepted but unused in this module — confirm intended.
    pub word_regexp: Option<String>,
    /// Truncation marker string; "/" when None (see `write_plain`).
    pub flag_truncation: Option<String>,
    /// Macro name for roff/TeX output; "xx" when None.
    pub macro_name: Option<String>,
}
33
34impl Default for PtxConfig {
35 fn default() -> Self {
36 Self {
37 width: 72,
38 ignore_case: false,
39 auto_reference: false,
40 traditional: false,
41 format: OutputFormat::Plain,
42 ignore_words: HashSet::new(),
43 only_words: None,
44 references: false,
45 gap_size: 3,
46 right_reference: false,
47 sentence_regexp: None,
48 word_regexp: None,
49 flag_truncation: None,
50 macro_name: None,
51 }
52 }
53}
54
/// Case-folded copies of the ignore/only word sets, built once per run so
/// per-word checks don't re-lowercase the sets. Only populated when
/// `ignore_case` is set; otherwise left empty/None and `should_index`
/// consults the config's original sets directly.
struct NormalizedSets {
    ignore_lower: HashSet<String>,
    only_lower: Option<HashSet<String>>,
}
60
61impl NormalizedSets {
62 fn new(config: &PtxConfig) -> Self {
63 if config.ignore_case {
64 let ignore_lower = config
65 .ignore_words
66 .iter()
67 .map(|w| w.to_lowercase())
68 .collect();
69 let only_lower = config
70 .only_words
71 .as_ref()
72 .map(|s| s.iter().map(|w| w.to_lowercase()).collect());
73 Self {
74 ignore_lower,
75 only_lower,
76 }
77 } else {
78 Self {
80 ignore_lower: HashSet::new(),
81 only_lower: None,
82 }
83 }
84 }
85}
86
/// One keyword occurrence, stored compactly as indices into the context
/// list instead of owned strings (u32/u16 keep the entry small for large
/// inputs).
struct KwicEntry {
    /// Index into the `(reference, text)` context slice.
    line_idx: u32,
    /// Byte offset of the keyword within the context text.
    word_start: u32,
    /// Keyword length in bytes (guarded by a debug_assert in
    /// `generate_entries`).
    word_len: u16,
}
93
/// Borrowed slices of one context sentence arranged for output, plus a
/// truncation flag per printed side. `keyafter` is the keyword together
/// with its trailing context; `keyword`/`after` are the same span split in
/// two (used by the TeX writer). `tail`/`head` hold overflow context
/// wrapped onto the opposite half-line (see `compute_layout`).
struct LayoutFields<'a> {
    /// Context following `keyafter`, printed in the left half's slack.
    tail: &'a str,
    /// Context immediately left of the keyword.
    before: &'a str,
    /// Keyword plus trailing context fitting the right half.
    keyafter: &'a str,
    /// The keyword alone.
    keyword: &'a str,
    /// The part of `keyafter` following the keyword.
    after: &'a str,
    /// Context preceding `before`, printed in the right half's slack.
    head: &'a str,
    tail_truncated: bool,
    before_truncated: bool,
    keyafter_truncated: bool,
    head_truncated: bool,
}
107
/// 256-byte buffer of spaces used to emit padding in chunks.
///
/// Built with an array-repeat expression rather than a literal byte string:
/// the original `b" "` literal did not match the declared `[u8; 256]`
/// length, and the repeat form cannot drift out of sync.
const SPACES: [u8; 256] = [b' '; 256];

/// Write exactly `n` space characters to `out`, at most 256 bytes per
/// `write_all` call. Writing zero spaces is a no-op.
#[inline]
fn write_spaces<W: Write>(out: &mut W, n: usize) -> io::Result<()> {
    let mut remaining = n;
    while remaining > 0 {
        let chunk = remaining.min(SPACES.len());
        out.write_all(&SPACES[..chunk])?;
        remaining -= chunk;
    }
    Ok(())
}
122
/// Scan `line` for words and return `(byte_offset, word)` pairs.
///
/// A word starts with an ASCII letter and continues through ASCII
/// alphanumerics; every other byte is a separator. Offsets are byte
/// positions into `line`.
fn extract_words(line: &str) -> Vec<(usize, &str)> {
    let bytes = line.as_bytes();
    let mut words = Vec::new();
    let mut pos = 0;

    // Jump to the next letter; the word then extends over alphanumerics.
    while let Some(rel) = bytes[pos..].iter().position(|b| b.is_ascii_alphabetic()) {
        let begin = pos + rel;
        let end = begin
            + 1
            + bytes[begin + 1..]
                .iter()
                .take_while(|b| b.is_ascii_alphanumeric())
                .count();
        words.push((begin, &line[begin..end]));
        pos = end;
    }

    words
}
147
148#[inline]
150fn should_index(word: &str, config: &PtxConfig, norm: &NormalizedSets) -> bool {
151 if config.ignore_case {
152 if let Some(ref only) = norm.only_lower {
154 let lower = word.to_ascii_lowercase();
156 return only.contains(lower.as_str());
157 }
158 let lower = word.to_ascii_lowercase();
159 !norm.ignore_lower.contains(lower.as_str())
160 } else {
161 if let Some(ref only) = config.only_words {
162 return only.contains(word);
163 }
164 !config.ignore_words.contains(word)
165 }
166}
167
168fn generate_entries(
170 lines: &[(String, String)],
171 config: &PtxConfig,
172 norm: &NormalizedSets,
173) -> (Vec<KwicEntry>, usize) {
174 let mut entries = Vec::new();
175 let mut max_word_length: usize = 0;
176
177 for (line_idx, (_reference, line)) in lines.iter().enumerate() {
178 let words = extract_words(line);
179
180 for &(word_start, word) in &words {
181 let wlen = word.len();
182 if wlen > max_word_length {
183 max_word_length = wlen;
184 }
185
186 if !should_index(word, config, norm) {
187 continue;
188 }
189
190 debug_assert!(
191 wlen <= u16::MAX as usize,
192 "word length {} exceeds u16::MAX",
193 wlen
194 );
195 entries.push(KwicEntry {
196 line_idx: line_idx as u32,
197 word_start: word_start as u32,
198 word_len: wlen as u16,
199 });
200 }
201 }
202
203 if config.ignore_case {
207 entries.sort_by(|a, b| {
208 let a_line = &lines[a.line_idx as usize].1;
209 let b_line = &lines[b.line_idx as usize].1;
210 let a_kw = &a_line[a.word_start as usize..a.word_start as usize + a.word_len as usize];
211 let b_kw = &b_line[b.word_start as usize..b.word_start as usize + b.word_len as usize];
212 a_kw.bytes()
213 .map(|c| c.to_ascii_lowercase())
214 .cmp(b_kw.bytes().map(|c| c.to_ascii_lowercase()))
215 .then_with(|| {
216 lines[a.line_idx as usize]
217 .0
218 .cmp(&lines[b.line_idx as usize].0)
219 })
220 });
221 } else {
222 entries.sort_by(|a, b| {
223 let a_line = &lines[a.line_idx as usize].1;
224 let b_line = &lines[b.line_idx as usize].1;
225 let a_kw = &a_line[a.word_start as usize..a.word_start as usize + a.word_len as usize];
226 let b_kw = &b_line[b.word_start as usize..b.word_start as usize + b.word_len as usize];
227 a_kw.cmp(b_kw).then_with(|| {
228 lines[a.line_idx as usize]
229 .0
230 .cmp(&lines[b.line_idx as usize].0)
231 })
232 });
233 }
234
235 (entries, max_word_length)
236}
237
/// Advance `pos` past one token: a whole alphanumeric word when `pos` sits
/// on an ASCII letter, otherwise a single byte. A position at or past the
/// end is returned unchanged.
#[inline]
fn skip_something(s: &str, pos: usize) -> usize {
    let bytes = s.as_bytes();
    match bytes.get(pos) {
        None => pos,
        Some(b) if b.is_ascii_alphabetic() => {
            let mut end = pos + 1;
            while bytes.get(end).map_or(false, |c| c.is_ascii_alphanumeric()) {
                end += 1;
            }
            end
        }
        Some(_) => pos + 1,
    }
}
255
/// Advance `pos` forward over ASCII whitespace; out-of-range positions are
/// returned unchanged.
#[inline]
fn skip_white(s: &str, pos: usize) -> usize {
    let bytes = s.as_bytes();
    let mut cursor = pos;
    while bytes.get(cursor).map_or(false, |b| b.is_ascii_whitespace()) {
        cursor += 1;
    }
    cursor
}
266
/// Move `pos` backwards over ASCII whitespace, never retreating past
/// `start`. Returns the first position whose preceding byte is non-space
/// (or `start`).
#[inline]
fn skip_white_backwards(s: &str, pos: usize, start: usize) -> usize {
    let bytes = s.as_bytes();
    let mut cursor = pos;
    while cursor > start {
        if !bytes[cursor - 1].is_ascii_whitespace() {
            break;
        }
        cursor -= 1;
    }
    cursor
}
277
/// Slice `sentence` into the display fields for one keyword occurrence
/// (`tail`, `before`, `keyafter`/`keyword`+`after`, `head`) together with
/// per-side truncation flags.
///
/// The configured width is split into two half-lines after reserving a
/// left-hand reference column (unless the reference is empty or printed on
/// the right). All positions are byte offsets; field boundaries are snapped
/// to token edges via `skip_something` and trimmed of whitespace.
fn compute_layout<'a>(
    sentence: &'a str,
    word_start: usize,
    keyword_len: usize,
    ref_str: &str,
    config: &PtxConfig,
    max_word_length: usize,
    ref_max_width: usize,
) -> LayoutFields<'a> {
    let total_width = config.width;
    let gap = config.gap_size;
    // NOTE(review): the truncation marker is assumed to be exactly one
    // column wide here, while write_plain measures the actual configured
    // flag string — confirm alignment when flag_truncation has len > 1.
    let trunc_len = 1; let ref_width = if ref_str.is_empty() || config.right_reference {
        0
    } else {
        ref_max_width + gap
    };

    // Columns left for the KWIC text once the reference column is reserved.
    let line_width = if total_width > ref_width {
        total_width - ref_width
    } else {
        total_width
    };

    let half_line_width = line_width / 2;

    // Budget for the left ("before") field: half line minus the inter-field
    // gap and two truncation markers; clamps to 0 on very narrow widths.
    let before_max_width = if half_line_width > gap + 2 * trunc_len {
        half_line_width - gap - 2 * trunc_len
    } else {
        0
    };
    // Budget for the right field (keyword plus following context).
    let keyafter_max_width = if half_line_width > 2 * trunc_len {
        half_line_width - 2 * trunc_len
    } else {
        0
    };

    let line_len = sentence.len();

    // --- keyafter: the keyword plus as much trailing context as fits ---
    let keyafter_start = word_start;
    let mut keyafter_end = word_start + keyword_len;
    {
        // Grow one token at a time; keyafter_end lags cursor so a token
        // that would overshoot the budget is excluded.
        let mut cursor = keyafter_end;
        while cursor < line_len && cursor <= keyafter_start + keyafter_max_width {
            keyafter_end = cursor;
            cursor = skip_something(sentence, cursor);
        }
        if cursor <= keyafter_start + keyafter_max_width {
            keyafter_end = cursor;
        }
    }
    // Truncated on the right iff part of the sentence did not fit.
    let mut keyafter_truncation = keyafter_end < line_len;
    keyafter_end = skip_white_backwards(sentence, keyafter_end, keyafter_start);

    // --- left context window ---
    let left_context_start: usize = 0;
    // Start the left context at most half a line (plus the widest word)
    // before the keyword, snapped forward to a token boundary.
    let left_field_start = if word_start > half_line_width + max_word_length {
        let lfs = word_start - (half_line_width + max_word_length);
        skip_something(sentence, lfs)
    } else {
        left_context_start
    };

    // --- before: context immediately left of the keyword ---
    let mut before_start: usize = left_field_start;
    let mut before_end = keyafter_start;
    before_end = skip_white_backwards(sentence, before_end, before_start);

    // Drop whole tokens from the left until the field fits its budget.
    while before_start + before_max_width < before_end {
        before_start = skip_something(sentence, before_start);
    }

    // Truncated on the left iff non-space text precedes before_start.
    let mut before_truncation = {
        let cursor = skip_white_backwards(sentence, before_start, 0);
        cursor > left_context_start
    };

    before_start = skip_white(sentence, before_start);
    let before_len = if before_end > before_start {
        before_end - before_start
    } else {
        0
    };

    // --- tail: context after keyafter, wrapped into slack left of "before" ---
    let tail_max_width_raw: isize = before_max_width as isize - before_len as isize - gap as isize;
    let mut tail_start: usize = 0;
    let mut tail_end: usize = 0;
    let mut tail_truncation = false;
    let mut has_tail = false;

    if tail_max_width_raw > 0 {
        let tail_max_width = tail_max_width_raw as usize;
        tail_start = skip_white(sentence, keyafter_end);
        tail_end = tail_start;
        let mut cursor = tail_end;
        while cursor < line_len && cursor < tail_start + tail_max_width {
            tail_end = cursor;
            cursor = skip_something(sentence, cursor);
        }
        if cursor < tail_start + tail_max_width {
            tail_end = cursor;
        }

        if tail_end > tail_start {
            has_tail = true;
            // The tail continues the right-hand text, so the truncation
            // marker moves from keyafter to the tail.
            keyafter_truncation = false;
            tail_truncation = tail_end < line_len;
        } else {
            tail_truncation = false;
        }

        tail_end = skip_white_backwards(sentence, tail_end, tail_start);
    }

    // --- head: context before "before", wrapped into slack right of keyafter ---
    let keyafter_len = if keyafter_end > keyafter_start {
        keyafter_end - keyafter_start
    } else {
        0
    };
    let head_max_width_raw: isize =
        keyafter_max_width as isize - keyafter_len as isize - gap as isize;
    let mut head_start: usize = 0;
    let mut head_end: usize = 0;
    let mut head_truncation = false;
    let mut has_head = false;

    if head_max_width_raw > 0 {
        let head_max_width = head_max_width_raw as usize;
        head_end = skip_white_backwards(sentence, before_start, 0);

        head_start = left_field_start;
        while head_start + head_max_width < head_end {
            head_start = skip_something(sentence, head_start);
        }

        if head_end > head_start {
            has_head = true;
            // As with tail/keyafter: the head absorbs the left marker.
            before_truncation = false;
            head_truncation = {
                let cursor = skip_white_backwards(sentence, head_start, 0);
                cursor > left_context_start
            };
        } else {
            head_truncation = false;
        }

        if head_end > head_start {
            head_start = skip_white(sentence, head_start);
        }
    }

    // Materialize the field slices (empty string for empty ranges).
    let before_text = if before_len > 0 {
        &sentence[before_start..before_end]
    } else {
        ""
    };
    let keyafter_text = if keyafter_end > keyafter_start {
        &sentence[keyafter_start..keyafter_end]
    } else {
        ""
    };
    let tail_text = if has_tail && tail_end > tail_start {
        &sentence[tail_start..tail_end]
    } else {
        ""
    };
    let head_text = if has_head && head_end > head_start {
        &sentence[head_start..head_end]
    } else {
        ""
    };

    // keyword/after split keyafter for the TeX writer, which formats the
    // keyword separately from its trailing context.
    let keyword_text = &sentence[word_start..word_start + keyword_len];
    let after_start = word_start + keyword_len;
    let after_text = if keyafter_end > after_start {
        &sentence[after_start..keyafter_end]
    } else {
        ""
    };

    LayoutFields {
        tail: tail_text,
        before: before_text,
        keyafter: keyafter_text,
        keyword: keyword_text,
        after: after_text,
        head: head_text,
        tail_truncated: tail_truncation,
        before_truncated: before_truncation,
        keyafter_truncated: keyafter_truncation,
        head_truncated: head_truncation,
    }
}
477
/// Write one entry in plain-text format: an optional left reference
/// column, then `tail  before  keyafter  head` padded so the keyword
/// column lines up across entries, then an optional right-hand reference.
///
/// The width arithmetic mirrors `compute_layout`'s budgets; all widths are
/// measured in bytes.
fn write_plain<W: Write>(
    out: &mut W,
    ref_str: &str,
    config: &PtxConfig,
    layout: &LayoutFields<'_>,
    ref_max_width: usize,
) -> io::Result<()> {
    let total_width = config.width;
    let gap = config.gap_size;
    // Truncation marker, "/" unless configured otherwise.
    let trunc_str = config.flag_truncation.as_deref().unwrap_or("/");
    let trunc_len = trunc_str.len();

    // Same left-reference reservation as compute_layout.
    let ref_width = if ref_str.is_empty() || config.right_reference {
        0
    } else {
        ref_max_width + gap
    };

    let line_width = if total_width > ref_width {
        total_width - ref_width
    } else {
        total_width
    };

    let half_line_width = line_width / 2;

    // Marker widths only count when the corresponding side was truncated.
    let before_trunc_len = if layout.before_truncated {
        trunc_len
    } else {
        0
    };
    let keyafter_trunc_len = if layout.keyafter_truncated {
        trunc_len
    } else {
        0
    };
    let tail_trunc_len = if layout.tail_truncated { trunc_len } else { 0 };
    let head_trunc_len = if layout.head_truncated { trunc_len } else { 0 };

    // Left-hand reference column; auto references get a ":" suffix.
    if !config.right_reference {
        if !ref_str.is_empty() && config.auto_reference {
            out.write_all(ref_str.as_bytes())?;
            out.write_all(b":")?;
            let ref_total = ref_str.len() + 1;
            let ref_pad_total = ref_max_width + gap;
            write_spaces(out, ref_pad_total.saturating_sub(ref_total))?;
        } else if !ref_str.is_empty() {
            out.write_all(ref_str.as_bytes())?;
            let ref_pad_total = ref_max_width + gap;
            write_spaces(out, ref_pad_total.saturating_sub(ref_str.len()))?;
        } else {
            write_spaces(out, gap)?;
        }
    }

    // Left half: tail (if any) at the far left, padding, then the
    // right-aligned "before" context.
    if !layout.tail.is_empty() {
        out.write_all(layout.tail.as_bytes())?;
        if layout.tail_truncated {
            out.write_all(trunc_str.as_bytes())?;
        }
        let tail_used = layout.tail.len() + tail_trunc_len;
        let before_used = layout.before.len() + before_trunc_len;
        let padding = half_line_width
            .saturating_sub(gap)
            .saturating_sub(tail_used)
            .saturating_sub(before_used);
        write_spaces(out, padding)?;
    } else {
        let before_used = layout.before.len() + before_trunc_len;
        let padding = half_line_width
            .saturating_sub(gap)
            .saturating_sub(before_used);
        write_spaces(out, padding)?;
    }

    // Left-truncation marker sits in front of the text.
    if layout.before_truncated {
        out.write_all(trunc_str.as_bytes())?;
    }
    out.write_all(layout.before.as_bytes())?;

    // Fixed gap separating the halves; anchors the keyword column.
    write_spaces(out, gap)?;

    out.write_all(layout.keyafter.as_bytes())?;
    if layout.keyafter_truncated {
        out.write_all(trunc_str.as_bytes())?;
    }

    // Right half: pad out to place the wrapped "head", or to right-align
    // the reference when it goes on the right.
    if !layout.head.is_empty() {
        let keyafter_used = layout.keyafter.len() + keyafter_trunc_len;
        let head_used = layout.head.len() + head_trunc_len;
        let padding = half_line_width
            .saturating_sub(keyafter_used)
            .saturating_sub(head_used);
        write_spaces(out, padding)?;
        if layout.head_truncated {
            out.write_all(trunc_str.as_bytes())?;
        }
        out.write_all(layout.head.as_bytes())?;
    } else if !ref_str.is_empty() && config.right_reference {
        let keyafter_used = layout.keyafter.len() + keyafter_trunc_len;
        let padding = half_line_width.saturating_sub(keyafter_used);
        write_spaces(out, padding)?;
    }

    // Right-hand reference, used instead of the left column.
    if !ref_str.is_empty() && config.right_reference {
        write_spaces(out, gap)?;
        out.write_all(ref_str.as_bytes())?;
    }

    out.write_all(b"\n")
}
595
/// Escape backslashes and double quotes for inclusion inside a quoted roff
/// macro argument.
fn escape_roff(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '\\' => escaped.push_str("\\\\"),
            '"' => escaped.push_str("\\\""),
            other => escaped.push(other),
        }
    }
    escaped
}
600
601fn write_roff<W: Write>(
603 out: &mut W,
604 ref_str: &str,
605 config: &PtxConfig,
606 layout: &LayoutFields<'_>,
607 escaped_trunc: &str,
608) -> io::Result<()> {
609 let macro_name = config.macro_name.as_deref().unwrap_or("xx");
610
611 out.write_all(b".")?;
612 out.write_all(macro_name.as_bytes())?;
613
614 out.write_all(b" \"")?;
616 out.write_all(escape_roff(layout.tail).as_bytes())?;
617 if layout.tail_truncated {
618 out.write_all(escaped_trunc.as_bytes())?;
619 }
620
621 out.write_all(b"\" \"")?;
623 if layout.before_truncated {
624 out.write_all(escaped_trunc.as_bytes())?;
625 }
626 out.write_all(escape_roff(layout.before).as_bytes())?;
627
628 out.write_all(b"\" \"")?;
630 out.write_all(escape_roff(layout.keyafter).as_bytes())?;
631 if layout.keyafter_truncated {
632 out.write_all(escaped_trunc.as_bytes())?;
633 }
634
635 out.write_all(b"\" \"")?;
637 if layout.head_truncated {
638 out.write_all(escaped_trunc.as_bytes())?;
639 }
640 out.write_all(escape_roff(layout.head).as_bytes())?;
641 out.write_all(b"\"")?;
642
643 if !ref_str.is_empty() {
645 out.write_all(b" \"")?;
646 out.write_all(escape_roff(ref_str).as_bytes())?;
647 out.write_all(b"\"")?;
648 }
649
650 out.write_all(b"\n")
651}
652
/// Escape TeX special characters so `s` can appear inside a macro argument.
/// Backslash becomes `\backslash `, `^`/`~` take empty-group accents, and
/// the remaining specials get a simple backslash prefix.
fn escape_tex(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '\\' => out.push_str("\\backslash "),
            '^' => out.push_str("\\^{}"),
            '~' => out.push_str("\\~{}"),
            '{' | '}' | '$' | '&' | '#' | '_' | '%' => {
                out.push('\\');
                out.push(ch);
            }
            other => out.push(other),
        }
    }
    out
}
673
674fn write_tex<W: Write>(
676 out: &mut W,
677 ref_str: &str,
678 config: &PtxConfig,
679 layout: &LayoutFields<'_>,
680) -> io::Result<()> {
681 let macro_name = config.macro_name.as_deref().unwrap_or("xx");
682
683 out.write_all(b"\\")?;
684 out.write_all(macro_name.as_bytes())?;
685 out.write_all(b" {")?;
686 out.write_all(escape_tex(layout.tail).as_bytes())?;
687 out.write_all(b"}{")?;
688 out.write_all(escape_tex(layout.before).as_bytes())?;
689 out.write_all(b"}{")?;
690 out.write_all(escape_tex(layout.keyword).as_bytes())?;
691 out.write_all(b"}{")?;
692 out.write_all(escape_tex(layout.after).as_bytes())?;
693 out.write_all(b"}{")?;
694 out.write_all(escape_tex(layout.head).as_bytes())?;
695 out.write_all(b"}")?;
696
697 if !ref_str.is_empty() {
698 out.write_all(b"{")?;
699 out.write_all(escape_tex(ref_str).as_bytes())?;
700 out.write_all(b"}")?;
701 }
702
703 out.write_all(b"\n")
704}
705
706fn process_lines_into_contexts(
708 content: &str,
709 filename: Option<&str>,
710 config: &PtxConfig,
711 lines_out: &mut Vec<(String, String)>,
712 global_line_num: &mut usize,
713) {
714 let mut current_text = String::new();
715 let mut context_ref = String::new();
716 let mut first_line_of_context = true;
717
718 for line in content.lines() {
719 *global_line_num += 1;
720
721 let reference = if config.auto_reference {
722 match filename {
723 Some(name) => format!("{}:{}", name, global_line_num),
724 None => format!("{}", global_line_num),
725 }
726 } else {
727 String::new()
728 };
729
730 if first_line_of_context {
731 context_ref = reference;
732 first_line_of_context = false;
733 }
734
735 if !current_text.is_empty() {
736 current_text.push(' ');
737 }
738 current_text.push_str(line);
739
740 let trimmed = line.trim_end();
741 let ends_with_terminator =
742 trimmed.ends_with('.') || trimmed.ends_with('?') || trimmed.ends_with('!');
743
744 if ends_with_terminator || line.is_empty() {
745 if !current_text.trim().is_empty() {
746 lines_out.push((context_ref.clone(), current_text.clone()));
747 }
748 current_text.clear();
749 first_line_of_context = true;
750 }
751 }
752
753 if !current_text.trim().is_empty() {
754 lines_out.push((context_ref.clone(), current_text.clone()));
755 }
756}
757
758fn format_and_write<W: Write>(
759 lines: &[(String, String)],
760 output: &mut W,
761 config: &PtxConfig,
762) -> io::Result<()> {
763 let norm = NormalizedSets::new(config);
764 let (entries, max_word_length) = generate_entries(lines, config, &norm);
765
766 let ref_max_width = if config.auto_reference || config.references {
768 entries
769 .iter()
770 .map(|e| lines[e.line_idx as usize].0.len())
771 .max()
772 .unwrap_or(0)
773 } else {
774 0
775 };
776
777 let escaped_trunc = if config.format == OutputFormat::Roff {
779 escape_roff(config.flag_truncation.as_deref().unwrap_or("/"))
780 } else {
781 String::new()
782 };
783
784 for entry in &entries {
785 let line_data = &lines[entry.line_idx as usize];
786 let ref_str = if config.auto_reference || config.references {
787 &line_data.0
788 } else {
789 ""
790 };
791 let sentence = &line_data.1;
792 let word_start = entry.word_start as usize;
793 let keyword_len = entry.word_len as usize;
794
795 let layout = compute_layout(
796 sentence,
797 word_start,
798 keyword_len,
799 ref_str,
800 config,
801 max_word_length,
802 ref_max_width,
803 );
804
805 match config.format {
806 OutputFormat::Plain => write_plain(output, ref_str, config, &layout, ref_max_width)?,
807 OutputFormat::Roff => {
808 write_roff(output, ref_str, config, &layout, &escaped_trunc)?;
809 }
810 OutputFormat::Tex => write_tex(output, ref_str, config, &layout)?,
811 }
812 }
813
814 Ok(())
815}
816
817pub fn generate_ptx<R: BufRead, W: Write>(
819 mut input: R,
820 output: &mut W,
821 config: &PtxConfig,
822) -> io::Result<()> {
823 let mut content = String::new();
824 input.read_to_string(&mut content)?;
825
826 let mut lines: Vec<(String, String)> = Vec::new();
827 let mut global_line_num = 0usize;
828 process_lines_into_contexts(&content, None, config, &mut lines, &mut global_line_num);
829
830 format_and_write(&lines, output, config)
831}
832
833pub fn generate_ptx_multi<W: Write>(
835 file_contents: &[(Option<String>, String)],
836 output: &mut W,
837 config: &PtxConfig,
838) -> io::Result<()> {
839 let mut lines: Vec<(String, String)> = Vec::new();
840 let mut global_line_num = 0usize;
841
842 for (filename, content) in file_contents {
843 process_lines_into_contexts(
844 content,
845 filename.as_deref(),
846 config,
847 &mut lines,
848 &mut global_line_num,
849 );
850 }
851
852 format_and_write(&lines, output, config)
853}
854
/// Read a word-list file (one word per line, surrounding whitespace
/// trimmed, blank lines skipped) into a set.
pub fn read_word_file(path: &str) -> io::Result<HashSet<String>> {
    let text = std::fs::read_to_string(path)?;
    let mut words = HashSet::new();
    for line in text.lines() {
        let word = line.trim();
        if !word.is_empty() {
            words.insert(word.to_string());
        }
    }
    Ok(words)
}