1use std::collections::HashSet;
2use std::io::{self, BufRead, Write};
3
4#[derive(Clone, Debug, PartialEq)]
6pub enum OutputFormat {
7 Roff,
9 Tex,
11 Plain,
13}
14
15#[derive(Clone, Debug)]
17pub struct PtxConfig {
18 pub width: usize,
19 pub ignore_case: bool,
20 pub auto_reference: bool,
21 pub traditional: bool,
22 pub format: OutputFormat,
23 pub ignore_words: HashSet<String>,
24 pub only_words: Option<HashSet<String>>,
25 pub references: bool,
26 pub gap_size: usize,
27 pub right_reference: bool,
28 pub sentence_regexp: Option<String>,
29 pub word_regexp: Option<String>,
30}
31
32impl Default for PtxConfig {
33 fn default() -> Self {
34 Self {
35 width: 72,
36 ignore_case: false,
37 auto_reference: false,
38 traditional: false,
39 format: OutputFormat::Plain,
40 ignore_words: HashSet::new(),
41 only_words: None,
42 references: false,
43 gap_size: 3,
44 right_reference: false,
45 sentence_regexp: None,
46 word_regexp: None,
47 }
48 }
49}
50
51#[derive(Clone, Debug)]
53struct KwicEntry {
54 reference: String,
56 full_line: String,
58 word_start: usize,
60 keyword: String,
62 left_context: String,
64 right_context: String,
66 sort_key: String,
68}
69
70fn extract_words(line: &str) -> Vec<(usize, &str)> {
72 let mut words = Vec::new();
73 let mut start = None;
74
75 for (i, ch) in line.char_indices() {
76 if ch.is_alphanumeric() || ch == '_' {
77 if start.is_none() {
78 start = Some(i);
79 }
80 } else if let Some(s) = start {
81 words.push((s, &line[s..i]));
82 start = None;
83 }
84 }
85
86 if let Some(s) = start {
87 words.push((s, &line[s..]));
88 }
89
90 words
91}
92
93fn should_index(word: &str, config: &PtxConfig) -> bool {
95 let check_word = if config.ignore_case {
96 word.to_lowercase()
97 } else {
98 word.to_string()
99 };
100
101 if let Some(ref only) = config.only_words {
103 if config.ignore_case {
104 return only.iter().any(|w| w.to_lowercase() == check_word);
105 }
106 return only.contains(&check_word);
107 }
108
109 if config.ignore_case {
111 !config
112 .ignore_words
113 .iter()
114 .any(|w| w.to_lowercase() == check_word)
115 } else {
116 !config.ignore_words.contains(&check_word)
117 }
118}
119
120fn generate_entries(lines: &[(String, String)], config: &PtxConfig) -> Vec<KwicEntry> {
122 let mut entries = Vec::new();
123
124 for (reference, line) in lines {
125 let words = extract_words(line);
126
127 for &(word_start, word) in &words {
128 if !should_index(word, config) {
129 continue;
130 }
131
132 let word_end = word_start + word.len();
133
134 let left = line[..word_start].trim_end();
136
137 let right = line[word_end..].trim_start();
139
140 let sort_key = if config.ignore_case {
141 word.to_lowercase()
142 } else {
143 word.to_string()
144 };
145
146 entries.push(KwicEntry {
147 reference: reference.clone(),
148 full_line: line.clone(),
149 word_start,
150 keyword: word.to_string(),
151 left_context: left.to_string(),
152 right_context: right.to_string(),
153 sort_key,
154 });
155 }
156 }
157
158 entries.sort_by(|a, b| {
160 a.sort_key
161 .cmp(&b.sort_key)
162 .then_with(|| a.reference.cmp(&b.reference))
163 });
164
165 entries
166}
167
168fn skip_something(s: &str, pos: usize) -> usize {
171 if pos >= s.len() {
172 return pos;
173 }
174 let bytes = s.as_bytes();
175 if bytes[pos].is_ascii_alphanumeric() || bytes[pos] == b'_' {
176 let mut p = pos;
178 while p < s.len() && (bytes[p].is_ascii_alphanumeric() || bytes[p] == b'_') {
179 p += 1;
180 }
181 p
182 } else {
183 pos + 1
185 }
186}
187
188fn skip_white(s: &str, pos: usize) -> usize {
190 let bytes = s.as_bytes();
191 let mut p = pos;
192 while p < s.len() && bytes[p].is_ascii_whitespace() {
193 p += 1;
194 }
195 p
196}
197
198fn skip_white_backwards(s: &str, pos: usize, start: usize) -> usize {
200 let bytes = s.as_bytes();
201 let mut p = pos;
202 while p > start && bytes[p - 1].is_ascii_whitespace() {
203 p -= 1;
204 }
205 p
206}
207
208fn format_plain(
224 entry: &KwicEntry,
225 config: &PtxConfig,
226 max_word_length: usize,
227 ref_max_width: usize,
228) -> String {
229 let ref_str = if config.auto_reference || config.references {
230 &entry.reference
231 } else {
232 ""
233 };
234
235 let total_width = config.width;
236 let gap = config.gap_size;
237 let trunc_str = "/";
238 let trunc_len = trunc_str.len(); let ref_width = if ref_str.is_empty() || config.right_reference {
243 0
244 } else {
245 ref_max_width + gap
246 };
247
248 let line_width = if total_width > ref_width {
249 total_width - ref_width
250 } else {
251 total_width
252 };
253
254 let half_line_width = line_width / 2;
255
256 let before_max_width = if half_line_width > gap + 2 * trunc_len {
259 half_line_width - gap - 2 * trunc_len
260 } else {
261 0
262 };
263 let keyafter_max_width = if half_line_width > 2 * trunc_len {
264 half_line_width - 2 * trunc_len
265 } else {
266 0
267 };
268
269 let sentence = &entry.full_line;
270 let word_start = entry.word_start;
271 let line_len = sentence.len();
272
273 let keyafter_start = word_start;
276 let mut keyafter_end = word_start + entry.keyword.len();
277 {
278 let mut cursor = keyafter_end;
279 while cursor < line_len && cursor <= keyafter_start + keyafter_max_width {
280 keyafter_end = cursor;
281 cursor = skip_something(sentence, cursor);
282 }
283 if cursor <= keyafter_start + keyafter_max_width {
284 keyafter_end = cursor;
285 }
286 }
287 let mut keyafter_truncation = keyafter_end < line_len;
288 keyafter_end = skip_white_backwards(sentence, keyafter_end, keyafter_start);
290
291 let left_context_start: usize = 0; let left_field_start = if word_start > half_line_width + max_word_length {
297 let mut lfs = word_start - (half_line_width + max_word_length);
298 lfs = skip_something(sentence, lfs);
299 lfs
300 } else {
301 left_context_start
302 };
303
304 let mut before_start: usize = left_field_start;
308 let mut before_end = keyafter_start;
309 before_end = skip_white_backwards(sentence, before_end, before_start);
311
312 while before_start + before_max_width < before_end {
314 before_start = skip_something(sentence, before_start);
315 }
316
317 let mut before_truncation = {
319 let cursor = skip_white_backwards(sentence, before_start, 0);
320 cursor > left_context_start
321 };
322
323 before_start = skip_white(sentence, before_start);
325 let before_len = if before_end > before_start {
326 before_end - before_start
327 } else {
328 0
329 };
330
331 let tail_max_width_raw: isize = before_max_width as isize - before_len as isize - gap as isize;
335 let mut tail_start: usize = 0;
336 let mut tail_end: usize = 0;
337 let mut tail_truncation = false;
338 let mut has_tail = false;
339
340 if tail_max_width_raw > 0 {
341 let tail_max_width = tail_max_width_raw as usize;
342 tail_start = skip_white(sentence, keyafter_end);
343 tail_end = tail_start;
344 let mut cursor = tail_end;
345 while cursor < line_len && cursor < tail_start + tail_max_width {
346 tail_end = cursor;
347 cursor = skip_something(sentence, cursor);
348 }
349 if cursor < tail_start + tail_max_width {
350 tail_end = cursor;
351 }
352
353 if tail_end > tail_start {
354 has_tail = true;
355 keyafter_truncation = false; tail_truncation = tail_end < line_len;
357 } else {
358 tail_truncation = false;
359 }
360
361 tail_end = skip_white_backwards(sentence, tail_end, tail_start);
363 }
364
365 let keyafter_len = if keyafter_end > keyafter_start {
369 keyafter_end - keyafter_start
370 } else {
371 0
372 };
373 let head_max_width_raw: isize =
374 keyafter_max_width as isize - keyafter_len as isize - gap as isize;
375 let mut head_start: usize = 0;
376 let mut head_end: usize = 0;
377 let mut head_truncation = false;
378 let mut has_head = false;
379
380 if head_max_width_raw > 0 {
381 let head_max_width = head_max_width_raw as usize;
382 head_end = skip_white_backwards(sentence, before_start, 0);
386
387 head_start = left_field_start;
388 while head_start + head_max_width < head_end {
389 head_start = skip_something(sentence, head_start);
390 }
391
392 if head_end > head_start {
393 has_head = true;
394 before_truncation = false; head_truncation = {
396 let cursor = skip_white_backwards(sentence, head_start, 0);
398 cursor > left_context_start
399 };
400 } else {
401 head_truncation = false;
402 }
403
404 if head_end > head_start {
406 head_start = skip_white(sentence, head_start);
407 }
408 }
409
410 let before_text = if before_len > 0 {
413 &sentence[before_start..before_end]
414 } else {
415 ""
416 };
417 let keyafter_text = if keyafter_end > keyafter_start {
418 &sentence[keyafter_start..keyafter_end]
419 } else {
420 ""
421 };
422 let tail_text = if has_tail && tail_end > tail_start {
423 &sentence[tail_start..tail_end]
424 } else {
425 ""
426 };
427 let head_text = if has_head && head_end > head_start {
428 &sentence[head_start..head_end]
429 } else {
430 ""
431 };
432
433 let before_trunc_len = if before_truncation { trunc_len } else { 0 };
434 let keyafter_trunc_len = if keyafter_truncation { trunc_len } else { 0 };
435 let tail_trunc_len = if tail_truncation { trunc_len } else { 0 };
436 let head_trunc_len = if head_truncation { trunc_len } else { 0 };
437
438 let mut result = String::with_capacity(total_width + 10);
439
440 if !config.right_reference {
444 if !ref_str.is_empty() && config.auto_reference {
445 result.push_str(ref_str);
447 result.push(':');
448 let ref_total = ref_str.len() + 1; let ref_pad_total = ref_max_width + gap; let padding = ref_pad_total.saturating_sub(ref_total);
451 for _ in 0..padding {
452 result.push(' ');
453 }
454 } else if !ref_str.is_empty() {
455 result.push_str(ref_str);
457 let ref_pad_total = ref_max_width + gap;
458 let padding = ref_pad_total.saturating_sub(ref_str.len());
459 for _ in 0..padding {
460 result.push(' ');
461 }
462 } else {
463 for _ in 0..gap {
466 result.push(' ');
467 }
468 }
469 }
470
471 if !tail_text.is_empty() {
473 result.push_str(tail_text);
475 if tail_truncation {
476 result.push_str(trunc_str);
477 }
478 let tail_used = tail_text.len() + tail_trunc_len;
480 let before_used = before_text.len() + before_trunc_len;
481 let padding = half_line_width
482 .saturating_sub(gap)
483 .saturating_sub(tail_used)
484 .saturating_sub(before_used);
485 for _ in 0..padding {
486 result.push(' ');
487 }
488 } else {
489 let before_used = before_text.len() + before_trunc_len;
491 let padding = half_line_width
492 .saturating_sub(gap)
493 .saturating_sub(before_used);
494 for _ in 0..padding {
495 result.push(' ');
496 }
497 }
498
499 if before_truncation {
501 result.push_str(trunc_str);
502 }
503 result.push_str(before_text);
504
505 for _ in 0..gap {
507 result.push(' ');
508 }
509
510 result.push_str(keyafter_text);
512 if keyafter_truncation {
513 result.push_str(trunc_str);
514 }
515
516 if has_head && !head_text.is_empty() {
517 let keyafter_used = keyafter_text.len() + keyafter_trunc_len;
519 let head_used = head_text.len() + head_trunc_len;
520 let padding = half_line_width
521 .saturating_sub(keyafter_used)
522 .saturating_sub(head_used);
523 for _ in 0..padding {
524 result.push(' ');
525 }
526 if head_truncation {
527 result.push_str(trunc_str);
528 }
529 result.push_str(head_text);
530 } else if !ref_str.is_empty() && config.right_reference {
531 let keyafter_used = keyafter_text.len() + keyafter_trunc_len;
533 let padding = half_line_width.saturating_sub(keyafter_used);
534 for _ in 0..padding {
535 result.push(' ');
536 }
537 }
538
539 if !ref_str.is_empty() && config.right_reference {
541 for _ in 0..gap {
542 result.push(' ');
543 }
544 result.push_str(ref_str);
545 }
546
547 result
548}
549
550fn format_roff(entry: &KwicEntry, config: &PtxConfig) -> String {
552 let ref_str = if config.auto_reference || config.references {
553 &entry.reference
554 } else {
555 ""
556 };
557
558 let left = entry
560 .left_context
561 .replace('\\', "\\\\")
562 .replace('"', "\\\"");
563 let keyword = entry.keyword.replace('\\', "\\\\").replace('"', "\\\"");
564 let right = entry
565 .right_context
566 .replace('\\', "\\\\")
567 .replace('"', "\\\"");
568 let reference = ref_str.replace('\\', "\\\\").replace('"', "\\\"");
569
570 format!(
571 ".xx \"{}\" \"{}\" \"{}\" \"{}\"",
572 left, keyword, right, reference
573 )
574}
575
576fn format_tex(entry: &KwicEntry, config: &PtxConfig) -> String {
578 let ref_str = if config.auto_reference || config.references {
579 &entry.reference
580 } else {
581 ""
582 };
583
584 fn escape_tex(s: &str) -> String {
586 let mut result = String::with_capacity(s.len());
587 for ch in s.chars() {
588 match ch {
589 '\\' => result.push_str("\\backslash "),
590 '{' => result.push_str("\\{"),
591 '}' => result.push_str("\\}"),
592 '$' => result.push_str("\\$"),
593 '&' => result.push_str("\\&"),
594 '#' => result.push_str("\\#"),
595 '_' => result.push_str("\\_"),
596 '^' => result.push_str("\\^{}"),
597 '~' => result.push_str("\\~{}"),
598 '%' => result.push_str("\\%"),
599 _ => result.push(ch),
600 }
601 }
602 result
603 }
604
605 format!(
606 "\\xx {{{}}}{{{}}}{{{}}}{{{}}}",
607 escape_tex(&entry.left_context),
608 escape_tex(&entry.keyword),
609 escape_tex(&entry.right_context),
610 escape_tex(ref_str),
611 )
612}
613
614pub fn generate_ptx<R: BufRead, W: Write>(
619 input: R,
620 output: &mut W,
621 config: &PtxConfig,
622) -> io::Result<()> {
623 let mut lines: Vec<(String, String)> = Vec::new();
627 let mut line_num = 0usize;
628 let mut current_text = String::new();
629 let mut context_ref = String::new();
630 let mut first_line_of_context = true;
631
632 for line_result in input.lines() {
633 let line = line_result?;
634 line_num += 1;
635
636 let reference = if config.auto_reference {
637 format!("{}", line_num)
638 } else {
639 String::new()
640 };
641
642 if first_line_of_context {
643 context_ref = reference;
644 first_line_of_context = false;
645 }
646
647 if !current_text.is_empty() {
648 current_text.push(' ');
649 }
650 current_text.push_str(&line);
651
652 let trimmed = line.trim_end();
654 let ends_with_terminator =
655 trimmed.ends_with('.') || trimmed.ends_with('?') || trimmed.ends_with('!');
656
657 if ends_with_terminator || line.is_empty() {
658 if !current_text.trim().is_empty() {
659 lines.push((context_ref.clone(), current_text.clone()));
660 }
661 current_text.clear();
662 first_line_of_context = true;
663 }
664 }
665
666 if !current_text.trim().is_empty() {
668 lines.push((context_ref.clone(), current_text.clone()));
669 }
670
671 let entries = generate_entries(&lines, config);
673
674 let max_word_length = lines
676 .iter()
677 .flat_map(|(_, line)| extract_words(line))
678 .map(|(_, word)| word.len())
679 .max()
680 .unwrap_or(0);
681
682 let ref_max_width = if config.auto_reference {
684 entries.iter().map(|e| e.reference.len()).max().unwrap_or(0) + 1 } else {
688 entries.iter().map(|e| e.reference.len()).max().unwrap_or(0)
689 };
690
691 for entry in &entries {
693 let formatted = match config.format {
694 OutputFormat::Plain => format_plain(entry, config, max_word_length, ref_max_width),
695 OutputFormat::Roff => format_roff(entry, config),
696 OutputFormat::Tex => format_tex(entry, config),
697 };
698 writeln!(output, "{}", formatted)?;
699 }
700
701 Ok(())
702}
703
704pub fn read_word_file(path: &str) -> io::Result<HashSet<String>> {
706 let content = std::fs::read_to_string(path)?;
707 Ok(content
708 .lines()
709 .map(|l| l.trim().to_string())
710 .filter(|l| !l.is_empty())
711 .collect())
712}