Skip to main content

coreutils_rs/ptx/
core.rs

1use std::collections::HashSet;
2use std::io::{self, BufRead, Write};
3
/// Output format for ptx.
///
/// Selects which formatter renders each index entry. Note that
/// `PtxConfig::default()` selects [`OutputFormat::Plain`].
#[derive(Clone, Debug, PartialEq)]
pub enum OutputFormat {
    /// roff macro calls of the form
    /// `.xx "tail" "before" "keyafter" "head" ["reference"]`.
    Roff,
    /// TeX macro calls of the form
    /// `\xx {tail}{before}{keyword}{after}{head}[{reference}]`.
    Tex,
    /// Dumb terminal / plain text format: the fields are laid out in
    /// space-padded columns on a single line.
    Plain,
}
14
/// Configuration for the ptx command.
#[derive(Clone, Debug)]
pub struct PtxConfig {
    /// Total width of one output line; the per-field budgets used by the
    /// layout code are derived from it.
    pub width: usize,
    /// Fold case when building sort keys and when matching words against
    /// `ignore_words` / `only_words`.
    pub ignore_case: bool,
    /// Generate a reference for every context from its line number
    /// (`file:line` when a file name is known, otherwise just the number).
    pub auto_reference: bool,
    /// NOTE(review): not read by the code visible here — presumably selects
    /// traditional (non-GNU) behaviour; confirm against the caller.
    pub traditional: bool,
    /// Which formatter renders the entries.
    pub format: OutputFormat,
    /// Words that are never indexed. Only consulted when `only_words` is
    /// `None`.
    pub ignore_words: HashSet<String>,
    /// When `Some`, only the listed words are indexed and `ignore_words` is
    /// not consulted.
    pub only_words: Option<HashSet<String>>,
    /// When set, an entry's reference is displayed the same way as with
    /// `auto_reference`.
    /// NOTE(review): the code visible here only fills in references when
    /// `auto_reference` is set — confirm how this mode obtains them.
    pub references: bool,
    /// Number of spaces separating output fields.
    pub gap_size: usize,
    /// Put the reference at the right end of the line instead of the left
    /// (plain format).
    pub right_reference: bool,
    /// NOTE(review): not read by the code visible here — presumably a custom
    /// end-of-sentence pattern; confirm.
    pub sentence_regexp: Option<String>,
    /// NOTE(review): not read by the code visible here — presumably a custom
    /// keyword pattern; confirm.
    pub word_regexp: Option<String>,
    /// String used to flag a truncated field; `"/"` is used when `None`.
    pub flag_truncation: Option<String>,
    /// Macro name emitted in roff and TeX output; `"xx"` is used when `None`.
    pub macro_name: Option<String>,
}
33
34impl Default for PtxConfig {
35    fn default() -> Self {
36        Self {
37            width: 72,
38            ignore_case: false,
39            auto_reference: false,
40            traditional: false,
41            format: OutputFormat::Plain,
42            ignore_words: HashSet::new(),
43            only_words: None,
44            references: false,
45            gap_size: 3,
46            right_reference: false,
47            sentence_regexp: None,
48            word_regexp: None,
49            flag_truncation: None,
50            macro_name: None,
51        }
52    }
53}
54
/// A single KWIC (Key Word In Context) entry: one occurrence of an indexed
/// word together with the context it was found in.
#[derive(Clone, Debug)]
struct KwicEntry {
    /// Reference shown next to the entry (`file:line` or a line number);
    /// empty when no reference was generated for the context.
    reference: String,
    /// The full text of the context the keyword occurs in (an owned copy
    /// per entry).
    full_line: String,
    /// Byte offset of the keyword within `full_line`.
    word_start: usize,
    /// The keyword itself, exactly as it appears in the input.
    keyword: String,
    /// Sort key: the keyword, lowercased when case-insensitive sorting is
    /// requested and unchanged otherwise.
    sort_key: String,
}
69
/// Computed layout fields for a KWIC entry.
///
/// These correspond to the four display regions in GNU ptx output:
///   Left half:  [tail] ... [before]
///   Gap
///   Right half: [keyafter] ... [head]
///
/// For roff:  .xx "tail" "before" "keyafter" "head" ["reference"]
/// For TeX:   \xx {tail}{before}{keyword}{after}{head} [{reference}]
struct LayoutFields {
    /// Text that follows `keyafter` in the context, shown at the start of
    /// the left half. Empty when there is no room for it.
    tail: String,
    /// Context immediately preceding the keyword.
    before: String,
    /// The keyword together with the context that follows it.
    keyafter: String,
    /// The keyword alone (emitted as its own field in TeX output).
    keyword: String,
    /// The part of `keyafter` that follows the keyword (TeX output).
    after: String,
    /// Text that precedes `before` in the context, shown at the end of the
    /// right half. Empty when there is no room for it.
    head: String,
    /// `tail` stops before the end of the context.
    tail_truncated: bool,
    /// Context to the left of `before` was dropped and no `head` shows it.
    before_truncated: bool,
    /// `keyafter` stops before the end of the context and no `tail`
    /// continues it.
    keyafter_truncated: bool,
    /// Context to the left of `head` was dropped.
    head_truncated: bool,
}
91
/// Splits `line` into indexable words.
///
/// A word is an ASCII letter followed by any number of ASCII letters or
/// digits (effectively `[a-zA-Z][a-zA-Z0-9]*`). Everything else —
/// underscores, punctuation, non-ASCII bytes, and digits that do not follow
/// a letter — separates words.
///
/// Returns each word paired with its starting byte offset in `line`, in
/// order of appearance.
fn extract_words(line: &str) -> Vec<(usize, &str)> {
    let raw = line.as_bytes();
    let mut found = Vec::new();
    let mut pos = 0;

    // Jump straight to the next letter, then measure the alphanumeric run
    // that starts there.
    while let Some(offset) = raw[pos..].iter().position(|b| b.is_ascii_alphabetic()) {
        let begin = pos + offset;
        let run = raw[begin..]
            .iter()
            .take_while(|b| b.is_ascii_alphanumeric())
            .count();
        let end = begin + run;
        found.push((begin, &line[begin..end]));
        pos = end;
    }

    found
}
121
122/// Check if a word should be indexed.
123fn should_index(word: &str, config: &PtxConfig) -> bool {
124    let check_word = if config.ignore_case {
125        word.to_lowercase()
126    } else {
127        word.to_string()
128    };
129
130    // If only_words is set, the word must be in that set
131    if let Some(ref only) = config.only_words {
132        if config.ignore_case {
133            return only.iter().any(|w| w.to_lowercase() == check_word);
134        }
135        return only.contains(&check_word);
136    }
137
138    // Otherwise, word must not be in ignore list
139    if config.ignore_case {
140        !config
141            .ignore_words
142            .iter()
143            .any(|w| w.to_lowercase() == check_word)
144    } else {
145        !config.ignore_words.contains(&check_word)
146    }
147}
148
149/// Generate KWIC entries from input lines.
150fn generate_entries(lines: &[(String, String)], config: &PtxConfig) -> Vec<KwicEntry> {
151    let mut entries = Vec::new();
152
153    for (reference, line) in lines {
154        let words = extract_words(line);
155
156        for &(word_start, word) in &words {
157            if !should_index(word, config) {
158                continue;
159            }
160
161            let sort_key = if config.ignore_case {
162                word.to_lowercase()
163            } else {
164                word.to_string()
165            };
166
167            entries.push(KwicEntry {
168                reference: reference.clone(),
169                full_line: line.clone(),
170                word_start,
171                keyword: word.to_string(),
172                sort_key,
173            });
174        }
175    }
176
177    // Sort by keyword (case-insensitive if requested), then by reference
178    entries.sort_by(|a, b| {
179        a.sort_key
180            .cmp(&b.sort_key)
181            .then_with(|| a.reference.cmp(&b.reference))
182    });
183
184    entries
185}
186
/// Advance past one "word" (consecutive word chars) or one non-word char.
/// Returns the new position after skipping.
///
/// A "word" here matches the default GNU ptx word definition: starts with
/// an ASCII letter, continues with ASCII letters or digits.
///
/// `pos` is a byte offset into `s`. If `pos` is at or past the end of `s`
/// it is returned unchanged. Otherwise the result is always greater than
/// `pos` and always lies on a UTF-8 character boundary, even when `pos`
/// itself points into the middle of a multi-byte character, so callers can
/// safely slice `s` at the returned offset.
fn skip_something(s: &str, pos: usize) -> usize {
    if pos >= s.len() {
        return pos;
    }
    let bytes = s.as_bytes();
    let mut p = pos + 1;
    if bytes[pos].is_ascii_alphabetic() {
        // Skip a word: letter followed by alphanumeric chars.
        while p < s.len() && bytes[p].is_ascii_alphanumeric() {
            p += 1;
        }
    } else {
        // Skip one non-word character. It may occupy several bytes, so keep
        // going until the next character boundary instead of stopping
        // inside the character.
        while p < s.len() && !s.is_char_boundary(p) {
            p += 1;
        }
    }
    p
}
209
/// Returns the offset of the first byte at or after `pos` that is not ASCII
/// whitespace, or the length of `s` if only whitespace remains.
/// A `pos` beyond the end of `s` is returned unchanged.
fn skip_white(s: &str, pos: usize) -> usize {
    match s.as_bytes().get(pos..) {
        Some(rest) => {
            let blanks = rest.iter().take_while(|b| b.is_ascii_whitespace()).count();
            pos + blanks
        }
        None => pos,
    }
}
219
/// Moves an exclusive end offset `pos` backwards over ASCII whitespace,
/// never going below `start`. Returns the adjusted end offset.
fn skip_white_backwards(s: &str, pos: usize, start: usize) -> usize {
    if pos <= start {
        return pos;
    }
    let trailing_blanks = s.as_bytes()[start..pos]
        .iter()
        .rev()
        .take_while(|b| b.is_ascii_whitespace())
        .count();
    pos - trailing_blanks
}
229
230/// Compute the layout fields for a KWIC entry using the GNU ptx algorithm.
231///
232/// This computes the four display regions (tail, before, keyafter, head)
233/// that are used by all three output formats (plain, roff, TeX).
234fn compute_layout(
235    entry: &KwicEntry,
236    config: &PtxConfig,
237    max_word_length: usize,
238    ref_max_width: usize,
239) -> LayoutFields {
240    let ref_str = if config.auto_reference || config.references {
241        &entry.reference
242    } else {
243        ""
244    };
245
246    let total_width = config.width;
247    let gap = config.gap_size;
248    let trunc_len = 1; // "/" is 1 char
249
250    // Calculate available line width (subtract reference if on the left)
251    let ref_width = if ref_str.is_empty() || config.right_reference {
252        0
253    } else {
254        ref_max_width + gap
255    };
256
257    let line_width = if total_width > ref_width {
258        total_width - ref_width
259    } else {
260        total_width
261    };
262
263    let half_line_width = line_width / 2;
264
265    // GNU ptx: before_max_width = half_line_width - gap_size - 2 * trunc_len
266    // keyafter_max_width = half_line_width - 2 * trunc_len
267    let before_max_width = if half_line_width > gap + 2 * trunc_len {
268        half_line_width - gap - 2 * trunc_len
269    } else {
270        0
271    };
272    let keyafter_max_width = if half_line_width > 2 * trunc_len {
273        half_line_width - 2 * trunc_len
274    } else {
275        0
276    };
277
278    let sentence = &entry.full_line;
279    let word_start = entry.word_start;
280    let keyword_len = entry.keyword.len();
281    let line_len = sentence.len();
282
283    // ========== Step 1: Compute keyafter ==========
284    let keyafter_start = word_start;
285    let mut keyafter_end = word_start + keyword_len;
286    {
287        let mut cursor = keyafter_end;
288        while cursor < line_len && cursor <= keyafter_start + keyafter_max_width {
289            keyafter_end = cursor;
290            cursor = skip_something(sentence, cursor);
291        }
292        if cursor <= keyafter_start + keyafter_max_width {
293            keyafter_end = cursor;
294        }
295    }
296    let mut keyafter_truncation = keyafter_end < line_len;
297    keyafter_end = skip_white_backwards(sentence, keyafter_end, keyafter_start);
298
299    // ========== Compute left_field_start ==========
300    let left_context_start: usize = 0;
301    let left_field_start = if word_start > half_line_width + max_word_length {
302        let mut lfs = word_start - (half_line_width + max_word_length);
303        lfs = skip_something(sentence, lfs);
304        lfs
305    } else {
306        left_context_start
307    };
308
309    // ========== Step 2: Compute before ==========
310    let mut before_start: usize = left_field_start;
311    let mut before_end = keyafter_start;
312    before_end = skip_white_backwards(sentence, before_end, before_start);
313
314    while before_start + before_max_width < before_end {
315        before_start = skip_something(sentence, before_start);
316    }
317
318    let mut before_truncation = {
319        let cursor = skip_white_backwards(sentence, before_start, 0);
320        cursor > left_context_start
321    };
322
323    before_start = skip_white(sentence, before_start);
324    let before_len = if before_end > before_start {
325        before_end - before_start
326    } else {
327        0
328    };
329
330    // ========== Step 3: Compute tail ==========
331    let tail_max_width_raw: isize = before_max_width as isize - before_len as isize - gap as isize;
332    let mut tail_start: usize = 0;
333    let mut tail_end: usize = 0;
334    let mut tail_truncation = false;
335    let mut has_tail = false;
336
337    if tail_max_width_raw > 0 {
338        let tail_max_width = tail_max_width_raw as usize;
339        tail_start = skip_white(sentence, keyafter_end);
340        tail_end = tail_start;
341        let mut cursor = tail_end;
342        while cursor < line_len && cursor < tail_start + tail_max_width {
343            tail_end = cursor;
344            cursor = skip_something(sentence, cursor);
345        }
346        if cursor < tail_start + tail_max_width {
347            tail_end = cursor;
348        }
349
350        if tail_end > tail_start {
351            has_tail = true;
352            keyafter_truncation = false;
353            tail_truncation = tail_end < line_len;
354        } else {
355            tail_truncation = false;
356        }
357
358        tail_end = skip_white_backwards(sentence, tail_end, tail_start);
359    }
360
361    // ========== Step 4: Compute head ==========
362    let keyafter_len = if keyafter_end > keyafter_start {
363        keyafter_end - keyafter_start
364    } else {
365        0
366    };
367    let head_max_width_raw: isize =
368        keyafter_max_width as isize - keyafter_len as isize - gap as isize;
369    let mut head_start: usize = 0;
370    let mut head_end: usize = 0;
371    let mut head_truncation = false;
372    let mut has_head = false;
373
374    if head_max_width_raw > 0 {
375        let head_max_width = head_max_width_raw as usize;
376        head_end = skip_white_backwards(sentence, before_start, 0);
377
378        head_start = left_field_start;
379        while head_start + head_max_width < head_end {
380            head_start = skip_something(sentence, head_start);
381        }
382
383        if head_end > head_start {
384            has_head = true;
385            before_truncation = false;
386            head_truncation = {
387                let cursor = skip_white_backwards(sentence, head_start, 0);
388                cursor > left_context_start
389            };
390        } else {
391            head_truncation = false;
392        }
393
394        if head_end > head_start {
395            head_start = skip_white(sentence, head_start);
396        }
397    }
398
399    // ========== Extract text fields ==========
400    let before_text = if before_len > 0 {
401        &sentence[before_start..before_end]
402    } else {
403        ""
404    };
405    let keyafter_text = if keyafter_end > keyafter_start {
406        &sentence[keyafter_start..keyafter_end]
407    } else {
408        ""
409    };
410    let tail_text = if has_tail && tail_end > tail_start {
411        &sentence[tail_start..tail_end]
412    } else {
413        ""
414    };
415    let head_text = if has_head && head_end > head_start {
416        &sentence[head_start..head_end]
417    } else {
418        ""
419    };
420
421    // Extract keyword and after separately (for TeX format)
422    let keyword_text = &entry.keyword;
423    let after_start = keyafter_start + keyword_len;
424    let after_text = if keyafter_end > after_start {
425        &sentence[after_start..keyafter_end]
426    } else {
427        ""
428    };
429
430    LayoutFields {
431        tail: tail_text.to_string(),
432        before: before_text.to_string(),
433        keyafter: keyafter_text.to_string(),
434        keyword: keyword_text.to_string(),
435        after: after_text.to_string(),
436        head: head_text.to_string(),
437        tail_truncated: tail_truncation,
438        before_truncated: before_truncation,
439        keyafter_truncated: keyafter_truncation,
440        head_truncated: head_truncation,
441    }
442}
443
444/// Format a KWIC entry for plain text output.
445fn format_plain(
446    entry: &KwicEntry,
447    config: &PtxConfig,
448    layout: &LayoutFields,
449    ref_max_width: usize,
450) -> String {
451    let ref_str = if config.auto_reference || config.references {
452        &entry.reference
453    } else {
454        ""
455    };
456
457    let total_width = config.width;
458    let gap = config.gap_size;
459    let trunc_str = config.flag_truncation.as_deref().unwrap_or("/");
460    let trunc_len = trunc_str.len();
461
462    let ref_width = if ref_str.is_empty() || config.right_reference {
463        0
464    } else {
465        ref_max_width + gap
466    };
467
468    let line_width = if total_width > ref_width {
469        total_width - ref_width
470    } else {
471        total_width
472    };
473
474    let half_line_width = line_width / 2;
475
476    let before_trunc_len = if layout.before_truncated {
477        trunc_len
478    } else {
479        0
480    };
481    let keyafter_trunc_len = if layout.keyafter_truncated {
482        trunc_len
483    } else {
484        0
485    };
486    let tail_trunc_len = if layout.tail_truncated { trunc_len } else { 0 };
487    let head_trunc_len = if layout.head_truncated { trunc_len } else { 0 };
488
489    let mut result = String::with_capacity(total_width + 10);
490
491    // Reference prefix (if not right_reference)
492    if !config.right_reference {
493        if !ref_str.is_empty() && config.auto_reference {
494            result.push_str(ref_str);
495            result.push(':');
496            let ref_total = ref_str.len() + 1;
497            let ref_pad_total = ref_max_width + gap;
498            let padding = ref_pad_total.saturating_sub(ref_total);
499            for _ in 0..padding {
500                result.push(' ');
501            }
502        } else if !ref_str.is_empty() {
503            result.push_str(ref_str);
504            let ref_pad_total = ref_max_width + gap;
505            let padding = ref_pad_total.saturating_sub(ref_str.len());
506            for _ in 0..padding {
507                result.push(' ');
508            }
509        } else {
510            for _ in 0..gap {
511                result.push(' ');
512            }
513        }
514    }
515
516    // Left half: [tail][tail_trunc] ... padding ... [before_trunc][before]
517    if !layout.tail.is_empty() {
518        result.push_str(&layout.tail);
519        if layout.tail_truncated {
520            result.push_str(trunc_str);
521        }
522        let tail_used = layout.tail.len() + tail_trunc_len;
523        let before_used = layout.before.len() + before_trunc_len;
524        let padding = half_line_width
525            .saturating_sub(gap)
526            .saturating_sub(tail_used)
527            .saturating_sub(before_used);
528        for _ in 0..padding {
529            result.push(' ');
530        }
531    } else {
532        let before_used = layout.before.len() + before_trunc_len;
533        let padding = half_line_width
534            .saturating_sub(gap)
535            .saturating_sub(before_used);
536        for _ in 0..padding {
537            result.push(' ');
538        }
539    }
540
541    if layout.before_truncated {
542        result.push_str(trunc_str);
543    }
544    result.push_str(&layout.before);
545
546    // Gap
547    for _ in 0..gap {
548        result.push(' ');
549    }
550
551    // Right half: [keyafter][keyafter_trunc] ... padding ... [head_trunc][head]
552    result.push_str(&layout.keyafter);
553    if layout.keyafter_truncated {
554        result.push_str(trunc_str);
555    }
556
557    if !layout.head.is_empty() {
558        let keyafter_used = layout.keyafter.len() + keyafter_trunc_len;
559        let head_used = layout.head.len() + head_trunc_len;
560        let padding = half_line_width
561            .saturating_sub(keyafter_used)
562            .saturating_sub(head_used);
563        for _ in 0..padding {
564            result.push(' ');
565        }
566        if layout.head_truncated {
567            result.push_str(trunc_str);
568        }
569        result.push_str(&layout.head);
570    } else if !ref_str.is_empty() && config.right_reference {
571        let keyafter_used = layout.keyafter.len() + keyafter_trunc_len;
572        let padding = half_line_width.saturating_sub(keyafter_used);
573        for _ in 0..padding {
574            result.push(' ');
575        }
576    }
577
578    // Reference on the right (if right_reference)
579    if !ref_str.is_empty() && config.right_reference {
580        for _ in 0..gap {
581            result.push(' ');
582        }
583        result.push_str(ref_str);
584    }
585
586    result
587}
588
/// Escape a string for use inside a double-quoted roff macro argument.
///
/// * A double quote is doubled (`"` becomes `""`), which is how roff
///   represents a quote inside a quoted argument. It must not be written
///   as `\"`: that sequence starts a roff comment and would discard the
///   rest of the output line.
/// * A backslash is doubled (`\` becomes `\\`), as before.
///
/// All other characters are copied unchanged.
fn escape_roff(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '\\' => escaped.push_str("\\\\"),
            '"' => escaped.push_str("\"\""),
            other => escaped.push(other),
        }
    }
    escaped
}
593
594/// Format a KWIC entry for roff output.
595///
596/// GNU ptx roff format: .xx "tail" "before" "keyafter" "head" ["reference"]
597/// Truncation flags are embedded in the field text.
598fn format_roff(entry: &KwicEntry, config: &PtxConfig, layout: &LayoutFields) -> String {
599    let ref_str = if config.auto_reference || config.references {
600        &entry.reference
601    } else {
602        ""
603    };
604
605    let trunc_flag = config.flag_truncation.as_deref().unwrap_or("/");
606
607    let macro_name = config.macro_name.as_deref().unwrap_or("xx");
608
609    // Build fields with truncation flags embedded
610    let tail = if layout.tail_truncated {
611        format!("{}{}", layout.tail, trunc_flag)
612    } else {
613        layout.tail.clone()
614    };
615
616    let before = if layout.before_truncated {
617        format!("{}{}", trunc_flag, layout.before)
618    } else {
619        layout.before.clone()
620    };
621
622    let keyafter = if layout.keyafter_truncated {
623        format!("{}{}", layout.keyafter, trunc_flag)
624    } else {
625        layout.keyafter.clone()
626    };
627
628    let head = if layout.head_truncated {
629        format!("{}{}", trunc_flag, layout.head)
630    } else {
631        layout.head.clone()
632    };
633
634    if ref_str.is_empty() {
635        format!(
636            ".{} \"{}\" \"{}\" \"{}\" \"{}\"",
637            macro_name,
638            escape_roff(&tail),
639            escape_roff(&before),
640            escape_roff(&keyafter),
641            escape_roff(&head),
642        )
643    } else {
644        format!(
645            ".{} \"{}\" \"{}\" \"{}\" \"{}\" \"{}\"",
646            macro_name,
647            escape_roff(&tail),
648            escape_roff(&before),
649            escape_roff(&keyafter),
650            escape_roff(&head),
651            escape_roff(ref_str),
652        )
653    }
654}
655
/// Escapes the characters that are special to TeX.
///
/// `\` becomes `\backslash ` (with a trailing space), `^` and `~` become
/// `\^{}` and `\~{}`, and each of `{ } $ & # _ %` is prefixed with a
/// backslash. Every other character is copied unchanged.
fn escape_tex(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        let replacement = match ch {
            '\\' => "\\backslash ",
            '{' => "\\{",
            '}' => "\\}",
            '$' => "\\$",
            '&' => "\\&",
            '#' => "\\#",
            '_' => "\\_",
            '^' => "\\^{}",
            '~' => "\\~{}",
            '%' => "\\%",
            ordinary => {
                escaped.push(ordinary);
                continue;
            }
        };
        escaped.push_str(replacement);
    }
    escaped
}
676
677/// Format a KWIC entry for TeX output.
678///
679/// GNU ptx TeX format: \xx {tail}{before}{keyword}{after}{head} [{reference}]
680/// No truncation flags are used in TeX output.
681fn format_tex(entry: &KwicEntry, config: &PtxConfig, layout: &LayoutFields) -> String {
682    let ref_str = if config.auto_reference || config.references {
683        &entry.reference
684    } else {
685        ""
686    };
687
688    let macro_name = config.macro_name.as_deref().unwrap_or("xx");
689
690    if ref_str.is_empty() {
691        format!(
692            "\\{} {{{}}}{{{}}}{{{}}}{{{}}}{{{}}}",
693            macro_name,
694            escape_tex(&layout.tail),
695            escape_tex(&layout.before),
696            escape_tex(&layout.keyword),
697            escape_tex(&layout.after),
698            escape_tex(&layout.head),
699        )
700    } else {
701        format!(
702            "\\{} {{{}}}{{{}}}{{{}}}{{{}}}{{{}}}{{{}}}",
703            macro_name,
704            escape_tex(&layout.tail),
705            escape_tex(&layout.before),
706            escape_tex(&layout.keyword),
707            escape_tex(&layout.after),
708            escape_tex(&layout.head),
709            escape_tex(ref_str),
710        )
711    }
712}
713
714/// Process lines from a single source, grouping them into sentence contexts.
715///
716/// GNU ptx joins consecutive lines within a single file into one context
717/// unless a line ends with a sentence terminator (`.`, `?`, `!`).
718/// File boundaries always break sentences.
719fn process_lines_into_contexts(
720    content: &str,
721    filename: Option<&str>,
722    config: &PtxConfig,
723    lines_out: &mut Vec<(String, String)>,
724    global_line_num: &mut usize,
725) {
726    let mut current_text = String::new();
727    let mut context_ref = String::new();
728    let mut first_line_of_context = true;
729
730    for line in content.lines() {
731        *global_line_num += 1;
732
733        let reference = if config.auto_reference {
734            match filename {
735                Some(name) => format!("{}:{}", name, global_line_num),
736                None => format!("{}", global_line_num),
737            }
738        } else {
739            String::new()
740        };
741
742        if first_line_of_context {
743            context_ref = reference;
744            first_line_of_context = false;
745        }
746
747        if !current_text.is_empty() {
748            current_text.push(' ');
749        }
750        current_text.push_str(line);
751
752        // Check if line ends with a sentence terminator
753        let trimmed = line.trim_end();
754        let ends_with_terminator =
755            trimmed.ends_with('.') || trimmed.ends_with('?') || trimmed.ends_with('!');
756
757        if ends_with_terminator || line.is_empty() {
758            if !current_text.trim().is_empty() {
759                lines_out.push((context_ref.clone(), current_text.clone()));
760            }
761            current_text.clear();
762            first_line_of_context = true;
763        }
764    }
765
766    // Don't forget any remaining context (lines without terminators at end of file)
767    if !current_text.trim().is_empty() {
768        lines_out.push((context_ref.clone(), current_text.clone()));
769    }
770}
771
772fn format_and_write<W: Write>(
773    lines: &[(String, String)],
774    output: &mut W,
775    config: &PtxConfig,
776) -> io::Result<()> {
777    // Generate KWIC entries
778    let entries = generate_entries(lines, config);
779
780    // Compute maximum word length across all input (needed for left_field_start)
781    let max_word_length = lines
782        .iter()
783        .flat_map(|(_, line)| extract_words(line))
784        .map(|(_, word)| word.len())
785        .max()
786        .unwrap_or(0);
787
788    // Compute maximum reference width (for consistent left-alignment)
789    // Note: do NOT add +1 for auto_reference here; the ":" is handled
790    // in the display formatting (ref_total = ref_str.len() + 1).
791    let ref_max_width = entries.iter().map(|e| e.reference.len()).max().unwrap_or(0);
792
793    // Format and output
794    for entry in &entries {
795        let layout = compute_layout(entry, config, max_word_length, ref_max_width);
796        let formatted = match config.format {
797            OutputFormat::Plain => format_plain(entry, config, &layout, ref_max_width),
798            OutputFormat::Roff => format_roff(entry, config, &layout),
799            OutputFormat::Tex => format_tex(entry, config, &layout),
800        };
801        writeln!(output, "{}", formatted)?;
802    }
803
804    Ok(())
805}
806
807/// Generate a permuted index from input.
808///
809/// Reads lines from `input`, generates KWIC entries for each indexable word,
810/// sorts them, and writes the formatted output to `output`.
811pub fn generate_ptx<R: BufRead, W: Write>(
812    input: R,
813    output: &mut W,
814    config: &PtxConfig,
815) -> io::Result<()> {
816    let mut content = String::new();
817    for line_result in input.lines() {
818        let line = line_result?;
819        content.push_str(&line);
820        content.push('\n');
821    }
822
823    let mut lines: Vec<(String, String)> = Vec::new();
824    let mut global_line_num = 0usize;
825    process_lines_into_contexts(&content, None, config, &mut lines, &mut global_line_num);
826
827    format_and_write(&lines, output, config)
828}
829
830/// Generate a permuted index from multiple named file contents.
831///
832/// Each file's lines are processed independently for sentence grouping
833/// (file boundaries always break sentences), matching GNU ptx behavior.
834/// When auto_reference is enabled, references include the filename.
835pub fn generate_ptx_multi<W: Write>(
836    file_contents: &[(Option<String>, String)],
837    output: &mut W,
838    config: &PtxConfig,
839) -> io::Result<()> {
840    let mut lines: Vec<(String, String)> = Vec::new();
841    let mut global_line_num = 0usize;
842
843    for (filename, content) in file_contents {
844        process_lines_into_contexts(
845            content,
846            filename.as_deref(),
847            config,
848            &mut lines,
849            &mut global_line_num,
850        );
851    }
852
853    format_and_write(&lines, output, config)
854}
855
/// Loads a word list from the file at `path`.
///
/// The file holds one word per line. Surrounding whitespace is trimmed and
/// blank lines are skipped; duplicates collapse because the result is a set.
///
/// # Errors
///
/// Returns the I/O error if the file cannot be read as UTF-8 text.
pub fn read_word_file(path: &str) -> io::Result<HashSet<String>> {
    let raw = std::fs::read_to_string(path)?;

    let mut words = HashSet::new();
    for word in raw.lines().map(str::trim) {
        if !word.is_empty() {
            words.insert(word.to_string());
        }
    }

    Ok(words)
}