Skip to main content

coreutils_rs/ptx/
core.rs

1use std::collections::HashSet;
2use std::io::{self, BufRead, Write};
3
/// Output format for ptx.
///
/// The variant stored in `PtxConfig::format` decides which formatter
/// `generate_ptx` applies to every index entry. `PtxConfig::default()`
/// selects `OutputFormat::Plain`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum OutputFormat {
    /// roff output: one `.xx "…" "…" "…" "…"` macro call per entry.
    Roff,
    /// TeX output: one `\xx {…}{…}{…}{…}` macro call per entry.
    Tex,
    /// Dumb terminal / plain text format (fixed-width columns).
    Plain,
}
14
15/// Configuration for the ptx command.
16#[derive(Clone, Debug)]
17pub struct PtxConfig {
18    pub width: usize,
19    pub ignore_case: bool,
20    pub auto_reference: bool,
21    pub traditional: bool,
22    pub format: OutputFormat,
23    pub ignore_words: HashSet<String>,
24    pub only_words: Option<HashSet<String>>,
25    pub references: bool,
26    pub gap_size: usize,
27    pub right_reference: bool,
28    pub sentence_regexp: Option<String>,
29    pub word_regexp: Option<String>,
30}
31
32impl Default for PtxConfig {
33    fn default() -> Self {
34        Self {
35            width: 72,
36            ignore_case: false,
37            auto_reference: false,
38            traditional: false,
39            format: OutputFormat::Plain,
40            ignore_words: HashSet::new(),
41            only_words: None,
42            references: false,
43            gap_size: 3,
44            right_reference: false,
45            sentence_regexp: None,
46            word_regexp: None,
47        }
48    }
49}
50
/// One KWIC (Key Word In Context) index entry: a keyword together with the
/// text surrounding it on its input line.
#[derive(Debug, Clone)]
struct KwicEntry {
    /// Reference shown next to the entry (the input line number when
    /// auto-referencing is on, otherwise empty).
    reference: String,
    /// Text of the line that precedes the keyword, trailing whitespace removed.
    left_context: String,
    /// The keyword exactly as it appears in the input.
    keyword: String,
    /// Text of the line that follows the keyword, leading whitespace removed.
    right_context: String,
    /// Key the entries are ordered by: the keyword, lowercased only when
    /// case is being ignored.
    sort_key: String,
}
65
/// Split `line` into words.
///
/// A word is a maximal run of alphanumeric characters and underscores.
/// Each result is `(byte_offset_of_word_start, word_slice)`, in the order
/// the words appear in `line`.
fn extract_words(line: &str) -> Vec<(usize, &str)> {
    let is_word_char = |c: char| c.is_alphanumeric() || c == '_';

    let mut found = Vec::new();
    // Byte offset where the word currently being scanned began, if any.
    let mut open: Option<usize> = None;

    for (pos, c) in line.char_indices() {
        match (is_word_char(c), open) {
            // First character of a new word.
            (true, None) => open = Some(pos),
            // A separator closes the word in progress.
            (false, Some(begin)) => {
                found.push((begin, &line[begin..pos]));
                open = None;
            }
            // Still inside a word, or still between words.
            _ => {}
        }
    }

    // A word that runs up to the end of the line has no closing separator.
    if let Some(begin) = open {
        found.push((begin, &line[begin..]));
    }

    found
}
88
89/// Check if a word should be indexed.
90fn should_index(word: &str, config: &PtxConfig) -> bool {
91    let check_word = if config.ignore_case {
92        word.to_lowercase()
93    } else {
94        word.to_string()
95    };
96
97    // If only_words is set, the word must be in that set
98    if let Some(ref only) = config.only_words {
99        if config.ignore_case {
100            return only.iter().any(|w| w.to_lowercase() == check_word);
101        }
102        return only.contains(&check_word);
103    }
104
105    // Otherwise, word must not be in ignore list
106    if config.ignore_case {
107        !config
108            .ignore_words
109            .iter()
110            .any(|w| w.to_lowercase() == check_word)
111    } else {
112        !config.ignore_words.contains(&check_word)
113    }
114}
115
116/// Generate KWIC entries from input lines.
117fn generate_entries(lines: &[(String, String)], config: &PtxConfig) -> Vec<KwicEntry> {
118    let mut entries = Vec::new();
119
120    for (reference, line) in lines {
121        let words = extract_words(line);
122
123        for &(word_start, word) in &words {
124            if !should_index(word, config) {
125                continue;
126            }
127
128            let word_end = word_start + word.len();
129
130            // Left context: text before the keyword
131            let left = line[..word_start].trim_end();
132
133            // Right context: text after the keyword
134            let right = line[word_end..].trim_start();
135
136            let sort_key = if config.ignore_case {
137                word.to_lowercase()
138            } else {
139                word.to_string()
140            };
141
142            entries.push(KwicEntry {
143                reference: reference.clone(),
144                left_context: left.to_string(),
145                keyword: word.to_string(),
146                right_context: right.to_string(),
147                sort_key,
148            });
149        }
150    }
151
152    // Sort by keyword (case-insensitive if requested), then by reference
153    entries.sort_by(|a, b| {
154        a.sort_key
155            .cmp(&b.sort_key)
156            .then_with(|| a.reference.cmp(&b.reference))
157    });
158
159    entries
160}
161
/// Truncate a string from the left to fit within `max_len` characters.
///
/// Returns the longest suffix of `s` that contains at most `max_len`
/// `char`s; `s` itself is returned when it is already short enough.
/// Length is counted in `char`s rather than bytes so that the result agrees
/// with `format!` width padding, which also counts `char`s — a byte-based
/// limit would cut multi-byte text shorter than necessary.
fn truncate_left(s: &str, max_len: usize) -> &str {
    if max_len == 0 {
        return "";
    }
    // The `max_len`-th character counted from the end marks where the
    // retained suffix begins; if there is no such character the whole
    // string already fits.
    match s.char_indices().rev().nth(max_len - 1) {
        Some((start, _)) => &s[start..],
        None => s,
    }
}
175
/// Truncate a string from the right to fit within `max_len` characters.
///
/// Returns the longest prefix of `s` that contains at most `max_len`
/// `char`s; `s` itself is returned when it is already short enough.
/// Length is counted in `char`s rather than bytes so that multi-byte text
/// is not cut shorter than the column it is meant to fill.
fn truncate_right(s: &str, max_len: usize) -> &str {
    // The character at index `max_len` (0-based) is the first one that no
    // longer fits; everything before its byte offset is kept.
    match s.char_indices().nth(max_len) {
        Some((end, _)) => &s[..end],
        None => s,
    }
}
187
188/// Format a KWIC entry for plain text output.
189fn format_plain(entry: &KwicEntry, config: &PtxConfig) -> String {
190    let ref_str = if config.auto_reference || config.references {
191        &entry.reference
192    } else {
193        ""
194    };
195
196    let total_width = config.width;
197    let gap = config.gap_size;
198
199    // Calculate available space
200    let ref_width = if ref_str.is_empty() {
201        0
202    } else {
203        ref_str.len() + gap
204    };
205
206    let available = if total_width > ref_width {
207        total_width - ref_width
208    } else {
209        total_width
210    };
211
212    // Split available space: left context, keyword+right context
213    // Allocate roughly half for left, half for keyword+right
214    let right_half = available / 2;
215    let left_half = available - right_half;
216
217    // Left context (truncated from the left to fit)
218    let left = truncate_left(
219        &entry.left_context,
220        if left_half > gap { left_half - gap } else { 0 },
221    );
222
223    // Right side: keyword + right context
224    let right_text = if entry.right_context.is_empty() {
225        entry.keyword.clone()
226    } else {
227        format!("{} {}", entry.keyword, entry.right_context)
228    };
229    let right = truncate_right(&right_text, right_half);
230
231    if ref_str.is_empty() {
232        format!(
233            "{:>left_w$}{}{}",
234            left,
235            " ".repeat(gap),
236            right,
237            left_w = left_half - gap
238        )
239    } else if config.right_reference {
240        format!(
241            "{:>left_w$}{}{}{}{}",
242            left,
243            " ".repeat(gap),
244            right,
245            " ".repeat(gap),
246            ref_str,
247            left_w = left_half - gap,
248        )
249    } else {
250        format!(
251            "{}{}{:>left_w$}{}{}",
252            ref_str,
253            " ".repeat(gap),
254            left,
255            " ".repeat(gap),
256            right,
257            left_w = left_half - gap,
258        )
259    }
260}
261
262/// Format a KWIC entry for roff output.
263fn format_roff(entry: &KwicEntry, config: &PtxConfig) -> String {
264    let ref_str = if config.auto_reference || config.references {
265        &entry.reference
266    } else {
267        ""
268    };
269
270    // Escape backslashes and quotes for roff
271    let left = entry
272        .left_context
273        .replace('\\', "\\\\")
274        .replace('"', "\\\"");
275    let keyword = entry.keyword.replace('\\', "\\\\").replace('"', "\\\"");
276    let right = entry
277        .right_context
278        .replace('\\', "\\\\")
279        .replace('"', "\\\"");
280    let reference = ref_str.replace('\\', "\\\\").replace('"', "\\\"");
281
282    format!(
283        ".xx \"{}\" \"{}\" \"{}\" \"{}\"",
284        left, keyword, right, reference
285    )
286}
287
288/// Format a KWIC entry for TeX output.
289fn format_tex(entry: &KwicEntry, config: &PtxConfig) -> String {
290    let ref_str = if config.auto_reference || config.references {
291        &entry.reference
292    } else {
293        ""
294    };
295
296    // Escape TeX special characters
297    fn escape_tex(s: &str) -> String {
298        let mut result = String::with_capacity(s.len());
299        for ch in s.chars() {
300            match ch {
301                '\\' => result.push_str("\\backslash "),
302                '{' => result.push_str("\\{"),
303                '}' => result.push_str("\\}"),
304                '$' => result.push_str("\\$"),
305                '&' => result.push_str("\\&"),
306                '#' => result.push_str("\\#"),
307                '_' => result.push_str("\\_"),
308                '^' => result.push_str("\\^{}"),
309                '~' => result.push_str("\\~{}"),
310                '%' => result.push_str("\\%"),
311                _ => result.push(ch),
312            }
313        }
314        result
315    }
316
317    format!(
318        "\\xx {{{}}}{{{}}}{{{}}}{{{}}}",
319        escape_tex(&entry.left_context),
320        escape_tex(&entry.keyword),
321        escape_tex(&entry.right_context),
322        escape_tex(ref_str),
323    )
324}
325
326/// Generate a permuted index from input.
327///
328/// Reads lines from `input`, generates KWIC entries for each indexable word,
329/// sorts them, and writes the formatted output to `output`.
330pub fn generate_ptx<R: BufRead, W: Write>(
331    input: R,
332    output: &mut W,
333    config: &PtxConfig,
334) -> io::Result<()> {
335    // Read all lines with references
336    let mut lines: Vec<(String, String)> = Vec::new();
337    let mut line_num = 0usize;
338
339    for line_result in input.lines() {
340        let line = line_result?;
341        line_num += 1;
342
343        let reference = if config.auto_reference {
344            format!("{}", line_num)
345        } else {
346            String::new()
347        };
348
349        lines.push((reference, line));
350    }
351
352    // Generate KWIC entries
353    let entries = generate_entries(&lines, config);
354
355    // Format and output
356    for entry in &entries {
357        let formatted = match config.format {
358            OutputFormat::Plain => format_plain(entry, config),
359            OutputFormat::Roff => format_roff(entry, config),
360            OutputFormat::Tex => format_tex(entry, config),
361        };
362        writeln!(output, "{}", formatted)?;
363    }
364
365    Ok(())
366}
367
/// Load a word list from the file at `path`.
///
/// The file holds one word per line. Surrounding whitespace is stripped
/// from each line and lines that are empty after stripping are skipped;
/// duplicates collapse because the result is a set.
///
/// # Errors
///
/// Returns the underlying I/O error if the file cannot be read as UTF-8 text.
pub fn read_word_file(path: &str) -> io::Result<HashSet<String>> {
    let text = std::fs::read_to_string(path)?;

    let mut words = HashSet::new();
    for raw_line in text.lines() {
        let word = raw_line.trim();
        if !word.is_empty() {
            words.insert(word.to_string());
        }
    }

    Ok(words)
}
376}