Skip to main content

hr_shape/
lib.rs

1//! Rust implementation of hb-shape.
2//! <https://github.com/harfbuzz/harfbuzz/blob/main/util/hb-shape.cc>
3
4use std::ffi::OsString;
5use std::io::{self, Write};
6use std::path::PathBuf;
7
8use clap::Parser;
9use harfrust::{
10    BufferClusterLevel, BufferFlags, Direction, Feature, FontRef, Language, SerializeFlags,
11    ShaperData, ShaperInstance, UnicodeBuffer, Variation,
12};
13
// Command-line options for `hr-shape`, parsed with clap's derive API.
//
// NOTE: the `///` comments on the fields are picked up by clap as the `--help` text, so
// maintainer notes in this struct are deliberately written as plain `//` comments.
#[derive(Clone, Parser)]
#[command(name = "hr-shape", version, about = "Shape text using HarfRust")]
pub struct Args {
    // ---- Positional arguments -------------------------------------------------------------

    // First free argument. `render` uses it as the font path unless `--font-file` is given;
    // in that case `render` treats this value as the input text instead.
    /// Font file path
    #[arg(value_name = "FONT-FILE")]
    font_file_pos: Option<PathBuf>,

    // Second free argument. `render` only consults it when the font came from the first
    // positional argument.
    /// Text to shape
    #[arg(value_name = "TEXT")]
    text_pos: Option<String>,

    // ---- Font selection -------------------------------------------------------------------

    // Takes precedence over the positional font path in `render`.
    /// Set font file-name
    #[arg(long)]
    font_file: Option<PathBuf>,

    /// Set face index
    #[arg(short = 'y', long, default_value_t = 0)]
    face_index: u32,

    /// Set font point-size
    #[arg(long)]
    font_ptem: Option<f32>,

    /// Comma-separated list of font variations
    #[arg(long, value_delimiter = ',')]
    variations: Vec<Variation>,

    // When set, `render` starts from this named instance and then applies `variations` on top.
    /// Set named-instance index
    #[arg(long)]
    named_instance: Option<usize>,

    // ---- Input text -----------------------------------------------------------------------

    /// Set input text
    #[arg(long)]
    text: Option<String>,

    /// Set input text file-name ("-" for stdin)
    #[arg(long)]
    text_file: Option<PathBuf>,

    // `parse_unicodes` decodes the codepoint list at parse time, so this field already holds
    // the decoded text rather than the raw `U+XXXX` list.
    /// Set input Unicode codepoints (e.g. 'U+0056,U+0057')
    #[arg(short = 'u', long, value_parser = parse_unicodes)]
    unicodes: Option<String>,

    /// Set text context before each line
    #[arg(long)]
    text_before: Option<String>,

    /// Set text context after each line
    #[arg(long)]
    text_after: Option<String>,

    // Decoded by `parse_unicodes`; `render` prefers this over `text_before` when both are set.
    /// Set Unicode codepoints context before each line
    #[arg(long, value_parser = parse_unicodes)]
    unicodes_before: Option<String>,

    // Decoded by `parse_unicodes`; `render` prefers this over `text_after` when both are set.
    /// Set Unicode codepoints context after each line
    #[arg(long, value_parser = parse_unicodes)]
    unicodes_after: Option<String>,

    // Without this flag `render` splits the input on '\n' and drops empty lines.
    /// Treat text as single paragraph
    #[arg(long)]
    single_par: bool,

    // ---- Segment properties and shaping options -------------------------------------------

    /// Set text direction (ltr/rtl/ttb/btt)
    #[arg(long)]
    direction: Option<Direction>,

    /// Set text language [default: $LANG]
    #[arg(long)]
    language: Option<Language>,

    /// Set text script as ISO-15924 tag
    #[arg(long)]
    script: Option<harfrust::Script>,

    /// Comma-separated list of font features
    #[arg(long, value_delimiter = ',')]
    features: Vec<Feature>,

    // When unset, `render` calls `reset_clusters()` on the buffer; this flag also selects
    // byte offsets (instead of char indices) in the `--show-unicode` prefix.
    /// Use UTF-8 byte indices, not char indices
    #[arg(long)]
    utf8_clusters: bool,

    // Parsed by `parse_cluster`, which accepts exactly "0", "1", "2" or "3".
    /// Cluster merging level (0-3)
    #[arg(long, value_parser = parse_cluster, default_value = "0")]
    cluster_level: BufferClusterLevel,

    /// Treat text as beginning of paragraph
    #[arg(long)]
    bot: bool,

    /// Treat text as end of paragraph
    #[arg(long)]
    eot: bool,

    /// Preserve Default-Ignorable characters
    #[arg(long)]
    preserve_default_ignorables: bool,

    /// Remove Default-Ignorable characters
    #[arg(long)]
    remove_default_ignorables: bool,

    /// Glyph value to replace not-found variation-selector characters with
    #[arg(long)]
    not_found_variation_selector_glyph: Option<u32>,

    /// Produce unsafe-to-concat glyph flag
    #[arg(long)]
    unsafe_to_concat: bool,

    /// Produce safe-to-insert-tatweel glyph flag
    #[arg(long)]
    safe_to_insert_tatweel: bool,

    /// Perform sanity checks on shaping results
    #[arg(long)]
    verify: bool,

    // ---- Output prefixes ------------------------------------------------------------------

    /// Prefix each line of output with its input text
    #[arg(long)]
    show_text: bool,

    /// Prefix each line of output with its input codepoints
    #[arg(long)]
    show_unicode: bool,

    /// Prefix each line of output with its line number
    #[arg(long)]
    show_line_num: bool,

    // Expanded by `normalize_args` into `show_text`, `show_unicode` and `show_line_num`.
    /// Prefix each line of output with text, unicode, and line number
    #[arg(long)]
    verbose: bool,

    // Hidden `-v` flag; `normalize_args` turns it into `verbose` plus `ned`.
    /// Shorthand for --verbose --ned (matching hb-shape behavior)
    #[arg(short = 'v', hide = true)]
    short_v: bool,

    // ---- Glyph serialization --------------------------------------------------------------

    /// Output glyph indices instead of names
    #[arg(long)]
    no_glyph_names: bool,

    /// Do not output glyph positions
    #[arg(long)]
    no_positions: bool,

    /// Do not output glyph advances
    #[arg(long)]
    no_advances: bool,

    /// Do not output cluster indices
    #[arg(long)]
    no_clusters: bool,

    /// Output glyph extents
    #[arg(long)]
    show_extents: bool,

    /// Output glyph flags
    #[arg(long)]
    show_flags: bool,

    // `render` treats this as `no_clusters` and `no_advances` combined.
    /// No Extra Data; Do not output clusters or advances
    #[arg(long)]
    ned: bool,

    // ---- Miscellaneous --------------------------------------------------------------------

    /// Set output file-name [default: stdout]
    #[arg(short = 'o', long)]
    output_file: Option<PathBuf>,

    // `render` reshapes each line this many times and serializes only the last result.
    /// Run shaper N times
    #[arg(short = 'n', long, default_value_t = 1)]
    num_iterations: u32,

    // Not read by any code visible in this file.
    /// Ignored; accepted for hb-shape compatibility
    #[arg(long, hide = true)]
    font_funcs: Option<String>,
}
193
194/// Parses command-line arguments and runs the `hr-shape` command.
195///
196/// # Errors
197///
198/// Returns an error string if argument parsing, shaping, or output writing fails.
199pub fn try_main() -> Result<(), String> {
200    let args = Args::parse();
201    run_and_write(args)
202}
203
204/// Runs `hr-shape` from a parsed argument struct and writes output to the configured destination.
205///
206/// # Errors
207///
208/// Returns an error string if shaping or output writing fails.
209pub fn run_and_write(args: Args) -> Result<(), String> {
210    let output_file = args.output_file.clone();
211    let output = render(args)?;
212    write_output(&output, output_file.as_ref())?;
213    Ok(())
214}
215
216/// Parses `hr-shape` arguments from an iterator and returns the rendered output.
217///
218/// If `-o/--output-file` is present, this also writes the rendered output to that file.
219///
220/// # Errors
221///
222/// Returns an error string if argument parsing, shaping, or requested file output fails.
223pub fn run_from_args<I, T>(args: I) -> Result<String, String>
224where
225    I: IntoIterator<Item = T>,
226    T: Into<OsString> + Clone,
227{
228    let args = Args::try_parse_from(args).map_err(|e| e.to_string())?;
229    let output = render(args.clone())?;
230    if let Some(path) = args.output_file.as_ref() {
231        write_output(&output, Some(path))?;
232    }
233    Ok(output)
234}
235
236/// Shapes a single input string using the same option parsing path as the CLI.
237///
238/// The input text is passed as Unicode codepoints so tests can include NUL bytes and other
239/// special characters without shell escaping concerns.
240///
241/// # Errors
242///
243/// Returns an error string if option parsing or shaping fails.
244pub fn shape(font_path: &str, text: &str, options: &str) -> Result<String, String> {
245    let unicodes: Vec<String> = text
246        .chars()
247        .map(|c| format!("U+{:04X}", c as u32))
248        .collect();
249    let mut args = vec![
250        "hr-shape".to_string(),
251        "--font-file".to_string(),
252        font_path.to_string(),
253        "-u".to_string(),
254        unicodes.join(","),
255        "--single-par".to_string(),
256    ];
257    args.extend(
258        options
259            .split(' ')
260            .filter(|s| !s.is_empty())
261            .map(ToOwned::to_owned),
262    );
263    run_from_args(args)
264}
265
266/// Renders `hr-shape` output for a parsed argument struct without writing to stdout.
267///
268/// # Errors
269///
270/// Returns an error string if font loading, input loading, or shaping fails.
271pub fn render(mut args: Args) -> Result<String, String> {
272    normalize_args(&mut args);
273
274    let mut font_set_as_free_arg = false;
275    let font_path = if let Some(ref path) = args.font_file {
276        path.clone()
277    } else if let Some(ref path) = args.font_file_pos {
278        font_set_as_free_arg = true;
279        path.clone()
280    } else {
281        return Err("Error: font is not set.".to_string());
282    };
283
284    if !font_path.exists() {
285        return Err(format!("Error: '{}' does not exist.", font_path.display()));
286    }
287
288    let font_data = std::fs::read(&font_path)
289        .map_err(|e| format!("Error: cannot read '{}': {e}", font_path.display()))?;
290    let font = FontRef::from_index(&font_data, args.face_index)
291        .map_err(|_| format!("Error: face index {} not found.", args.face_index))?;
292
293    let data = ShaperData::new(&font);
294    let variations = &args.variations;
295    let instance = match args.named_instance {
296        Some(idx) => {
297            let mut inst = ShaperInstance::from_named_instance(&font, idx);
298            if !variations.is_empty() {
299                inst.set_variations(&font, variations);
300            }
301            inst
302        }
303        None => ShaperInstance::from_variations(&font, variations),
304    };
305    let shaper = data
306        .shaper(&font)
307        .instance(Some(&instance))
308        .point_size(args.font_ptem)
309        .build();
310
311    let pre_context = args
312        .unicodes_before
313        .as_deref()
314        .or(args.text_before.as_deref());
315    let post_context = args
316        .unicodes_after
317        .as_deref()
318        .or(args.text_after.as_deref());
319
320    let mut buf_flags = BufferFlags::default();
321    if args.bot {
322        buf_flags |= BufferFlags::BEGINNING_OF_TEXT;
323    }
324    if args.eot {
325        buf_flags |= BufferFlags::END_OF_TEXT;
326    }
327    if args.preserve_default_ignorables {
328        buf_flags |= BufferFlags::PRESERVE_DEFAULT_IGNORABLES;
329    }
330    if args.remove_default_ignorables {
331        buf_flags |= BufferFlags::REMOVE_DEFAULT_IGNORABLES;
332    }
333    if args.unsafe_to_concat {
334        buf_flags |= BufferFlags::PRODUCE_UNSAFE_TO_CONCAT;
335    }
336    if args.safe_to_insert_tatweel {
337        buf_flags |= BufferFlags::PRODUCE_SAFE_TO_INSERT_TATWEEL;
338    }
339    if args.verify {
340        buf_flags |= BufferFlags::VERIFY;
341    }
342
343    let no_clusters = args.no_clusters || args.ned;
344    let format_flags = {
345        let mut f = SerializeFlags::default();
346        if args.no_glyph_names {
347            f |= SerializeFlags::NO_GLYPH_NAMES;
348        }
349        if no_clusters {
350            f |= SerializeFlags::NO_CLUSTERS;
351        }
352        if args.no_positions {
353            f |= SerializeFlags::NO_POSITIONS;
354        }
355        if args.no_advances || args.ned {
356            f |= SerializeFlags::NO_ADVANCES;
357        }
358        if args.show_extents {
359            f |= SerializeFlags::GLYPH_EXTENTS;
360        }
361        if args.show_flags {
362            f |= SerializeFlags::GLYPH_FLAGS;
363        }
364        f.bits()
365    };
366
367    let language = args.language;
368    let features = &args.features;
369
370    let text = if let Some(ref path) = args.text_file {
371        if path == &PathBuf::from("-") {
372            read_stdin()?
373        } else {
374            std::fs::read_to_string(path)
375                .map_err(|e| format!("Error: cannot read '{}': {e}", path.display()))?
376        }
377    } else if font_set_as_free_arg {
378        if let Some(ref text) = args.text_pos {
379            text.clone()
380        } else if let Some(ref text) = args.unicodes {
381            text.clone()
382        } else if let Some(ref text) = args.text {
383            text.clone()
384        } else {
385            read_stdin()?
386        }
387    } else if let Some(ref text) = args.font_file_pos {
388        text.to_string_lossy().to_string()
389    } else if let Some(ref text) = args.unicodes {
390        text.clone()
391    } else if let Some(ref text) = args.text {
392        text.clone()
393    } else {
394        read_stdin()?
395    };
396
397    let lines: Vec<&str> = if args.single_par {
398        vec![&text]
399    } else {
400        text.split('\n').filter(|s| !s.is_empty()).collect()
401    };
402
403    let mut output = Vec::new();
404    for (line_idx, text) in lines.iter().enumerate() {
405        let line_no = line_idx + 1;
406
407        if args.show_text {
408            if args.show_line_num {
409                write!(output, "{line_no}: ").unwrap();
410            }
411            writeln!(output, "({text})").unwrap();
412        }
413
414        if args.show_unicode {
415            if args.show_line_num {
416                write!(output, "{line_no}: ").unwrap();
417            }
418            writeln!(output, "{}", serialize_unicode(text, args.utf8_clusters)).unwrap();
419        }
420
421        let glyph_buffer = {
422            let mut result = None;
423            for _ in 0..args.num_iterations {
424                let mut buffer = UnicodeBuffer::new();
425                buffer.push_str(text);
426
427                if let Some(d) = args.direction {
428                    buffer.set_direction(d);
429                }
430                if let Some(ref lang) = language {
431                    buffer.set_language(lang.clone());
432                }
433                if let Some(script) = args.script {
434                    buffer.set_script(script);
435                }
436
437                buffer.set_cluster_level(args.cluster_level);
438                if !args.utf8_clusters {
439                    buffer.reset_clusters();
440                }
441                if let Some(g) = args.not_found_variation_selector_glyph {
442                    buffer.set_not_found_variation_selector_glyph(g);
443                }
444
445                buffer.set_flags(buf_flags);
446
447                if let Some(ctx) = pre_context {
448                    buffer.set_pre_context(ctx);
449                }
450                if let Some(ctx) = post_context {
451                    buffer.set_post_context(ctx);
452                }
453
454                buffer.guess_segment_properties();
455
456                result = Some(shaper.shape(buffer, features));
457            }
458            result.unwrap()
459        };
460
461        if args.show_line_num {
462            write!(output, "{line_no}: ").unwrap();
463        }
464        writeln!(
465            output,
466            "{}",
467            glyph_buffer.serialize(&shaper, SerializeFlags::from_bits_truncate(format_flags))
468        )
469        .unwrap();
470    }
471
472    String::from_utf8(output).map_err(|e| format!("Error: invalid UTF-8 output: {e}"))
473}
474
475fn normalize_args(args: &mut Args) {
476    if args.short_v {
477        args.verbose = true;
478        args.ned = true;
479    }
480    if args.verbose {
481        args.show_text = true;
482        args.show_unicode = true;
483        args.show_line_num = true;
484    }
485}
486
/// Writes `output` to `output_file`, or to stdout when no file is given.
///
/// An existing file at that path is truncated (`File::create` semantics).
///
/// # Errors
///
/// Returns an error string naming the path if the file cannot be created or written, or a
/// stdout-specific message if writing to stdout fails.
fn write_output(output: &str, output_file: Option<&PathBuf>) -> Result<(), String> {
    let payload = output.as_bytes();
    match output_file {
        Some(path) => std::fs::File::create(path)
            .map_err(|e| format!("Error: cannot create '{}': {e}", path.display()))
            .and_then(|mut file| {
                file.write_all(payload)
                    .map_err(|e| format!("Error: cannot write '{}': {e}", path.display()))
            }),
        None => io::stdout()
            .lock()
            .write_all(payload)
            .map_err(|e| format!("Error: writing stdout: {e}")),
    }
}
502
/// Reads all of stdin into a `String`.
///
/// # Errors
///
/// Returns an error string if reading fails, including when the input is not valid UTF-8.
fn read_stdin() -> Result<String, String> {
    match io::read_to_string(io::stdin()) {
        Ok(contents) => Ok(contents),
        Err(e) => Err(format!("Error: reading stdin: {e}")),
    }
}
506
/// Parses a list of hexadecimal codepoints into the string they spell.
///
/// Codepoints may be separated by commas, semicolons, or any ASCII whitespace (space, tab,
/// newline, carriage return, form feed, vertical tab), so a list spread over several lines is
/// accepted as well. Empty entries are skipped. Each entry may carry a `U+`, `u+`, `0x` or
/// `0X` prefix, e.g. `U+0056,U+0057` or `0x56 57`.
///
/// # Errors
///
/// Returns `'<token>' is not a valid codepoint` for the first entry that is not hexadecimal
/// or does not name a Unicode scalar value (for example a surrogate such as `U+D800`).
fn parse_unicodes(s: &str) -> Result<String, String> {
    const PREFIXES: [&str; 4] = ["U+", "u+", "0x", "0X"];
    // `is_ascii_whitespace` does not cover vertical tab, so it is listed explicitly.
    let is_delimiter = |c: char| matches!(c, ',' | ';' | '\x0B') || c.is_ascii_whitespace();

    let mut text = String::new();
    // `trim` still strips non-ASCII whitespace around an entry.
    for token in s.split(is_delimiter).map(str::trim).filter(|t| !t.is_empty()) {
        let invalid = || format!("'{token}' is not a valid codepoint");

        // Only the first matching prefix is removed.
        let hex = PREFIXES
            .iter()
            .find_map(|prefix| token.strip_prefix(*prefix))
            .unwrap_or(token);

        let value = u32::from_str_radix(hex, 16).map_err(|_| invalid())?;
        let ch = char::from_u32(value).ok_or_else(invalid)?;
        text.push(ch);
    }
    Ok(text)
}
528
529fn parse_cluster(s: &str) -> Result<BufferClusterLevel, String> {
530    match s {
531        "0" => Ok(BufferClusterLevel::MonotoneGraphemes),
532        "1" => Ok(BufferClusterLevel::MonotoneCharacters),
533        "2" => Ok(BufferClusterLevel::Characters),
534        "3" => Ok(BufferClusterLevel::Graphemes),
535        _ => Err("invalid cluster level".to_string()),
536    }
537}
538
/// Formats `text` as the `--show-unicode` prefix, e.g. `<U+0061=0|U+0062=1>`.
///
/// Every character becomes `U+XXXX=<cluster>`, where the cluster is the character's UTF-8
/// byte offset when `utf8_clusters` is set and its character index otherwise. Entries are
/// joined with `|` and wrapped in angle brackets; empty input yields an empty string.
fn serialize_unicode(text: &str, utf8_clusters: bool) -> String {
    let entries: Vec<String> = text
        .char_indices()
        .enumerate()
        .map(|(char_idx, (byte_idx, ch))| {
            let cluster = if utf8_clusters { byte_idx } else { char_idx };
            format!("U+{:04X}={cluster}", u32::from(ch))
        })
        .collect();

    if entries.is_empty() {
        String::new()
    } else {
        format!("<{}>", entries.join("|"))
    }
}
554}