yara_x_fmt/
lib.rs

1/*! A code formatter for YARA rules
2
This crate implements a code formatter for YARA in the spirit of other tools
like `rustfmt` and `gofmt`.
5
6# Usage
7
8```no_run
9# use std::fs::File;
10use yara_x_fmt::Formatter;
11
12let input = File::open("original.yar").unwrap();
13let output = File::create("formatted.yar").unwrap();
14
15Formatter::new().format(input, output).unwrap();
16```
17*/
18use std::io;
19use std::io::Cursor;
20
21use thiserror::Error;
22
23use tokens::Token::*;
24use tokens::TokenStream;
25use yara_x_parser::cst::{CSTStream, Event, SyntaxKind};
26use yara_x_parser::{Parser, Span};
27
28use crate::align::Align;
29use crate::format_hex_patterns::FormatHexPatterns;
30use crate::indentation::AddIndentation;
31use crate::tokens::categories::*;
32use crate::tokens::*;
33use crate::trailing_spaces::RemoveTrailingSpaces;
34
35mod align;
36mod bubble;
37mod comments;
38mod format_hex_patterns;
39mod indentation;
40mod processor;
41mod tokens;
42mod trailing_spaces;
43
44#[cfg(test)]
45mod tests;
46
/// Errors returned by [`Formatter::format`].
#[derive(Error, Debug)]
#[allow(clippy::large_enum_variant)]
pub enum Error {
    /// Error while reading from input.
    ///
    /// Wraps the [`io::Error`] produced while reading the input.
    #[error("read error: {0}")]
    ReadError(io::Error),

    /// Error while writing to output.
    ///
    /// Wraps the [`io::Error`] produced while writing the formatted code.
    #[error("write error: {0}")]
    WriteError(io::Error),

    /// The input file contained invalid UTF-8.
    ///
    /// The [`Span`] tells where the invalid sequence was found. When the
    /// input contains more than one invalid sequence, [`Formatter::format`]
    /// reports the first one it encounters.
    #[error("invalid UTF-8 at {0}")]
    InvalidUTF8(Span),
}
63
/// Specifies how to indent the formatted code.
///
/// This is a small `Copy` value meant to be passed to
/// `Formatter::indentation`. It can be compared and printed, which is handy
/// when the indentation style comes from a configuration file or from
/// command-line arguments.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Indentation {
    /// Use a given number of spaces per indentation level.
    Spaces(usize),
    /// Use one tab per indentation level.
    Tabs,
}
72
/// Formats YARA source code automatically.
///
/// A formatter is created with [`Formatter::new`] and configured with its
/// builder-style methods, each of which sets one of the fields below. The
/// actual formatting is done by [`Formatter::format`].
pub struct Formatter {
    /// Align the `meta` section. Set by [`Formatter::align_metadata`].
    align_metadata: bool,
    /// Align the pattern definitions. Set by [`Formatter::align_patterns`].
    align_patterns: bool,
    /// Indent the section headers. Set by
    /// [`Formatter::indent_section_headers`].
    indent_section_headers: bool,
    /// Indent the contents of each section. Set by
    /// [`Formatter::indent_section_contents`].
    indent_section_contents: bool,
    /// Put the opening `{` of a rule in its own line. Set by
    /// [`Formatter::newline_before_curly_brace`].
    newline_before_curly_brace: bool,
    /// Put an empty line before section headers. Set by
    /// [`Formatter::empty_line_before_section_header`].
    empty_line_before_section_header: bool,
    /// Put an empty line after section headers. Set by
    /// [`Formatter::empty_line_after_section_header`].
    empty_line_after_section_header: bool,
    /// Size (in spaces) of the tabs found in the unformatted input. Set by
    /// [`Formatter::input_tab_size`].
    tab_size: usize,
    /// How the output is indented. Set by [`Formatter::indentation`].
    indentation: Indentation,
}
85
86impl Default for Formatter {
87    fn default() -> Self {
88        Self::new()
89    }
90}
91
92// Formatter public API.
93impl Formatter {
94    /// Creates a new formatter.
95    pub fn new() -> Self {
96        Formatter {
97            align_metadata: true,
98            align_patterns: true,
99            indent_section_headers: true,
100            indent_section_contents: true,
101            newline_before_curly_brace: false,
102            empty_line_before_section_header: true,
103            empty_line_after_section_header: false,
104            tab_size: 4,
105            indentation: Indentation::Spaces(2),
106        }
107    }
108
109    /// Specify if the metadata block must be aligned.
110    ///
111    /// If true, the metadata block will be converted from this...
112    ///
113    /// ```text
114    /// rule test {
115    ///   meta:
116    ///     short = "foo"
117    ///     very_long = "bar"
118    ///     even_longer = "baz"
119    ///   condition:
120    ///     ...
121    /// }
122    /// ```
123    ///
124    /// ... to this ...
125    ///
126    /// ```text
127    /// rule test {
128    ///   meta:
129    ///     short       = "foo"
130    ///     very_long   = "bar"
131    ///     even_longer = "baz"
132    ///   condition:
133    ///     ...
134    /// }
135    /// ```
136    ///
137    /// The default value is `true`.
138    pub fn align_metadata(mut self, yes: bool) -> Self {
139        self.align_metadata = yes;
140        self
141    }
142
143    /// Specify if the patterns definitions must be aligned.
144    ///
145    /// If true, the strings block will be converted from this...
146    ///
147    /// ```text
148    /// rule test {
149    ///   strings:
150    ///     $short = "foo"
151    ///     $very_long = "bar"
152    ///     $even_longer = "baz"
153    ///   condition:
154    ///     ...
155    /// }
156    /// ```
157    ///
158    /// ... to this ...
159    ///
160    /// ```text
161    /// rule test {
162    ///   strings:
163    ///     $short       = "foo"
164    ///     $very_long   = "bar"
165    ///     $even_longer = "baz"
166    ///   condition:
167    ///     ...
168    /// }
169    /// ```
170    ///
171    /// The default value is `true`.
172    pub fn align_patterns(mut self, yes: bool) -> Self {
173        self.align_patterns = yes;
174        self
175    }
176
177    /// Specify if the section definitions must be aligned.
178    ///
179    /// If true, the section headers look like this...
180    ///
181    /// ```text
182    /// rule test {
183    ///   strings:
184    ///     $short = "foo"
185    ///     $very_long = "bar"
186    ///     $even_longer = "baz"
187    ///   condition:
188    ///     ...
189    /// }
190    /// ```
191    ///
192    /// And if false, the section headers look like this...
193    ///
194    /// ```text
195    /// rule test {
196    /// strings:
197    ///   $short       = "foo"
198    ///   $very_long   = "bar"
199    ///   $even_longer = "baz"
200    /// condition:
201    ///   ...
202    /// }
203    /// ```
204    ///
205    /// The default value is `true`.
206    pub fn indent_section_headers(mut self, yes: bool) -> Self {
207        self.indent_section_headers = yes;
208        self
209    }
210
211    /// Specify if the section contents must be aligned.
212    ///
213    /// If true, the section contents look like this...
214    ///
215    /// ```text
216    /// rule test {
217    ///   strings:
218    ///     $short = "foo"
219    ///     $very_long = "bar"
220    ///     $even_longer = "baz"
221    ///   condition:
222    ///     ...
223    /// }
224    /// ```
225    ///
226    /// And if false, the section contents look like this...
227    ///
228    /// ```text
229    /// rule test {
230    ///   strings:
231    ///   $short       = "foo"
232    ///   $very_long   = "bar"
233    ///   $even_longer = "baz"
234    ///   condition:
235    ///   ...
236    /// }
237    /// ```
238    ///
239    /// The default value is `true`.
240    pub fn indent_section_contents(mut self, yes: bool) -> Self {
241        self.indent_section_contents = yes;
242        self
243    }
244
245    /// Number of spaces to indent, if indenting at all. Set to 0 to use tabs.
246    ///
247    /// The default is `2`.
248    #[deprecated(
249        since = "1.7.0",
250        note = "use `.indentation(Indentation::Spaces(n))` or `.indentation(Indentation::Tabs)` instead"
251    )]
252    pub fn indent_spaces(mut self, n: u8) -> Self {
253        if n == 0 {
254            self.indentation = Indentation::Tabs
255        } else {
256            self.indentation = Indentation::Spaces(n as usize)
257        }
258        self
259    }
260
261    /// Specifies how to indent the formatted source code.
262    ///
263    /// ```
264    /// # use yara_x_fmt::{Formatter, Indentation};
265    /// let indent_with_two_spaces = Formatter::new().indentation(Indentation::Spaces(2));
266    /// let indent_with_tabs = Formatter::new().indentation(Indentation::Tabs);
267    /// ```
268    pub fn indentation(mut self, indentation: Indentation) -> Self {
269        self.indentation = indentation;
270        self
271    }
272
273    /// Specifies the tab size (in spaces) expected in the unformatted source
274    /// code.
275    ///
276    /// If the input contains tab characters, the formatter uses this value to
277    /// determine how many spaces each tab represents. Setting this incorrectly
278    /// can lead to misaligned formatting when the code mixes tabs and spaces.
279    ///
280    /// Defaults to `4`.
281    pub fn input_tab_size(mut self, tab_size: usize) -> Self {
282        self.tab_size = tab_size;
283        self
284    }
285
286    /// Specify if newline should be added before the opening curly brace in a
287    /// rule declaration. If false the rule will look like this:
288    ///
289    /// ```text
290    /// rule test {
291    ///   condition:
292    ///     true
293    /// }
294    /// ```
295    ///
296    /// And if true, the rule will look like this:
297    ///
298    /// ```text
299    /// rule test
300    /// {
301    ///   condition:
302    ///     true
303    /// }
304    /// ```
305    ///
306    /// The default value is `false`.
307    pub fn newline_before_curly_brace(mut self, yes: bool) -> Self {
308        self.newline_before_curly_brace = yes;
309        self
310    }
311
312    /// Specify if an empty line should be added before the section header in a
313    /// rule. If false the rule will look like this:
314    ///
315    /// ```text
316    /// rule test {
317    ///   meta:
318    ///     foo = "bar"
319    ///   condition:
320    ///     true
321    /// }
322    /// ```
323    ///
324    /// And if true, the rule will look like this:
325    ///
326    /// ```text
327    /// rule test {
328    ///
329    ///   meta:
330    ///     foo = "bar"
331    ///
332    ///   condition:
333    ///     true
334    /// }
335    /// ```
336    ///
337    /// The default value is `false`.
338    pub fn empty_line_before_section_header(mut self, yes: bool) -> Self {
339        self.empty_line_before_section_header = yes;
340        self
341    }
342
343    /// Specify if an empty line should be added after the section header in a
344    /// rule. If false the rule will look like this:
345    ///
346    /// ```text
347    /// rule test {
348    ///   condition:
349    ///     true
350    /// }
351    /// ```
352    ///
353    /// And if true, the rule will look like this:
354    ///
355    /// ```text
356    /// rule test {
357    ///   condition:
358    ///
359    ///     true
360    /// }
361    /// ```
362    ///
363    /// The default value is `false`.
364    pub fn empty_line_after_section_header(mut self, yes: bool) -> Self {
365        self.empty_line_after_section_header = yes;
366        self
367    }
368
369    /// Reads YARA source code from `input` and write it into `output` after
370    /// formatting.
371    ///
372    /// Returns `true` if the output differs from the input.
373    ///
374    /// This function will fail if it can't read from the input, write to the
375    /// output, or when the input contains invalid UTF-8 characters.
376    pub fn format<R, W>(
377        &self,
378        mut input: R,
379        mut output: W,
380    ) -> Result<bool, Error>
381    where
382        R: io::Read,
383        W: io::Write,
384    {
385        let mut invalid_utf8 = Option::None;
386        let mut in_buf = Vec::with_capacity(256);
387
388        input.read_to_end(&mut in_buf).map_err(Error::ReadError)?;
389
390        let cst_stream = CSTStream::from(Parser::new(in_buf.as_slice()));
391
392        // Inspect the CST stream looking for events indicating the presence of
393        // invalid UTF-8 sequences.
394        let events = cst_stream.into_iter().inspect(|evt| {
395            if let Event::Token { kind: SyntaxKind::INVALID_UTF8, span } = evt
396            {
397                invalid_utf8.get_or_insert(span.clone());
398            }
399        });
400
401        let tokens = Tokens::new(in_buf.as_slice(), events);
402        let mut out_buf = Cursor::new(Vec::new());
403
404        self.format_impl(tokens)
405            .write_to(&mut out_buf)
406            .map_err(Error::WriteError)?;
407
408        if let Some(span) = invalid_utf8 {
409            return Err(Error::InvalidUTF8(span));
410        }
411
412        let modified = in_buf.ne(out_buf.get_ref());
413
414        output.write_all(out_buf.get_ref()).map_err(Error::WriteError)?;
415
416        Ok(modified)
417    }
418}
419
420// Private API for formatter.
421impl Formatter {
    /// Builds the formatting pipeline that transforms the `input` token
    /// stream into the formatted token stream.
    ///
    /// The pipeline is a chain of token stream adapters, each one consuming
    /// the output of the previous one. The order of the stages is relevant,
    /// as some stages rely on what previous ones have done (for example, the
    /// stage that removes newlines before the opening "{" of a rule relies on
    /// a previous stage leaving at most two consecutive newlines).
    ///
    /// The stages that depend on the formatter's settings are included in the
    /// pipeline, or replaced by an alternative stage, according to the value
    /// of the corresponding field in `self`.
    fn format_impl<'a, I>(&self, input: I) -> impl TokenStream<'a> + 'a
    where
        I: TokenStream<'a> + 'a,
    {
        // The first step is inserting newlines between top-level statements
        // (rules, imports and includes) if they are in the same line.
        let tokens = processor::Processor::new(input)
            //
            // Insert newline in front of import and include statements, making
            // sure that they start at a new line. The newline is not inserted if
            // the statement is at the start of the file.
            //
            // Example:
            //
            // import "foo" import "bar"
            //
            // Inserts newline before import "bar".
            //
            .add_rule(
                |ctx| {
                    let next_token = ctx.token(1);
                    let prev_token = ctx.token(-1);

                    matches!(
                        next_token,
                        Begin(SyntaxKind::IMPORT_STMT)
                            | Begin(SyntaxKind::INCLUDE_STMT)
                    ) && prev_token.neq(&Begin(SyntaxKind::SOURCE_FILE))
                        && prev_token.is_not(*NEWLINE)
                },
                processor::actions::newline,
            )
            //
            // Insert newline in front of rule declarations, making sure that
            // rule declarations starts at a new line. The newline is not
            // inserted if the rule is at the start of the file.
            //
            // Example:
            //
            // rule foo { ... } rule bar { ... }
            //
            // Inserts newline before "rule bar".
            //
            .add_rule(
                |ctx| {
                    let next_token = ctx.token(1);
                    let prev_token = ctx.token(-1);

                    next_token.eq(&Begin(SyntaxKind::RULE_DECL))
                        && prev_token.neq(&Begin(SyntaxKind::SOURCE_FILE))
                        && prev_token.is_not(*NEWLINE)
                },
                processor::actions::newline,
            );

        // Process comments before removing whitespaces. Whitespaces must
        // be the original ones while the comments are being processed in
        // order to maintain comment indentation unaltered.
        let tokens =
            comments::CommentProcessor::new(tokens).tab_size(self.tab_size);

        // Remove all whitespaces from the original source.
        let tokens = processor::Processor::new(tokens).add_rule(
            |ctx| ctx.token(1).is(*WHITESPACE),
            processor::actions::drop,
        );

        // Displace tail comments and newlines up with respect to tokens that
        // indicate the start and end of a grammar rule. This effectively moves
        // such comments and newlines to the innermost grammar rule preceding
        // them. For example, suppose that we have the following rule:
        //
        //   import "test"  // Comment
        //
        //   rule test {
        //     strings:
        //       $a = "foo"
        //     condition:
        //       true
        //   }
        //
        // The sequence of tokens produced for that rule looks like:
        //
        //   Begin(SOURCE_FILE)
        //   Begin(IMPORT_STMT)
        //   Keyword("import")
        //   Literal("test")
        //   End(IMPORT_STMT)
        //   Begin(RULE_DECL)
        //   TailComment("// Comment")
        //   Newline
        //   Keyword("rule")
        //   .... more
        //
        // Notice how TailComment("// Comment") and the Newline that follows
        // are placed just before the "rule" keyword, and inside the rule_decl
        // grammar rule. This is not the most natural place for this tail comment,
        // its natural place is just after the "test" literal, like this:
        //
        //   Begin(SOURCE_FILE)
        //   Begin(IMPORT_STMT)
        //   Keyword("import")
        //   Literal("test")
        //   TailComment("// Comment")
        //   Newline
        //   End(IMPORT_STMT)
        //   Begin(RULE_DECL)
        //   Keyword("rule")
        //   .... more
        //
        // That's exactly what the Bubble pipeline does. See the documentation for
        // the `bubble` module for more details.
        let tokens = bubble::Bubble::new(
            tokens,
            |token| {
                matches!(token, Token::TailComment(_)) || token.is(*NEWLINE)
            },
            |token| matches!(token, Token::Begin(_) | Token::End(_)),
        );

        // Displace newlines down with respect to tokens that indicate end of a
        // grammar rule. This effectively moves them to the outermost grammar
        // rule. See the documentation for the `bubble` module for more details.
        let tokens = bubble::Bubble::new(
            tokens,
            |token| matches!(token, Token::End(_)),
            |token| token.is(*NEWLINE),
        );

        // Remove newlines in multiple cases.
        let tokens = processor::Processor::new(tokens)
            // Remove all newlines at the beginning of the file. When the
            // processor is at the beginning of the file token(-1) is None.
            // Notice that this works because all these newlines have been
            // moved up and placed before Token::Begin(source_file) by the
            // first bubble pipeline.
            .add_rule(
                |ctx| ctx.token(-1).is(*NONE) && ctx.token(1).is(*NEWLINE),
                processor::actions::drop,
            )
            // Remove excess of consecutive newlines, only two consecutive
            // newlines are allowed.
            .add_rule(
                |ctx| {
                    ctx.token(1).is(*NEWLINE)
                        && ctx.token(2).is(*NEWLINE)
                        && ctx.token(3).is(*NEWLINE)
                },
                processor::actions::drop,
            )
            // Remove excess of newlines at the end of the file.
            .add_rule(
                |ctx| {
                    ctx.token(-1).is(*NEWLINE)
                        && ctx.token(1).is(*NEWLINE)
                        && ctx.token(2).is(*NONE)
                },
                processor::actions::drop,
            )
            // Remove newlines between rule tags and between rule modifiers.
            .add_rule(
                |ctx| {
                    (ctx.in_rule(SyntaxKind::RULE_MODS, false)
                        || ctx.in_rule(SyntaxKind::RULE_TAGS, false))
                        && ctx.token(-1).is_not(*COMMENT)
                        && ctx.token(1).is(*NEWLINE)
                },
                processor::actions::drop,
            )
            // Remove newlines after rule modifiers.
            .add_rule(
                |ctx| {
                    ctx.token(-1).eq(&End(SyntaxKind::RULE_MODS))
                        && ctx.token(1).is(*NEWLINE)
                },
                processor::actions::drop,
            )
            // Remove newlines after rule tags.
            .add_rule(
                |ctx| {
                    ctx.token(-1).eq(&End(SyntaxKind::RULE_TAGS))
                        && ctx.token(1).is(*NEWLINE)
                },
                processor::actions::drop,
            );

        let tokens = if self.newline_before_curly_brace {
            // Ensure we have a newline before the opening "{" in a rule
            // declaration. Be careful to only insert it if one does not already
            // exist.
            Box::new(processor::Processor::new(tokens).add_rule(
                |ctx| {
                    ctx.in_rule(SyntaxKind::RULE_DECL, false)
                        && ctx.token(1).eq(&LBRACE)
                        && ctx.token(-1).is_not(*NEWLINE)
                },
                processor::actions::newline,
            ))
        } else {
            // Remove newlines before the opening "{" in a rule declaration.
            // It takes into account that we can have one or two newlines
            // before the "{" character. In a previous step we have removed
            // consecutive newlines leaving two at most.
            Box::new(
                    processor::Processor::new(tokens)
                .add_rule(
                    |ctx| {
                        ctx.in_rule(SyntaxKind::RULE_DECL, false)
                            && ctx.token(1).is(*NEWLINE)
                            && (
                                // Newline followed by "{"  or ...
                                ctx.token(2).eq(&LBRACE) ||
                                // ... two newlines followed by "{"
                                ctx.token(2).is(*NEWLINE) && ctx.token(3).eq(&LBRACE)
                            )
                    },
                    processor::actions::drop,
                ))
        };

        let tokens = processor::Processor::new(tokens)
            //
            // Insert additional newline in front of a rule declaration that
            // already starts at a newline, but only if not preceded by a
            // comment. In other words, this adds empty lines in between rule
            // declarations, but don't do it if the rule is preceded by a
            // comment.
            //
            // Example:
            //
            //  rule foo {
            //    ...
            //  }
            //  rule bar {
            //    ...
            //  }
            //
            // Inserts newline before "rule bar".
            //
            .add_rule(
                |ctx| {
                    ctx.token(1).eq(&Begin(SyntaxKind::RULE_DECL))
                        && ctx.token(-1).is(*NEWLINE)
                        && ctx.token(-2).is_not(*NEWLINE | *COMMENT)
                },
                processor::actions::newline,
            );

        // Normalize the vertical space in front of the section headers
        // ("meta", "strings" and "condition"), according to the
        // `empty_line_before_section_header` setting.
        //
        // NOTE(review): `set_passthrough(*CONTROL)` presumably makes the
        // rules ignore control tokens like Begin(..) and End(..) when looking
        // at the neighbouring tokens; confirm in the `processor` module.
        let tokens = if self.empty_line_before_section_header {
            Box::new(
                processor::Processor::new(tokens)
                    .set_passthrough(*CONTROL)
                    // The section header is not at the start of a line.
                    // NOTE(review): `emptyline` presumably inserts the two
                    // newlines required for getting an empty line; confirm
                    // in `processor::actions`.
                    .add_rule(
                        |ctx| {
                            matches!(
                                ctx.token(1),
                                Keyword(b"meta")
                                    | Keyword(b"strings")
                                    | Keyword(b"condition")
                            ) && ctx.token(-1).is_not(*NEWLINE)
                        },
                        processor::actions::emptyline,
                    )
                    // The section header is already at the start of a line,
                    // but it is preceded by a single newline. Add a second
                    // one.
                    .add_rule(
                        |ctx| {
                            matches!(
                                ctx.token(1),
                                Keyword(b"meta")
                                    | Keyword(b"strings")
                                    | Keyword(b"condition")
                            ) && ctx.token(-1).is(*NEWLINE)
                                && ctx.token(-2).is_not(*NEWLINE)
                        },
                        processor::actions::newline,
                    ),
            )
        } else {
            Box::new(
                processor::Processor::new(tokens)
                    .set_passthrough(*CONTROL)
                    // Make sure that the section header starts at a new
                    // line.
                    .add_rule(
                        |ctx| {
                            matches!(
                                ctx.token(1),
                                Keyword(b"meta")
                                    | Keyword(b"strings")
                                    | Keyword(b"condition")
                            ) && ctx.token(-1).is_not(*NEWLINE)
                        },
                        processor::actions::newline,
                    )
                    // Drop the second of two consecutive newlines that
                    // precede a section header.
                    .add_rule(
                        |ctx| {
                            ctx.token(1).is(*NEWLINE)
                                && matches!(
                                    ctx.token(2),
                                    Keyword(b"meta")
                                        | Keyword(b"strings")
                                        | Keyword(b"condition")
                                )
                                && ctx.token(-1).is(*NEWLINE)
                        },
                        processor::actions::drop,
                    ),
            )
        };

        // Always remove empty line before the first section header. The first
        // section header is recognized because the two newlines in front of
        // it are preceded by the opening "{" of the rule.
        let tokens = processor::Processor::new(tokens)
            .set_passthrough(*CONTROL)
            .add_rule(
                |ctx| {
                    ctx.token(1).is(*NEWLINE)
                        && matches!(
                            ctx.token(2),
                            Keyword(b"meta")
                                | Keyword(b"strings")
                                | Keyword(b"condition")
                        )
                        && ctx.token(-1).is(*NEWLINE)
                        && ctx.token(-2).eq(&LBRACE)
                },
                processor::actions::drop,
            );

        // Normalize the vertical space after the section headers, according
        // to the `empty_line_after_section_header` setting. In all these
        // rules the processor is positioned right after the colon that
        // follows the "meta", "strings" or "condition" keyword.
        let tokens = if self.empty_line_after_section_header {
            Box::new(
                processor::Processor::new(tokens)
                    // The section header is followed by something in the
                    // same line.
                    .add_rule(
                        |ctx| {
                            ctx.token(-1).eq(&COLON)
                                && matches!(
                                    ctx.token(-2),
                                    Keyword(b"meta")
                                        | Keyword(b"strings")
                                        | Keyword(b"condition")
                                )
                                && ctx.token(1).is_not(*NEWLINE)
                        },
                        processor::actions::emptyline,
                    )
                    // The section header is followed by a single newline. Add
                    // a second one.
                    .add_rule(
                        |ctx| {
                            ctx.token(-1).eq(&COLON)
                                && matches!(
                                    ctx.token(-2),
                                    Keyword(b"meta")
                                        | Keyword(b"strings")
                                        | Keyword(b"condition")
                                )
                                && ctx.token(1).is(*NEWLINE)
                                && ctx.token(2).is_not(*NEWLINE)
                        },
                        processor::actions::newline,
                    ),
            )
        } else {
            Box::new(
                processor::Processor::new(tokens)
                    // Make sure that the section header is followed by a
                    // newline.
                    .add_rule(
                        |ctx| {
                            ctx.token(-1).eq(&COLON)
                                && matches!(
                                    ctx.token(-2),
                                    Keyword(b"meta")
                                        | Keyword(b"strings")
                                        | Keyword(b"condition")
                                )
                                && ctx.token(1).is_not(*NEWLINE)
                        },
                        processor::actions::newline,
                    )
                    // Drop one of the two consecutive newlines that follow
                    // the section header.
                    .add_rule(
                        |ctx| {
                            ctx.token(-1).eq(&COLON)
                                && matches!(
                                    ctx.token(-2),
                                    Keyword(b"meta")
                                        | Keyword(b"strings")
                                        | Keyword(b"condition")
                                )
                                && ctx.token(1).is(*NEWLINE)
                                && ctx.token(2).is(*NEWLINE)
                        },
                        processor::actions::drop,
                    ),
            )
        };

        // Add newline at multiple places.
        let tokens = processor::Processor::new(tokens)
            .set_passthrough(*CONTROL)
            // Add a newline in front of meta definitions in the "meta" section.
            .add_rule(
                |ctx| {
                    ctx.in_rule(SyntaxKind::META_DEF, false)
                        && ctx.token(1).is(*IDENTIFIER)
                        && ctx.token(-1).is_not(*NEWLINE)
                },
                processor::actions::newline,
            )
            // Add newline in front of pattern identifiers in the "strings"
            // section.
            .add_rule(
                |ctx| {
                    ctx.in_rule(SyntaxKind::PATTERN_DEF, false)
                        && ctx.token(1).is(*IDENTIFIER)
                        && ctx.token(-1).is_not(*NEWLINE)
                },
                processor::actions::newline,
            )
            // Add newline before each identifier in a `with` statement.
            .add_rule(
                |ctx| {
                    ctx.in_rule(SyntaxKind::WITH_DECL, false)
                        && ctx.token(1).is(*IDENTIFIER)
                        && ctx.token(-1).is_not(*NEWLINE)
                },
                processor::actions::newline,
            )
            // Add newline before the closing brace at the end of rule.
            .add_rule(
                |ctx| {
                    ctx.in_rule(SyntaxKind::RULE_DECL, false)
                        && ctx.token(1).eq(&RBRACE)
                        && ctx.token(-1).is_not(*NEWLINE)
                },
                processor::actions::newline,
            )
            // Make sure that the file ends with a newline.
            .add_rule(
                |ctx| ctx.token(1).is(*NONE) && ctx.token(-1).is_not(*NEWLINE),
                processor::actions::newline,
            );

        let tokens = FormatHexPatterns::new(tokens);

        // The two indentation stages below are optional. Each branch produces
        // a different iterator type, so the streams are boxed as trait
        // objects.
        let tokens: Box<dyn Iterator<Item = Token<'a>>> =
            if self.indent_section_headers {
                Box::new(Self::indent_body(tokens))
            } else {
                Box::new(tokens)
            };

        let tokens: Box<dyn Iterator<Item = Token<'a>>> =
            if self.indent_section_contents {
                Box::new(Self::indent_sections(tokens))
            } else {
                Box::new(tokens)
            };

        let tokens = Self::indent_hex_patterns(tokens);
        let tokens = Self::indent_parenthesized_exprs(tokens);
        let tokens = Self::indent_with_expr(tokens);

        // indent_body and indent_sections will insert Indentation tokens, but
        // won't take into account that those tokens must appear before the
        // newline they are expected to affect. This fixes the issue by moving
        // indentation tokens in front of newline tokens if they appear in
        // reverse order.
        let tokens = bubble::Bubble::new(
            tokens,
            |token| matches!(token, Indentation(_)),
            |token| token.is(*NEWLINE),
        );

        // Make sure that comments (except inline comments) are followed by
        // newline. In most cases this is already the case, but some of the rules
        // that remove newlines may remove those appearing after the comment.
        let tokens = processor::Processor::new(tokens)
            .set_passthrough(*CONTROL)
            .add_rule(
                |ctx| {
                    matches!(
                        ctx.token(-1),
                        HeadComment(_) | TailComment(_) | BlockComment(_)
                    ) && ctx.token(1).is_not(*NEWLINE)
                },
                processor::actions::newline,
            );

        let tokens = Self::add_spacing(tokens);
        let tokens = Self::align_comments_in_hex_patterns(tokens);

        // The alignment of the "meta" and "strings" sections is optional.
        let tokens: Box<dyn Iterator<Item = Token<'a>>> =
            if self.align_metadata {
                Box::new(Self::align_meta_section(tokens))
            } else {
                Box::new(tokens)
            };

        let tokens: Box<dyn Iterator<Item = Token<'a>>> =
            if self.align_patterns {
                Box::new(Self::align_patterns_section(tokens))
            } else {
                Box::new(tokens)
            };

        // NOTE(review): presumably this is where the Indentation tokens
        // inserted by the previous stages are turned into actual spaces or
        // tabs, according to `self.indentation`; confirm in the `indentation`
        // module.
        let tokens = AddIndentation::new(tokens, self.indentation);

        RemoveTrailingSpaces::new(tokens)
    }
924
925    /// Indents the sections (meta, strings, condition) of a rule one level up.
926    /// For example, for this input:
927    ///
928    /// ```text
929    /// rule foo {
930    /// strings:
931    /// $a = "foo"
932    /// condition:
933    /// true
934    /// }
935    /// ```
936    ///
937    /// ... the result is ...
938    ///
939    /// ```text
940    /// rule foo {
941    /// strings:
942    ///   $a = "foo"
943    /// condition:
944    ///   true
945    /// }
946    /// ```
947    fn indent_sections<'a, I>(input: I) -> impl TokenStream<'a> + 'a
948    where
949        I: TokenStream<'a> + 'a,
950    {
951        processor::Processor::new(input)
952            // Ignore all comments
953            .set_passthrough(*COMMENT)
954            // Increase indentation after "condition:"
955            .add_rule(
956                |ctx| {
957                    ctx.in_rule(SyntaxKind::CONDITION_BLK, false)
958                        && ctx.token(-1).eq(&COLON)
959                },
960                processor::actions::insert(Indentation(1)),
961            )
962            // Decrease indentation after the condition.
963            .add_rule(
964                |ctx| {
965                    ctx.token(1).eq(&End(SyntaxKind::CONDITION_BLK))
966                        && ctx.token(-1).neq(&Indentation(-1))
967                },
968                processor::actions::insert(Indentation(-1)),
969            )
970            // Increase indentation after "meta:"
971            .add_rule(
972                |ctx| {
973                    ctx.in_rule(SyntaxKind::META_BLK, false)
974                        && ctx.token(-1).eq(&COLON)
975                },
976                processor::actions::insert(Indentation(1)),
977            )
978            // Decrease indentation after meta definitions
979            .add_rule(
980                |ctx| {
981                    ctx.token(1).eq(&End(SyntaxKind::META_BLK))
982                        && ctx.token(-1).neq(&Indentation(-1))
983                },
984                processor::actions::insert(Indentation(-1)),
985            )
986            // Increase indentation after "strings:"
987            .add_rule(
988                |ctx| {
989                    ctx.in_rule(SyntaxKind::PATTERNS_BLK, false)
990                        && ctx.token(-1).eq(&COLON)
991                },
992                processor::actions::insert(Indentation(1)),
993            )
994            // Decrease indentation after pattern definitions.
995            .add_rule(
996                |ctx| {
997                    ctx.token(1).eq(&End(SyntaxKind::PATTERNS_BLK))
998                        && ctx.token(-1).neq(&Indentation(-1))
999                },
1000                processor::actions::insert(Indentation(-1)),
1001            )
1002    }
1003
1004    /// Indents the body of a rule. For this input...
1005    ///
1006    /// ```text
1007    /// rule foo {
1008    /// strings:
1009    /// $a = "foo"
1010    /// condition:
1011    /// true
1012    /// }
1013    /// ```
1014    /// ... the result is ...
1015    ///
1016    /// ```text
1017    /// rule foo {
1018    ///   strings:
1019    ///   $a = "foo"
1020    ///   condition:
1021    ///   true
1022    /// }
1023    /// ```
1024    ///
1025    fn indent_body<'a, I>(input: I) -> impl TokenStream<'a> + 'a
1026    where
1027        I: TokenStream<'a> + 'a,
1028    {
1029        processor::Processor::new(input)
1030            // Ignore all comments.
1031            .set_passthrough(*COMMENT)
1032            // Increase indentation after the opening brace in a rule
1033            // declaration.
1034            .add_rule(
1035                |ctx| {
1036                    ctx.in_rule(SyntaxKind::RULE_DECL, false)
1037                        && ctx.token(-1).eq(&LBRACE)
1038                },
1039                processor::actions::insert(Indentation(1)),
1040            )
1041            .add_rule(
1042                |ctx| {
1043                    ctx.in_rule(SyntaxKind::RULE_DECL, false)
1044                        && ctx.token(1).eq(&RBRACE)
1045                        && ctx.token(-1).neq(&Indentation(-1))
1046                },
1047                processor::actions::insert(Indentation(-1)),
1048            )
1049    }
1050
1051    /// Indent parenthesized expressions in rule conditions. For this input...
1052    ///
1053    /// ```text
1054    /// rule foo {
1055    /// strings:
1056    ///   $a = "foo"
1057    ///   $b = "bar"
1058    /// condition:
1059    ///    (
1060    ///    $a and $b
1061    ///    )
1062    /// }
1063    /// ```
1064    ///
1065    /// ... the result is ...
1066    ///
1067    /// ```text
1068    /// rule foo {
1069    /// strings:
1070    ///   $a = "foo"
1071    ///   $b = "bar"
1072    /// condition:
1073    ///    (
1074    ///      $a and $b
1075    ///    )
1076    /// }
1077    /// ```
1078    fn indent_parenthesized_exprs<'a, I>(input: I) -> impl TokenStream<'a> + 'a
1079    where
1080        I: TokenStream<'a> + 'a,
1081    {
1082        processor::Processor::new(input)
1083            .set_passthrough(*COMMENT)
1084            .add_rule(
1085                |ctx| {
1086                    ctx.in_rule(SyntaxKind::BOOLEAN_EXPR, true)
1087                        && ctx.token(-1).eq(&LPAREN)
1088                },
1089                processor::actions::insert(Indentation(1)),
1090            )
1091            .add_rule(
1092                |ctx| {
1093                    ctx.in_rule(SyntaxKind::BOOLEAN_EXPR, true)
1094                        && ctx.token(1).eq(&RPAREN)
1095                        && ctx.token(-1).neq(&Indentation(-1))
1096                },
1097                processor::actions::insert(Indentation(-1)),
1098            )
1099    }
1100
1101    /// Indent `with` expressions. For this input...
1102    ///
1103    /// ```text
1104    /// rule foo {
1105    /// condition:
1106    ///   with
1107    ///   foo = "foo"
1108    ///   bar = "bar": (...)
1109    /// }
1110    /// ```
1111    ///
1112    /// ... the result is ...
1113    ///
1114    /// ```text
1115    /// rule foo {
1116    /// condition:
1117    ///   with
1118    ///     foo = "foo"
1119    ///     bar = "bar": (...)
1120    /// }
1121    /// ```
1122    fn indent_with_expr<'a, I>(input: I) -> impl TokenStream<'a> + 'a
1123    where
1124        I: TokenStream<'a> + 'a,
1125    {
1126        processor::Processor::new(input)
1127            // Ignore all comments.
1128            .set_passthrough(*COMMENT)
1129            // Increase indentation after the `with` keyword.
1130            .add_rule(
1131                |ctx| {
1132                    ctx.in_rule(SyntaxKind::WITH_EXPR, false)
1133                        && ctx.token(-1).eq(&Keyword(b"with"))
1134                },
1135                processor::actions::insert(Indentation(1)),
1136            )
1137            // Decrease indentation after the `with` expression.
1138            .add_rule(
1139                |ctx| {
1140                    ctx.token(1).eq(&End(SyntaxKind::WITH_EXPR))
1141                        && ctx.token(-1).neq(&Indentation(-1))
1142                },
1143                processor::actions::insert(Indentation(-1)),
1144            )
1145    }
1146
1147    /// Indent hex patterns. For this input...
1148    ///
1149    /// ```text
1150    ///   $a = {
1151    ///   00 ..
1152    ///   01 ..
1153    ///   02
1154    ///   }
1155    /// }
1156    /// ```
1157    ///
1158    /// ... the result is ...
1159    ///
1160    /// ```text
1161    ///   $a = {
1162    ///     00 ..
1163    ///     01 ..
1164    ///     02
1165    ///   }
1166    /// }
1167    /// ```
1168    fn indent_hex_patterns<'a, I>(input: I) -> impl TokenStream<'a> + 'a
1169    where
1170        I: TokenStream<'a> + 'a,
1171    {
1172        processor::Processor::new(input)
1173            .set_passthrough(*COMMENT)
1174            .add_rule(
1175                |ctx| {
1176                    ctx.in_rule(SyntaxKind::HEX_PATTERN, true)
1177                        && ctx.token(-1).eq(&LBRACE)
1178                },
1179                processor::actions::insert(Indentation(1)),
1180            )
1181            .add_rule(
1182                |ctx| {
1183                    ctx.in_rule(SyntaxKind::HEX_PATTERN, true)
1184                        && ctx.token(1).eq(&RBRACE)
1185                        && ctx.token(-1).neq(&Indentation(-1))
1186                },
1187                processor::actions::insert(Indentation(-1)),
1188            )
1189    }
1190
1191    /// Aligns the equals signs in pattern definitions. For example, for this
1192    /// input:
1193    ///
1194    /// ```text
1195    /// rule foo {
1196    ///   strings:
1197    ///     $short = "foo"
1198    ///     $very_long = "bar"
1199    ///     $even_longer = "baz"
1200    ///   condition:
1201    ///     true
1202    /// }
1203    /// ```
1204    ///
1205    /// ... the result is ...
1206    ///
1207    /// ```text
1208    /// rule foo {
1209    ///   strings:
1210    ///     $short       = "foo"
1211    ///     $very_long   = "bar"
1212    ///     $even_longer = "baz"
1213    ///   condition:
1214    ///     true
1215    /// }
1216    /// ```
1217    ///
1218    /// Pattern groups separated by empty lines are handled independently, for
1219    /// example:
1220    ///
1221    /// ```text
1222    /// rule foo {
1223    ///   strings:
1224    ///     $short     = "foo"
1225    ///     $very_long = "bar"
1226    ///
1227    ///     $even_longer    = "baz"
1228    ///     $longest_of_all = "qux"
1229    ///   condition:
1230    ///     true
1231    /// }
1232    /// ```
1233    ///
1234    /// The patterns in the first block are aligned together, but they are not
1235    /// influenced by the longer lines in the second block.
1236    ///
1237    /// The input must contain at least one newline character after each
1238    /// pattern definition.
1239    fn align_patterns_section<'a, I>(input: I) -> impl TokenStream<'a> + 'a
1240    where
1241        I: TokenStream<'a> + 'a,
1242    {
1243        Self::align(input, SyntaxKind::PATTERNS_BLK, SyntaxKind::PATTERN_DEF)
1244    }
1245
1246    /// Aligns the equals signs in metadata definitions.
1247    ///
1248    /// This is similar to [`Formatter::align_patterns_section`] but for metadata.
1249    fn align_meta_section<'a, I>(input: I) -> impl TokenStream<'a> + 'a
1250    where
1251        I: TokenStream<'a> + 'a,
1252    {
1253        Self::align(input, SyntaxKind::META_BLK, SyntaxKind::META_DEF)
1254    }
1255
1256    fn align<'a, I>(
1257        input: I,
1258        block_kind: SyntaxKind,
1259        item_kind: SyntaxKind,
1260    ) -> impl TokenStream<'a> + 'a
1261    where
1262        I: TokenStream<'a> + 'a,
1263    {
1264        // First insert the alignment markers at the appropriate places...
1265        let input_with_markers = processor::Processor::new(input)
1266            // Insert `AlignmentBlockBegin` after the start of each block.
1267            .add_rule(
1268                move |ctx| ctx.token(-1).eq(&Begin(block_kind)),
1269                processor::actions::insert(AlignmentBlockBegin),
1270            )
1271            // Insert `AlignmentBlockEnd` just before the end each block.
1272            .add_rule(
1273                move |ctx| {
1274                    ctx.token(1).eq(&End(block_kind))
1275                        && ctx.token(-1).neq(&AlignmentBlockEnd)
1276                },
1277                processor::actions::insert(AlignmentBlockEnd),
1278            )
1279            .add_rule(
1280                move |ctx| {
1281                    ctx.in_rule(block_kind, false)
1282                        && ctx.token(-2).eq(&Newline)
1283                        && ctx.token(-1).eq(&Newline)
1284                },
1285                |ctx| {
1286                    ctx.push_output_token(Some(AlignmentBlockEnd));
1287                    ctx.push_output_token(Some(AlignmentBlockBegin));
1288                },
1289            )
1290            // Insert `AlignmentMarker` before each equal sign.
1291            .add_rule(
1292                move |ctx| {
1293                    ctx.in_rule(item_kind, false)
1294                        && ctx.token(1).eq(&EQUAL)
1295                        && ctx.token(-1).neq(&AlignmentMarker)
1296                },
1297                processor::actions::insert(AlignmentMarker),
1298            );
1299
1300        // ... then pass the token stream with the markers to Aligner, which
1301        // returns a token stream that replaces the markers with the
1302        // appropriate number of spaces.
1303        Align::new(input_with_markers)
1304    }
1305
1306    /// Aligns tail comments inside hex patterns. For this input...
1307    ///
1308    /// ```text
1309    /// rule foo {
1310    ///   strings:
1311    ///     $hex = {
1312    ///        00 01  // Lorem
1313    ///        00 01 02  // ipsum
1314    ///     }
1315    ///   condition:
1316    ///     true
1317    /// }
1318    /// ```
1319    ///
1320    /// ... the result is ...
1321    ///
1322    /// ```text
1323    /// rule foo {
1324    ///   strings:
1325    ///     $hex = {
1326    ///        00 01     // Lorem
1327    ///        00 01 02  // ipsum
1328    ///     }
1329    ///   condition:
1330    ///     true
1331    /// }
1332    /// ```
1333    ///
1334    fn align_comments_in_hex_patterns<'a, I>(
1335        input: I,
1336    ) -> impl TokenStream<'a> + 'a
1337    where
1338        I: TokenStream<'a> + 'a,
1339    {
1340        // First insert the alignment markers at the appropriate places...
1341        let input_with_markers = processor::Processor::new(input)
1342            .add_rule(
1343                |ctx| ctx.token(-1).eq(&Begin(SyntaxKind::HEX_PATTERN)),
1344                processor::actions::insert(AlignmentBlockBegin),
1345            )
1346            .add_rule(
1347                |ctx| {
1348                    ctx.token(1).eq(&End(SyntaxKind::HEX_PATTERN))
1349                        && ctx.token(-1).neq(&AlignmentBlockEnd)
1350                },
1351                processor::actions::insert(AlignmentBlockEnd),
1352            )
1353            .add_rule(
1354                |ctx| {
1355                    ctx.in_rule(SyntaxKind::HEX_PATTERN, true)
1356                        && matches!(ctx.token(1), Token::TailComment(_))
1357                        && ctx.token(-1).neq(&AlignmentMarker)
1358                },
1359                processor::actions::insert(AlignmentMarker),
1360            );
1361
1362        // ... then pass the token stream with the markers to Aligner, which
1363        // returns a token stream that replaces the markers with the
1364        // appropriate number of spaces.
1365        Align::new(input_with_markers)
1366    }
1367
1368    fn add_spacing<'a, I>(input: I) -> impl TokenStream<'a> + 'a
1369    where
1370        I: TokenStream<'a> + 'a,
1371    {
1372        processor::Processor::new(input)
1373            // Ignore all control tokens.
1374            .set_passthrough(*CONTROL)
1375            // Insert spaces in-between all tokens, except in the following
1376            // cases:
1377            // - No space after "(" and "["
1378            // - No space before ")" and "]"
1379            // - No space before ":"
1380            // - No space before ","
1381            // - No space before or after ".." (e.g: (0..10))
1382            // - No space before or after "." (e.g: foo.bar)
1383            // - No space in-between identifiers and "(" or "[" (e.g: array[0],
1384            //   func("foo")).
1385            // - No space before or after "-" in pattern modifiers and hex jumps
1386            //   (e.g: xor(0-255), [0-10]).
1387            .add_rule(
1388                |ctx| {
1389                    let prev_token = ctx.token(-1);
1390                    let next_token = ctx.token(1);
1391
1392                    // Insert space if previous token is anything except ( or [,
1393                    // and next token is anything except ) or ].
1394                    let add_space = prev_token.is(*TEXT ^ *LGROUPING)
1395                        && next_token.is(*TEXT ^ *RGROUPING);
1396
1397                    let drop_space =
1398                        // Don't insert space if next token is ":"
1399                        next_token.eq(&COLON)
1400                        || next_token.eq(&COMMA)
1401                        // Don't insert space after "-"
1402                        || prev_token.eq(&HYPHEN)
1403                        // Don't insert spaces around "."
1404                        || prev_token.eq(&DOT)
1405                        || next_token.eq(&DOT)
1406                        // don't insert space in-between some identifier and "("
1407                        // or "[".
1408                        || prev_token.is(*IDENTIFIER)
1409                            && next_token.is(*LGROUPING)
1410                        // don't insert space in-between some identifier and "*"
1411                        // like in $a*
1412                        || prev_token.is(*IDENTIFIER)
1413                            && next_token.eq(&ASTERISK)
1414                        // don't insert spaces before "(" in pattern modifiers.
1415                        || ctx.in_rule(SyntaxKind::PATTERN_MOD, false)
1416                            && next_token.is(*LGROUPING)
1417                        // don't insert spaces before or after "-" in pattern
1418                        // modifiers and hex jumps.
1419                        || (ctx.in_rule(SyntaxKind::PATTERN_MOD, false) ||
1420                            ctx.in_rule(SyntaxKind::HEX_JUMP, false))
1421                            && (next_token.eq(&HYPHEN) || prev_token.eq(&HYPHEN));
1422
1423                    add_space && !drop_space
1424                },
1425                processor::actions::space,
1426            )
1427            // Insert space before after inline comment.
1428            .add_rule(
1429                |ctx| {
1430                    matches!(
1431                        ctx.token(-1),
1432                        InlineComment(_)
1433                    )
1434                },
1435                processor::actions::space
1436            )
1437            // Insert two spaces before trailing comments in a line.
1438            .add_rule(
1439                |ctx| {
1440                    ctx.token(-1).is(*TEXT) &&
1441                    ctx.token(1).is(*COMMENT)
1442                },
1443                |ctx| {
1444                    ctx.push_output_token(Some(Whitespace));
1445                    ctx.push_output_token(Some(Whitespace));
1446                }
1447            )
1448    }
1449}
yara_x_fmt/lib.rs

yara_x_fmt/
lib.rs