org_rust_parser/
utils.rs

1use crate::types::Cursor;
2use phf::phf_set;
3
4// Either a whitespace character, -, ., ,, ;, :, !, ?, ', ), }, [, ", or the end of a line.
5static MARKUP_POST: phf::Set<u8> = phf_set! {
6 b'-',
7 b'.',
8 b',',
9 b';',
10 b':',
11 b'!',
12 b'?',
13 b')',
14 b'}',
15 b'[',
16 b'"',
17 b'\'',
18    // whitespace chars
19 b'\n',
20 b' ',
21 b'\t',
22 b'|',
23 b']',
24 b'/',
25 b'*',
26 b'_',
27 b'+',
28};
29
30// Either a whitespace character, -, (, {, ', ", or the beginning of a line.
31static MARKUP_PRE: phf::Set<u8> = phf_set! {
32 b'-',
33 b'(',
34 b'{',
35 b'\'',
36 b'"',
37 // whitespace character
38 b' ',
39 b'\t',
40 // checks for beginning of line
41 b'\n',
42 // // Non Standard
43 b'|',
44 b'[',
45 b'/',
46 b'*',
47 b'_',
48 b'+',
49 b':',
50};
51
// Why add non-standard extenders?
// Org mode syntax allows */abc/* to be defined as both bold and italic
// even though * and / are not in PRE/POST. This is because it clamps, then
// parses the contents.
//
// My extensions to PRE/POST are more permissive than the spec, since they allow
// [/abc/] to be interpreted as markup (the object doesn't have to belong to markup).
// Another example is:
//
// /abc _*one*/
//
// This shouldn't contain a bold object, but with these changes it does. I find this behaviour
// to be fairly reasonable imo, and don't mind the more permissive markup syntax.
// If there are other unexpected interactions, however, then I'll have to find
// the ending delimiter and then parse the contents within (entails reading over the
// contained text twice, not ideal).

/// Reinterprets a byte slice of the source text as a `&str` without
/// re-validating it.
///
/// The parser is given a valid UTF-8 string to work with, so calling
/// `str::from_utf8` again on its sub-slices would only repeat that check.
/// The cost of re-validating has not been measured; skipping it is considered
/// a safe assumption because a slice that is not valid UTF-8 would be an
/// internal bug, and the checked alternative would be unwrapped immediately
/// afterwards either way.
///
/// Debug builds assert the invariant so that such a bug surfaces as a panic
/// instead of undefined behaviour.
#[inline]
pub(crate) fn bytes_to_str(byte_arr: &[u8]) -> &str {
    debug_assert!(
        std::str::from_utf8(byte_arr).is_ok(),
        "bytes_to_str called on a slice that is not valid UTF-8"
    );
    // SAFETY: callers only pass slices of the original `&str` input, which is
    // valid UTF-8; the assertion above checks this in debug builds.
    unsafe { std::str::from_utf8_unchecked(byte_arr) }
}

/// The range of an arbitrary item in the source text.
#[derive(Debug, Clone)]
pub struct Match<T> {
    /// Byte offset in the source at which the item begins (inclusive).
    pub start: usize,
    /// Byte offset in the source at which the item ends (exclusive).
    pub end: usize,
    /// The item located in `start..end`.
    pub obj: T,
}

impl<T> Match<T> {
    /// Returns the part of `source` covered by this match.
    ///
    /// # Panics
    ///
    /// Panics if `start..end` is out of bounds for `source` or does not fall
    /// on `char` boundaries.
    #[inline]
    pub fn to_str<'a>(&self, source: &'a str) -> &'a str {
        &source[self.start..self.end]
    }

    /// Returns the length of the match in bytes.
    pub fn len(&self) -> usize {
        self.end - self.start
    }

    /// Returns `true` if the match covers no bytes.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }
}

/// Compares two values of an enum for variant equality.
///
/// Returns `true` when `a` and `b` are the same variant; any data carried by
/// the variants is not compared.
pub(crate) fn variant_eq<T>(a: &T, b: &T) -> bool {
    std::mem::discriminant(a) == std::mem::discriminant(b)
}

106pub(crate) fn verify_markup(cursor: Cursor, post: bool) -> bool {
107    let before_maybe = cursor.peek_rev(1);
108    let after_maybe = cursor.peek(1);
109
110    if post {
111        // if we're in post, then a character before the markup Must Exist
112        !before_maybe.unwrap().is_ascii_whitespace()
113            && if let Ok(val) = after_maybe {
114                MARKUP_POST.contains(&val)
115            } else {
116                true
117            }
118    } else if let Ok(after) = after_maybe {
119        !after.is_ascii_whitespace()
120            && if let Ok(val) = before_maybe {
121                MARKUP_PRE.contains(&val)
122            } else {
123                // bof is always valid
124                true
125            }
126    } else {
127        // if there's no after, cannot be valid markup
128        false
129    }
130}
131
/// Turns arbitrary text into an identifier-friendly string.
///
/// * a space becomes `-`
/// * `_` and `-` are kept as they are
/// * alphanumeric characters are lowercased
/// * every other character is dropped
pub(crate) fn id_escape(potential_id: &str) -> String {
    // Minor over-allocation in some cases, but most id recipients are
    // expected to be light on the shenanigans.
    let mut ret = String::with_capacity(potential_id.len());
    for chr in potential_id.chars() {
        match chr {
            ' ' => ret.push('-'),
            '_' | '-' => ret.push(chr),
            // A Unicode lowercase mapping can span multiple characters.
            _ if chr.is_alphanumeric() => ret.extend(chr.to_lowercase()),
            _ => {}
        }
    }
    ret
}

/// Shorthand for extracting a [`crate::Expr`] from a [`crate::Parser`].
///
/// Expands to an expression that walks `$parsed.pool` and yields an `Option`
/// holding a reference to the payload of the first node whose `obj` is the
/// `Expr::$name` variant, or `None` when there is no such node.
///
/// The expansion names the enum as `$crate::Expr`, so the caller does not
/// need to have `Expr` in scope.
///
/// # Example
///
/// ```rust
/// use org_rust_parser as org_parser;
///
/// use org_parser::{expr_in_pool, parse_org};
/// use org_parser::element::Heading;
///
/// let ret_parse = parse_org("* Hello world!\n");
/// let heading_expr: &Heading = expr_in_pool!(ret_parse, Heading).unwrap();
/// ```
#[macro_export]
macro_rules! expr_in_pool {
    ($parsed: ident, $name: ident) => {
        $parsed.pool.iter().find_map(|node| {
            if let $crate::Expr::$name(inner) = &node.obj {
                Some(inner)
            } else {
                None
            }
        })
    };
}

/// Shorthand for extracting a [`crate::Node`] from a [`crate::Parser`].
///
/// Expands to an expression that walks `$parsed.pool` and yields an `Option`
/// holding a reference to the first node whose `obj` is the `Expr::$name`
/// variant, or `None` when there is no such node.
///
/// The expansion names the enum as `$crate::Expr`, so the caller does not
/// need to have `Expr` in scope.
///
/// # Example
///
/// ```rust
/// use org_rust_parser as org_parser;
///
/// use org_parser::{node_in_pool, parse_org, Node};
///
/// let ret_parse = parse_org("* Hello world!\n");
/// let heading_expr: &Node = node_in_pool!(ret_parse, Heading).unwrap();
/// ```
#[macro_export]
macro_rules! node_in_pool {
    ($parsed: ident, $name: ident) => {
        $parsed
            .pool
            .iter()
            .find(|node| matches!(&node.obj, $crate::Expr::$name(_)))
    };
}