org_rust_parser/object/
link.rs

1use std::borrow::Cow;
2use std::fmt::Display;
3
4use crate::constants::{
5    BACKSLASH, COLON, HYPHEN, LANGLE, LBRACK, LPAREN, POUND, RANGLE, RBRACK, RPAREN, SLASH,
6};
7use crate::node_pool::NodeID;
8use crate::parse::parse_object;
9use crate::types::{Cursor, MarkupKind, MatchError, ParseOpts, Parseable, Parser, Result};
10use crate::utils::Match;
11
/// Link type prefixes (the part before the `:`) recognised by the plain and
/// angle link parsers in this file. The parsers try them in the order listed.
const ORG_LINK_PARAMETERS: [&str; 9] = [
    "shell",
    "news",
    "mailto",
    "https",
    "http",
    "ftp",
    "help",
    "file",
    "elisp",
];
15
16#[derive(Debug, Clone)]
17pub struct RegularLink<'a> {
18    pub path: Match<PathReg<'a>>,
19    // One or more objects enclosed by square brackets.
20    // It can contain the minimal set of objects as well as export snippets,
21    // inline babel calls, inline source blocks, macros, and statistics cookies.
22    // It can also contain another link, but only when it is a plain or angle link.
23    // It can contain square brackets, so long as they are balanced.
24    pub description: Option<Vec<NodeID>>,
25}
26
27impl Display for PathReg<'_> {
28    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
29        match self {
30            PathReg::PlainLink(link) => {
31                f.write_fmt(format_args!("{}:{}", link.protocol, link.path))
32            }
33            PathReg::Id(inner) => f.write_fmt(format_args!("id:{inner}")),
34            PathReg::CustomId(inner) => f.write_fmt(format_args!("#{inner}")),
35            PathReg::Coderef(inner) => f.write_fmt(format_args!("({inner})")),
36            PathReg::Unspecified(inner) => f.write_fmt(format_args!("{inner}")),
37            PathReg::File(inner) => f.write_fmt(format_args!("file:{inner}")),
38        }
39    }
40}
41
/// A `PROTOCOL:PATH` link, as produced by the plain (`https://…`) and
/// angle (`<https://…>`) link parsers.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PlainLink<'a> {
    /// The link type, i.e. the text before the colon (`https`, `mailto`, …).
    pub protocol: Cow<'a, str>,
    /// Everything after the colon (`//swag.org` for `https://swag.org`).
    pub path: Cow<'a, str>,
}
47
48impl From<&PlainLink<'_>> for String {
49    fn from(value: &PlainLink) -> Self {
50        format!("{}:{}", value.protocol, value.path)
51    }
52}
53
54/// Enum representing various file types
55#[derive(Debug, Clone)]
56pub enum PathReg<'a> {
57    PlainLink(PlainLink<'a>),
58    Id(&'a str),
59    /// allows changing the name of the exported id
60    CustomId(&'a str),
61    /// allows linking to specific lines in code blocks
62    Coderef(&'a str),
63    File(Cow<'a, str>),
64    Unspecified(Cow<'a, str>),
65    // We can't determine while parsing whether we point to a headline
66    // or a filename (we don't track headlines while building)
67    // leave it to the exporter.
68    // FileName(&'a Path),
69    // Fuzzy(&'a str),
70}
71
72impl<'a> PathReg<'a> {
73    fn new(cursor: Cursor<'a>) -> Self {
74        match cursor.curr() {
75            b'i' => {
76                if let Ok(id) = PathReg::parse_id(cursor) {
77                    return PathReg::Id(id);
78                } else if let Ok(link) = parse_plain_link(cursor) {
79                    return PathReg::PlainLink(link.obj);
80                }
81            }
82            b'f' => {
83                if let Ok(file_path) = PathReg::parse_file(cursor) {
84                    return PathReg::File(file_path.into());
85                } else if let Ok(link) = parse_plain_link(cursor) {
86                    return PathReg::PlainLink(link.obj);
87                }
88            }
89            POUND => {
90                // custom-id
91                return PathReg::CustomId(cursor.clamp(cursor.index + 1, cursor.len()));
92            }
93            LPAREN => {
94                // FIXME: breaks on ()
95                if cursor[cursor.len() - 1] == RPAREN {
96                    return PathReg::Coderef(cursor.clamp(cursor.index + 1, cursor.len()));
97                }
98            }
99            chr => {
100                if let Ok(link) = parse_plain_link(cursor) {
101                    return PathReg::PlainLink(link.obj);
102                }
103            }
104        }
105        // unspecified:
106        // We can't determine while parsing whether we point to a headline
107        // or a filename (we don't track headlines while building)
108        // leave it to the exporter.
109        // FileName(&'a Path),
110        // Fuzzy(&'a str),
111        return PathReg::Unspecified(cursor.clamp_forwards(cursor.len()).into());
112    }
113
114    fn parse_id(mut cursor: Cursor<'a>) -> Result<&'a str> {
115        cursor.word("id:")?;
116        let begin_id = cursor.index;
117
118        while let Ok(num) = cursor.try_curr() {
119            if !num.is_ascii_hexdigit() || num == HYPHEN {
120                return Err(MatchError::InvalidLogic);
121            }
122            cursor.next();
123        }
124
125        return Ok(cursor.clamp_backwards(begin_id));
126    }
127
128    fn parse_file(mut cursor: Cursor<'a>) -> Result<&'a str> {
129        cursor.word("file:")?;
130        let begin_id = cursor.index;
131
132        while let Ok(num) = cursor.try_curr() {
133            cursor.next();
134        }
135
136        return Ok(cursor.clamp_backwards(begin_id));
137    }
138}
139
140impl<'a> Parseable<'a> for RegularLink<'a> {
141    fn parse(
142        parser: &mut Parser<'a>,
143        mut cursor: Cursor<'a>,
144        parent: Option<NodeID>,
145        mut parse_opts: ParseOpts,
146    ) -> Result<NodeID> {
147        let start = cursor.index;
148        cursor.word("[[")?;
149
150        // find backslash
151        loop {
152            match cursor.try_curr()? {
153                BACKSLASH => {
154                    // check for escaped char, and skip past it
155                    if let BACKSLASH | LBRACK | RBRACK = cursor.peek(1)? {
156                        cursor.advance(2);
157                    } else {
158                        return Err(MatchError::InvalidLogic);
159                    }
160                }
161                RBRACK => {
162                    // handles the  [[][]]  case, would panic without this check
163                    if cursor.index == start + 2 {
164                        return Err(MatchError::InvalidLogic);
165                    }
166
167                    if LBRACK == cursor.peek(1)? {
168                        let path_reg_end = cursor.index;
169
170                        // skip ][
171                        cursor.advance(2);
172                        parse_opts.from_object = false;
173                        parse_opts.markup.insert(MarkupKind::Link);
174
175                        let mut content_vec: Vec<NodeID> = Vec::new();
176                        loop {
177                            match parse_object(parser, cursor, parent, parse_opts) {
178                                Ok(id) => {
179                                    cursor.index = parser.pool[id].end;
180                                    content_vec.push(id);
181                                }
182                                Err(MatchError::MarkupEnd(kind)) => {
183                                    if !kind.contains(MarkupKind::Link) {
184                                        // TODO: cache and explode
185                                        return Err(MatchError::InvalidLogic);
186                                    }
187
188                                    let reg_curs = cursor.clamp_off(start + 2, path_reg_end);
189                                    let pathreg = Match {
190                                        start: start + 2,
191                                        end: path_reg_end,
192                                        obj: PathReg::new(reg_curs),
193                                    };
194
195                                    // set parents of children
196                                    // TODO: abstract this? stolen from markup.rs
197                                    let new_id = parser.pool.reserve_id();
198                                    for id in &mut content_vec {
199                                        parser.pool[*id].parent = Some(new_id);
200                                    }
201
202                                    return Ok(parser.alloc_with_id(
203                                        Self {
204                                            path: pathreg,
205                                            description: Some(content_vec),
206                                        },
207                                        start,
208                                        cursor.index + 2, // link end is 2 bytes long
209                                        parent,
210                                        new_id,
211                                    ));
212                                }
213                                ret @ Err(_) => return ret,
214                            }
215                        }
216                    } else if RBRACK == cursor.peek(1)? {
217                        // close object;
218
219                        let reg_curs = cursor.clamp_off(start + 2, cursor.index);
220                        let pathreg = Match {
221                            start: start + 2,
222                            end: cursor.index,
223                            obj: PathReg::new(reg_curs),
224                        };
225
226                        return Ok(parser.alloc(
227                            Self {
228                                path: pathreg,
229                                description: None,
230                            },
231                            start,
232                            cursor.index + 2,
233                            parent,
234                        ));
235                    } else {
236                        return Err(MatchError::InvalidLogic);
237                    }
238                }
239                _ => {}
240            }
241            cursor.next();
242        }
243    }
244}
245
// REVIEW:
// Apparently the underscore is *not* a word constituent in Emacs?
// https://www.gnu.org/software/emacs/manual/html_node/elisp/Syntax-Class-Table.html
// > Parts of words in human languages.
// > These are typically used in variable and command names in programs.
// > All upper- and lower-case letters, and the digits, are typically word constituents.
252
253/// PROTOCOL
254/// A string which is one of the link type strings in org-link-parameters.
255///
256/// PATHPLAIN
257/// A string containing any non-whitespace character but (, ), <, or >.
258/// It must end with a word-constituent character,
259/// or any non-whitespace non-punctuation character followed by /.
260// Word-constituent characters are letters, digits, and the underscore.
261// source: https://www.gnu.org/software/grep/manual/grep.html
262pub(crate) fn parse_plain_link(mut cursor: Cursor<'_>) -> Result<Match<PlainLink<'_>>> {
263    if let Ok(pre_byte) = cursor.peek_rev(1) {
264        if pre_byte.is_ascii_alphanumeric() {
265            return Err(MatchError::InvalidLogic);
266        }
267    }
268    let start = cursor.index;
269
270    for (i, &protocol) in ORG_LINK_PARAMETERS.iter().enumerate() {
271        // DO NOT read up to the colon and use phf_set to determine if it's a protocol
272        // cause the colon might be in the middle-a-nowhere if we're parsing regular text here
273        if cursor.word(protocol).is_ok() {
274            if cursor.try_curr()? == COLON {
275                cursor.next();
276                let path_start = cursor.index;
277                // let pre
278
279                while let Ok(byte) = cursor.try_curr() {
280                    match byte {
281                        RANGLE | LPAREN | RPAREN | LANGLE | b'\t' | b'\n' | b'\x0C' | b'\r'
282                        | b' ' => {
283                            break;
284                        }
285                        // RANGLE => break,
286                        _ => {
287                            cursor.next();
288                        }
289                    }
290                }
291
292                let last_link_byte = cursor[cursor.index - 1];
293                // if no progress was made, i.e. just PROTOCOL (https://):
294
295                // rewind until we end with an alphanumeric char or SLASH
296                //
297                // so:
298                // https://abc.org...___
299                // would only get: https://abc.org
300                //
301                // if you do something like https://onea/a/aaaa/,,,,,/
302                // then i think that breaks the definition, cause the slash isn't after a non-punc char,,
303                // but also if you do that then you're just being difficult.
304
305                while !cursor.peek_rev(1)?.is_ascii_alphanumeric() && cursor.peek_rev(1)? != SLASH {
306                    cursor.prev();
307                    if cursor.index <= path_start {
308                        return Err(MatchError::InvalidLogic);
309                    }
310                }
311
312                if if let Ok(future_byte) = cursor.try_curr() {
313                    !future_byte.is_ascii_alphanumeric()
314                } else {
315                    true
316                } {
317                    return Ok(Match {
318                        start,
319                        end: cursor.index,
320                        obj: PlainLink {
321                            protocol: protocol.into(),
322                            path: cursor.clamp_backwards(path_start).into(),
323                        },
324                    });
325                } else {
326                    return Err(MatchError::EofError);
327                }
328            } else {
329                cursor.index -= protocol.len();
330            }
331        }
332    }
333
334    Err(MatchError::InvalidLogic)
335}
336
337pub(crate) fn parse_angle_link<'a>(
338    parser: &mut Parser<'a>,
339    mut cursor: Cursor<'a>,
340    parent: Option<NodeID>,
341    parse_opts: ParseOpts,
342) -> Result<NodeID> {
343    let start = cursor.index;
344
345    cursor.next();
346
347    for (i, &protocol) in ORG_LINK_PARAMETERS.iter().enumerate() {
348        if cursor.word(protocol).is_ok() {
349            if cursor.try_curr()? == COLON {
350                cursor.next();
351                let path_start = cursor.index;
352                while let Ok(byte) = cursor.try_curr() {
353                    match byte {
354                        RBRACK | LANGLE | b'\n' => return Err(MatchError::InvalidLogic),
355                        RANGLE => break,
356                        _ => {
357                            cursor.next();
358                        }
359                    }
360                }
361
362                // <PROTOCOL:> is valid, don't need to check indices
363
364                return Ok(parser.alloc(
365                    PlainLink {
366                        protocol: protocol.into(),
367                        path: cursor.clamp_backwards(path_start).into(),
368                    },
369                    start,
370                    cursor.index + 1, // skip rangle
371                    parent,
372                ));
373            } else {
374                cursor.index -= protocol.len();
375            }
376        }
377    }
378
379    Err(MatchError::InvalidLogic)
380}
381
#[cfg(test)]
mod tests {
    use pretty_assertions::assert_eq;

    use crate::expr_in_pool;
    use crate::object::PlainLink;
    use crate::parse_org;
    use crate::types::Expr;

    /// Parses `input` and asserts that the pool contains a plain link with
    /// the given protocol and path.
    fn assert_plain_link(input: &str, protocol: &str, path: &str) {
        let parsed = parse_org(input);
        let l = expr_in_pool!(parsed, PlainLink).unwrap();
        assert_eq!(
            l,
            &PlainLink {
                protocol: protocol.into(),
                path: path.into()
            }
        );
    }

    /// Parses `input` and prints the resulting tree; these tests only check
    /// that parsing does not panic.
    fn parse_and_print(input: &str) {
        let pool = parse_org(input);
        pool.print_tree();
    }

    #[test]
    fn basic_plain_link() {
        assert_plain_link("https://swag.org", "https", "//swag.org");
    }

    #[test]
    fn plain_link_subprotocol() {
        // http and https are protocols
        assert_plain_link("http://swag.org", "http", "//swag.org");
    }

    #[test]
    fn plain_link_after() {
        assert_plain_link("http://swag.com meow", "http", "//swag.com");
    }

    #[test]
    fn plain_link_ws_end() {
        assert_plain_link("  mailto:swag@cool.com   ", "mailto", "swag@cool.com");
    }

    #[test]
    fn plain_link_word_constituent() {
        // trailing underscores and punctuation are trimmed off the path
        assert_plain_link(
            "  https://one_two_three_https______..............~~~!   ",
            "https",
            "//one_two_three_https",
        );
    }

    #[test]
    fn plain_link_word_constituent_slash() {
        // a slash is a valid last character, so trimming stops there
        assert_plain_link(
            "  https://one_two_three_https______/..............~~~!   ",
            "https",
            "//one_two_three_https______/",
        );
    }

    #[test]
    fn basic_angle_link() {
        // angle links may contain whitespace and punctuation
        assert_plain_link(
            "  <https://one two  !!@#!OIO DJDFK Jk> ",
            "https",
            "//one two  !!@#!OIO DJDFK Jk",
        );
    }

    #[test]
    fn basic_regular_link() {
        parse_and_print("[[hps://.org]]");
    }

    #[test]
    fn regular_link_malformed() {
        parse_and_print(
            "
word
[#A]
",
        );
    }

    #[test]
    fn regular_link_description() {
        parse_and_print(" [[https://meo][cool site]]");
    }

    #[test]
    fn regular_link_unclosed_recursive_markup() {
        parse_and_print(" [[https://meo][cool *site* ~one two~ three *four ]]");
    }

    #[test]
    fn regular_link_unclosed_plain_markup() {
        parse_and_print(" [[https://meo][cool *site* ~one two~ three *four ~five six ]]");
    }

    #[test]
    fn file_link() {
        parse_and_print(
            r"
I'll be skipping over the instrumentals unless there's reason to.

[[file:bmc.jpg]]
** songs
",
        );
    }
}