// org_rust_parser/object/link.rs

1use std::borrow::Cow;
2use std::fmt::Display;
3
4use phf::phf_set;
5
6use crate::constants::{
7    BACKSLASH, COLON, HYPHEN, LANGLE, LBRACK, LPAREN, POUND, RANGLE, RBRACK, RPAREN, SLASH,
8};
9use crate::node_pool::NodeID;
10use crate::parse::parse_object;
11use crate::types::{Cursor, MarkupKind, MatchError, ParseOpts, Parseable, Parser, Result};
12use crate::utils::Match;
13
/// Link type strings accepted as the PROTOCOL part of plain and angle links.
///
/// The matchers in this file try these one after another against the input.
const ORG_LINK_PARAMETERS: [&str; 9] = [
    "shell", "news", "mailto", "https", "http", "ftp", "help", "file", "elisp",
];
17
18// file types we can wrap an `img` around
19static IMAGE_TYPES: phf::Set<&str> = phf_set! {
20    "jpeg",
21    "jpg",
22    "png",
23    "gif",
24    "svg",
25    "webp",
26};
27
28#[derive(Debug, Clone)]
29pub struct RegularLink<'a> {
30    pub path: Match<PathReg<'a>>,
31    // One or more objects enclosed by square brackets.
32    // It can contain the minimal set of objects as well as export snippets,
33    // inline babel calls, inline source blocks, macros, and statistics cookies.
34    // It can also contain another link, but only when it is a plain or angle link.
35    // It can contain square brackets, so long as they are balanced.
36    pub description: Option<Vec<NodeID>>,
37    // Captions will be filled in later. affiliated keywords are defined first, so the caption
38    // checks if the child is a link and then sticks it in
39    pub caption: Option<NodeID>,
40}
41
42impl Display for PathReg<'_> {
43    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
44        match self {
45            PathReg::PlainLink(link) => {
46                f.write_fmt(format_args!("{}:{}", link.protocol, link.path))
47            }
48            PathReg::Id(inner) => f.write_fmt(format_args!("id:{inner}")),
49            PathReg::CustomId(inner) => f.write_fmt(format_args!("#{inner}")),
50            PathReg::Coderef(inner) => f.write_fmt(format_args!("({inner})")),
51            PathReg::Unspecified(inner) => f.write_fmt(format_args!("{inner}")),
52            PathReg::File(inner) => f.write_fmt(format_args!("file:{inner}")),
53        }
54    }
55}
56
/// A `PROTOCOL:PATH` link, as produced by the plain-link and angle-link parsers.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PlainLink<'a> {
    /// The link type, i.e. the text before the colon (e.g. `https`).
    pub protocol: Cow<'a, str>,
    /// Everything after the colon (e.g. `//swag.org`).
    pub path: Cow<'a, str>,
}
62
63impl From<&PlainLink<'_>> for String {
64    fn from(value: &PlainLink) -> Self {
65        format!("{}:{}", value.protocol, value.path)
66    }
67}
68
69/// Enum representing various file types
70#[derive(Debug, Clone)]
71pub enum PathReg<'a> {
72    PlainLink(PlainLink<'a>),
73    Id(&'a str),
74    /// allows changing the name of the exported id
75    CustomId(&'a str),
76    /// allows linking to specific lines in code blocks
77    Coderef(&'a str),
78    File(Cow<'a, str>),
79    Unspecified(Cow<'a, str>),
80    // We can't determine while parsing whether we point to a headline
81    // or a filename (we don't track headlines while building)
82    // leave it to the exporter.
83    // FileName(&'a Path),
84    // Fuzzy(&'a str),
85}
86
87impl<'a> PathReg<'a> {
88    fn new(cursor: Cursor<'a>) -> Self {
89        match cursor.curr() {
90            b'i' => {
91                if let Ok(id) = PathReg::parse_id(cursor) {
92                    return PathReg::Id(id);
93                } else if let Ok(link) = parse_plain_link(cursor) {
94                    return PathReg::PlainLink(link.obj);
95                }
96            }
97            b'f' => {
98                if let Ok(file_path) = PathReg::parse_file(cursor) {
99                    return PathReg::File(file_path.into());
100                } else if let Ok(link) = parse_plain_link(cursor) {
101                    return PathReg::PlainLink(link.obj);
102                }
103            }
104            POUND => {
105                // custom-id
106                return PathReg::CustomId(cursor.clamp(cursor.index + 1, cursor.len()));
107            }
108            LPAREN => {
109                // FIXME: breaks on ()
110                if cursor[cursor.len() - 1] == RPAREN {
111                    return PathReg::Coderef(cursor.clamp(cursor.index + 1, cursor.len()));
112                }
113            }
114            chr => {
115                if let Ok(link) = parse_plain_link(cursor) {
116                    return PathReg::PlainLink(link.obj);
117                }
118            }
119        }
120        // unspecified:
121        // We can't determine while parsing whether we point to a headline
122        // or a filename (we don't track headlines while building)
123        // leave it to the exporter.
124        // FileName(&'a Path),
125        // Fuzzy(&'a str),
126        return PathReg::Unspecified(cursor.clamp_forwards(cursor.len()).into());
127    }
128
129    fn parse_id(mut cursor: Cursor<'a>) -> Result<&'a str> {
130        cursor.word("id:")?;
131        let begin_id = cursor.index;
132
133        while let Ok(num) = cursor.try_curr() {
134            if !num.is_ascii_hexdigit() || num == HYPHEN {
135                return Err(MatchError::InvalidLogic);
136            }
137            cursor.next();
138        }
139
140        Ok(cursor.clamp_backwards(begin_id))
141    }
142
143    fn parse_file(mut cursor: Cursor<'a>) -> Result<&'a str> {
144        cursor.word("file:")?;
145        let begin_id = cursor.index;
146
147        while let Ok(num) = cursor.try_curr() {
148            cursor.next();
149        }
150
151        Ok(cursor.clamp_backwards(begin_id))
152    }
153}
154
155impl<'a> Parseable<'a> for RegularLink<'a> {
156    fn parse(
157        parser: &mut Parser<'a>,
158        mut cursor: Cursor<'a>,
159        parent: Option<NodeID>,
160        mut parse_opts: ParseOpts,
161    ) -> Result<NodeID> {
162        let start = cursor.index;
163        cursor.word("[[")?;
164
165        // find backslash
166        loop {
167            match cursor.try_curr()? {
168                BACKSLASH => {
169                    // check for escaped char, and skip past it
170                    if let BACKSLASH | LBRACK | RBRACK = cursor.peek(1)? {
171                        cursor.advance(2);
172                    } else {
173                        return Err(MatchError::InvalidLogic);
174                    }
175                }
176                RBRACK => {
177                    // handles the  [[][]]  case, would panic without this check
178                    if cursor.index == start + 2 {
179                        return Err(MatchError::InvalidLogic);
180                    }
181
182                    if LBRACK == cursor.peek(1)? {
183                        let path_reg_end = cursor.index;
184
185                        // skip ][
186                        cursor.advance(2);
187                        parse_opts.from_object = false;
188                        parse_opts.markup.insert(MarkupKind::Link);
189
190                        let mut content_vec: Vec<NodeID> = Vec::new();
191                        loop {
192                            match parse_object(parser, cursor, parent, parse_opts) {
193                                Ok(id) => {
194                                    cursor.index = parser.pool[id].end;
195                                    content_vec.push(id);
196                                }
197                                Err(MatchError::MarkupEnd(kind)) => {
198                                    if !kind.contains(MarkupKind::Link) {
199                                        // TODO: cache and explode
200                                        return Err(MatchError::InvalidLogic);
201                                    }
202
203                                    let reg_curs = cursor.clamp_off(start + 2, path_reg_end);
204                                    let pathreg = Match {
205                                        start: start + 2,
206                                        end: path_reg_end,
207                                        obj: PathReg::new(reg_curs),
208                                    };
209
210                                    // set parents of children
211                                    // TODO: abstract this? stolen from markup.rs
212                                    let new_id = parser.pool.reserve_id();
213                                    for id in &mut content_vec {
214                                        parser.pool[*id].parent = Some(new_id);
215                                    }
216
217                                    return Ok(parser.alloc_with_id(
218                                        Self {
219                                            path: pathreg,
220                                            description: Some(content_vec),
221                                            caption: None,
222                                        },
223                                        start,
224                                        cursor.index + 2, // link end is 2 bytes long
225                                        parent,
226                                        new_id,
227                                    ));
228                                }
229                                ret @ Err(_) => return ret,
230                            }
231                        }
232                    } else if RBRACK == cursor.peek(1)? {
233                        // close object;
234
235                        let reg_curs = cursor.clamp_off(start + 2, cursor.index);
236                        let pathreg = Match {
237                            start: start + 2,
238                            end: cursor.index,
239                            obj: PathReg::new(reg_curs),
240                        };
241
242                        return Ok(parser.alloc(
243                            Self {
244                                path: pathreg,
245                                description: None,
246                                caption: None,
247                            },
248                            start,
249                            cursor.index + 2,
250                            parent,
251                        ));
252                    } else {
253                        return Err(MatchError::InvalidLogic);
254                    }
255                }
256                _ => {}
257            }
258            cursor.next();
259        }
260    }
261}
262
263impl RegularLink<'_> {
264    pub fn is_image(&self, parser: &Parser) -> bool {
265        let link_source: &str = match &self.path.obj {
266            PathReg::Unspecified(inner) => inner,
267            PathReg::File(inner) => inner,
268            PathReg::PlainLink(inner) => &inner.path,
269            _ => {
270                // HACK: we just want to jump outta here, everything else doesnt make sense
271                // in an image context
272                ""
273            }
274        };
275        link_source
276            .rsplit_once('.') // extract extension_type
277            .map(|(_, ext)| ext)
278            .is_some_and(|ext| IMAGE_TYPES.contains(ext))
279    }
280}
281
282// REVIEW:
283// apparently a word constituent..isn't undescore??
284// https://www.gnu.org/software/emacs/manual/html_node/elisp/Syntax-Class-Table.html
285// Parts of words in human languages.
286// These are typically used in variable and command names in programs.
287// All upper- and lower-case letters, and the digits, are typically word constituents.
288
289/// PROTOCOL
290/// A string which is one of the link type strings in org-link-parameters.
291///
292/// PATHPLAIN
293/// A string containing any non-whitespace character but (, ), <, or >.
294/// It must end with a word-constituent character,
295/// or any non-whitespace non-punctuation character followed by /.
296// Word-constituent characters are letters, digits, and the underscore.
297// source: https://www.gnu.org/software/grep/manual/grep.html
298pub(crate) fn parse_plain_link(mut cursor: Cursor<'_>) -> Result<Match<PlainLink<'_>>> {
299    if let Ok(pre_byte) = cursor.peek_rev(1)
300        && pre_byte.is_ascii_alphanumeric()
301    {
302        return Err(MatchError::InvalidLogic);
303    }
304
305    let start = cursor.index;
306
307    for (i, &protocol) in ORG_LINK_PARAMETERS.iter().enumerate() {
308        // DO NOT read up to the colon and use phf_set to determine if it's a protocol
309        // cause the colon might be in the middle-a-nowhere if we're parsing regular text here
310        if cursor.word(protocol).is_ok() {
311            if cursor.try_curr()? == COLON {
312                cursor.next();
313                let path_start = cursor.index;
314                // let pre
315
316                while let Ok(byte) = cursor.try_curr() {
317                    match byte {
318                        RANGLE | LPAREN | RPAREN | LANGLE | b'\t' | b'\n' | b'\x0C' | b'\r'
319                        | b' ' => {
320                            break;
321                        }
322                        // RANGLE => break,
323                        _ => {
324                            cursor.next();
325                        }
326                    }
327                }
328
329                let last_link_byte = cursor[cursor.index - 1];
330                // if no progress was made, i.e. just PROTOCOL (https://):
331
332                // rewind until we end with an alphanumeric char or SLASH
333                //
334                // so:
335                // https://abc.org...___
336                // would only get: https://abc.org
337                //
338                // if you do something like https://onea/a/aaaa/,,,,,/
339                // then i think that breaks the definition, cause the slash isn't after a non-punc char,,
340                // but also if you do that then you're just being difficult.
341
342                while !cursor.peek_rev(1)?.is_ascii_alphanumeric() && cursor.peek_rev(1)? != SLASH {
343                    cursor.prev();
344                    if cursor.index <= path_start {
345                        return Err(MatchError::InvalidLogic);
346                    }
347                }
348
349                if if let Ok(future_byte) = cursor.try_curr() {
350                    !future_byte.is_ascii_alphanumeric()
351                } else {
352                    true
353                } {
354                    return Ok(Match {
355                        start,
356                        end: cursor.index,
357                        obj: PlainLink {
358                            protocol: protocol.into(),
359                            path: cursor.clamp_backwards(path_start).into(),
360                        },
361                    });
362                } else {
363                    return Err(MatchError::EofError);
364                }
365            } else {
366                cursor.index -= protocol.len();
367            }
368        }
369    }
370
371    Err(MatchError::InvalidLogic)
372}
373
374pub(crate) fn parse_angle_link<'a>(
375    parser: &mut Parser<'a>,
376    mut cursor: Cursor<'a>,
377    parent: Option<NodeID>,
378    parse_opts: ParseOpts,
379) -> Result<NodeID> {
380    let start = cursor.index;
381
382    cursor.next();
383
384    for (i, &protocol) in ORG_LINK_PARAMETERS.iter().enumerate() {
385        if cursor.word(protocol).is_ok() {
386            if cursor.try_curr()? == COLON {
387                cursor.next();
388                let path_start = cursor.index;
389                while let Ok(byte) = cursor.try_curr() {
390                    match byte {
391                        RBRACK | LANGLE | b'\n' => return Err(MatchError::InvalidLogic),
392                        RANGLE => break,
393                        _ => {
394                            cursor.next();
395                        }
396                    }
397                }
398
399                // <PROTOCOL:> is valid, don't need to check indices
400
401                return Ok(parser.alloc(
402                    PlainLink {
403                        protocol: protocol.into(),
404                        path: cursor.clamp_backwards(path_start).into(),
405                    },
406                    start,
407                    cursor.index + 1, // skip rangle
408                    parent,
409                ));
410            } else {
411                cursor.index -= protocol.len();
412            }
413        }
414    }
415
416    Err(MatchError::InvalidLogic)
417}
418
#[cfg(test)]
mod tests {
    use pretty_assertions::assert_eq;

    use crate::element::Affiliated;
    use crate::expr_in_pool;
    use crate::object::{PlainLink, RegularLink};
    use crate::parse_org;
    use crate::types::Expr;

    /// A bare URL on its own is picked up as a plain link.
    #[test]
    fn basic_plain_link() {
        let input = "https://swag.org";
        let pool = parse_org(input);
        let link = expr_in_pool!(pool, PlainLink).unwrap();
        assert_eq!(
            link,
            &PlainLink {
                protocol: "https".into(),
                path: "//swag.org".into()
            }
        )
    }

    /// `http` is a prefix of `https`; both are protocols and must not be confused.
    #[test]
    fn plain_link_subprotocol() {
        let input = "http://swag.org";
        let pool = parse_org(input);
        let link = expr_in_pool!(pool, PlainLink).unwrap();
        assert_eq!(
            link,
            &PlainLink {
                protocol: "http".into(),
                path: "//swag.org".into()
            }
        )
    }

    /// Text following the link after a space is not part of the path.
    #[test]
    fn plain_link_after() {
        let input = "http://swag.com meow";
        let pool = parse_org(input);
        let link = expr_in_pool!(pool, PlainLink).unwrap();
        assert_eq!(
            link,
            &PlainLink {
                protocol: "http".into(),
                path: "//swag.com".into()
            }
        )
    }

    /// Surrounding whitespace is excluded from the link.
    #[test]
    fn plain_link_ws_end() {
        let input = "  mailto:swag@cool.com   ";
        let pool = parse_org(input);
        let link = expr_in_pool!(pool, PlainLink).unwrap();

        assert_eq!(
            link,
            &PlainLink {
                protocol: "mailto".into(),
                path: "swag@cool.com".into()
            }
        )
    }

    /// Trailing underscores and punctuation are trimmed off the end of the path.
    #[test]
    fn plain_link_word_constituent() {
        let input = "  https://one_two_three_https______..............~~~!   ";
        let pool = parse_org(input);
        let link = expr_in_pool!(pool, PlainLink).unwrap();

        assert_eq!(
            link,
            &PlainLink {
                protocol: "https".into(),
                path: "//one_two_three_https".into()
            }
        )
    }

    /// A path may end in `/`; only what follows the slash is trimmed.
    #[test]
    fn plain_link_word_constituent_slash() {
        let input = "  https://one_two_three_https______/..............~~~!   ";
        let pool = parse_org(input);
        let link = expr_in_pool!(pool, PlainLink).unwrap();

        assert_eq!(
            link,
            &PlainLink {
                protocol: "https".into(),
                path: "//one_two_three_https______/".into()
            }
        )
    }

    /// Inside angle brackets the path may contain spaces and punctuation.
    #[test]
    fn basic_angle_link() {
        let input = "  <https://one two  !!@#!OIO DJDFK Jk> ";
        let pool = parse_org(input);
        let link = expr_in_pool!(pool, PlainLink).unwrap();

        assert_eq!(
            link,
            &PlainLink {
                protocol: "https".into(),
                path: "//one two  !!@#!OIO DJDFK Jk".into()
            }
        )
    }

    /// Smoke test: a regular link with an unknown protocol parses without panicking.
    #[test]
    fn basic_regular_link() {
        let input = "[[hps://.org]]";
        let pool = parse_org(input);
        pool.print_tree();
    }

    /// Smoke test: a single-bracket construct is handled without panicking.
    #[test]
    fn regular_link_malformed() {
        let input = "
word
[#A]
";
        let pool = parse_org(input);
        pool.print_tree();
    }

    /// Smoke test: a regular link with a plain-text description.
    #[test]
    fn regular_link_description() {
        let input = " [[https://meo][cool site]]";
        let pool = parse_org(input);
        pool.print_tree();
    }

    /// Smoke test: a description containing an unclosed bold marker.
    #[test]
    fn regular_link_unclosed_recursive_markup() {
        let input = " [[https://meo][cool *site* ~one two~ three *four ]]";
        let pool = parse_org(input);
        pool.print_tree();
    }

    /// Smoke test: a description containing unclosed bold and code markers.
    #[test]
    fn regular_link_unclosed_plain_markup() {
        let input = " [[https://meo][cool *site* ~one two~ three *four ~five six ]]";
        let pool = parse_org(input);
        pool.print_tree();
    }

    /// Smoke test: a `file:` link between a paragraph and a heading.
    #[test]
    fn file_link() {
        let input = r"
I'll be skipping over the instrumentals unless there's reason to.

[[file:bmc.jpg]]
** songs
";

        let pool = parse_org(input);
        pool.print_tree();
    }

    /// A `#+caption:` keyword preceding a link ends up in the link's `caption`.
    #[test]
    fn caption_link() {
        let input = r"
#+caption: sing song
[[heathers.jpg]]

";

        let parser = parse_org(input);
        let image_link = expr_in_pool!(parser, RegularLink).unwrap();

        let Some(cap_id) = image_link.caption else {
            panic!()
        };
        let Expr::Affiliated(Affiliated::Caption(aff)) = &parser.pool[cap_id].obj else {
            panic!()
        };
        let Expr::Paragraph(par) = &parser.pool[*aff].obj else {
            panic!()
        };
        let Expr::Plain(text) = parser.pool[par.0[0]].obj else {
            panic!()
        };

        // REVIEW: does the cap need to be trimmed
        assert_eq!(text, " sing song");
    }
}
// org_rust_parser/object/link.rs