readme_sync/
cmark_data.rs

1use core::slice::Iter;
2use std::borrow::Cow;
3use std::string::String;
4use std::sync::Arc;
5use std::vec::Vec;
6
7use pulldown_cmark::Event;
8use thiserror::Error;
9
10use crate::{CMarkItem, File, FileDocs, TextSource};
11
/// A `CMarkItem`s container storing a list of events with multiple transformation functions.
///
/// The items are kept in order inside a `Vec` and each one is held behind an `Arc`,
/// so cloning the container clones the pointers rather than the items themselves.
#[derive(Clone, Debug, Default, PartialEq)]
pub struct CMarkData(Vec<Arc<CMarkItem>>);
15
/// A `CMarkItem`s container iterator.
///
/// This is the borrowing slice iterator returned by `CMarkData::iter`;
/// it yields `&Arc<CMarkItem>` values in container order.
pub type CMarkDataIter<'a> = Iter<'a, Arc<CMarkItem>>;
18
19impl CMarkData {
20    /// Creates `CMarkData` from `CMarkItem`s.
21    pub fn from_items(items: Vec<Arc<CMarkItem>>) -> Self {
22        Self(items)
23    }
24
25    /// Creates `CMarkData` from the specified `File`.
26    pub fn from_file(file: Arc<File>) -> Self {
27        Self::from_text_source(TextSource::File(file))
28    }
29
30    /// Creates `CMarkData` from the specified `FileDocs`.
31    pub fn from_file_docs(file_docs: Arc<FileDocs>) -> Self {
32        Self::from_text_source(TextSource::FileDocs(file_docs))
33    }
34
35    /// Creates `CMarkData` from the specified `TextSource`.
36    pub fn from_text_source(text_source: TextSource) -> Self {
37        use crate::IntoStatic;
38        use pulldown_cmark::Parser;
39
40        let text = match &text_source {
41            TextSource::File(file) => file.text(),
42            TextSource::FileDocs(file_docs) => file_docs.docs(),
43        };
44
45        Self(
46            Parser::new(text)
47                .into_offset_iter()
48                .map(|(event, range)| {
49                    CMarkItem::from(event.into_static(), range, text_source.clone())
50                })
51                .collect(),
52        )
53        .concat_texts()
54    }
55
56    /// Consumes the `CMarkData`, returning `CMarkItem`s.
57    pub fn into_items(self) -> Vec<Arc<CMarkItem>> {
58        self.0
59    }
60
61    /// Iterate over `CMarkItem`s.
62    pub fn iter(&self) -> CMarkDataIter<'_> {
63        self.0.iter()
64    }
65
66    /// Iterate over pulldown-cmark events.
67    pub fn iter_events(&self) -> impl Iterator<Item = &Event<'_>> {
68        self.0.iter().filter_map(|item| item.event())
69    }
70
71    fn map<F>(self, func: F) -> Self
72    where
73        F: FnMut(Arc<CMarkItem>) -> Arc<CMarkItem>,
74    {
75        Self(self.0.into_iter().map(func).collect())
76    }
77
78    /// Concatenate adjacent text events.
79    ///
80    /// Use this transformation if you deleted some nodes manually
81    /// and want to merge the neighboring text nodes.
82    ///
83    /// This transformation is always applied right after
84    /// readme and docs parsing, because some text events remain ununited.
85    /// For example Rust attribute parser generate seperate text events
86    /// for every line of source code, and pulldown_cmark generate
87    /// seperate text events for character entity reference.
88    pub fn concat_texts(self) -> Self {
89        use core::mem::take;
90
91        let mut result = Vec::new();
92        let mut text_nodes = Vec::new();
93        let mut text_value = String::new();
94
95        for node in self.0.into_iter() {
96            match node.event() {
97                None => {
98                    result.push(node);
99                }
100                Some(Event::Text(event_text)) => {
101                    text_value += event_text;
102                    text_nodes.push(node);
103                }
104                Some(_) => {
105                    if let Some(text_node) =
106                        merge_text_nodes(take(&mut text_nodes), take(&mut text_value))
107                    {
108                        result.push(text_node);
109                    }
110                    result.push(node);
111                }
112            }
113        }
114
115        Self(result)
116    }
117}
118
119fn merge_text_nodes(nodes: Vec<Arc<CMarkItem>>, text: String) -> Option<Arc<CMarkItem>> {
120    use crate::CMarkItemAsModified;
121    use pulldown_cmark::CowStr;
122
123    match nodes.len() {
124        0 => None,
125        1 => Some(nodes.into_iter().next().unwrap()),
126        _ => Some(nodes.into_modified(
127            Event::Text(CowStr::Boxed(text.into_boxed_str())),
128            Cow::from("concat_texts()"),
129        )),
130    }
131}
132
133impl CMarkData {
134    /// Increment levels of all headings.
135    ///
136    /// In readme, the first level heading is usually used only for the project title.
137    /// The second level header is usually used in for text section headings in readme.
138    /// Rustdoc automatically adds the header of a crate name and the first level headers are used for text sections.
139    ///
140    /// So it is necessary to increase the level of all headings in the documentation in order to synchronize the headings.
141    pub fn increment_heading_levels(self) -> Self {
142        use crate::CMarkItemAsModified;
143        use pulldown_cmark::{Tag, TagEnd};
144
145        self.map(|node| {
146            let event = match node.event() {
147                Some(Event::Start(Tag::Heading {
148                    level,
149                    id,
150                    classes,
151                    attrs,
152                })) => Some(Event::Start(Tag::Heading {
153                    level: increase_heading_level(*level),
154                    id: id.clone(),
155                    classes: classes.clone(),
156                    attrs: attrs.clone(),
157                })),
158                Some(Event::End(TagEnd::Heading(level))) => {
159                    Some(Event::End(TagEnd::Heading(increase_heading_level(*level))))
160                }
161                _ => None,
162            };
163            if let Some(event) = event {
164                node.into_modified(event, Cow::from("increment_heading_levels()"))
165            } else {
166                node
167            }
168        })
169    }
170
171    /// Add a first level heading with the specified text.
172    ///
173    /// This function could be useful after heading level incremented.
174    pub fn add_title(self, text: &str) -> Self {
175        use pulldown_cmark::{CowStr, HeadingLevel, Tag, TagEnd};
176        use std::string::ToString;
177
178        let heading = std::vec![
179            CMarkItem::new(
180                Event::Start(Tag::Heading {
181                    level: HeadingLevel::H1,
182                    id: None,
183                    classes: std::vec![],
184                    attrs: std::vec![]
185                }),
186                Cow::from("add_title()")
187            ),
188            CMarkItem::new(
189                Event::Text(CowStr::Boxed(text.to_string().into_boxed_str())),
190                Cow::from("add_title()")
191            ),
192            CMarkItem::new(
193                Event::End(TagEnd::Heading(HeadingLevel::H1)),
194                Cow::from("add_title()")
195            ),
196        ];
197
198        Self(heading.into_iter().chain(self.0).collect())
199    }
200
201    /// Removes first paragraph that contains only images and image-links,
202    /// if the specified predicate returns true when passing image urls to it.
203    #[allow(clippy::match_like_matches_macro)] // requires minimum rustc version 1.42.0
204    pub fn remove_images_only_paragraph<P>(self, mut predicate: P) -> Self
205    where
206        P: FnMut(&[&str]) -> bool,
207    {
208        use crate::CMarkItemAsRemoved;
209        use core::mem::take;
210        use pulldown_cmark::{Tag, TagEnd};
211        use std::string::ToString;
212
213        let mut result = Vec::new();
214        let mut paragraph = Vec::new();
215        let mut image_urls = Vec::new();
216        let mut is_image = false;
217        let mut is_already_removed = false;
218
219        for node in self.0.into_iter() {
220            if is_already_removed {
221                result.push(node);
222                continue;
223            }
224
225            if !paragraph.is_empty() {
226                if is_image {
227                    let event = node.event();
228                    is_image = if let Some(Event::End(TagEnd::Image { .. })) = event {
229                        false
230                    } else {
231                        true
232                    };
233                    paragraph.push(node);
234                } else {
235                    paragraph.push(node);
236                    let node = paragraph.last().unwrap();
237                    let event = node.event();
238                    match event {
239                        Some(Event::End(TagEnd::Paragraph)) => {
240                            let urls: Vec<String> = take(&mut image_urls);
241                            let urls: Vec<&str> = urls.iter().map(|url| url.as_str()).collect();
242                            if !urls.is_empty() && predicate(&urls) {
243                                result
244                                    .push(take(&mut paragraph).into_removed(Cow::from(
245                                        "remove_images_only_paragraphs()",
246                                    )));
247                                is_already_removed = true;
248                            } else {
249                                result.append(&mut paragraph);
250                            }
251                        }
252                        Some(Event::Start(Tag::Image { dest_url, .. })) => {
253                            image_urls.push(dest_url.as_ref().to_string());
254                            is_image = true;
255                        }
256                        Some(Event::Start(Tag::Link { .. }))
257                        | Some(Event::End(TagEnd::Link { .. }))
258                        | Some(Event::SoftBreak)
259                        | None => {}
260                        Some(_) => {
261                            result.append(&mut paragraph);
262                        }
263                    }
264                }
265            } else {
266                let event = node.event();
267                match event {
268                    Some(Event::Start(Tag::Paragraph)) => paragraph.push(node),
269                    _ => result.push(node),
270                }
271            }
272        }
273
274        result.append(&mut paragraph);
275
276        Self(result)
277    }
278
279    /// Removes first paragraph that contains only badges.
280    pub fn remove_badges_paragraph(self) -> Self {
281        let patterns = crate::badge_url_patterns();
282        self.remove_images_only_paragraph(|image_urls| {
283            image_urls
284                .iter()
285                .any(|url| patterns.iter().any(|pattern| pattern.matches(url)))
286        })
287    }
288
289    /// Remove section with the specified heading text and level and its subsections.
290    pub fn remove_section(self, heading: &str, level: u32) -> Self {
291        use core::mem::take;
292        use pulldown_cmark::Tag;
293
294        let mut section = Vec::new();
295        let mut result = Vec::new();
296        let mut is_already_removed = false;
297
298        for node in self.0.into_iter() {
299            if !is_already_removed {
300                let event = node.event();
301                if let Some(Event::Start(Tag::Heading {
302                    level: node_level, ..
303                })) = event
304                {
305                    if heading_level(*node_level) <= level {
306                        let (mut section, is_removed) =
307                            into_removed_section_if_matched(take(&mut section), heading, level);
308                        result.append(&mut section);
309                        is_already_removed = is_removed;
310                    }
311                }
312            }
313            if is_already_removed {
314                result.push(node);
315            } else {
316                section.push(node);
317            }
318        }
319
320        result.append(&mut into_removed_section_if_matched(take(&mut section), heading, level).0);
321
322        Self(result)
323    }
324
325    /// Remove sections with heading `Documentation` and level 2.
326    pub fn remove_documentation_section(self) -> Self {
327        self.remove_section("Documentation", 2)
328    }
329}
330
331fn into_removed_section_if_matched(
332    section: Vec<Arc<CMarkItem>>,
333    heading: &str,
334    level: u32,
335) -> (Vec<Arc<CMarkItem>>, bool) {
336    use crate::CMarkItemAsRemoved;
337    use std::vec;
338
339    if is_matched_section(&section, heading, level) {
340        (
341            vec![section.into_removed(Cow::from(std::format!(
342                "remove_section(name = \"{}\", level = {})",
343                heading,
344                level
345            )))],
346            true,
347        )
348    } else {
349        (section, false)
350    }
351}
352
353fn is_matched_section(section: &[Arc<CMarkItem>], heading: &str, level: u32) -> bool {
354    use pulldown_cmark::Tag;
355
356    let first_event = section.first().and_then(|node| node.event());
357    let second_event = section.get(1).and_then(|node| node.event());
358    if let (
359        Some(Event::Start(Tag::Heading {
360            level: node_level, ..
361        })),
362        Some(Event::Text(node_text)),
363    ) = (first_event, second_event)
364    {
365        heading_level(*node_level) == level && node_text.as_ref() == heading
366    } else {
367        false
368    }
369}
370
371impl CMarkData {
372    /// Returns self if absolute blob links to the specified repository not found,
373    /// otherwise returns an error.
374    pub fn disallow_absolute_blob_links(
375        self,
376        repository_url: &str,
377    ) -> Result<Self, DisallowUrlsWithPrefixError> {
378        self.disallow_urls_with_prefix(&blob_path_prefix(repository_url))
379    }
380
381    /// Returns self if absolute docs links to the specified repository not found,
382    /// otherwise returns an error.
383    pub fn disallow_absolute_docs_links(
384        self,
385        package_name: &str,
386        documentation_url: &str,
387    ) -> Result<Self, DisallowUrlsWithPrefixError> {
388        self.disallow_urls_with_prefix(&docs_path_prefix(package_name, documentation_url))
389    }
390
391    /// Returns self if links with the specified prefix not found, otherwise returns an error.
392    pub fn disallow_urls_with_prefix(
393        self,
394        prefix: &str,
395    ) -> Result<Self, DisallowUrlsWithPrefixError> {
396        use pulldown_cmark::Tag;
397        use std::string::ToString;
398
399        for node in &self.0 {
400            if let Some(Event::Start(Tag::Link { dest_url, .. })) = node.event() {
401                if dest_url.starts_with(prefix) {
402                    return Err(DisallowUrlsWithPrefixError::PrefixFound {
403                        url: dest_url.as_ref().to_string(),
404                        prefix: prefix.to_string(),
405                    });
406                }
407            }
408        }
409
410        Ok(self)
411    }
412
413    /// Convert all relative links into absolute ones using
414    /// the repository url as the root address.
415    pub fn use_absolute_blob_urls(self, repository_url: &str) -> Self {
416        self.with_absolute_urls(&blob_path_prefix(repository_url))
417    }
418
419    /// Convert all relative links into absolute ones using
420    /// the package documentation url as the root address.
421    pub fn use_absolute_docs_urls(self, package_name: &str, documentation_url: &str) -> Self {
422        self.with_absolute_urls(&docs_path_prefix(package_name, documentation_url))
423    }
424
425    /// Convert all relative links into absolute ones using specified url prefix.
426    pub fn with_absolute_urls(self, prefix: &str) -> Self {
427        use std::format;
428
429        self.map_links(
430            |url| {
431                if !is_absolute_url(url) && !is_fragment(url) {
432                    Cow::from([prefix, url].concat())
433                } else {
434                    Cow::from(url)
435                }
436            },
437            Cow::from(format!("with_absolute_urls(prefix = \"{}\")", prefix)),
438        )
439    }
440
441    /// Converts all links with function `func` applied to each link address.
442    pub fn map_links<F>(self, mut func: F, note: impl Into<Cow<'static, str>>) -> Self
443    where
444        for<'b> F: FnMut(&'b str) -> Cow<'b, str>,
445    {
446        use crate::CMarkItemAsModified;
447        use pulldown_cmark::{CowStr, Tag};
448
449        fn map_link<'a, F>(tag: &Tag<'a>, mut func: F) -> Option<Tag<'a>>
450        where
451            for<'b> F: FnMut(&'b str) -> Cow<'b, str>,
452        {
453            if let Tag::Link {
454                link_type,
455                dest_url,
456                title,
457                id,
458            } = tag
459            {
460                let new_url = func(dest_url.as_ref());
461                if dest_url.as_ref() != new_url.as_ref() {
462                    let title = title.clone();
463                    return Some(Tag::Link {
464                        link_type: *link_type,
465                        dest_url: CowStr::from(new_url.into_owned()),
466                        title: title.clone(),
467                        id: id.clone(),
468                    });
469                }
470            }
471            None
472        }
473
474        let note = note.into();
475        self.map(|node| {
476            let event = match node.event() {
477                Some(Event::Start(tag)) => map_link(tag, &mut func).map(Event::Start),
478                _ => None,
479            };
480            match event {
481                Some(event) => node.into_modified(event, note.clone()),
482                None => node,
483            }
484        })
485    }
486}
487
/// Returns `true` if `url` is treated as absolute.
///
/// This is currently the same check as `is_url_with_scheme`.
fn is_absolute_url(url: &str) -> bool {
    is_url_with_scheme(url)
}
491
/// Returns `true` if `url` is a fragment reference, i.e. its first character is `#`.
fn is_fragment(url: &str) -> bool {
    url.as_bytes().first() == Some(&b'#')
}
495
/// Returns `true` if `url` starts with a scheme or is scheme-relative.
///
/// Only the part of `url` before the first `//` (or the whole `url` when
/// there is no `//`) is examined:
///
/// - an empty part is accepted, which covers scheme-relative urls such as
///   `//host` and also the empty string;
/// - otherwise the part must be a scheme name followed by `:`, where the name
///   starts with an ASCII letter and continues with ASCII letters, digits,
///   `+`, `.` or `-`. Note that a bare `name:` without `//` is accepted too.
fn is_url_with_scheme(url: &str) -> bool {
    let head = match url.find("//") {
        Some(index) => &url[..index],
        None => url,
    };
    if head.is_empty() {
        return true;
    }

    let name = match head.strip_suffix(':') {
        Some(name) => name,
        None => return false,
    };

    let mut bytes = name.bytes();
    match bytes.next() {
        Some(first) if first.is_ascii_alphabetic() => bytes.all(|byte| {
            byte.is_ascii_alphanumeric() || byte == b'+' || byte == b'.' || byte == b'-'
        }),
        _ => false,
    }
}
516
/// Returns `value` without its final `/`, if it ends with one.
///
/// At most one slash is removed.
fn without_trailing_slash(value: &str) -> &str {
    value.strip_suffix('/').unwrap_or(value)
}
523
524fn blob_path_prefix(repository_url: &str) -> String {
525    use std::string::ToString;
526    without_trailing_slash(repository_url).to_string() + "/blob/master/"
527}
528
529fn docs_path_prefix(package_name: &str, documentation_url: &str) -> String {
530    use std::string::ToString;
531
532    let url = without_trailing_slash(documentation_url);
533    let name = package_name.to_string().replace('-', "_");
534    [url, "/*/", &name, "/"].concat()
535}
536
537impl CMarkData {
538    /// Remove the specified fenced code block tag.
539    pub fn remove_codeblock_tag(self, tag: &str) -> Self {
540        self.remove_codeblock_tags(&[tag])
541    }
542
543    /// Remove the specified fenced code block tags.
544    pub fn remove_codeblock_tags(self, tags: &[&str]) -> Self {
545        use crate::CMarkItemAsModified;
546
547        self.map(|node| {
548            let event = match node.event() {
549                Some(Event::Start(tag)) => remove_codeblock_tag_tags(tag, tags).map(Event::Start),
550                _ => None,
551            };
552            match event {
553                Some(event) => node.into_modified(
554                    event,
555                    Cow::from(std::format!("remove_codeblock_tags(tags = {:?})", tags)),
556                ),
557                None => node,
558            }
559        })
560    }
561}
562
563fn remove_codeblock_tag_tags<'a>(
564    event_tag: &pulldown_cmark::Tag<'a>,
565    tags: &[&str],
566) -> Option<pulldown_cmark::Tag<'a>> {
567    use pulldown_cmark::{CodeBlockKind, CowStr, Tag};
568
569    if let Tag::CodeBlock(CodeBlockKind::Fenced(ref node_tags)) = event_tag {
570        let has_tags = node_tags
571            .split(',')
572            .any(|node_tag| tags.iter().any(|tag| &node_tag == tag));
573        if has_tags {
574            let node_tags: Vec<_> = node_tags
575                .split(',')
576                .filter(|node_tag| !tags.iter().any(|tag| node_tag == tag))
577                .collect();
578            let node_tags = CowStr::Boxed(node_tags.join(",").into_boxed_str());
579            return Some(Tag::CodeBlock(CodeBlockKind::Fenced(node_tags)));
580        }
581    }
582    None
583}
584
585impl CMarkData {
586    /// Remove fenced code block tags that are used by `cargo test`.
587    ///
588    /// See <https://doc.rust-lang.org/rustdoc/documentation-tests.html> for more details.
589    pub fn remove_codeblock_rust_test_tags(self) -> Self {
590        use crate::codeblock_rust_test_tags;
591
592        self.remove_codeblock_tags(codeblock_rust_test_tags())
593    }
594
595    /// Use the specified codeblock tag, if they are not specified
596    pub fn use_default_codeblock_tag(self, tag: &str) -> Self {
597        use crate::CMarkItemAsModified;
598
599        self.map(|node| {
600            let event = match node.event() {
601                Some(Event::Start(node_tag)) => {
602                    map_default_codeblock_tag(node_tag, tag).map(Event::Start)
603                }
604                _ => None,
605            };
606            match event {
607                Some(event) => node.into_modified(
608                    event,
609                    Cow::from(std::format!("use_default_codeblock_tag(tag = \"{}\")", tag)),
610                ),
611                None => node,
612            }
613        })
614    }
615}
616
617fn map_default_codeblock_tag<'a>(
618    event_tag: &pulldown_cmark::Tag<'a>,
619    tag: &str,
620) -> Option<pulldown_cmark::Tag<'a>> {
621    use pulldown_cmark::{CodeBlockKind, CowStr, Tag};
622    use std::string::ToString;
623
624    if let Tag::CodeBlock(CodeBlockKind::Fenced(ref node_tag)) = event_tag {
625        if node_tag.as_ref() == "" {
626            return Some(Tag::CodeBlock(CodeBlockKind::Fenced(CowStr::Boxed(
627                tag.to_string().into_boxed_str(),
628            ))));
629        }
630    }
631    None
632}
633
634impl CMarkData {
635    /// Use rust fenced codeblock highlight as default.
636    pub fn use_default_codeblock_rust_tag(self) -> Self {
637        self.use_default_codeblock_tag("rust")
638    }
639
640    /// Remove hidden rust code from rust fenced codeblocks.
641    ///
642    /// See <https://doc.rust-lang.org/rustdoc/documentation-tests.html#hiding-portions-of-the-example> for more details.
643    pub fn remove_hidden_rust_code(self) -> Self {
644        use crate::CMarkItemAsModified;
645        use pulldown_cmark::{CodeBlockKind, CowStr, Tag};
646
647        let mut is_rust_codeblock = false;
648
649        self.map(|node| {
650            match node.event() {
651                Some(Event::Start(Tag::CodeBlock(CodeBlockKind::Fenced(tags)))) => {
652                    is_rust_codeblock |= tags.split(',').any(|tag| tag == "rust")
653                }
654                Some(Event::Text(text)) => {
655                    if is_rust_codeblock {
656                        let text: Vec<_> = text
657                            .split('\n')
658                            .filter(|line| *line != "#" && !line.starts_with("# "))
659                            .collect();
660                        let text = text.join("\n");
661                        let event = Event::Text(CowStr::Boxed(text.into_boxed_str()));
662                        return node.into_modified(event, Cow::from("remove_hidden_rust_code()"));
663                    }
664                }
665                _ => {}
666            };
667            node
668        })
669    }
670}
671
/// An error which can occur when checking for disallowed link prefixes.
///
/// Returned by `CMarkData::disallow_urls_with_prefix` and the methods that delegate to it.
#[derive(Clone, Debug, Error)]
pub enum DisallowUrlsWithPrefixError {
    /// A link whose destination starts with the disallowed prefix was found.
    #[error("The url `{url}` use a prohibited prefix `{prefix}`.")]
    PrefixFound {
        /// Full destination url of the offending link.
        url: String,
        /// Disallowed prefix that the url starts with.
        prefix: String,
    },
}
684
685fn increase_heading_level(level: pulldown_cmark::HeadingLevel) -> pulldown_cmark::HeadingLevel {
686    use pulldown_cmark::HeadingLevel;
687
688    match level {
689        HeadingLevel::H1 => HeadingLevel::H2,
690        HeadingLevel::H2 => HeadingLevel::H3,
691        HeadingLevel::H3 => HeadingLevel::H4,
692        HeadingLevel::H4 => HeadingLevel::H5,
693        HeadingLevel::H5 | HeadingLevel::H6 => HeadingLevel::H6,
694    }
695}
696
697fn heading_level(level: pulldown_cmark::HeadingLevel) -> u32 {
698    use pulldown_cmark::HeadingLevel;
699
700    match level {
701        HeadingLevel::H1 => 1,
702        HeadingLevel::H2 => 2,
703        HeadingLevel::H3 => 3,
704        HeadingLevel::H4 => 4,
705        HeadingLevel::H5 => 5,
706        HeadingLevel::H6 => 6,
707    }
708}
709
/// Checks `is_url_with_scheme` against urls that must and must not be accepted.
#[test]
fn test_is_url_with_scheme() {
    let accepted = [
        "//Foo",
        "a://Foo",
        "Z://Foo",
        "aa://Foo",
        "a0://Foo",
        "a+://Foo",
        "a.://Foo",
        "a-://Foo",
        "http://Foo",
        "https://Foo",
    ];
    let rejected = ["Foo", "crate::Foo", "://Foo", "0://Foo", "a?://Foo"];

    for url in accepted.iter() {
        assert!(is_url_with_scheme(url), "expected a scheme in {:?}", url);
    }
    for url in rejected.iter() {
        assert!(!is_url_with_scheme(url), "expected no scheme in {:?}", url);
    }
}