# nab 0.7.1 - Docs.rs

# Reddit post extraction via Reddit JSON API.
#
# Reddit returns a bare JSON array: [post_listing, comments_listing].
# Appending `.json` to any post URL gives the API response.
#
# The `client = "standard"` option is required because Reddit returns HTML
# instead of JSON when the client forces HTTP/2 via prior knowledge (no ALPN).
# A standard reqwest client negotiates the HTTP version via TLS ALPN.

[site]
name = "reddit"
# URLs claimed by this provider: any Reddit post ("/r/<sub>/comments/...").
#
# "reddit.com" must appear at the start of the input, directly after a "/"
# (the scheme separator), or directly after a "." (a subdomain such as www.,
# old. or np.). Without that boundary, lookalike hosts that merely end in
# "reddit.com" (e.g. "safereddit.com") would be claimed as well, even though
# the rewrite rules in this file only handle real reddit.com hosts.
#
# One pattern is enough: the subdomain alternative already covers both
# www.reddit.com and old.reddit.com.
# NOTE(review): assumes patterns are searched anywhere in the URL (they carry
# no scheme) - confirm against the matcher.
#
# Literal (single-quoted) string so the regex needs no double escaping.
patterns = [
    '(?i)(?:^|[/.])reddit\.com/r/[^/]+/comments/',
]

[rewrite]
# Turn a post URL into its JSON API URL.
#
# Capture group 1 keeps the scheme, the host and
# "/r/<sub>/comments/<id>[/<slug>]"; everything after it (deeper path
# segments, trailing slash, query string, fragment) is discarded, then
# ".json" is appended.
#
# The host part accepts reddit.com with any number of subdomain labels
# (www., old., np., new., ...), so every host accepted by the [site] patterns
# is rewritten. Bare, www. and old. hosts produce the same result as before.
#
# Literal (single-quoted) string so the regex needs no double escaping.
from = '(?i)(https?://(?:[a-z0-9-]+\.)*reddit\.com/r/[^/]+/comments/[^/?#]+(?:/[^/?#]+)?)[?#]?.*'
to = "$1.json"

[request]
# "standard" selects the client that negotiates the HTTP version via TLS ALPN;
# Reddit answers with HTML instead of JSON when HTTP/2 is forced via prior
# knowledge.
client = "standard"
accept = "application/json"
headers = { User-Agent = "nab/1.0 (by /u/nab-cli)" }

[json]
# The response is a two-element array: [post_listing, comments_listing].
# Each key below names an extracted field; its value is the path to that
# field inside the response.

# Submission fields: first (only) child of the post listing.
subreddit = "[0].data.children[0].data.subreddit"
title = "[0].data.children[0].data.title"
author = "[0].data.children[0].data.author"
score = "[0].data.children[0].data.score"
comments = "[0].data.children[0].data.num_comments"
selftext = "[0].data.children[0].data.selftext"
url = "[0].data.children[0].data.url"
permalink = "[0].data.children[0].data.permalink"

# First three children of the comments listing, exposed as c0_*, c1_*, c2_*.
c0_author = "[1].data.children[0].data.author"
c0_score = "[1].data.children[0].data.score"
c0_body = "[1].data.children[0].data.body"

c1_author = "[1].data.children[1].data.author"
c1_score = "[1].data.children[1].data.score"
c1_body = "[1].data.children[1].data.body"

c2_author = "[1].data.children[2].data.author"
c2_score = "[1].data.children[2].data.score"
c2_body = "[1].data.children[2].data.body"

[template]
# Markdown output layout. Placeholder names in braces match the keys defined
# in [json] (title, subreddit, author, score, comments, selftext, cN_*,
# permalink).
# The "|number" suffix is presumably a numeric formatting filter - confirm
# against the template engine.
# `permalink` is appended to "https://www.reddit.com", so it is expected to be
# a site-relative path beginning with "/".
# NOTE(review): all three comment sections are emitted unconditionally;
# confirm how missing cN_* fields render for posts with fewer than three
# top-level comments.
format = """
## {title}

r/{subreddit} · by u/{author} · {score|number} points · {comments|number} comments

{selftext}

### Top Comments

**u/{c0_author}** · {c0_score|number} points

{c0_body}

---

**u/{c1_author}** · {c1_score|number} points

{c1_body}

---

**u/{c2_author}** · {c2_score|number} points

{c2_body}

---

[View on Reddit](https://www.reddit.com{permalink})
"""

[metadata]
platform = "Reddit"
# `*_field` values name an extracted field; `author` is a template string.
title_field = "title"
author = "u/{author}"
# NOTE(review): for link posts Reddit's `url` field is typically the linked
# external page rather than the thread itself - confirm this is the intended
# canonical URL (the thread location lives in `permalink`, which is relative).
canonical_url_field = "url"

[engagement]
# Engagement counters, each pointing at a field name defined in [json]:
# `comments` is the post's num_comments, `score` is the post's score.
replies = "comments"
likes = "score"

# Fallback: scrape old.reddit.com HTML when the JSON API returns 403/404.
# Reddit now blocks unauthenticated .json requests; old.reddit.com still
# renders server-side HTML that we can extract with CSS selectors.
[[fallback]]
# Capture group 1 is the path "/r/<sub>/comments/<id>[/<slug>]"; the original
# scheme and host, plus anything after the captured path (deeper segments,
# query string, fragment), are dropped and the path is re-rooted on
# https://old.reddit.com.
#
# The host part accepts reddit.com with any number of subdomain labels
# (www., old., np., new., ...), so every host accepted by the [site] patterns
# has a working fallback. Bare, www. and old. hosts produce the same result
# as before.
#
# Literal (single-quoted) string so the regex needs no double escaping.
rewrite_from = '(?i)https?://(?:[a-z0-9-]+\.)*reddit\.com(/r/[^/]+/comments/[^/?#]+(?:/[^/?#]+)?)[?#]?.*'
rewrite_to = "https://old.reddit.com$1"
type = "html"
[fallback.css]
# CSS selectors applied to the fallback HTML page, keyed by the field each one
# fills.
# NOTE(review): only these five fields are listed here; `comments`, `url`,
# `permalink` and the cN_* comment fields used by [template] have no selector
# in this table - confirm how the template renders them on the fallback path.
subreddit = ".redditname a"
title = ".title a.title"
author = ".tagline .author"
score = ".score .number"
selftext = ".usertext-body .md"