import regex as re
import urllib3

import trafilatura

# Shared HTTP connection pool, reused for every page download below.
requests_pool = urllib3.PoolManager()
def extract_urls():
    """Return absolute URLs for every SEP entry listed in plato_index.txt.

    Reads the locally saved index page, pulls each anchor href, keeps
    only entry links (those under 'entries/'), and prefixes the site
    root to make them absolute.
    """
    # Context manager closes the handle; the original leaked an open file.
    with open('plato_index.txt', encoding='utf-8') as f:
        index_html = f.read()
    hrefs = re.findall(r'<a href="(.*?)">', index_html)
    return ['https://plato.stanford.edu/' + h
            for h in hrefs
            if h.startswith('entries/')]
import os
def process_link(link):
    """Download one SEP entry, summarize it with `pithy`, and append the
    one-sentence summary to stanford_plato.md.

    Intermediate files are written under texts/ (the directory must
    already exist) and removed again when done, even on failure.
    Entries from which no text can be extracted are skipped.
    """
    import subprocess  # local import so this function stands alone

    page_html = requests_pool.request('GET', link).data
    article = trafilatura.extract(
        page_html,
        favor_precision=True,
        include_comments=False,
        include_formatting=False,
        include_images=False,
        include_tables=False
    )
    if article is None:
        # trafilatura.extract returns None when it finds no usable text;
        # the original then crashed on f.write(None). Skip instead.
        return

    # URL shape is .../entries/<id>/, so the id is the second-to-last part.
    article_id = link.split('/')[-2]
    text_path = 'texts/' + article_id
    pithy_path = text_path + '.pithy'
    with open(text_path, "w") as f:
        f.write(article)
    try:
        # Argument-list subprocess call (shell=False) replaces os.system:
        # article_id comes from a scraped URL and must never reach a shell.
        with open(pithy_path, "w") as out:
            subprocess.run(
                ['pithy', '-f', text_path, '--nobar', '--no_context',
                 '--sentences', '1'],
                stdout=out,
                check=True,
            )
        with open(pithy_path, "r") as f:
            # pithy prefixes a leading token; drop everything up to the
            # first space to keep only the summary sentence.
            result = f.read().replace('\n', ' ').split(' ', 1)[1]
    finally:
        # Always clean up the scratch files, even if pithy fails.
        for path in (pithy_path, text_path):
            if os.path.exists(path):
                os.remove(path)
    with open("stanford_plato.md", "a") as f:
        f.write(f"## {link}\n")
        f.write(f"* {result}\n\n")
import threading
import time
def scrape():
    """Summarize every SEP entry URL, exactly once each, sequentially.

    Bug fixed: the original both called process_link(url) inline AND
    immediately spawned a thread running process_link on the same url,
    so every entry was processed twice — and the unbounded threads raced
    on the shared append-mode output file. Sequential processing keeps
    each entry handled once and the output file appends ordered.
    """
    for url in extract_urls():
        print(url)
        process_link(url)
# Guard the entry point so importing this module does not trigger a scrape.
if __name__ == "__main__":
    scrape()