import regex as re
import urllib3

import trafilatura

# Shared HTTP connection pool, reused for every page download below.
requests_pool = urllib3.PoolManager()
def extract_urls():
    """Return absolute URLs for every SEP entry listed in plato_index.txt.

    Reads the locally saved index page, pulls each anchor href, keeps
    only entry links (those under 'entries/'), and prefixes the site
    root to make them absolute.
    """
    # Context manager closes the handle; the original leaked an open file.
    with open('plato_index.txt', encoding='utf-8') as f:
        index_html = f.read()
    hrefs = re.findall(r'<a href="(.*?)">', index_html)
    return ['https://plato.stanford.edu/' + h
            for h in hrefs
            if h.startswith('entries/')]
import os
def process_link(link):
    """Download one SEP entry, summarize it with `pithy`, and append the
    one-sentence summary to stanford_plato.md.

    Intermediate files are written under texts/ (the directory must
    already exist) and removed again when done, even on failure.
    Entries from which no text can be extracted are skipped.
    """
    import subprocess  # local import so this function stands alone

    page_html = requests_pool.request('GET', link).data
    article = trafilatura.extract(
        page_html,
        favor_precision=True,
        include_comments=False,
        include_formatting=False,
        include_images=False,
        include_tables=False
    )
    if article is None:
        # trafilatura.extract returns None when it finds no usable text;
        # the original then crashed on f.write(None). Skip instead.
        return

    # URL shape is .../entries/<id>/, so the id is the second-to-last part.
    article_id = link.split('/')[-2]
    text_path = 'texts/' + article_id
    pithy_path = text_path + '.pithy'
    with open(text_path, "w") as f:
        f.write(article)
    try:
        # Argument-list subprocess call (shell=False) replaces os.system:
        # article_id comes from a scraped URL and must never reach a shell.
        with open(pithy_path, "w") as out:
            subprocess.run(
                ['pithy', '-f', text_path, '--nobar', '--no_context',
                 '--sentences', '1'],
                stdout=out,
                check=True,
            )
        with open(pithy_path, "r") as f:
            # pithy prefixes a leading token; drop everything up to the
            # first space to keep only the summary sentence.
            result = f.read().replace('\n', ' ').split(' ', 1)[1]
    finally:
        # Always clean up the scratch files, even if pithy fails.
        for path in (pithy_path, text_path):
            if os.path.exists(path):
                os.remove(path)
    with open("stanford_plato.md", "a") as f:
        f.write(f"## {link}\n")
        f.write(f"* {result}\n\n")
import threading
import time
def scrape():
    """Summarize every SEP entry URL, exactly once each, sequentially.

    Bug fixed: the original both called process_link(url) inline AND
    immediately spawned a thread running process_link on the same url,
    so every entry was processed twice — and the unbounded threads raced
    on the shared append-mode output file. Sequential processing keeps
    each entry handled once and the output file appends ordered.
    """
    for url in extract_urls():
        print(url)
        process_link(url)
# Guard the entry point so importing this module does not trigger a scrape.
if __name__ == "__main__":
    scrape()