|
|
|
|
|
by Jugurtha
923 days ago
|
|
Here's a small, crude Scrapy spider, hardcoded values and all. You can set the value of `DOWNLOAD_DELAY` in `settings.py` as a courtesy to the server. It saves the comments in a `posts` directory as `html` files. It handles neither upvotes nor submitted stories/links (they have the type `story` in the response, as opposed to `text` for comments). You can easily tweak it.
from pathlib import Path
import scrapy
import requests
import html
import json
import os
# Hacker News account whose submissions are downloaded.
USER = "Jugurtha"

# Common prefix of the Firebase endpoints used below.
_API_ROOT = "https://hacker-news.firebaseio.com/v0"

# URL of the user's profile document; the spider reads its 'submitted' field.
LINKS = f"{_API_ROOT}/user/{USER}.json?print=pretty"

# Prefix for a single item's URL; the spider appends '<id>.json?print=pretty'.
BASE_URL = f"{_API_ROOT}/item/"
class HNSpider(scrapy.Spider):
    """Download the text of the items listed in the profile at ``LINKS``.

    Each item whose JSON document has a ``text`` field is written, with HTML
    entities unescaped, to ``posts/<item id>.html``. Items that already have
    a file on disk are skipped, so the spider can be re-run incrementally.
    """

    name = "hn"

    # Output directory, relative to the current working directory.
    _posts_dir = Path('posts')

    @staticmethod
    def _item_id(url):
        """Return the item id embedded in an item URL (``.../item/<id>.json...``)."""
        return url.split('/item/')[1].split('.json')[0]

    def start_requests(self):
        """Yield one request per submitted item that has not been saved yet.

        Raises:
            requests.RequestException: if the profile document cannot be
                fetched (network failure, timeout, or HTTP error status).
        """
        # NOTE: this is a blocking call made outside Scrapy's downloader; the
        # timeout keeps a stalled connection from hanging the crawl forever.
        profile_response = requests.get(LINKS, timeout=30)
        profile_response.raise_for_status()

        # Tolerate a body that decodes to None or that lacks the 'submitted'
        # field instead of failing with TypeError/KeyError.
        profile = profile_response.json() or {}
        for sub in profile.get('submitted', []):
            url = f'{BASE_URL}{sub}.json?print=pretty'
            if (self._posts_dir / f'{sub}.html').exists():
                self.log(f'Skipping already downloaded {url}')
            else:
                yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Write the item's ``text`` field to ``posts/<item id>.html``.

        Nothing is written when the response body is not a JSON object or
        has no ``text`` field.
        """
        filename = f"{self._item_id(response.url)}.html"

        # Guard against a body that is valid JSON but not an object
        # (e.g. `null`), on which `.get` would raise AttributeError.
        payload = json.loads(response.text)
        content = payload.get('text') if isinstance(payload, dict) else None
        if content is None:
            return

        # Create the output directory on demand; opening the file would
        # otherwise fail with FileNotFoundError on a first run.
        self._posts_dir.mkdir(parents=True, exist_ok=True)

        # Explicit UTF-8 so non-ASCII text does not depend on the platform's
        # default encoding.
        with open(self._posts_dir / filename, 'w', encoding='utf-8') as f:
            f.write(html.unescape(content))
        self.log(f"Saved file {filename}")
|
|