|
|
|
|
|
by gabrielsroka
923 days ago
|
|
I cleaned up the code a little bit, but I didn't test it. This will have the same limitation as the Python I posted earlier in that you're not authenticated.
from pathlib import Path
import scrapy
import requests
import html
import json
import os
# Set this: the Hacker News username whose submissions should be downloaded.
USER = 'Jugurtha'
# Root of the Hacker News Firebase API; see https://github.com/HackerNews/API
BASE_URL = 'https://hacker-news.firebaseio.com/v0'
# Profile endpoint for USER; the spider reads the 'submitted' id list from it.
# (No '$' before the brace: that is JavaScript template syntax and would put a
# literal '$' at the front of the URL in a Python f-string.)
LINKS = f'{BASE_URL}/user/{USER}.json'
class HNSpider(scrapy.Spider):
    """Save the text of each item listed in USER's profile under ``posts/``.

    The profile at ``LINKS`` supplies the item ids. Each item is fetched from
    ``{BASE_URL}/item/<id>.json`` and, when it carries a non-empty ``text``
    field, written to ``posts/<id>.html``. Items that already have a file on
    disk are not requested again, so the spider can be re-run incrementally.
    """

    name = 'hn'

    def start_requests(self):
        """Yield a request for every submitted item not yet saved on disk."""
        # A timeout keeps the spider from hanging forever on a stalled
        # connection. `or {}` covers a JSON `null` body, which would otherwise
        # fail on the subscript below.
        profile = requests.get(LINKS, timeout=30).json() or {}
        items = profile.get('submitted', [])
        if not items:
            self.log(f'No submitted items found at {LINKS}')
        for item in items:
            url = f'{BASE_URL}/item/{item}.json'
            filepath = Path(f'posts/{item}.html')
            if filepath.exists():
                self.log(f'Skipping already downloaded {url}')
            else:
                yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Write the item's unescaped ``text`` field to ``posts/<id>.html``.

        Responses whose JSON body is ``null`` or has no non-empty ``text``
        are ignored and no file is created for them.
        """
        # Recover the item id from the request URL: .../item/<id>.json
        item = response.url.split('/item/')[1].split('.json')[0]
        filename = f'{item}.html'
        # `or {}` guards against a JSON `null` body, on which .get() would fail.
        content = (json.loads(response.text) or {}).get('text')
        if content:
            text = html.unescape(content)
            target = Path(f'posts/{filename}')
            # The output directory is not guaranteed to exist on a first run.
            target.parent.mkdir(parents=True, exist_ok=True)
            # Explicit UTF-8 so non-ASCII text is written the same way on
            # every platform instead of using the locale's default encoding.
            with open(target, 'w', encoding='utf-8') as f:
                f.write(text)
            self.log(f'Saved file {filename}')
|
|