| HN Mirror

# Scrapy spider that mirrors the text of one Hacker News user's submissions
# into local files named posts/<item id>.html.

from pathlib import Path
import scrapy
import requests
import html
import json
import os

USER = 'Jugurtha'
LINKS = f'https://hacker-news.firebaseio.com/v0/user/{USER}.json?print=pretty'
BASE_URL = 'https://hacker-news.firebaseio.com/v0/item/'


class HNSpider(scrapy.Spider):
    """Fetch every item listed under USER's ``submitted`` ids and save its text.

    Items that already have a ``posts/<id>.html`` file are skipped, so the
    spider can be re-run to pick up only what is missing.
    """

    name = "hn"

    def start_requests(self):
        """Yield one request per submitted item that is not on disk yet.

        The id list comes from the ``submitted`` field of the JSON document
        at ``LINKS``, fetched with a blocking ``requests`` call.
        """
        submitted = requests.get(LINKS).json()['submitted']
        for sub in submitted:
            url = f'{BASE_URL}{sub}.json?print=pretty'
            if Path(f'posts/{sub}.html').exists():
                self.log(f'Skipping already downloaded {url}')
            else:
                yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Write the item's HTML-unescaped ``text`` field to ``posts/<id>.html``.

        Responses whose JSON has no ``text`` value are ignored.
        """
        item = response.url.split('/item/')[1].split('.json')[0]
        filename = f"{item}.html"

        # `or {}` guards against a JSON `null` body, on which `.get` would
        # raise AttributeError (presumably possible for removed items - confirm
        # against the API).
        content = (json.loads(response.text) or {}).get('text')
        if content is None:
            return

        filepath = Path(f'posts/{filename}')
        # Writing does not create missing directories; make sure posts/ exists.
        filepath.parent.mkdir(parents=True, exist_ok=True)
        # Explicit UTF-8 so the output does not depend on the platform's
        # default encoding.
        filepath.write_text(html.unescape(content), encoding='utf-8')
        self.log(f"Saved file {filename}")

I cleaned up the code a little, but I haven't tested it. It has the same limitation as the Python script I posted earlier: the requests are not authenticated.

  from pathlib import Path
  
  import scrapy
  import requests
  import html
  import json
  import os
 
  # Set this to the Hacker News username whose submissions should be mirrored:
  USER = 'Jugurtha'

  # Root of the Hacker News API, see https://github.com/HackerNews/API
  BASE_URL = 'https://hacker-news.firebaseio.com/v0'
  # JSON document describing USER. Python f-strings interpolate with plain
  # `{...}`; a leading `$` (JavaScript template syntax) would end up as a
  # literal character at the start of the URL.
  LINKS = f'{BASE_URL}/user/{USER}.json'
 
  class HNSpider(scrapy.Spider):
      """Save the text of each item submitted by ``USER`` under ``posts/``.

      Items that already have a ``posts/<id>.html`` file are skipped, so the
      spider can be re-run to fetch only what is missing.
      """

      name = 'hn'

      def start_requests(self):
          """Yield one request per submitted item that is not on disk yet.

          The id list comes from the ``submitted`` field of the JSON document
          at ``LINKS``, fetched with a blocking ``requests`` call.
          """
          items = requests.get(LINKS).json()['submitted']
          for item in items:
              url = f'{BASE_URL}/item/{item}.json'
              if Path(f'posts/{item}.html').exists():
                  self.log(f'Skipping already downloaded {url}')
              else:
                  yield scrapy.Request(url=url, callback=self.parse)

      def parse(self, response):
          """Write the item's HTML-unescaped ``text`` field to ``posts/<id>.html``.

          Responses whose JSON has no ``text`` value, or an empty one, are
          ignored.
          """
          item = response.url.split('/item/')[1].split('.json')[0]
          filename = f'{item}.html'

          # `or {}` guards against a JSON `null` body, on which `.get` would
          # raise AttributeError (presumably possible for removed items -
          # confirm against the API).
          content = (json.loads(response.text) or {}).get('text')
          if not content:
              return

          filepath = Path(f'posts/{filename}')
          # Writing does not create missing directories; make sure posts/ exists.
          filepath.parent.mkdir(parents=True, exist_ok=True)
          # Explicit UTF-8 so the output does not depend on the platform's
          # default encoding.
          filepath.write_text(html.unescape(content), encoding='utf-8')
          self.log(f'Saved file {filename}')