The Hacker News archive is hosted in ClickHouse as a publicly accessible data lake. It is available without sign-up and is updated in real time. Example:
# Download ClickHouse:
# Fetch the response served at clickhouse.com and pipe it straight into sh.
# NOTE(review): presumably this is the installer script that drops a `clickhouse`
# binary into the current directory (the next command runs ./clickhouse) — confirm.
curl https://clickhouse.com/ | sh
# Run the binary from the current directory with the `local` subcommand;
# the SQL statements below are meant to be entered at its prompt.
./clickhouse local
# Attach the table:
-- Defines hackernews_history on top of a read-only, S3-hosted disk, so the
-- data is read from the remote bucket rather than loaded locally.
-- NOTE(review): the explicit UUID presumably ties this definition to the
-- existing data path inside the bucket — keep it exactly as written.
CREATE TABLE hackernews_history UUID '66491946-56e3-4790-a112-d2dc3963e68a'
(
    -- Used as the ReplacingMergeTree version column (see ENGINE below).
    update_time  DateTime DEFAULT now(),
    id           UInt32,
    deleted      UInt8,
    type         Enum8('story' = 1, 'comment' = 2, 'poll' = 3, 'pollopt' = 4, 'job' = 5),
    by           LowCardinality(String),
    time         DateTime,
    text         String,
    dead         UInt8,
    parent       UInt32,
    poll         UInt32,
    kids         Array(UInt32),
    url          String,
    score        Int32,
    title        String,
    parts        Array(UInt32),
    descendants  Int32
)
-- Rows sharing the same sorting key (id) are versioned by update_time.
-- Per the ReplacingMergeTree docs, duplicates are collapsed only during merges,
-- so a plain SELECT may still return several versions of one id (use FINAL to
-- collapse them at query time).
ENGINE = ReplacingMergeTree(update_time)
ORDER BY id
SETTINGS
    -- NOTE(review): presumably seconds between re-reads of the part list from
    -- the bucket, which is how new data becomes visible — confirm unit.
    refresh_parts_interval = 60,
    -- Inline disk definition: read-only access to the bucket endpoint, with
    -- environment-provided credentials explicitly turned off.
    disk = disk(
        readonly = true,
        type = 's3_plain_rewritable',
        endpoint = 'https://clicklake-test-2.s3.eu-central-1.amazonaws.com/',
        use_environment_credentials = false
    );
# Run queries:
-- Ten newest rows by `time`, showing the item text after extractTextFromHTML
-- and decodeHTMLComponent have been applied to it.
-- The trailing \G ends the query in the client and prints results vertically.
SELECT
    time,
    decodeHTMLComponent(extractTextFromHTML(text)) AS t
FROM hackernews_history
ORDER BY time DESC
LIMIT 10 \G
# Download everything as Parquet/JSON/CSV...
-- Full export of every column and row into a local file. SELECT * is
-- intentional here: the goal is a complete dump.
-- Per the INTO OUTFILE docs, the output format is inferred from the file
-- extension, so changing the extension changes the dump format.
SELECT *
FROM hackernews_history
INTO OUTFILE 'dump.parquet'