|
The better question is, why is DuckDB so popular when one can use Polars which has a sane, lintable, typesafe API compared to the mess that is SQL: WITH lagged AS (
SELECT
*,
LAG(event_time) OVER (PARTITION BY user_id ORDER BY event_time) AS prev_time
FROM events
),
sessions AS (
SELECT
*,
SUM(COALESCE((date_diff('minute', prev_time, event_time) > 30)::INT, 1))
OVER (PARTITION BY user_id ORDER BY event_time) AS session_id
FROM lagged
)
SELECT
user_id,
session_id,
MIN(event_time) AS session_start,
MAX(event_time) AS session_end,
COUNT(*) AS event_count
FROM sessions
GROUP BY ALL
ORDER BY user_id, session_start;
vs result = (
df.sort(["user_id", "event_time"])
.with_columns(
session_id=(
pl.when(pl.col("event_time").diff().is_null())
.then(1)
.when(pl.col("event_time").diff().dt.total_minutes() > 30)
.then(1)
.otherwise(0)
.cum_sum()
.over("user_id")
)
)
.group_by(["user_id", "session_id"])
.agg(
session_start=pl.col("event_time").min(),
session_end=pl.col("event_time").max(),
event_count=pl.col("event_time").count(),
)
.sort(["user_id", "session_start"])
)
|