WITH lagged AS (
SELECT
*,
LAG(event_time) OVER (PARTITION BY user_id ORDER BY event_time) AS prev_time
FROM events
),
sessions AS (
SELECT
*,
SUM(COALESCE((date_diff('minute', prev_time, event_time) > 30)::INT, 1))
OVER (PARTITION BY user_id ORDER BY event_time) AS session_id
FROM lagged
)
SELECT
user_id,
session_id,
MIN(event_time) AS session_start,
MAX(event_time) AS session_end,
COUNT(*) AS event_count
FROM sessions
GROUP BY ALL
ORDER BY user_id, session_start;
vs result = (
df.sort(["user_id", "event_time"])
.with_columns(
session_id=(
pl.when(pl.col("event_time").diff().is_null())
.then(1)
.when(pl.col("event_time").diff().dt.total_minutes() > 30)
.then(1)
.otherwise(0)
.cum_sum()
.over("user_id")
)
)
.group_by(["user_id", "session_id"])
.agg(
session_start=pl.col("event_time").min(),
session_end=pl.col("event_time").max(),
event_count=pl.col("event_time").count(),
)
.sort(["user_id", "session_start"])
)Other dataframe libraries work directly on duckdb also, like Ibis.
FWIW, I think SQLAlchemy does also.
In any case though, I find both the SQL and the code there equally readable.
SQL has been around since the dawn of databases. I am happy to see a trend away from pandas.
Performance is definitely one of them, but it also has inconsistent and duplicated methods, inconsistent defaults (e.g. some methods are inplace by default), copy by reference issues, I could go on.
It was an early winner in an extremely popular language. That's really the main thing going for it, but alternatives have been a long time coming.