데이터베이스(Database)/Clickhouse

클릭하우스 자주 사용하는 쿼리

leebaro 2023. 11. 13.
728x90

 

 

 

리텐션

아래와 같은 방식으로 리텐션과 리텐션율을 계산할 수 있

SELECT 
    retention_test.date,
    count() AS initial_users,
    countIf(arrayExists(d -> d = retention_test.date + 1, dates)) AS day1,
    countIf(arrayExists(d -> d = retention_test.date + 2, dates)) AS day2,
    countIf(arrayExists(d -> d = retention_test.date + 3, dates)) AS day3,
    countIf(arrayExists(d -> d = retention_test.date + 4, dates)) AS day4,
    countIf(arrayExists(d -> d = retention_test.date + 5, dates)) AS day5,
    countIf(arrayExists(d -> d = retention_test.date + 6, dates)) AS day6,
    countIf(arrayExists(d -> d = retention_test.date + 7, dates)) AS day7,
    countIf(arrayExists(d -> d = retention_test.date + 8, dates)) AS day8,
    countIf(arrayExists(d -> d = retention_test.date + 9, dates)) AS day9
FROM (
    SELECT 
        uid,
        groupArray(date) AS dates
    FROM retention_test
    WHERE date BETWEEN '2023-01-01' AND '2023-01-03' OR date BETWEEN '2023-01-01' AND '2023-01-13'
    GROUP BY uid
) AS retention_data
RIGHT JOIN retention_test ON retention_data.uid = retention_test.uid
WHERE retention_test.date BETWEEN '2023-01-01' AND '2023-01-03'
GROUP BY retention_test.date



SELECT 
    retention_test.date,
    count() AS initial_users,
    count() > 0 ? round(countIf(arrayExists(d -> d = retention_test.date + 1, dates)) / count(), 2) : 0 AS day1_rate,
    count() > 0 ? round(countIf(arrayExists(d -> d = retention_test.date + 2, dates)) / count(), 2) : 0 AS day2_rate,
    count() > 0 ? round(countIf(arrayExists(d -> d = retention_test.date + 3, dates)) / count(), 2) : 0 AS day3_rate,
    count() > 0 ? round(countIf(arrayExists(d -> d = retention_test.date + 4, dates)) / count(), 2) : 0 AS day4_rate,
    -- 계속하여 day5_rate, day6_rate 등을 추가
FROM (
    SELECT 
        uid,
        groupArray(date) AS dates
    FROM retention_test
    WHERE date BETWEEN '2023-01-01' AND '2023-01-03' OR date BETWEEN '2023-01-01' AND '2023-01-13'
    GROUP BY uid
) AS retention_data
RIGHT JOIN retention_test ON retention_data.uid = retention_test.uid
WHERE retention_test.date BETWEEN '2023-01-01' AND '2023-01-03'
GROUP BY retention_test.date

 

Lag / Lead

Sample data

CREATE TABLE llexample (
    g Int32,
    a Date )
ENGINE = Memory;

INSERT INTO llexample SELECT
    number % 3,
    toDate('2020-01-01') + number
FROM numbers(10);

SELECT * FROM llexample ORDER BY g,a;

┌─g─┬──────────a─┐
│ 0 │ 2020-01-01 │
│ 0 │ 2020-01-04 │
│ 0 │ 2020-01-07 │
│ 0 │ 2020-01-10 │
│ 1 │ 2020-01-02 │
│ 1 │ 2020-01-05 │
│ 1 │ 2020-01-08 │
│ 2 │ 2020-01-03 │
│ 2 │ 2020-01-06 │
│ 2 │ 2020-01-09 │
└───┴────────────┘
 

Using arrays

select g, (arrayJoin(tuple_ll) as ll).1 a, ll.2 prev, ll.3 next
from (
select g, arrayZip( arraySort(groupArray(a)) as aa,
                    arrayPopBack(arrayPushFront(aa, toDate(0))),
                    arrayPopFront(arrayPushBack(aa, toDate(0))) ) tuple_ll
from llexample
group by g)
order by g, a;

┌─g─┬──────────a─┬───────prev─┬───────next─┐
│ 0 │ 2020-01-01 │ 1970-01-01 │ 2020-01-04 │
│ 0 │ 2020-01-04 │ 2020-01-01 │ 2020-01-07 │
│ 0 │ 2020-01-07 │ 2020-01-04 │ 2020-01-10 │
│ 0 │ 2020-01-10 │ 2020-01-07 │ 1970-01-01 │
│ 1 │ 2020-01-02 │ 1970-01-01 │ 2020-01-05 │
│ 1 │ 2020-01-05 │ 2020-01-02 │ 2020-01-08 │
│ 1 │ 2020-01-08 │ 2020-01-05 │ 1970-01-01 │
│ 2 │ 2020-01-03 │ 1970-01-01 │ 2020-01-06 │
│ 2 │ 2020-01-06 │ 2020-01-03 │ 2020-01-09 │
│ 2 │ 2020-01-09 │ 2020-01-06 │ 1970-01-01 │
└───┴────────────┴────────────┴────────────┘
 

Using window functions (starting from Clickhouse 21.3)

SET allow_experimental_window_functions = 1;

SELECT
    g,
    a,
    any(a) OVER (PARTITION BY g ORDER BY a ASC ROWS
                 BETWEEN 1 PRECEDING AND 1 PRECEDING) AS prev,
    any(a) OVER (PARTITION BY g ORDER BY a ASC ROWS
                 BETWEEN 1 FOLLOWING AND 1 FOLLOWING) AS next
FROM llexample
ORDER BY
    g ASC,
    a ASC;

┌─g─┬──────────a─┬───────prev─┬───────next─┐
│ 0 │ 2020-01-01 │ 1970-01-01 │ 2020-01-04 │
│ 0 │ 2020-01-04 │ 2020-01-01 │ 2020-01-07 │
│ 0 │ 2020-01-07 │ 2020-01-04 │ 2020-01-10 │
│ 0 │ 2020-01-10 │ 2020-01-07 │ 1970-01-01 │
│ 1 │ 2020-01-02 │ 1970-01-01 │ 2020-01-05 │
│ 1 │ 2020-01-05 │ 2020-01-02 │ 2020-01-08 │
│ 1 │ 2020-01-08 │ 2020-01-05 │ 1970-01-01 │
│ 2 │ 2020-01-03 │ 1970-01-01 │ 2020-01-06 │
│ 2 │ 2020-01-06 │ 2020-01-03 │ 2020-01-09 │
│ 2 │ 2020-01-09 │ 2020-01-06 │ 1970-01-01 │
└───┴────────────┴────────────┴────────────┘
 

Using lagInFrame/leadInFrame (starting from ClickHouse 21.4)

SELECT
    g,
    a,
    lagInFrame(a) OVER (PARTITION BY g ORDER BY a ASC ROWS
                 BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS prev,
    leadInFrame(a) OVER (PARTITION BY g ORDER BY a ASC ROWS
                 BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS next
FROM llexample
ORDER BY
    g ASC,
    a ASC;

┌─g─┬──────────a─┬───────prev─┬───────next─┐
│ 0 │ 2020-01-01 │ 1970-01-01 │ 2020-01-04 │
│ 0 │ 2020-01-04 │ 2020-01-01 │ 2020-01-07 │
│ 0 │ 2020-01-07 │ 2020-01-04 │ 2020-01-10 │
│ 0 │ 2020-01-10 │ 2020-01-07 │ 1970-01-01 │
│ 1 │ 2020-01-02 │ 1970-01-01 │ 2020-01-05 │
│ 1 │ 2020-01-05 │ 2020-01-02 │ 2020-01-08 │
│ 1 │ 2020-01-08 │ 2020-01-05 │ 1970-01-01 │
│ 2 │ 2020-01-03 │ 1970-01-01 │ 2020-01-06 │
│ 2 │ 2020-01-06 │ 2020-01-03 │ 2020-01-09 │
│ 2 │ 2020-01-09 │ 2020-01-06 │ 1970-01-01 │
└───┴────────────┴────────────┴────────────┘
 

Using neighbor (no grouping, incorrect result over blocks)

SELECT
    g,
    a,
    neighbor(a, -1) AS prev,
    neighbor(a, 1) AS next
FROM
(
    SELECT *
    FROM llexample
    ORDER BY
        g ASC,
        a ASC
);

┌─g─┬──────────a─┬───────prev─┬───────next─┐
│ 0 │ 2020-01-01 │ 1970-01-01 │ 2020-01-04 │
│ 0 │ 2020-01-04 │ 2020-01-01 │ 2020-01-07 │
│ 0 │ 2020-01-07 │ 2020-01-04 │ 2020-01-10 │
│ 0 │ 2020-01-10 │ 2020-01-07 │ 2020-01-02 │
│ 1 │ 2020-01-02 │ 2020-01-10 │ 2020-01-05 │
│ 1 │ 2020-01-05 │ 2020-01-02 │ 2020-01-08 │
│ 1 │ 2020-01-08 │ 2020-01-05 │ 2020-01-03 │
│ 2 │ 2020-01-03 │ 2020-01-08 │ 2020-01-06 │
│ 2 │ 2020-01-06 │ 2020-01-03 │ 2020-01-09 │
│ 2 │ 2020-01-09 │ 2020-01-06 │ 1970-01-01 │
└───┴────────────┴────────────┴────────────┘

 

 

 

날짜

필요한 날짜

반복적으로 필요한 날짜를 추출할 수 있음

WITH RECURSIVE DateRange AS (
  SELECT '2023-01-01' AS Date
  UNION ALL
  SELECT DATE_ADD(Date, INTERVAL 1 DAY)
  FROM DateRange
  WHERE Date < CURDATE()
)
SELECT Date
FROM DateRange;

 


참고

https://kb.altinity.com/altinity-kb-queries-and-syntax/lag-lead/

728x90