Temperature anomaly detection 🔍
Result Set
Regardless of the database, the result set should look like:
site_id | reading_datetime | temperature | average_temperature | percentage_increase |
---|---|---|---|---|
1 | 2021-01-02 02:01:17 | 22.43 | 20.0525 | 11.8564 |
1 | 2021-01-04 21:45:34 | 22.69 | 20.2700 | 11.9388 |
2 | 2021-01-02 01:59:43 | 23.10 | 20.6050 | 12.1087 |
Expand for the DDL
solution(site_id, reading_datetime, temperature, average_temperature, percentage_increase) as (
values
(1, '2021-01-02 02:01:17'::timestamp, 22.43, 20.0525, 11.8564),
(1, '2021-01-04 21:45:34'::timestamp, 22.69, 20.2700, 11.9388),
(2, '2021-01-02 01:59:43'::timestamp, 23.10, 20.6050, 12.1087)
)
Solution
Some SQL solutions per database are provided below.
DuckDB
with temperatures as (
select
site_id,
reading_datetime,
temperature,
avg(temperature) over rows_around_site_reading as average_temperature
from readings
window rows_around_site_reading as (
partition by site_id
order by reading_datetime
rows between 2 preceding
and 2 following
exclude current row
)
qualify 4 = count(*) over rows_around_site_reading
)
select
site_id,
reading_datetime,
temperature,
round(average_temperature, 4) as average_temperature,
round(100.0 * (temperature - average_temperature) / average_temperature, 4) as percentage_increase
from temperatures
where percentage_increase > 10
order by
site_id,
reading_datetime
SQLite, PostgreSQL
with temperatures as (
select
site_id,
reading_datetime,
temperature,
avg(temperature) over rows_around_site_reading as average_temperature,
count(*) over rows_around_site_reading as count_of_rows
from readings
window rows_around_site_reading as (
partition by site_id
order by reading_datetime
rows between 2 preceding
and 2 following
exclude current row
)
)
select
site_id,
reading_datetime,
temperature,
round(average_temperature, 4) as average_temperature,
round(100.0 * (temperature - average_temperature) / average_temperature, 4) as percentage_increase
from temperatures
where 1=1
and count_of_rows = 4
and (temperature - average_temperature) / average_temperature > 0.1
order by
site_id,
reading_datetime
Snowflake
with temperatures as (
select
site_id,
reading_datetime,
temperature,
sum(temperature) over (
partition by site_id
order by reading_datetime rows between 2 preceding and 2 following
) as sum_temps
from readings
qualify 5 = count(*) over (
partition by site_id
order by reading_datetime rows between 2 preceding and 2 following
)
)
select
* exclude (sum_temps),
round((sum_temps - temperature) / 4, 4) as average_temperature,
round(100 * (temperature - average_temperature) / temperature, 4) as percentage_increase
from temperatures
where percentage_increase > 10
order by
site_id,
reading_datetime
SQL Server
with temperatures as (
select
site_id,
reading_datetime,
temperature,
(sum(temperature) over rows_around_site_reading - temperature) / 4 as average_temperature,
count(*) over rows_around_site_reading as count_of_rows
from readings
window rows_around_site_reading as (
partition by site_id
order by reading_datetime rows between 2 preceding and 2 following
)
)
select
site_id,
reading_datetime,
temperature,
round(average_temperature, 4) as average_temperature,
round(100.0 * (temperature - average_temperature) / temperature, 4) as percentage_increase
from temperatures
where 1=1
and count_of_rows = 5
and (temperature - average_temperature) / temperature > 0.1
order by
site_id,
reading_datetime