From a06e1189c19dd42a06366172dfe615d154a111e9 Mon Sep 17 00:00:00 2001 From: Nik Sauer Date: Thu, 20 Feb 2025 10:14:53 +0100 Subject: [PATCH] city current filter outlier for all dimensions --- code/enums.py | 3 +++ code/routers/city.py | 22 +++++++++++++++++++--- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/code/enums.py b/code/enums.py index 7fb1be1..7abeb4e 100644 --- a/code/enums.py +++ b/code/enums.py @@ -94,6 +94,9 @@ class Dimension(): NO2: "no2_ppb", } + # outlier factor + IQR_FACTOR = 3 + _filter_thresholds = { PM2_5: (0, 999) } diff --git a/code/routers/city.py b/code/routers/city.py index 8cd2fa0..dbce015 100644 --- a/code/routers/city.py +++ b/code/routers/city.py @@ -1,4 +1,5 @@ import json +import numpy as np from geopy.geocoders import Nominatim from fastapi import APIRouter, Depends, HTTPException, Query, Response from sqlalchemy.orm import Session @@ -65,7 +66,7 @@ async def get_average_measurements_by_city( q = ( db.query( Values.dimension, - func.avg(Values.value), + func.array_agg(Values.value), func.count(Values.id), func.count(distinct(Station.id)), ) @@ -78,7 +79,7 @@ async def get_average_measurements_by_city( .filter(Values.value != 'nan') .filter(Measurement.time_measured >= start) # filter outlier - .filter(or_(Values.dimension != Dimension.PM2_5, and_(LOWER <= Values.value, Values.value <= UPPER))) + #.filter(or_(Values.dimension != Dimension.PM2_5, and_(LOWER <= Values.value, Values.value <= UPPER))) .group_by(Values.dimension) ) @@ -86,6 +87,21 @@ async def get_average_measurements_by_city( station_count = db.query(Station).join(Location).join(City).filter(City.slug == city_slug).count() + # filter outliers with quartiles + data = [] + for dim, val_list, val_count, s_cnt in q.all(): + a = np.array(val_list) + q1 = np.percentile(a, 25) + q3 = np.percentile(a, 75) + iqr = (q3 - q1) + + l = q1 - iqr * Dimension.IQR_FACTOR + r = q3 + iqr * Dimension.IQR_FACTOR + + b = a[(a >= l) & (a <= r)] + + data.append((dim, 
np.mean(b), val_count, s_cnt)) + j = { "type": "Feature", "geometry": { @@ -104,7 +120,7 @@ async def get_average_measurements_by_city( "value": val, "value_count": val_count, "station_count": s_cnt - } for dim, val, val_count, s_cnt in q.all()], + } for dim, val, val_count, s_cnt in data], } }