
Commit 74b745c

local db skeleton
1 parent cb49ea3 commit 74b745c

File tree

5 files changed: +1613 -118 lines changed

pvdeg/__init__.py

+1

@@ -14,6 +14,7 @@
 from . import montecarlo
 from .scenario import Scenario, GeospatialScenario
 from . import spectral
+from . import store
 from . import symbolic
 from . import standards
 from . import temperature

pvdeg/config.py

+3

@@ -11,6 +11,9 @@
 DATA_DIR = PVDEG_DIR / "data"
 TEST_DIR = PVDEG_DIR.parent / "tests"
 TEST_DATA_DIR = PVDEG_DIR.parent / "tests" / "data"
+
+# downloader target directory
+METOROLOGICAL_DOWNLOAD_PATH = Path.home() / "PVDeg-Meteorological"

 # DATA_LIBRARY = PVDEG_DIR.parent / "DataLibrary"
 # if not os.path.isdir(DATA_LIBRARY):
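
For reference, a quick way to check where the downloader will place data. This assumes the constant is re-exported at the package level, as the import in pvdeg/store.py below suggests; the printed path is only an example:

from pvdeg.config import METOROLOGICAL_DOWNLOAD_PATH

print(METOROLOGICAL_DOWNLOAD_PATH)  # e.g. /home/<user>/PVDeg-Meteorological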

pvdeg/store.py

+185 (new file)

@@ -0,0 +1,185 @@
from pathlib import Path
import xarray as xr
import pandas as pd
import numpy as np
import zarr
import os

from pvdeg import METOROLOGICAL_DOWNLOAD_PATH


def get(group):
    """
    Extract a weather xarray dataset and a metadata pandas dataframe from your zarr store.

    `get` pulls the entire datastore into these objects. PVDeg does not expose indexing
    at this stage. This is practical because all data variables are stored in dask arrays,
    so they are loaded lazily rather than into memory when this is called.
    Select the points you need after calling this method using `sel`, `isel`, `loc`, `iloc`.

    `store.get` is meant to match the API of pvdeg's other geospatial weather APIs such as
    `pvdeg.weather.get`, `pvdeg.weather.distributed_weather`, and
    `GeospatialScenario.get_geospatial_data`.

    Parameters
    ----------
    group : str
        Name of the group to access from your local zarr store.
        Groups are created automatically in your store when you save data using
        `pvdeg.store.store`.

        *From `pvdeg.store.store` docstring*
        Hourly PVGIS data will be saved to "PVGIS-1hr", 30 minute PVGIS to
        "PVGIS-30min", and 15 minute PVGIS to "PVGIS-15min".

    Returns
    -------
    weather_ds : xr.Dataset
        Weather data for all locations requested, in an xarray.Dataset using a dask
        array backend. This may be larger than memory.
    meta_df : pd.DataFrame
        Pandas DataFrame containing metadata for all requested locations. Each row
        maps to a single entry in weather_ds.
    """

    combined_ds = xr.open_zarr(
        store=METOROLOGICAL_DOWNLOAD_PATH,
        group=group
    )

    weather_ds, meta_df = _seperate_geo_weather_meta(ds_from_zarr=combined_ds)

    return weather_ds, meta_df


def store(weather_ds, meta_df):
    """
    Add geospatial meteorological data to your zarr store. Data will be saved to the
    correct group based on its periodicity.

    Hourly PVGIS data will be saved to "PVGIS-1hr", 30 minute PVGIS to "PVGIS-30min",
    and 15 minute PVGIS to "PVGIS-15min".

    Parameters
    ----------
    weather_ds : xr.Dataset
        Weather data for all locations requested, in an xarray.Dataset using a dask
        array backend. This may be larger than memory.
    meta_df : pd.DataFrame
        Pandas DataFrame containing metadata for all requested locations. Each row
        maps to a single entry in weather_ds.

    Returns
    -------
    None
    """

    group = meta_df.iloc[0]["Source"]
    rows_per_entry = weather_ds.isel(gid=0).time.size

    if rows_per_entry == 8760:
        periodicity = "1hr"
    elif rows_per_entry == 17520:
        periodicity = "30min"
    elif rows_per_entry == 35040:
        periodicity = "15min"
    else:
        raise ValueError(
            f"first location to store has {rows_per_entry} rows, "
            f"must have 8760, 17520, or 35040 rows"
        )

    combined_ds = _combine_geo_weather_meta(weather_ds, meta_df)

    # TODO: decide what mode this should be;
    # we want to append new indexes if need be, or overwrite old ones
    combined_ds.to_zarr(
        store=METOROLOGICAL_DOWNLOAD_PATH,
        group=f"{group}-{periodicity}"
    )

    print(f"dataset saved to zarr store at {METOROLOGICAL_DOWNLOAD_PATH}")


def check_store():
    """Check if you have a zarr store at the default download path defined in pvdeg.config."""
    if os.path.exists(os.path.join(METOROLOGICAL_DOWNLOAD_PATH, ".zattrs")):

        size = sum(f.stat().st_size for f in METOROLOGICAL_DOWNLOAD_PATH.glob('**/*') if f.is_file())

        print(f"""
        You have a zarr store at {METOROLOGICAL_DOWNLOAD_PATH}.

        It has {size} bytes.
        """)

    elif os.path.exists(METOROLOGICAL_DOWNLOAD_PATH):
        print(f"You have a directory but no zarr store at {METOROLOGICAL_DOWNLOAD_PATH}")

    else:
        raise FileNotFoundError(f"No directory exists at {METOROLOGICAL_DOWNLOAD_PATH}. Data has not been saved here.")


def my_path():
    """Find the path to your zarr store of data if it exists."""
    if os.path.exists(os.path.join(METOROLOGICAL_DOWNLOAD_PATH, ".zattrs")):
        print(METOROLOGICAL_DOWNLOAD_PATH)
    else:
        print("Directory not found")


def _combine_geo_weather_meta(
    weather_ds: xr.Dataset,
    meta_df: pd.DataFrame
):
    """Combine a weather dataset and a meta dataframe into a single dataset."""

    meta_ds = xr.Dataset.from_dataframe(meta_df)
    # we could do some encoding scheme here; we may not need to store Source
    # unless the zarr compression handles it for us

    # rename the dataframe's "index" dimension to "gid" so it lines up with
    # weather_ds (assigning a bare numpy array to a Dataset key fails because
    # xarray cannot infer its dimensions)
    meta_ds = meta_ds.rename({"index": "gid"})
    meta_ds["gid"] = meta_ds["gid"].astype(np.int32)

    combined = xr.merge([weather_ds, meta_ds]).assign_coords(
        latitude=("gid", meta_ds.latitude.values),
        longitude=("gid", meta_ds.longitude.values),
    )

    return combined


def _seperate_geo_weather_meta(
    ds_from_zarr: xr.Dataset,
):
    """
    Take a loaded dataset in the zarr store schema (weather and meta combined)
    and separate it into `weather_ds` and `meta_df`.
    """

    # there may be a more optimal way to do this
    data = np.column_stack(
        [
            ds_from_zarr.gid.to_numpy(),
            ds_from_zarr.latitude.to_numpy(),
            ds_from_zarr.longitude.to_numpy(),
            ds_from_zarr.altitude.to_numpy(),
            ds_from_zarr.Source.to_numpy(),
            ds_from_zarr.wind_height.to_numpy(),
        ]
    )

    seperated_meta_df = pd.DataFrame(
        data,
        columns=["gid", "latitude", "longitude", "altitude", "Source", "wind_height"]
    ).set_index("gid")

    seperated_weather_ds = ds_from_zarr.drop_vars(("latitude", "longitude", "altitude", "Source", "wind_height"))

    return seperated_weather_ds, seperated_meta_df


def _make_coords_to_gid_da(
    ds_from_zarr: xr.Dataset
):
    """Create a 2D indexable array that maps coordinates (lat and lon) to the gid stored in the zarr store."""

    import dask.array as da

    # only want to do this if the arrays are dask arrays
    lats = ds_from_zarr.latitude.to_numpy()
    lons = ds_from_zarr.longitude.to_numpy()

    # fill with -1 so unmapped coordinate pairs are explicit
    # (da.empty returns uninitialized memory; -1 * da.empty does not fill with -1)
    gids = da.full((len(lats), len(lons)), -1)

    points = xr.DataArray(
        data=gids,
        coords={
            "latitude": lats,
            "longitude": lons
        },
        dims=["latitude", "longitude"],
    )

    # set_index returns a new object, so keep the result
    points = points.set_index(latitude="latitude", longitude="longitude")

    return points
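
Taken together, a minimal round-trip sketch of the new module. It assumes `weather_ds` and `meta_df` were produced by one of pvdeg's existing geospatial weather functions and contain hourly PVGIS data, so the group name resolves to "PVGIS-1hr":

from pvdeg import store

# weather_ds / meta_df assumed to come from an existing pvdeg geospatial
# weather query (e.g. pvdeg.weather.get); hourly PVGIS data has 8760 rows
# per location, so store() writes it to the "PVGIS-1hr" group
store.store(weather_ds, meta_df)

# confirm a zarr store exists at the default download path and report its size
store.check_store()

# lazily reload the whole group (dask-backed), then subset with xarray indexing
weather_ds, meta_df = store.get("PVGIS-1hr")
first_location = weather_ds.isel(gid=0)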

pvdeg/weather.py

+8 -1

@@ -1014,13 +1014,20 @@ def pvgis_empty_weather_ds(gids_size):
     return weather_ds
 
 
+# add some check to see if a dask client exists
+# can force the user to pass a dask client to ensure it exists
+
+# if called without a dask client we will return an xr.Dataset with a dask
+# backend that does not appear to have failed until we compute it
 def weather_distributed(database, coords):
     """
     Grab weather using pvgis for all of the following locations, using dask for parallelization.
-    You must create a dask client with multiple processes before calling this function, otherwise no speedup will be offered.
+    You must create a dask client with multiple processes before calling this function, otherwise results will not be properly calculated.
 
     PVGIS supports up to 30 requests per second, so your dask client should not have more workers/threads
     than would put you over this limit.
+
+    With the eventual NSRDB implementation, API keys will be an issue; each key is rate limited.
 
     Parameters
     ----------
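
A minimal sketch of the calling pattern the docstring requires, assuming `"PVGIS"` is an accepted `database` value, that `coords` is a list of (latitude, longitude) pairs, and that the function returns a (weather_ds, meta_df) pair like pvdeg's other weather APIs; the worker count is an illustrative choice to stay under the rate limit:

from dask.distributed import Client, LocalCluster

import pvdeg

# create a multi-process dask client before calling weather_distributed;
# keep workers/threads low enough to stay under PVGIS's 30 requests/second
cluster = LocalCluster(n_workers=4, threads_per_worker=1)
client = Client(cluster)

# hypothetical (latitude, longitude) pairs
coords = [(39.74, -105.18), (33.45, -112.07)]

weather_ds, meta_df = pvdeg.weather.weather_distributed(database="PVGIS", coords=coords)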
