diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..0e33e5b --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,8 @@ +{ + "editor.formatOnSave": true, + "editor.rulers": [120], + "files.insertFinalNewline": true, + "[mdx]": { + "editor.defaultFormatter": "esbenp.prettier-vscode" + } +} diff --git a/assets/logos/python.svg b/assets/logos/python.svg new file mode 100644 index 0000000..b04f3ee --- /dev/null +++ b/assets/logos/python.svg @@ -0,0 +1,13 @@ + + + + + + + + + + + diff --git a/assets/xarray/xarray-datastructure.png b/assets/xarray/xarray-datastructure.png new file mode 100644 index 0000000..c32d37d Binary files /dev/null and b/assets/xarray/xarray-datastructure.png differ diff --git a/mint.json b/mint.json index 450093d..810f837 100644 --- a/mint.json +++ b/mint.json @@ -37,12 +37,7 @@ "navigation": [ { "group": "Get Started", - "pages": [ - "introduction", - "console", - "quickstart", - "authentication" - ] + "pages": ["introduction", "console", "quickstart", "authentication"] }, { "group": "SDKs", @@ -57,26 +52,23 @@ "sdks/python/xarray", "sdks/python/async", "sdks/python/geometries", - "sdks/python/api-reference" + { + "group": "API Reference", + "icon": "book", + "pages": ["sdks/python/api-reference/datasets", "sdks/python/api-reference/workflows"] + } ] }, { "group": "Go", "icon": "golang", - "pages": [ - "sdks/go/introduction" - ] + "pages": ["sdks/go/introduction"] } ] }, { "group": "Datasets", - "pages": [ - "datasets/introduction", - "datasets/timeseries", - "datasets/collections", - "datasets/loading-data" - ] + "pages": ["datasets/introduction", "datasets/timeseries", "datasets/collections", "datasets/loading-data"] }, { "group": "Workflows", @@ -84,29 +76,16 @@ "workflows/introduction", { "group": "Concepts", - "pages": [ - "workflows/tasks", - "workflows/jobs", - "workflows/task-runners", - "workflows/clusters" - ] + "pages": ["workflows/tasks", "workflows/jobs", "workflows/task-runners", "workflows/clusters"] }, "workflows/caching", { "group": "Observability", - "pages": [ - "workflows/tracing", - "workflows/logging", - "workflows/axiom" - ] + "pages": ["workflows/tracing", "workflows/logging", "workflows/axiom"] }, { "group": "Near-Real Time", - "pages": [ - "workflows/recurring-tasks", - "workflows/cron-triggers", - "workflows/storage-event-triggers" - ] + "pages": ["workflows/recurring-tasks", "workflows/cron-triggers", "workflows/storage-event-triggers"] } ] } @@ -121,4 +100,4 @@ "github": "https://github.com/tilebox", "linkedin": "https://www.linkedin.com/company/tilebox-io" } -} \ No newline at end of file +} diff --git a/prettier.config.js b/prettier.config.js new file mode 100644 index 0000000..b0634be --- /dev/null +++ b/prettier.config.js @@ -0,0 +1,6 @@ +/** @type {import("prettier").Config} */ +const config = { + printWidth: 120, +}; + +module.exports = config; diff --git a/quickstart.mdx b/quickstart.mdx index 28530f1..eb98f4b 100644 --- a/quickstart.mdx +++ b/quickstart.mdx @@ -28,6 +28,7 @@ If you prefer to work locally in your device, the steps below help you get start Tilebox Console Tilebox Console + Use the datasets client to query data from a dataset. 
@@ -36,8 +37,8 @@ If you prefer to work locally in your device, the steps below help you get start
    from tilebox.datasets import Client

    client = Client(token="YOUR_TILEBOX_API_KEY")
-
-    # select an open data dataset
+
+    # select an Opendata dataset
    datasets = client.datasets()
    dataset = datasets.open_data.asf.sentinel2_msi
@@ -45,6 +46,7 @@ If you prefer to work locally in your device, the steps below help you get start
    collection = dataset.collection("S2A_S2MSI1C")
    data_january_2022 = collection.load(("2022-01-01", "2022-02-01"))
    ```
+
    Use the workflows client to create and submit a task.
@@ -70,6 +72,7 @@ If you prefer to work locally in your device, the steps below help you get start
    For this snippet to work you need to have a cluster already created. Check out the guide on [clusters](/workflows/clusters) to learn how to create one.
+
Check out the following guides to learn more about the individual modules that make up Tilebox: diff --git a/sdks/python/async.mdx b/sdks/python/async.mdx new file mode 100644 index 0000000..06962cb --- /dev/null +++ b/sdks/python/async.mdx @@ -0,0 +1,242 @@
+---
+title: Async support
+description: Async support within the Tilebox datasets Python client.
+icon: rotate
+---
+
+Tilebox offers a standard synchronous API by default, but also gives you the option of an async client if you need it.
+
+The synchronous datasets client is great for data exploration in interactive environments like Jupyter notebooks.
+The asynchronous datasets client is great for building production-ready applications that need to scale.
+
+Async is a concurrency model that can be far more efficient than multi-threading for I/O-bound workloads, and can provide significant
+performance benefits.
+
+## Switching to an async datasets client
+
+Typically all you need to do is swap out your import statement of the `Client` and you're good to go. Check out
+the example below to see how that's done.
+
+
+  ```python Python (Sync)
+  from tilebox.datasets import Client
+
+  # this client is sync
+  client = Client()
+  ```
+  ```python Python (Async)
+  from tilebox.datasets.aio import Client
+
+  # this client is async
+  client = Client()
+  ```
+
+
+Once you have switched to the async client, you can use the `async` and `await` keywords to make your code async.
+The examples below show how this works for a few common operations.
+
+
+
+```python Python (Sync)
+# Listing datasets
+datasets = client.datasets()
+
+# Listing collections
+dataset = datasets.open_data.asf.sentinel1_sar
+collections = dataset.collections()
+
+# Collection information
+collection = collections["Sentinel-1A"]
+info = collection.info()
+print(f"Data for {collection.name} is available for {info.availability}")
+
+# Loading data
+data = collection.load(("2022-05-01", "2022-06-01"), show_progress=True)
+
+# Finding a specific datapoint
+datapoint_uuid = "01811c8f-0928-e6f5-df34-364cfa8a86e8"
+datapoint = collection.find(datapoint_uuid)
+```
+
+```python Python (Async)
+# Listing datasets
+datasets = await client.datasets()
+
+# Listing collections
+dataset = datasets.open_data.asf.sentinel1_sar
+collections = await dataset.collections()
+
+# Collection information
+collection = collections["Sentinel-1A"]
+info = await collection.info()
+print(f"Data for {collection.name} is available for {info.availability}")
+
+# Loading data
+data = await collection.load(("2022-05-01", "2022-06-01"), show_progress=True)
+
+# Finding a specific datapoint
+datapoint_uuid = "01811c8f-0928-e6f5-df34-364cfa8a86e8"
+datapoint = await collection.find(datapoint_uuid)
+```
+
+
+
+
+  Async concurrency is also supported in Jupyter notebooks or similar interactive environments. You can even use `await
+  some_async_call()` as the output of a code cell.
+
+
+## Benefits
+
+The main benefit of using an async client is that you can run requests concurrently, which can improve performance.
+This is especially useful when you are loading data from different collections.
+Check out the example below to see how that works.
+
+## Example: Fetching data concurrently
+
+The following example fetches data from different collections.
+In the synchronous example, it fetches the data sequentially, whereas in the async example it fetches the data concurrently.
+This makes the async approach faster for such use cases.
+
+
+
+```python Python (Sync)
+# example: fetching data sequentially
+
+import time
+from tilebox.datasets import Client
+from tilebox.datasets.timeseries import RemoteTimeseriesDatasetCollection  # for type hinting
+
+client = Client()
+datasets = client.datasets()
+collections = datasets.open_data.asf.sentinel1_sar.collections()
+
+def stats_for_2020(collection: RemoteTimeseriesDatasetCollection) -> None:
+    """Fetch data for 2020 and print the number of data points that were loaded."""
+    data = collection.load(("2020-01-01", "2021-01-01"), show_progress=True)
+    n = data.sizes['time'] if 'time' in data.sizes else 0
+    print(f"There are {n} datapoints in {collection.name} for 2020.")
+
+start = time.time()
+
+# for each collection
+for name in collections:
+    # fetch the data, print the number of datapoints and then continue to the next collection
+    stats_for_2020(collections[name])
+
+end = time.time()
+print(f"Fetching data took {end - start:.2f} seconds")
+```
+
+```python Python (Async)
+# example: fetching data concurrently
+
+import asyncio
+import time
+from tilebox.datasets.aio import Client
+from tilebox.datasets.timeseries import RemoteTimeseriesDatasetCollection  # for type hinting
+
+client = Client()
+datasets = await client.datasets()
+collections = await datasets.open_data.asf.sentinel1_sar.collections()
+
+async def stats_for_2020(collection: RemoteTimeseriesDatasetCollection) -> None:
+    """Fetch data for 2020 and print the number of data points that were loaded."""
+    data = await collection.load(("2020-01-01", "2021-01-01"), show_progress=True)
+    n = data.sizes['time'] if 'time' in data.sizes else 0
+    print(f"There are {n} datapoints in {collection.name} for 2020.")
+
+start = time.time()
+
+# initiate all requests concurrently
+requests = [stats_for_2020(collections[name]) for name in collections]
+# and then wait for all of them to finish before continuing
+await asyncio.gather(*requests)
+
+end = time.time()
+print(f"Fetching data took {end - start:.2f} seconds")
+```
+
+
+
+The output is shown below. As you can see, the async approach is about 5 seconds faster. If you have `show_progress` enabled,
+the progress bars are updated concurrently. In this example the second collection contains less data than the first one,
+so it finishes first.
+
+
+
+```txt Python (Sync)
+Fetching data: 100% |██████████████████████████████ [00:13<00:00, 207858 datapoints, 3.91 MB/s]
+There are 207858 datapoints in Sentinel-1A for 2020.
+Fetching data: 100% |██████████████████████████████ [00:11<00:00, 179665 datapoints, 4.39 MB/s]
+There are 179665 datapoints in Sentinel-1B for 2020.
+Fetching data took 25.34 seconds
+```
+
+```txt Python (Async)
+Fetching data: 100% |██████████████████████████████ [00:19<00:00, 207858 datapoints, 2.21 MB/s]
+Fetching data: 100% |██████████████████████████████ [00:17<00:00, 179665 datapoints, 2.94 MB/s]
+There are 179665 datapoints in Sentinel-1B for 2020.
+There are 207858 datapoints in Sentinel-1A for 2020.
+Fetching data took 20.12 seconds
+```
+
+
+
+## Supported async environments
+
+The Tilebox Datasets Python client supports either `asyncio` or `trio` as an async backend.
+It auto-detects which of those two to use.
+
+### AsyncIO
+
+AsyncIO is Python's [built-in library](https://docs.python.org/3/library/asyncio.html) for writing concurrent
+code with the async/await syntax.
+
+```python
+import asyncio
+from tilebox.datasets.aio import Client
+
+async def main():
+    client = Client()
+    datasets = await client.datasets()
+    print(datasets)
+
+asyncio.run(main())
+```
+
+### Trio
+
+Trio is an [alternative async library](https://trio.readthedocs.io/en/stable/), designed around the
+[principles of structured concurrency](https://en.wikipedia.org/wiki/Structured_concurrency).
+
+```python
+import trio
+from tilebox.datasets.aio import Client
+
+async def main():
+    client = Client()
+    datasets = await client.datasets()
+    print(datasets)
+
+trio.run(main)
+```
+
+### AnyIO
+
+AnyIO is an [asynchronous networking and concurrency library](https://anyio.readthedocs.io/en/stable/) that works on
+top of either asyncio or trio. The Tilebox Datasets Python client is written using `anyio`, which is why it can be used with
+either `asyncio` or `trio`.
+
+```python
+import anyio
+from tilebox.datasets.aio import Client
+
+async def main():
+    client = Client()
+    datasets = await client.datasets()
+    print(datasets)
+
+anyio.run(main, backend="asyncio")
+``` diff --git a/sdks/python/geometries.mdx b/sdks/python/geometries.mdx new file mode 100644 index 0000000..7ca12fd --- /dev/null +++ b/sdks/python/geometries.mdx @@ -0,0 +1,343 @@
+---
+title: Geometries
+description: How geometries are handled in the Tilebox Python client.
+icon: earth-americas
+---
+
+Many datasets consist of granules that represent a certain geographical area on the earth's surface.
+Often a polygon defining the outline of this area - a footprint - is stored alongside other granule metadata in time series datasets.
+Tilebox provides special support for working with geometries out of the box.
+
+Here is an example that loads some granules of the `ERS SAR` Opendata dataset, which contains geometries.
+
+```python Loading ERS data
+from tilebox.datasets import Client
+
+client = Client()
+datasets = client.datasets()
+
+ers_collection = datasets.open_data.asf.ers_sar.collection("ERS-2")
+ers_data = ers_collection.load(("2008-02-10T21:00", "2008-02-10T22:00"))
+```
+
+## Shapely
+
+In the `ers_data` dataset each granule contains a `geometry` field which represents the footprint of each granule as a polygon.
+Tilebox automatically converts such geometry fields to `Polygon` or `MultiPolygon` objects from the
+[shapely](https://shapely.readthedocs.io/en/stable/manual.html) library.
+By integrating with shapely, you can make use of the rich set of libraries and tooling around it, which includes support for computing
+polygon characteristics such as the total area, intersection checks, or the conversion to other common formats.
+
+Here are some of the geometries that are part of the loaded ERS granules.
+
+```python Printing geometries
+geometries = ers_data.geometry.values
+print(geometries)
+```
+
+```plaintext Output
+[<POLYGON ((-150.753 74.25, -152.032 73.336, -149.184 73.002, -147.769 73.899, -150.753 74.25))>
+ ... one shapely Polygon object per granule ...]
+```
+
+### Accessing Coordinates
+
+You can select one polygon out of the geometries, and access the underlying coordinates, as well as an automatically
+computed centroid point.
+
+```python Accessing coordinates and computing a centroid point
+polygon = geometries[0]
+lon, lat = polygon.exterior.coords.xy
+center, = list(polygon.centroid.coords)
+
+print(lon)
+print(lat)
+print(center)
+```
+
+```plaintext Output
+array('d', [-150.753244, -152.031574, -149.183655, -147.769339, -150.753244])
+array('d', [74.250081, 73.336051, 73.001748, 73.899483, 74.250081])
+(-149.92927907414239, 73.62538063474753)
+```
+
+
+  Interactive environments such as [Jupyter Notebooks](/sdks/python/sample-notebooks) can also directly visualize
+  Polygon shapes graphically, when a `shapely.Polygon` is the output of a cell. Just type `polygon` in an empty cell and
+  execute it to get a visual representation of the shape of the polygon.
+
+
+### Visualization on a Map
+
+To visualize polygons on a map, you can use the [folium](https://pypi.org/project/folium/) library.
+Here is a short helper function which produces an OpenStreetMap view with the polygons overlaid on top.
+
+```python visualize helper function
+# pip install folium
+from folium import Figure, Map, GeoJson
+from folium.plugins import MiniMap
+from shapely import Polygon, to_geojson
+from collections.abc import Iterable
+
+def visualize(poly: Polygon | Iterable[Polygon], zoom=4):
+    """Visualize a polygon or a list of polygons on a map"""
+    poly = [poly] if not isinstance(poly, Iterable) else list(poly)
+
+    fig = Figure(width=600, height=600)
+    # center the map on the middle polygon; folium expects (lat, lon), hence the [::-1]
+    map = Map(location=poly[len(poly)//2].centroid.coords[0][::-1], zoom_start=zoom, control_scale=True)
+    map.add_child(MiniMap())
+    fig.add_child(map)
+
+    for p in poly:
+        map.add_child(GeoJson(to_geojson(p)))
+    return fig
+```
+
+Here is how you can use it.
+
+```python Visualizing our polygon
+visualize(polygon)
+```
+
+
+
+The `visualize` helper function also supports a whole list of polygons, which you can use to showcase the data layout of the ERS granules.
+
+```python Visualizing all geometries
+visualize(geometries)
+```
+
+
+
+## Format conversion
+
+Shapely provides support out of the box for converting the polygons to some common formats,
+such as [GeoJSON](https://geojson.org/) or [Well-Known Text (WKT)](https://docs.ogc.org/is/18-010r7/18-010r7.html).
+
+```python Converting to GeoJSON
+from shapely import to_geojson
+
+print(to_geojson(polygon))
+```
+
+```plaintext Output
+{"type":"Polygon","coordinates":[[[-150.753244,74.250081],[-152.031574,73.336051],[-149.183655,73.001748],[-147.769339,73.899483],[-150.753244,74.250081]]]}
+```
+
+```python Converting to WKT
+from shapely import to_wkt
+
+print(to_wkt(polygon))
+```
+
+```plaintext Output
+POLYGON ((-150.753244 74.250081, -152.031574 73.336051, -149.183655 73.001748, -147.769339 73.899483, -150.753244 74.250081))
+```
+
+## Checking intersections
+
+One common task when working with geometries is to check whether a given geometry falls into a certain area of interest.
+Fortunately, `shapely` provides an `intersects` method for this use case.
+
+```python Checking intersections
+from shapely import box
+
+# box representing the rectangular area lon=(-160, -150) and lat=(69, 70)
+area_of_interest = box(-160, 69, -150, 70)
+
+for i, polygon in enumerate(geometries):
+    if area_of_interest.intersects(polygon):
+        print(f"{ers_data.granule_name[i].item()} intersects the area of interest!")
+    else:
+        print(f"{ers_data.granule_name[i].item()} doesn't intersect the area of interest!")
+```
+
+```plaintext Output
+E2_66974_STD_F264 doesn't intersect the area of interest!
E2_66974_STD_F265 doesn't intersect the area of interest!
+E2_66974_STD_F267 doesn't intersect the area of interest!
+E2_66974_STD_F269 doesn't intersect the area of interest!
+E2_66974_STD_F271 doesn't intersect the area of interest!
+E2_66974_STD_F273 intersects the area of interest!
+E2_66974_STD_F275 intersects the area of interest!
+E2_66974_STD_F277 intersects the area of interest!
+E2_66974_STD_F279 doesn't intersect the area of interest!
+E2_66974_STD_F281 doesn't intersect the area of interest!
+E2_66974_STD_F283 doesn't intersect the area of interest!
+E2_66974_STD_F285 doesn't intersect the area of interest!
+E2_66974_STD_F289 doesn't intersect the area of interest!
+```
+
+## Combining polygons
+
+As you saw in the preceding visualization of the granule footprints, the granules, all put together, form a whole orbit from pole to pole.
+Oftentimes measurements are then combined in certain processing steps.
+You can do the same thing for the geometries and combine them into a single polygon, which represents the hull around all individual footprints.
+To do so, you can make use of [shapely.unary_union](https://shapely.readthedocs.io/en/stable/reference/shapely.unary_union.html).
+
+```python Combining multiple polygons
+from shapely.ops import unary_union
+
+hull = unary_union(geometries)
+visualize(hull)
+```
+
+
+
+## Multi Polygons
+
+As you can see, the computed hull actually consists of two polygons, because there is a gap (probably a missing granule) in the geometries.
+
+Shapely represents such geometries as a [shapely.MultiPolygon](https://shapely.readthedocs.io/en/latest/reference/shapely.MultiPolygon.html),
+which in essence just represents a series of individual polygons put together.
+
+```python Accessing individual polygons of a MultiPolygon
+print(f"The computed hull of type {type(hull).__name__} consists of {len(hull.geoms)} sub polygons")
+for i, poly in enumerate(hull.geoms):
+    print(f"Sub polygon {i} has an area of {poly.area}")
+```
+
+```plaintext Output
+The computed hull of type MultiPolygon consists of 2 sub polygons
+Sub polygon 0 has an area of 2.025230449898011
+Sub polygon 1 has an area of 24.389998081651527
+```
+
+## Antimeridian Crossings
+
+One common problem when working with `longitude / latitude` geometries like this is crossings of the 180th meridian, also known as the antimeridian.
+Consider the coordinates of a `LineString` from Japan to the United States. Its longitude coordinates would look something like the following.
+
+`140, 141, 142, ..., 179, 180, -179, -178, ..., -125, -124`
+
+Since libraries like `shapely` are not designed for handling spherical coordinate systems, extra care needs to be taken when working with such geometries.
+
+The `GeoJSON` spec actually provides a solution for this problem.
+In the section [Antimeridian Cutting](https://datatracker.ietf.org/doc/html/rfc7946#section-3.1.9) it proposes to
+always cut lines and polygons into two parts, one representing the eastern hemisphere and the other the western hemisphere.
+
+As an example, here is an `ERS` granule where this exact problem occurs.
+
+```python Antimeridian Crossing
+# a granule that crosses the antimeridian
+granule = ers_collection.find("0119bb86-0260-5819-6aab-f99796417155")
+polygon = granule.geometry.item()
+print(polygon.exterior.coords.xy)
+visualize(polygon)
+```
+
+```plaintext Output
+array('d', [177.993407, 176.605009, 179.563047, -178.904076, 177.993407])
+array('d', [74.983185, 74.074615, 73.727752, 74.61847, 74.983185])
+```
+
+
+
+This 2D visualization doesn't look right. Not only is the visualization wrong, the same also applies to
+calculations you may want to do. For example, testing whether the granule intersects the 0-meridian gives the wrong result.
+
+```python Problems with calculating intersections
+from shapely import LineString
+
+null_meridian = LineString([(0, -90), (0, 90)])
+print(polygon.intersects(null_meridian))  # True - but this is wrong!
+```
+
+One way to solve this issue is to cut the polygon into two parts.
+The [antimeridian](https://pypi.org/project/antimeridian/) package does exactly that.
+
+```python Cutting the polygon along the antimeridian
+# pip install antimeridian
+
+import antimeridian
+fixed_polygon = antimeridian.fix_polygon(polygon)
+visualize(fixed_polygon)
+
+print(fixed_polygon.intersects(null_meridian))  # False - this is correct now
+```
+
+
+
+Since `shapely` is not aware of the spherical nature of the data, the `centroid` of this fixed polygon is still wrong.
+The `antimeridian` package also has a function to correct this.
+
+```python Calculating the centroid of a fixed polygon crossing the antimeridian
+print("Wrongly computed centroid coordinates (shapely)")
+print(list(fixed_polygon.centroid.coords))
+print("Correct centroid coordinates (antimeridian taken into account)")
+print(list(antimeridian.centroid(fixed_polygon).coords))
+```
+
+```plaintext Output
+Wrongly computed centroid coordinates (shapely)
+[(139.8766350146937, 74.3747116658462)]
+Correct centroid coordinates (antimeridian taken into account)
+[(178.7782777050171, 74.3747116658462)]
+```
+
+## Spherical Geometry
+
+Another approach to handling the antimeridian issue is to perform all coordinate-related calculations,
+such as polygon intersections, in a [3D spherical coordinate system](https://en.wikipedia.org/wiki/Spherical_coordinate_system).
+
+One great library to do this is [spherical_geometry](https://spherical-geometry.readthedocs.io/en/latest/).
+Here is an example.
+
+```python Spherical Geometry
+# pip install spherical-geometry
+
+from spherical_geometry.polygon import SphericalPolygon
+
+lon, lat = polygon.exterior.coords.xy
+spherical_poly = SphericalPolygon.from_lonlat(lon, lat)
+# let's check the x, y, z coordinates of the spherical polygon:
+print(list(spherical_poly.points))
+```
+
+```plaintext Output
+[array([[-0.25894363,  0.00907234,  0.96584983],
+       [-0.2651968 , -0.00507317,  0.96418096],
+       [-0.28019363,  0.00213687,  0.95994112],
+       [-0.27390375,  0.01624885,  0.96161984],
+       [-0.25894363,  0.00907234,  0.96584983]])]
+```
+
+Now you can easily compute intersections or check whether a certain point is inside the polygon.
+You can compare the incorrect computation from `shapely` with the fixed version when using `spherical_geometry`.
+
+```python Correct calculations using spherical geometry
+from shapely import Point
+
+# a point on the null-meridian, way off from our polygon
+null_meridian_point = 0, 74.4
+# a point actually inside our polygon
+point_inside = 178.8, 74.4
+
+print("Shapely results:")
+print("- Null meridian point inside:", polygon.contains(Point(*null_meridian_point)))
+print("- Actual inside point inside:", polygon.contains(Point(*point_inside)))
+
+print("Spherical geometry results:")
+print("- Null meridian point inside:", spherical_poly.contains_lonlat(*null_meridian_point))
+print("- Actual inside point inside:", spherical_poly.contains_lonlat(*point_inside))
+```
+
+```plaintext Output
+Shapely results:
+- Null meridian point inside: True
+- Actual inside point inside: False
+Spherical geometry results:
+- Null meridian point inside: False
+- Actual inside point inside: True
+``` diff --git a/sdks/python/introduction.mdx b/sdks/python/introduction.mdx index 7df08e4..19cb348 100644 --- a/sdks/python/introduction.mdx +++ b/sdks/python/introduction.mdx @@ -1,4 +1,24 @@ --- title: Introduction -description: Learn about the Tilebox Python SDK ---- \ No newline at end of file +description: The Tilebox Python SDK +icon: python +--- + +
+
+  # Tilebox Python Clients
+
+  We provide various Python clients to interact with different Tilebox services. They are structured into separate packages,
+  which can be installed independently based on the services you want to interact with, or all together for a complete experience.
+
+  `tilebox-datasets` is a Python package and client to interact with the Tilebox Data Access Module.
+
+  `tilebox-workflows` is a Python package and client to interact with the Tilebox Workflow Orchestrator.
+
+  Both packages are available in a private PyPI repository. To install them, check out the [installation instructions](/sdks/python/installation).
+
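+  As a minimal sketch - assuming your environment is already configured for the private PyPI repository, as covered in the [installation instructions](/sdks/python/installation) - installing both packages could look like this:
+
+  ```bash
+  # hypothetical invocation - the exact setup is covered in the installation instructions
+  pip install tilebox-datasets tilebox-workflows
+  ```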
+ + Tilebox Console + + +
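+
+Once installed, here is a minimal sketch of what using the datasets client looks like. It mirrors the [quickstart](/quickstart), with `YOUR_TILEBOX_API_KEY` as a placeholder for your own API key:
+
+```python
+from tilebox.datasets import Client
+
+# authenticate with your API key and list the datasets available to you
+client = Client(token="YOUR_TILEBOX_API_KEY")
+datasets = client.datasets()
+print(datasets)
+```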
diff --git a/sdks/python/sample-notebooks.mdx b/sdks/python/sample-notebooks.mdx new file mode 100644 index 0000000..dd8f8ec --- /dev/null +++ b/sdks/python/sample-notebooks.mdx @@ -0,0 +1,103 @@
+---
+title: Sample Notebooks
+description: Maintained sample code that you can use and learn from.
+icon: notebook
+---
+
+To help you get the hang of the Python client quickly and effortlessly, here is a list of use-cases and sample notebooks.
+
+Each notebook can be executed from top to bottom, and does not require any setup.
+
+## Quick Link
+
+Check out some example notebooks prepared in this shared [Google Drive Folder](https://drive.google.com/drive/folders/1I7G35LLeB2SmKQJFsyhpSVNyZK9uOeJt).
+
+
+  Within Drive you can right-click a notebook and select "Open with -> Google Colaboratory" to open it in an interactive
+  Google Colab environment.
+
+
+## Notebook Overview
+
+The Tilebox ERS Opendata Demo is a good starting point to get familiar with the Python client, and is recommended for everyone who wants to use the client.
+
+- [Tilebox ERS Opendata Demo](https://colab.research.google.com/drive/1LTYhLKy8m9psMhu0DvANs7hyS3ViVpas)
+
+This notebook shows how to use the Python client to query metadata from the ERS-SAR Opendata dataset.
+Then it shows how to filter the results based on geographical location, and then download the product data for a specific granule.
+
+- [Tilebox S5P Tropomi Methane Data Access](https://colab.research.google.com/drive/1eVYARNFnTIeQqBs6gqeay01EDvRk2EI4)
+
+This notebook shows how to use the Python client to query the S5P Tropomi Opendata dataset for Methane products.
+Then it shows how to download and access the corresponding product file, and finally how to plot methane data in a georeferenced map plot.
+
+Cells are executed one-by-one by pressing `Shift+Enter` (see [Interactive Environments](/tools/jupyter)), and the most commonly used libraries are pre-installed.
+
+All the demo notebooks require Python 3.10 or later.
+
+## Interactive Environments
+
+Jupyter, Google Colab or JetBrains Datalore are interactive environments that make developing and sharing algorithmic code simple and accessible.
+They allow you to work in notebooks, which are documents that contain both code and rich text elements, such as figures, links, equations, and more.
+Notebooks don't need any setup and can be shared with others.
+
+
+
+  [Jupyter notebooks](https://jupyter.org/) are the original interactive environment for Python. They are great to
+  work with, but require a local installation.
+
+
+  [Google Colab](https://colab.research.google.com/) is a free tool that offers a hosted interactive Python
+  environment. Google Colab is great for connecting to local Jupyter instances and for sharing code using Google
+  credentials, or within organizations that use Google Workspace.
+
+
+  [JetBrains Datalore](https://datalore.jetbrains.com/) is a free and convenient way to collaboratively test, develop
+  and share Python code and algorithms. It comes with secret management built in, so you can store your credentials
+  and share notebooks. Datalore comes with the advanced JetBrains syntax highlighting and autocompletion software
+  developers are used to. Note that it currently supports only Python 3.8, which is not compatible with the Tilebox
+  Python client.
+
+
+ +## Installing Packages + +From within your interactive environment you can install missing packages via pip in "magic" cells started with +an exclamation mark. + +```bash +# pip is already installed in your interactive environment +!pip3 install .... +``` + +All APIs or commands that require authentication can be used through client libraries that hide the tokens, such that +notebooks can be shared without sharing personal credentials. + +## Executing Code + +Code can be executed by pressing the play button in the top left corner of the cell, or by pressing `Shift + Enter`. +While the code is running the cell show a spinning icon, and when the code is done the icon is replaced by +a number. This number indicates the order in which the cells were executed. The output of the cell is shown below +the code. + +## Authorization + +When sharing notebooks it's important to not share your Tilebox API key by using it directly. +Instead, there are two methods to use the Tilebox Python client in interactive environments. +Using environment variables, or interactively. + +```python Using environment variables to store your API key +# first define an environment variable "TILEBOX_API_KEY" which contains your API key +import os +token = os.getenv("TILEBOX_API_KEY") +``` + +**Interactive** authorization is possible by using the builtin `getpass` module. It prompts the user for the API key +when running the code, which is stored in memory and again is not shared with other users when a notebook is shared. + +```python Interactively providing your API key +from getpass import getpass +token = getpass("API key:") +``` diff --git a/sdks/python/xarray.mdx b/sdks/python/xarray.mdx new file mode 100644 index 0000000..824c216 --- /dev/null +++ b/sdks/python/xarray.mdx @@ -0,0 +1,361 @@ +--- +title: Xarray +description: Xarray library, common use-cases and how they can be implemented easily. +icon: chart-bar +--- + +[Xarray](https://xarray.dev/) is a library for working with labelled multi-dimensional arrays. +Xarray is built on top of [NumPy](https://numpy.org/) and [Pandas](https://pandas.pydata.org/). Xarray introduces labels in +the form of dimensions, coordinates and attributes on top of raw NumPy-like arrays, which allows for a more intuitive, +more concise, and less error-prone developer experience. The package includes a large and growing library of +domain-agnostic functions for advanced analytics and visualization with these data structures. + + + Overview of the Xarray data structure + + + + A good overview of the Xarray library and why it's a perfect fit for N-dimensional data (such as Tilebox time series + datasets) can be found in the official [Why Xarray? documentation + page](https://xarray.pydata.org/en/stable/why-xarray.html). + + +The Tilebox Python client provides access to your satellite data in the form of a +[xarray.Dataset](https://docs.xarray.dev/en/stable/generated/xarray.Dataset.html#xarray.Dataset). This brings a great +number of benefits compared to custom Tilebox specific data structures such as: + +- **Familiarity**: Xarray is built on top of NumPy and Pandas, which are two of the most popular Python libraries for + scientific computing. If you are already familiar with these libraries, you are right at home with Xarray. +- **Performance**: By using NumPy under the hood, which in turn is built on top of C and Fortran, Xarray benefits from + all the performance optimizations that those libraries offer. This means that Xarray is fast and can handle large + datasets with ease. 
- **Interoperability**: Xarray is a popular library and is used by many other libraries. This means that you can
+  easily integrate Xarray into your existing workflows. Many third-party libraries are available to extend Xarray
+  with more capabilities for different use cases.
+- **Flexibility**: Xarray is a flexible library and can be used for a wide range of use-cases. It's also
+  easy to extend Xarray with custom functionality.
+
+## An example dataset
+
+To get an understanding of how Xarray works, a simple example dataset is used, as could be returned by a
+[Tilebox timeseries dataset](/datasets/timeseries).
+
+
+
+```python Python (Sync)
+from tilebox.datasets import Client
+
+client = Client()
+datasets = client.datasets()
+collection = datasets.open_data.asf.sentinel1_sar.collection("Sentinel-1A")
+satellite_data = collection.load(("2022-05-01", "2022-06-01"), show_progress=True)
+print(satellite_data)
+```
+
+```python Python (Async)
+from tilebox.datasets.aio import Client
+
+client = Client()
+datasets = await client.datasets()
+collection = datasets.open_data.asf.sentinel1_sar.collection("Sentinel-1A")
+satellite_data = await collection.load(("2022-05-01", "2022-06-01"), show_progress=True)
+print(satellite_data)
+```
+
+
+
+```txt Output
+<xarray.Dataset> Size: 8MB
+Dimensions:         (time: 16507, latlon: 2, n_footprint: 5)
+Coordinates:
+    ingestion_time  (time) datetime64[ns] 132kB 2023-10-20T10:04:07 ... ...
+    id              (time) ...
+```
+
+
+  This is a simple dataset that was generated to showcase some common Xarray use-cases. If you want to follow along, you
+  can download the dataset as a NetCDF file. The [Reading and writing
+  files section](/sdks/python/xarray#reading-and-writing-files) explains how to save and load Xarray datasets to and
+  from NetCDF files.
+
+
+Here is a breakdown of the preceding output:
+
+- `satellite_data` **dataset** contains different **dimensions**, **coordinates** and **variables**
+- `time` **dimension** consists of 16507 elements. This means there are 16507 data points in the dataset
+- `time` **dimension coordinate** contains datetime values. This is the time when the data was measured.
+  The `*` mark shows that it's a dimension coordinate. Dimension coordinates are used for label-based
+  indexing and alignment, which means you can use the time to access individual data points in the dataset
+- `ingestion_time` **non-dimension coordinate** contains datetime values. This is the time when the data was
+  ingested into the Tilebox database. Non-dimension coordinates are variables that contain coordinate data, but are not
+  used for label-based indexing and alignment. They can [even be multidimensional](https://docs.xarray.dev/en/stable/examples/multidimensional-coords.html).
+- `sensor` **variable** contains integers. This variable tells you which sensor produced a given measurement.
+  A sensor in this case is identified by a number, `1` or `2` in the example dataset
+- `measurement` **variable** contains floating point values. This variable contains the actual measurement values.
+
+
+  Check out the [xarray terminology overview](https://docs.xarray.dev/en/stable/user-guide/terminology.html) to deepen
+  your understanding of **datasets**, **dimensions**, **coordinates**, and **variables**.
+
+
+The examples below showcase some of the most common use-cases for Xarray. Since the data is already loaded into memory,
+no more API requests are required, so there is no difference between the sync and the async client in the examples below.
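+
+If you want to follow along without querying the API, a minimal sketch is to load the downloaded NetCDF file instead - this assumes you saved it as `example_satellite_data.nc`, as shown in the [reading and writing files section](/sdks/python/xarray#reading-and-writing-files):
+
+```python
+import xarray as xr
+
+# load the example dataset from a local NetCDF file instead of querying the API
+satellite_data = xr.open_dataset("example_satellite_data.nc")
+```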
+
+## Accessing data in a dataset
+
+### Accessing by index
+
+There are a couple of different ways that you can access data in a dataset. The Xarray documentation provides a
+[great overview](https://docs.xarray.dev/en/stable/user-guide/indexing.html) of all those methods.
+
+You can access the `measurement` variable:
+
+```python Accessing values
+# Let's print the first measurement value
+print(satellite_data.measurement[0])
+```
+
+```txt Output
+<xarray.DataArray 'measurement' ()>
+array(3.07027067)
+Coordinates:
+    ingestion_time  datetime64[ns] 2017-01-01T15:26:32
+    time            datetime64[ns] 2017-01-01T02:45:35
+```
+
+You can see in the preceding output that the first measurement value is `3.07027067`, but the output is not just a plain
+`float` containing that value. Instead it's an `xarray.DataArray` object. This way you can still
+access the coordinates belonging to that value. To get the plain Python object you can use the `item()` method:
+
+```python Accessing raw values
+measurement = satellite_data.measurement[0].item()
+print(measurement)
+```
+
+```txt Output
+3.070270667590244
+```
+
+You can access coordinates in a similar manner. For datetime fields Xarray additionally offers a special `dt` (datetime)
+accessor, which you can use to format the time as a string:
+
+```python Accessing and formatting datetime fields
+time_format = "%Y-%m-%d %H:%M:%S"
+time = satellite_data.time[0].dt.strftime(time_format).item()
+ingestion_time = satellite_data.ingestion_time[0].dt.strftime(time_format).item()
+print(f"Measurement 0 was taken at {time} and ingested at {ingestion_time}")
+```
+
+```txt Output
+Measurement 0 was taken at 2017-01-01 02:45:35 and ingested at 2017-01-01 15:26:32
+```
+
+Similarly you can also retrieve a whole dataset containing all variables and coordinates for a single data point in the
+example dataset. For this, Xarray offers the `isel` method (short for index selection):
+
+```python Accessing a whole datapoint by index
+datapoint = satellite_data.isel(time=0)
+print(datapoint)
+```
+
+```txt Output
+<xarray.Dataset>
+Dimensions:         ()
+Coordinates:
+    ingestion_time  datetime64[ns] 2017-01-01T15:26:32
+    time            datetime64[ns] 2017-01-01T02:45:35
+Data variables:
+    sensor          int64 2
+    measurement     float64 3.07
+```
+
+### Subsets of data
+
+You can also access subsets of the data.
+Here are a couple of ways you can retrieve the first 3 and last 3 measurements.
+
+```python Accessing raw values
+# individual variables
+first_3_measurements = satellite_data.measurement[0:3]
+print("First 3 measurements", first_3_measurements.values)
+last_3_measurements = satellite_data.measurement[-3:]
+print("Last 3 measurements", last_3_measurements.values)
+
+# whole sub datasets
+first_3 = satellite_data.isel(time=slice(0, 3))
+last_3 = satellite_data.isel(time=slice(-3, None))
+print("Sub dataset of the last 3 datapoints")
+print(last_3)
+```
+
+```txt Output
+First 3 measurements [3.07027067 2.42966457 3.58839564]
+Last 3 measurements [1.4907412 2.04492377 2.79836407]
+Sub dataset of the last 3 datapoints
+<xarray.Dataset>
+Dimensions:         (time: 3)
+Coordinates:
+    ingestion_time  (time) datetime64[ns] 2022-12-31T20:56:40 ... 2022-12-31T...
+  * time            (time) datetime64[ns] 2022-12-31T15:47:54 ... 2022-12-31T...
+Data variables:
+    sensor          (time) int64 1 2 1
+    measurement     (time) float64 1.491 2.045 2.798
+```
+
+## Filtering data
+
+Xarray also offers a convenient way of filtering a dataset based on a condition.
+For example, you can filter the dataset to only look at measurements taken by sensor `1`.
+
+```python Filtering data by sensor
+measurements_by_sensor_1 = satellite_data.measurement[satellite_data.sensor == 1]
+print(measurements_by_sensor_1)
+```
+
+```txt Output
+<xarray.DataArray 'measurement' (time: ...)>
+array([3.58839564e+00, 2.70314237e+00, 3.27767130e-03, ..., 2.83278085e+00, 1.49074120e+00, 2.79836407e+00])
+Coordinates:
+    ingestion_time  (time) datetime64[ns] 2017-01-01T15:26:32 ... 2022-12-31T...
+  * time            (time) datetime64[ns] 2017-01-01T02:54:03 ... 2022-12-31T...
+```
+
+You can combine conditions, e.g. to filter for measurement values between `1.5` and `1.6` taken by sensor `1`:
+
+```python Filtering data by sensor and measurement value
+data_filter = (
+    (satellite_data.sensor == 1) &
+    (satellite_data.measurement > 1.5) &
+    (satellite_data.measurement < 1.6)
+)
+filtered_measurements = satellite_data.measurement[data_filter]
+print(filtered_measurements)
+```
+
+```txt Output
+<xarray.DataArray 'measurement' (time: ...)>
+array([1.54675131, 1.58851704, 1.52978976, ..., 1.54684979, 1.58256101, 1.5325089 ])
+Coordinates:
+    ingestion_time  (time) datetime64[ns] 2017-01-01T05:21:17 ... 2022-12-31T...
+  * time            (time) datetime64[ns] 2017-01-01T18:17:47 ... 2022-12-31T...
+```
+
+## Selecting data by value
+
+You can use the values of a dimension coordinate to index your dataset.
+For example, you can access the measurement value taken at `2021-01-14 07:21:04`:
+
+```python Indexing by time
+specific_measurement = satellite_data.sel(time="2021-01-14T07:21:04")
+print(specific_measurement)
+```
+
+```txt Output
+<xarray.Dataset>
+Dimensions:         ()
+Coordinates:
+    ingestion_time  datetime64[ns] 2020-12-27T18:30:47
+    time            datetime64[ns] 2021-01-14T07:21:04
+Data variables:
+    sensor          int64 1
+    measurement     float64 3.873
+```
+
+When trying to access a value that is not in the dataset, a `KeyError` is raised.
+
+```python Indexing by time (not found)
+nearest_measurement = satellite_data.sel(time="2021-01-14T07:21:05")  # raises KeyError: "2021-01-14T07:21:05"
+```
+
+The `method` parameter can be used to return the closest value instead of raising an error.
+
+```python Finding the closest measurement
+nearest_measurement = satellite_data.sel(time="2021-01-14T07:21:05", method="nearest")
+assert nearest_measurement.equals(specific_measurement)  # passes
+```
+
+
+  Indexing requires the coordinate values to be unique. If there are duplicated values, Xarray raises an error, because
+  it cannot determine which value to return. An easy way to avoid this is to
+  [drop_duplicates](https://docs.xarray.dev/en/stable/generated/xarray.Dataset.drop_duplicates.html) before indexing.
+  `satellite_data = satellite_data.drop_duplicates("time")`
+
+
+## Statistics
+
+Xarray and NumPy offer a wide range of statistical functions that can be applied to a dataset or a DataArray. Here are
+a few examples:
+
+```python Computing dataset statistics
+measurements = satellite_data.measurement
+min_meas = measurements.min().item()
+max_meas = measurements.max().item()
+mean_meas = measurements.mean().item()
+std_meas = measurements.std().item()
+print(f"Measurements from {min_meas:.2f} to {max_meas:.2f} with mean {mean_meas:.2f} and a std of {std_meas:.2f}")
+```
+
+```txt Output
+Measurements from 0.00 to 4.00 with mean 1.91 and a std of 1.44
+```
+
+You can also use many NumPy functions directly on a dataset or DataArray. For example, to find out which sensors
+you are dealing with, you can use [np.unique](https://numpy.org/doc/stable/reference/generated/numpy.unique.html) to
+get all the unique values in the `sensor` data array.
+
+```python Finding unique values
+import numpy as np
+print("Sensors:", np.unique(satellite_data.sensor))
+```
+
+```txt Output
+Sensors: [1 2]
+```
+
+## Reading and writing files
+
+Xarray also offers a convenient way to save and load datasets to and from files. This is especially useful if you want
+to share your data with others or if you want to persist your data for later use. Xarray supports a wide range of file
+formats, including NetCDF, Zarr, GRIB, and many more. For a full list of supported formats, please refer to the
+[official documentation page](https://docs.xarray.dev/en/stable/user-guide/io.html).
+
+Here is how you can save the example dataset to a NetCDF file:
+
+```python Saving a dataset to a file
+satellite_data.to_netcdf("example_satellite_data.nc")
+```
+
+This creates a file called `example_satellite_data.nc` in the current directory. You can now load this file back
+into memory:
+
+```python Loading a dataset from a file
+import xarray as xr
+
+satellite_data = xr.open_dataset("example_satellite_data.nc")
+```
+
+In case you want to follow along with the examples in this section, you can download the example dataset as a NetCDF
+file here.
+
+## Further reading
+
+This section only covered a few of the most common use-cases for Xarray. Xarray offers many more functions and features.
+For more information, please refer to the [Xarray documentation](https://xarray.pydata.org/en/stable/)
+or check out the [Xarray Tutorials](https://tutorial.xarray.dev/intro.html).
+
+Some useful capabilities that this section did not cover include:
+
+- [Grouping data](https://docs.xarray.dev/en/stable/user-guide/groupby.html)
+- [Computation](https://docs.xarray.dev/en/stable/user-guide/computation.html)
+- [Time series specific functionality](https://docs.xarray.dev/en/stable/user-guide/time-series.html)
+- [Interpolation](https://docs.xarray.dev/en/latest/user-guide/interpolation.html)
+- [Plotting](https://docs.xarray.dev/en/latest/user-guide/plotting.html) diff --git a/snippets/components.mdx b/snippets/components.mdx index 186bf07..3c5fe3a 100644 --- a/snippets/components.mdx +++ b/snippets/components.mdx @@ -1,9 +1,12 @@ export const HeroCard = ({ children, title, description, href }) => { return ( - + {children}

{title}

{description}

- ) -} \ No newline at end of file + ); +}; diff --git a/vale/styles/config/vocabularies/docs/accept.txt b/vale/styles/config/vocabularies/docs/accept.txt index 4855e82..4cb4b19 100644 --- a/vale/styles/config/vocabularies/docs/accept.txt +++ b/vale/styles/config/vocabularies/docs/accept.txt @@ -1 +1,18 @@ +# (?i) makes the regex case-insensitive, see https://vale.sh/docs/topics/vocab/#case-sensitivity + Tilebox +(?i)Xarray +NumPy +(?i)async +(?i)antimeridian +Datalore +Colab +Tropomi +georeferenced +Fortran +Zarr +datetime +accessor +Pipenv +Opendata +APIs