diff --git a/README.md b/README.md index 7d7741b..6e1e391 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,38 @@ # stac-rucio -A python package for managing STAC asset data locality when those assets are managed by a RUCIO server. +A python package for managing STAC asset locality amongst a list of Rucio Storage Elements. + +The package is intended to work by consuming the results of queries made against a stac catalogue. For a given stac search, +a list of items is returned; for this list of items, the stac_rucio package supports creating replicas, +and creating replication rules. + +The stac_rucio package additionally comes with a modifier function to be used with the pystac_client; +the modifier will check rucio for any existing replicas across the available RSEs, and available replicas will +be listed within an asset using the alternate assets stac extension. ## Installation PyPI support coming. -## Usage -## Development \ No newline at end of file +## Example + +A notebook example is available [here](./examples/demo_example.ipynb). + +In the image below, we'll consider a stac item which has no replicas or rules in rucio, but our +user wants to bring the data into RSE #2. For this to work, we require a Non-Deterministic +RSE ( ND-RSE ) to be configured amongst the available RSEs. An ND-RSE is a storage element in Rucio with no +physical storage attached. This storage element is a pragmatic solution for "registering" data in Rucio where +that data remains located at external http endpoints. Data will only be replicated in this example, when +the user creates a replication rule to get data from RSE #1 to RSE #2 ( but this data never actually resides at +RSE #1 ). + +![Stac Rucio](stac_rucio.png "Stac Rucio") + +To register the data for B01, our user needs to create a replica in the ND-RSE for this item if it does not +already exist. 
For STAC collections supported by a Rucio server in this way, it would be best to create these replicas +ahead of time, as they will always be required, and will always be registered at the same ND-RSE. When replicas are created, +the file size needs to be provided in bytes, as well as the adler32 checksum. + +When the replica exists, the user needs to create a replication rule. The replication rule will instruct Rucio to copy +data from RSE #1 to RSE #2. As the data does not actually exist at RSE #1, it will be downloaded from the original http +endpoint exposed by the stac catalogue. diff --git a/examples/demo_example.ipynb b/examples/demo_example.ipynb index fbe650b..22dd5a1 100644 --- a/examples/demo_example.ipynb +++ b/examples/demo_example.ipynb @@ -25,21 +25,9 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'stac_rucio' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[2], line 6\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpystac_client\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Client\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m# STAC client connection, with a modifier for updating items on the fly.\u001b[39;00m\n\u001b[1;32m 4\u001b[0m eodc \u001b[38;5;241m=\u001b[39m Client\u001b[38;5;241m.\u001b[39mopen(\n\u001b[1;32m 5\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhttps://stac.eodc.eu/api/v1\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m----> 6\u001b[0m modifier\u001b[38;5;241m=\u001b[39m\u001b[43mstac_rucio\u001b[49m\u001b[38;5;241m.\u001b[39mrucio_item\n\u001b[1;32m 7\u001b[0m )\n", - "\u001b[0;31mNameError\u001b[0m: name 'stac_rucio' is not defined" - ] - } - ], + "outputs": [], 
"source": [ "from pystac_client import Client\n", "\n", @@ -128,7 +116,7 @@ "outputs": [], "source": [ "# Create replicas carries out the initial registration of the target asset at the specified RSE.\n", - "stac_rucio.create_replicas(items=november_list, rse=\"EODC-DATA\", target=\"mfcover\")" + "stac_rucio.create_replicas(items=november_list, rse=\"EODC-DATA\", targets=[\"mfcover\"])" ] }, { @@ -138,7 +126,7 @@ "outputs": [], "source": [ "# Create replication rules generates the rules that instruct rucio that a request has been made to copy data from one RSE to another.\n", - "stac_rucio.create_replication_rules(items=november_list, dst_rse=\"DESY-DCACHE\")" + "stac_rucio.create_replication_rules(items=november_list, dst_rse=\"DESY-DCACHE\", targets=[\"mfcover\"])" ] }, { diff --git a/stac_rucio.png b/stac_rucio.png new file mode 100644 index 0000000..c95e30b Binary files /dev/null and b/stac_rucio.png differ