From 9705b616b2ddd44f7e632bcd2bec3e606b544ed8 Mon Sep 17 00:00:00 2001
From: mpc
Date: Fri, 18 Oct 2024 12:06:21 +0100
Subject: [PATCH] Added script to download supporting docs

---
 README.md                             | 39 ++++++++++++++---
 dvc.lock                              | 63 +++++++++++++++------------
 dvc.yaml                              |  7 ++-
 params.yaml                           | 44 ++++++++++---------
 scripts/fetch_eidc_supporting_docs.py |  0
 scripts/fetch_supporting_docs.py      | 47 ++++++++++++++++++++
 6 files changed, 144 insertions(+), 56 deletions(-)
 delete mode 100644 scripts/fetch_eidc_supporting_docs.py
 create mode 100644 scripts/fetch_supporting_docs.py

diff --git a/README.md b/README.md
index 588b68f..20d03ac 100644
--- a/README.md
+++ b/README.md
@@ -29,12 +29,12 @@ This pipeline is defined in [`dvc.yaml`](dvc.yaml) and can be viewed with the co
 dvc dag
 ```
 ```
-          +----------------+
-          | fetch-metadata |
-          +----------------+
-                  *
-                  *
-                  *
+               +----------------+
+               | fetch-metadata |
+               +----------------+
+              **               **
+            ***                  ***
+          **                       **
 +------------------+     +-----------------------+
 | extract-metadata |     | fetch-supporting-docs |
 +------------------+     +-----------------------+
@@ -67,9 +67,34 @@ dvc dag
            *
      +----------+
      | evaluate |
-     +----------+
+     +----------+
+```
+
+> Note: To re-run the `fetch-supporting-docs` stage of the pipeline you will need to request access to the [Legilo](https://legilo.eds-infra.ceh.ac.uk/) service from the EDS dev team and provide your `username` and `password` in a `.env` file.
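+
+A minimal `.env` file has the following form (placeholder values shown; use the credentials supplied by the EDS dev team):
+```shell
+username=<your-username>
+password=<your-password>
+```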
+
+## Running Experiments
+By default, the pipeline will run using the parameters defined in [`params.yaml`](params.yaml). To experiment with varying these parameters you can change them directly, or use [DVC experiments](https://dvc.org/doc/user-guide/experiment-management).
+
+To run an experiment varying a particular parameter:
+```shell
+dvc exp run -S hp.chunk-size=1000
+```
+This will re-run the pipeline but override the value of the `hp.chunk-size` parameter in [`params.yaml`](params.yaml), setting it to `1000`. Only the necessary stages of the pipeline should be re-run, and the result should appear in your workspace.
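+
+To try several values in one go, you can queue experiments and run them as a batch (the queue flags shown assume a reasonably recent DVC release):
+```shell
+dvc exp run --queue -S hp.chunk-size=500
+dvc exp run --queue -S hp.chunk-size=1000
+dvc exp run --run-all
+```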
+
+You can compare the results of your experiment to the results of the baseline run of the pipeline using:
+```shell
+dvc exp diff
+```
+```shell
+Path               Metric              HEAD      workspace  Change
+data/metrics.json  answer_correctness  0.049482  0.043685   -0.0057974
+data/metrics.json  answer_similarity   0.19793   0.17474    -0.02319
+data/metrics.json  context_recall      0.125     0          -0.125
+data/metrics.json  faithfulness        0.75      0.69375    -0.05625
+
+Path         Param          HEAD  workspace  Change
+params.yaml  hp.chunk-size  300   1000       700
+```
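+
+If an experiment outperforms the baseline, its results can be promoted to your workspace with `dvc exp apply` (the experiment name here is a placeholder; `dvc exp show` lists the generated names):
+```shell
+dvc exp apply <exp-name>
+```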
 
 ## Notes
 
 ### DVC and CML
diff --git a/dvc.lock b/dvc.lock
index d143f87..dd7f7b1 100644
--- a/dvc.lock
+++ b/dvc.lock
@@ -45,8 +45,8 @@ stages:
       md5: 789fda7a14f9a85c6ee0e10af8170a95
       size: 4584498
   chunk-data:
-    cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 300 -ol 100 -s
-      10 data/extracted_metadata.json
+    cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 500 -ol 100 -s
+      10 data/extracted_metadata.json data/supporting-docs.json
     deps:
     - path: data/extracted_metadata.json
       hash: md5
@@ -54,8 +54,8 @@ stages:
       size: 4584498
     - path: data/supporting-docs.json
       hash: md5
-      md5: 0febface6f1d23fda46c11bef65284f4
-      size: 34
+      md5: b0941cc9a7ca7df456157380bcc28f39
+      size: 75646
     - path: scripts/chunk_data.py
       hash: md5
       md5: 681528e4aa1dc8cfb5fe5e5472e25fdf
@@ -63,15 +63,15 @@ stages:
     outs:
     - path: data/chunked_data.json
       hash: md5
-      md5: e9160d8c6c0fa7f647c5baa03bd1b5dd
-      size: 14947
+      md5: 97f06c3b76ff05d62ccdecd9d5742712
+      size: 137681
   create-embeddings:
     cmd: python scripts/create_embeddings.py data/chunked_data.json data/embeddings.json
     deps:
     - path: data/chunked_data.json
       hash: md5
-      md5: e9160d8c6c0fa7f647c5baa03bd1b5dd
-      size: 14947
+      md5: 97f06c3b76ff05d62ccdecd9d5742712
+      size: 137681
     - path: scripts/create_embeddings.py
       hash: md5
       md5: 4649c700dfae922b43b3608ee4f00c1a
@@ -79,16 +79,16 @@ stages:
     outs:
     - path: data/embeddings.json
       hash: md5
-      md5: b08299369d1f243eb8d8ffa2cdb9a90f
-      size: 351126
+      md5: 8d80ef225c59ede34d026f6f2930bae3
+      size: 1894126
   upload-to-docstore:
     cmd: python scripts/upload_to_docstore.py data/embeddings.json -o data/chroma-data
       -em all-MiniLM-L6-v2 -c eidc-data
     deps:
     - path: data/embeddings.json
       hash: md5
-      md5: b08299369d1f243eb8d8ffa2cdb9a90f
-      size: 351126
+      md5: 8d80ef225c59ede34d026f6f2930bae3
+      size: 1894126
     - path: scripts/upload_to_docstore.py
       hash: md5
       md5: 41da88e3bb6d2592bee938ce347f6983
@@ -96,8 +96,8 @@ stages:
     outs:
     - path: data/chroma-data
       hash: md5
-      md5: 6e5431dd6f6ec33877e8f9d4da166d83.dir
-      size: 2069220
+      md5: cc85398c596d4c5839714e93e33468bb.dir
+      size: 3580644
       nfiles: 5
   run-rag-pipeline:
     cmd: python scripts/run_rag_pipeline.py data/eidc_rag_test_sample.csv data/evaluation_data.csv
@@ -105,8 +105,8 @@ stages:
       data/chroma-data -c eidc-data
     deps:
     - path: data/chroma-data
       hash: md5
-      md5: 6e5431dd6f6ec33877e8f9d4da166d83.dir
-      size: 2069220
+      md5: cc85398c596d4c5839714e93e33468bb.dir
+      size: 3580644
       nfiles: 5
     - path: data/eidc_rag_test_sample.csv
       hash: md5
@@ -119,8 +119,8 @@ stages:
     outs:
    - path: data/evaluation_data.csv
       hash: md5
-      md5: f6bce3f5c551e84da224d36201858839
-      size: 6638
+      md5: 9825cf7e7a89ca17634b44e9256eefc9
+      size: 9695
   generate-testset:
     cmd: cp data/synthetic-datasets/eidc_rag_test_sample.csv data/
     outs:
@@ -129,20 +129,29 @@ stages:
     - path: data/eidc_rag_test_sample.csv
       hash: md5
       md5: a371d83c5822d256286e80d64d58c3fe
       size: 7524
   fetch-supporting-docs:
-    cmd: echo "Fetch supporitng docs from legilo" > data/supporting-docs.json
+    cmd: python scripts/fetch_supporting_docs.py data/eidc_metadata.json data/supporting-docs.json
+    deps:
+    - path: data/eidc_metadata.json
+      hash: md5
+      md5: b4f3774a2921debb4d7740165ac604d4
+      size: 12157676
+    - path: scripts/fetch_supporting_docs.py
+      hash: md5
+      md5: de0c11e81bf10e040bef67e43466b789
+      size: 1472
     outs:
     - path: data/supporting-docs.json
       hash: md5
-      md5: 0febface6f1d23fda46c11bef65284f4
-      size: 34
+      md5: b0941cc9a7ca7df456157380bcc28f39
+      size: 75646
   evaluate:
     cmd: python scripts/evaluate.py data/evaluation_data.csv -m data/metrics.json
      -img data/eval.png
     deps:
     - path: data/evaluation_data.csv
       hash: md5
-      md5: f6bce3f5c551e84da224d36201858839
-      size: 6638
+      md5: 9825cf7e7a89ca17634b44e9256eefc9
+      size: 9695
     - path: scripts/evaluate.py
       hash: md5
       md5: 10f76511eafc8a1a9b90e9ae92a76bc5
@@ -150,9 +159,9 @@ stages:
     outs:
     - path: data/eval.png
       hash: md5
-      md5: fd66aa842f93e8f370399dae5b68e2fe
-      size: 50525
+      md5: 1279778c7e509e972d1f366157d24966
+      size: 58228
     - path: data/metrics.json
       hash: md5
-      md5: 55266ae1bd64a3499508d07651a5aa13
-      size: 214
+      md5: 2b93334ba0e8226c916d0964237cb72c
+      size: 225
diff --git a/dvc.yaml b/dvc.yaml
index fa419ff..0e9f154 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -6,7 +6,10 @@ stages:
     outs:
     - ${files.metadata}
   fetch-supporting-docs:
-    cmd: echo "Fetch supporitng docs from legilo" > ${files.supporting-docs}
+    cmd: python scripts/fetch_supporting_docs.py ${files.metadata} ${files.supporting-docs}
+    deps:
+    - ${files.metadata}
+    - scripts/fetch_supporting_docs.py
     outs:
     - ${files.supporting-docs}
   extract-metadata:
@@ -17,7 +20,7 @@ stages:
     outs:
     - ${files.extracted}
   chunk-data:
-    cmd: python scripts/chunk_data.py -o ${files.chunked} -c ${hp.chunk-size} -ol ${hp.overlap} -s ${sample-size} ${files.extracted}
+    cmd: python scripts/chunk_data.py -o ${files.chunked} -c ${hp.chunk-size} -ol ${hp.overlap} -s ${sample-size} ${files.extracted} ${files.supporting-docs}
     deps:
     - ${files.extracted}
     - ${files.supporting-docs}
diff --git a/params.yaml b/params.yaml
index 988dbdb..85c3119 100644
--- a/params.yaml
+++ b/params.yaml
@@ -1,30 +1,34 @@
 hp:
-  chunk-size: 300
+  chunk-size: 500
   overlap: 100
-  embeddings-model: "all-MiniLM-L6-v2"
+  embeddings-model: all-MiniLM-L6-v2
 doc-store:
-  collection: "eidc-data"
-  files: "data/chroma-data"
+  collection: eidc-data
+  files: data/chroma-data
 files:
-  metadata: "data/eidc_metadata.json"
-  extracted: "data/extracted_metadata.json"
-  supporting-docs: "data/supporting-docs.json"
-  chunked: "data/chunked_data.json"
-  embeddings: "data/embeddings.json"
-  doc-store: "data/chroma-data"
-  test-set: "data/eidc_rag_test_sample.csv"
-  eval-set: "data/evaluation_data.csv"
-  metrics: "data/metrics.json"
-  eval-plot: "data/eval.png"
+  metadata: data/eidc_metadata.json
+  extracted: data/extracted_metadata.json
+  supporting-docs: data/supporting-docs.json
+  chunked: data/chunked_data.json
+  embeddings: data/embeddings.json
+  doc-store: data/chroma-data
+  test-set: data/eidc_rag_test_sample.csv
+  eval-set: data/evaluation_data.csv
+  metrics: data/metrics.json
+  eval-plot: data/eval.png
 sample-size: 10 # sample size of 0 will process all data
 rag:
   model: llama3.1
-  prompt: >
-    You are part of a retrieval augmented pipeline. You will be given a question and a context on which to base your answer.\n
+  prompt: >-
+    You are part of a retrieval augmented pipeline. You will be given a question and
+    a context on which to base your answer.\n
     Do not use your own knowledge to answer the question.\n
-    The context provided will be metadata from datasets contained in the Environmental Information Data Centre (EIDC).\n
-    Do not refer to "context" in your answer, instead refer to the context as available information.
-    If the answer to the question is not clear from the context, suggest which dataset or datasets might be helpful in answering the question.\n
+    The context provided will be metadata from datasets contained in the Environmental
+    Information Data Centre (EIDC).\n
+    Do not refer to "context" in your answer, instead refer to the context as available
+    information.
+    If the answer to the question is not clear from the context, suggest which dataset
+    or datasets might be helpful in answering the question.\n
     Question: {{query}}\n
     Context: {% for document in documents%}\n{{ document.content }}\n{% endfor %}
-    Answer:
\ No newline at end of file
+    Answer:
diff --git a/scripts/fetch_eidc_supporting_docs.py b/scripts/fetch_eidc_supporting_docs.py
deleted file mode 100644
index e69de29..0000000
diff --git a/scripts/fetch_supporting_docs.py b/scripts/fetch_supporting_docs.py
new file mode 100644
index 0000000..36354e7
--- /dev/null
+++ b/scripts/fetch_supporting_docs.py
@@ -0,0 +1,47 @@
+from argparse import ArgumentParser
+import json
+import os
+from typing import Dict, List
+
+import requests
+from dotenv import load_dotenv
+from tqdm import tqdm
+
+
+def extract_ids(metadata_file: str) -> List[str]:
+    """Read the EIDC metadata file and return the identifier of each dataset."""
+    with open(metadata_file) as f:
+        json_data = json.load(f)
+    ids = [dataset["identifier"] for dataset in json_data["results"]]
+    return ids
+
+
+def get_supporting_docs(eidc_id: str, user: str, password: str) -> List[Dict[str, str]]:
+    """Fetch the supporting documents for a single dataset from the Legilo service."""
+    res = requests.get(
+        f"https://legilo.eds-infra.ceh.ac.uk/{eidc_id}/documents",
+        auth=(user, password),
+        timeout=30,
+    )
+    res.raise_for_status()
+    json_data = res.json()
+    docs = []
+    # Legilo returns the documents keyed by field name under "success".
+    for key, val in json_data["success"].items():
+        docs.append({"id": eidc_id, "field": key, "value": val})
+    return docs
+
+
+def main(metadata_file: str, supporting_docs_file: str):
+    load_dotenv()
+    user = os.getenv("username")
+    password = os.getenv("password")
+    ids = extract_ids(metadata_file)
+    docs = []
+    for dataset_id in tqdm(ids):
+        docs.extend(get_supporting_docs(dataset_id, user, password))
+        # Stop after the first dataset that returns documents; this keeps test
+        # runs small, but it means only one dataset's docs are fetched for now.
+        if len(docs) > 0:
+            break
+    with open(supporting_docs_file, "w") as f:
+        json.dump(docs, f, indent=4)
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser("fetch_supporting_docs.py")
+    parser.add_argument("metadata", help="File containing EIDC metadata.")
+    parser.add_argument("supporting_docs", help="File to save supporting docs to.")
+    args = parser.parse_args()
+    main(args.metadata, args.supporting_docs)
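+
+# Example usage (requires a .env file, or environment variables, supplying the
+# Legilo `username` and `password` referenced above):
+#   python scripts/fetch_supporting_docs.py data/eidc_metadata.json data/supporting-docs.json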