From 9705b616b2ddd44f7e632bcd2bec3e606b544ed8 Mon Sep 17 00:00:00 2001
From: mpc
Date: Fri, 18 Oct 2024 12:06:21 +0100
Subject: [PATCH] Added script to download supporting docs

---
 README.md                             | 39 ++++++++++++++---
 dvc.lock                              | 63 +++++++++++++++------------
 dvc.yaml                              |  7 ++-
 params.yaml                           | 44 ++++++++++---------
 scripts/fetch_eidc_supporting_docs.py |  0
 scripts/fetch_supporting_docs.py      | 47 ++++++++++++++++++++
 6 files changed, 144 insertions(+), 56 deletions(-)
 delete mode 100644 scripts/fetch_eidc_supporting_docs.py
 create mode 100644 scripts/fetch_supporting_docs.py

diff --git a/README.md b/README.md
index 588b68f..20d03ac 100644
--- a/README.md
+++ b/README.md
@@ -29,12 +29,12 @@ This pipeline is defined in [`dvc.yaml`](dvc.yaml) and can be viewed with the co
 dvc dag
 ```
 ```
-          +----------------+
-          | fetch-metadata |
-          +----------------+
-                  *
-                  *
-                  *
+               +----------------+
+               | fetch-metadata |
+               +----------------+
+              **               **
+            ***                  ***
+          **                       **
 +------------------+     +-----------------------+
 | extract-metadata |     | fetch-supporting-docs |
 +------------------+     +-----------------------+
@@ -67,9 +67,34 @@ dvc dag
            *
      +----------+
      | evaluate |
-     +----------+
+     +----------+
+```
+
+> Note: To re-run the `fetch-supporting-docs` stage of the pipeline you will need to request access to the [Legilo](https://legilo.eds-infra.ceh.ac.uk/) service from the EDS dev team and provide your `username` and `password` in a `.env` file.
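+
+A minimal `.env` file has the following form (placeholder values shown; use the credentials supplied by the EDS dev team):
+```shell
+username=<your-username>
+password=<your-password>
+```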
+
+## Running Experiments
+By default, the pipeline will run using the parameters defined in [`params.yaml`](params.yaml). To experiment with varying these parameters you can change them directly, or use [DVC experiments](https://dvc.org/doc/user-guide/experiment-management).
+
+To run an experiment varying a particular parameter:
+```shell
+dvc exp run -S hp.chunk-size=1000
+```
+This will re-run the pipeline but override the value of the `hp.chunk-size` parameter in [`params.yaml`](params.yaml), setting it to `1000`. Only the necessary stages of the pipeline should be re-run, and the result should appear in your workspace.
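+
+To try several values in one go, you can queue experiments and run them as a batch (the queue flags shown assume a reasonably recent DVC release):
+```shell
+dvc exp run --queue -S hp.chunk-size=500
+dvc exp run --queue -S hp.chunk-size=1000
+dvc exp run --run-all
+```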
+
+You can compare the results of your experiment to the results of the baseline run of the pipeline using:
+```shell
+dvc exp diff
+```
+```shell
+Path               Metric              HEAD      workspace  Change
+data/metrics.json  answer_correctness  0.049482  0.043685   -0.0057974
+data/metrics.json  answer_similarity   0.19793   0.17474    -0.02319
+data/metrics.json  context_recall      0.125     0          -0.125
+data/metrics.json  faithfulness        0.75      0.69375    -0.05625
+
+Path         Param          HEAD  workspace  Change
+params.yaml  hp.chunk-size  300   1000       700
+```
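+
+If an experiment outperforms the baseline, its results can be promoted to your workspace with `dvc exp apply` (the experiment name here is a placeholder; `dvc exp show` lists the generated names):
+```shell
+dvc exp apply <exp-name>
+```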
 
 ## Notes
 
 ### DVC and CML
diff --git a/dvc.lock b/dvc.lock
index d143f87..dd7f7b1 100644
--- a/dvc.lock
+++ b/dvc.lock
@@ -45,8 +45,8 @@ stages:
       md5: 789fda7a14f9a85c6ee0e10af8170a95
       size: 4584498
   chunk-data:
-    cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 300 -ol 100 -s
-      10 data/extracted_metadata.json
+    cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 500 -ol 100 -s
+      10 data/extracted_metadata.json data/supporting-docs.json
     deps:
     - path: data/extracted_metadata.json
       hash: md5
@@ -54,8 +54,8 @@ stages:
       size: 4584498
     - path: data/supporting-docs.json
       hash: md5
-      md5: 0febface6f1d23fda46c11bef65284f4
-      size: 34
+      md5: b0941cc9a7ca7df456157380bcc28f39
+      size: 75646
     - path: scripts/chunk_data.py
       hash: md5
       md5: 681528e4aa1dc8cfb5fe5e5472e25fdf
@@ -63,15 +63,15 @@ stages:
     outs:
     - path: data/chunked_data.json
       hash: md5
-      md5: e9160d8c6c0fa7f647c5baa03bd1b5dd
-      size: 14947
+      md5: 97f06c3b76ff05d62ccdecd9d5742712
+      size: 137681
   create-embeddings:
     cmd: python scripts/create_embeddings.py data/chunked_data.json data/embeddings.json
     deps:
     - path: data/chunked_data.json
       hash: md5
-      md5: e9160d8c6c0fa7f647c5baa03bd1b5dd
-      size: 14947
+      md5: 97f06c3b76ff05d62ccdecd9d5742712
+      size: 137681
     - path: scripts/create_embeddings.py
       hash: md5
       md5: 4649c700dfae922b43b3608ee4f00c1a
@@ -79,16 +79,16 @@ stages:
     outs:
     - path: data/embeddings.json
       hash: md5
-      md5: b08299369d1f243eb8d8ffa2cdb9a90f
-      size: 351126
+      md5: 8d80ef225c59ede34d026f6f2930bae3
+      size: 1894126
   upload-to-docstore:
     cmd: python scripts/upload_to_docstore.py data/embeddings.json -o data/chroma-data
       -em all-MiniLM-L6-v2 -c eidc-data
     deps:
     - path: data/embeddings.json
       hash: md5
-      md5: b08299369d1f243eb8d8ffa2cdb9a90f
-      size: 351126
+      md5: 8d80ef225c59ede34d026f6f2930bae3
+      size: 1894126
     - path: scripts/upload_to_docstore.py
       hash: md5
       md5: 41da88e3bb6d2592bee938ce347f6983
@@ -96,8 +96,8 @@ stages:
     outs:
     - path: data/chroma-data
       hash: md5
-      md5: 6e5431dd6f6ec33877e8f9d4da166d83.dir
-      size: 2069220
+      md5: cc85398c596d4c5839714e93e33468bb.dir
+      size: 3580644
       nfiles: 5
   run-rag-pipeline:
     cmd: python scripts/run_rag_pipeline.py data/eidc_rag_test_sample.csv data/evaluation_data.csv
@@ -105,8 +105,8 @@ stages:
       data/chroma-data -c eidc-data
     deps:
     - path: data/chroma-data
       hash: md5
-      md5: 6e5431dd6f6ec33877e8f9d4da166d83.dir
-      size: 2069220
+      md5: cc85398c596d4c5839714e93e33468bb.dir
+      size: 3580644
       nfiles: 5
     - path: data/eidc_rag_test_sample.csv
       hash: md5
@@ -119,8 +119,8 @@ stages:
     outs:
    - path: data/evaluation_data.csv
       hash: md5
-      md5: f6bce3f5c551e84da224d36201858839
-      size: 6638
+      md5: 9825cf7e7a89ca17634b44e9256eefc9
+      size: 9695
   generate-testset:
     cmd: cp data/synthetic-datasets/eidc_rag_test_sample.csv data/
     outs:
@@ -129,20 +129,29 @@ stages:
     - path: data/eidc_rag_test_sample.csv
       hash: md5
       md5: a371d83c5822d256286e80d64d58c3fe
       size: 7524
   fetch-supporting-docs:
-    cmd: echo "Fetch supporitng docs from legilo" > data/supporting-docs.json
+    cmd: python scripts/fetch_supporting_docs.py data/eidc_metadata.json data/supporting-docs.json
+    deps:
+    - path: data/eidc_metadata.json
+      hash: md5
+      md5: b4f3774a2921debb4d7740165ac604d4
+      size: 12157676
+    - path: scripts/fetch_supporting_docs.py
+      hash: md5
+      md5: de0c11e81bf10e040bef67e43466b789
+      size: 1472
     outs:
     - path: data/supporting-docs.json
       hash: md5
-      md5: 0febface6f1d23fda46c11bef65284f4
-      size: 34
+      md5: b0941cc9a7ca7df456157380bcc28f39
+      size: 75646
   evaluate:
     cmd: python scripts/evaluate.py data/evaluation_data.csv -m data/metrics.json
      -img data/eval.png
     deps:
     - path: data/evaluation_data.csv
       hash: md5
-      md5: f6bce3f5c551e84da224d36201858839
-      size: 6638
+      md5: 9825cf7e7a89ca17634b44e9256eefc9
+      size: 9695
     - path: scripts/evaluate.py
       hash: md5
       md5: 10f76511eafc8a1a9b90e9ae92a76bc5
@@ -150,9 +159,9 @@ stages:
     outs:
     - path: data/eval.png
       hash: md5
-      md5: fd66aa842f93e8f370399dae5b68e2fe
-      size: 50525
+      md5: 1279778c7e509e972d1f366157d24966
+      size: 58228
     - path: data/metrics.json
       hash: md5
-      md5: 55266ae1bd64a3499508d07651a5aa13
-      size: 214
+      md5: 2b93334ba0e8226c916d0964237cb72c
+      size: 225
diff --git a/dvc.yaml b/dvc.yaml
index fa419ff..0e9f154 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -6,7 +6,10 @@ stages:
     outs:
     - ${files.metadata}
   fetch-supporting-docs:
-    cmd: echo "Fetch supporitng docs from legilo" > ${files.supporting-docs}
+    cmd: python scripts/fetch_supporting_docs.py ${files.metadata} ${files.supporting-docs}
+    deps:
+    - ${files.metadata}
+    - scripts/fetch_supporting_docs.py
     outs:
     - ${files.supporting-docs}
   extract-metadata:
@@ -17,7 +20,7 @@ stages:
     outs:
     - ${files.extracted}
   chunk-data:
-    cmd: python scripts/chunk_data.py -o ${files.chunked} -c ${hp.chunk-size} -ol ${hp.overlap} -s ${sample-size} ${files.extracted}
+    cmd: python scripts/chunk_data.py -o ${files.chunked} -c ${hp.chunk-size} -ol ${hp.overlap} -s ${sample-size} ${files.extracted} ${files.supporting-docs}
     deps:
     - ${files.extracted}
     - ${files.supporting-docs}
diff --git a/params.yaml b/params.yaml
index 988dbdb..85c3119 100644
--- a/params.yaml
+++ b/params.yaml
@@ -1,30 +1,34 @@
 hp:
-  chunk-size: 300
+  chunk-size: 500
   overlap: 100
-  embeddings-model: "all-MiniLM-L6-v2"
+  embeddings-model: all-MiniLM-L6-v2
 doc-store:
-  collection: "eidc-data"
-  files: "data/chroma-data"
+  collection: eidc-data
+  files: data/chroma-data
 files:
-  metadata: "data/eidc_metadata.json"
-  extracted: "data/extracted_metadata.json"
-  supporting-docs: "data/supporting-docs.json"
-  chunked: "data/chunked_data.json"
-  embeddings: "data/embeddings.json"
-  doc-store: "data/chroma-data"
-  test-set: "data/eidc_rag_test_sample.csv"
-  eval-set: "data/evaluation_data.csv"
-  metrics: "data/metrics.json"
-  eval-plot: "data/eval.png"
+  metadata: data/eidc_metadata.json
+  extracted: data/extracted_metadata.json
+  supporting-docs: data/supporting-docs.json
+  chunked: data/chunked_data.json
+  embeddings: data/embeddings.json
+  doc-store: data/chroma-data
+  test-set: data/eidc_rag_test_sample.csv
+  eval-set: data/evaluation_data.csv
+  metrics: data/metrics.json
+  eval-plot: data/eval.png
 sample-size: 10 # sample size of 0 will process all data
 rag:
   model: llama3.1
-  prompt: >
-    You are part of a retrieval augmented pipeline. You will be given a question and a context on which to base your answer.\n
+  prompt: >-
+    You are part of a retrieval augmented pipeline. You will be given a question and
+    a context on which to base your answer.\n
     Do not use your own knowledge to answer the question.\n
-    The context provided will be metadata from datasets contained in the Environmental Information Data Centre (EIDC).\n
-    Do not refer to "context" in your answer, instead refer to the context as available information.
-    If the answer to the question is not clear from the context, suggest which dataset or datasets might be helpful in answering the question.\n
+    The context provided will be metadata from datasets contained in the Environmental
+    Information Data Centre (EIDC).\n
+    Do not refer to "context" in your answer, instead refer to the context as available
+    information.
+    If the answer to the question is not clear from the context, suggest which dataset
+    or datasets might be helpful in answering the question.\n
     Question: {{query}}\n
     Context: {% for document in documents%}\n{{ document.content }}\n{% endfor %}
-    Answer:
\ No newline at end of file
+    Answer:
diff --git a/scripts/fetch_eidc_supporting_docs.py b/scripts/fetch_eidc_supporting_docs.py
deleted file mode 100644
index e69de29..0000000
diff --git a/scripts/fetch_supporting_docs.py b/scripts/fetch_supporting_docs.py
new file mode 100644
index 0000000..36354e7
--- /dev/null
+++ b/scripts/fetch_supporting_docs.py
@@ -0,0 +1,47 @@
+from argparse import ArgumentParser
+import json
+import os
+from typing import Dict, List
+
+import requests
+from dotenv import load_dotenv
+from tqdm import tqdm
+
+
+def extract_ids(metadata_file: str) -> List[str]:
+    """Read the EIDC metadata file and return the identifier of each dataset."""
+    with open(metadata_file) as f:
+        json_data = json.load(f)
+    ids = [dataset["identifier"] for dataset in json_data["results"]]
+    return ids
+
+
+def get_supporting_docs(eidc_id: str, user: str, password: str) -> List[Dict[str, str]]:
+    """Fetch the supporting documents for a single dataset from the Legilo service."""
+    res = requests.get(
+        f"https://legilo.eds-infra.ceh.ac.uk/{eidc_id}/documents",
+        auth=(user, password),
+        timeout=30,
+    )
+    res.raise_for_status()
+    json_data = res.json()
+    docs = []
+    # Legilo returns the documents keyed by field name under "success".
+    for key, val in json_data["success"].items():
+        docs.append({"id": eidc_id, "field": key, "value": val})
+    return docs
+
+
+def main(metadata_file: str, supporting_docs_file: str):
+    load_dotenv()
+    user = os.getenv("username")
+    password = os.getenv("password")
+    ids = extract_ids(metadata_file)
+    docs = []
+    for dataset_id in tqdm(ids):
+        docs.extend(get_supporting_docs(dataset_id, user, password))
+        # Stop after the first dataset that returns documents; this keeps test
+        # runs small, but it means only one dataset's docs are fetched for now.
+        if len(docs) > 0:
+            break
+    with open(supporting_docs_file, "w") as f:
+        json.dump(docs, f, indent=4)
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser("fetch_supporting_docs.py")
+    parser.add_argument("metadata", help="File containing EIDC metadata.")
+    parser.add_argument("supporting_docs", help="File to save supporting docs to.")
+    args = parser.parse_args()
+    main(args.metadata, args.supporting_docs)
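+
+# Example usage (requires a .env file, or environment variables, supplying the
+# Legilo `username` and `password` referenced above):
+#   python scripts/fetch_supporting_docs.py data/eidc_metadata.json data/supporting-docs.json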