-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdvc.lock
182 lines (182 loc) · 5.33 KB
/
dvc.lock
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
schema: '2.0'
stages:
fetch-metadata:
cmd: python scripts/fetch_eidc_metadata.py data/eidc_metadata.json -s 0
deps:
- path: scripts/fetch_eidc_metadata.py
hash: md5
md5: 82907434d9521996e30014df01bbba8e
size: 964
outs:
- path: data/eidc_metadata.json
hash: md5
md5: 413f59888bf033c30cc27b84b1a3f40b
size: 12313041
prepare:
cmd: python scripts/extract_metadata.py data/eidc_metadata.json data/extracted_metadata.json
deps:
- path: data/eidc_metadata.json
hash: md5
md5: 423dc3a61ede72e1d5c818d74277c0b4
size: 12140491
- path: scripts/extract_metadata.py
hash: md5
md5: c2fa7d2c4b8f28a6e24536ce0df244fd
size: 1296
outs:
- path: data/extracted_metadata.json
hash: md5
md5: 7d2ae8d6a41a960592f30496eb498af7
size: 4578493
extract-metadata:
cmd: python scripts/extract_metadata.py data/eidc_metadata.json data/extracted_metadata.json
deps:
- path: data/eidc_metadata.json
hash: md5
md5: 413f59888bf033c30cc27b84b1a3f40b
size: 12313041
- path: scripts/extract_metadata.py
hash: md5
md5: e66f21369c5106eaaad4476612c6fb5e
size: 1313
outs:
- path: data/extracted_metadata.json
hash: md5
md5: 226225c5bd64e15d803ba88560810c5a
size: 4629991
chunk-data:
cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 250 -ol 75 data/extracted_metadata.json data/supporting-docs.json -m
0
deps:
- path: data/extracted_metadata.json
hash: md5
md5: 226225c5bd64e15d803ba88560810c5a
size: 4629991
- path: data/supporting-docs.json
hash: md5
md5: e2581aff9abe25942e8009214b88d0a5
size: 72680213
- path: scripts/chunk_data.py
hash: md5
md5: 3ad449140b03e1c2904b22a5b401a12e
size: 2705
outs:
- path: data/chunked_data.json
hash: md5
md5: 9893d839409c8cf4561e99ab5f747f20
size: 177068127
create-embeddings:
cmd: python scripts/create_embeddings.py data/chunked_data.json data/embeddings.json
-m all-MiniLM-L6-v2
deps:
- path: data/chunked_data.json
hash: md5
md5: 9893d839409c8cf4561e99ab5f747f20
size: 177068127
- path: scripts/create_embeddings.py
hash: md5
md5: b0d8f7cb90f244e709656b1f38723e2d
size: 1552
outs:
- path: data/embeddings.json
hash: md5
md5: 5c8ca3cdde4d5bc559fa2e701ff090a8
size: 3754990368
upload-to-docstore:
cmd: python scripts/upload_to_docstore.py data/embeddings.json -o data/chroma-data -em
all-MiniLM-L6-v2 -c eidc-data
deps:
- path: data/embeddings.json
hash: md5
md5: 5c8ca3cdde4d5bc559fa2e701ff090a8
size: 3754990368
- path: scripts/upload_to_docstore.py
hash: md5
md5: 930456cedd43723c1d643ad90c146952
size: 2793
outs:
- path: data/chroma-data
hash: md5
md5: c06796220fbfe9db3b08b8439edf87b4.dir
size: 3081399131
nfiles: 6
run-rag-pipeline:
cmd: python scripts/run_rag_pipeline.py -i data/eidc_rag_testset.csv -o data/evaluation_data.csv -ds
data/chroma-data -c eidc-data -m llama3.1 -p data/pipeline.yml
deps:
- path: data/chroma-data
hash: md5
md5: c06796220fbfe9db3b08b8439edf87b4.dir
size: 3081399131
nfiles: 6
- path: data/eidc_rag_testset.csv
hash: md5
md5: 90d23c9bfcaddf9f152109a7b51e3151
size: 149155
- path: scripts/run_rag_pipeline.py
hash: md5
md5: 35eb5f65605242094a1581b92e9b2ef4
size: 5862
outs:
- path: data/evaluation_data.csv
hash: md5
md5: 1b5f226c52d70bda7e2551d7778c1e89
size: 385945
- path: data/pipeline.yml
hash: md5
md5: 70385a724cdf687c287596b8360e1448
size: 3334
generate-testset:
cmd: python scripts/generate_synthetic_testset.py data/extracted_metadata.json
data/eidc_rag_testset.csv 200
deps:
- path: data/extracted_metadata.json
hash: md5
md5: 226225c5bd64e15d803ba88560810c5a
size: 4629991
- path: scripts/generate_synthetic_testset.py
hash: md5
md5: fdac8b2f28de8f3b4e5025ca47bb94ca
size: 2175
outs:
- path: data/eidc_rag_testset.csv
hash: md5
md5: 90d23c9bfcaddf9f152109a7b51e3151
size: 149155
fetch-supporting-docs:
cmd: python scripts/fetch_supporting_docs.py data/eidc_metadata.json data/supporting-docs.json
deps:
- path: data/eidc_metadata.json
hash: md5
md5: 413f59888bf033c30cc27b84b1a3f40b
size: 12313041
- path: scripts/fetch_supporting_docs.py
hash: md5
md5: 02b94a2cc7bff711784cbdec3650b618
size: 1718
outs:
- path: data/supporting-docs.json
hash: md5
md5: e2581aff9abe25942e8009214b88d0a5
size: 72680213
evaluate:
cmd: python scripts/evaluate.py data/evaluation_data.csv -m data/metrics.json
-img data/eval.png
deps:
- path: data/evaluation_data.csv
hash: md5
md5: 1b5f226c52d70bda7e2551d7778c1e89
size: 385945
- path: scripts/evaluate.py
hash: md5
md5: 4154acf8e74c1d8bcd0b0da72af038e0
size: 2728
outs:
- path: data/eval.png
hash: md5
md5: 3308b984c5168a996805443d25697026
size: 83001
- path: data/metrics.json
hash: md5
md5: 709909482614d6cb47c160506088f53e
size: 287