# Produce CSV from JSON order tables #788
# (Viewer notice carried over with this copy:) This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# First, stream the JSON files to mlr, converting them from JSON to CSV; use xargs because the file list exceeds the command-line argument limit (xargs splits it into batches of ~5k files).
# Then, remove the extra header rows caused by xargs calling mlr once per batch.
# Then, remove the interim processing files, and save the output.
#
# NB, differences in format between this and the output of `oic-analysis` (which runs against the HTML):
# - pcNumber and htmlHash column names remain in camelCase
# - attachments is a stringified JSON array
# - year and number are not extracted from pcNumber
# - department is not processed after extraction; department_raw is not created
# - text column (html_text2 of the html column) is not created
# Rebuilds the CSVs under processed-csvs/ from the JSON files in order-tables/
# and attachments/, then commits and pushes them when their content changed.
name: Produce CSV from JSON order tables

on:
  # Manual runs from the Actions tab.
  workflow_dispatch:
  # Also runs whenever the scrape workflow finishes. NOTE: `completed` fires
  # regardless of that run's conclusion (success or failure).
  workflow_run:
    workflows:
      - Scrape latest orders and attachments
    types:
      - completed

# The final step pushes a commit, so the job token needs write access to the
# repository contents.
permissions:
  contents: write

jobs:
  update-csvs-from-json:
    runs-on: ubuntu-latest
    defaults:
      run:
        # Naming bash explicitly makes the runner invoke it with `-eo pipefail`,
        # so a failure on the left-hand side of a pipe fails the step instead
        # of being hidden by the exit status of the last command.
        shell: bash
    steps:
      - name: Check out this repo
        uses: actions/checkout@v4

      - name: Install miller
        run: |-
          sudo apt-get update
          sudo apt-get install -y miller

      - name: Run order table conversions with miller
        run: |-
          # Convert every order JSON file to CSV. xargs batches the file list,
          # so mlr runs once per batch and emits one CSV header per batch.
          # -print0 / -0 keep file names containing spaces or quotes intact.
          find order-tables -iname "*.json" -print0 \
            | xargs -0 mlr --ijson --ocsv --no-auto-flatten regularize \
            > orders-interim1.csv
          # Drop the repeated header rows (rows whose html field is literally "html").
          mlr --csv --from orders-interim1.csv filter '$html != "html"' > orders-interim2.csv
          # orders.csv: every column except html, sorted by pcNumber.
          mlr --csv --from orders-interim2.csv cut -x -f html then sort -f pcNumber > processed-csvs/orders.csv
          # orders-html.csv: only pcNumber and html, sorted by pcNumber.
          mlr --csv --from orders-interim2.csv cut -f pcNumber,html then sort -f pcNumber > processed-csvs/orders-html.csv
          rm orders-interim1.csv orders-interim2.csv

      - name: Run attachment conversions with miller
        run: |-
          # Same pipeline as for orders, applied to the attachment JSON files.
          find attachments -iname "*.json" -print0 \
            | xargs -0 mlr --ijson --ocsv --no-auto-flatten regularize \
            > attachments-interim1.csv
          # Drop the repeated header rows (rows whose attachmentHtml field is
          # literally "attachmentHtml").
          mlr --csv --from attachments-interim1.csv filter '$attachmentHtml != "attachmentHtml"' > attachments-interim2.csv
          # attachments.csv: every column except attachmentHtml, sorted numerically by id.
          mlr --csv --from attachments-interim2.csv cut -x -f attachmentHtml then sort -n id > processed-csvs/attachments.csv
          # attachments-html.csv.gz: only id and attachmentHtml, gzip-compressed.
          # gzip -n leaves the name/timestamp out of the gzip header so that
          # unchanged CSV content yields a byte-identical archive.
          mlr --csv --from attachments-interim2.csv cut -f id,attachmentHtml then sort -n id \
            | gzip -n > processed-csvs/attachments-html.csv.gz
          rm attachments-interim1.csv attachments-interim2.csv

      - name: Commit and push if it changed
        run: |-
          git config user.name "Automated"
          git config user.email "actions@users.noreply.github.com"
          git add processed-csvs/orders.csv processed-csvs/orders-html.csv
          git add processed-csvs/attachments.csv processed-csvs/attachments-html.csv.gz
          # Stop quietly only when nothing is staged; any real commit or push
          # error below still fails the job.
          if git diff --cached --quiet; then
            echo "No CSV changes to commit."
            exit 0
          fi
          timestamp=$(date -u)
          git commit -m "CSV from JSON as of: ${timestamp}"
          git push