Skip to content

Commit

Permalink
Adding dataset images
Browse files Browse the repository at this point in the history
  • Loading branch information
akariv committed Oct 30, 2019
1 parent 0369446 commit 706dee9
Show file tree
Hide file tree
Showing 9 changed files with 383 additions and 2 deletions.
2 changes: 2 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,5 @@ LICENSE
README.md
tox.ini
*.secret.env
./data/
node_modules/
5 changes: 4 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
FROM frictionlessdata/datapackage-pipelines:2.1.8

RUN apk --update --no-cache add bash wget
RUN apk --update --no-cache add bash wget nodejs npm nss chromium
RUN npm install -g npm@latest
RUN cd /pipelines/ && PUPPETEER_SKIP_CHROMIUM_DOWNLOAD="true" npm install puppeteer

COPY docker-dpp-run.sh /dpp/docker/run.sh

Expand All @@ -12,6 +14,7 @@ COPY setup.py /pipelines/
RUN python3 -m pip install -e .

ENV DPP_ELASTICSEARCH=localhost:19200
ENV CHROME_BIN="/usr/bin/chromium-browser"

COPY datapackage_pipelines_migdar /pipelines/datapackage_pipelines_migdar
COPY download_search_results_unique_records.sh /pipelines/
Expand Down
31 changes: 31 additions & 0 deletions datapackage_pipelines_migdar/flows/dataset_assets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import os
import subprocess

import dataflows as DF

# Path to the screenshot.js script in the `node` directory next to this module.
SCREENSHOT = os.path.join(os.path.dirname(__file__), 'node', 'screenshot.js')


def do_screenshot():
    """Build a row processor that screenshots the card page of every row.

    Returns:
        A function accepting an iterable of rows. For each row it reads
        ``row['doc_id']``, then runs the ``SCREENSHOT`` node script against
        ``https://yodaat.org/card/<doc_id>`` with ``data/<doc_id>.png`` as the
        output path. The function returns an empty list once all rows have
        been consumed, so it emits no rows of its own.
    """
    import logging

    log = logging.getLogger(__name__)

    def func(rows):
        for row in rows:
            doc_id = row['doc_id']
            url = f'https://yodaat.org/card/{doc_id}'
            outpath = os.path.join('data', doc_id + '.png')
            # doc_id may contain path separators, so the PNG's parent
            # directory has to exist before the node script writes to it.
            os.makedirs(os.path.dirname(outpath), exist_ok=True)
            # NOTE(review): '.card' is presumably the CSS selector that
            # screenshot.js captures -- confirm against the script.
            exit_code = subprocess.call(
                ['node', SCREENSHOT, url, outpath, '.card']
            )
            if exit_code != 0:
                # Stay best-effort (one bad card must not abort the run),
                # but do not swallow the failure silently.
                log.warning(
                    'screenshot of %s failed with exit code %s',
                    url, exit_code,
                )
        return []
    return func


def flow(*_, path='data/datasets_in_es'):
    """Assemble the dataflows pipeline that screenshots dataset cards.

    Args:
        *_: Positional arguments are accepted and ignored.
        path: Directory or URL prefix under which ``datapackage.json`` is
            located.

    Returns:
        A ``DF.Flow`` that loads the datapackage, runs the screenshot
        processor and sets ``dpp:streaming`` to ``True`` on resource ``-1``
        (presumably the last resource -- confirm against dataflows docs).
    """
    datapackage_location = '{}/datapackage.json'.format(path)
    steps = (
        DF.load(datapackage_location),
        do_screenshot(),
        DF.update_resource(-1, **{'dpp:streaming': True}),
    )
    return DF.Flow(*steps)


if __name__ == '__main__':
    # When run as a script, process the datapackage published on the API host.
    remote_flow = flow(path='https://api.yodaat.org/data/datasets_in_es')
    remote_flow.process()
1 change: 1 addition & 0 deletions datapackage_pipelines_migdar/flows/node/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
node_modules/
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
315 changes: 315 additions & 0 deletions datapackage_pipelines_migdar/flows/node/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 706dee9

Please sign in to comment.