diff --git a/.dockerignore b/.dockerignore index 0616994..db470a6 100644 --- a/.dockerignore +++ b/.dockerignore @@ -22,3 +22,5 @@ LICENSE README.md tox.ini *.secret.env +./data/ +node_modules/ \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 6271c1e..c388a5a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,8 @@ FROM frictionlessdata/datapackage-pipelines:2.1.8 -RUN apk --update --no-cache add bash wget +RUN apk --update --no-cache add bash wget nodejs npm nss chromium +RUN npm install -g npm@latest +RUN cd /pipelines/ && PUPPETEER_SKIP_CHROMIUM_DOWNLOAD="true" npm install puppeteer COPY docker-dpp-run.sh /dpp/docker/run.sh @@ -12,6 +14,7 @@ COPY setup.py /pipelines/ RUN python3 -m pip install -e . ENV DPP_ELASTICSEARCH=localhost:19200 +ENV CHROME_BIN="/usr/bin/chromium-browser" COPY datapackage_pipelines_migdar /pipelines/datapackage_pipelines_migdar COPY download_search_results_unique_records.sh /pipelines/ diff --git a/datapackage_pipelines_migdar/flows/dataset_assets.py b/datapackage_pipelines_migdar/flows/dataset_assets.py new file mode 100644 index 0000000..108ce60 --- /dev/null +++ b/datapackage_pipelines_migdar/flows/dataset_assets.py @@ -0,0 +1,31 @@ +import os +import subprocess + +import dataflows as DF + +SCREENSHOT = os.path.join(os.path.dirname(__file__), 'node', 'screenshot.js') + + +def do_screenshot(): + def func(rows): + for row in rows: + doc_id = row['doc_id'] + url = f'https://yodaat.org/card/{doc_id}' + outpath = os.path.join('data', os.path.dirname(doc_id)) + os.makedirs(outpath, exist_ok=True) + outpath = os.path.join('data', doc_id + '.png') + subprocess.call(['node', SCREENSHOT, url, outpath, '.card']) + return [] + return func + + +def flow(*_, path='data/datasets_in_es'): + return DF.Flow( + DF.load('{}/datapackage.json'.format(path)), + do_screenshot(), + DF.update_resource(-1, **{'dpp:streaming': True}) + ) + + +if __name__ == '__main__': + flow(path='https://api.yodaat.org/data/datasets_in_es').process() \ No newline at end of file diff --git a/datapackage_pipelines_migdar/flows/node/.gitignore b/datapackage_pipelines_migdar/flows/node/.gitignore new file mode 100644 index 0000000..40b878d --- /dev/null +++ b/datapackage_pipelines_migdar/flows/node/.gitignore @@ -0,0 +1 @@ +node_modules/ \ No newline at end of file diff --git a/datapackage_pipelines_migdar/flows/node/dataset.png b/datapackage_pipelines_migdar/flows/node/dataset.png new file mode 100644 index 0000000..f73f17e Binary files /dev/null and b/datapackage_pipelines_migdar/flows/node/dataset.png differ diff --git a/datapackage_pipelines_migdar/flows/node/package-lock.json b/datapackage_pipelines_migdar/flows/node/package-lock.json new file mode 100644 index 0000000..0ac3b16 --- /dev/null +++ b/datapackage_pipelines_migdar/flows/node/package-lock.json @@ -0,0 +1,315 @@ +{ + "requires": true, + "lockfileVersion": 1, + "dependencies": { + "agent-base": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-4.3.0.tgz", + "integrity": "sha512-salcGninV0nPrwpGNn4VTXBb1SOuXQBiqbrNXoeizJsHrsL6ERFM2Ne3JUSBWRE6aeNJI2ROP/WEEIDUiDe3cg==", + "requires": { + "es6-promisify": "^5.0.0" + } + }, + "async-limiter": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/async-limiter/-/async-limiter-1.0.1.tgz", + "integrity": "sha512-csOlWGAcRFJaI6m+F2WKdnMKr4HhdhFVBk0H/QbJFMCr+uO2kwohwXQPxw/9OCxp05r5ghVBFSyioixx3gfkNQ==" + }, + "balanced-match": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.0.tgz", + "integrity": "sha1-ibTRmasr7kneFk6gK4nORi1xt2c=" + }, + "brace-expansion": { + "version": "1.1.11", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz", + "integrity": "sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==", + "requires": { + "balanced-match": "^1.0.0", + "concat-map": "0.0.1" + } + }, + "buffer-from": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/buffer-from/-/buffer-from-1.1.1.tgz", + "integrity": "sha512-MQcXEUbCKtEo7bhqEs6560Hyd4XaovZlO/k9V3hjVUF/zwW7KBVdSK4gIt/bzwS9MbR5qob+F5jusZsb0YQK2A==" + }, + "concat-map": { + "version": "0.0.1", + "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", + "integrity": "sha1-2Klr13/Wjfd5OnMDajug1UBdR3s=" + }, + "concat-stream": { + "version": "1.6.2", + "resolved": "https://registry.npmjs.org/concat-stream/-/concat-stream-1.6.2.tgz", + "integrity": "sha512-27HBghJxjiZtIk3Ycvn/4kbJk/1uZuJFfuPEns6LaEvpvG1f0hTea8lilrouyo9mVc2GWdcEZ8OLoGmSADlrCw==", + "requires": { + "buffer-from": "^1.0.0", + "inherits": "^2.0.3", + "readable-stream": "^2.2.2", + "typedarray": "^0.0.6" + } + }, + "core-util-is": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.2.tgz", + "integrity": "sha1-tf1UIgqivFq1eqtxQMlAdUUDwac=" + }, + "debug": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.1.1.tgz", + "integrity": "sha512-pYAIzeRo8J6KPEaJ0VWOh5Pzkbw/RetuzehGM7QRRX5he4fPHx2rdKMB256ehJCkX+XRQm16eZLqLNS8RSZXZw==", + "requires": { + "ms": "^2.1.1" + } + }, + "es6-promise": { + "version": "4.2.8", + "resolved": "https://registry.npmjs.org/es6-promise/-/es6-promise-4.2.8.tgz", + "integrity": "sha512-HJDGx5daxeIvxdBxvG2cb9g4tEvwIk3i8+nhX0yGrYmZUzbkdg8QbDevheDB8gd0//uPj4c1EQua8Q+MViT0/w==" + }, + "es6-promisify": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/es6-promisify/-/es6-promisify-5.0.0.tgz", + "integrity": "sha1-UQnWLz5W6pZ8S2NQWu8IKRyKUgM=", + "requires": { + "es6-promise": "^4.0.3" + } + }, + "extract-zip": { + "version": "1.6.7", + "resolved": "https://registry.npmjs.org/extract-zip/-/extract-zip-1.6.7.tgz", + "integrity": "sha1-qEC0uK9kAyZMjbV/Txp0Mz74H+k=", + "requires": { + "concat-stream": "1.6.2", + "debug": "2.6.9", + "mkdirp": "0.5.1", + "yauzl": "2.4.1" + }, + "dependencies": { + "debug": { + "version": "2.6.9", + "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", + "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", + "requires": { + "ms": "2.0.0" + } + }, + "ms": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", + "integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g=" + } + } + }, + "fd-slicer": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/fd-slicer/-/fd-slicer-1.0.1.tgz", + "integrity": "sha1-i1vL2ewyfFBBv5qwI/1nUPEXfmU=", + "requires": { + "pend": "~1.2.0" + } + }, + "fs.realpath": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz", + "integrity": "sha1-FQStJSMVjKpA20onh8sBQRmU6k8=" + }, + "glob": { + "version": "7.1.5", + "resolved": "https://registry.npmjs.org/glob/-/glob-7.1.5.tgz", + "integrity": "sha512-J9dlskqUXK1OeTOYBEn5s8aMukWMwWfs+rPTn/jn50Ux4MNXVhubL1wu/j2t+H4NVI+cXEcCaYellqaPVGXNqQ==", + "requires": { + "fs.realpath": "^1.0.0", + "inflight": "^1.0.4", + "inherits": "2", + "minimatch": "^3.0.4", + "once": "^1.3.0", + "path-is-absolute": "^1.0.0" + } + }, + "https-proxy-agent": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-3.0.1.tgz", + "integrity": "sha512-+ML2Rbh6DAuee7d07tYGEKOEi2voWPUGan+ExdPbPW6Z3svq+JCqr0v8WmKPOkz1vOVykPCBSuobe7G8GJUtVg==", + "requires": { + "agent-base": "^4.3.0", + "debug": "^3.1.0" + }, + "dependencies": { + "debug": { + "version": "3.2.6", + "resolved": "https://registry.npmjs.org/debug/-/debug-3.2.6.tgz", + "integrity": "sha512-mel+jf7nrtEl5Pn1Qx46zARXKDpBbvzezse7p7LqINmdoIk8PYP5SySaxEmYv6TZ0JyEKA1hsCId6DIhgITtWQ==", + "requires": { + "ms": "^2.1.1" + } + } + } + }, + "inflight": { + "version": "1.0.6", + "resolved": "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz", + "integrity": "sha1-Sb1jMdfQLQwJvJEKEHW6gWW1bfk=", + "requires": { + "once": "^1.3.0", + "wrappy": "1" + } + }, + "inherits": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz", + "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==" + }, + "isarray": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", + "integrity": "sha1-u5NdSFgsuhaMBoNJV6VKPgcSTxE=" + }, + "mime": { + "version": "2.4.4", + "resolved": "https://registry.npmjs.org/mime/-/mime-2.4.4.tgz", + "integrity": "sha512-LRxmNwziLPT828z+4YkNzloCFC2YM4wrB99k+AV5ZbEyfGNWfG8SO1FUXLmLDBSo89NrJZ4DIWeLjy1CHGhMGA==" + }, + "minimatch": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.0.4.tgz", + "integrity": "sha512-yJHVQEhyqPLUTgt9B83PXu6W3rx4MvvHvSUvToogpwoGDOUQ+yDrR0HRot+yOCdCO7u4hX3pWft6kWBBcqh0UA==", + "requires": { + "brace-expansion": "^1.1.7" + } + }, + "minimist": { + "version": "0.0.8", + "resolved": "https://registry.npmjs.org/minimist/-/minimist-0.0.8.tgz", + "integrity": "sha1-hX/Kv8M5fSYluCKCYuhqp6ARsF0=" + }, + "mkdirp": { + "version": "0.5.1", + "resolved": "https://registry.npmjs.org/mkdirp/-/mkdirp-0.5.1.tgz", + "integrity": "sha1-MAV0OOrGz3+MR2fzhkjWaX11yQM=", + "requires": { + "minimist": "0.0.8" + } + }, + "ms": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.2.tgz", + "integrity": "sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w==" + }, + "once": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", + "integrity": "sha1-WDsap3WWHUsROsF9nFC6753Xa9E=", + "requires": { + "wrappy": "1" + } + }, + "path-is-absolute": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz", + "integrity": "sha1-F0uSaHNVNP+8es5r9TpanhtcX18=" + }, + "pend": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/pend/-/pend-1.2.0.tgz", + "integrity": "sha1-elfrVQpng/kRUzH89GY9XI4AelA=" + }, + "process-nextick-args": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.1.tgz", + "integrity": "sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag==" + }, + "progress": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/progress/-/progress-2.0.3.tgz", + "integrity": "sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA==" + }, + "proxy-from-env": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.0.0.tgz", + "integrity": "sha1-M8UDmPcOp+uW0h97gXYwpVeRx+4=" + }, + "puppeteer": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/puppeteer/-/puppeteer-2.0.0.tgz", + "integrity": "sha512-t3MmTWzQxPRP71teU6l0jX47PHXlc4Z52sQv4LJQSZLq1ttkKS2yGM3gaI57uQwZkNaoGd0+HPPMELZkcyhlqA==", + "requires": { + "debug": "^4.1.0", + "extract-zip": "^1.6.6", + "https-proxy-agent": "^3.0.0", + "mime": "^2.0.3", + "progress": "^2.0.1", + "proxy-from-env": "^1.0.0", + "rimraf": "^2.6.1", + "ws": "^6.1.0" + } + }, + "readable-stream": { + "version": "2.3.6", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz", + "integrity": "sha512-tQtKA9WIAhBF3+VLAseyMqZeBjW0AHJoxOtYqSUZNJxauErmLbVm2FW1y+J/YA9dUrAC39ITejlZWhVIwawkKw==", + "requires": { + "core-util-is": "~1.0.0", + "inherits": "~2.0.3", + "isarray": "~1.0.0", + "process-nextick-args": "~2.0.0", + "safe-buffer": "~5.1.1", + "string_decoder": "~1.1.1", + "util-deprecate": "~1.0.1" + } + }, + "rimraf": { + "version": "2.7.1", + "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-2.7.1.tgz", + "integrity": "sha512-uWjbaKIK3T1OSVptzX7Nl6PvQ3qAGtKEtVRjRuazjfL3Bx5eI409VZSqgND+4UNnmzLVdPj9FqFJNPqBZFve4w==", + "requires": { + "glob": "^7.1.3" + } + }, + "safe-buffer": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz", + "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==" + }, + "string_decoder": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz", + "integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==", + "requires": { + "safe-buffer": "~5.1.0" + } + }, + "typedarray": { + "version": "0.0.6", + "resolved": "https://registry.npmjs.org/typedarray/-/typedarray-0.0.6.tgz", + "integrity": "sha1-hnrHTjhkGHsdPUfZlqeOxciDB3c=" + }, + "util-deprecate": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", + "integrity": "sha1-RQ1Nyfpw3nMnYvvS1KKJgUGaDM8=" + }, + "wrappy": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", + "integrity": "sha1-tSQ9jz7BqjXxNkYFvA0QNuMKtp8=" + }, + "ws": { + "version": "6.2.1", + "resolved": "https://registry.npmjs.org/ws/-/ws-6.2.1.tgz", + "integrity": "sha512-GIyAXC2cB7LjvpgMt9EKS2ldqr0MTrORaleiOno6TweZ6r3TKtoFQWay/2PceJ3RuBasOHzXNn5Lrw1X0bEjqA==", + "requires": { + "async-limiter": "~1.0.0" + } + }, + "yauzl": { + "version": "2.4.1", + "resolved": "https://registry.npmjs.org/yauzl/-/yauzl-2.4.1.tgz", + "integrity": "sha1-lSj0QtqxsihOWLQ3m7GU4i4MQAU=", + "requires": { + "fd-slicer": "~1.0.1" + } + } + } +} diff --git a/datapackage_pipelines_migdar/flows/node/screenshot.js b/datapackage_pipelines_migdar/flows/node/screenshot.js new file mode 100644 index 0000000..c170658 --- /dev/null +++ b/datapackage_pipelines_migdar/flows/node/screenshot.js @@ -0,0 +1,23 @@ +const puppeteer = require('puppeteer'); + +(async () => { + const [url, filename, selector] = process.argv.slice(2); + + const browser = await puppeteer.launch({ + headless: true, + executablePath: process.env.CHROME_BIN || null, + args: ['--no-sandbox', '--headless', '--disable-gpu', '--disable-dev-shm-usage'] + }); + const page = await browser.newPage(); + page.setViewport({width: 1300, height: 1200}); + await page.goto(url); + await page.waitForSelector(selector + ' svg'); + + const rect = await page.evaluate(selector => { + const element = document.querySelector(selector); + const {x, y, width, height} = element.getBoundingClientRect(); + return {x, y, width, height}; + }, selector); + await page.screenshot({path: filename, clip: rect}); + await browser.close(); +})(); \ No newline at end of file diff --git a/pipeline-spec.yaml b/pipeline-spec.yaml index a23e5ee..7254e82 100644 --- a/pipeline-spec.yaml +++ b/pipeline-spec.yaml @@ -75,6 +75,12 @@ datasets: pipeline: - flow: datapackage_pipelines_migdar.flows.datasets +dataset-assets: + dependencies: + - pipeline: ./datasets + pipeline: + - flow: datapackage_pipelines_migdar.flows.dataset_assets + zotero: schedule: diff --git a/requirements.txt b/requirements.txt index 9297c39..f53558a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,4 @@ datapackage-pipelines-elasticsearch dataflows>=0.0.57 gunicorn fuzzywuzzy[speedup] -google-api-python-client \ No newline at end of file +google-api-python-client