From 9c874d342f55c67f26e680f5b2592011fa66508e Mon Sep 17 00:00:00 2001 From: cachatj Date: Sun, 1 Sep 2024 08:11:50 -0400 Subject: [PATCH] adobe PDF Parse --- 0-jc_notebooks/PDF_Extract_Get_Text.ipynb | 447 ++++++++++++++++++++++ 0-jc_notebooks/parsePDF_AdobeAPI.ipynb | 287 ++++++++++++++ 2 files changed, 734 insertions(+) create mode 100644 0-jc_notebooks/PDF_Extract_Get_Text.ipynb create mode 100644 0-jc_notebooks/parsePDF_AdobeAPI.ipynb diff --git a/0-jc_notebooks/PDF_Extract_Get_Text.ipynb b/0-jc_notebooks/PDF_Extract_Get_Text.ipynb new file mode 100644 index 00000000000..8cb52d2969d --- /dev/null +++ b/0-jc_notebooks/PDF_Extract_Get_Text.ipynb @@ -0,0 +1,447 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# PDF Extract Demo\n", + "\n", + "To use this notebook, please upload your credentials, both the JSON file and private.key, to the content directory. Click on the folder icon at left, click on the .. to show folders, click on the content folder, and then click the three dots to the right of the content folder and click Upload. You can upload both by selecting both.\n", + "\n", + "Then execute the first cell to install the SDK. You'll see an error about pip's dependency resolver. You can ignore this error, but you do need to restart the runtime by clicking the button at the bottom of the pip install output." 
+ ], + "metadata": { + "id": "sijtZnY7_kUV" + } + }, + { + "cell_type": "code", + "source": [ + "!pip install pdfservices-sdk" + ], + "metadata": { + "id": "NJKQCjL5cVZK", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "outputId": "f26bab8a-aa12-4169-a12e-37b3c9f67a4b" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Collecting pdfservices-sdk\n", + " Downloading pdfservices_sdk-1.0.2-py3-none-any.whl (74 kB)\n", + "\u001b[K |████████████████████████████████| 74 kB 2.0 MB/s \n", + "\u001b[?25hCollecting packaging==20.9\n", + " Downloading packaging-20.9-py2.py3-none-any.whl (40 kB)\n", + "\u001b[K |████████████████████████████████| 40 kB 5.4 MB/s \n", + "\u001b[?25hCollecting certifi==2020.12.5\n", + " Downloading certifi-2020.12.5-py2.py3-none-any.whl (147 kB)\n", + "\u001b[K |████████████████████████████████| 147 kB 13.0 MB/s \n", + "\u001b[?25hCollecting build==0.3.0\n", + " Downloading build-0.3.0-py2.py3-none-any.whl (13 kB)\n", + "Collecting requests-toolbelt==0.9.1\n", + " Downloading requests_toolbelt-0.9.1-py2.py3-none-any.whl (54 kB)\n", + "\u001b[K |████████████████████████████████| 54 kB 2.6 MB/s \n", + "\u001b[?25hCollecting cryptography==3.4.6\n", + " Downloading cryptography-3.4.6-cp36-abi3-manylinux2014_x86_64.whl (3.2 MB)\n", + "\u001b[K |████████████████████████████████| 3.2 MB 44.6 MB/s \n", + "\u001b[?25hCollecting pyparsing==2.4.7\n", + " Downloading pyparsing-2.4.7-py2.py3-none-any.whl (67 kB)\n", + "\u001b[K |████████████████████████████████| 67 kB 5.9 MB/s \n", + "\u001b[?25hRequirement already satisfied: six==1.15.0 in /usr/local/lib/python3.7/dist-packages (from pdfservices-sdk) (1.15.0)\n", + "Collecting polling2==0.4.6\n", + " Downloading polling2-0.4.6-py2.py3-none-any.whl (6.3 kB)\n", + "Collecting PyYAML==5.4.1\n", + " Downloading 
PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)\n", + "\u001b[K |████████████████████████████████| 636 kB 58.2 MB/s \n", + "\u001b[?25hCollecting urllib3==1.26.8\n", + " Downloading urllib3-1.26.8-py2.py3-none-any.whl (138 kB)\n", + "\u001b[K |████████████████████████████████| 138 kB 73.7 MB/s \n", + "\u001b[?25hCollecting PyJWT==2.0.1\n", + " Downloading PyJWT-2.0.1-py3-none-any.whl (15 kB)\n", + "Collecting pycparser==2.20\n", + " Downloading pycparser-2.20-py2.py3-none-any.whl (112 kB)\n", + "\u001b[K |████████████████████████████████| 112 kB 72.3 MB/s \n", + "\u001b[?25hCollecting definitions==0.2.0\n", + " Downloading definitions-0.2.0.tar.gz (6.4 kB)\n", + "Collecting requests==2.25.1\n", + " Downloading requests-2.25.1-py2.py3-none-any.whl (61 kB)\n", + "\u001b[K |████████████████████████████████| 61 kB 7.2 MB/s \n", + "\u001b[?25hRequirement already satisfied: toml==0.10.2 in /usr/local/lib/python3.7/dist-packages (from pdfservices-sdk) (0.10.2)\n", + "Collecting Pygments==2.8.0\n", + " Downloading Pygments-2.8.0-py3-none-any.whl (983 kB)\n", + "\u001b[K |████████████████████████████████| 983 kB 49.6 MB/s \n", + "\u001b[?25hCollecting pep517==0.9.1\n", + " Downloading pep517-0.9.1-py2.py3-none-any.whl (18 kB)\n", + "Collecting multipart==0.2.4\n", + " Downloading multipart-0.2.4-py3-none-any.whl (7.4 kB)\n", + "Collecting polling==0.3.1\n", + " Downloading polling-0.3.1-py3-none-any.whl (4.1 kB)\n", + "Collecting cffi==1.14.5\n", + " Downloading cffi-1.14.5-cp37-cp37m-manylinux1_x86_64.whl (402 kB)\n", + "\u001b[K |████████████████████████████████| 402 kB 59.9 MB/s \n", + "\u001b[?25hRequirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from build==0.3.0->pdfservices-sdk) (4.12.0)\n", + "Requirement already satisfied: zipp in /usr/local/lib/python3.7/dist-packages (from pep517==0.9.1->pdfservices-sdk) (3.8.1)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from 
requests==2.25.1->pdfservices-sdk) (2.10)\n", + "Requirement already satisfied: chardet<5,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests==2.25.1->pdfservices-sdk) (3.0.4)\n", + "Requirement already satisfied: typing-extensions>=3.6.4 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->build==0.3.0->pdfservices-sdk) (4.1.1)\n", + "Building wheels for collected packages: definitions\n", + " Building wheel for definitions (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for definitions: filename=definitions-0.2.0-py3-none-any.whl size=4574 sha256=e8531f7aee0068619b43356b1bb8ffbfb364a5296098c45eb2f9bf3138cfd68e\n", + " Stored in directory: /root/.cache/pip/wheels/a5/52/9b/118a78668501262218f428b9ed49615cf93499038dc239de53\n", + "Successfully built definitions\n", + "Installing collected packages: urllib3, pyparsing, pycparser, certifi, requests, PyYAML, pep517, packaging, cffi, requests-toolbelt, PyJWT, Pygments, polling2, polling, multipart, definitions, cryptography, build, pdfservices-sdk\n", + " Attempting uninstall: urllib3\n", + " Found existing installation: urllib3 1.24.3\n", + " Uninstalling urllib3-1.24.3:\n", + " Successfully uninstalled urllib3-1.24.3\n", + " Attempting uninstall: pyparsing\n", + " Found existing installation: pyparsing 3.0.9\n", + " Uninstalling pyparsing-3.0.9:\n", + " Successfully uninstalled pyparsing-3.0.9\n", + " Attempting uninstall: pycparser\n", + " Found existing installation: pycparser 2.21\n", + " Uninstalling pycparser-2.21:\n", + " Successfully uninstalled pycparser-2.21\n", + " Attempting uninstall: certifi\n", + " Found existing installation: certifi 2022.6.15\n", + " Uninstalling certifi-2022.6.15:\n", + " Successfully uninstalled certifi-2022.6.15\n", + " Attempting uninstall: requests\n", + " Found existing installation: requests 2.23.0\n", + " Uninstalling requests-2.23.0:\n", + " Successfully uninstalled requests-2.23.0\n", + " Attempting uninstall: PyYAML\n", + " Found 
existing installation: PyYAML 6.0\n", + " Uninstalling PyYAML-6.0:\n", + " Successfully uninstalled PyYAML-6.0\n", + " Attempting uninstall: pep517\n", + " Found existing installation: pep517 0.13.0\n", + " Uninstalling pep517-0.13.0:\n", + " Successfully uninstalled pep517-0.13.0\n", + " Attempting uninstall: packaging\n", + " Found existing installation: packaging 21.3\n", + " Uninstalling packaging-21.3:\n", + " Successfully uninstalled packaging-21.3\n", + " Attempting uninstall: cffi\n", + " Found existing installation: cffi 1.15.1\n", + " Uninstalling cffi-1.15.1:\n", + " Successfully uninstalled cffi-1.15.1\n", + " Attempting uninstall: Pygments\n", + " Found existing installation: Pygments 2.6.1\n", + " Uninstalling Pygments-2.6.1:\n", + " Successfully uninstalled Pygments-2.6.1\n", + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "ipython 7.9.0 requires jedi>=0.10, which is not installed.\u001b[0m\n", + "Successfully installed PyJWT-2.0.1 PyYAML-5.4.1 Pygments-2.8.0 build-0.3.0 certifi-2020.12.5 cffi-1.14.5 cryptography-3.4.6 definitions-0.2.0 multipart-0.2.4 packaging-20.9 pdfservices-sdk-1.0.2 pep517-0.9.1 polling-0.3.1 polling2-0.4.6 pycparser-2.20 pyparsing-2.4.7 requests-2.25.1 requests-toolbelt-0.9.1 urllib3-1.26.8\n" + ] + }, + { + "output_type": "display_data", + "data": { + "application/vnd.colab-display-data+json": { + "pip_warning": { + "packages": [ + "cffi", + "pygments", + "pyparsing" + ] + } + } + }, + "metadata": {} + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Run this cell, then choose a file to upload to the Extract service. If you get an error like this:\n", + "```\n", + "Cannot read property '_uploadFiles' of undefined\n", + "```\n", + "then you'll need to enable third-party cookies. On Chrome, go to chrome://settings/content/cookies." 
from google.colab import files

# Open Colab's upload widget. The result is keyed by uploaded file name.
# It is bound to `uploaded` rather than `input` so the built-in input() is not
# shadowed for the rest of the kernel session.
uploaded = files.upload()

# A cancelled or empty upload leaves nothing to index; stop with an explicit
# message instead of an IndexError on an empty list.
if not uploaded:
    raise ValueError("No file was uploaded - rerun this cell and choose a PDF.")

# Only the first uploaded file is passed on to the Extract cell below.
filename = next(iter(uploaded))
leHQgY2FsbAogIC8vIHRvIHVwbG9hZEZpbGVzQ29udGludWUgZnJvbSBQeXRob24uCiAgb3V0cHV0RWxlbWVudC5zdGVwcyA9IHN0ZXBzOwoKICByZXR1cm4gX3VwbG9hZEZpbGVzQ29udGludWUob3V0cHV0SWQpOwp9CgovLyBUaGlzIGlzIHJvdWdobHkgYW4gYXN5bmMgZ2VuZXJhdG9yIChub3Qgc3VwcG9ydGVkIGluIHRoZSBicm93c2VyIHlldCksCi8vIHdoZXJlIHRoZXJlIGFyZSBtdWx0aXBsZSBhc3luY2hyb25vdXMgc3RlcHMgYW5kIHRoZSBQeXRob24gc2lkZSBpcyBnb2luZwovLyB0byBwb2xsIGZvciBjb21wbGV0aW9uIG9mIGVhY2ggc3RlcC4KLy8gVGhpcyB1c2VzIGEgUHJvbWlzZSB0byBibG9jayB0aGUgcHl0aG9uIHNpZGUgb24gY29tcGxldGlvbiBvZiBlYWNoIHN0ZXAsCi8vIHRoZW4gcGFzc2VzIHRoZSByZXN1bHQgb2YgdGhlIHByZXZpb3VzIHN0ZXAgYXMgdGhlIGlucHV0IHRvIHRoZSBuZXh0IHN0ZXAuCmZ1bmN0aW9uIF91cGxvYWRGaWxlc0NvbnRpbnVlKG91dHB1dElkKSB7CiAgY29uc3Qgb3V0cHV0RWxlbWVudCA9IGRvY3VtZW50LmdldEVsZW1lbnRCeUlkKG91dHB1dElkKTsKICBjb25zdCBzdGVwcyA9IG91dHB1dEVsZW1lbnQuc3RlcHM7CgogIGNvbnN0IG5leHQgPSBzdGVwcy5uZXh0KG91dHB1dEVsZW1lbnQubGFzdFByb21pc2VWYWx1ZSk7CiAgcmV0dXJuIFByb21pc2UucmVzb2x2ZShuZXh0LnZhbHVlLnByb21pc2UpLnRoZW4oKHZhbHVlKSA9PiB7CiAgICAvLyBDYWNoZSB0aGUgbGFzdCBwcm9taXNlIHZhbHVlIHRvIG1ha2UgaXQgYXZhaWxhYmxlIHRvIHRoZSBuZXh0CiAgICAvLyBzdGVwIG9mIHRoZSBnZW5lcmF0b3IuCiAgICBvdXRwdXRFbGVtZW50Lmxhc3RQcm9taXNlVmFsdWUgPSB2YWx1ZTsKICAgIHJldHVybiBuZXh0LnZhbHVlLnJlc3BvbnNlOwogIH0pOwp9CgovKioKICogR2VuZXJhdG9yIGZ1bmN0aW9uIHdoaWNoIGlzIGNhbGxlZCBiZXR3ZWVuIGVhY2ggYXN5bmMgc3RlcCBvZiB0aGUgdXBsb2FkCiAqIHByb2Nlc3MuCiAqIEBwYXJhbSB7c3RyaW5nfSBpbnB1dElkIEVsZW1lbnQgSUQgb2YgdGhlIGlucHV0IGZpbGUgcGlja2VyIGVsZW1lbnQuCiAqIEBwYXJhbSB7c3RyaW5nfSBvdXRwdXRJZCBFbGVtZW50IElEIG9mIHRoZSBvdXRwdXQgZGlzcGxheS4KICogQHJldHVybiB7IUl0ZXJhYmxlPCFPYmplY3Q+fSBJdGVyYWJsZSBvZiBuZXh0IHN0ZXBzLgogKi8KZnVuY3Rpb24qIHVwbG9hZEZpbGVzU3RlcChpbnB1dElkLCBvdXRwdXRJZCkgewogIGNvbnN0IGlucHV0RWxlbWVudCA9IGRvY3VtZW50LmdldEVsZW1lbnRCeUlkKGlucHV0SWQpOwogIGlucHV0RWxlbWVudC5kaXNhYmxlZCA9IGZhbHNlOwoKICBjb25zdCBvdXRwdXRFbGVtZW50ID0gZG9jdW1lbnQuZ2V0RWxlbWVudEJ5SWQob3V0cHV0SWQpOwogIG91dHB1dEVsZW1lbnQuaW5uZXJIVE1MID0gJyc7CgogIGNvbnN0IHBpY2tlZFByb21pc2UgPSBuZXcgUHJvbWlzZSgocmVzb2x2ZSkgPT4gewogICAgaW5wdXRFbGVtZW50LmF
kZEV2ZW50TGlzdGVuZXIoJ2NoYW5nZScsIChlKSA9PiB7CiAgICAgIHJlc29sdmUoZS50YXJnZXQuZmlsZXMpOwogICAgfSk7CiAgfSk7CgogIGNvbnN0IGNhbmNlbCA9IGRvY3VtZW50LmNyZWF0ZUVsZW1lbnQoJ2J1dHRvbicpOwogIGlucHV0RWxlbWVudC5wYXJlbnRFbGVtZW50LmFwcGVuZENoaWxkKGNhbmNlbCk7CiAgY2FuY2VsLnRleHRDb250ZW50ID0gJ0NhbmNlbCB1cGxvYWQnOwogIGNvbnN0IGNhbmNlbFByb21pc2UgPSBuZXcgUHJvbWlzZSgocmVzb2x2ZSkgPT4gewogICAgY2FuY2VsLm9uY2xpY2sgPSAoKSA9PiB7CiAgICAgIHJlc29sdmUobnVsbCk7CiAgICB9OwogIH0pOwoKICAvLyBXYWl0IGZvciB0aGUgdXNlciB0byBwaWNrIHRoZSBmaWxlcy4KICBjb25zdCBmaWxlcyA9IHlpZWxkIHsKICAgIHByb21pc2U6IFByb21pc2UucmFjZShbcGlja2VkUHJvbWlzZSwgY2FuY2VsUHJvbWlzZV0pLAogICAgcmVzcG9uc2U6IHsKICAgICAgYWN0aW9uOiAnc3RhcnRpbmcnLAogICAgfQogIH07CgogIGNhbmNlbC5yZW1vdmUoKTsKCiAgLy8gRGlzYWJsZSB0aGUgaW5wdXQgZWxlbWVudCBzaW5jZSBmdXJ0aGVyIHBpY2tzIGFyZSBub3QgYWxsb3dlZC4KICBpbnB1dEVsZW1lbnQuZGlzYWJsZWQgPSB0cnVlOwoKICBpZiAoIWZpbGVzKSB7CiAgICByZXR1cm4gewogICAgICByZXNwb25zZTogewogICAgICAgIGFjdGlvbjogJ2NvbXBsZXRlJywKICAgICAgfQogICAgfTsKICB9CgogIGZvciAoY29uc3QgZmlsZSBvZiBmaWxlcykgewogICAgY29uc3QgbGkgPSBkb2N1bWVudC5jcmVhdGVFbGVtZW50KCdsaScpOwogICAgbGkuYXBwZW5kKHNwYW4oZmlsZS5uYW1lLCB7Zm9udFdlaWdodDogJ2JvbGQnfSkpOwogICAgbGkuYXBwZW5kKHNwYW4oCiAgICAgICAgYCgke2ZpbGUudHlwZSB8fCAnbi9hJ30pIC0gJHtmaWxlLnNpemV9IGJ5dGVzLCBgICsKICAgICAgICBgbGFzdCBtb2RpZmllZDogJHsKICAgICAgICAgICAgZmlsZS5sYXN0TW9kaWZpZWREYXRlID8gZmlsZS5sYXN0TW9kaWZpZWREYXRlLnRvTG9jYWxlRGF0ZVN0cmluZygpIDoKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgJ24vYSd9IC0gYCkpOwogICAgY29uc3QgcGVyY2VudCA9IHNwYW4oJzAlIGRvbmUnKTsKICAgIGxpLmFwcGVuZENoaWxkKHBlcmNlbnQpOwoKICAgIG91dHB1dEVsZW1lbnQuYXBwZW5kQ2hpbGQobGkpOwoKICAgIGNvbnN0IGZpbGVEYXRhUHJvbWlzZSA9IG5ldyBQcm9taXNlKChyZXNvbHZlKSA9PiB7CiAgICAgIGNvbnN0IHJlYWRlciA9IG5ldyBGaWxlUmVhZGVyKCk7CiAgICAgIHJlYWRlci5vbmxvYWQgPSAoZSkgPT4gewogICAgICAgIHJlc29sdmUoZS50YXJnZXQucmVzdWx0KTsKICAgICAgfTsKICAgICAgcmVhZGVyLnJlYWRBc0FycmF5QnVmZmVyKGZpbGUpOwogICAgfSk7CiAgICAvLyBXYWl0IGZvciB0aGUgZGF0YSB0byBiZSByZWFkeS4KICAgIGxldCBmaWxlRGF0YSA9IHlpZWxkIHsKICAgICAgcHJvbWlzZTogZmlsZUR
hdGFQcm9taXNlLAogICAgICByZXNwb25zZTogewogICAgICAgIGFjdGlvbjogJ2NvbnRpbnVlJywKICAgICAgfQogICAgfTsKCiAgICAvLyBVc2UgYSBjaHVua2VkIHNlbmRpbmcgdG8gYXZvaWQgbWVzc2FnZSBzaXplIGxpbWl0cy4gU2VlIGIvNjIxMTU2NjAuCiAgICBsZXQgcG9zaXRpb24gPSAwOwogICAgZG8gewogICAgICBjb25zdCBsZW5ndGggPSBNYXRoLm1pbihmaWxlRGF0YS5ieXRlTGVuZ3RoIC0gcG9zaXRpb24sIE1BWF9QQVlMT0FEX1NJWkUpOwogICAgICBjb25zdCBjaHVuayA9IG5ldyBVaW50OEFycmF5KGZpbGVEYXRhLCBwb3NpdGlvbiwgbGVuZ3RoKTsKICAgICAgcG9zaXRpb24gKz0gbGVuZ3RoOwoKICAgICAgY29uc3QgYmFzZTY0ID0gYnRvYShTdHJpbmcuZnJvbUNoYXJDb2RlLmFwcGx5KG51bGwsIGNodW5rKSk7CiAgICAgIHlpZWxkIHsKICAgICAgICByZXNwb25zZTogewogICAgICAgICAgYWN0aW9uOiAnYXBwZW5kJywKICAgICAgICAgIGZpbGU6IGZpbGUubmFtZSwKICAgICAgICAgIGRhdGE6IGJhc2U2NCwKICAgICAgICB9LAogICAgICB9OwoKICAgICAgbGV0IHBlcmNlbnREb25lID0gZmlsZURhdGEuYnl0ZUxlbmd0aCA9PT0gMCA/CiAgICAgICAgICAxMDAgOgogICAgICAgICAgTWF0aC5yb3VuZCgocG9zaXRpb24gLyBmaWxlRGF0YS5ieXRlTGVuZ3RoKSAqIDEwMCk7CiAgICAgIHBlcmNlbnQudGV4dENvbnRlbnQgPSBgJHtwZXJjZW50RG9uZX0lIGRvbmVgOwoKICAgIH0gd2hpbGUgKHBvc2l0aW9uIDwgZmlsZURhdGEuYnl0ZUxlbmd0aCk7CiAgfQoKICAvLyBBbGwgZG9uZS4KICB5aWVsZCB7CiAgICByZXNwb25zZTogewogICAgICBhY3Rpb246ICdjb21wbGV0ZScsCiAgICB9CiAgfTsKfQoKc2NvcGUuZ29vZ2xlID0gc2NvcGUuZ29vZ2xlIHx8IHt9OwpzY29wZS5nb29nbGUuY29sYWIgPSBzY29wZS5nb29nbGUuY29sYWIgfHwge307CnNjb3BlLmdvb2dsZS5jb2xhYi5fZmlsZXMgPSB7CiAgX3VwbG9hZEZpbGVzLAogIF91cGxvYWRGaWxlc0NvbnRpbnVlLAp9Owp9KShzZWxmKTsK", + "ok": true, + "headers": [ + [ + "content-type", + "application/javascript" + ] + ], + "status": 200, + "status_text": "" + } + }, + "base_uri": "https://localhost:8080/", + "height": 73 + }, + "outputId": "e2ae7d4e-4190-44e7-db1b-e05862a02dd1" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " \n", + " Upload widget is only available when the cell has been executed in the\n", + " current browser session. 
import os

from adobe.pdfservices.operation.auth.credentials import Credentials
from adobe.pdfservices.operation.exception.exceptions import ServiceApiException, ServiceUsageException, SdkException
from adobe.pdfservices.operation.execution_context import ExecutionContext
from adobe.pdfservices.operation.io.file_ref import FileRef
from adobe.pdfservices.operation.pdfops.extract_pdf_operation import ExtractPDFOperation
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_element_type import ExtractElementType
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_pdf_options import ExtractPDFOptions
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_renditions_element_type import ExtractRenditionsElementType

# Credentials JSON uploaded to the working directory (see the introduction).
CREDENTIALS_FILE = "pdfservices-api-credentials.json"

# The result archive gets the same base name as the uploaded PDF.
basename, _ = os.path.splitext(filename)
zip_file = f"{basename}.zip"
print(f'Extract from {filename} ...')

# Fail early, with an actionable message, when the credentials were not uploaded.
if not os.path.isfile(CREDENTIALS_FILE):
    raise FileNotFoundError(
        f"{CREDENTIALS_FILE} not found - upload your credentials JSON and private.key first."
    )

try:
    credentials = (
        Credentials.service_account_credentials_builder()
        .from_file(CREDENTIALS_FILE)
        .build()
    )

    execution_context = ExecutionContext.create(credentials)
    extract_pdf_operation = ExtractPDFOperation.create_new()

    # Set operation input from a source file.
    source = FileRef.create_from_local_file(filename)
    extract_pdf_operation.set_input(source)

    # Build ExtractPDF options and set them into the operation:
    # text and table elements, plus renditions of tables and figures.
    extract_pdf_options: ExtractPDFOptions = (
        ExtractPDFOptions.builder()
        .with_elements_to_extract([ExtractElementType.TEXT, ExtractElementType.TABLES])
        .with_elements_to_extract_renditions(
            [ExtractRenditionsElementType.TABLES, ExtractRenditionsElementType.FIGURES]
        )
        .build()
    )
    extract_pdf_operation.set_options(extract_pdf_options)

    # Execute the operation.
    result = extract_pdf_operation.execute(execution_context)

    # Remove the archive left by a previous run so this cell can be re-run for
    # the same PDF.
    # NOTE(review): save_as in this SDK generation is understood to raise when
    # the destination already exists - confirm against the SDK documentation.
    if os.path.exists(zip_file):
        os.remove(zip_file)

    # Save the result to the specified location.
    result.save_as(zip_file)
except (ServiceApiException, ServiceUsageException, SdkException) as err:
    # Say which file failed, then re-raise so the original SDK exception and
    # traceback stay visible.
    print(f'PDF Extract failed for {filename}: {err}')
    raise

print(f'Saved as {zip_file}')
import json
import zipfile

# Read the JSON report out of the result archive written by the previous cell
# and parse it.
with zipfile.ZipFile(zip_file) as archive:
    raw = archive.read('structuredData.json').decode()
    data = json.loads(raw)

# Collect the "Text" entry of every element that has one, each followed by a
# newline, in document order.
text = ''.join(
    item["Text"] + "\n"
    for item in data["elements"]
    if "Text" in item
)

# NOTE: this prints the full extracted text, which may include personal data
# from the PDF; clear the cell output before sharing the notebook.
print(text)
"$14.67 \n", + "8/10/2022 \n", + "1754504 \n", + "STATE RECOVERY FEE \n", + "$0.85 \n", + "8/11/2022 \n", + "1754615 \n", + "MC *0354 \n", + "**BALANCE** \n", + "($368.51) \n", + "$0.00 \n", + "Hilton Honors(R) stays are posted within 72 hours of checkout. To check your earnings or book your next stay at more than 6,500+ hotels and resorts in 119 countries, please visit Honors.com \n", + "WE LOVE HAVING YOU HERE!! \n", + "Page:1 \n", + "\n" + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/0-jc_notebooks/parsePDF_AdobeAPI.ipynb b/0-jc_notebooks/parsePDF_AdobeAPI.ipynb new file mode 100644 index 00000000000..2becb454330 --- /dev/null +++ b/0-jc_notebooks/parsePDF_AdobeAPI.ipynb @@ -0,0 +1,287 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "sijtZnY7_kUV" + }, + "source": [ + "#PDF Extract Demo\n", + "\n", + "To use this notebook, please upload your credentials, both the JSON file and private.key, to the content directory. Click on the folder icon at left, click on the .. to show folders, click on the content folder, and then click the three dots to the right of the content folder and click Upload. You can upload both by selecting both.\n", + "\n", + "Then execute the first cell to install the SDK. You'll see an error about pip's dependency resolver. You can ignore this error, but you do need to restart the runtime by clicking the button at the bottom of the pip install output." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NJKQCjL5cVZK" + }, + "outputs": [], + "source": [ + "!pip install pdfservices-sdk" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yrFCZeMNSjl6" + }, + "source": [ + "Run this cell, then choose a file to upload to the Extract service. If you get an error like this:\n", + "```\n", + "Cannot read property '_uploadFiles' of undefined\n", + "```\n", + "then you'll need to enable third-party cookies. On Chrome, go to chrome://settings/content/cookies." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "background_save": true, + "base_uri": "https://localhost:8080/", + "height": 38, + "resources": { + "http://localhost:8080/nbextensions/google.colab/files.js": { + "data": "Ly8gQ29weXJpZ2h0IDIwMTcgR29vZ2xlIExMQwovLwovLyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKLy8geW91IG1heSBub3QgdXNlIHRoaXMgZmlsZSBleGNlcHQgaW4gY29tcGxpYW5jZSB3aXRoIHRoZSBMaWNlbnNlLgovLyBZb3UgbWF5IG9idGFpbiBhIGNvcHkgb2YgdGhlIExpY2Vuc2UgYXQKLy8KLy8gICAgICBodHRwOi8vd3d3LmFwYWNoZS5vcmcvbGljZW5zZXMvTElDRU5TRS0yLjAKLy8KLy8gVW5sZXNzIHJlcXVpcmVkIGJ5IGFwcGxpY2FibGUgbGF3IG9yIGFncmVlZCB0byBpbiB3cml0aW5nLCBzb2Z0d2FyZQovLyBkaXN0cmlidXRlZCB1bmRlciB0aGUgTGljZW5zZSBpcyBkaXN0cmlidXRlZCBvbiBhbiAiQVMgSVMiIEJBU0lTLAovLyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KLy8gU2VlIHRoZSBMaWNlbnNlIGZvciB0aGUgc3BlY2lmaWMgbGFuZ3VhZ2UgZ292ZXJuaW5nIHBlcm1pc3Npb25zIGFuZAovLyBsaW1pdGF0aW9ucyB1bmRlciB0aGUgTGljZW5zZS4KCi8qKgogKiBAZmlsZW92ZXJ2aWV3IEhlbHBlcnMgZm9yIGdvb2dsZS5jb2xhYiBQeXRob24gbW9kdWxlLgogKi8KKGZ1bmN0aW9uKHNjb3BlKSB7CmZ1bmN0aW9uIHNwYW4odGV4dCwgc3R5bGVBdHRyaWJ1dGVzID0ge30pIHsKICBjb25zdCBlbGVtZW50ID0gZG9jdW1lbnQuY3JlYXRlRWxlbWVudCgnc3BhbicpOwogIGVsZW1lbnQudGV4dENvbnRlbnQgPSB0ZXh0OwogIGZvciAoY29uc3Qga2V5IG9mIE9iamVjdC5rZXlzKHN0eWxlQXR0cmlidXRlcykpIHsKICAgIGVsZW1lbnQuc3R5bGVba2V5XSA9IHN0eWxlQXR0cmlidXRlc1trZXldOwogIH0KICByZXR1cm4gZWxlbWVudDsKfQoKLy8gTWF4IG51bWJlciBvZiBieXRlcyB3aGljaCB3aWxsIGJlIHVwbG9hZGVkIGF0IGEgdGltZS4KY29uc3QgTUFYX1BBWUxPQURfU0laRSA9IDEwMCAqIDEwMjQ7CgpmdW5jdGlvbiBfdXBsb2FkRmlsZXMoaW5wdXRJZCwgb3V0cHV0SWQpIHsKICBjb25zdCBzdGVwcyA9IHVwbG9hZEZpbGVzU3RlcChpbnB1dElkLCBvdXRwdXRJZCk7CiAgY29uc3Qgb3V0cHV0RWxlbWVudCA9IGRvY3VtZW50LmdldEVsZW1lbnRCeUlkKG91dHB1dElkKTsKICAvLyBDYWNoZSBzdGVwcyBvbiB0aGUgb3V0cHV0RWxlbWVudCB0byBtYWtlIGl0IGF2YWlsYWJsZSBmb3IgdGhlIG5leHQgY2FsbAogIC8vIHRvIHVwbG9hZEZpbGVzQ29udGludWUgZnJvbSBQeXRob24uCiAgb3V0cHV0RWxl
bWVudC5zdGVwcyA9IHN0ZXBzOwoKICByZXR1cm4gX3VwbG9hZEZpbGVzQ29udGludWUob3V0cHV0SWQpOwp9CgovLyBUaGlzIGlzIHJvdWdobHkgYW4gYXN5bmMgZ2VuZXJhdG9yIChub3Qgc3VwcG9ydGVkIGluIHRoZSBicm93c2VyIHlldCksCi8vIHdoZXJlIHRoZXJlIGFyZSBtdWx0aXBsZSBhc3luY2hyb25vdXMgc3RlcHMgYW5kIHRoZSBQeXRob24gc2lkZSBpcyBnb2luZwovLyB0byBwb2xsIGZvciBjb21wbGV0aW9uIG9mIGVhY2ggc3RlcC4KLy8gVGhpcyB1c2VzIGEgUHJvbWlzZSB0byBibG9jayB0aGUgcHl0aG9uIHNpZGUgb24gY29tcGxldGlvbiBvZiBlYWNoIHN0ZXAsCi8vIHRoZW4gcGFzc2VzIHRoZSByZXN1bHQgb2YgdGhlIHByZXZpb3VzIHN0ZXAgYXMgdGhlIGlucHV0IHRvIHRoZSBuZXh0IHN0ZXAuCmZ1bmN0aW9uIF91cGxvYWRGaWxlc0NvbnRpbnVlKG91dHB1dElkKSB7CiAgY29uc3Qgb3V0cHV0RWxlbWVudCA9IGRvY3VtZW50LmdldEVsZW1lbnRCeUlkKG91dHB1dElkKTsKICBjb25zdCBzdGVwcyA9IG91dHB1dEVsZW1lbnQuc3RlcHM7CgogIGNvbnN0IG5leHQgPSBzdGVwcy5uZXh0KG91dHB1dEVsZW1lbnQubGFzdFByb21pc2VWYWx1ZSk7CiAgcmV0dXJuIFByb21pc2UucmVzb2x2ZShuZXh0LnZhbHVlLnByb21pc2UpLnRoZW4oKHZhbHVlKSA9PiB7CiAgICAvLyBDYWNoZSB0aGUgbGFzdCBwcm9taXNlIHZhbHVlIHRvIG1ha2UgaXQgYXZhaWxhYmxlIHRvIHRoZSBuZXh0CiAgICAvLyBzdGVwIG9mIHRoZSBnZW5lcmF0b3IuCiAgICBvdXRwdXRFbGVtZW50Lmxhc3RQcm9taXNlVmFsdWUgPSB2YWx1ZTsKICAgIHJldHVybiBuZXh0LnZhbHVlLnJlc3BvbnNlOwogIH0pOwp9CgovKioKICogR2VuZXJhdG9yIGZ1bmN0aW9uIHdoaWNoIGlzIGNhbGxlZCBiZXR3ZWVuIGVhY2ggYXN5bmMgc3RlcCBvZiB0aGUgdXBsb2FkCiAqIHByb2Nlc3MuCiAqIEBwYXJhbSB7c3RyaW5nfSBpbnB1dElkIEVsZW1lbnQgSUQgb2YgdGhlIGlucHV0IGZpbGUgcGlja2VyIGVsZW1lbnQuCiAqIEBwYXJhbSB7c3RyaW5nfSBvdXRwdXRJZCBFbGVtZW50IElEIG9mIHRoZSBvdXRwdXQgZGlzcGxheS4KICogQHJldHVybiB7IUl0ZXJhYmxlPCFPYmplY3Q+fSBJdGVyYWJsZSBvZiBuZXh0IHN0ZXBzLgogKi8KZnVuY3Rpb24qIHVwbG9hZEZpbGVzU3RlcChpbnB1dElkLCBvdXRwdXRJZCkgewogIGNvbnN0IGlucHV0RWxlbWVudCA9IGRvY3VtZW50LmdldEVsZW1lbnRCeUlkKGlucHV0SWQpOwogIGlucHV0RWxlbWVudC5kaXNhYmxlZCA9IGZhbHNlOwoKICBjb25zdCBvdXRwdXRFbGVtZW50ID0gZG9jdW1lbnQuZ2V0RWxlbWVudEJ5SWQob3V0cHV0SWQpOwogIG91dHB1dEVsZW1lbnQuaW5uZXJIVE1MID0gJyc7CgogIGNvbnN0IHBpY2tlZFByb21pc2UgPSBuZXcgUHJvbWlzZSgocmVzb2x2ZSkgPT4gewogICAgaW5wdXRFbGVtZW50LmFkZEV2ZW50TGlzdGVuZXIoJ2NoYW5nZScsIChlKSA9PiB7CiAgICAgIHJlc29sdmUoZS50YXJnZXQuZmls
ZXMpOwogICAgfSk7CiAgfSk7CgogIGNvbnN0IGNhbmNlbCA9IGRvY3VtZW50LmNyZWF0ZUVsZW1lbnQoJ2J1dHRvbicpOwogIGlucHV0RWxlbWVudC5wYXJlbnRFbGVtZW50LmFwcGVuZENoaWxkKGNhbmNlbCk7CiAgY2FuY2VsLnRleHRDb250ZW50ID0gJ0NhbmNlbCB1cGxvYWQnOwogIGNvbnN0IGNhbmNlbFByb21pc2UgPSBuZXcgUHJvbWlzZSgocmVzb2x2ZSkgPT4gewogICAgY2FuY2VsLm9uY2xpY2sgPSAoKSA9PiB7CiAgICAgIHJlc29sdmUobnVsbCk7CiAgICB9OwogIH0pOwoKICAvLyBXYWl0IGZvciB0aGUgdXNlciB0byBwaWNrIHRoZSBmaWxlcy4KICBjb25zdCBmaWxlcyA9IHlpZWxkIHsKICAgIHByb21pc2U6IFByb21pc2UucmFjZShbcGlja2VkUHJvbWlzZSwgY2FuY2VsUHJvbWlzZV0pLAogICAgcmVzcG9uc2U6IHsKICAgICAgYWN0aW9uOiAnc3RhcnRpbmcnLAogICAgfQogIH07CgogIGNhbmNlbC5yZW1vdmUoKTsKCiAgLy8gRGlzYWJsZSB0aGUgaW5wdXQgZWxlbWVudCBzaW5jZSBmdXJ0aGVyIHBpY2tzIGFyZSBub3QgYWxsb3dlZC4KICBpbnB1dEVsZW1lbnQuZGlzYWJsZWQgPSB0cnVlOwoKICBpZiAoIWZpbGVzKSB7CiAgICByZXR1cm4gewogICAgICByZXNwb25zZTogewogICAgICAgIGFjdGlvbjogJ2NvbXBsZXRlJywKICAgICAgfQogICAgfTsKICB9CgogIGZvciAoY29uc3QgZmlsZSBvZiBmaWxlcykgewogICAgY29uc3QgbGkgPSBkb2N1bWVudC5jcmVhdGVFbGVtZW50KCdsaScpOwogICAgbGkuYXBwZW5kKHNwYW4oZmlsZS5uYW1lLCB7Zm9udFdlaWdodDogJ2JvbGQnfSkpOwogICAgbGkuYXBwZW5kKHNwYW4oCiAgICAgICAgYCgke2ZpbGUudHlwZSB8fCAnbi9hJ30pIC0gJHtmaWxlLnNpemV9IGJ5dGVzLCBgICsKICAgICAgICBgbGFzdCBtb2RpZmllZDogJHsKICAgICAgICAgICAgZmlsZS5sYXN0TW9kaWZpZWREYXRlID8gZmlsZS5sYXN0TW9kaWZpZWREYXRlLnRvTG9jYWxlRGF0ZVN0cmluZygpIDoKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgJ24vYSd9IC0gYCkpOwogICAgY29uc3QgcGVyY2VudCA9IHNwYW4oJzAlIGRvbmUnKTsKICAgIGxpLmFwcGVuZENoaWxkKHBlcmNlbnQpOwoKICAgIG91dHB1dEVsZW1lbnQuYXBwZW5kQ2hpbGQobGkpOwoKICAgIGNvbnN0IGZpbGVEYXRhUHJvbWlzZSA9IG5ldyBQcm9taXNlKChyZXNvbHZlKSA9PiB7CiAgICAgIGNvbnN0IHJlYWRlciA9IG5ldyBGaWxlUmVhZGVyKCk7CiAgICAgIHJlYWRlci5vbmxvYWQgPSAoZSkgPT4gewogICAgICAgIHJlc29sdmUoZS50YXJnZXQucmVzdWx0KTsKICAgICAgfTsKICAgICAgcmVhZGVyLnJlYWRBc0FycmF5QnVmZmVyKGZpbGUpOwogICAgfSk7CiAgICAvLyBXYWl0IGZvciB0aGUgZGF0YSB0byBiZSByZWFkeS4KICAgIGxldCBmaWxlRGF0YSA9IHlpZWxkIHsKICAgICAgcHJvbWlzZTogZmlsZURhdGFQcm9taXNlLAogICAgICByZXNwb25zZTogewogICAgICAgIGFjdGlvbjogJ2NvbnRpbnVlJywKICAg
ICAgfQogICAgfTsKCiAgICAvLyBVc2UgYSBjaHVua2VkIHNlbmRpbmcgdG8gYXZvaWQgbWVzc2FnZSBzaXplIGxpbWl0cy4gU2VlIGIvNjIxMTU2NjAuCiAgICBsZXQgcG9zaXRpb24gPSAwOwogICAgZG8gewogICAgICBjb25zdCBsZW5ndGggPSBNYXRoLm1pbihmaWxlRGF0YS5ieXRlTGVuZ3RoIC0gcG9zaXRpb24sIE1BWF9QQVlMT0FEX1NJWkUpOwogICAgICBjb25zdCBjaHVuayA9IG5ldyBVaW50OEFycmF5KGZpbGVEYXRhLCBwb3NpdGlvbiwgbGVuZ3RoKTsKICAgICAgcG9zaXRpb24gKz0gbGVuZ3RoOwoKICAgICAgY29uc3QgYmFzZTY0ID0gYnRvYShTdHJpbmcuZnJvbUNoYXJDb2RlLmFwcGx5KG51bGwsIGNodW5rKSk7CiAgICAgIHlpZWxkIHsKICAgICAgICByZXNwb25zZTogewogICAgICAgICAgYWN0aW9uOiAnYXBwZW5kJywKICAgICAgICAgIGZpbGU6IGZpbGUubmFtZSwKICAgICAgICAgIGRhdGE6IGJhc2U2NCwKICAgICAgICB9LAogICAgICB9OwoKICAgICAgbGV0IHBlcmNlbnREb25lID0gZmlsZURhdGEuYnl0ZUxlbmd0aCA9PT0gMCA/CiAgICAgICAgICAxMDAgOgogICAgICAgICAgTWF0aC5yb3VuZCgocG9zaXRpb24gLyBmaWxlRGF0YS5ieXRlTGVuZ3RoKSAqIDEwMCk7CiAgICAgIHBlcmNlbnQudGV4dENvbnRlbnQgPSBgJHtwZXJjZW50RG9uZX0lIGRvbmVgOwoKICAgIH0gd2hpbGUgKHBvc2l0aW9uIDwgZmlsZURhdGEuYnl0ZUxlbmd0aCk7CiAgfQoKICAvLyBBbGwgZG9uZS4KICB5aWVsZCB7CiAgICByZXNwb25zZTogewogICAgICBhY3Rpb246ICdjb21wbGV0ZScsCiAgICB9CiAgfTsKfQoKc2NvcGUuZ29vZ2xlID0gc2NvcGUuZ29vZ2xlIHx8IHt9OwpzY29wZS5nb29nbGUuY29sYWIgPSBzY29wZS5nb29nbGUuY29sYWIgfHwge307CnNjb3BlLmdvb2dsZS5jb2xhYi5fZmlsZXMgPSB7CiAgX3VwbG9hZEZpbGVzLAogIF91cGxvYWRGaWxlc0NvbnRpbnVlLAp9Owp9KShzZWxmKTsK", + "headers": [ + [ + "content-type", + "application/javascript" + ] + ], + "ok": true, + "status": 200, + "status_text": "" + } + } + }, + "id": "a2mBY71Uuhrj", + "outputId": "3f994047-c1df-45a0-c15c-3ac408a4632c" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " Upload widget is only available when the cell has been executed in the\n", + " current browser session. 
Please rerun this cell to enable.\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "ename": "KeyboardInterrupt", + "evalue": "ignored", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mgoogle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolab\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mfiles\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0minput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfiles\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mfilename\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkeys\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/google/colab/files.py\u001b[0m in \u001b[0;36mupload\u001b[0;34m()\u001b[0m\n\u001b[1;32m 69\u001b[0m result = _output.eval_js(\n\u001b[1;32m 70\u001b[0m 'google.colab._files._uploadFilesContinue(\"{output_id}\")'.format(\n\u001b[0;32m---> 71\u001b[0;31m output_id=output_id))\n\u001b[0m\u001b[1;32m 72\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'action'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;34m'append'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 73\u001b[0m \u001b[0;31m# 
JS side uses a generator of promises to process all of the files- some\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/google/colab/output/_js.py\u001b[0m in \u001b[0;36meval_js\u001b[0;34m(script, ignore_result, timeout_sec)\u001b[0m\n\u001b[1;32m 38\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mignore_result\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 39\u001b[0m \u001b[0;32mreturn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 40\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_message\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_reply_from_input\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout_sec\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 41\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 42\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/google/colab/_message.py\u001b[0m in \u001b[0;36mread_reply_from_input\u001b[0;34m(message_id, timeout_sec)\u001b[0m\n\u001b[1;32m 99\u001b[0m \u001b[0mreply\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_read_next_input_message\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 100\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mreply\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0m_NOT_READY\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreply\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 101\u001b[0;31m 
\u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msleep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0.025\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 102\u001b[0m \u001b[0;32mcontinue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 103\u001b[0m if (reply.get('type') == 'colab_reply' and\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "from google.colab import files\n", + "input = files.upload()\n", + "filename = list(input.keys())[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cMPb2ak5ZdAe" + }, + "source": [ + "Run this cell to upload your file to the service. If successful, it will create a zip file with the same basename in content folder. If you get an error that adobe was not found, you'll need to reinstall the SDK (the first cell)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "z9WtP5J_bxin", + "outputId": "4a79097a-1589-480a-e2eb-25d4f98d198c" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Extract from PlanetaryScienceDecadalSurvey.pdf ...\n", + "Saved as PlanetaryScienceDecadalSurvey.zip\n" + ] + } + ], + "source": [ + "from adobe.pdfservices.operation.auth.credentials import Credentials\n", + "from adobe.pdfservices.operation.exception.exceptions import ServiceApiException, ServiceUsageException, SdkException\n", + "from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_pdf_options import ExtractPDFOptions\n", + "from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_element_type import ExtractElementType\n", + "from adobe.pdfservices.operation.execution_context import ExecutionContext\n", + "from adobe.pdfservices.operation.io.file_ref import FileRef\n", + "from adobe.pdfservices.operation.pdfops.extract_pdf_operation import ExtractPDFOperation\n", + "from 
adobe.pdfservices.operation.pdfops.options.extractpdf.extract_renditions_element_type import ExtractRenditionsElementType\n", + "\n", + "import os\n", + "\n", + "basename, _ = os.path.splitext(filename)\n", + "zip_file = f\"{basename}.zip\"\n", + "print(f'Extract from {filename} ...')\n", + "\n", + "\n", + "credentials = Credentials.service_account_credentials_builder()\\\n", + " .from_file(\"pdfservices-api-credentials.json\") \\\n", + " .build()\n", + "\n", + "execution_context = ExecutionContext.create(credentials)\n", + "extract_pdf_operation = ExtractPDFOperation.create_new()\n", + "\n", + "#Set operation input from a source file.\n", + "source = FileRef.create_from_local_file(filename)\n", + "extract_pdf_operation.set_input(source)\n", + "\n", + "# Build ExtractPDF options and set them into the operation\n", + "extract_pdf_options: ExtractPDFOptions = ExtractPDFOptions.builder() \\\n", + " .with_elements_to_extract([ExtractElementType.TEXT, ExtractElementType.TABLES]) \\\n", + " .with_elements_to_extract_renditions([ExtractRenditionsElementType.TABLES,ExtractRenditionsElementType.FIGURES]) \\\n", + " .build()\n", + "\n", + "extract_pdf_operation.set_options(extract_pdf_options)\n", + "\n", + "#Execute the operation.\n", + "result = extract_pdf_operation.execute(execution_context)\n", + "\n", + "# Save the result to the specified location.\n", + "result.save_as(zip_file)\n", + "print(f'Saved as {zip_file}')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cH7tHmyzZtji" + }, + "source": [ + "You can download the zip file from the content folder yourself (using the three dots menu in the Files tree) or run this cell. Uploaded files are automatically deleted when the session ends, or you can do it yourself from the Files tree. (You may have to close and re-open the content folder for the pdf and zip files to show up.)"
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 17 + }, + "id": "jHLWiqQIyzaY", + "outputId": "f61c00f2-826a-4698-f43b-c61752ca4a6f" + }, + "outputs": [ + { + "data": { + "application/javascript": [ + "\n", + " async function download(id, filename, size) {\n", + " if (!google.colab.kernel.accessAllowed) {\n", + " return;\n", + " }\n", + " const div = document.createElement('div');\n", + " const label = document.createElement('label');\n", + " label.textContent = `Downloading \"${filename}\": `;\n", + " div.appendChild(label);\n", + " const progress = document.createElement('progress');\n", + " progress.max = size;\n", + " div.appendChild(progress);\n", + " document.body.appendChild(div);\n", + "\n", + " const buffers = [];\n", + " let downloaded = 0;\n", + "\n", + " const channel = await google.colab.kernel.comms.open(id);\n", + " // Send a message to notify the kernel that we're ready.\n", + " channel.send({})\n", + "\n", + " for await (const message of channel.messages) {\n", + " // Send a message to notify the kernel that we're ready.\n", + " channel.send({})\n", + " if (message.buffers) {\n", + " for (const buffer of message.buffers) {\n", + " buffers.push(buffer);\n", + " downloaded += buffer.byteLength;\n", + " progress.value = downloaded;\n", + " }\n", + " }\n", + " }\n", + " const blob = new Blob(buffers, {type: 'application/binary'});\n", + " const a = document.createElement('a');\n", + " a.href = window.URL.createObjectURL(blob);\n", + " a.download = filename;\n", + " div.appendChild(a);\n", + " a.click();\n", + " div.remove();\n", + " }\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "download(\"download_65ee8856-6e93-490d-9a86-20e6d98974b0\", \"PlanetaryScienceDecadalSurvey.zip\", 869238)" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + 
"output_type": "display_data" + } + ], + "source": [ + "from google.colab import files\n", + "\n", + "files.download(zip_file)" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file