Pdf subsampler #6

rom1504 · 2023-01-08T20:41:41Z

https://pymupdf.readthedocs.io/en/latest/app1.html#performance

rom1504 · 2023-01-22T21:20:18Z

https://github.com/sandorkonya/forms-dataset/blob/main/PDF_Info_retrieval.ipynb

sandorkonya · 2023-01-25T23:42:46Z

PdfSubsampler

import fitz
from urllib.parse import quote

class Subsampler:
    """Extract simple structural statistics from a PDF HTTP response.

    Instances are callables taking the HTTP response object (not the file),
    so the response headers can be checked before the body is parsed.
    """

    # Media types treated as PDF; compared against the Content-Type header
    # after dropping parameters (e.g. "; charset=binary") and lowercasing.
    _PDF_CONTENT_TYPES = ("application/pdf", "stream/pdf")

    def __init__(self, timeout=5, maxlength=10**6):
        """Store the configuration.

        timeout: kept on the instance; not used by ``__call__`` itself.
        maxlength: exclusive upper bound on the Content-Length header (in
            the header's units) for a document to be parsed.
        """
        self.timeout = timeout
        self.maxlength = maxlength

    def _headers_ok(self, headers):
        """Return True when the headers announce a PDF smaller than maxlength."""
        content_type = headers.get("Content-Type") or ""
        # Ignore media-type parameters and letter case when matching.
        media_type = content_type.split(";", 1)[0].strip().lower()
        if media_type not in self._PDF_CONTENT_TYPES:
            return False
        try:
            length = int(headers.get("Content-Length"))
        except (TypeError, ValueError):
            # Header missing or not an integer: size cannot be verified.
            return False
        return length < self.maxlength

    def __call__(self, r):
        """Compute statistics for the PDF carried by response ``r``.

        input: a response object exposing ``status_code``, ``headers``,
            ``content`` and ``url``.
        output: the list
            [error, url, doc_numberofpages, doc_form, doc_size,
             characters, links, images, blocks, words]
            where ``error`` is 0 on success or one of "StatusCodeError",
            "HeaderError", "OpenError"; ``url`` is the percent-quoted
            response URL; ``doc_size`` is the raw Content-Length header
            value (0 when the header is absent or the status is not 200);
            every other field is a count that stays 0 unless the document
            was parsed.
        """
        error = 0
        doc_size = doc_numberofpages = doc_form = 0
        characters = links = images = words = blocks = 0

        if r.status_code != 200:
            error = "StatusCodeError"
        else:
            raw_length = r.headers.get("Content-Length")
            if raw_length is not None:
                doc_size = raw_length

            if not self._headers_ok(r.headers):
                error = "HeaderError"
            else:
                try:
                    # The context manager closes the document even when a
                    # page fails to parse.
                    with fitz.open(stream=r.content, filetype="pdf") as doc:
                        # number of pages in the pdf
                        doc_numberofpages = len(doc)

                        # 0 if no form element, otherwise the number of
                        # form elements in the pdf
                        doc_form = doc.is_form_pdf or 0

                        # accumulate readable characters, links, images,
                        # text blocks and words over all pages
                        for page in doc:
                            characters += len(page.get_text())
                            links += len(page.get_links())
                            images += len(page.get_images())
                            # Build the text page once and reuse it for
                            # both block and word extraction.
                            textpage = page.get_textpage()
                            blocks += len(textpage.extractBLOCKS())
                            words += len(textpage.extractWORDS())
                except Exception:
                    # Counts gathered before the failure are kept as-is.
                    error = "OpenError"

        return [
            error,
            quote(r.url, safe='/:?&'),
            doc_numberofpages,
            doc_form,
            doc_size,
            characters,
            links,
            images,
            blocks,
            words,
        ]

`__call__(self, r)`, where r is the request and not the file (since we want to run some tests on the request headers as well).

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Pdf subsampler #6

Pdf subsampler #6

rom1504 commented Jan 8, 2023

rom1504 commented Jan 22, 2023

sandorkonya commented Jan 25, 2023 •

edited

Loading

Pdf subsampler #6

Pdf subsampler #6

Comments

rom1504 commented Jan 8, 2023

rom1504 commented Jan 22, 2023

sandorkonya commented Jan 25, 2023 • edited Loading

sandorkonya commented Jan 25, 2023 •

edited

Loading