Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pdf subsampler #6

Open
rom1504 opened this issue Jan 8, 2023 · 2 comments
Open

Pdf subsampler #6

rom1504 opened this issue Jan 8, 2023 · 2 comments

Comments

@rom1504
Copy link
Owner

rom1504 commented Jan 8, 2023

https://pymupdf.readthedocs.io/en/latest/app1.html#performance

@rom1504
Copy link
Owner Author

rom1504 commented Jan 22, 2023

@sandorkonya
Copy link

sandorkonya commented Jan 25, 2023

PdfSubsampler

import fitz
from urllib.parse import quote

class Subsampler:
    """Compute summary statistics for a PDF delivered in an HTTP response.

    Instances are callable: pass a response-like object exposing
    ``status_code``, ``headers``, ``content`` and ``url`` and get back a flat
    list describing the document (see ``__call__``).
    """

    # Media types accepted as PDF. Compared against the Content-Type header
    # after stripping parameters (e.g. "; charset=...") and lower-casing.
    _PDF_MEDIA_TYPES = frozenset({"application/pdf", "stream/pdf"})

    def __init__(self, timeout=5, maxlength=10**6):
        """Store the configuration.

        Args:
            timeout: stored on the instance; not used by this class itself.
            maxlength: exclusive upper bound on the declared Content-Length
                (in bytes) for a document to be parsed.
        """
        self.timeout = timeout
        self.maxlength = maxlength

    def __call__(self, r):
        """Inspect the response ``r`` and return statistics about its PDF.

        Args:
            r: a response-like object with ``status_code``, ``headers``
                (mapping with ``.get``), ``content`` (bytes) and ``url``.

        Returns:
            A list, in this order:
            ``[error, url, doc_numberofpages, doc_form, doc_size,
            characters, links, images, blocks, words]``.

            ``error`` is ``0`` on success, otherwise one of
            ``"StatusCodeError"`` (status is not 200), ``"HeaderError"``
            (Content-Type is not a PDF type, or Content-Length is missing,
            not an integer, or not below ``maxlength``) or ``"OpenError"``
            (the document could not be opened or parsed).
            ``url`` is ``r.url`` percent-quoted, keeping ``/:?&`` as is.
            ``doc_size`` is the raw Content-Length header value when the
            header is present, else ``0``. All counters default to ``0``.
        """
        error = 0
        doc_size = doc_numberofpages = doc_form = 0
        characters = links = images = words = blocks = 0

        if r.status_code != 200:
            error = "StatusCodeError"
        else:
            headers = r.headers

            # A missing header must not raise: chunked responses, for
            # instance, carry no Content-Length at all.
            raw_length = headers.get("Content-Length")
            if raw_length is not None:
                doc_size = raw_length
            try:
                declared_length = int(raw_length)
            except (TypeError, ValueError):
                declared_length = None

            # Ignore media-type parameters and case so that values such as
            # "Application/PDF; charset=binary" are still recognised.
            content_type = headers.get("Content-Type") or ""
            media_type = content_type.split(";", 1)[0].strip().lower()

            if (
                media_type not in self._PDF_MEDIA_TYPES
                or declared_length is None
                or declared_length >= self.maxlength
            ):
                # Without a trustworthy length below the limit the body is
                # not touched at all.
                error = "HeaderError"
            else:
                try:
                    # The context manager closes the document even when
                    # parsing fails part-way through.
                    with fitz.open(stream=r.content, filetype="pdf") as doc:
                        # number of pages in the pdf
                        doc_numberofpages = len(doc)

                        # 0 if no form element, otherwise the number of
                        # form elements in the pdf
                        doc_form = doc.is_form_pdf or 0

                        # count together characters of readable text,
                        # links, images, text blocks and words
                        for page in doc:
                            characters += len(page.get_text())
                            links += len(page.get_links())
                            images += len(page.get_images())
                            # build the text page once and reuse it
                            textpage = page.get_textpage()
                            blocks += len(textpage.extractBLOCKS())
                            words += len(textpage.extractWORDS())
                except Exception:
                    # Deliberate best effort: any failure while opening or
                    # reading a (possibly corrupt) PDF is reported as an
                    # error code instead of aborting the caller.
                    error = "OpenError"

        return [
            error,
            quote(r.url, safe='/:?&'),
            doc_numberofpages,
            doc_form,
            doc_size,
            characters,
            links,
            images,
            blocks,
            words,
        ]

`__call__(self, r)`, where r is the request and not the file (since we also want to run some tests on the request header).

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants