Merge branch 'release/0.0.5'

OpenBookPublishers · Jul 14, 2022 · 93d18f5 · 93d18f5
2 parents afce36b + 9d51a06
commit 93d18f5
Show file tree

Hide file tree

Showing 12 changed files with 415 additions and 305 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.8.0-slim-buster
+FROM python:3.9.0-slim-buster
 
 WORKDIR /ebook_automation
 
@@ -19,5 +19,5 @@ COPY ./src/ ./
 ENV COVER_PAGE=0
 ENV COPYRIGHT_PAGE=4
 
-CMD python main.py ./pdf_file.pdf ./output \
-                   --metadata ./pdf_file.json
+ENTRYPOINT ["python3"]
+CMD ["./main.py", "--help"]
diff --git a/README.md b/README.md
@@ -1,47 +1,59 @@
 # chapter-splitter
 *chapter-splitter* is a tool to split PDF books into individual chapters.
 
-Chapter data needs to be previously submitted to (Crossref)[https://www.crossref.org/] so that `chapter-splitter` can query the server and retrieve information such as chapter page ranges, title and author(s) to add to the output PDFs.
+Chapter data needs to be previously submitted to [Crossref](https://www.crossref.org/) or [Thoth](https://thoth.pub/) so that `chapter-splitter` can query the server and retrieve information such as chapter page ranges, title and author(s) to add to the output PDFs.
 
 # Usage
 
-`chapter-splitter` requires:
+The help page $ `python3 ./main.py --help` reports:
+```
+Usage: main.py [OPTIONS] DOI
 
- - PDF of the book;
- - A metadata json file with the isbn of the book, structured as `{"isbn": "978-1-80064-422-9"}`
+Arguments:
+  DOI  [required]
 
-## Running with docker
-If required, specify cover and copyright page numbers (zero based) in the Dockerfile (or override it in your `docker run [...]` command) as env variables.
+Options:
+  --input-file PATH               [default: ./file.pdf]
+  --output-folder PATH            [default: ./output/]
+  --database TEXT                 [default: thoth]
+  --help                          Show this message and exit.
+```
+
+so a running command would look something like this:
+
+$ `python3 ./main.py --input-file my_file.pdf --output-folder ~/output \
+                     --database crossref 10.11647/obp.0309`
+
+or querying Thoth:
+
+$ `python3 ./main.py --input-file my_file.pdf --output-folder ~/output \
+                     --database thoth 10.11647/obp.0309`
+
+`chapter-splitter` tries to append both the front cover of the original PDF and the copyright page to the output files. The page numbers of these pages in the original document (zero-based) are set with the environment variables `COVER_PAGE` and `COPYRIGHT_PAGE`.
 
+$ `COVER_PAGE=0`
+$ `COPYRIGHT_PAGE=4`
+
+
+## Running with docker
+Running the command reported above in docker would be:
 ```
 docker run --rm \
-  -v /path/to/local.pdf:/ebook_automation/pdf_file.pdf \
-  -v /path/to/local.json:/ebook_automation/pdf_file.json \
+  -v /path/to/local.pdf:/ebook_automation/file.pdf \
   -v /path/to/output:/ebook_automation/output \
-  openbookpublishers/chapter-splitter
+  openbookpublishers/chapter-splitter \
+  main.py 10.11647/obp.0309
 ```
 
 Alternatively you may clone the repo, build the image using `docker build . -t some/tag` and run the command above replacing `openbookpublishers/chapter-splitter` with `some/tag`.
 
 ## Running locally
 ### Installation
-*chapter-splitter* requires **pdftk** and **exiftool** to be installed on your system. These tools are available  in the official repositories of debian/debian-based distributions.
-Run `apt-get install pdftk exiftool`.
+*chapter-splitter* requires **exiftool** to be installed on your system. This tool is available in the official repositories of Debian and Debian-based distributions.
+Run `apt-get install exiftool`.
 
 Besides the Python standard library, *chapter-splitter* requires some extra libraries listed in `requirements.txt`. To install them (within a virtual environment, if you prefer), run `pip3 install -r requirements.txt`.
 
-#### Configuration
-If required, define cover and copyright page numbers (zero based) as env variables: $COVER_PAGE and $COPYRIGHT_PAGE.
-
-### Use
-Run the script as `python3 main.py ./input_file.pdf /output/folder -m ./metadata.json`. Type `python3 main.py --help` for more info.
-
-Example:
-
-$ `python3 main.py Hobbs-Provincial-Press.pdf /dev/shm -m metadata.json`
-
-You may specify `--compress-output` to output a zip file containing all the curated (without the 'original', metadata less, files) chapter PDFs.
-
 ## Dev
 ### Git hooks
 Use `pre-commit.sh` as a pre commit git hook to build a test image that will run `flake8` to enforce PEP8 style.

diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,5 @@
 crossrefapi==1.4.0
 pymupdf==1.18.9
 roman==3.3
+typer==0.4.1
+thothlibrary==0.12.0
diff --git a/src/db.py b/src/db.py
@@ -0,0 +1,114 @@
+from crossref.restful import Works
+from thothlibrary import ThothClient
+from urllib.parse import urljoin
+import json
+import requests
+from typing import Dict, List
+
+
+class Crossref():
+    """Crossref compatibility layer.
+
+    Wraps the ``crossref.restful.Works`` client and exposes book and
+    chapter records as plain dictionaries.
+    """
+    def __init__(self, doi: str):
+        """Store the Crossref client and the DOI expressed as a doi.org URL"""
+        self.works = Works()
+        self.doi = urljoin('https://doi.org/', doi)
+
+    def get_book(self) -> Dict:
+        """Return the title and DOI of the book registered under self.doi
+
+        Raises:
+            AssertionError: if Crossref returns no record, or a record
+                without a title, for the DOI.
+        """
+        query = self.works.doi(self.doi)
+
+        # The title is needed later to look the chapters up, so a record
+        # without one is as unusable as no record at all
+        if not query or not query.get("title"):
+            raise AssertionError('Couldn\'t find a titled Crossref record'
+                                 + ' for the supplied DOI: ' + self.doi)
+
+        data = {"title": query.get("title")[0],
+                "doi":   query.get("DOI")}
+        return data
+
+    def get_chapters(self, book: Dict) -> List:
+        """Return the data of the chapters contained in the book
+
+        Args:
+            book: dictionary as returned by get_book(); only its "title"
+                value is read, as the container title to filter on.
+
+        Returns:
+            A list with one dictionary per chapter, with the keys "doi",
+            "author", "title", "publisher", "abstract", "pages" and
+            "licence". Values are None when the record lacks the field.
+
+        Raises:
+            AssertionError: if no chapter is found.
+        """
+        query = self.works.filter(container_title=book.get("title"),
+                                  type='book-chapter') \
+                          .select('DOI', 'license', 'author',
+                                  'title', 'type', 'page',
+                                  'publisher', 'abstract')
+
+        chapters = []
+        for chapter in query:
+            # List-valued fields may be missing from a record, so fall
+            # back to a one-item placeholder before indexing
+            titles = chapter.get("title") or [None]
+            licences = chapter.get("license") or [{}]
+
+            data = {"doi":       chapter.get("DOI"),
+                    "author":    self.join_author_names(chapter),
+                    "title":     titles[0],
+                    "publisher": chapter.get("publisher"),
+                    "abstract":  chapter.get("abstract"),
+                    "pages":     chapter.get("page"),
+                    "licence":   licences[0].get("URL")}
+            chapters.append(data)
+
+        # Assert that at least one DOI has been discovered. The test is
+        # made on the collected results because the query object is only
+        # evaluated when iterated (NOTE(review): confirm against the
+        # crossrefapi version in use)
+        if not chapters:
+            raise AssertionError('Couldn\'t find any chapter-level DOIs'
+                                 + ' for the supplied DOI')
+
+        return chapters
+
+    def join_author_names(self, chapter_data: Dict) -> str:
+        """Returns a string with author names, separated by semicolon
+
+        An empty string is returned when the record lists no author.
+        """
+        author_list = []
+
+        for author in chapter_data.get("author") or []:
+            # do not assume we know author's first name
+            full_name = [author.get("given", ""), author.get("family", "")]
+            author_list.append(" ".join(full_name).strip())
+
+        return '; '.join(author_list)
+
+
+class Thoth():
+    """Thoth compatibility layer.
+
+    Wraps ``ThothClient`` and the Thoth GraphQL endpoint and exposes book
+    and chapter records as plain dictionaries.
+    """
+    def __init__(self, doi: str):
+        """Store the Thoth client and the DOI expressed as a doi.org URL"""
+        self.thoth = ThothClient()
+        self.doi_url = urljoin('https://doi.org/', doi)
+
+    def get_book(self) -> Dict:
+        """Return the title and DOI of the work registered under the DOI"""
+        work = self.thoth.work_by_doi(doi=self.doi_url, raw=True)
+        work_dict = json.loads(work)['data']['workByDoi']
+
+        data = {"title": work_dict.get("fullTitle"),
+                "doi":   self.doi_url}
+        return data
+
+    def get_chapters(self, book: Dict) -> List:
+        """Return the data of the child works (chapters) of the book
+
+        Args:
+            book: dictionary as returned by get_book(); only its "doi"
+                value is read.
+
+        Returns:
+            A list with one dictionary per chapter, with the keys "doi",
+            "author", "title", "publisher", "abstract", "pages" and
+            "licence".
+
+        Raises:
+            SystemExit: if the request fails or the response holds no
+                work for the DOI.
+        """
+        # TODO replace this with a Thoth library method when available
+        url = 'https://api.thoth.pub/graphql'
+        query = {"query": """{ workByDoi (doi: "%s") {
+                                relations(relationTypes: HAS_CHILD) {
+                                    relatedWork {
+                                        fullTitle
+                                        copyrightHolder
+                                        longAbstract
+                                        pageInterval
+                                        doi
+                                        license
+                                        imprint {
+                                            imprintName
+                                            }
+                                        }
+                                    }
+                                }
+                             }""" % book.get("doi")}
+
+        try:
+            # Without a timeout an unresponsive server would hang the run
+            r = requests.post(url, json=query, timeout=30)
+            r.raise_for_status()
+        except requests.exceptions.RequestException as err:
+            # Covers HTTP errors as well as connection failures/timeouts
+            raise SystemExit(err)
+
+        r_dict = r.json()
+
+        # A failed GraphQL query is answered with null in place of the
+        # requested objects, so do not index into the response blindly
+        parent = (r_dict.get('data') or {}).get('workByDoi')
+        if parent is None:
+            raise SystemExit('Thoth returned no work for the supplied DOI: '
+                             + str(r_dict.get('errors')))
+
+        chapters = []
+        for relation in parent.get('relations') or []:
+            # `or {}` also covers keys that are present but null, which a
+            # .get() default alone would not
+            work = relation.get("relatedWork") or {}
+            imprint = work.get("imprint") or {}
+            data = {"doi":       work.get("doi"),
+                    "author":    work.get("copyrightHolder"),
+                    "title":     work.get("fullTitle"),
+                    "publisher": imprint.get("imprintName"),
+                    "abstract":  work.get("longAbstract"),
+                    "pages":     work.get("pageInterval"),
+                    "licence":   work.get("license")}
+            chapters.append(data)
+
+        return chapters
diff --git a/src/main.py b/src/main.py
@@ -2,55 +2,45 @@
 
 import os
 import tempfile
-import json
-from modules.core import Core
-from modules.pdf import Pdf
-from modules.metadata import Metadata
-from modules.checks import path_checks, file_checks, dependencies_checks
+import typer
+from pathlib import Path
+from pdf import Pdf
+from metadata import Metadata
+from shutil import copy2
 
+app = typer.Typer()
 
-def run():
-    # Destruction of the temporary directory on completion
-    with tempfile.TemporaryDirectory() as tmp_dir:
 
-        # Create core object instace
-        core = Core(tmp_dir)
+@app.command()
+def run(input_file:    Path = typer.Option("./file.pdf",
+                                           exists=True, resolve_path=True),
+        output_folder: Path = typer.Option("./output/",
+                                           exists=True, resolve_path=True),
+        doi:            str = typer.Argument(...),
+        database:       str = "thoth"):
 
-        # Checks
-        file_checks(core.argv.input_file)
-        file_checks(core.argv.metadata)
-        path_checks(core.argv.output_folder)
-        dependencies_checks()
+    with tempfile.TemporaryDirectory() as tmp_dir:
 
-        # Retrieve ISBN
-        json_file = os.path.abspath(core.argv.metadata)
-        with open(json_file) as json_data:
-            isbn = json.load(json_data)['isbn'].replace('-', '')
+        metadata = Metadata(database, doi=doi)
 
         # Create object instaces
-        metadata = Metadata(isbn)
-        pdf = Pdf(core.argv.input_file, tmp_dir)
+        pdf = Pdf(input_file, tmp_dir)
 
         # Iterate over chapters metadata
-        for chapter_data in metadata.chapters_data:
-            page_range = chapter_data['page'].split('-')
-            output_file_name = chapter_data['DOI'].split('/')[1] + '.pdf'
+        for chapter in metadata.get_chapters():
+            page_range = chapter.get("pages").split('-')
+            output_file_name = chapter.get("doi").split('/')[-1] + '.pdf'
 
             # Merge PDFs
             pdf.merge_pdfs(page_range, output_file_name)
 
             # Write metadata
             output_file_path = os.path.join(tmp_dir, output_file_name)
-            metadata.write_metadata(chapter_data, output_file_path)
+            metadata.write_metadata(chapter, output_file_path)
 
-        # PDFs are temporarely stored in tmp_dir
-        if core.argv.compress:
-            # Output a zip archive
-            core.output_archive(metadata.get_doi_suffix())
-        else:
-            # Output loose PDFs
-            core.output_pdfs()
+            # copy file to output dir
+            copy2(output_file_path, output_folder)
 
 
 if __name__ == '__main__':
-    run()
+    typer.run(run)