From 60e67275800137d0595cf11eae048618fb972467 Mon Sep 17 00:00:00 2001 From: rhigman <73792779+rhigman@users.noreply.github.com> Date: Thu, 20 Jul 2023 12:03:47 +0100 Subject: [PATCH 1/2] Update required package versions --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index ad31ecc..39dd8d0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ crossrefapi==1.4.0 -pymupdf==1.18.9 +pymupdf==1.22.5 roman==3.3 typer==0.4.1 -thothlibrary==0.12.0 +thothlibrary==0.20.1 From 1ddbe51b4f3bc461a4d6df348348a0bf0c31c24c Mon Sep 17 00:00:00 2001 From: rhigman <73792779+rhigman@users.noreply.github.com> Date: Thu, 20 Jul 2023 12:29:03 +0100 Subject: [PATCH 2/2] Write Landing Page and Full Text URLs to Thoth automatically on chapter creation --- README.md | 9 ++++++++ src/db.py | 55 ++++++++++++++++++++++++++++++++++++++++++++++++- src/main.py | 14 +++++++++++-- src/metadata.py | 7 ++++++- 4 files changed, 81 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 608d7c9..5597af7 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,7 @@ Options: --input-file PATH [default: ./file.pdf] --output-folder PATH [default: ./output/] --database TEXT [default: thoth] + --write-urls / --no-write-urls [default: write-urls] --help Show this message and exit. ``` @@ -34,11 +35,19 @@ $ `python3 ./main.py --input-file my_file.pdf --output-folder ~/output \ $ `COVER_PAGE=0` $ `COPYRIGHT_PAGE=4` +The `--write-urls` option attempts to write the appropriate OBP-specific Landing Page URL and Full Text URL to Thoth for each chapter created. For this, it is necessary to provide Thoth login credentials via the environment variables `THOTH_EMAIL` and `THOTH_PWD`. 
+ +$ `THOTH_EMAIL=email@example.com` +$ `THOTH_PWD=password` +$ `python3 ./main.py --input-file my_file.pdf --output-folder ~/output \ + --database thoth --write-urls 10.11647/obp.0309` ## Running with docker Running the command reported above in docker would be: ``` docker run --rm \ + -e THOTH_EMAIL=email@example.com \ + -e THOTH_PWD=password \ -v /path/to/local.pdf:/ebook_automation/file.pdf \ -v /path/to/output:/ebook_automation/output \ openbookpublishers/chapter-splitter \ diff --git a/src/db.py b/src/db.py index c95daef..fb3042e 100644 --- a/src/db.py +++ b/src/db.py @@ -3,11 +3,13 @@ from urllib.parse import urljoin import json import requests +from os import getenv from typing import Dict, List class Db(): """Base Db class to derive specialised database classes from""" + def __init__(self, doi: str) -> None: self.db = self.init_db() self.doi = urljoin('https://doi.org/', doi) @@ -82,6 +84,7 @@ def join_author_names(self, chapter_data: Dict) -> str: class Thoth(Db): """Thoth compatibility layer""" + def init_db(self): """Init database object""" return ThothClient() @@ -102,6 +105,7 @@ def get_chapters(self, book: Dict) -> List: query = {"query": """{ workByDoi (doi: "%s") { relations(relationTypes: HAS_CHILD) { relatedWork { + workId fullTitle copyrightHolder longAbstract @@ -133,7 +137,56 @@ def get_chapters(self, book: Dict) -> List: "publisher": work.get("imprint", {}).get("imprintName"), "abstract": work.get("longAbstract"), "pages": work.get("pageInterval"), - "licence": work.get("license")} + "licence": work.get("license"), + "workId": work.get("workId")} chapters.append(data) return chapters + + def write_urls(self, chapter): + """Write Landing Page and Full Text URLs to Thoth""" + chapter_doi = chapter.get("doi").split('/')[-1].lower() + book_doi = chapter_doi.rpartition('.')[0] + landing_page_root = ( + 'https://www.openbookpublishers.com/books/10.11647/' + '{book_doi}/chapters/10.11647/{chapter_doi}') + full_text_url_root = ( + 
'https://www.books.openbookpublishers.com/10.11647/' + '{chapter_doi}.pdf') + + username = getenv('THOTH_EMAIL') + password = getenv('THOTH_PWD') + if username is None: + raise KeyError( + 'No Thoth username provided ' + '(THOTH_EMAIL environment variable not set)') + if password is None: + raise KeyError( + 'No Thoth password provided ' + '(THOTH_PWD environment variable not set)') + + self.db.login(username, password) + + publication = {"workId": chapter.get("workId"), + "publicationType": "PDF", + "isbn": None, + "widthMm": None, + "widthIn": None, + "heightMm": None, + "heightIn": None, + "depthMm": None, + "depthIn": None, + "weightG": None, + "weightOz": None} + publication_id = self.db.create_publication(publication) + + location = {"publicationId": publication_id, + "landingPage": landing_page_root.format( + book_doi=book_doi, chapter_doi=chapter_doi), + "fullTextUrl": full_text_url_root.format( + book_doi=book_doi, chapter_doi=chapter_doi), + "locationPlatform": "OTHER", + "canonical": "true"} + self.db.create_location(location) + + print('{}: URLs written to Thoth'.format(chapter_doi)) diff --git a/src/main.py b/src/main.py index 746220a..2c05a03 100644 --- a/src/main.py +++ b/src/main.py @@ -7,6 +7,7 @@ from pdf import Pdf from metadata import Metadata from shutil import copy2 +from thothlibrary import ThothError import re app = typer.Typer() @@ -18,7 +19,8 @@ def run(input_file: Path = typer.Option("./file.pdf", output_folder: Path = typer.Option("./output/", exists=True, resolve_path=True), doi: str = typer.Argument(...), - database: str = "thoth"): + database: str = "thoth", + write_urls: bool = True): with tempfile.TemporaryDirectory() as tmp_dir: @@ -37,13 +39,21 @@ def run(input_file: Path = typer.Option("./file.pdf", # Merge PDFs pdf.merge_pdfs(page_range, output_file_name) - # Write metadata + # Write metadata to chapter PDF output_file_path = os.path.join(tmp_dir, output_file_name) metadata.write_metadata(chapter, output_file_path) # copy file 
to output dir copy2(output_file_path, output_folder) + if write_urls: + # Write chapter URL metadata to database + try: + metadata.write_urls(chapter) + except (KeyError, ThothError) as e: + # Continue on error, but display warning + print('Error writing URLs to {}: {}'.format(database, e)) + if __name__ == '__main__': typer.run(run) diff --git a/src/metadata.py b/src/metadata.py index 1573425..b13533c 100644 --- a/src/metadata.py +++ b/src/metadata.py @@ -31,6 +31,7 @@ class Chapter: doi: str = None licence: str = None publisher: str = None + workId: str = None @classmethod def from_dict(cls, d): @@ -77,6 +78,10 @@ def get_chapters(self) -> List[Dict]: return [chapter.to_dict() for chapter in self.chapters] + def write_urls(self, chapter_dict): + """Write URL metadata to Thoth""" + self.db.write_urls(chapter_dict) + @staticmethod def write_metadata(chapter_dict, output_file_path): """Writes metadata to file_name""" @@ -100,7 +105,7 @@ def write_metadata(chapter_dict, output_file_path): cmd.append(output_file_path) run(cmd) - print('{}: Metadata written' + print('{}: Metadata written to PDF' .format(path.split(output_file_path)[1])) @staticmethod