Skip to content

Commit

Permalink
Merge branch 'release/0.0.8'
Browse files Browse the repository at this point in the history
  • Loading branch information
rhigman committed Jul 24, 2023
2 parents 9ccf05b + 1ddbe51 commit de77c3f
Show file tree
Hide file tree
Showing 5 changed files with 83 additions and 6 deletions.
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ Options:
--input-file PATH [default: ./file.pdf]
--output-folder PATH [default: ./output/]
--database TEXT [default: thoth]
--write-urls / --no-write-urls [default: write-urls]
--help Show this message and exit.
```

Expand All @@ -34,11 +35,19 @@ $ `python3 ./main.py --input-file my_file.pdf --output-folder ~/output \
$ `COVER_PAGE=0`
$ `COPYRIGHT_PAGE=4`

The `--write-urls` option (enabled by default; pass `--no-write-urls` to disable it) attempts to write the appropriate OBP-specific Landing Page URL and Full Text URL to Thoth for each chapter created. This requires Thoth login credentials, which must be provided via the environment variables `THOTH_EMAIL` and `THOTH_PWD`.

$ `THOTH_EMAIL=email@example.com`
$ `THOTH_PWD=password`
$ `python3 ./main.py --input-file my_file.pdf --output-folder ~/output \
--database thoth --write-urls 10.11647/obp.0309`

## Running with docker
To run the command shown above in Docker:
```
docker run --rm \
-e THOTH_EMAIL=email@example.com \
-e THOTH_PWD=password \
-v /path/to/local.pdf:/ebook_automation/file.pdf \
-v /path/to/output:/ebook_automation/output \
openbookpublishers/chapter-splitter \
Expand Down
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
crossrefapi==1.4.0
pymupdf==1.18.9
pymupdf==1.22.5
roman==3.3
typer==0.4.1
thothlibrary==0.12.0
thothlibrary==0.20.1
55 changes: 54 additions & 1 deletion src/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@
from urllib.parse import urljoin
import json
import requests
from os import getenv
from typing import Dict, List


class Db():
"""Base Db class to derive specialised database classes from"""

def __init__(self, doi: str) -> None:
self.db = self.init_db()
self.doi = urljoin('https://doi.org/', doi)
Expand Down Expand Up @@ -82,6 +84,7 @@ def join_author_names(self, chapter_data: Dict) -> str:

class Thoth(Db):
"""Thoth compatibility layer"""

def init_db(self):
    """Create a new ThothClient instance and return it as the database object"""
    client = ThothClient()
    return client
Expand All @@ -102,6 +105,7 @@ def get_chapters(self, book: Dict) -> List:
query = {"query": """{ workByDoi (doi: "%s") {
relations(relationTypes: HAS_CHILD) {
relatedWork {
workId
fullTitle
copyrightHolder
longAbstract
Expand Down Expand Up @@ -133,7 +137,56 @@ def get_chapters(self, book: Dict) -> List:
"publisher": work.get("imprint", {}).get("imprintName"),
"abstract": work.get("longAbstract"),
"pages": work.get("pageInterval"),
"licence": work.get("license")}
"licence": work.get("license"),
"workId": work.get("workId")}
chapters.append(data)

return chapters

def write_urls(self, chapter):
    """Write Landing Page and Full Text URLs to Thoth.

    Logs in through ``self.db``, creates a PDF publication for the
    chapter's work, then creates a canonical location on that publication
    holding the landing page and full text URLs built from the chapter DOI.

    Args:
        chapter: chapter metadata dict; the "doi" and "workId" keys are
            read.

    Raises:
        KeyError: if the chapter has no DOI, or if the THOTH_EMAIL or
            THOTH_PWD environment variable is not set.

    Exceptions raised by the client calls (login, create_publication,
    create_location) are not handled here and propagate to the caller.
    """
    doi = chapter.get("doi")
    if not doi:
        # Fail with the same exception type used for missing credentials
        # below, so a caller that tolerates KeyError can skip this chapter
        # instead of crashing on None.split() or writing URLs built from
        # an empty DOI.
        raise KeyError('No DOI provided for chapter')

    # Only the part after the last '/' is used in the URLs; the book
    # identifier is that suffix with its final dot-separated part removed.
    chapter_doi = doi.split('/')[-1].lower()
    book_doi = chapter_doi.rpartition('.')[0]
    landing_page_root = (
        'https://www.openbookpublishers.com/books/10.11647/'
        '{book_doi}/chapters/10.11647/{chapter_doi}')
    full_text_url_root = (
        'https://www.books.openbookpublishers.com/10.11647/'
        '{chapter_doi}.pdf')

    username = getenv('THOTH_EMAIL')
    password = getenv('THOTH_PWD')
    if username is None:
        raise KeyError(
            'No Thoth username provided '
            '(THOTH_EMAIL environment variable not set)')
    if password is None:
        raise KeyError(
            'No Thoth password provided '
            '(THOTH_PWD environment variable not set)')

    self.db.login(username, password)

    # Only the work ID and publication type are populated; every other
    # publication field is sent explicitly as None.
    publication = {"workId": chapter.get("workId"),
                   "publicationType": "PDF",
                   "isbn": None,
                   "widthMm": None,
                   "widthIn": None,
                   "heightMm": None,
                   "heightIn": None,
                   "depthMm": None,
                   "depthIn": None,
                   "weightG": None,
                   "weightOz": None}
    publication_id = self.db.create_publication(publication)

    # The location is attached to the publication that was just created.
    location = {"publicationId": publication_id,
                "landingPage": landing_page_root.format(
                    book_doi=book_doi, chapter_doi=chapter_doi),
                "fullTextUrl": full_text_url_root.format(
                    chapter_doi=chapter_doi),
                "locationPlatform": "OTHER",
                "canonical": "true"}
    self.db.create_location(location)

    print('{}: URLs written to Thoth'.format(chapter_doi))
14 changes: 12 additions & 2 deletions src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from pdf import Pdf
from metadata import Metadata
from shutil import copy2
from thothlibrary import ThothError
import re

app = typer.Typer()
Expand All @@ -18,7 +19,8 @@ def run(input_file: Path = typer.Option("./file.pdf",
output_folder: Path = typer.Option("./output/",
exists=True, resolve_path=True),
doi: str = typer.Argument(...),
database: str = "thoth"):
database: str = "thoth",
write_urls: bool = True):

with tempfile.TemporaryDirectory() as tmp_dir:

Expand All @@ -37,13 +39,21 @@ def run(input_file: Path = typer.Option("./file.pdf",
# Merge PDFs
pdf.merge_pdfs(page_range, output_file_name)

# Write metadata
# Write metadata to chapter PDF
output_file_path = os.path.join(tmp_dir, output_file_name)
metadata.write_metadata(chapter, output_file_path)

# copy file to output dir
copy2(output_file_path, output_folder)

if write_urls:
# Write chapter URL metadata to database
try:
metadata.write_urls(chapter)
except (KeyError, ThothError) as e:
# Continue on error, but display warning
print('Error writing URLs to {}: {}'.format(database, e))


if __name__ == '__main__':
typer.run(run)
7 changes: 6 additions & 1 deletion src/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ class Chapter:
doi: str = None
licence: str = None
publisher: str = None
workId: str = None

@classmethod
def from_dict(cls, d):
Expand Down Expand Up @@ -77,6 +78,10 @@ def get_chapters(self) -> List[Dict]:

return [chapter.to_dict() for chapter in self.chapters]

def write_urls(self, chapter_dict):
    """Pass the chapter's URL metadata on to the database layer.

    Delegates to the ``write_urls`` method of ``self.db``, handing over
    ``chapter_dict`` unchanged. Nothing is returned.
    """
    database = self.db
    database.write_urls(chapter_dict)

@staticmethod
def write_metadata(chapter_dict, output_file_path):
"""Writes metadata to file_name"""
Expand All @@ -100,7 +105,7 @@ def write_metadata(chapter_dict, output_file_path):
cmd.append(output_file_path)

run(cmd)
print('{}: Metadata written'
print('{}: Metadata written to PDF'
.format(path.split(output_file_path)[1]))

@staticmethod
Expand Down

0 comments on commit de77c3f

Please sign in to comment.