Skip to content

Commit

Permalink
Retrieve documents also based on effective date
Browse files Browse the repository at this point in the history
  • Loading branch information
filips123 committed Sep 27, 2024
1 parent 279869b commit 082a628
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 17 deletions.
35 changes: 20 additions & 15 deletions API/gimvicurnik/updaters/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,15 +163,22 @@ def handle_document(self, document: DocumentInfo, span: Span) -> None:
span.set_tag("document.modified", document.modified)
span.set_tag("document.action", "crashed")

# == DOCUMENT EFFECTIVE

# Get the document's effective date using the subclassed method
# This may return None for documents without an effective date
# If this fails, we can't do anything other than to skip the document
effective = self.get_document_effective(document)

# == DOCUMENT RECORD (GET)

# Try to find an existing document record
record = self.retrieve_document(document)
record = self.retrieve_document(document, effective)

# == DOCUMENT PROCESSING

# Get the modified time if it is set, otherwise use the current time
created = document.created or datetime.datetime.utcnow()
created = document.created or datetime.datetime.now(datetime.UTC)
modified = document.modified or created

# Check if the document has changed without downloading it and comparing hashes
Expand All @@ -193,8 +200,8 @@ def handle_document(self, document: DocumentInfo, span: Span) -> None:
# If this fails, we can't do anything other than to skip the document
stream, new_hash = self.download_document(document)

# Check if the document hash has changed
if record and record.parsed and record.hash == new_hash:
# Check if the document hash or the document URL has changed
if record and record.parsed and record.hash == new_hash and record.url == document.url:
changed = False
else:
action = "updated"
Expand Down Expand Up @@ -233,11 +240,6 @@ def handle_document(self, document: DocumentInfo, span: Span) -> None:

return

# Get the document's effective date using the subclassed method
# This may return None for documents without an effective date
# If this fails, we can't do anything other than to skip the document
effective = self.get_document_effective(document)

if parsable:
# If there is no date, we can't do anything other than to skip the document
if not effective:
Expand Down Expand Up @@ -320,14 +322,17 @@ def handle_document(self, document: DocumentInfo, span: Span) -> None:
self.logger.info("Skipped because the %s document for %s is already stored", document.type.value, effective)
# fmt: on

def retrieve_document(self, document: DocumentInfo) -> Document | None:
def retrieve_document(self, document: DocumentInfo, effective: datetime.date | None) -> Document | None:
"""Get a document record from the database. May be set by subclasses."""

return (
self.session.query(Document)
.filter(Document.type == document.type, Document.url == document.url)
.first()
)
# Normally, the document URL should match
criterion = Document.url == document.url

if effective:
# If the effective date is set, a record may also match on it instead of the URL
criterion |= Document.effective == effective

return self.session.query(Document).filter(Document.type == document.type, criterion).first()

@with_span(op="download")
def download_document(self, document: DocumentInfo) -> tuple[BytesIO, str]:
Expand Down
4 changes: 2 additions & 2 deletions API/gimvicurnik/updaters/timetable.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import re
import typing
from collections import defaultdict
from datetime import datetime
from datetime import datetime, UTC
from hashlib import sha256

import requests
Expand Down Expand Up @@ -167,7 +167,7 @@ def _parse(self, document: Document | None, raw_data: str, new_hash: str, span:
created = False

document.type = DocumentType.TIMETABLE
document.modified = datetime.utcnow()
document.modified = datetime.now(UTC)
document.url = self.config.url
document.hash = new_hash
self.session.add(document)
Expand Down

0 comments on commit 082a628

Please sign in to comment.