diff --git a/API/gimvicurnik/updaters/base.py b/API/gimvicurnik/updaters/base.py index a17e6b1..58d3320 100644 --- a/API/gimvicurnik/updaters/base.py +++ b/API/gimvicurnik/updaters/base.py @@ -163,15 +163,22 @@ def handle_document(self, document: DocumentInfo, span: Span) -> None: span.set_tag("document.modified", document.modified) span.set_tag("document.action", "crashed") + # == DOCUMENT EFFECTIVE + + # Get the document's effective date using the subclassed method + # This may return none for documents without an effective date + # If this fails, we can't do anything other than to skip the document + effective = self.get_document_effective(document) + # == DOCUMENT RECORD (GET) # Try to find an existing document record - record = self.retrieve_document(document) + record = self.retrieve_document(document, effective) # == DOCUMENT PROCESSING # Get the modified time if it is set, otherwise use the current time - created = document.created or datetime.datetime.utcnow() + created = document.created or datetime.datetime.now(datetime.timezone.utc) modified = document.modified or created # Check if the document has changed without downloading it and comparing hashes @@ -193,8 +200,8 @@ def handle_document(self, document: DocumentInfo, span: Span) -> None: # If this fails, we can't do anything other than to skip the document stream, new_hash = self.download_document(document) - # Check if the document hash has changed - if record and record.parsed and record.hash == new_hash: + # Check if the document hash or document URL have changed + if record and record.parsed and record.hash == new_hash and record.url == document.url: changed = False else: action = "updated" @@ -233,11 +240,6 @@ def handle_document(self, document: DocumentInfo, span: Span) -> None: return - # Get the document's effective date using the subclassed method - # This may return none for documents without an effective date - # If this fails, we can't do anything other than to skip the document - effective = self.get_document_effective(document) - if parsable: # If there is no date, we can't do anything other than to skip the document if not effective: @@ -320,14 +322,17 @@ def handle_document(self, document: DocumentInfo, span: Span) -> None: self.logger.info("Skipped because the %s document for %s is already stored", document.type.value, effective) # fmt: on - def retrieve_document(self, document: DocumentInfo) -> Document | None: + def retrieve_document(self, document: DocumentInfo, effective: datetime.date | None) -> Document | None: """Get a document record from the database. May be set by subclasses.""" - return ( - self.session.query(Document) - .filter(Document.type == document.type, Document.url == document.url) - .first() - ) + # Normally, the document URL should match + criterion = Document.url == document.url + + if effective: + # If effective date is set, it may also match instead of the URL + criterion |= Document.effective == effective + + return self.session.query(Document).filter(Document.type == document.type, criterion).first() @with_span(op="download") def download_document(self, document: DocumentInfo) -> tuple[BytesIO, str]: diff --git a/API/gimvicurnik/updaters/menu.py b/API/gimvicurnik/updaters/menu.py index d5affe2..80df918 100644 --- a/API/gimvicurnik/updaters/menu.py +++ b/API/gimvicurnik/updaters/menu.py @@ -90,7 +90,7 @@ def get_document_effective(self, document: DocumentInfo) -> datetime.date: # jedilnik-kosilo-YYYY-MM-DD(-popravek).pdf # jedilnik-malica-YYYY-MM-DD(-popravek).pdf date = re.search( - r"jedilnik-(?:kosilo|malica)-(\d+)-(\d+)-(\d+)(?:-[\w-]*)?\.(?:pdf|xlsx)", document.url + r"jedilnik-(?:kosilo|malica|K|M)-(\d+)-(\d+)-(\d+)(?:-[\w-]*)?\.(?:pdf|xlsx)", document.url ) # The specified date is commonly Monday of the effective week diff --git a/API/gimvicurnik/updaters/timetable.py b/API/gimvicurnik/updaters/timetable.py index 11c2a81..a8015cc 100644 --- a/API/gimvicurnik/updaters/timetable.py +++ b/API/gimvicurnik/updaters/timetable.py @@ -4,7 +4,7 @@ import re import typing from collections import defaultdict -from datetime import datetime +from datetime import datetime, timezone from hashlib import sha256 import requests @@ -167,7 +167,7 @@ def _parse(self, document: Document | None, raw_data: str, new_hash: str, span: created = False document.type = DocumentType.TIMETABLE - document.modified = datetime.utcnow() + document.modified = datetime.now(timezone.utc) document.url = self.config.url document.hash = new_hash self.session.add(document) diff --git a/website/src/components/TimetableDisplay.vue b/website/src/components/TimetableDisplay.vue index 10749bb..715fa36 100644 --- a/website/src/components/TimetableDisplay.vue +++ b/website/src/components/TimetableDisplay.vue @@ -1,13 +1,14 @@