Skip to content

Commit

Permalink
Merge pull request #302 from gitnnolabs/tk301
Browse files Browse the repository at this point in the history
Adicionar a capacidade de popular o campo related_articles
  • Loading branch information
gitnnolabs authored Oct 8, 2021
2 parents 76ea7df + 1fc2523 commit d47675c
Show file tree
Hide file tree
Showing 4 changed files with 146 additions and 5 deletions.
34 changes: 34 additions & 0 deletions airflow/dags/common/sps_package.py
Original file line number Diff line number Diff line change
Expand Up @@ -399,3 +399,37 @@ def is_document_deletion(self):
"""True if delete tag is present.
"""
return self.xmltree.find(".//article-id[@specific-use='delete']") is not None

@property
def related_articles(self):
"""Return a list of dict
Example:
"related_articles" : [
{
"doi" : "10.1590/S0103-50532006000200015",
"related_type" : "corrected-article"
},
{
"doi" : "10.1590/S0103-5053200600020098983",
"related_type" : "addendum"
},
{
"doi" : "10.1590/S0103-50532006000200015",
"related_type" : "retraction"
},
]
"""

related_list = []
for node in self.xmltree.findall(".//related-article"):
related_doi = node.attrib['{http://www.w3.org/1999/xlink}href']

related_dict = {}
related_dict['doi'] = related_doi
related_dict['related_type'] = node.attrib['related-article-type']

related_list.append(related_dict)

return related_list
98 changes: 97 additions & 1 deletion airflow/dags/operations/sync_kernel_to_website_operations.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,20 @@
import logging
from datetime import datetime
from re import match
from typing import Iterable, Generator, Dict, List, Tuple
from typing import Callable, Iterable, Generator, Dict, List, Tuple

import requests
from lxml import etree as et
from opac_schema.v1 import models

import common.hooks as hooks
from operations.exceptions import InvalidOrderValueError
from operations.docs_utils import (
get_bundle_id,
)

from common.sps_package import (
SPS_Package,
extract_number_and_supplment_from_issue_element,
)

Expand Down Expand Up @@ -79,6 +82,7 @@ def ArticleFactory(
document_order: int,
document_xml_url: str,
repeated_doc_pids=None,
fetch_document_xml:callable=None,
) -> models.Article:
"""Cria uma instância de artigo a partir dos dados de entrada.
Expand All @@ -91,6 +95,8 @@ def ArticleFactory(
issue_id (str): Identificador de issue.
document_order (int): Posição do artigo.
document_xml_url (str): URL do XML do artigo
fetch_document_xml (callable): Função para obter o XML do Kernel caso
necessário.
Returns:
models.Article: Instância de um artigo próprio do modelo de dados do
Expand Down Expand Up @@ -352,6 +358,86 @@ def _get_order(document_order, pid_v2):
except (ValueError, TypeError):
raise InvalidOrderValueError(order_err_msg)

def _update_related_articles(article, related_dict):
    """
    Update the related-document links on both sides of a relation.

    When ``article`` is one of ``["correction", "retraction", "addendum"]``
    pointing at a target document, the target document's
    ``related_articles`` is updated to point back at ``article`` as well,
    i.e. ``["correction", "retraction", "addendum"] -> document`` when
    ``document -> ["correction", "retraction", "addendum"]``.

    A lookup in the OPAC database is needed to obtain the ``_id``
    (pid_v3) of the related document so it can be stored in the relation.

    Args:
        article: the models.Article instance currently being processed.
        related_dict: dict describing the relation, e.g.
            {
                "doi": "10.1590/S0103-50532006000200015",
                "related_type": "retraction"
            }

    Side effects:
        Mutates ``article.related_articles`` (caller is expected to save
        ``article``), saves the related article, and adds a ``ref_id``
        key to ``related_dict``.
    """

    related_doi = related_dict.get('doi')

    # Data describing the *current* article, to be stored on the related one.
    article_data = {
        "ref_id": article._id,
        "doi": article.doi,
        "related_type": article.type,
    }

    if related_doi:
        try:
            related_article = models.Article.objects.get(doi=related_doi)
        except models.Article.DoesNotExist as ex:
            # Log the DOI that actually failed the lookup (the related
            # document's DOI), not the current article's DOI.
            logging.error("Não foi possível encontrar na base de dados do site o artigo com DOI: %s, portanto, não foi possível atualiza o related_articles do relacionado, com os dados: %s, erro: %s" % (related_doi, article_data, ex))
        else:

            related_article_model = models.RelatedArticle(**article_data)

            # Guarantees the uniqueness of the relation.
            if related_article_model not in related_article.related_articles:
                # Update ``related_article`` with the data of ``article``
                # since it exists in the database.
                related_article.related_articles += [related_article_model]
                related_article.save()

            # Record the ``_id`` of the related document so the reverse
            # relation (stored on ``article``) can reference it.
            related_dict['ref_id'] = related_article._id

            article_related_model = models.RelatedArticle(
                **related_dict)

            # Guarantees the uniqueness of the relation.
            if article_related_model not in article.related_articles:
                article.related_articles += [article_related_model]
                logging.info("Relacionamento entre o documento processado: %s e seu relacionado: %s, realizado com sucesso. Tipo de relação entre os documentos: %s" % (
                    article.doi, related_dict.get('doi'), related_dict.get('related_type')))


def _get_related_articles(xml):
    """
    Get the list of related documents from the XML and update the
    documents in that relation.

    XML tag that represents the relation:
        <related-article ext-link-type="doi" id="ra1"
            related-article-type="corrected-article"
            xlink:href="10.1590/S0103-50532006000200015"/>

    NOTE(review): ``article`` used below is neither a parameter nor
    defined in this function — this appears to rely on a closure over
    ``article`` from the enclosing scope (presumably ArticleFactory).
    Confirm this function is defined inside that scope; otherwise the
    loop raises NameError.
    """

    try:
        etree_xml = et.XML(xml)
    except ValueError as ex:
        # Parse failure is logged and the update is skipped entirely.
        # NOTE(review): lxml's XMLSyntaxError derives from SyntaxError,
        # not ValueError; et.XML raises ValueError e.g. for str input
        # carrying an encoding declaration — confirm which failure mode
        # this handler is meant to cover.
        logging.error("Erro ao tentar analisar(parser) do XML, erro: %s", ex)
    else:

        sps_package = SPS_Package(etree_xml)

        # Update both sides of every relation declared in the XML.
        for related_dict in sps_package.related_articles:
            _update_related_articles(article, related_dict)

article.authors = list(_get_article_authors(data))
article.authors_meta = _get_article_authors_meta(data)
article.languages = list(_get_languages(data))
Expand Down Expand Up @@ -403,6 +489,12 @@ def _get_order(document_order, pid_v2):
article.order = _get_order(document_order, article.pid)
article.xml = document_xml_url

# Se for uma errata ou retratação ou adendo.
if article.type in ["correction", "retraction", "addendum"]:
# Obtém o XML da errada no kernel
xml = fetch_document_xml(document_id)
_get_related_articles(xml)

# Campo de compatibilidade do OPAC
article.htmls = [{"lang": lang} for lang in _get_languages(data)]

Expand All @@ -417,6 +509,7 @@ def try_register_documents(
get_relation_data: callable,
fetch_document_front: callable,
article_factory: callable,
fetch_document_xml: callable,
) -> List[str]:
"""Registra documentos do Kernel na base de dados do `OPAC`.
Expand All @@ -433,6 +526,8 @@ def try_register_documents(
`front` do documento a partir da API do Kernel.
article_factory (callable): função que cria uma instância do modelo
de dados do Artigo na base do OPAC.
fetch_document_xml (callable): função que recupera XML
do documento a partir da API do Kernel.
Returns:
List[str] orphans: Lista contendo todos os identificadores dos
Expand Down Expand Up @@ -467,6 +562,7 @@ def try_register_documents(
item.get("order"),
document_xml_url,
repeated_doc_pids,
fetch_document_xml
)
document.save()
logging.info("ARTICLE saved %s %s" % (document_id, issue_id))
Expand Down
17 changes: 14 additions & 3 deletions airflow/dags/sync_kernel_to_website.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def _process_events(self, log):
return entities, last_timestamp


def fetch_data(endpoint):
def fetch_data(endpoint, json=True):
"""
Obtém o JSON do endpoint do Kernel
"""
Expand All @@ -111,7 +111,11 @@ def fetch_data(endpoint):
kernel_timeout = Variable.get("KERNEL_FETCH_DATA_TIMEOUT", default_var=None)
if kernel_timeout:
kwargs["timeout"] = int(kernel_timeout)
return kernel_connect(**kwargs).json()

if json:
return kernel_connect(**kwargs).json()
else:
return kernel_connect(**kwargs).content


def fetch_changes(since):
Expand Down Expand Up @@ -142,6 +146,13 @@ def fetch_documents_front(document_id):
return fetch_data("/documents/%s/front" % (document_id))


def fetch_documents_xml(document_id):
    """
    Retrieve the raw XML of the Kernel Document identified by
    ``document_id``.
    """
    endpoint = "/documents/%s" % (document_id)
    return fetch_data(endpoint, json=False)


def _get_relation_data_from_kernel_bundle(document_id, front_data=None):
"""
Obtém os dados do documento no bundle
Expand Down Expand Up @@ -727,7 +738,7 @@ def _get_known_documents(**kwargs) -> Dict[str, List[str]]:
)

orphans = try_register_documents(
documents_to_get, _get_relation_data, fetch_documents_front, ArticleFactory
documents_to_get, _get_relation_data, fetch_documents_front, ArticleFactory, fetch_documents_xml,
)

Variable.set("orphan_documents", orphans, serialize_json=True)
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,6 @@ deepdiff[murmur]==4.0.7
feedparser==5.2.1
beautifulsoup4==4.9.0
git+https://github.com/scieloorg/xylose.git@1.35.8#egg=xylose
git+https://github.com/scieloorg/opac_schema.git@v2.58#egg=opac_schema
git+https://github.com/scieloorg/opac_schema.git@v2.60#egg=opac_schema
git+https://github.com/scieloorg/packtools.git@2.6.4#egg=packtools
aiohttp==3.6.2

0 comments on commit d47675c

Please sign in to comment.