Render the HTML at runtime (#1285)
* Render the HTML at runtime

Even with this change there is still a dependency on the
`article.htmls` attribute produced during processing.

* Add the fetch_data function with better error handling (usage sketch below)

* Remove the metasearch view

* Remove unused code
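
A minimal sketch of how callers might consume `fetch_data` and the two new error classes; the `fetch_with_retries` helper, its backoff values, and the import path are illustrative assumptions, not part of this commit:

```python
# Hypothetical caller of fetch_data; the import path below is assumed from
# this commit's file layout (opac/webapp/main/views.py).
import time

from webapp.main.views import NonRetryableError, RetryableError, fetch_data


def fetch_with_retries(url, attempts=3, backoff=0.5):
    """Retry only on RetryableError (timeouts, connection drops) and give up
    immediately on NonRetryableError (4xx responses, malformed URLs)."""
    for attempt in range(attempts):
        try:
            return fetch_data(url)
        except RetryableError:
            if attempt == attempts - 1:
                raise  # out of attempts: surface the transient failure
            time.sleep(backoff * 2 ** attempt)  # exponential backoff
        # NonRetryableError is deliberately not caught: retrying cannot help.
```

Inside the views themselves the commit keeps things simpler: both error classes are caught and mapped to a 404, as `article_detail` and `get_content_from_ssm` do below.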
jamilatta authored and Cesar Augusto committed May 2, 2019
1 parent 50e6b93 commit e027ec3
Showing 4 changed files with 126 additions and 97 deletions.
199 changes: 123 additions & 76 deletions opac/webapp/main/views.py
@@ -2,6 +2,8 @@

import logging
import requests
import mimetypes
from io import BytesIO
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from datetime import datetime
@@ -24,6 +26,9 @@

from webapp.config.lang_names import display_original_lang_name

from lxml import etree
from packtools import HTMLGenerator

logger = logging.getLogger(__name__)

JOURNAL_UNPUBLISH = _("O periódico está indisponível por motivo de: ")
@@ -36,6 +41,40 @@ def url_external(endpoint, **kwargs):
return urljoin(request.url_root, url)


class RetryableError(Exception):
    """Recoverable error that does not require changing the state of the data
    on the client side, e.g., timeouts, errors caused by network partitioning,
    etc.
    """


class NonRetryableError(Exception):
    """Error that cannot be recovered from without changing the state of the
    data on the client side, e.g., the requested resource does not exist,
    the URI is invalid, etc.
    """


def fetch_data(url: str, timeout: float = 2) -> bytes:
try:
response = requests.get(url, timeout=timeout)
except (requests.ConnectionError, requests.Timeout) as exc:
raise RetryableError(exc) from exc
except (requests.InvalidSchema, requests.MissingSchema, requests.InvalidURL) as exc:
raise NonRetryableError(exc) from exc
else:
try:
response.raise_for_status()
except requests.HTTPError as exc:
if 400 <= exc.response.status_code < 500:
raise NonRetryableError(exc) from exc
elif 500 <= exc.response.status_code < 600:
raise RetryableError(exc) from exc
else:
raise

return response.content


@main.before_app_request
def add_collection_to_g():
if not hasattr(g, 'collection'):
@@ -791,6 +830,60 @@ def article_detail_pid(pid):
url_seg_article=article.url_segment))


def render_html_from_xml(article, lang):
result = fetch_data(normalize_ssm_url(article.xml))

xml = etree.parse(BytesIO(result))

generator = HTMLGenerator.parse(xml, valid_only=False)

    # Create a BeautifulSoup object from the generated HTML
soup = BeautifulSoup(etree.tostring(generator.generate(lang), encoding="UTF-8", method="html"), 'html.parser')

    # Slice out the div with id 'standalonearticle'
return soup.find('div', {'id': 'standalonearticle'}), generator.languages


def render_html_from_html(article, lang):
html_url = [html
for html in article.htmls
if html['lang'] == lang]

try:
html_url = html_url[0]['url']
except IndexError:
raise ValueError('Artigo não encontrado') from None

result = fetch_data(normalize_ssm_url(html_url))

html = result.decode('utf8')

text_languages = [html['lang'] for html in article.htmls]

return html, text_languages


def render_html(article, lang):
if article.xml:
return render_html_from_xml(article, lang)
elif article.htmls:
return render_html_from_html(article, lang)
else:
        # TODO: Fix the tests that expect the ``htmls`` attribute to exist.
        # Ideally this would raise a ValueError.
return '', []


# TODO: Remove as soon as the Article.xml value is consistent in the
# database
def normalize_ssm_url(url):
if url.startswith("http"):
parsed_url = urlparse(url)
return current_app.config["SSM_BASE_URI"] + parsed_url.path
else:
return current_app.config["SSM_BASE_URI"] + url


@main.route('/article/<string:url_seg>/<regex("\d{4}\.(\w+[-\.]?\w+[-\.]?)"):url_seg_issue>/<string:url_seg_article>/')
@main.route('/article/<string:url_seg>/<regex("\d{4}\.(\w+[-\.]?\w+[-\.]?)"):url_seg_issue>/<string:url_seg_article>/<regex("(?:\w{2})"):lang_code>/')
@main.route('/article/<string:url_seg>/<regex("\d{4}\.(\w+[-\.]?\w+[-\.]?)"):url_seg_issue>/<regex("(.*)"):url_seg_article>/')
@@ -833,9 +926,6 @@ def article_detail(url_seg, url_seg_issue, url_seg_article, lang_code=''):
if not article.journal.is_public:
abort(404, JOURNAL_UNPUBLISH + _(article.journal.unpublish_reason))

journal = article.journal
issue = article.issue

articles = controllers.get_articles_by_iid(issue.iid, is_public=True)

article_list = [_article for _article in articles]
@@ -858,66 +948,35 @@ def article_detail(url_seg, url_seg_issue, url_seg_article, lang_code=''):
except Exception:
abort(404, _('PDF do Artigo não encontrado'))

html_article = None

text_versions = None
if article.htmls:
try:
html_url = [html for html in article.htmls if html['lang'] == lang_code]

if len(html_url) != 1:
abort(404, _('HTML do Artigo não encontrado'))
else:
html_url = html_url[0]['url']

if html_url.startswith('http'): # http:// ou https://
html_url_parsed = urlparse(html_url)
html_full_ssm_url = current_app.config['SSM_BASE_URI'] + html_url_parsed.path
else:
html_full_ssm_url = current_app.config['SSM_BASE_URI'] + html_url

            # Fetch the HTML from the SSM
try:
result = requests.get(html_full_ssm_url)
except requests.exceptions.RequestException:
abort(404, _('HTML do Artigo não encontrado ou indisponível'))
else:
if result.status_code == 200 and len(result.content) > 0:

                    # Create a BeautifulSoup object
soup = BeautifulSoup(result.content.decode('utf-8'), 'html.parser')

                    # Slice out the div with id 'standalonearticle'
html_article = soup.find('div', {'id': 'standalonearticle'})
else:
abort(404, _('Artigo não encontrado'))

except IndexError:
abort(404, _('Artigo não encontrado'))
text_versions = sorted(
[
(
html['lang'],
display_original_lang_name(html['lang']),
url_for(
'main.article_detail',
url_seg=journal.url_segment,
url_seg_issue=issue.url_segment,
url_seg_article=article.url_segment,
lang_code=html['lang']
)
)
for html in article.htmls
]
)
try:
html, text_languages = render_html(article, lang_code)
except (ValueError, NonRetryableError, RetryableError):
abort(404, _('HTML do Artigo não encontrado ou indisponível'))

text_versions = sorted(
[
(
lang,
display_original_lang_name(lang),
url_for(
'main.article_detail',
url_seg=article.journal.url_segment,
url_seg_issue=article.issue.url_segment,
url_seg_article=article.url_segment,
lang_code=lang
)
)
for lang in text_languages
]
)

context = {
'next_article': next_article,
'previous_article': previous_article,
'article': article,
'journal': journal,
'journal': article.journal,
'issue': issue,
'html': html_article,
'html': html,
'pdfs': article.pdfs,
'pdf_urls_path': pdf_urls_path,
'article_lang': lang_code,
@@ -953,15 +1012,16 @@ def article_epdf():
@cache.cached(key_prefix=cache_key_with_lang_with_qs)
def get_content_from_ssm(resource_ssm_media_path):
resource_ssm_full_url = current_app.config['SSM_BASE_URI'] + resource_ssm_media_path

url = resource_ssm_full_url.strip()
mimetype, __ = mimetypes.guess_type(url)

try:
ssm_response = requests.get(resource_ssm_full_url.strip())
except Exception:
ssm_response = fetch_data(url)
except (NonRetryableError, RetryableError):
        abort(404, _('Recurso não encontrado'))
else:
if ssm_response.status_code == 200 and len(ssm_response.content) > 0:
return Response(ssm_response.content, mimetype=ssm_response.headers['Content-Type'])
else:
abort(404, _('Recurso não encontrado'))
return Response(ssm_response, mimetype=mimetype)


@main.route('/media/assets/<regex("(.*)"):relative_media_path>')
@@ -1098,19 +1158,6 @@ def router_legacy_pdf(journal_acron, issue_info, pdf_filename):
url_seg_issue=issue.url_segment,
url_seg_article=article_match.url_segment, lang_code=pdf_lang)

# ##################################Search#######################################


@main.route("/metasearch/", methods=['GET'])
@cache.cached(key_prefix=cache_key_with_lang_with_qs)
def metasearch():
url = request.args.get('url', current_app.config['URL_SEARCH'], type=str)
params = {}
for k, v in list(request.args.items()):
if k != 'url':
params[k] = v
xml = utils.do_request(url, request.args)
return Response(xml, mimetype='text/xml')

# ###############################E-mail share####################################

20 changes: 0 additions & 20 deletions opac/webapp/utils/utils.py
@@ -451,26 +451,6 @@ def get_resources_url(resource_list, type, lang):
return None


def do_request(url, params):
try:
response = requests.get(url, params=params)
except:
return None
if response.status_code == 200:
return response.content
return None


def do_request_json(url, params):
try:
response = requests.get(url, params=params)
except:
return {}
if response.status_code == 200:
return response.json()
return {}


def utc_to_local(utc_dt):
local_tz = pytz.timezone(current_app.config['LOCAL_ZONE'])

3 changes: 2 additions & 1 deletion requirements.txt
@@ -31,4 +31,5 @@ rq-dashboard==0.3.10
rq-scheduler==0.8.2
rq-scheduler-dashboard==0.0.2
lxml==4.2.4
Werkzeug==0.14.1
Werkzeug==0.14.1
packtools==2.4.3
1 change: 1 addition & 0 deletions setup.py
@@ -31,6 +31,7 @@
    # for production
'chaussette>=1.3',
'gevent>=1.1.0',
'packtools'
]

dependency_links = [