Render the HTML at runtime (#1285)
* Render the HTML at runtime

Even with this change there is still a dependency on the
`article.htmls` attribute produced during processing.

* Add the fetch_data function with better error handling (usage sketch below)

* Remove the metasearch view

* Remove unused code
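
A minimal sketch of how callers might consume `fetch_data` and the two new error classes; the `fetch_with_retries` helper, its backoff values, and the import path are illustrative assumptions, not part of this commit:

```python
# Hypothetical caller of fetch_data; the import path below is assumed from
# this commit's file layout (opac/webapp/main/views.py).
import time

from webapp.main.views import NonRetryableError, RetryableError, fetch_data


def fetch_with_retries(url, attempts=3, backoff=0.5):
    """Retry only on RetryableError (timeouts, connection drops) and give up
    immediately on NonRetryableError (4xx responses, malformed URLs)."""
    for attempt in range(attempts):
        try:
            return fetch_data(url)
        except RetryableError:
            if attempt == attempts - 1:
                raise  # out of attempts: surface the transient failure
            time.sleep(backoff * 2 ** attempt)  # exponential backoff
        # NonRetryableError is deliberately not caught: retrying cannot help.
```

Inside the views themselves the commit keeps things simpler: both error classes are caught and mapped to a 404, as `article_detail` and `get_content_from_ssm` do below.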
jamilatta authored and Cesar Augusto committed May 2, 2019
1 parent 50e6b93 commit e027ec3
Showing 4 changed files with 126 additions and 97 deletions.
199 changes: 123 additions & 76 deletions opac/webapp/main/views.py
@@ -2,6 +2,8 @@

import logging
import requests
import mimetypes
from io import BytesIO
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from datetime import datetime
@@ -24,6 +26,9 @@

from webapp.config.lang_names import display_original_lang_name

from lxml import etree
from packtools import HTMLGenerator

logger = logging.getLogger(__name__)

JOURNAL_UNPUBLISH = _("O periódico está indisponível por motivo de: ")
@@ -36,6 +41,40 @@ def url_external(endpoint, **kwargs):
return urljoin(request.url_root, url)


class RetryableError(Exception):
    """Recoverable error that does not require changing the state of the data
    on the client side, e.g., timeouts, errors caused by network partitioning,
    etc.
    """


class NonRetryableError(Exception):
    """Error that cannot be recovered from without changing the state of the
    data on the client side, e.g., the requested resource does not exist,
    the URI is invalid, etc.
    """


def fetch_data(url: str, timeout: float = 2) -> bytes:
try:
response = requests.get(url, timeout=timeout)
except (requests.ConnectionError, requests.Timeout) as exc:
raise RetryableError(exc) from exc
except (requests.InvalidSchema, requests.MissingSchema, requests.InvalidURL) as exc:
raise NonRetryableError(exc) from exc
else:
try:
response.raise_for_status()
except requests.HTTPError as exc:
if 400 <= exc.response.status_code < 500:
raise NonRetryableError(exc) from exc
elif 500 <= exc.response.status_code < 600:
raise RetryableError(exc) from exc
else:
raise

return response.content


@main.before_app_request
def add_collection_to_g():
if not hasattr(g, 'collection'):
@@ -791,6 +830,60 @@ def article_detail_pid(pid):
url_seg_article=article.url_segment))


def render_html_from_xml(article, lang):
result = fetch_data(normalize_ssm_url(article.xml))

xml = etree.parse(BytesIO(result))

generator = HTMLGenerator.parse(xml, valid_only=False)

    # Create a BeautifulSoup object from the generated HTML
soup = BeautifulSoup(etree.tostring(generator.generate(lang), encoding="UTF-8", method="html"), 'html.parser')

    # Slice out the div with id 'standalonearticle'
return soup.find('div', {'id': 'standalonearticle'}), generator.languages


def render_html_from_html(article, lang):
html_url = [html
for html in article.htmls
if html['lang'] == lang]

try:
html_url = html_url[0]['url']
except IndexError:
raise ValueError('Artigo não encontrado') from None

result = fetch_data(normalize_ssm_url(html_url))

html = result.decode('utf8')

text_languages = [html['lang'] for html in article.htmls]

return html, text_languages


def render_html(article, lang):
if article.xml:
return render_html_from_xml(article, lang)
elif article.htmls:
return render_html_from_html(article, lang)
else:
        # TODO: Fix the tests that expect the ``htmls`` attribute to exist.
        # Ideally this would raise a ValueError.
return '', []


# TODO: Remove as soon as the Article.xml value is consistent in the
# database
def normalize_ssm_url(url):
if url.startswith("http"):
parsed_url = urlparse(url)
return current_app.config["SSM_BASE_URI"] + parsed_url.path
else:
return current_app.config["SSM_BASE_URI"] + url


@main.route('/article/<string:url_seg>/<regex("\d{4}\.(\w+[-\.]?\w+[-\.]?)"):url_seg_issue>/<string:url_seg_article>/')
@main.route('/article/<string:url_seg>/<regex("\d{4}\.(\w+[-\.]?\w+[-\.]?)"):url_seg_issue>/<string:url_seg_article>/<regex("(?:\w{2})"):lang_code>/')
@main.route('/article/<string:url_seg>/<regex("\d{4}\.(\w+[-\.]?\w+[-\.]?)"):url_seg_issue>/<regex("(.*)"):url_seg_article>/')
@@ -833,9 +926,6 @@ def article_detail(url_seg, url_seg_issue, url_seg_article, lang_code=''):
if not article.journal.is_public:
abort(404, JOURNAL_UNPUBLISH + _(article.journal.unpublish_reason))

journal = article.journal
issue = article.issue

articles = controllers.get_articles_by_iid(issue.iid, is_public=True)

article_list = [_article for _article in articles]
@@ -858,66 +948,35 @@ def article_detail(url_seg, url_seg_issue, url_seg_article, lang_code=''):
except Exception:
abort(404, _('PDF do Artigo não encontrado'))

html_article = None

text_versions = None
if article.htmls:
try:
html_url = [html for html in article.htmls if html['lang'] == lang_code]

if len(html_url) != 1:
abort(404, _('HTML do Artigo não encontrado'))
else:
html_url = html_url[0]['url']

if html_url.startswith('http'): # http:// ou https://
html_url_parsed = urlparse(html_url)
html_full_ssm_url = current_app.config['SSM_BASE_URI'] + html_url_parsed.path
else:
html_full_ssm_url = current_app.config['SSM_BASE_URI'] + html_url

            # Fetch the HTML from the SSM
try:
result = requests.get(html_full_ssm_url)
except requests.exceptions.RequestException:
abort(404, _('HTML do Artigo não encontrado ou indisponível'))
else:
if result.status_code == 200 and len(result.content) > 0:

                    # Create a BeautifulSoup object
soup = BeautifulSoup(result.content.decode('utf-8'), 'html.parser')

                    # Slice out the div with id 'standalonearticle'
html_article = soup.find('div', {'id': 'standalonearticle'})
else:
abort(404, _('Artigo não encontrado'))

except IndexError:
abort(404, _('Artigo não encontrado'))
text_versions = sorted(
[
(
html['lang'],
display_original_lang_name(html['lang']),
url_for(
'main.article_detail',
url_seg=journal.url_segment,
url_seg_issue=issue.url_segment,
url_seg_article=article.url_segment,
lang_code=html['lang']
)
)
for html in article.htmls
]
)
try:
html, text_languages = render_html(article, lang_code)
except (ValueError, NonRetryableError, RetryableError):
abort(404, _('HTML do Artigo não encontrado ou indisponível'))

text_versions = sorted(
[
(
lang,
display_original_lang_name(lang),
url_for(
'main.article_detail',
url_seg=article.journal.url_segment,
url_seg_issue=article.issue.url_segment,
url_seg_article=article.url_segment,
lang_code=lang
)
)
for lang in text_languages
]
)

context = {
'next_article': next_article,
'previous_article': previous_article,
'article': article,
'journal': journal,
'journal': article.journal,
'issue': issue,
'html': html_article,
'html': html,
'pdfs': article.pdfs,
'pdf_urls_path': pdf_urls_path,
'article_lang': lang_code,
@@ -953,15 +1012,16 @@ def article_epdf():
@cache.cached(key_prefix=cache_key_with_lang_with_qs)
def get_content_from_ssm(resource_ssm_media_path):
resource_ssm_full_url = current_app.config['SSM_BASE_URI'] + resource_ssm_media_path

url = resource_ssm_full_url.strip()
mimetype, __ = mimetypes.guess_type(url)

try:
ssm_response = requests.get(resource_ssm_full_url.strip())
except Exception:
ssm_response = fetch_data(url)
except (NonRetryableError, RetryableError):
        abort(404, _('Recurso não encontrado'))
else:
if ssm_response.status_code == 200 and len(ssm_response.content) > 0:
return Response(ssm_response.content, mimetype=ssm_response.headers['Content-Type'])
else:
abort(404, _('Recurso não encontrado'))
return Response(ssm_response, mimetype=mimetype)


@main.route('/media/assets/<regex("(.*)"):relative_media_path>')
@@ -1098,19 +1158,6 @@ def router_legacy_pdf(journal_acron, issue_info, pdf_filename):
url_seg_issue=issue.url_segment,
url_seg_article=article_match.url_segment, lang_code=pdf_lang)

# ##################################Search#######################################


@main.route("/metasearch/", methods=['GET'])
@cache.cached(key_prefix=cache_key_with_lang_with_qs)
def metasearch():
url = request.args.get('url', current_app.config['URL_SEARCH'], type=str)
params = {}
for k, v in list(request.args.items()):
if k != 'url':
params[k] = v
xml = utils.do_request(url, request.args)
return Response(xml, mimetype='text/xml')

# ###############################E-mail share####################################

20 changes: 0 additions & 20 deletions opac/webapp/utils/utils.py
@@ -451,26 +451,6 @@ def get_resources_url(resource_list, type, lang):
return None


def do_request(url, params):
try:
response = requests.get(url, params=params)
except:
return None
if response.status_code == 200:
return response.content
return None


def do_request_json(url, params):
try:
response = requests.get(url, params=params)
except:
return {}
if response.status_code == 200:
return response.json()
return {}


def utc_to_local(utc_dt):
local_tz = pytz.timezone(current_app.config['LOCAL_ZONE'])

3 changes: 2 additions & 1 deletion requirements.txt
@@ -31,4 +31,5 @@ rq-dashboard==0.3.10
rq-scheduler==0.8.2
rq-scheduler-dashboard==0.0.2
lxml==4.2.4
Werkzeug==0.14.1
Werkzeug==0.14.1
packtools==2.4.3
1 change: 1 addition & 0 deletions setup.py
@@ -31,6 +31,7 @@
    # for production
'chaussette>=1.3',
'gevent>=1.1.0',
'packtools'
]

dependency_links = [