Skip to content

Commit

Permalink
Salva arquivos localmente e evita re-download
Browse files Browse the repository at this point in the history
  • Loading branch information
turicas committed Mar 17, 2021
1 parent 0165bf8 commit 0b8f71a
Showing 1 changed file with 19 additions and 1 deletion.
20 changes: 19 additions & 1 deletion transparenciagovbr/spiders/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import scrapy
from cached_property import cached_property

from transparenciagovbr import settings
from transparenciagovbr.utils.date import date_range, date_to_dict
from transparenciagovbr.utils.fields import Schema
from transparenciagovbr.utils.io import parse_zip
Expand All @@ -20,11 +21,15 @@ class TransparenciaBaseSpider(scrapy.Spider):
encoding = "iso-8859-1"
mirror_url = "https://data.brasil.io/mirror/transparenciagovbr/{dataset}/{filename}"

def __init__(self, use_mirror="False", *args, **kwargs):
def __init__(self, use_mirror="False", save_file="True", *args, **kwargs):
    """Initialise the spider's option flags and load its schema.

    Args:
        use_mirror: Flag compared case-insensitively against ``"true"``;
            any other value disables it. Presumably selects ``mirror_url``
            over the original source -- confirm in ``start_requests``.
        save_file: Flag compared case-insensitively against ``"true"``.
            When enabled, downloaded files are written under the path
            returned by ``make_filename`` and reused if already present.
        *args: Forwarded unchanged to the parent initialiser.
        **kwargs: Forwarded unchanged to the parent initialiser.
    """
    super().__init__(*args, **kwargs)
    # Command-line spider arguments arrive as strings, but programmatic
    # callers may pass real booleans; str() makes both forms work instead
    # of raising AttributeError on ``bool.lower``.
    self.use_mirror = str(use_mirror).lower() == "true"
    self.save_file = str(save_file).lower() == "true"
    self.schema = Schema(self.schema_filename)

def make_filename(self, url):
    """Return the local path used to store the file served at ``url``.

    The path is ``settings.DOWNLOAD_PATH / <spider name> / <basename>``,
    where the basename is whatever follows the last ``/`` in the URL path.
    """
    spider_dir = settings.DOWNLOAD_PATH / self.name
    url_path = urlparse(url).path
    basename = url_path.rsplit("/", maxsplit=1)[-1]
    return spider_dir / basename

def start_requests(self):
for date in date_range(
start=self.start_date, stop=self.end_date, interval=self.publish_frequency
Expand All @@ -35,9 +40,22 @@ def start_requests(self):
dataset=self.name,
filename=urlparse(url).path.rsplit("/", maxsplit=1)[-1],
)
elif self.save_file:
filename = self.make_filename(url)
if filename.exists():
url = f"file://{filename.absolute()}"
yield scrapy.Request(url, callback=self.parse_zip_response)

def parse_zip_response(self, response):
# If it's set to save file and the response comes from the Web, then
# save it to the disk.
if self.save_file and not response.request.url.startswith("file://"):
filename = self.make_filename(response.request.url)
if not filename.parent.exists():
filename.parent.mkdir(parents=True)
with open(filename, mode="wb") as fobj:
fobj.write(response.body)

data = parse_zip(
filename_or_fobj=io.BytesIO(response.body),
inner_filename_suffix=self.filename_suffix,
Expand Down

0 comments on commit 0b8f71a

Please sign in to comment.