Skip to content

Commit

Permalink
Merge pull request #23 from avdata99/create_siu_lib
Browse files Browse the repository at this point in the history
Usar librería SIU
  • Loading branch information
avdata99 authored Aug 26, 2020
2 parents f28229b + a08dda7 commit 74b0c4a
Show file tree
Hide file tree
Showing 49 changed files with 147,472 additions and 1,537 deletions.
3 changes: 1 addition & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -41,5 +41,4 @@ coverage.xml
# Sphinx documentation
docs/_build/

ckanext/siu_harvester/harvesters/siu_transp_data/tmp.md
ckanext/siu_harvester/harvesters/siu_transp_data/results
ckanext/siu_harvester/harvesters/tmp.md
6 changes: 2 additions & 4 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,8 @@ services:
- redis
install:
- bash bin/travis-build.bash
- pip install coveralls
- pip install -r dev-requirements.txt
script: sh bin/travis-run.sh
after_success:
- coveralls
deploy:
provider: pypi
username: "__token__"
Expand All @@ -20,4 +18,4 @@ deploy:
distributions: sdist
on:
python: 2.7
branch: master
tags: true
4 changes: 1 addition & 3 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,4 @@ include README.rst
include LICENSE
include requirements.txt
recursive-include ckanext/siu_harvester *.html *.js *.less *.css *.mo
recursive-include ckanext/siu_harvester/harvesters/siu_transp_data/queries *.json
exclude ckanext/siu_harvester/harvesters/siu_transp_data/sample_urls.txt
prune ckanext/siu_harvester/harvesters/siu_transp_data/results
exclude ckanext/siu_harvester/harvesters/tmp.md
14 changes: 13 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ Ejemplo:
### Datos a extraer

Estos endpoints pueden incluir múltiples recursos. Cada recurso es un _query_ al endpoint ya listo para usar.
Estos ya están configurados en el directorio `ckanext/siu_harvester/harvesters/siu_transp_data/queries/`
Estos ya están configurados en el directorio `queries` de la librería [siu-data](https://pypi.org/project/siu-data/)

Por ejemplo `egresados-pos-facultad.json`

Expand Down Expand Up @@ -140,3 +140,15 @@ Por ejemplo `egresados-pos-facultad.json`
De esta forma este _harvester_ va a iterar por los años disponibles y creará un dataset para cada año.
Es posible agregar más _queries_ para consumir más datos.

## Tests

Locally

```
docker-compose \
-f docker-compose.yml \
-f docker-compose-dev.yml \
exec ckan bash -c \
"cd src_extensions/ckanext-siu-harvester && \
nosetests --ckan --nologcapture --with-pylons=test.ini ckanext/siu_harvester/tests"
```
2 changes: 1 addition & 1 deletion bin/travis-build.bash
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ cd ckan
# echo "CKAN branch: $latest_ckan_release_branch"
# git checkout $latest_ckan_release_branch

git checkout 2.8
git checkout ckan-2.8.4
python setup.py develop
pip install -r requirements.txt
pip install -r dev-requirements.txt
Expand Down
21 changes: 11 additions & 10 deletions bin/travis-run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,14 @@ echo "NO_START=0\nJETTY_HOST=127.0.0.1\nJETTY_PORT=8983\nJAVA_HOME=$JAVA_HOME" |
sudo cp ckan/ckan/config/solr/schema.xml /etc/solr/conf/schema.xml
sudo service jetty restart

# skip tests until we fix them
# nosetests --ckan \
# --nologcapture \
# --with-pylons=subdir/test.ini \
# --with-coverage \
# --cover-package=ckanext.siu_harvester \
# --cover-inclusive \
# --cover-erase \
# --cover-tests \
# ckanext/siu_harvester/tests
nosetests --ckan \
--debug=ckanext.siu_harvester \
--with-pylons=subdir/test.ini \
ckanext/siu_harvester/tests


# local test inside docker
# docker-compose -f docker-compose.yml -f docker-compose-dev.yml exec ckan bash
# cd src_extensions/ckanext-siu-harvester/
# pip install -r dev-requirements.txt
# nosetests --ckan --nologcapture --with-pylons=test.ini ckanext/siu_harvester/tests
83 changes: 45 additions & 38 deletions ckanext/siu_harvester/harvesters/siu_transp.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
import requests
from werkzeug.datastructures import FileStorage

from siu_data.portal_data import SIUPoratlTransparenciaData
from siu_data.query_file import SIUTranspQueryFile

from ckan import plugins as p
from ckan import model
from ckan.lib.helpers import json
Expand All @@ -13,7 +16,6 @@
from ckanext.harvest.harvesters.base import HarvesterBase
from ckanext.harvest.model import HarvestObject, HarvestGatherError, HarvestObjectError
from ckanext.harvest.helpers import get_harvest_source
from ckanext.siu_harvester.harvesters.siu_transp_data.lib import SIUTranspQueryFile


logger = logging.getLogger(__name__)
Expand All @@ -24,11 +26,12 @@ class SIUTransparenciaHarvester(HarvesterBase):
def set_paths(self):
here = os.path.dirname(os.path.abspath(__file__))
base = os.environ.get('CKAN_STORAGE_PATH', here)
self.data_path = os.path.join(base, 'siu_transp_data')
self.queries_path = os.path.join(here, 'siu_transp_data', 'queries')
self.results_path = os.path.join(self.data_path, 'results')
if not os.path.isdir(self.results_path):
os.makedirs(self.results_path)
self.results_folder_path = os.path.join(base, 'siu-harvester-results')
if not os.path.isdir(self.results_folder_path):
os.makedirs(self.results_folder_path)

# librería que gestiona los datos en el portal de SIU
self.siu_data_lib = SIUPoratlTransparenciaData()

## IHarvester
def info(self):
Expand Down Expand Up @@ -84,17 +87,22 @@ def gather_stage(self, harvest_job):
logger.info('Starts Gather SIU Transp')
# load paths
self.set_paths()
self.get_query_files()
self.siu_data_lib.get_query_files()

# basic things you'll need
self.source = harvest_job.source
self.source_config = json.loads(self.source.config)

self.siu_data_lib.base_url = self.source.url
self.siu_data_lib.username = self.source_config['username']
self.siu_data_lib.password = self.source_config['password']

# ####################################
# get previous harvested packages
pfr = self.get_packages_for_source(harvest_source_id=self.source.id)
prev_names = [pkg['name'] for pkg in pfr['results']]
logger.info('Get previous harvested objects {}'.format(prev_names))
# TODO
# ####################################

object_ids = [] # lista de IDs a procesar, esto se devuelve en esta funcion
Expand All @@ -108,13 +116,21 @@ def gather_stage(self, harvest_job):

report = [] # resumen de todos los resultados
logger.info('Iter files')
for qf in self.query_files:

for qf in self.siu_data_lib.query_files:
only_files = self.source_config.get('only_files', None)
if only_files is not None:
fname = qf.split('/')[-1]
if fname not in only_files:
logger.info('Skipping file by config {}'.format(fname))
continue

logger.info('Gather SIU Transp FILE {}'.format(qf))
stqf = SIUTranspQueryFile(harvest_source=self, path=qf)
stqf = SIUTranspQueryFile(portal=self.siu_data_lib, path=qf)
# open to read query params
stqf.open()
# request all data
stqf.request_all()
stqf.request_all(results_folder_path=self.results_folder_path)
for err in stqf.errors:
hgerr = HarvestGatherError(message=err, job=harvest_job)
hgerr.save()
Expand Down Expand Up @@ -202,19 +218,14 @@ def import_stage(self, harvest_object):

for resource in resources:
resource['package_id'] = pkg['id']
upload_from = resource.pop('upload')
resource['upload'] = FileStorage(filename=upload_from, stream=open(upload_from))
resource['url'] = ''

fn = p.toolkit.get_action('resource_create')
try:
res = fn(context, resource)
except Exception, e:
logger.error('Error creating resource {} {}'.format(resource, e))
raise
upload_from = resource.pop('upload')

final_resource = p.toolkit.get_action('resource_show')(context, {'id': res['id']})
logger.info('Final resource {}'.format(final_resource['name']))
if os.path.isfile(upload_from):
resource['upload'] = FileStorage(filename=upload_from, stream=open(upload_from))
self.create_resource(context, resource)
else:
logger.error('Resource to upload not found {}'.format(upload_from))

# Mark previous objects as not current
previous_object = model.Session.query(HarvestObject) \
Expand All @@ -233,24 +244,20 @@ def import_stage(self, harvest_object):
harvest_object.save()

return True

def get_query_files(self):
    """ Collect the names of the JSON query files to harvest.

    Lists ``self.queries_path``, keeps the entries that are regular files
    and whose last dot-separated part is ``json``, stores their names in
    ``self.query_files`` and returns that list.
    """
    logger.info('Getting query files')

    self.query_files = []

    for entry in os.listdir(self.queries_path):
        # Every directory entry is logged, even those skipped below
        logger.info('Get query file {}'.format(entry))
        is_regular_file = os.path.isfile(os.path.join(self.queries_path, entry))
        if is_regular_file and entry.rsplit('.', 1)[-1] == 'json':
            self.query_files.append(entry)

    return self.query_files

def create_resource(self, context, resource):
    """ Create a resource through the ``resource_create`` action.

    After creation the resource is read back with ``resource_show`` and
    that result is returned.

    :param context: context passed to both actions
    :param resource: data dict passed to ``resource_create``
    :returns: the dict returned by ``resource_show`` for the new resource
    :raises Exception: any error raised by ``resource_create`` is logged
        and re-raised unchanged
    """
    fn = p.toolkit.get_action('resource_create')
    try:
        res = fn(context, resource)
    # "as" form is valid on Python 2.6+ and 3.x (the comma form is 2-only)
    except Exception as e:
        logger.error('Error creating resource {} {}'.format(resource, e))
        raise

    final_resource = p.toolkit.get_action('resource_show')(context, {'id': res['id']})
    logger.info('Final resource {}'.format(final_resource['name']))

    return final_resource

def get_packages_for_source(self, harvest_source_id):
'''
Returns the current packages list for datasets associated with the given source id
Expand Down
Empty file.
Loading

0 comments on commit 74b0c4a

Please sign in to comment.