Add pensionistas parser
turicas committed Jul 2, 2021
1 parent 0b8f71a commit 2b1f173
Showing 7 changed files with 266 additions and 0 deletions.
20 changes: 20 additions & 0 deletions pensionista/README.md
@@ -0,0 +1,20 @@
# Pensioner data

## Installation

```shell
pip install -r requirements.txt
```

## Running

Download the pensioner files available on [this
website](http://transparencia.gov.br/download-de-dados/servidores) and put them
in `data/download/`. Then, run:

```shell
python convert.py <cadastro|observacao|remuneracao>
```

The files `pensionista_cadastro.csv.gz`, `pensionista_observacao.csv.gz` and
`pensionista_remuneracao.csv.gz` will be created in `data/output/`.
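A quick sanity check on the result, as a minimal sketch (standard library only; assumes the default output path and the UTF-8 output encoding also used by `import-pgsql.sh`):

```python
import csv
import gzip

# Print the column names and the first converted row of the "cadastro" table
# generated by convert.py (the path assumes the default data/output/ location).
with gzip.open("data/output/pensionista_cadastro.csv.gz", "rt", encoding="utf-8") as fobj:
    reader = csv.DictReader(fobj)
    first_row = next(reader)
    print(list(first_row))
    print(first_row)
```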
153 changes: 153 additions & 0 deletions pensionista/convert.py
@@ -0,0 +1,153 @@
import csv
import datetime
import io
from functools import lru_cache
from pathlib import Path
from uuid import NAMESPACE_URL, uuid5
from zipfile import ZipFile

from rows.fields import slug
from rows.utils import CsvLazyDictWriter, open_compressed
from tqdm import tqdm


strptime = datetime.datetime.strptime


@lru_cache(maxsize=1024 ** 2)
def convert_number(value):
    return value.replace(".", "").replace(",", ".")


@lru_cache(maxsize=32 * 1024)
def convert_date(value):
    value = value.strip()
    if not value:
        return None
    return str(strptime(value, "%d/%m/%Y").date())
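# Examples: convert_number("1.234,56") -> "1234.56" and
# convert_date("02/07/2021") -> "2021-07-02" (blank dates become None).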


@lru_cache(maxsize=1024 * 1024)
def person_uuid(cpf, name):
    """Create UUID based on URLid methodology"""

    if cpf is None:
        cpf = "***********"
    assert len(cpf) == 11, f"Invalid CPF: {repr(cpf)}"
    internal_id = cpf[3:9] + "-" + slug(name).upper().replace("_", "-")
    return str(uuid5(NAMESPACE_URL, f"https://id.brasil.io/person/v1/{internal_id}/"))
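# Example (hypothetical CPF and name): person_uuid("12345678901", "Fulano de Tal")
# hashes "https://id.brasil.io/person/v1/456789-FULANO-DE-TAL/" with uuid5, so
# the same person gets the same UUID across months and source systems.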


@lru_cache(maxsize=128)
def normalize_key(text):
    text = text.replace("(R$)", "_brl_").replace("(U$)", "_usd_")
    result = (
        slug(text)
        .replace("_registradas_em_sistemas_de_pessoal_", "_")
        .replace("_programa_desligamento_voluntario_mp_792_2017_", "_deslig_voluntario_")
    )
    return result


def convert_row(row):
    new = {}
    for original_key, value in row.items():
        key = normalize_key(original_key)
        value = value.strip()
        # Placeholder values (all zeros, "-" or "--") become None
        if (value and set(value) == {"0"}) or value in ("-", "--"):
            value = None
        if not key and not value:
            continue

        if key.startswith("data_") and value is not None:
            value = convert_date(value)
        elif value is not None and ("R$" in original_key or "U$" in original_key):
            value = convert_number(value)

        new[key] = value
    return new
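# Example: convert_row({"VALOR (R$)": "1.234,56", "OBS": "-"}) returns
# {"valor_brl": "1234.56", "obs": None}.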


def read_csv(fobj, origin_system, table_name, year, month, input_encoding="iso-8859-1", delimiter=";"):
    """Read binary `fobj` as CSV, convert each row and add `origin_system` as a column

    `origin_system` (e.g. "SIAPE") fills the "sistema_origem" column, while
    `table_name` selects table-specific handling (the original code compared
    the origin system to "cadastro", so that branch never ran).
    """

    fobj = io.TextIOWrapper(fobj, encoding=input_encoding)
    reader = csv.DictReader(fobj, delimiter=delimiter)
    for row in reader:
        new = convert_row(row)
        if "(*)" in (new.get("ano") or ""):  # Invalid row ("ano" may be None)
            continue
        if "ano" not in new:
            new["ano"] = year
        if "mes" not in new:
            new["mes"] = month
        cpf = new["cpf"] or ""  # Guard: placeholder CPFs become None in convert_row
        if "PENSIONISTA MENOR DE 16 ANOS" in cpf:
            new["menor_16"] = True
            new["cpf"] = None
        else:
            new["menor_16"] = False
            new["cpf"] = cpf.replace(".", "").replace("-", "") or None
        new["sistema_origem"] = origin_system
        new["pessoa_uuid"] = person_uuid(new["cpf"], new["nome"])
        if table_name == "cadastro":
            for key in ("representante_legal", "instituidor"):
                cpf_value = new[f"cpf_{key}"] or ""  # Same guard as above
                new[f"cpf_{key}"] = cpf_value.replace(".", "").replace("-", "") or None
                new[f"{key}_uuid"] = person_uuid(new[f"cpf_{key}"], new[f"nome_{key}"])
        yield new


def extract_year_month(filename):
    """Extract year and month from a ZIP filename (a `pathlib.Path`)"""

    part = filename.name.lower().split(".zip")[0]
    return int(part[:4]), int(part[4:6])


def extract_origin_system(filename):
    """Extract the origin system from an inner ZIP filename (a `str`)"""

    return filename.split(".zip")[0].split("_")[-1]
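# Example with hypothetical filenames: extract_year_month(Path("202104_Pensionistas.zip"))
# returns (2021, 4) and extract_origin_system("202104_Pensionistas_SIAPE.zip")
# returns "SIAPE".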


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("table_name", choices=("cadastro", "remuneracao", "observacao"))
    args = parser.parse_args()

    # Make sure all working paths exist before anything
    DATA_PATH = Path(__file__).parent / "data"
    DOWNLOAD_PATH = DATA_PATH / "download"
    OUTPUT_PATH = DATA_PATH / "output"
    for path in (DATA_PATH, DOWNLOAD_PATH, OUTPUT_PATH):
        if not path.exists():
            path.mkdir(parents=True)

    # Create one compressed-CSV writer
    output_filename = OUTPUT_PATH / f"pensionista_{args.table_name}.csv.gz"
    output_fobj = open_compressed(output_filename, mode="w", buffering=8 * 1024 * 1024)
    writer = CsvLazyDictWriter(output_fobj)

    # Read each ZIP file, then each inner ZIP file, then filter the desired
    # inner-inner CSV file, convert it and write to the output CSV.
    progress_bar = tqdm()
    filenames = DOWNLOAD_PATH.glob("*.zip")
    for filename in sorted(filenames, key=extract_year_month):
        year, month = extract_year_month(filename)
        progress_bar.desc = f"{year}-{month:02d}"
        progress_bar.refresh()
        zf = ZipFile(filename)
        for fileinfo in zf.filelist:
            origin_system = extract_origin_system(fileinfo.filename)
            progress_bar.desc = f"{year}-{month:02d}/{origin_system}"
            inner_zf = ZipFile(zf.open(fileinfo.filename))
            for inner_fileinfo in inner_zf.filelist:
                table_name = (
                    inner_fileinfo.filename.split(".")[0]
                    .split("_")[-1]
                    .lower()
                    .replace("observacoes", "observacao")
                )
                if table_name != args.table_name:  # We don't want this file
                    continue
                progress_bar.desc = f"{year}-{month:02d}/{origin_system}.{table_name}"
                inner_fobj = inner_zf.open(inner_fileinfo.filename)
                reader = read_csv(inner_fobj, origin_system, table_name, year, month)
                for row in reader:
                    writer.writerow(row)
                    progress_bar.update()
    progress_bar.close()
    output_fobj.close()  # Finish the gzip stream so the output is not truncated
38 changes: 38 additions & 0 deletions pensionista/download-old.sh
@@ -0,0 +1,38 @@
#!/bin/bash

mkdir -p data/download
cd data/download
wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/PENSIONISTAS_112019.zip
wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/PENSIONISTAS_122019.zip
wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/PENSIONISTAS_012020.zip
wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/PENSIONISTAS_022020.zip
wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/PENSIONISTAS_032020.zip
wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/PENSIONISTAS_042020.zip
wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/1994.zip
wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/1995.zip
wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/1996.zip
wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/1997.zip
wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/1998.zip
wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/1999.zip
wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2000.zip
wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2001.zip
wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2002.zip
wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2003.zip
wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2004.zip
wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2005.zip
wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2006.zip
wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2007.zip
wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2008.zip
wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2009.zip
wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2010.zip
wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2011.zip
wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2012.zip
wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2013.zip
wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2014.zip
wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2015.zip
wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2016.zip
wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2017.zip
wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2018.zip
wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2019.zip
wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/PENSIONISTAS_052020.zip
cd -
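For reference, a minimal Python sketch of the same download list (URLs exactly as above; unlike `wget -c`, it does not resume partial downloads, it only skips files that already exist):

```python
from pathlib import Path
from urllib.request import urlretrieve

BASE_URL = "http://repositorio.dados.gov.br/segrt/pensionistas/"
DOWNLOAD_PATH = Path("data/download")
DOWNLOAD_PATH.mkdir(parents=True, exist_ok=True)

# Yearly files (1994.zip ... 2019.zip) plus the monthly PENSIONISTAS_MMYYYY files
filenames = [f"{year}.zip" for year in range(1994, 2020)]
filenames += [
    f"PENSIONISTAS_{month:02d}{year}.zip"
    for year, months in ((2019, (11, 12)), (2020, (1, 2, 3, 4, 5)))
    for month in months
]
for name in filenames:
    target = DOWNLOAD_PATH / name
    if not target.exists():  # Skip files already downloaded
        urlretrieve(BASE_URL + name, target)
```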
16 changes: 16 additions & 0 deletions pensionista/import-pgsql.sh
@@ -0,0 +1,16 @@
#!/bin/bash

if [ -z "$DATABASE_URL" ]; then
    echo "ERROR: must set DATABASE_URL with a PostgreSQL connection string"
    exit 1
fi

for table in cadastro observacao remuneracao; do
    rows pgimport \
        --dialect=excel \
        --input-encoding=utf-8 \
        --schema=schema/pensionista_${table}.csv \
        data/output/pensionista_${table}.csv.gz \
        "$DATABASE_URL" \
        pensionista_${table}
done
15 changes: 15 additions & 0 deletions pensionista/indexes.sql
@@ -0,0 +1,15 @@
CREATE INDEX idx_pensobs_id ON pensionista_observacao (id_servidor_portal, ano, mes, sistema_origem);
CREATE INDEX idx_pensobs_uuid ON pensionista_observacao (pessoa_uuid);
CREATE INDEX idx_pensobs_orig ON pensionista_observacao (sistema_origem);

CREATE INDEX idx_penscad_id ON pensionista_cadastro (id_servidor_portal, ano, mes, sistema_origem);
CREATE INDEX idx_penscad_uuid1 ON pensionista_cadastro (pessoa_uuid);
CREATE INDEX idx_penscad_uuid2 ON pensionista_cadastro (representante_legal_uuid);
CREATE INDEX idx_penscad_uuid3 ON pensionista_cadastro (instituidor_uuid);
CREATE INDEX idx_penscad_orig ON pensionista_cadastro (sistema_origem);

CREATE INDEX idx_pensrem_id ON pensionista_remuneracao (id_servidor_portal, ano, mes, sistema_origem);
CREATE INDEX idx_pensrem_uuid ON pensionista_remuneracao (pessoa_uuid);
CREATE INDEX idx_pensrem_orig ON pensionista_remuneracao (sistema_origem);

ALTER TABLE pensionista_cadastro ADD PRIMARY KEY (id_servidor_portal, ano, mes, sistema_origem);
22 changes: 22 additions & 0 deletions pensionista/list_zips.py
@@ -0,0 +1,22 @@
import json
from urllib.parse import urljoin
from urllib.request import urlopen


def ckan_package_resources(base_url, package_id):
    """List a CKAN package's resources via the `package_show` API action"""

    template_url = urljoin(base_url, "/api/3/action/package_show?id={package_id}")
    url = template_url.format(package_id=package_id)
    response = urlopen(url)
    data = json.loads(response.read())
    return data["result"]["resources"]


if __name__ == "__main__":
    resources = ckan_package_resources(
        base_url="http://www.dados.gov.br",
        package_id="c76a1bc6-2330-4b05-b3dd-491124931496",
    )

    for resource in resources:
        if not resource["url"].lower().endswith(".zip"):
            continue
        print(resource["url"])
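The script prints one ZIP URL per line, ready to pipe into a downloader. A minimal sketch that instead consumes the same resource list directly from Python and fills `data/download/` (reusing `ckan_package_resources` and the package id above):

```python
from pathlib import Path
from urllib.request import urlretrieve

from list_zips import ckan_package_resources

DOWNLOAD_PATH = Path("data/download")
DOWNLOAD_PATH.mkdir(parents=True, exist_ok=True)
resources = ckan_package_resources(
    base_url="http://www.dados.gov.br",
    package_id="c76a1bc6-2330-4b05-b3dd-491124931496",
)
for resource in resources:
    url = resource["url"]
    if url.lower().endswith(".zip"):
        filename = DOWNLOAD_PATH / url.rsplit("/", 1)[-1]
        if not filename.exists():  # Skip files already downloaded
            urlretrieve(url, filename)
```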
2 changes: 2 additions & 0 deletions pensionista/requirements.txt
@@ -0,0 +1,2 @@
calculadora-do-cidadao
https://github.com/turicas/rows/archive/develop.zip
