-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathhousehold_us_flow.py
102 lines (93 loc) · 3.6 KB
/
household_us_flow.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import datetime
import os
from dataflows import Flow, validate, update_resource
from dataflows import add_metadata, load, set_type, find_replace
def readme(fpath='README.md'):
    """Return the text of the README file, or ``None`` if it is absent.

    Args:
        fpath: Path of the README file to read.

    Returns:
        The file's contents as a string, or ``None`` when ``fpath`` does
        not exist.
    """
    # Open directly instead of checking existence first, so a file removed
    # between the check and the read cannot raise. The context manager
    # closes the handle deterministically, and the explicit encoding keeps
    # the result independent of the platform's default locale.
    try:
        with open(fpath, encoding='utf-8') as readme_file:
            return readme_file.read()
    except FileNotFoundError:
        return None
# Pipeline that downloads the Census Bureau household-income workbook,
# attaches data-package metadata, cleans the 'Year' and 'Fourth' columns,
# names the output resource and casts/validates the column types.
household_us = Flow(
    # Data-package level metadata, including two chart views.
    add_metadata(
        name="household-income-us-historical",
        title="Income Limits for Each Fifth and Top 5 Percent of All Households: 1967 to 2016",
        description="Households as of March of the following year. Income in current and 2016 CPI-U-RS adjusted dollars.",
        sources=[
            {
                "path": "https://www2.census.gov",
                "title": "United States Census Bureau"
            }
        ],
        licenses=[
            {
                "id": "odc-pddl",
                "path": "http://opendatacommons.org/licenses/pddl/",
                "title": "Open Data Commons Public Domain Dedication and License v1.0",
                'name': "open_data_commons_public_domain_dedication_and_license_v1.0"
            }
        ],
        version="0.3.0",
        views=[
            {
                "name": "comparison-of-upper-limit-of-each-fifth-and-lower-limit-of-top-5-percent",
                "title": "Comparison of upper limit of each fifth and lower limit of top 5 percent (2016 dollars)",
                "resources": ["household-income-us-historical"],
                "specType": "simple",
                "spec": {
                    "type": "line",
                    "group": "Year",
                    "series": ["Lowest", "Second", "Third", "Fourth", "Top 5 percent"]
                }
            },
            {
                "name": "lowest-fifth-vs-top-5-percent",
                "title": "Ratio of lower limit of top 5 percent to upper limit of lowest fifth (2016 dollars)",
                "resources": [
                    {
                        "name": "household-income-us-historical",
                        "transform": [
                            {
                                "type": "formula",
                                "expressions": ["data['Top 5 percent']/data['Lowest']"],
                                "asFields": ["Ratio"]
                            }
                        ]
                    }
                ],
                "specType": "simple",
                "spec": {"type": "line", "group": "Year", "series": ["Ratio"]}
            }
        ],
        readme=readme()
    ),
    load(
        load_source='https://www2.census.gov/programs-surveys/cps/tables/time-series/historical-income-households/h01ar.xls',
        format='xls',
        sheet=1,
        encoding='utf-8',
        # Skip the first 6 rows, then the rows holding data from 1967 up to
        # last year plus the 3 rows after them, and finally the last row.
        # NOTE(review): the count depends on the current year, so it assumes
        # the workbook gains exactly one row per year -- confirm against the
        # published sheet.
        skip_rows=[i + 1 for i in range(6 + datetime.datetime.now().year - 1966 + 3)] + [-1],
        headers=['Year', 'Number (thousands)', 'Lowest', 'Second', 'Third', 'Fourth', 'Top 5 percent'],
    ),
    # Regex patterns are raw strings so that backslash sequences such as
    # \s, \d and \+ reach the regex engine unchanged and do not trigger
    # "invalid escape sequence" warnings at compile time.
    find_replace(fields=[
        {
            # Drop an optional-whitespace-prefixed parenthesised number
            # (e.g. " (38)") and any ".0" from the Year values.
            'name': 'Year', 'patterns': [
                {'find': r'(\s?\(\d+\))|(\.0)', 'replace': ''}
            ]
        },
        {
            # Drop literal '+' characters from the Fourth values.
            # NOTE(review): the trailing '|' adds an empty alternative,
            # which replaces nothing; kept as-is to preserve the pattern.
            'name': 'Fourth', 'patterns': [
                {'find': r'\+|', 'replace': ''}
            ]
        }
    ], resources=0),
    # Name the first resource and set its output path.
    update_resource(0, **{
        'name': 'household-income-us-historical',
        'path': 'data/household-income-us-historical.csv', 'dpp:streaming': True
    }),
    set_type('Year', type='year'),
    # Every field whose name does not start with 'Y' is cast to number.
    set_type('^(?!Y).+', type='number'),
    validate()
)
def flow(parameters, datapackage, resources, stats):
    """Return the module-level ``household_us`` flow.

    All four arguments are accepted but not used by this function.
    """
    return household_us
# When executed directly as a script, run the pipeline.
if __name__ == '__main__':
    household_us.process()