Skip to content

Commit

Permalink
Merge pull request #379 from gro-intelligence/GAIA-21588-df
Browse files Browse the repository at this point in the history
[GAIA-21588] Create a function to convert v2prime result into dataframe
  • Loading branch information
muzigao-gro authored Jun 16, 2023
2 parents 588a990 + a58325b commit d0b84e5
Show file tree
Hide file tree
Showing 5 changed files with 245 additions and 35 deletions.
3 changes: 3 additions & 0 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,3 +93,6 @@
# so, we ignore all other releases that preceded that.
smv_tag_whitelist = r"^v(?!1\.40\.[012345]).+$"
smv_branch_whitelist = r"^development$"

# -- Link check ---------------------------------------------------------------
# Entries are regular expressions matched against each URI. This page is
# rendered by JavaScript, so it is excluded from link checking. The dots are
# escaped so the pattern matches only this exact URL.
linkcheck_ignore = [
    r"https://www\.tandfonline\.com/doi/abs/10\.1080/2150704X\.2016\.1252471",
]
5 changes: 5 additions & 0 deletions groclient/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,8 @@
]

ITR_CHUNK_READ_SIZE = 4 * 1024 * 1024  # 4 MB

# Key under which each v2prime result nests its series metadata.
V2_DATA_DESCRIPTION_PREFIX = "series_description"
# Attributes describing a v2prime series: the unique series identifiers plus
# the unit.
V2_DATA_DESCRIPTION_ATTRS = DATA_SERIES_UNIQUE_TYPES_ID + ["unit_id"]


156 changes: 121 additions & 35 deletions groclient/experimental.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,21 @@
import pandas as pd

from typing import Dict, List

from groclient.client import GroClient
from groclient import lib

from groclient.constants import V2_DATA_DESCRIPTION_PREFIX, V2_DATA_DESCRIPTION_ATTRS

class Experimental(GroClient):
"""The experimental client will introduce a range of experimental functions with better user experience.
While you will be able to access better performance and new features at an early stage,
you should be aware that things might change (e.g response format)."""

def get_data_points(self, **selections):
def get_data_points(self, **selections: Dict) -> List[Dict]:
"""This function is a mirror of existing :meth:`~groclient.GroClient.get_data_points`, but with limited scope.
For example:
- Ontology expansion is under development.
- "Gro derived on-the-fly" is under development.
- Many sources are still under migration (please refer to internal confluence page for source migration timeline)
Expand All @@ -38,33 +40,34 @@ def get_data_points(self, **selections):
All data points with end dates after this date.
end_date : string, optional
All data points with start dates before this date.
coverage_threshold: float, optional
coverage_threshold : float, optional
Custom threshold on the coverage of geospatial data. Value should be between 0 and 1.
Returns
-------
dict
dictionary containing list of data_points and series_description
Example::
exp_client = Experimental(access_token="your_token_here")
exp_client.get_data_points(
**{
'metric_id': 2540047,
'item_ids': [3457],
'region_ids': [100023971, 100023990],
'frequency_id': 1,
'source_id': 26,
'start_date': '2021-12-20',
'end_date': '2021-12-21',
}
)
Returns::
{
"data_series": [
dict
dictionary containing list of data_points and series_description
Example::
from groclient.experimental import Experimental
exp_client = Experimental(access_token="your_token_here")
exp_client.get_data_points(
**{
'metric_id': 2540047,
'item_ids': [3457],
'region_ids': [100023971, 100023990],
'frequency_id': 1,
'source_id': 26,
'start_date': '2021-12-20',
'end_date': '2021-12-21',
}
)
Returns::
[
{
"data_points": [
{
Expand Down Expand Up @@ -99,14 +102,97 @@ def get_data_points(self, **selections):
"unit_id": 36
}
}
],
"meta": {
"version": "v1.266.0",
"copyright": "Copyright (c) Gro Intelligence",
"timestamp": "Wed, 19 Apr 2023 14:34:05 GMT"
}
}
]
"""
data_stream_list = lib.get_data_points_v2_prime(
self.access_token, self.api_host, **selections
)

# due to the issue in javascript when dealing with 'int64'
# here we would manually convert timestamp from str to int
for data_stream in data_stream_list:
for data_point in data_stream['data_points']:
data_point['start_timestamp'] = int(data_point['start_timestamp'])
data_point['end_timestamp'] = int(data_point['end_timestamp'])

return data_stream_list


def get_data_points_df(self, **selections: Dict) -> pd.DataFrame:
    """Fetch the same series as :meth:`~groclient.Experimental.get_data_points`
    and return them as one combined dataframe.

    Parameters
    ----------
    metric_id : integer
        How something is measured. e.g. "Export Value" or "Area Harvested"
    item_ids : integer or list of integers
        What is being measured. e.g. "Corn" or "Rainfall"
    region_ids : integer or list of integers
        Where something is being measured e.g. "United States Corn Belt" or "China"
    partner_region_ids : integer or list of integers, optional
        partner_region refers to an interaction between two regions, like trade or
        transportation. For example, for an Export metric, the "region" would be the exporter
        and the "partner_region" would be the importer. For most series, this can be excluded
        or set to 0 ("World") by default.
    source_id : integer
    frequency_id : integer
    unit_id : integer, optional
    start_date : string, optional
        All data points with end dates after this date.
    end_date : string, optional
        All data points with start dates before this date.
    coverage_threshold : float, optional
        Custom threshold on the coverage of geospatial data. Value should be between 0 and 1.

    Returns
    -------
    pandas.DataFrame
        One row per data point, with the attributes of the series it belongs to
        (``metric_id``, ``item_id``, ..., ``unit_id``) appended as columns.
        ``start_timestamp`` and ``end_timestamp`` are converted from epoch
        seconds into pandas datetimes; the column names are kept as they are.
        An empty dataframe is returned when the request yields no data.

    Example::

        from groclient.experimental import Experimental

        exp_client = Experimental(access_token="your_token_here")
        exp_client.get_data_points_df(
            **{
                'metric_id': 2540047,
                'item_ids': [3457],
                'region_ids': [100023971, 100023990],
                'frequency_id': 1,
                'source_id': 26,
                'start_date': '2021-12-20',
                'end_date': '2021-12-21',
            }
        )

    Returns::

               value start_timestamp end_timestamp  metric_id  item_id  region_id  partner_region_id  frequency_id  source_id  unit_id
        0  33.204651      2021-12-20    2021-12-21    2540047     3457  100023971                NaN             1         26       36
        1  32.734329      2021-12-20    2021-12-21    2540047     3457  100023990                NaN             1         26       36
    """
    res = lib.get_data_points_v2_prime(
        self.access_token, self.api_host, **selections
    )

    # One meta path per series attribute, e.g. ["series_description", "metric_id"].
    description_meta = [
        [V2_DATA_DESCRIPTION_PREFIX, attr] for attr in V2_DATA_DESCRIPTION_ATTRS
    ]
    # errors='ignore' fills attributes missing from a series description with NaN
    # instead of raising.
    df = pd.json_normalize(
        res, record_path=["data_points"], meta=description_meta, errors="ignore"
    )

    if df.empty:
        return df

    ts_cols = ["start_timestamp", "end_timestamp"]
    # Timestamps may arrive as strings of epoch seconds. Cast them to numbers
    # first: parsing strings together with `unit=` is deprecated in pandas.
    df[ts_cols] = df[ts_cols].apply(pd.to_numeric).apply(pd.to_datetime, unit="s")

    # Strip the "<prefix>." part with an exact rename rather than a string
    # replace, whose "." could be interpreted as a regex wildcard.
    df = df.rename(
        columns={
            V2_DATA_DESCRIPTION_PREFIX + "." + attr: attr
            for attr in V2_DATA_DESCRIPTION_ATTRS
        }
    )
    # Meta columns come back as object dtype; make the ids numeric.
    df[V2_DATA_DESCRIPTION_ATTRS] = df[V2_DATA_DESCRIPTION_ATTRS].apply(pd.to_numeric)

    return df
69 changes: 69 additions & 0 deletions groclient/experimental_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
try:
# Python 3.3+
from unittest.mock import patch
except ImportError:
# Python 2.7
from mock import patch

import numpy as np
import pandas as pd

from pandas.testing import assert_frame_equal
from unittest import TestCase

from groclient import Experimental
from groclient.mock_data import mock_v2_prime_data_request, mock_v2_prime_data_response

MOCK_HOST = "pytest.groclient.url"
MOCK_TOKEN = "pytest.groclient.token"

class ExperimentalTests(TestCase):
    """Tests for ``Experimental`` with ``lib.get_data_points_v2_prime`` patched out."""

    def setUp(self):
        self.client = Experimental(MOCK_HOST, MOCK_TOKEN)
        self.assertIsInstance(self.client, Experimental)

    @staticmethod
    def _fresh_response():
        """Return a deep copy of the mocked API response.

        ``get_data_points`` rewrites the timestamps of the nested data points in
        place, so a shallow ``list.copy()`` would leak that mutation into the
        shared fixture and make the tests depend on execution order.
        """
        from copy import deepcopy

        return deepcopy(mock_v2_prime_data_response)

    @patch("groclient.lib.get_data_points_v2_prime")
    def test_get_data_points(self, mock_get_data_points):
        mock_get_data_points.return_value = self._fresh_response()

        res = self.client.get_data_points(**mock_v2_prime_data_request)

        self.assertEqual(len(res), len(mock_v2_prime_data_response))
        self.assertIn('data_points', res[0])
        self.assertIn('series_description', res[0])
        point = res[0]['data_points'][0]
        self.assertIsInstance(point["start_timestamp"], int)
        self.assertIsInstance(point["end_timestamp"], int)

    @patch("groclient.lib.get_data_points_v2_prime")
    def test_get_data_points_df(self, mock_get_data_points):
        mock_get_data_points.return_value = self._fresh_response()
        df = self.client.get_data_points_df(**mock_v2_prime_data_request)

        expected_df = pd.DataFrame({
            "value": [33.20, 32.73],
            "start_timestamp": ['2023-05-01', '2023-05-01'],
            "end_timestamp": ['2023-05-02', '2023-05-02'],
            "metric_id": [2540047, 2540047],
            "item_id": [3457, 3457],
            "region_id": [12344, 12345],
            "partner_region_id": [np.nan, np.nan],
            "frequency_id": [1, 1],
            "source_id": [26, 26],
            "unit_id": [36, 36],
        }).astype({
            # A unit-less 'datetime64' dtype is rejected by pandas 2.x.
            'start_timestamp': 'datetime64[ns]',
            'end_timestamp': 'datetime64[ns]',
        })

        assert_frame_equal(df, expected_df)

    @patch("groclient.lib.get_data_points_v2_prime")
    def test_get_data_points_df_no_data(self, mock_get_data_points):
        mock_get_data_points.return_value = []
        df = self.client.get_data_points_df(**mock_v2_prime_data_request)

        self.assertTrue(df.empty)
47 changes: 47 additions & 0 deletions groclient/mock_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,3 +190,50 @@
"frequency_id": 4,
"source_id": 5,
}

# Keyword arguments for a mocked v2prime data request.
mock_v2_prime_data_request = dict(
    metric_id=2540047,
    item_ids=[3457],
    source_id=26,
    frequency_id=1,
    unit_id=36,
    start_date="2023-05-01",
    region_ids=[1215],
)

# Mocked v2prime response: two single-point series that differ only in the
# point value and the region. Timestamps are epoch seconds encoded as strings
# (2023-05-01 and 2023-05-02, UTC).
mock_v2_prime_data_response = [
    {
        "data_points": [
            {
                "value": point_value,
                "start_timestamp": "1682899200",
                "end_timestamp": "1682985600",
            }
        ],
        "series_description": {
            "source_id": 26,
            "item_id": 3457,
            "metric_id": 2540047,
            "frequency_id": 1,
            "region_id": series_region_id,
            "unit_id": 36,
        },
    }
    for point_value, series_region_id in ((33.20, 12344), (32.73, 12345))
]

0 comments on commit d0b84e5

Please sign in to comment.