Add time range parameters to sync script (#483)

This PR adds start and end times to the sync data script. The command line arguments `start_time` and `end_time` are added to the `sync_data` script. The script is called as, e.g.,

```bash
python -m src.data_sync.sync_data --sync-table order_data --start-time 2024-12-30 --end-time 2025-01-07
```

Only data between `start_time` and `end_time` is computed. This data is then _upserted_ into the corresponding table of the month. I.e., if a row was not in the table already, it is inserted into the table. Rows from the new data replace the corresponding rows of the old data if they exist. Old data which was not recomputed stays as is.

The code is structured as follows:

1. Arguments are parsed with appropriate default values.
2. The full time range is partitioned into monthly ranges.
3. Block ranges and months are computed from those time ranges.
4. Essentially the old code is used for computing data for those block ranges.
5. Data is written to the database.

The convention used is that the start time is inclusive and the end time is exclusive. This way the two ranges `(2024-12-30, 2025-01-02), (2025-01-02, 2025-01-07)` would give the same result as the range `(2024-12-30, 2025-01-07)`. Some overlap is required, though, in cases where `end_time` is beyond the last finalized block.

If no argument is supplied, the start of the month and the start of the next month are used as default for `start_time` and `end_time`, respectively, to compute data for the full month, until the last finalized block.

The previous month is not automatically recomputed on the first of the next month. Instead, one can use a time range which contains whatever time window for which data needs to be recomputed.

---------

Co-authored-by: Haris Angelidakis <[email protected]>
cowprotocol · Jan 10, 2025 · e383645 · e383645
1 parent 84931af
commit e383645
Show file tree

Hide file tree

Showing 6 changed files with 231 additions and 149 deletions.
diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
-# CoW Protocol: Solver Reimbursement & Rewards Distributor
+# CoW Protocol: Solver Accounting
 
 [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
 
@@ -11,13 +11,19 @@ cp .env.sample .env    <----- Copy your Dune and orderbook credentials here!
 
 Fill out your Dune credentials in the `.env` file.
 
-Generate the solver-payouts with for the accounting period 7 days with today as end date).
+Generate the solver payouts for the 7-day accounting period (with today as end date).
 
 ```shell
 python -m src.fetch.transfer_file
 ```
 
-For more advanced usage of this payout script see below.
+To generate order data for the current month to upload to Dune run the following command.
+
+```shell
+python -m src.data_sync.sync_data --sync-table order_data
+```
+
+For more advanced usage of these scripts see below.
 
 # Summary of Accounting Procedure
 
@@ -142,6 +148,18 @@ docker run --pull=always -it --rm \
 
 and (usually after about 30 seconds) find the transfer file written to your current working directory.
 
-### Managing Dependencies
+# Creating payment data for syncing
+
+The script `src/data_sync/sync_data.py` creates tables for syncing to Dune. The script can be called with a table to sync, start and end times, and a flag for dropping old data.
+
+To create order rewards tables with data from `2024-12-30` to `2025-01-02` use
+```shell
+python -m src.data_sync.sync_data --sync-table order_data --start-time 2024-12-30 --end-time 2025-01-02
+```
+This will update (or create, if they do not exist yet) the tables `order_data_{NETWORK}_2024_12` and `order_data_{NETWORK}_2025_01`.
+
+The script requires the additional environment variable `ANALYTICS_DB_URL`.
+
+# Managing Dependencies
 Python libraries can be added to the `requirements.in` file. After this `pip-compile` or `python -m pip-compile` will update the `requirements.txt` for you (you may have to install the library manually first). 
 Warning: this might take a long time for large changes or when you run pip-compile for the first time. Running the command with the `-v` flag can help keep track of what's happening.
diff --git a/requirements.in b/requirements.in
@@ -14,6 +14,7 @@ pandas==2.0.3
 pandas-stubs==2.0.2.230605
 numpy==1.26.4
 pip-tools==7.4.1
+python-dateutil>=2.9.0.post0
 black
 mypy
 pylint

diff --git a/requirements.txt b/requirements.txt
@@ -1,5 +1,5 @@
 #
-# This file is autogenerated by pip-compile with Python 3.10
+# This file is autogenerated by pip-compile with Python 3.12
 # by the following command:
 #
 #    pip-compile
@@ -15,8 +15,6 @@ aiosignal==1.3.1
     # via aiohttp
 astroid==3.2.4
     # via pylint
-async-timeout==4.0.3
-    # via aiohttp
 attrs==24.2.0
     # via
     #   aiohttp
@@ -106,8 +104,6 @@ eth-utils==4.1.1
     #   rlp
     #   trie
     #   web3
-exceptiongroup==1.2.2
-    # via pytest
 frozenlist==1.4.1
     # via
     #   aiohttp
@@ -205,6 +201,7 @@ pytest==8.3.2
     # via -r requirements.in
 python-dateutil==2.9.0.post0
     # via
+    #   -r requirements.in
     #   dune-client
     #   pandas
 python-dotenv==1.0.1
@@ -253,14 +250,6 @@ sqlalchemy==1.4.53
     # via -r requirements.in
 sqlalchemy-stubs==0.4
     # via -r requirements.in
-tomli==2.0.1
-    # via
-    #   black
-    #   build
-    #   mypy
-    #   pip-tools
-    #   pylint
-    #   pytest
 tomlkit==0.13.2
     # via pylint
 toolz==0.12.1
@@ -285,9 +274,6 @@ types-setuptools==73.0.0.20240822
     # via dune-client
 typing-extensions==4.12.2
     # via
-    #   astroid
-    #   black
-    #   eth-rlp
     #   eth-typing
     #   mypy
     #   sqlalchemy-stubs

diff --git a/src/data_sync/common.py b/src/data_sync/common.py
@@ -1,13 +1,78 @@
 """Shared methods between both sync scripts."""
 
 from datetime import datetime, timezone
-from typing import List, Tuple
+
+from dateutil.relativedelta import relativedelta
 from web3 import Web3
+
 from src.logger import set_log
+from src.models.block_range import BlockRange
 
 log = set_log(__name__)
 
 
+def partition_time_range(
+    start_time: datetime, end_time: datetime
+) -> list[tuple[datetime, datetime]]:
+    """Computes (list of) time ranges from input parameters.
+    If both times are from the same month, only [(start_time, end_time)] is returned.
+    Otherwise, the range is split into n pieces of the form [(start_time, start_of_month_2),
+    (start_of_month_2, start_of_month_3),..., (start_of_month_n, end_time)].
+    """
+    assert start_time < end_time, "start_time must be strictly smaller than end_time"
+
+    # if there is just one month to consider
+    if end_time <= datetime(start_time.year, start_time.month, 1).replace(
+        tzinfo=timezone.utc
+    ) + relativedelta(months=1):
+        return [(start_time, end_time)]
+
+    # if there are multiple months to consider
+    next_month_start_time = datetime(start_time.year, start_time.month, 1).replace(
+        tzinfo=timezone.utc
+    ) + relativedelta(months=1)
+    time_range_list = [(start_time, next_month_start_time)]
+    while end_time > next_month_start_time + relativedelta(months=1):
+        time_range_list.append(
+            (next_month_start_time, next_month_start_time + relativedelta(months=1))
+        )
+        next_month_start_time = next_month_start_time + relativedelta(months=1)
+    time_range_list.append((next_month_start_time, end_time))
+
+    return time_range_list
+
+
+def compute_block_range(
+    start_time: datetime, end_time: datetime, node: Web3
+) -> BlockRange:
+    """Computes a block range from start and end time.
+    The convention for block ranges is to be inclusive, while the end time is exclusive.
+    Only finalized blocks are considered.
+    """
+    latest_block = node.eth.get_block("finalized")
+    latest_block_time = datetime.fromtimestamp(
+        latest_block["timestamp"], tz=timezone.utc
+    )
+
+    assert (
+        start_time < latest_block_time
+    ), "start time must be smaller than latest block time"
+
+    start_block = find_block_with_timestamp(node, start_time.timestamp())
+    if latest_block_time < end_time:
+        log.info(
+            f"Latest finalized block time {latest_block_time} is smaller than {end_time}."
+            "Using latest finalized block."
+        )
+        end_block = int(latest_block["number"])
+    else:
+        end_block = find_block_with_timestamp(node, end_time.timestamp()) - 1
+
+    assert start_block < end_block, "start block must be smaller than end block"
+
+    return BlockRange(block_from=start_block, block_to=end_block)
+
+
 def find_block_with_timestamp(node: Web3, time_stamp: float) -> int:
     """
     This implements binary search and returns the smallest block number
@@ -40,77 +105,3 @@ def find_block_with_timestamp(node: Web3, time_stamp: float) -> int:
     # fallback in case correct block number hasn't been found
     # in that case, we will include some more blocks than necessary
     return mid_block_number + 200
-
-
-def compute_block_and_month_range(  # pylint: disable=too-many-locals
-    node: Web3, recompute_previous_month: bool
-) -> Tuple[List[Tuple[int, int]], List[str]]:
-    """
-    This determines the block range and the relevant months
-    for which we will compute and upload data on Dune.
-    """
-    # The function first a list of block ranges, followed by a list of
-    # # months. Block ranges are stored as (start_block, end_block) pairs,
-    # and are meant to be interpreted as closed intervals.
-    # Moreover, we assume that the job runs at least once every 24h
-    # Because of that, if it is the first day of month, we also
-    # compute the previous month's table just to be on the safe side
-
-    latest_finalized_block = node.eth.get_block("finalized")
-
-    current_month_end_block = int(latest_finalized_block["number"])
-    current_month_end_timestamp = latest_finalized_block["timestamp"]
-
-    current_month_end_datetime = datetime.fromtimestamp(
-        current_month_end_timestamp, tz=timezone.utc
-    )
-    current_month_start_datetime = datetime(
-        current_month_end_datetime.year, current_month_end_datetime.month, 1, 00, 00
-    )
-    current_month_start_timestamp = current_month_start_datetime.replace(
-        tzinfo=timezone.utc
-    ).timestamp()
-
-    current_month_start_block = find_block_with_timestamp(
-        node, current_month_start_timestamp
-    )
-
-    current_month = (
-        f"{current_month_end_datetime.year}_{current_month_end_datetime.month}"
-    )
-    ## in case the month is 1-9, we add a "0" prefix, so that we have a fixed-length representation
-    ## e.g., 2024-12, 2024-01
-    if len(current_month) < 7:
-        current_month = current_month[:5] + "0" + current_month[5]
-    months_list = [current_month]
-    block_range = [(current_month_start_block, current_month_end_block)]
-    if current_month_end_datetime.day == 1 or recompute_previous_month:
-        if current_month_end_datetime.month == 1:
-            previous_month = f"{current_month_end_datetime.year - 1}_12"
-            previous_month_start_datetime = datetime(
-                current_month_end_datetime.year - 1, 12, 1, 00, 00
-            )
-        else:
-            previous_month = f"""{current_month_end_datetime.year}_
-                {current_month_end_datetime.month - 1}
-            """
-            if len(previous_month) < 7:
-                previous_month = previous_month[:5] + "0" + previous_month[5]
-            previous_month_start_datetime = datetime(
-                current_month_end_datetime.year,
-                current_month_end_datetime.month - 1,
-                1,
-                00,
-                00,
-            )
-        months_list.append(previous_month)
-        previous_month_start_timestamp = previous_month_start_datetime.replace(
-            tzinfo=timezone.utc
-        ).timestamp()
-        previous_month_start_block = find_block_with_timestamp(
-            node, previous_month_start_timestamp
-        )
-        previous_month_end_block = current_month_start_block - 1
-        block_range.append((previous_month_start_block, previous_month_end_block))
-
-    return block_range, months_list