Skip to content

Commit

Permalink
Update validators and healthcheck endpoint (project-serum#5)
Browse files Browse the repository at this point in the history
  • Loading branch information
nathanielparke authored Jan 11, 2021
1 parent 48ee54d commit 2da9831
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 38 deletions.
12 changes: 3 additions & 9 deletions sol/api.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,9 @@ PATH=/home/sol/.local/share/solana/install/active_release/bin:/usr/sbin:/usr/bin
# Parameters from https://docs.solana.com/clusters#mainnet-beta
ENTRYPOINT=mainnet-beta.solana.com:8001
TRUSTED_VALIDATOR_PUBKEYS=(7Np41oeYqPefeNQEHSv1UDhYrehxin3NStELsSKCT4K2 GdnSyH3YtwcxFvQrVVJMm1JhTS4QVX7MFsX56uJLUfiZ DE1bawNcRJB9rVm3buyMVfr8mBEoyyu73NBovf2oXJsJ CakcnaRDHka2gXyfbEd2d3xsvkJkqsLw2akB3zsN1D2S)
EXPECTED_BANK_HASH=5eykt4UsFv8P8NJdTREpY1vzqKqZKvdpKuc147dw2N9d
EXPECTED_BANK_HASH=Fi4p8z3AkfsuGXZzQ4TD28N8QDNSWC7ccqAqTs2GPdPu
EXPECTED_GENESIS_HASH=5eykt4UsFv8P8NJdTREpY1vzqKqZKvdpKuc147dw2N9d
EXPECTED_SHRED_VERSION=64864

# NOTE: Check if this is reasonable
RPC_HEALTH_CHECK_SLOT_DISTANCE=15
EXPECTED_SHRED_VERSION=13490

# Delete any zero-length snapshots that can cause validator startup to fail
find /data/sol/ledger/snapshot-* -size 0 -print -exec rm {} \; || true
Expand Down Expand Up @@ -48,6 +45,7 @@ args=(
--identity "$identity_keypair"
--enable-rpc-transaction-history
--limit-ledger-size 50000000
--health-check-slot-distance 500
--cuda
--rpc-port 8899
--private-rpc
Expand All @@ -61,10 +59,6 @@ args=(
--wal-recovery-mode skip_any_corrupted_record
)

if [[ -n "$RPC_HEALTH_CHECK_SLOT_DISTANCE" ]]; then
args+=(--health-check-slot-distance "$RPC_HEALTH_CHECK_SLOT_DISTANCE")
fi

# Note: can get into a bad state that requires actually fetching a new snapshot. One such error that indicates this:
# "...processing for bank 0 must succeed: FailedToLoadEntries(InvalidShredData(Custom(\"could not reconstruct entries\")))"
if [[ -d /data/sol/ledger ]]; then
Expand Down
80 changes: 51 additions & 29 deletions sol/health/main.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import logging
import socket
import time
import traceback
from functools import wraps
from pathlib import Path
from typing import Union, Tuple, Optional
from typing import Union, Tuple, Optional, Dict

import gevent
import jsonpickle
import requests
from flask import Flask
Expand All @@ -15,10 +17,16 @@
logger = logging.getLogger('health.main')

# Port handed to serve_flask_app() when this module is run as a script.
PORT = 9090
# Single-endpoint URL constants. The same two URLs also appear in ENDPOINTS
# below under the 'mainnet' and 'local' labels.
TRUSTED_VALIDATOR_ENDPOINT = 'http://vip-api.mainnet-beta.solana.com'
LOCAL_VALIDATOR_ENDPOINT = 'http://localhost:8899'
# Label -> JSON-RPC URL. get_all_slots() queries every entry; get_health_status()
# treats the 'local' entry as this node and every other entry as an upstream.
ENDPOINTS = {
'local': 'http://localhost:8899',
'mainnet': 'http://vip-api.mainnet-beta.solana.com',
'cluster': 'https://solana-api.projectserum.com',
}
# Fallback lag threshold for /health, used when
# load_data_file_locally('unhealthy_block_threshold') returns nothing.
UNHEALTHY_BLOCKHEIGHT_DIFF = 15
# NOTE(review): presumably the directory load_data_file_locally() reads from;
# its body is not visible here -- confirm.
DATA_DIR = 'data'
# Seconds for which every upstream may report 0 before /health raises.
UPSTREAM_DOWN_TOLERANCE_SECONDS = 30

# time.time() of the most recent /health call in which at least one upstream
# returned a non-zero value; stays 0 until that first happens.
_last_successful_trusted_fetch = 0


def serve_flask_app(app: Flask, port: int, allow_remote_connections: bool = False,
Expand Down Expand Up @@ -52,41 +60,37 @@ def wrapped(*args, **kwargs):
return wrapped


@app.route('/')
@api_endpoint
def get_status():
    """Return a short greeting that names the host serving the request."""
    hostname = socket.gethostname()
    return f'Hello from {hostname}.'


@app.route('/status')
@api_endpoint
def get_validator_status():
    """Return the value reported by get_all_slots() for every endpoint.

    The result is a dict keyed by the labels of ENDPOINTS. This is the same
    data /health evaluates, so the two routes always agree on what they
    report.
    """
    return get_all_slots()


@app.route('/health')
@api_endpoint
def get_health_status():
    """Check that the local validator is keeping up with the upstreams.

    Fetches a slot from every endpoint, compares the local slot against the
    highest upstream slot, and returns the per-endpoint mapping when the
    node is considered healthy.

    Returns:
        The dict produced by get_all_slots().

    Raises:
        Exception: if every upstream has returned 0 for longer than
            UPSTREAM_DOWN_TOLERANCE_SECONDS, if the local endpoint returned
            0, or if the local slot trails the highest upstream slot by more
            than the configured threshold.
    """
    global _last_successful_trusted_fetch
    slots = get_all_slots()
    logger.info(f'slots: {slots}')

    local = slots['local']
    # default=0 keeps this from raising ValueError if ENDPOINTS ever holds
    # only the 'local' entry.
    upstream_height = max((v for k, v in slots.items() if k != 'local'), default=0)
    if upstream_height > 0:
        _last_successful_trusted_fetch = time.time()
    elif _last_successful_trusted_fetch < time.time() - UPSTREAM_DOWN_TOLERANCE_SECONDS:
        raise Exception(
            f'Both upstreams have been returning errors for more than {UPSTREAM_DOWN_TOLERANCE_SECONDS} seconds'
        )

    # get_slot() returns 0 when the request failed. Without this check a dead
    # local RPC would be reported healthy whenever the upstreams are also
    # returning 0 inside the tolerance window (behind would be 0 - 0).
    if local == 0:
        raise Exception('Local validator did not return a slot.')

    behind = upstream_height - local
    if behind < 0:
        # Being ahead is not an error; it is only logged.
        logger.info(f'Local block height is greater than upstreams. '
                    f'Current block height: {local}, '
                    f'Upstream block height: {upstream_height}')
    # A value stored in the data file overrides the built-in threshold.
    unhealthy_blockheight_diff = load_data_file_locally('unhealthy_block_threshold') or UNHEALTHY_BLOCKHEIGHT_DIFF
    if behind > int(unhealthy_blockheight_diff):
        raise Exception(f'Local validator is behind trusted validator by more than {unhealthy_blockheight_diff} blocks.')
    return slots


def load_data_file_locally(filename: str, mode='r') -> Optional[str]:
Expand All @@ -97,19 +101,37 @@ def load_data_file_locally(filename: str, mode='r') -> Optional[str]:
return None


def get_all_slots() -> Dict[str, int]:
    """Query every endpoint in ENDPOINTS and return its slot by label.

    One greenlet running get_slot() is spawned per endpoint before any
    result is collected, so all requests are started up front.
    """
    pending = {}
    for label, url in ENDPOINTS.items():
        pending[label] = gevent.spawn(get_slot, url)

    slots = {}
    for label, greenlet in pending.items():
        # .get() blocks until that greenlet has finished.
        slots[label] = greenlet.get()
    return slots


def get_slot(url: str) -> int:
    """Return the 'absoluteSlot' reported by the RPC endpoint at *url*.

    Any failure (request error, bad status, unexpected payload) is logged
    and reported as 0 rather than raised, so callers can treat 0 as
    "no answer from this endpoint".
    """
    try:
        epoch_info = get_epoch_info(url)
        slot = epoch_info['result']['absoluteSlot']
    except Exception as err:
        logger.info(f'Received error fetching blockheight from {url}')
        logger.info(err)
        slot = 0
    return slot


def get_epoch_info(url: str):
    """POST a ``getEpochInfo`` JSON-RPC request to *url*.

    Args:
        url: Base URL of a Solana JSON-RPC endpoint.

    Returns:
        The decoded JSON response body.

    Raises:
        requests.HTTPError: if the endpoint answers with an error status
            (via ``raise_for_status``). Other ``requests`` exceptions, such
            as a timeout after 1 second, propagate unchanged.
    """
    # Passing the payload through `json=` lets requests serialise it and set
    # the JSON Content-Type header, so no explicit `headers=` is needed.
    # NOTE(review): the 'single' commitment level may have been renamed in
    # newer Solana RPC versions -- confirm against the target cluster.
    payload = {
        'jsonrpc': '2.0',
        'id': 1,
        'method': 'getEpochInfo',
        'params': [{'commitment': 'single'}],
    }
    res = requests.post(url, json=payload, timeout=1)
    res.raise_for_status()
    return res.json()


if __name__ == '__main__':
    # Configure root logging at INFO before the server starts.
    logging.basicConfig(level=logging.INFO)
    serve_options = {
        'allow_remote_connections': True,
        'allow_multiple_listeners': True,
    }
    serve_flask_app(app, PORT, **serve_options)

0 comments on commit 2da9831

Please sign in to comment.