-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(broker): local worker peer lifecycle
- Loading branch information
krhougs
committed
Oct 18, 2022
1 parent
ec97b09
commit 8fcce80
Showing
12 changed files
with
377 additions
and
102 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,205 @@ | ||
use crate::local_worker::KnownLocalWorkerStatus::*; | ||
use crate::local_worker::LocalWorkerManagerChannelMessage::*; | ||
use crate::mgmt::local_worker::LocalWorkerIdentity; | ||
use log::{error, info, warn}; | ||
use service_network::config::{BROKER_PEER_DEAD_COUNT, BROKER_PEER_LOST_COUNT}; | ||
use std::collections::BTreeMap; | ||
use std::net::Ipv4Addr; | ||
use std::str::FromStr; | ||
use tokio::sync::mpsc::{channel, Receiver, Sender}; | ||
|
||
pub enum LocalWorkerManagerChannelMessage { | ||
ShouldCheckPeerHealth, | ||
ReceivedKeepAlive(LocalWorkerIdentity), | ||
} | ||
|
||
pub type LocalWorkerManagerChannelMessageSender = Sender<LocalWorkerManagerChannelMessage>; | ||
pub type LocalWorkerManagerChannelMessageReceiver = Receiver<LocalWorkerManagerChannelMessage>; | ||
|
||
pub type LocalWorkerMap = BTreeMap<String, KnownLocalWorker>; | ||
|
||
pub async fn local_worker_manager( | ||
_tx: LocalWorkerManagerChannelMessageSender, | ||
mut rx: LocalWorkerManagerChannelMessageReceiver, | ||
) { | ||
let mut lw_vec_keys: Vec<String> = Vec::new(); | ||
let mut lw_map: LocalWorkerMap = BTreeMap::new(); | ||
|
||
loop { | ||
while let Some(msg) = rx.recv().await { | ||
match msg { | ||
ShouldCheckPeerHealth => { | ||
check_peer_health(&mut lw_map, &lw_vec_keys); | ||
} | ||
ReceivedKeepAlive(lw) => { | ||
let key = lw.public_key.as_str(); | ||
let klw = lw_map.get_mut(key); | ||
if let Some(mut klw) = klw { | ||
match klw.status { | ||
Dead => { | ||
lw_vec_keys = create_local_worker(&mut lw_map, lw); | ||
} | ||
_ => { | ||
update_local_worker(&mut klw, lw); | ||
} | ||
} | ||
} else { | ||
lw_vec_keys = create_local_worker(&mut lw_map, lw); | ||
} | ||
} | ||
} | ||
} | ||
} | ||
} | ||
|
||
fn create_local_worker(lw_map: &mut LocalWorkerMap, lw: LocalWorkerIdentity) -> Vec<String> { | ||
let LocalWorkerIdentity { | ||
instance_name, | ||
instance_id, | ||
address_string, | ||
public_key, | ||
public_port, | ||
} = lw; | ||
|
||
let key = public_key.clone(); | ||
let address = Ipv4Addr::from_str(address_string.as_str()).expect( | ||
format!( | ||
"Invalid IP address for worker {}: {}", | ||
instance_name.as_str(), | ||
address_string.as_str() | ||
) | ||
.as_str(), | ||
); | ||
|
||
info!( | ||
"Hello new worker({}/{}).", | ||
instance_name.as_str(), | ||
public_key.as_str() | ||
); | ||
|
||
let ret = KnownLocalWorker { | ||
status: Active, | ||
address, | ||
instance_name, | ||
instance_id, | ||
address_string, | ||
public_key, | ||
public_port, | ||
lost_count: 0, | ||
lost_mark: false, | ||
}; | ||
lw_map.insert(key, ret); | ||
lw_map | ||
.iter() | ||
.filter_map(|(key, val)| match val.status.clone() { | ||
Dead => None, | ||
_ => Some(key.clone()), | ||
}) | ||
.collect::<Vec<String>>() | ||
} | ||
|
||
fn update_local_worker(klw: &mut KnownLocalWorker, lw: LocalWorkerIdentity) { | ||
let LocalWorkerIdentity { | ||
instance_name, | ||
instance_id, | ||
address_string, | ||
public_key, | ||
.. | ||
} = lw; | ||
|
||
if !(instance_id.eq(klw.instance_id.as_str())) { | ||
warn!( | ||
"Worker {} has changed instance id, it may have restarted.", | ||
instance_name.as_str() | ||
) | ||
} | ||
klw.instance_id = instance_id; | ||
|
||
if !(address_string.eq(klw.address_string.as_str())) { | ||
warn!( | ||
"Worker {} has changed its IP address, there may be IP address collisions.", | ||
instance_name.as_str() | ||
) | ||
} | ||
if !(public_key.eq(klw.public_key.as_str())) { | ||
error!( | ||
"[FATAL] Worker {} has changed public key, please check your network environment.", | ||
instance_name.as_str() | ||
) | ||
} | ||
|
||
match klw.status.clone() { | ||
Active => { | ||
klw.lost_mark = false; | ||
} | ||
Lost => { | ||
klw.status = Active; | ||
klw.lost_count = 0; | ||
klw.lost_mark = false; | ||
info!( | ||
"Worker {} has recovered from lost state.", | ||
instance_name.as_str() | ||
) | ||
} | ||
Dead => { | ||
warn!( | ||
"[FATAL] This is a bug, `update_local_worker` ran for dead worker {}.", | ||
instance_name.as_str() | ||
) | ||
} | ||
} | ||
} | ||
|
||
fn check_peer_health(lw_map: &mut LocalWorkerMap, lw_vec_keys: &Vec<String>) { | ||
let _ = lw_vec_keys.iter().for_each(|k| { | ||
let lw = lw_map.get_mut(k); | ||
if let Some(mut lw) = lw { | ||
match lw.status.clone() { | ||
Active => { | ||
if lw.lost_mark && (lw.lost_count > BROKER_PEER_LOST_COUNT) { | ||
lw.status = Lost; | ||
warn!( | ||
"Worker peer unreachable: {}/{}", | ||
lw.instance_name, lw.public_key | ||
); | ||
} | ||
lw.lost_mark = true; | ||
lw.lost_count += 1; | ||
} | ||
Lost => { | ||
if lw.lost_count > BROKER_PEER_DEAD_COUNT { | ||
lw.status = Dead; | ||
warn!("Worker peer dead: {}/{}", lw.instance_name, lw.public_key); | ||
} | ||
lw.lost_mark = true; | ||
lw.lost_count += 1; | ||
} | ||
Dead => { | ||
// ignored | ||
} | ||
} | ||
} | ||
}); | ||
} | ||
|
||
/// Liveness state the broker tracks for a local worker.
///
/// The enum is fieldless, so it is `Copy` and comparable; callers can read or
/// match on a status without cloning it.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum KnownLocalWorkerStatus {
    /// The worker is considered reachable.
    Active,
    /// The worker has exceeded the lost threshold during health checks.
    Lost,
    /// The worker has exceeded the dead threshold; health checks ignore it.
    Dead,
}
|
||
#[derive(Clone, Debug)] | ||
pub struct KnownLocalWorker { | ||
pub status: KnownLocalWorkerStatus, | ||
pub address: Ipv4Addr, | ||
pub address_string: String, | ||
pub public_port: u16, | ||
pub public_key: String, | ||
pub instance_name: String, | ||
pub instance_id: String, | ||
pub lost_count: u8, | ||
pub lost_mark: bool, | ||
} | ||
|
||
impl KnownLocalWorker {} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
use crate::mgmt::{BrokerMgmtShared, MyIdentity}; | ||
use crate::LocalWorkerManagerChannelMessage::ReceivedKeepAlive; | ||
use axum::http::StatusCode; | ||
use axum::response::IntoResponse; | ||
use axum::{extract::Extension, Json}; | ||
use serde::{Deserialize, Serialize}; | ||
use std::sync::Arc; | ||
|
||
#[derive(Clone, Serialize, Deserialize)] | ||
pub struct LocalWorkerIdentity { | ||
pub instance_name: String, | ||
pub instance_id: String, | ||
pub address_string: String, | ||
pub public_port: u16, | ||
pub public_key: String, | ||
} | ||
|
||
pub async fn handle_keepalive( | ||
Extension(shared): Extension<Arc<BrokerMgmtShared>>, | ||
Json(lwi): Json<LocalWorkerIdentity>, | ||
) -> impl IntoResponse { | ||
let config = &shared.config; | ||
let _ = shared.tx.clone().send(ReceivedKeepAlive(lwi.clone())).await; | ||
( | ||
StatusCode::IM_A_TEAPOT, | ||
Json(MyIdentity { | ||
instance_name: config.common.instance_name.to_string(), | ||
instance_id: config.instance_id.to_string(), | ||
}), | ||
) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
pub mod local_worker; | ||
|
||
use crate::LocalWorkerManagerChannelMessageSender; | ||
use axum::routing::put; | ||
use axum::{Extension, Router}; | ||
use hyper::Server; | ||
use log::info; | ||
use serde::{Deserialize, Serialize}; | ||
use service_network::config::PeerConfig; | ||
use service_network::runtime::AsyncRuntimeContext; | ||
use std::sync::Arc; | ||
|
||
pub struct BrokerMgmtShared { | ||
pub tx: LocalWorkerManagerChannelMessageSender, | ||
pub config: PeerConfig, | ||
} | ||
|
||
pub async fn start_server( | ||
tx: LocalWorkerManagerChannelMessageSender, | ||
_ctx: &AsyncRuntimeContext, | ||
config: &'static PeerConfig, | ||
) { | ||
let shared = BrokerMgmtShared { | ||
tx, | ||
config: config.clone(), | ||
}; | ||
tokio::spawn(async move { | ||
let router = create_router(); | ||
// TODO: add RSA identity for peer management api | ||
let bind_addr = config.common.mgmt_port; | ||
let bind_addr = format!("0.0.0.0:{}", bind_addr); | ||
info!("Starting management API on {}...", &bind_addr); | ||
let bind_addr = &bind_addr.parse().unwrap(); | ||
let router = router.layer(Extension(Arc::new(shared))); | ||
Server::bind(bind_addr) | ||
.serve(router.into_make_service()) | ||
.await | ||
.unwrap(); | ||
}) | ||
.await | ||
.expect("Failed to start management server"); | ||
} | ||
|
||
fn create_router() -> Router { | ||
let router = Router::new(); | ||
|
||
// Internal API v0 | ||
let router = router.route( | ||
"/v0/local_worker/keepalive", | ||
put(local_worker::handle_keepalive), | ||
); | ||
|
||
router | ||
} | ||
|
||
#[derive(Serialize, Deserialize)] | ||
pub struct MyIdentity { | ||
instance_name: String, | ||
instance_id: String, | ||
} |
Oops, something went wrong.