From 592ac510efa3741ea778ec6b994318ac7ef79470 Mon Sep 17 00:00:00 2001 From: Chris O'Neil Date: Thu, 11 Apr 2024 18:06:42 +0100 Subject: [PATCH] feat: distinguish failure to start during upgrade It's possible for a service to be upgraded but then not subsequently start. In this case, it has still been upgraded to a new version. It's worth making a distinction to the user between an actual error in the upgrade process, or a failure to start, because in the latter case, they do actually have an upgrade. They can then take action to try and start their services again. As part of this change, the start process attempts to find whether the service process did indeed start, because you don't always seem to get errors back from the service infrastructure. We also make sure that we return an error if there was a failure with the upgrade process for any services. This is necessary for visibility on our own deploy process. --- sn_node_manager/src/add_services/tests.rs | 4 +- sn_node_manager/src/cmd/mod.rs | 7 + sn_node_manager/src/cmd/node.rs | 15 +- sn_node_manager/src/lib.rs | 256 +++++++++++++++++++++- sn_service_management/src/control.rs | 23 +- sn_service_management/src/daemon.rs | 2 +- sn_service_management/src/error.rs | 2 +- sn_service_management/src/faucet.rs | 2 +- sn_service_management/src/lib.rs | 1 + 9 files changed, 289 insertions(+), 23 deletions(-) diff --git a/sn_node_manager/src/add_services/tests.rs b/sn_node_manager/src/add_services/tests.rs index b7fab7bdc5..b8497b68d2 100644 --- a/sn_node_manager/src/add_services/tests.rs +++ b/sn_node_manager/src/add_services/tests.rs @@ -31,7 +31,7 @@ use sn_service_management::{ use std::{ ffi::OsString, net::{IpAddr, Ipv4Addr, SocketAddr}, - path::PathBuf, + path::{Path, PathBuf}, str::FromStr, }; @@ -54,7 +54,7 @@ mock! { fn create_service_user(&self, username: &str) -> ServiceControlResult<()>; fn get_available_port(&self) -> ServiceControlResult; fn install(&self, install_ctx: ServiceInstallCtx) -> ServiceControlResult<()>; - fn get_process_pid(&self, name: &str) -> ServiceControlResult; + fn get_process_pid(&self, bin_path: &Path) -> ServiceControlResult; fn is_service_process_running(&self, pid: u32) -> bool; fn start(&self, service_name: &str) -> ServiceControlResult<()>; fn stop(&self, service_name: &str) -> ServiceControlResult<()>; diff --git a/sn_node_manager/src/cmd/mod.rs b/sn_node_manager/src/cmd/mod.rs index 4634d3b09d..07e38f8c83 100644 --- a/sn_node_manager/src/cmd/mod.rs +++ b/sn_node_manager/src/cmd/mod.rs @@ -75,6 +75,13 @@ pub fn print_upgrade_summary(upgrade_summary: Vec<(String, UpgradeResult)>) { service_name ); } + UpgradeResult::UpgradedButNotStarted(previous_version, new_version, _) => { + println!( + "{} {} was upgraded from {previous_version} to {new_version} but it did not start", + "✕".red(), + service_name + ); + } UpgradeResult::Forced(previous_version, target_version) => { println!( "{} Forced {} version change from {previous_version} to {target_version}.", diff --git a/sn_node_manager/src/cmd/node.rs b/sn_node_manager/src/cmd/node.rs index 16c8ed6eb6..a8352752cb 100644 --- a/sn_node_manager/src/cmd/node.rs +++ b/sn_node_manager/src/cmd/node.rs @@ -18,7 +18,7 @@ use crate::{ helpers::{download_and_extract_release, get_bin_version}, refresh_node_registry, status_report, ServiceManager, VerbosityLevel, }; -use color_eyre::{eyre::eyre, Result}; +use color_eyre::{eyre::eyre, Help, Result}; use colored::Colorize; use libp2p_identity::PeerId; use semver::Version; @@ -358,9 +358,18 @@ pub async fn upgrade( } } - print_upgrade_summary(upgrade_summary); - node_registry.save()?; + print_upgrade_summary(upgrade_summary.clone()); + + if upgrade_summary.iter().any(|(_, r)| { + matches!(r, UpgradeResult::Error(_)) + || matches!(r, UpgradeResult::UpgradedButNotStarted(_, _, _)) + }) { + return Err(eyre!("There was a problem upgrading one or more nodes").suggestion( + "For any services that were upgraded but did not start, you can attempt to start them \ + again using the 'start' command.")); + } + Ok(()) } diff --git a/sn_node_manager/src/lib.rs b/sn_node_manager/src/lib.rs index b60860d75e..c16c6016f7 100644 --- a/sn_node_manager/src/lib.rs +++ b/sn_node_manager/src/lib.rs @@ -44,6 +44,7 @@ use sn_service_management::{ NodeRegistry, NodeServiceData, ServiceStateActions, ServiceStatus, UpgradeOptions, UpgradeResult, }; +use tracing::debug; pub const DAEMON_DEFAULT_PORT: u16 = 12500; pub const DAEMON_SERVICE_NAME: &str = "safenodemand"; @@ -91,6 +92,31 @@ impl ServiceManager { self.service_control.start(&self.service.name())?; self.service_control.wait(RPC_START_UP_DELAY_MS); + // This is an attempt to see whether the service process has actually launched. You don't + // always get an error from the service infrastructure. + // + // There might be many different `safenode` processes running, but since each service has + // its own isolated binary, we use the binary path to uniquely identify it. + match self + .service_control + .get_process_pid(&self.service.bin_path()) + { + Ok(pid) => { + debug!( + "Service process started for {} with PID {}", + self.service.name(), + pid + ); + } + Err(sn_service_management::error::Error::ServiceProcessNotFound(_)) => { + return Err(eyre!( + "The '{}' service has failed to start", + self.service.name() + )); + } + Err(e) => return Err(e.into()), + } + self.service.on_start().await?; println!("{} Started {} service", "✓".green(), self.service.name()); @@ -236,7 +262,18 @@ impl ServiceManager { )?; if options.start_service { - self.start().await?; + match self.start().await { + Ok(()) => {} + Err(e) => { + self.service + .set_version(&options.target_version.to_string()); + return Ok(UpgradeResult::UpgradedButNotStarted( + current_version.to_string(), + options.target_version.to_string(), + e.to_string(), + )); + } + } } self.service .set_version(&options.target_version.to_string()); @@ -444,14 +481,14 @@ mod tests { use predicates::prelude::*; use service_manager::ServiceInstallCtx; use sn_service_management::{ - error::Result as ServiceControlResult, + error::{Error as ServiceControlError, Result as ServiceControlResult}, node::{NodeService, NodeServiceData}, rpc::{NetworkInfo, NodeInfo, RecordAddress, RpcActions}, UpgradeOptions, UpgradeResult, }; use std::{ net::{IpAddr, Ipv4Addr, SocketAddr}, - path::PathBuf, + path::{Path, PathBuf}, str::FromStr, }; @@ -475,7 +512,7 @@ mod tests { fn create_service_user(&self, username: &str) -> ServiceControlResult<()>; fn get_available_port(&self) -> ServiceControlResult; fn install(&self, install_ctx: ServiceInstallCtx) -> ServiceControlResult<()>; - fn get_process_pid(&self, name: &str) -> ServiceControlResult; + fn get_process_pid(&self, bin_path: &Path) -> ServiceControlResult; fn is_service_process_running(&self, pid: u32) -> bool; fn start(&self, service_name: &str) -> ServiceControlResult<()>; fn stop(&self, service_name: &str) -> ServiceControlResult<()>; @@ -499,6 +536,13 @@ mod tests { .with(eq(3000)) .times(1) .returning(|_| ()); + mock_service_control + .expect_get_process_pid() + .with(eq(PathBuf::from( + "/var/safenode-manager/services/safenode1/safenode", + ))) + .times(1) + .returning(|_| Ok(100)); mock_rpc_client.expect_node_info().times(1).returning(|| { Ok(NodeInfo { pid: 1000, @@ -575,6 +619,13 @@ mod tests { .with(eq(3000)) .times(1) .returning(|_| ()); + mock_service_control + .expect_get_process_pid() + .with(eq(PathBuf::from( + "/var/safenode-manager/services/safenode1/safenode", + ))) + .times(1) + .returning(|_| Ok(100)); mock_rpc_client.expect_node_info().times(1).returning(|| { Ok(NodeInfo { pid: 1000, @@ -697,6 +748,11 @@ mod tests { let mut mock_service_control = MockServiceControl::new(); let mut mock_rpc_client = MockRpcClient::new(); + mock_service_control + .expect_is_service_process_running() + .with(eq(1000)) + .times(1) + .returning(|_| false); mock_service_control .expect_start() .with(eq("safenode1")) @@ -708,10 +764,12 @@ mod tests { .times(1) .returning(|_| ()); mock_service_control - .expect_is_service_process_running() - .with(eq(1000)) + .expect_get_process_pid() + .with(eq(PathBuf::from( + "/var/safenode-manager/services/safenode1/safenode", + ))) .times(1) - .returning(|_| false); + .returning(|_| Ok(100)); mock_rpc_client.expect_node_info().times(1).returning(|| { Ok(NodeInfo { pid: 1000, @@ -775,6 +833,65 @@ mod tests { Ok(()) } + #[tokio::test] + async fn start_should_return_an_error_if_the_process_was_not_found() -> Result<()> { + let mut mock_service_control = MockServiceControl::new(); + + mock_service_control + .expect_start() + .with(eq("safenode1")) + .times(1) + .returning(|_| Ok(())); + mock_service_control + .expect_wait() + .with(eq(3000)) + .times(1) + .returning(|_| ()); + mock_service_control + .expect_get_process_pid() + .with(eq(PathBuf::from( + "/var/safenode-manager/services/safenode1/safenode", + ))) + .times(1) + .returning(|_| { + Err(ServiceControlError::ServiceProcessNotFound( + "/var/safenode-manager/services/safenode1/safenode".to_string(), + )) + }); + + let mut service_data = NodeServiceData { + connected_peers: None, + data_dir_path: PathBuf::from("/var/safenode-manager/services/safenode1"), + genesis: false, + listen_addr: None, + local: false, + log_dir_path: PathBuf::from("/var/log/safenode/safenode1"), + number: 1, + peer_id: None, + pid: None, + rpc_socket_addr: SocketAddr::new(IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)), 8081), + safenode_path: PathBuf::from("/var/safenode-manager/services/safenode1/safenode"), + service_name: "safenode1".to_string(), + status: ServiceStatus::Added, + user: "safe".to_string(), + version: "0.98.1".to_string(), + }; + let service = NodeService::new(&mut service_data, Box::new(MockRpcClient::new())); + let mut service_manager = ServiceManager::new( + service, + Box::new(mock_service_control), + VerbosityLevel::Normal, + ); + + let result = service_manager.start().await; + match result { + Ok(_) => panic!("This test should have resulted in an error"), + Err(e) => assert_eq!("The 'safenode1' service has failed to start", e.to_string()), + } + + Ok(()) + } + #[tokio::test] async fn stop_should_stop_a_running_service() -> Result<()> { let mut mock_service_control = MockServiceControl::new(); @@ -994,6 +1111,11 @@ mod tests { .with(eq(3000)) .times(1) .returning(|_| ()); + mock_service_control + .expect_get_process_pid() + .with(eq(current_node_bin.to_path_buf().clone())) + .times(1) + .returning(|_| Ok(100)); mock_rpc_client.expect_node_info().times(1).returning(|| { Ok(NodeInfo { pid: 2000, @@ -1185,6 +1307,11 @@ mod tests { .with(eq(3000)) .times(1) .returning(|_| ()); + mock_service_control + .expect_get_process_pid() + .with(eq(current_node_bin.to_path_buf().clone())) + .times(1) + .returning(|_| Ok(100)); mock_rpc_client.expect_node_info().times(1).returning(|| { Ok(NodeInfo { pid: 2000, @@ -1401,6 +1528,121 @@ mod tests { Ok(()) } + #[tokio::test] + async fn upgrade_should_return_upgraded_but_not_started_if_service_did_not_start() -> Result<()> + { + let current_version = "0.1.0"; + let target_version = "0.2.0"; + + let tmp_data_dir = assert_fs::TempDir::new()?; + let current_install_dir = tmp_data_dir.child("safenode_install"); + current_install_dir.create_dir_all()?; + + let current_node_bin = current_install_dir.child("safenode"); + current_node_bin.write_binary(b"fake safenode binary")?; + let target_node_bin = tmp_data_dir.child("safenode"); + target_node_bin.write_binary(b"fake safenode binary")?; + + let current_node_bin_str = current_node_bin.to_path_buf().to_string_lossy().to_string(); + + let mut mock_service_control = MockServiceControl::new(); + + // before binary upgrade + mock_service_control + .expect_is_service_process_running() + .with(eq(1000)) + .times(1) + .returning(|_| true); + mock_service_control + .expect_stop() + .with(eq("safenode1")) + .times(1) + .returning(|_| Ok(())); + + // after binary upgrade + mock_service_control + .expect_uninstall() + .with(eq("safenode1")) + .times(1) + .returning(|_| Ok(())); + mock_service_control + .expect_install() + .with(always()) + .times(1) + .returning(|_| Ok(())); + + // after service restart + mock_service_control + .expect_start() + .with(eq("safenode1")) + .times(1) + .returning(|_| Ok(())); + mock_service_control + .expect_wait() + .with(eq(3000)) + .times(1) + .returning(|_| ()); + mock_service_control + .expect_get_process_pid() + .with(eq(current_node_bin.to_path_buf().clone())) + .times(1) + .returning(move |_| { + Err(ServiceControlError::ServiceProcessNotFound( + current_node_bin_str.clone(), + )) + }); + + let mut service_data = NodeServiceData { + connected_peers: None, + data_dir_path: PathBuf::from("/var/safenode-manager/services/safenode1"), + genesis: false, + listen_addr: None, + local: false, + log_dir_path: PathBuf::from("/var/log/safenode/safenode1"), + number: 1, + peer_id: Some(PeerId::from_str( + "12D3KooWS2tpXGGTmg2AHFiDh57yPQnat49YHnyqoggzXZWpqkCR", + )?), + pid: Some(1000), + rpc_socket_addr: SocketAddr::new(IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)), 8081), + safenode_path: current_node_bin.to_path_buf(), + service_name: "safenode1".to_string(), + status: ServiceStatus::Running, + user: "safe".to_string(), + version: current_version.to_string(), + }; + let service = NodeService::new(&mut service_data, Box::new(MockRpcClient::new())); + let mut service_manager = ServiceManager::new( + service, + Box::new(mock_service_control), + VerbosityLevel::Normal, + ); + + let upgrade_result = service_manager + .upgrade(UpgradeOptions { + bootstrap_peers: Vec::new(), + env_variables: None, + force: false, + start_service: true, + target_bin_path: target_node_bin.to_path_buf(), + target_version: Version::parse(target_version).unwrap(), + }) + .await?; + + match upgrade_result { + UpgradeResult::UpgradedButNotStarted(old_version, new_version, _) => { + assert_eq!(old_version, current_version); + assert_eq!(new_version, target_version); + } + _ => panic!( + "Expected UpgradeResult::UpgradedButNotStarted but was {:#?}", + upgrade_result + ), + } + + Ok(()) + } + #[tokio::test] async fn remove_should_remove_an_added_node() -> Result<()> { let temp_dir = assert_fs::TempDir::new()?; diff --git a/sn_service_management/src/control.rs b/sn_service_management/src/control.rs index 11bc2e7218..8a8a6f5565 100644 --- a/sn_service_management/src/control.rs +++ b/sn_service_management/src/control.rs @@ -12,7 +12,10 @@ use service_manager::{ ServiceInstallCtx, ServiceLabel, ServiceManager, ServiceStartCtx, ServiceStopCtx, ServiceUninstallCtx, }; -use std::net::{SocketAddr, TcpListener}; +use std::{ + net::{SocketAddr, TcpListener}, + path::Path, +}; use sysinfo::{Pid, System}; /// A thin wrapper around the `service_manager::ServiceManager`, which makes our own testing @@ -26,7 +29,7 @@ pub trait ServiceControl: Sync { fn create_service_user(&self, username: &str) -> Result<()>; fn get_available_port(&self) -> Result; fn install(&self, install_ctx: ServiceInstallCtx) -> Result<()>; - fn get_process_pid(&self, name: &str) -> Result; + fn get_process_pid(&self, path: &Path) -> Result; fn is_service_process_running(&self, pid: u32) -> bool; fn start(&self, service_name: &str) -> Result<()>; fn stop(&self, service_name: &str) -> Result<()>; @@ -161,17 +164,21 @@ impl ServiceControl for ServiceController { Ok(port) } - fn get_process_pid(&self, name: &str) -> Result { + fn get_process_pid(&self, bin_path: &Path) -> Result { let mut system = System::new_all(); system.refresh_all(); for (pid, process) in system.processes() { - if process.name() == name { - // There does not seem to be any easy way to get the process ID from the `Pid` - // type. Probably something to do with representing it in a cross-platform way. - return Ok(pid.to_string().parse::()?); + if let Some(path) = process.exe() { + if bin_path == path { + // There does not seem to be any easy way to get the process ID from the `Pid` + // type. Probably something to do with representing it in a cross-platform way. + return Ok(pid.to_string().parse::()?); + } } } - Err(Error::ServiceProcessNotFound(name.to_string())) + Err(Error::ServiceProcessNotFound( + bin_path.to_string_lossy().to_string(), + )) } fn install(&self, install_ctx: ServiceInstallCtx) -> Result<()> { diff --git a/sn_service_management/src/daemon.rs b/sn_service_management/src/daemon.rs index 72c8da6135..4822303e60 100644 --- a/sn_service_management/src/daemon.rs +++ b/sn_service_management/src/daemon.rs @@ -96,7 +96,7 @@ impl<'a> ServiceStateActions for DaemonService<'a> { // get_process_pid causes errors for the daemon. Maybe because it is being run as root? if let Ok(pid) = self .service_control - .get_process_pid(&self.service_data.service_name) + .get_process_pid(&self.service_data.daemon_path) { self.service_data.pid = Some(pid); } diff --git a/sn_service_management/src/error.rs b/sn_service_management/src/error.rs index 354e5ea864..726775ee69 100644 --- a/sn_service_management/src/error.rs +++ b/sn_service_management/src/error.rs @@ -43,7 +43,7 @@ pub enum Error { RpcNodeUpdateError(String), #[error("Could not obtain record addresses through RPC: {0}")] RpcRecordAddressError(String), - #[error("Could not find process '{0}'")] + #[error("Could not find process at '{0}'")] ServiceProcessNotFound(String), #[error("The user may have removed the '{0}' service outwith the node manager")] ServiceRemovedManually(String), diff --git a/sn_service_management/src/faucet.rs b/sn_service_management/src/faucet.rs index 7607278bef..fc30f6ad91 100644 --- a/sn_service_management/src/faucet.rs +++ b/sn_service_management/src/faucet.rs @@ -102,7 +102,7 @@ impl<'a> ServiceStateActions for FaucetService<'a> { async fn on_start(&mut self) -> Result<()> { let pid = self .service_control - .get_process_pid(&self.service_data.service_name)?; + .get_process_pid(&self.service_data.faucet_path)?; self.service_data.pid = Some(pid); self.service_data.status = ServiceStatus::Running; Ok(()) diff --git a/sn_service_management/src/lib.rs b/sn_service_management/src/lib.rs index 457e0926bd..9a2dca1f5e 100644 --- a/sn_service_management/src/lib.rs +++ b/sn_service_management/src/lib.rs @@ -49,6 +49,7 @@ pub enum UpgradeResult { Forced(String, String), NotRequired, Upgraded(String, String), + UpgradedButNotStarted(String, String, String), Error(String), }