Skip to content

Commit

Permalink
chore: allow automatically replacing a node even if it is active as a…
Browse files Browse the repository at this point in the history
…n API BN (#3707)

When a node is redeployed, but hasn't been removed from the registry
before that, it is automatically replaced: the old node record is
removed and the new one created.

Initially, this was only possible for unassigned nodes. Then,
functionality was added such that this also worked for nodes assigned to
a subnet.

This PR enables the same behavior for API boundary nodes. When a node
that is currently active as API boundary node is redeployed, the old
node record and the old API boundary node record are removed, and a new
node record and a new API boundary node record are created.

We can't just use any node as API boundary node. The node needs to be
configured with a domain name. This case will fail as we have an
invariant in `invariants/api_boundary_node.rs` that ensures that each
API boundary node record has an associated node record with a domain
name.

In addition, I removed the test `should_replace_node_in_subnet` in
`do_remove_node_directly.rs` as it is a literal copy of
`should_replace_node_in_subnet_and_update_allowance` with one difference
in the comment of line 458 and 528.
  • Loading branch information
r-birkner authored Feb 4, 2025
1 parent 65a50e4 commit 30b3069
Show file tree
Hide file tree
Showing 4 changed files with 214 additions and 73 deletions.
4 changes: 2 additions & 2 deletions rs/nns/governance/unreleased_changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ on the process that this file is part of, see

Two new fields are added to the request, and one to the response.

The request now supports `page_size` and `page_number`. If `page_size` is greater than
The request now supports `page_size` and `page_number`. If `page_size` is greater than
`MAX_LIST_NEURONS_RESULTS` (currently 500), the API will treat it as `MAX_LIST_NEURONS_RESULTS`, and
continue procesisng the request. If `page_number` is None, the API will treat it as Some(0)

Expand All @@ -22,7 +22,7 @@ additional requests need to be made.

This will only affect neuron holders with more than 500 neurons, which is a small minority.

This allows neuron holders with many neurons to list all of their neurons, whereas before,
This allows neuron holders with many neurons to list all of their neurons, whereas before,
responses could be too large to be sent by the protocol.

### Migrating Active Neurons to Stable Memory
Expand Down
70 changes: 63 additions & 7 deletions rs/registry/canister/src/mutations/node_management/do_add_node.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,7 @@ impl Registry {
println!("{}do_add_node: The node id is {:?}", LOG_PREFIX, node_id);

// 2. Clear out any nodes that already exist at this IP.
// This will only succeed if:
// - the same NO was in control of the original nodes.
// - the nodes are no longer in subnets.
// This will only succeed if the same NO was in control of the original nodes.
//
// (We use the http endpoint to be in line with what is used by the
// release dashboard.)
Expand Down Expand Up @@ -118,7 +116,7 @@ impl Registry {
.transpose()?
.map(|node_reward_type| node_reward_type as i32);

// 5. Validate the domain is valid
// 5. Validate the domain
let domain: Option<String> = payload
.domain
.as_ref()
Expand All @@ -132,7 +130,7 @@ impl Registry {
})
.transpose()?;

// 6. If there is an IPv4 config, make sure that the IPv4 is not used by anyone else
// 6. If there is an IPv4 config, make sure that the same IPv4 address is not used by any other node
let ipv4_intf_config = payload.public_ipv4_config.clone().map(|ipv4_config| {
ipv4_config.panic_on_invalid();
IPv4InterfaceConfig {
Expand Down Expand Up @@ -322,10 +320,15 @@ mod tests {
use ic_base_types::{NodeId, PrincipalId};
use ic_config::crypto::CryptoConfig;
use ic_crypto_node_key_generation::generate_node_keys_once;
use ic_protobuf::registry::node_operator::v1::NodeOperatorRecord;
use ic_protobuf::registry::{
api_boundary_node::v1::ApiBoundaryNodeRecord, node_operator::v1::NodeOperatorRecord,
};
use ic_registry_canister_api::IPv4Config;
use ic_registry_keys::{make_node_operator_record_key, make_node_record_key};
use ic_registry_keys::{
make_api_boundary_node_record_key, make_node_operator_record_key, make_node_record_key,
};
use ic_registry_transport::insert;
use ic_types::ReplicaVersion;
use itertools::Itertools;
use lazy_static::lazy_static;
use prost::Message;
Expand Down Expand Up @@ -859,6 +862,59 @@ mod tests {
}
}

#[test]
fn should_add_node_and_replace_existing_api_boundary_node() {
// This test verifies that adding a new node replaces an existing node in a subnet
let mut registry = invariant_compliant_registry(0);

// Add a node to the registry
let (mutate_request, node_ids_and_dkg_pks) = prepare_registry_with_nodes(1, 1);
registry.maybe_apply_mutation_internal(mutate_request.mutations);
let node_ids: Vec<NodeId> = node_ids_and_dkg_pks.keys().cloned().collect();

let old_node_id = node_ids[0];
let old_node = registry.get_node(old_node_id).unwrap();

let node_operator_id = registry_add_node_operator_for_node(&mut registry, old_node_id, 0);

// Turn that node into an API boundary node
let api_bn = ApiBoundaryNodeRecord {
version: ReplicaVersion::default().to_string(),
};
registry.maybe_apply_mutation_internal(vec![insert(
make_api_boundary_node_record_key(old_node_id),
api_bn.encode_to_vec(),
)]);

// Add a new node with the same IP address and port as an existing node, which should replace the existing node
let (mut payload, _valid_pks) = prepare_add_node_payload(2);
let http = old_node.http.unwrap();
payload
.http_endpoint
.clone_from(&format!("[{}]:{}", http.ip_addr, http.port));
let new_node_id = registry
.do_add_node_(payload.clone(), node_operator_id)
.expect("failed to add a node");

// Verify that there is an API boundary node record for the new node
assert!(registry
.get(
make_api_boundary_node_record_key(new_node_id).as_bytes(),
registry.latest_version()
)
.is_some());

// Verify the old node is removed from the registry
assert!(registry.get_node(old_node_id).is_none());

// Verify the new node is present in the registry
assert!(registry.get_node(new_node_id).is_some());

// Verify node operator allowance is unchanged
let updated_operator = get_node_operator_record(&registry, node_operator_id).unwrap();
assert_eq!(updated_operator.node_allowance, 0);
}

#[test]
#[should_panic(expected = "Node allowance for this Node Operator is exhausted")]
fn should_panic_if_node_allowance_is_exhausted() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use dfn_core::println;
use ic_base_types::{NodeId, PrincipalId};
use ic_registry_keys::{make_api_boundary_node_record_key, make_subnet_record_key};
use ic_registry_transport::pb::v1::RegistryMutation;
use ic_registry_transport::upsert;
use ic_registry_transport::{delete, insert, upsert};
use prost::Message;

impl Registry {
Expand Down Expand Up @@ -51,8 +51,9 @@ impl Registry {
}

// Prepare mutations for removing or replacing a node in the registry.
// If new_node_id is Some, the old node is in-place replaced with the new node, even if the old node is in a subnet.
// If new_node_id is None, the old node is only removed from the registry and is not allowed to be in a subnet.
// * If new_node_id is Some, the old node is in-place replaced with the new node, even if the old node is
// in active use (i.e., assigned to a subnet or acts as an API boundary node).
// * If new_node_id is None, the old node is only removed from the registry and is not allowed to be in active use.
pub fn make_remove_or_replace_node_mutations(
&mut self,
payload: RemoveNodeDirectlyPayload,
Expand Down Expand Up @@ -121,21 +122,32 @@ impl Registry {
);
}

// 3. Ensure the node is not an API Boundary Node.
// In order to succeed, a corresponding ApiBoundaryNodeRecord should be removed first via proposal.
let api_bn_id = self.get_api_boundary_node_record(payload.node_id);
if api_bn_id.is_some() {
panic!(
"{}do_remove_node_directly: Cannot remove a node, as it has ApiBoundaryNodeRecord with record_key: {}",
LOG_PREFIX,
make_api_boundary_node_record_key(payload.node_id)
);
let mut mutations = vec![];

// 3. Check if the node is an API Boundary Node. If there is a replacement node, remove the existing node
// and try to assign the new one to act as API boundary node. This will only work if the new node meets all
// the requirements of an API boundary node (e.g., it is configured with a domain name).
if let Some(api_bn_record) = self.get_api_boundary_node_record(payload.node_id) {
let Some(replacement_node_id) = new_node_id else {
panic!(
"{}do_remove_node_directly: Cannot remove this node, as it is an active API boundary node: {}",
LOG_PREFIX,
make_api_boundary_node_record_key(payload.node_id)
);
};

// remove the existing API boundary node record
let old_key = make_api_boundary_node_record_key(payload.node_id);
mutations.push(delete(old_key));

// create the new API boundary node record by just cloning the old one and inserting it with the new key
let new_key = make_api_boundary_node_record_key(replacement_node_id);
mutations.push(insert(new_key, api_bn_record.clone().encode_to_vec()));
}

// 4. Check if node is in a subnet, and if so, replace it in the subnet by updating the membership in the subnet record.
let subnet_list_record = get_subnet_list_record(self);
let is_node_in_subnet = find_subnet_for_node(self, payload.node_id, &subnet_list_record);
let mut mutations = vec![];
if let Some(subnet_id) = is_node_in_subnet {
if new_node_id.is_some() {
// The node is in a subnet and is being replaced with a new node.
Expand All @@ -157,10 +169,10 @@ impl Registry {
&mut subnet_record,
subnet_membership,
);
mutations = vec![upsert(
mutations.push(upsert(
make_subnet_record_key(subnet_id),
subnet_record.encode_to_vec(),
)];
));
} else {
panic!("{}do_remove_node_directly: Cannot remove a node that is a member of a subnet. This node is a member of Subnet: {}",
LOG_PREFIX,
Expand Down Expand Up @@ -217,7 +229,7 @@ mod tests {
api_boundary_node::v1::ApiBoundaryNodeRecord, node_operator::v1::NodeOperatorRecord,
};
use ic_registry_keys::{make_node_operator_record_key, make_node_record_key};
use ic_registry_transport::insert;
use ic_registry_transport::{insert, update};
use ic_types::ReplicaVersion;
use prost::Message;
use std::str::FromStr;
Expand All @@ -234,8 +246,8 @@ mod tests {
}

#[test]
#[should_panic(expected = "Cannot remove a node, as it has ApiBoundaryNodeRecord")]
fn should_panic_if_node_is_api_boundary_node() {
#[should_panic(expected = "Cannot remove this node, as it is an active API boundary node")]
fn should_panic_if_node_is_api_boundary_node_and_no_replacement() {
let mut registry = invariant_compliant_registry(0);
// Add node to registry
let (mutate_request, node_ids_and_dkg_pks) =
Expand All @@ -261,6 +273,119 @@ mod tests {
registry.do_remove_node(payload, node_operator_id);
}

#[test]
fn should_succeed_to_replace_api_boundary_node() {
let mut registry = invariant_compliant_registry(0);
// Add node to registry
let (mutate_request, node_ids_and_dkg_pks) =
prepare_registry_with_nodes(1 /* mutation id */, 2 /* node count */);
registry.maybe_apply_mutation_internal(mutate_request.mutations);
let mut node_ids = node_ids_and_dkg_pks.keys();
let old_node_id = node_ids
.next()
.expect("should contain at least one node ID")
.to_owned();
let new_node_id = node_ids
.next()
.expect("should contain at least two node IDs")
.to_owned();

let node_operator_id = registry_add_node_operator_for_node(&mut registry, old_node_id, 0);

// turn first node into an API BN by adding the record to the registry
let api_bn = ApiBoundaryNodeRecord {
version: ReplicaVersion::default().to_string(),
};
registry.maybe_apply_mutation_internal(vec![insert(
make_api_boundary_node_record_key(old_node_id),
api_bn.encode_to_vec(),
)]);
let payload = RemoveNodeDirectlyPayload {
node_id: old_node_id,
};

registry.do_replace_node_with_another(payload, node_operator_id, new_node_id);

// Verify the removed node's API boundary node record has been removed
assert!(registry
.get(
make_api_boundary_node_record_key(old_node_id).as_bytes(),
registry.latest_version()
)
.is_none());

// Verify the replacement node has been turned into an API boundary node
assert!(registry
.get(
make_api_boundary_node_record_key(new_node_id).as_bytes(),
registry.latest_version()
)
.is_some());

// Verify the old node is removed from the registry
assert!(registry
.get(
make_node_record_key(old_node_id).as_bytes(),
registry.latest_version()
)
.is_none());

// Verify the new node is present in the registry
assert!(registry.get_node(new_node_id).is_some());

// Verify node operator allowance set to 1
let updated_operator = get_node_operator_record(&registry, node_operator_id).unwrap();
assert_eq!(updated_operator.node_allowance, 1);
}

#[test]
#[should_panic(
expected = "invariant check failed with message: domain field of the NodeRecord"
)]
fn should_panic_to_replace_api_boundary_node_if_new_node_has_no_domain() {
let mut registry = invariant_compliant_registry(0);
// Add node to registry
let (mutate_request, node_ids_and_dkg_pks) =
prepare_registry_with_nodes(1 /* mutation id */, 2 /* node count */);
registry.maybe_apply_mutation_internal(mutate_request.mutations);
let mut node_ids = node_ids_and_dkg_pks.keys();
let old_node_id = node_ids
.next()
.expect("should contain at least one node ID")
.to_owned();
let new_node_id = node_ids
.next()
.expect("should contain at least two node IDs")
.to_owned();

let node_operator_id = registry_add_node_operator_for_node(&mut registry, old_node_id, 0);

// turn first node into an API BN by adding the record to the registry
let api_bn = ApiBoundaryNodeRecord {
version: ReplicaVersion::default().to_string(),
};
registry.maybe_apply_mutation_internal(vec![insert(
make_api_boundary_node_record_key(old_node_id),
api_bn.encode_to_vec(),
)]);

// remove the default domain name from the replacement node such that the API boundary node invariant check fails
let mut node_record = registry.get_node_or_panic(new_node_id);
node_record.domain = None;
let update_node_record = update(
make_node_record_key(new_node_id).as_bytes(),
node_record.encode_to_vec(),
);
let mutations = vec![update_node_record];
registry.maybe_apply_mutation_internal(mutations);

let payload = RemoveNodeDirectlyPayload {
node_id: old_node_id,
};

registry.do_replace_node_with_another(payload, node_operator_id, new_node_id);
}

#[test]
fn should_succeed() {
let mut registry = invariant_compliant_registry(0);
Expand Down Expand Up @@ -441,52 +566,6 @@ mod tests {
// Should fail because the DC of operator1 and operator2 does not match
registry.do_remove_node(payload, operator2_id);
}
#[test]
fn should_replace_node_in_subnet() {
let mut registry = invariant_compliant_registry(0);

// Add nodes to the registry
let (mutate_request, node_ids_and_dkg_pks) = prepare_registry_with_nodes(1, 2);
registry.maybe_apply_mutation_internal(mutate_request.mutations);
let node_ids = node_ids_and_dkg_pks.keys().cloned().collect::<Vec<_>>();
let node_operator_id = registry_add_node_operator_for_node(&mut registry, node_ids[0], 0);

// Create a subnet with the first node
let subnet_id =
registry_create_subnet_with_nodes(&mut registry, &node_ids_and_dkg_pks, &[0]);

// Replace the node_ids[0] with node_ids[1], while node_ids[0] is in a subnet
let payload = RemoveNodeDirectlyPayload {
node_id: node_ids[0],
};

registry.do_replace_node_with_another(payload, node_operator_id, node_ids[1]);

// Verify the subnet record is updated with the new node
let expected_membership: Vec<NodeId> = vec![node_ids[1]];
let actual_membership: Vec<NodeId> = registry
.get_subnet_or_panic(subnet_id)
.membership
.iter()
.map(|bytes| NodeId::from(PrincipalId::try_from(bytes).unwrap()))
.collect();
assert_eq!(actual_membership, expected_membership);

// Verify the old node is removed from the registry
assert!(registry
.get(
make_node_record_key(node_ids[0]).as_bytes(),
registry.latest_version()
)
.is_none());

// Verify the new node is present in the registry
assert!(registry.get_node(node_ids[1]).is_some());

// Verify node operator allowance increased by 1
let updated_operator = get_node_operator_record(&registry, node_operator_id).unwrap();
assert_eq!(updated_operator.node_allowance, 1);
}

#[test]
#[should_panic(expected = "Cannot remove a node that is a member of a subnet")]
Expand Down
Loading

0 comments on commit 30b3069

Please sign in to comment.