From 172a79c8a57073a17176bf52621f932787177e1c Mon Sep 17 00:00:00 2001 From: Nikita Zheleztsov Date: Wed, 6 Dec 2023 09:17:09 +0300 Subject: [PATCH] replicaset: introduce name validation In case of name identification, no UUID may be passed at all, so we cannot verify only UUID, when connecting to storage. It seems impossible to extend the current net.box greeting by exposing net_box.conn.name to it, as iproto greeting doesn't have enough free space to save 64 bit instance name. So, we should deal with name validation on vshard side. Let's validate name only during connection establishing, not on every reconnect, as it's done for UUID now. The motivation for that, is the fact, that it's not cheap to validate name. Closes #426 NO_DOC=internal --- test/replicaset-luatest/replicaset_3_test.lua | 10 +++- test/storage/storage.result | 8 +-- test/storage/storage.test.lua | 4 +- vshard/error.lua | 5 ++ vshard/replicaset.lua | 50 ++++++++++++++++++- vshard/storage/init.lua | 1 + 6 files changed, 68 insertions(+), 10 deletions(-) diff --git a/test/replicaset-luatest/replicaset_3_test.lua b/test/replicaset-luatest/replicaset_3_test.lua index 5d53761f..46a49985 100644 --- a/test/replicaset-luatest/replicaset_3_test.lua +++ b/test/replicaset-luatest/replicaset_3_test.lua @@ -265,9 +265,15 @@ test_group.test_named_replicaset = function(g) t.assert_equals(rs.id, rs.name) t.assert_equals(replica_1_a.id, replica_1_a.name) - -- Just to be sure, that it works. - local uuid_a = g.replica_1_a:instance_uuid() + -- Name is not set, name mismatch error. local ret, err = rs:callrw('get_uuid', {}, timeout_opts) + t.assert_equals(err.name, 'INSTANCE_NAME_MISMATCH') + t.assert_equals(ret, nil) + + -- Set name, everything works from now on. + g.replica_1_a:exec(function() box.cfg{instance_name = 'replica_1_a'} end) + local uuid_a = g.replica_1_a:instance_uuid() + ret, err = rs:callrw('get_uuid', {}, timeout_opts) t.assert_equals(err, nil) t.assert_equals(ret, uuid_a) diff --git a/test/storage/storage.result b/test/storage/storage.result index 37c6ac78..8cd1cdf4 100644 --- a/test/storage/storage.result +++ b/test/storage/storage.result @@ -1014,16 +1014,16 @@ vshard.storage.internal.errinj.ERRINJ_RECOVERY_PAUSE = false -- -- Internal info function. -- -vshard.storage._call('info') +vshard.storage._call('info').is_master --- -- is_master: true +- true ... _ = test_run:switch('storage_1_b') --- ... -vshard.storage._call('info') +vshard.storage._call('info').is_master --- -- is_master: false +- false ... -- -- gh-123, gh-298: storage auto-enable/disable depending on instance state. diff --git a/test/storage/storage.test.lua b/test/storage/storage.test.lua index 8fd8cd26..6c3b5515 100644 --- a/test/storage/storage.test.lua +++ b/test/storage/storage.test.lua @@ -327,9 +327,9 @@ vshard.storage.internal.errinj.ERRINJ_RECOVERY_PAUSE = false -- -- Internal info function. -- -vshard.storage._call('info') +vshard.storage._call('info').is_master _ = test_run:switch('storage_1_b') -vshard.storage._call('info') +vshard.storage._call('info').is_master -- -- gh-123, gh-298: storage auto-enable/disable depending on instance state. diff --git a/vshard/error.lua b/vshard/error.lua index 3042f88a..0a24d862 100644 --- a/vshard/error.lua +++ b/vshard/error.lua @@ -202,6 +202,11 @@ local error_message_template = { msg = 'Bucket %s update is invalid: %s', args = {'bucket_id', 'reason'}, }, + [40] = { + name = 'INSTANCE_NAME_MISMATCH', + msg = 'Mismatch server name: expected "%s", but got "%s"', + args = {'expected_name', 'actual_name'}, + } } -- diff --git a/vshard/replicaset.lua b/vshard/replicaset.lua index 4e20bbaf..c8341bd8 100644 --- a/vshard/replicaset.lua +++ b/vshard/replicaset.lua @@ -85,6 +85,12 @@ local function netbox_on_connect(conn) -- If a replica's connection has revived, then unset -- replica.down_ts - it is not down anymore. replica.down_ts = nil + if conn.name then + -- Name is checked during the first call to the instance. + local opts = {is_async = true} + conn.name = conn:call('vshard.storage._call', {'info'}, opts) + conn.name_cond:broadcast() + end if replica.uuid and conn.peer_uuid ~= replica.uuid and -- XXX: Zero UUID means not a real Tarantool instance. It -- is likely to be a cartridge.remote-control server, @@ -225,6 +231,13 @@ local function replicaset_connect_to_replica(replicaset, replica) conn:on_connect(netbox_on_connect) conn.on_disconnect_ref = netbox_on_disconnect conn:on_disconnect(netbox_on_disconnect) + -- Whether name validation is needed. + conn.name = replica.id == replica.name + if conn.name then + -- Replica call can be done before the connection is established. + -- Create fiber condition to wait for future object appearance. + conn.name_cond = fiber.cond() + end replica.conn = conn end return conn @@ -402,6 +415,40 @@ local function replica_on_success_request(replica) end end +-- +-- Validate instance_name if neeed and make a call. +-- +local function replica_conn_call(replica, func, args, opts) + local conn = replica.conn + if conn.name then + if conn.name == true and + not fiber_cond_wait(conn.name_cond, opts.timeout) then + -- Connection have not been established. + return nil, lerror.timeout() + end + -- It's future object now. + assert(type(conn.name) == 'userdata') + conn.name_cond = nil + -- It's the first call to instance, validate name. + local res, err = util.future_wait(conn.name, opts.timeout) + if res == nil then + -- Failed to get instance name, e.g. timeout error. + conn.name:discard() + conn:close() + return nil, err + end + if res[1].name ~= replica.name then + conn:close() + return nil, lerror.vshard(lerror.code.INSTANCE_NAME_MISMATCH, + replica.name, res[1].name) + end + -- Everything is all right. + conn.name = nil + end + + return pcall(conn.call, conn, func, args, opts) +end + -- -- Call a function on a replica using its connection. The typical -- usage is calls under storage.call, because of which there @@ -419,9 +466,8 @@ end local function replica_call(replica, func, args, opts) assert(opts and opts.timeout) replica.activity_ts = fiber_clock() - local conn = replica.conn local net_status, storage_status, retval, error_object = - pcall(conn.call, conn, func, args, opts) + replica_conn_call(replica, func, args, opts) if not net_status then -- Do not increase replica's network timeout, if the -- requested one was less, than network's one. For diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua index 155a9857..09ed7eaa 100644 --- a/vshard/storage/init.lua +++ b/vshard/storage/init.lua @@ -3468,6 +3468,7 @@ end local function storage_service_info() return { is_master = this_is_master(), + name = box.info.name } end