From 985a90bfd3de7d62c2187aad56e792f9d789a308 Mon Sep 17 00:00:00 2001 From: Nitzan Mordechai Date: Thu, 27 Jan 2022 15:13:28 +0200 Subject: [PATCH 1/3] osd/OSD: osd_fast_shutdown_notify_mon not quite right When osd_fast_shutdown and osd_fast_shutdown_notify_mon are both set to true, the OSD is marked as Down, but it should also be marked as Dead. Fixes: https://tracker.ceph.com/issues/53327 Signed-off-by: Nitzan Mordechai nd nd --- src/messages/MOSDMarkMeDown.h | 13 ++++++++++++- src/mon/OSDMonitor.cc | 24 ++++++++++++++++++++++++ src/mon/OSDMonitor.h | 1 + src/msg/Message.cc | 1 + src/msg/Message.h | 1 + src/osd/OSD.cc | 31 +++++++++++++++++++++++-------- src/vstart.sh | 4 +++- 7 files changed, 65 insertions(+), 10 deletions(-) diff --git a/src/messages/MOSDMarkMeDown.h b/src/messages/MOSDMarkMeDown.h index e9428518639c8..c31adf4935949 100644 --- a/src/messages/MOSDMarkMeDown.h +++ b/src/messages/MOSDMarkMeDown.h @@ -19,7 +19,7 @@ class MOSDMarkMeDown final : public PaxosServiceMessage { private: - static constexpr int HEAD_VERSION = 3; + static constexpr int HEAD_VERSION = 4; static constexpr int COMPAT_VERSION = 3; public: @@ -28,6 +28,7 @@ class MOSDMarkMeDown final : public PaxosServiceMessage { entity_addrvec_t target_addrs; epoch_t epoch = 0; bool request_ack = false; // ack requested + bool down_and_dead = false; // mark down and dead MOSDMarkMeDown() : PaxosServiceMessage{MSG_OSD_MARK_ME_DOWN, 0, @@ -38,6 +39,12 @@ class MOSDMarkMeDown final : public PaxosServiceMessage { HEAD_VERSION, COMPAT_VERSION}, fsid(fs), target_osd(osd), target_addrs(av), epoch(e), request_ack(request_ack) {} + MOSDMarkMeDown(const uuid_d &fs, int osd, const entity_addrvec_t& av, + epoch_t e, bool request_ack, bool down_and_dead) + : PaxosServiceMessage{MSG_OSD_MARK_ME_DOWN_AND_DEAD, e, + HEAD_VERSION, COMPAT_VERSION}, + fsid(fs), target_osd(osd), target_addrs(av), + epoch(e), request_ack(request_ack), down_and_dead(down_and_dead) {} private: ~MOSDMarkMeDown() final {} @@ -54,6 +61,8 @@ class
MOSDMarkMeDown final : public PaxosServiceMessage { decode(target_addrs, p); decode(epoch, p); decode(request_ack, p); + if (header.version >= 4) /* v3 senders, still accepted via COMPAT_VERSION, omit this field */ + decode(down_and_dead, p); } void encode_payload(uint64_t features) override { @@ -67,12 +76,14 @@ class MOSDMarkMeDown final : public PaxosServiceMessage { encode(target_addrs, payload, features); encode(epoch, payload); encode(request_ack, payload); + encode(down_and_dead, payload); } std::string_view get_type_name() const override { return "MOSDMarkMeDown"; } void print(std::ostream& out) const override { out << "MOSDMarkMeDown(" << "request_ack=" << request_ack + << ", down_and_dead=" << down_and_dead << ", osd." << target_osd << ", " << target_addrs << ", fsid=" << fsid diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 3e59384a38901..910c1436e0b29 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -2734,6 +2734,7 @@ bool OSDMonitor::preprocess_query(MonOpRequestRef op) // damp updates case MSG_OSD_MARK_ME_DOWN: + case MSG_OSD_MARK_ME_DOWN_AND_DEAD: return preprocess_mark_me_down(op); case MSG_OSD_MARK_ME_DEAD: return preprocess_mark_me_dead(op); @@ -2779,6 +2780,8 @@ bool OSDMonitor::prepare_update(MonOpRequestRef op) // damp updates case MSG_OSD_MARK_ME_DOWN: return prepare_mark_me_down(op); + case MSG_OSD_MARK_ME_DOWN_AND_DEAD: + return prepare_mark_me_down_and_dead(op); case MSG_OSD_MARK_ME_DEAD: return prepare_mark_me_dead(op); case MSG_OSD_FULL: @@ -3065,6 +3068,27 @@ bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op) return true; } +bool OSDMonitor::prepare_mark_me_down_and_dead(MonOpRequestRef op) +{ + op->mark_osdmon_event(__func__); + auto m = op->get_req<MOSDMarkMeDown>(); + int target_osd = m->target_osd; + + ceph_assert(osdmap.is_up(target_osd)); + ceph_assert(osdmap.get_addrs(target_osd) == m->target_addrs); + + mon.clog->info() << "osd."
<< target_osd << " marked itself down and dead as of e" + << m->get_epoch(); + pending_inc.new_state[target_osd] = CEPH_OSD_UP; + if (!pending_inc.new_xinfo.count(target_osd)) { + pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd]; + } + pending_inc.new_xinfo[target_osd].dead_epoch = m->get_epoch(); + if (m->request_ack) + wait_for_finished_proposal(op, new C_AckMarkedDown(this, op)); + return true; +} + bool OSDMonitor::preprocess_mark_me_dead(MonOpRequestRef op) { op->mark_osdmon_event(__func__); diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h index 55d4e8c10f3aa..9ec8b74172f36 100644 --- a/src/mon/OSDMonitor.h +++ b/src/mon/OSDMonitor.h @@ -430,6 +430,7 @@ class OSDMonitor : public PaxosService, bool preprocess_failure(MonOpRequestRef op); bool prepare_failure(MonOpRequestRef op); bool prepare_mark_me_down(MonOpRequestRef op); + bool prepare_mark_me_down_and_dead(MonOpRequestRef op); void process_failures(); void take_all_failures(std::list& ls); diff --git a/src/msg/Message.cc b/src/msg/Message.cc index 6c57d355bdf49..df57230fa470e 100644 --- a/src/msg/Message.cc +++ b/src/msg/Message.cc @@ -499,6 +499,7 @@ Message *decode_message(CephContext *cct, m = make_message(); break; case MSG_OSD_MARK_ME_DOWN: + case MSG_OSD_MARK_ME_DOWN_AND_DEAD: m = make_message<MOSDMarkMeDown>(); break; case MSG_OSD_MARK_ME_DEAD: diff --git a/src/msg/Message.h b/src/msg/Message.h index 362ae5ec6089d..0d722959172ec 100644 --- a/src/msg/Message.h +++ b/src/msg/Message.h @@ -80,6 +80,7 @@ #define MSG_OSD_MARK_ME_DOWN 74 #define MSG_OSD_FULL 75 #define MSG_OSD_MARK_ME_DEAD 123 +#define MSG_OSD_MARK_ME_DOWN_AND_DEAD 124 // removed right after luminous //#define MSG_OSD_SUBOP 76 diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 3b0fc860b91b4..cb8b9a001ec36 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -1299,20 +1299,35 @@ bool OSDService::prepare_to_stop() OSDMapRef osdmap = get_osdmap(); if (osdmap && osdmap->is_up(whoami)) { - dout(0) << __func__ << " telling mon we are
shutting down" << dendl; set_state(PREPARING_TO_STOP); - monc->send_mon_message( - new MOSDMarkMeDown( - monc->get_fsid(), - whoami, - osdmap->get_addrs(whoami), - osdmap->get_epoch(), - true // request ack + if (cct->_conf->osd_fast_shutdown && + cct->_conf->osd_fast_shutdown_notify_mon) { + dout(0) << __func__ << " telling mon we are shutting down and dead " << dendl; + monc->send_mon_message( + new MOSDMarkMeDown( + monc->get_fsid(), + whoami, + osdmap->get_addrs(whoami), + osdmap->get_epoch(), + true, // request ack + true // mark as down and dead + )); + } else { + dout(0) << __func__ << " telling mon we are shutting down" << dendl; + monc->send_mon_message( + new MOSDMarkMeDown( + monc->get_fsid(), + whoami, + osdmap->get_addrs(whoami), + osdmap->get_epoch(), + true // request ack )); + } const auto timeout = ceph::make_timespan(cct->_conf->osd_mon_shutdown_timeout); is_stopping_cond.wait_for(l, timeout, [this] { return get_state() == STOPPING; }); } + dout(0) << __func__ << " starting shutdown" << dendl; set_state(STOPPING); return true; diff --git a/src/vstart.sh b/src/vstart.sh index 5a58135278de7..36eecc8c95dc3 100755 --- a/src/vstart.sh +++ b/src/vstart.sh @@ -794,7 +794,7 @@ $DAEMONOPTS osd class dir = $OBJCLASS_PATH osd class load list = * osd class default list = * - osd fast shutdown = false + osd fast shutdown = true filestore wbthrottle xfs ios start flusher = 10 filestore wbthrottle xfs ios hard limit = 20 @@ -1408,6 +1408,8 @@ osd_scrub_load_threshold = 2000 osd_debug_op_order = true osd_debug_misdirected_ops = true osd_copyfrom_max_chunk = 524288 +osd_fast_shutdown = true +osd_fast_shutdown_notify_mon = true [mds] mds_debug_frag = true From 5fbd123d3a5df21aa01c08b3643f432f4f272e32 Mon Sep 17 00:00:00 2001 From: Satoru Takeuchi Date: Thu, 18 Nov 2021 20:48:18 +0000 Subject: [PATCH 2/3] osd: make osd_fast_shutdown_notify_mon option true by default osd_fast_shutdown_notify_mon option is false by default.
So users suffer from error log floods, slow ops, and long I/O timeouts on voluntary OS shutdown before they are aware of the existence of this option. Let's make this option true by default. Fixes: https://tracker.ceph.com/issues/53328 Signed-off-by: Satoru Takeuchi --- src/common/options/global.yaml.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/options/global.yaml.in b/src/common/options/global.yaml.in index ada0451b691a7..cd2df3f5c3bdf 100644 --- a/src/common/options/global.yaml.in +++ b/src/common/options/global.yaml.in @@ -3271,7 +3271,7 @@ options: desc: Tell mon about OSD shutdown on immediate shutdown long_desc: Tell the monitor the OSD is shutting down on immediate shutdown. This helps with cluster log messages from other OSDs reporting it immediately failed. - default: false + default: true see_also: - osd_fast_shutdown - osd_mon_shutdown_timeout From da5023df351caa0f4d074bb9ba368367d6b2f03f Mon Sep 17 00:00:00 2001 From: Manuel Lausch Date: Sun, 27 Feb 2022 19:02:20 +0100 Subject: [PATCH 3/3] Since the new message type is sent by an OSD, we should allow it to be sent by one Signed-off-by: Manuel Lausch --- src/mon/Monitor.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index d647316e9e5c1..1253320b2c6da 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -4546,6 +4546,7 @@ void Monitor::dispatch_op(MonOpRequestRef op) case MSG_OSD_BEACON: case MSG_OSD_MARK_ME_DOWN: case MSG_OSD_MARK_ME_DEAD: + case MSG_OSD_MARK_ME_DOWN_AND_DEAD: case MSG_OSD_FULL: case MSG_OSD_FAILURE: case MSG_OSD_BOOT: