Skip to content

Commit

Permalink
osd/OSD: osd_fast_shutdown_notify_mon not quite right
Browse files Browse the repository at this point in the history
When osd_fast_shutdown and osd_fast_shutdown_notify_mon are both set to true, the OSD
is only marked as down; it should be marked as down and dead.

Fixes: https://tracker.ceph.com/issues/53327

Signed-off-by: Nitzan Mordechai <[email protected]>

nd

nd
  • Loading branch information
NitzanMordhai committed Mar 13, 2022
1 parent 41b14ad commit a4c7134
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 10 deletions.
13 changes: 12 additions & 1 deletion src/messages/MOSDMarkMeDown.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

class MOSDMarkMeDown final : public PaxosServiceMessage {
private:
static constexpr int HEAD_VERSION = 3;
static constexpr int HEAD_VERSION = 4;
static constexpr int COMPAT_VERSION = 3;

public:
Expand All @@ -28,6 +28,7 @@ class MOSDMarkMeDown final : public PaxosServiceMessage {
entity_addrvec_t target_addrs;
epoch_t epoch = 0;
bool request_ack = false; // ack requested
bool down_and_dead = false; // mark down and dead

MOSDMarkMeDown()
: PaxosServiceMessage{MSG_OSD_MARK_ME_DOWN, 0,
Expand All @@ -38,6 +39,12 @@ class MOSDMarkMeDown final : public PaxosServiceMessage {
HEAD_VERSION, COMPAT_VERSION},
fsid(fs), target_osd(osd), target_addrs(av),
epoch(e), request_ack(request_ack) {}
// Builds a mark-me-down request that also carries the down_and_dead flag.
// fs, osd, av and e are stored in fsid, target_osd, target_addrs and epoch;
// e is additionally passed as the second PaxosServiceMessage argument.
// request_ack: true when an ack is requested.
// down_and_dead: true to ask to be marked down and dead, not only down.
MOSDMarkMeDown(const uuid_d &fs, int osd, const entity_addrvec_t& av,
epoch_t e, bool request_ack, bool down_and_dead)
: PaxosServiceMessage{MSG_OSD_MARK_ME_DOWN, e,
HEAD_VERSION, COMPAT_VERSION},
fsid(fs), target_osd(osd), target_addrs(av),
epoch(e), request_ack(request_ack), down_and_dead(down_and_dead) {}
private:
~MOSDMarkMeDown() final {}

Expand All @@ -54,6 +61,8 @@ class MOSDMarkMeDown final : public PaxosServiceMessage {
decode(target_addrs, p);
decode(epoch, p);
decode(request_ack, p);
if(header.version >= 4)
decode(down_and_dead, p);
}

void encode_payload(uint64_t features) override {
Expand All @@ -67,12 +76,14 @@ class MOSDMarkMeDown final : public PaxosServiceMessage {
encode(target_addrs, payload, features);
encode(epoch, payload);
encode(request_ack, payload);
encode(down_and_dead, payload);
}

std::string_view get_type_name() const override { return "MOSDMarkMeDown"; }
void print(std::ostream& out) const override {
out << "MOSDMarkMeDown("
<< "request_ack=" << request_ack
<< ", down_and_dead=" << down_and_dead
<< ", osd." << target_osd
<< ", " << target_addrs
<< ", fsid=" << fsid
Expand Down
8 changes: 7 additions & 1 deletion src/mon/OSDMonitor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3058,8 +3058,14 @@ bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op)
ceph_assert(osdmap.is_up(target_osd));
ceph_assert(osdmap.get_addrs(target_osd) == m->target_addrs);

mon.clog->info() << "osd." << target_osd << " marked itself down";
mon.clog->info() << "osd." << target_osd << " marked itself " << ((m->down_and_dead) ? "down and dead" : "down");
pending_inc.new_state[target_osd] = CEPH_OSD_UP;
if (m->down_and_dead) {
if (!pending_inc.new_xinfo.count(target_osd)) {
pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
}
pending_inc.new_xinfo[target_osd].dead_epoch = m->get_epoch();
}
if (m->request_ack)
wait_for_finished_proposal(op, new C_AckMarkedDown(this, op));
return true;
Expand Down
1 change: 1 addition & 0 deletions src/mon/OSDMonitor.h
Original file line number Diff line number Diff line change
Expand Up @@ -430,6 +430,7 @@ class OSDMonitor : public PaxosService,
bool preprocess_failure(MonOpRequestRef op);
bool prepare_failure(MonOpRequestRef op);
bool prepare_mark_me_down(MonOpRequestRef op);
bool prepare_mark_me_down_and_dead(MonOpRequestRef op);
void process_failures();
void take_all_failures(std::list<MonOpRequestRef>& ls);

Expand Down
31 changes: 23 additions & 8 deletions src/osd/OSD.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1299,20 +1299,35 @@ bool OSDService::prepare_to_stop()

OSDMapRef osdmap = get_osdmap();
if (osdmap && osdmap->is_up(whoami)) {
dout(0) << __func__ << " telling mon we are shutting down" << dendl;
set_state(PREPARING_TO_STOP);
monc->send_mon_message(
new MOSDMarkMeDown(
monc->get_fsid(),
whoami,
osdmap->get_addrs(whoami),
osdmap->get_epoch(),
true // request ack
if (cct->_conf->osd_fast_shutdown &&
cct->_conf->osd_fast_shutdown_notify_mon) {
dout(0) << __func__ << " telling mon we are shutting down and dead " << dendl;
monc->send_mon_message(
new MOSDMarkMeDown(
monc->get_fsid(),
whoami,
osdmap->get_addrs(whoami),
osdmap->get_epoch(),
true, // request ack
true // mark as down and dead
));
} else {
dout(0) << __func__ << " telling mon we are shutting down" << dendl;
monc->send_mon_message(
new MOSDMarkMeDown(
monc->get_fsid(),
whoami,
osdmap->get_addrs(whoami),
osdmap->get_epoch(),
true // request ack
));
}
const auto timeout = ceph::make_timespan(cct->_conf->osd_mon_shutdown_timeout);
is_stopping_cond.wait_for(l, timeout,
[this] { return get_state() == STOPPING; });
}

dout(0) << __func__ << " starting shutdown" << dendl;
set_state(STOPPING);
return true;
Expand Down

0 comments on commit a4c7134

Please sign in to comment.