Merge pull request ceph#53531 from ronen-fr/wip-rf-squeue2
osd/scrub: extract scrub initiation code out of the OSD

Reviewed-by: Samuel Just <[email protected]>
ronen-fr authored Sep 22, 2023
2 parents f0c48df + f7ddca6 commit 5ecd20f
Showing 21 changed files with 1,508 additions and 910 deletions.
24 changes: 15 additions & 9 deletions qa/standalone/scrub/osd-scrub-test.sh
@@ -49,9 +49,12 @@ function TEST_scrub_test() {

run_mon $dir a --osd_pool_default_size=3 || return 1
run_mgr $dir x || return 1
local ceph_osd_args="--osd-scrub-interval-randomize-ratio=0 --osd-deep-scrub-randomize-ratio=0 "
ceph_osd_args+="--osd_scrub_backoff_ratio=0 --osd_stats_update_period_not_scrubbing=3 "
ceph_osd_args+="--osd_stats_update_period_scrubbing=2"
for osd in $(seq 0 $(expr $OSDS - 1))
do
run_osd $dir $osd || return 1
run_osd $dir $osd $ceph_osd_args || return 1
done

# Create a pool with a single pg
@@ -211,16 +214,17 @@ function TEST_scrub_extended_sleep() {

run_mon $dir a --osd_pool_default_size=3 || return 1
run_mgr $dir x || return 1

local ceph_osd_args="--osd-scrub-interval-randomize-ratio=0 --osd-deep-scrub-randomize-ratio=0 "
ceph_osd_args+="--osd_scrub_backoff_ratio=0 --osd_stats_update_period_not_scrubbing=3 "
ceph_osd_args+="--osd_stats_update_period_scrubbing=2 --osd_scrub_sleep=0 "
ceph_osd_args+="--osd_scrub_extended_sleep=20 --osd_scrub_begin_week_day=$DAY_START "
ceph_osd_args+="--osd_op_queue=wpq --osd_scrub_end_week_day=$DAY_END "
ceph_osd_args+="--bluestore_cache_autotune=false" # why needed?

for osd in $(seq 0 $(expr $OSDS - 1))
do
run_osd $dir $osd --osd_scrub_sleep=0 \
--osd_scrub_extended_sleep=20 \
--bluestore_cache_autotune=false \
--osd_deep_scrub_randomize_ratio=0.0 \
--osd_scrub_interval_randomize_ratio=0 \
--osd_scrub_begin_week_day=$DAY_START \
--osd_scrub_end_week_day=$DAY_END \
|| return 1
run_osd $dir $osd $ceph_osd_args || return 1
done

# Create a pool with a single pg
@@ -527,6 +531,8 @@ function TEST_dump_scrub_schedule() {
--osd_scrub_interval_randomize_ratio=0 \
--osd_scrub_backoff_ratio=0.0 \
--osd_op_queue=wpq \
--osd_stats_update_period_not_scrubbing=3 \
--osd_stats_update_period_scrubbing=2 \
--osd_scrub_sleep=0.2"

for osd in $(seq 0 $(expr $OSDS - 1))
3 changes: 3 additions & 0 deletions src/osd/CMakeLists.txt
@@ -22,9 +22,12 @@ set(osd_srcs
PGBackend.cc
OSDCap.cc
scrubber/pg_scrubber.cc
scrubber/osd_scrub.cc
scrubber/osd_scrub_sched.cc
scrubber/PrimaryLogScrub.cc
scrubber/scrub_job.cc
scrubber/scrub_machine.cc
scrubber/scrub_resources.cc
scrubber/ScrubStore.cc
scrubber/scrub_backend.cc
Watch.cc
147 changes: 19 additions & 128 deletions src/osd/OSD.cc
@@ -246,7 +246,7 @@ OSDService::OSDService(OSD *osd, ceph::async::io_context_pool& poolctx) :
osd_skip_data_digest(cct->_conf, "osd_skip_data_digest"),
publish_lock{ceph::make_mutex("OSDService::publish_lock")},
pre_publish_lock{ceph::make_mutex("OSDService::pre_publish_lock")},
m_scrub_queue{cct, *this},
m_osd_scrub{cct, *this, cct->_conf},
agent_valid_iterator(false),
agent_ops(0),
flush_mode_high_count(0),
@@ -2853,7 +2853,7 @@ will start to track new ops received afterwards.";
f->close_section();
} else if (prefix == "dump_scrub_reservations") {
f->open_object_section("scrub_reservations");
service.get_scrub_services().dump_scrub_reservations(f);
service.get_scrub_services().resource_bookkeeper().dump_scrub_reservations(f);
f->close_section();
} else if (prefix == "get_latest_osdmap") {
get_latest_osdmap();
@@ -6282,9 +6282,7 @@ void OSD::tick_without_osd_lock()
}

if (is_active()) {
if (!scrub_random_backoff()) {
sched_scrub();
}
service.get_scrub_services().initiate_scrub(service.is_recovery_active());
service.promote_throttle_recalibrate();
resume_creating_pg();
bool need_send_beacon = false;
@@ -7597,130 +7595,16 @@ void OSD::handle_fast_scrub(MOSDScrub2 *m)
m->put();
}

bool OSD::scrub_random_backoff()
{
bool coin_flip = (rand() / (double)RAND_MAX >=
cct->_conf->osd_scrub_backoff_ratio);
if (!coin_flip) {
dout(20) << "scrub_random_backoff lost coin flip, randomly backing off (ratio: "
<< cct->_conf->osd_scrub_backoff_ratio << ")" << dendl;
return true;
}
return false;
}


void OSD::sched_scrub()
std::optional<PGLockWrapper> OSDService::get_locked_pg(spg_t pgid)
{
auto& scrub_scheduler = service.get_scrub_services();

if (auto blocked_pgs = scrub_scheduler.get_blocked_pgs_count();
blocked_pgs > 0) {
// some PGs managed by this OSD were blocked by a locked object during
// scrub. This means we might not have the resources needed to scrub now.
dout(10)
<< fmt::format(
"{}: PGs are blocked while scrubbing due to locked objects ({} PGs)",
__func__,
blocked_pgs)
<< dendl;
}

// fail fast if no resources are available
if (!scrub_scheduler.can_inc_scrubs()) {
dout(20) << __func__ << ": OSD cannot inc scrubs" << dendl;
return;
}

// if there is a PG that is just now trying to reserve scrub replica resources -
// we should wait and not initiate a new scrub
if (scrub_scheduler.is_reserving_now()) {
dout(20) << __func__ << ": scrub resources reservation in progress" << dendl;
return;
}

Scrub::ScrubPreconds env_conditions;

if (service.is_recovery_active() && !cct->_conf->osd_scrub_during_recovery) {
if (!cct->_conf->osd_repair_during_recovery) {
dout(15) << __func__ << ": not scheduling scrubs due to active recovery"
<< dendl;
return;
}
dout(10) << __func__
<< " will only schedule explicitly requested repair due to active recovery"
<< dendl;
env_conditions.allow_requested_repair_only = true;
}

if (g_conf()->subsys.should_gather<ceph_subsys_osd, 20>()) {
dout(20) << __func__ << " sched_scrub starts" << dendl;
auto all_jobs = scrub_scheduler.list_registered_jobs();
for (const auto& sj : all_jobs) {
dout(20) << "sched_scrub scrub-queue jobs: " << *sj << dendl;
}
}

auto was_started = scrub_scheduler.select_pg_and_scrub(env_conditions);
dout(20) << "sched_scrub done (" << ScrubQueue::attempt_res_text(was_started)
<< ")" << dendl;
}

Scrub::schedule_result_t OSDService::initiate_a_scrub(spg_t pgid,
bool allow_requested_repair_only)
{
dout(20) << __func__ << " trying " << pgid << dendl;

// we have a candidate to scrub. We need some PG information to know if scrubbing is
// allowed

PGRef pg = osd->lookup_lock_pg(pgid);
if (!pg) {
// the PG was dequeued in the short timespan between creating the candidates list
// (collect_ripe_jobs()) and here
dout(5) << __func__ << " pg " << pgid << " not found" << dendl;
return Scrub::schedule_result_t::no_such_pg;
}

// This has already started, so go on to the next scrub job
if (pg->is_scrub_queued_or_active()) {
pg->unlock();
dout(20) << __func__ << ": already in progress pgid " << pgid << dendl;
return Scrub::schedule_result_t::already_started;
}
// Skip other kinds of scrubbing if only explicitly requested repairing is allowed
if (allow_requested_repair_only && !pg->get_planned_scrub().must_repair) {
pg->unlock();
dout(10) << __func__ << " skip " << pgid
<< " because repairing is not explicitly requested on it" << dendl;
return Scrub::schedule_result_t::preconditions;
auto pg = osd->lookup_lock_pg(pgid);
if (pg) {
return PGLockWrapper{std::move(pg)};
} else {
return std::nullopt;
}

auto scrub_attempt = pg->sched_scrub();
pg->unlock();
return scrub_attempt;
}

void OSD::resched_all_scrubs()
{
dout(10) << __func__ << ": start" << dendl;
auto all_jobs = service.get_scrub_services().list_registered_jobs();
for (auto& e : all_jobs) {

auto& job = *e;
dout(20) << __func__ << ": examine " << job.pgid << dendl;

PGRef pg = _lookup_lock_pg(job.pgid);
if (!pg)
continue;

dout(15) << __func__ << ": updating scrub schedule on " << job.pgid << dendl;
pg->on_scrub_schedule_input_change();

pg->unlock();
}
dout(10) << __func__ << ": done" << dendl;
}

MPGStats* OSD::collect_pg_stats()
{
@@ -9955,10 +9839,17 @@ void OSD::handle_conf_change(const ConfigProxy& conf,
}

if (changed.count("osd_scrub_min_interval") ||
changed.count("osd_scrub_max_interval")) {
resched_all_scrubs();
dout(0) << __func__ << ": scrub interval change" << dendl;
changed.count("osd_scrub_max_interval") ||
changed.count("osd_deep_scrub_interval")) {
service.get_scrub_services().on_config_change();
dout(0) << fmt::format(
"{}: scrub interval change (min:{} deep:{} max:{})",
__func__, cct->_conf->osd_scrub_min_interval,
cct->_conf->osd_deep_scrub_interval,
cct->_conf->osd_scrub_max_interval)
<< dendl;
}

check_config();
if (changed.count("osd_asio_thread_count")) {
service.poolctx.stop();
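The decision tree that used to live in OSD::sched_scrub() (random backoff, blocked-PG accounting, resource and recovery checks) now belongs to OsdScrub::initiate_scrub() in scrubber/osd_scrub.cc, which is not shown on this page. What the OSD side still provides is the PG-locking callback defined above. The fragment below is only an illustration of how a scheduler-side caller might consume OSDService::get_locked_pg(); it is not the actual scrubber code, and 'listener' and 'pgid' are assumed local names:

// illustrative fragment: 'listener' is the Scrub::ScrubSchedListener (i.e. the
// OSDService), 'pgid' is a candidate taken from the scrub queue
std::optional<PGLockWrapper> locked = listener.get_locked_pg(pgid);
if (locked) {
  // the wrapper keeps the PG locked for the duration of this block
  auto result = locked->pg()->sched_scrub();
  // ... act on 'result' ...
}  // ~PGLockWrapper() releases the PG lock here
// if 'locked' is nullopt, the PG was removed between building the candidate
// list and reaching it here; the caller simply moves on to the next candidate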
30 changes: 8 additions & 22 deletions src/osd/OSD.h
@@ -53,7 +53,7 @@
#include "common/EventTrace.h"
#include "osd/osd_perf_counters.h"
#include "common/Finisher.h"
#include "scrubber/osd_scrub_sched.h"
#include "scrubber/osd_scrub.h"

#define CEPH_OSD_PROTOCOL 10 /* cluster internal */

@@ -239,30 +239,18 @@ class OSDService : public Scrub::ScrubSchedListener {
void handle_misdirected_op(PG *pg, OpRequestRef op);

private:
/**
* The entity that maintains the set of PGs we may scrub (i.e. - those that we
* are their primary), and schedules their scrubbing.
*/
ScrubQueue m_scrub_queue;
/// the entity that offloads all scrubbing-related operations
OsdScrub m_osd_scrub;

public:
ScrubQueue& get_scrub_services() { return m_scrub_queue; }
OsdScrub& get_scrub_services() { return m_osd_scrub; }

/**
* A callback used by the ScrubQueue object to initiate a scrub on a specific PG.
*
* The request might fail for multiple reasons, as ScrubQueue cannot by its own
* check some of the PG-specific preconditions and those are checked here. See
* attempt_t definition.
*
* @param pgid to scrub
* @param allow_requested_repair_only
* @return a Scrub::attempt_t detailing either a success, or the failure reason.
* locks the named PG, returning an RAII wrapper that unlocks upon
* destruction.
* returns nullopt if failing to lock.
*/
Scrub::schedule_result_t initiate_a_scrub(
spg_t pgid,
bool allow_requested_repair_only) final;

std::optional<PGLockWrapper> get_locked_pg(spg_t pgid) final;

private:
// -- agent shared state --
@@ -1867,9 +1855,7 @@ class OSD : public Dispatcher,


// -- scrubbing --
void sched_scrub();
void resched_all_scrubs();
bool scrub_random_backoff();

// -- status reporting --
MPGStats *collect_pg_stats();
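With the old initiate_a_scrub() entry point gone, OSDService fulfils its Scrub::ScrubSchedListener role through get_locked_pg() alone, at least as far as this diff shows. The listener interface itself is declared in the scrubber headers and is not part of this page; as a rough, assumed sketch, it must provide at least:

namespace Scrub {
// assumed minimal shape; the real declaration lives in the scrubber headers
// and very likely carries additional members
struct ScrubSchedListener {
  virtual ~ScrubSchedListener() = default;
  // hand the scheduler a locked PG, or std::nullopt if the PG no longer exists
  virtual std::optional<PGLockWrapper> get_locked_pg(spg_t pgid) = 0;
};
}  // namespace Scrub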
21 changes: 16 additions & 5 deletions src/osd/PG.cc
@@ -1348,19 +1348,22 @@ Scrub::schedule_result_t PG::sched_scrub()
ceph_assert(m_scrubber);

if (is_scrub_queued_or_active()) {
return schedule_result_t::already_started;
dout(10) << __func__ << ": already scrubbing" << dendl;
return schedule_result_t::target_specific_failure;
}

if (!is_primary() || !is_active() || !is_clean()) {
return schedule_result_t::bad_pg_state;
dout(10) << __func__ << ": cannot scrub (not a clean and active primary)"
<< dendl;
return schedule_result_t::target_specific_failure;
}

if (state_test(PG_STATE_SNAPTRIM) || state_test(PG_STATE_SNAPTRIM_WAIT)) {
// note that the trimmer checks scrub status when setting 'snaptrim_wait'
// (on the transition from NotTrimming to Trimming/WaitReservation),
// i.e. some time before setting 'snaptrim'.
dout(10) << __func__ << ": cannot scrub while snap-trimming" << dendl;
return schedule_result_t::bad_pg_state;
return schedule_result_t::target_specific_failure;
}

// analyse the combination of the requested scrub flags, the osd/pool configuration
@@ -1372,14 +1375,14 @@
// (due to configuration or priority issues)
// The reason was already reported by the callee.
dout(10) << __func__ << ": failed to initiate a scrub" << dendl;
return schedule_result_t::preconditions;
return schedule_result_t::target_specific_failure;
}

// try to reserve the local OSD resources. If failing: no harm. We will
// be retried by the OSD later on.
if (!m_scrubber->reserve_local()) {
dout(10) << __func__ << ": failed to reserve locally" << dendl;
return schedule_result_t::no_local_resources;
return schedule_result_t::osd_wide_failure;
}

// can commit to the updated flags now, as nothing will stop the scrub
@@ -2836,3 +2839,11 @@ void PG::with_heartbeat_peers(std::function<void(int)>&& f)
uint64_t PG::get_min_alloc_size() const {
return osd->store->get_min_alloc_size();
}

PGLockWrapper::~PGLockWrapper()
{
if (m_pg) {
// otherwise - we were 'moved from'
m_pg->unlock();
}
}
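PG::sched_scrub(), modified above, now folds its failure modes into two categories: target_specific_failure (this particular PG cannot be scrubbed right now: already scrubbing, not a clean active primary, snap-trimming, or flag preconditions not met) and osd_wide_failure (the local OSD scrub resources could not be reserved). The sketch below shows how a scheduling loop might react to that distinction; the real loop lives in OsdScrub and is not shown here, and 'ripe_jobs' and 'try_to_scrub()' are assumed names:

for (const auto& candidate : ripe_jobs) {   // 'ripe_jobs': assumed candidate list
  auto res = try_to_scrub(candidate);       // eventually reaches PG::sched_scrub()
  if (res == Scrub::schedule_result_t::osd_wide_failure) {
    break;     // the OSD itself is the bottleneck; no point trying other PGs this tick
  }
  if (res == Scrub::schedule_result_t::target_specific_failure) {
    continue;  // only this PG is unable to scrub; try the next candidate
  }
  break;       // a scrub was initiated
}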
18 changes: 18 additions & 0 deletions src/osd/PG.h
@@ -1450,4 +1450,22 @@ class PG : public DoutPrefixProvider,
}
};

/**
* Initialized with a locked PG. That PG is unlocked in the
* destructor.
* Used by OsdScrub when initiating a scrub.
*/
class PGLockWrapper {
public:
explicit PGLockWrapper(PGRef locked_pg) : m_pg{locked_pg} {}
PGRef pg() { return m_pg; }
~PGLockWrapper();
PGLockWrapper(PGLockWrapper&& rhs) : m_pg(std::move(rhs.m_pg)) {
rhs.m_pg = nullptr;
}
PGLockWrapper(const PGLockWrapper& rhs) = delete;
private:
PGRef m_pg;
};

#endif
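PGLockWrapper is a small RAII helper: it is handed an already-locked PG, and the destructor (added to PG.cc above) unlocks it unless the wrapper was moved from. The same pattern is modelled below with stand-in types so it compiles and runs on its own; FakePG, FakePGRef and FakePGLockWrapper are simplified stand-ins, not Ceph code:

#include <cassert>
#include <memory>
#include <mutex>
#include <optional>
#include <utility>

// stand-ins for PG / PGRef, reduced to the locking behaviour only
struct FakePG {
  std::mutex m;
  void lock() { m.lock(); }
  void unlock() { m.unlock(); }
};
using FakePGRef = std::shared_ptr<FakePG>;

// same shape as PGLockWrapper: owns a locked PG, unlocks on destruction,
// and a moved-from wrapper must not unlock a second time
class FakePGLockWrapper {
 public:
  explicit FakePGLockWrapper(FakePGRef locked_pg) : m_pg{std::move(locked_pg)} {}
  FakePGLockWrapper(FakePGLockWrapper&& rhs) noexcept : m_pg{std::move(rhs.m_pg)} {
    rhs.m_pg = nullptr;  // mirrors the original: mark 'rhs' as moved-from
  }
  FakePGLockWrapper(const FakePGLockWrapper&) = delete;
  ~FakePGLockWrapper() {
    if (m_pg) {          // otherwise - we were 'moved from'
      m_pg->unlock();
    }
  }
  FakePGRef pg() { return m_pg; }
 private:
  FakePGRef m_pg;
};

int main() {
  auto pg = std::make_shared<FakePG>();
  pg->lock();                               // the caller locks, as lookup_lock_pg() does
  std::optional<FakePGLockWrapper> w{FakePGLockWrapper{pg}};
  FakePGLockWrapper moved = std::move(*w);  // unlock responsibility transfers exactly once
  w.reset();                                // the moved-from wrapper does not unlock
  assert(moved.pg() == pg);                 // 'moved' still guards the PG
  return 0;                                 // ~FakePGLockWrapper releases the lock here
}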