Skip to content

Commit

Permalink
Merge pull request ceph#47615 from adamemerson/wip-objv-doc
Browse files Browse the repository at this point in the history
rgw: Document `RGWObjVersionTracker`

Reviewed-by: Casey Bodley <[email protected]>
  • Loading branch information
cbodley authored Aug 18, 2022
2 parents 857f2bd + e6498ee commit 7427275
Show file tree
Hide file tree
Showing 2 changed files with 109 additions and 18 deletions.
115 changes: 103 additions & 12 deletions src/rgw/rgw_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -996,39 +996,130 @@ struct rgw_bucket_placement {
void dump(Formatter *f) const;
};

/// `RGWObjVersionTracker`
/// ======================
///
/// What and why is this?
/// ---------------------
///
/// This is a wrapper around `cls_version` functionality. If two RGWs
/// (or two non-synchronized threads in the same RGW) are accessing
/// the same object, they may race and overwrite each other's work.
///
/// This class solves this issue by tracking and recording an object's
/// version in the extended attributes. Operations are failed with
/// ECANCELED if the version is not what we expect.
///
/// How to Use It
/// -------------
///
/// When preparing a read operation, call `prepare_op_for_read`.
/// For a write, call `prepare_op_for_write` when preparing the
/// operation, and `apply_write` after it succeeds.
///
/// Adhere to the following guidelines:
///
/// - Each RGWObjVersionTracker should be used with only one object.
///
/// - If you receive `ECANCELED`, throw away whatever you were doing
/// based on the content of the versioned object, re-read, and
/// restart as appropriate.
///
/// - If one code path uses RGWObjVersionTracker, then they all
/// should. In a situation where a writer should unconditionally
/// overwrite an object, call `generate_new_write_ver` on a default
/// constructed `RGWObjVersionTracker`.
///
/// - If we have a version from a previous read, we will check against
/// it and fail the read if it doesn't match. Thus, if we want to
/// re-read a new version of the object, call `clear()` on the
/// `RGWObjVersionTracker`.
///
/// - This type is not thread-safe. Every thread must have its own
/// instance.
///
struct RGWObjVersionTracker {
obj_version read_version;
obj_version write_version;

obj_version *version_for_read() {
obj_version read_version; //< The version read from an object. If
// set, this value is used to check the
// stored version.
obj_version write_version; //< Set the object to this version on
// write, if set.

/// Pointer to the read version.
obj_version* version_for_read() {
return &read_version;
}

obj_version *version_for_write() {
/// If we have a write version, return a pointer to it. Otherwise
/// return null. This is used in `prepare_op_for_write` to treat the
/// `write_version` as effectively an `option` type.
obj_version* version_for_write() {
if (write_version.ver == 0)
return NULL;
return nullptr;

return &write_version;
}

obj_version *version_for_check() {
/// If read_version is non-empty, return a pointer to it, otherwise
/// null. This is used internally by `prepare_op_for_read` and
/// `prepare_op_for_write` to treat the `read_version` as
/// effectively an `option` type.
obj_version* version_for_check() {
if (read_version.ver == 0)
return NULL;
return nullptr;

return &read_version;
}

void prepare_op_for_read(librados::ObjectReadOperation *op);
void prepare_op_for_write(librados::ObjectWriteOperation *op);

/// This function is to be called on any read operation. If we have
/// a non-empty `read_version`, assert on the OSD that the object
/// has the same version. Also reads the version into `read_version`.
///
/// This function is defined in `rgw_rados.cc` rather than `rgw_common.cc`.
void prepare_op_for_read(librados::ObjectReadOperation* op);

/// This function is to be called on any write operation. If we have
/// a non-empty `read_version`, assert on the OSD that the object
/// has the same version. If we have a non-empty `write_version`,
/// set the object to it. Otherwise increment the version on the OSD.
///
/// This function is defined in `rgw_rados.cc` rather than
/// `rgw_common.cc`.
void prepare_op_for_write(librados::ObjectWriteOperation* op);

/// This function is to be called after the completion of any write
/// operation on which `prepare_op_for_write` was called. If we did
/// not set the write version explicitly, it increments
/// `read_version`. If we did, it sets `read_version` to
/// `write_version`. In either case, it clears `write_version`.
///
/// RADOS write operations, at least those not using the relatively
/// new RETURNVEC flag, cannot return more information than an error
/// code. Thus, write operations can't simply fill in the read
/// version the way read operations can, so `prepare_op_for_write`
/// instructs the OSD to increment the object's version as stored in
/// RADOS and `apply_write` increments our `read_version` in RAM.
///
/// This function is defined in `rgw_rados.cc` rather than
/// `rgw_common.cc`.
void apply_write();

/// Clear `read_version` and `write_version`, making the instance
/// identical to a default-constructed instance.
void clear() {
read_version = obj_version();
write_version = obj_version();
}

void generate_new_write_ver(CephContext *cct);
/// Set `write_version` to a new, unique version.
///
/// An `obj_version` contains an opaque, random tag and a
/// sequence. If the tags of two `obj_version`s don't match, the
/// versions are unordered and unequal. This function creates a
/// version with a new tag, ensuring that any other process
/// operating on the object will receive `ECANCELED` and will know
/// to re-read the object and restart whatever it was doing.
void generate_new_write_ver(CephContext* cct);
};

inline std::ostream& operator<<(std::ostream& out, const obj_version &v)
Expand Down
12 changes: 6 additions & 6 deletions src/rgw/rgw_rados.cc
Original file line number Diff line number Diff line change
Expand Up @@ -167,9 +167,9 @@ rgw_raw_obj rgw_obj_select::get_raw_obj(rgw::sal::RadosStore* store) const
return raw_obj;
}

void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation *op)
void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation* op)
{
obj_version *check_objv = version_for_check();
obj_version* check_objv = version_for_check();

if (check_objv) {
cls_version_check(*op, *check_objv, VER_COND_EQ);
Expand All @@ -180,8 +180,8 @@ void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation *op)

void RGWObjVersionTracker::prepare_op_for_write(ObjectWriteOperation *op)
{
obj_version *check_objv = version_for_check();
obj_version *modify_version = version_for_write();
obj_version* check_objv = version_for_check();
obj_version* modify_version = version_for_write();

if (check_objv) {
cls_version_check(*op, *check_objv, VER_COND_EQ);
Expand Down Expand Up @@ -263,10 +263,10 @@ void RGWObjectCtx::invalidate(const rgw_obj& obj) {
}
}

void RGWObjVersionTracker::generate_new_write_ver(CephContext *cct)
void RGWObjVersionTracker::generate_new_write_ver(CephContext* cct)
{
static constexpr auto TAG_LEN = 24;
write_version.ver = 1;
#define TAG_LEN 24

write_version.tag.clear();
append_rand_alpha(cct, write_version.tag, write_version.tag, TAG_LEN);
Expand Down

0 comments on commit 7427275

Please sign in to comment.